├── .gitignore ├── .idea ├── compiler.xml ├── encodings.xml ├── inspectionProfiles │ └── Project_Default.xml ├── libraries │ ├── Maven__com_belerweb_pinyin4j_2_5_0.xml │ ├── Maven__com_sun_rt_1_5_0_06.xml │ └── Maven__commons_lang_commons_lang_2_6.xml ├── misc.xml ├── modules.xml ├── uiDesigner.xml └── vcs.xml ├── README.md ├── SegAndNewWordDiscover.iml ├── explain.md ├── pom.xml ├── segment.properties └── src ├── main └── java │ ├── computer │ └── Occurrence.java │ ├── concurrent_compute │ ├── ConCalculateUtil.java │ ├── ConCompute.java │ ├── ExtractWordsConCompute.java │ ├── MIERConCompute.java │ ├── WordCountConCompute.java │ └── extract │ │ └── queue │ │ ├── Constans.java │ │ ├── Consumer.java │ │ ├── MyBlockingQueue.java │ │ └── Producer.java │ ├── config │ ├── CommonValue.java │ ├── Config.java │ ├── Constants.java │ └── Logger.java │ ├── io │ ├── ByteArray.java │ ├── ByteUtil.java │ └── IOUtil.java │ ├── pojo │ ├── CharType.java │ ├── CompoundWord.java │ ├── IWord.java │ ├── LineMsg.java │ ├── SegMsg.java │ ├── Sentence.java │ ├── Term.java │ ├── Word.java │ └── WordFactory.java │ ├── seg │ ├── PreProcess.java │ └── Segment.java │ ├── serilize │ ├── JsonSerializationUtil.java │ └── readAndWriteJson.java │ ├── trie │ ├── AhoCorasick │ │ ├── AhoCorasickDoubleArrayTrie.java │ │ └── State.java │ ├── ITrie.java │ ├── Trie.java │ └── bintrie │ │ ├── BaseNode.java │ │ ├── BinTrie.java │ │ ├── Node.java │ │ ├── _ValueArray.java │ │ └── util │ │ └── ArrayTool.java │ └── util │ ├── FileUtils.java │ ├── HanUtils.java │ ├── Predefine.java │ └── TextUtility.java └── test └── java ├── SegmentTest ├── SegTest.java └── WordCountTest.java └── concurrent └── SegCountProcess.java /.gitignore: -------------------------------------------------------------------------------- 1 | */target 2 | *.iml 3 | /.idea 4 | *.class 5 | target/ 6 | .project 7 | .settings/ 8 | .classpath 9 | *.MF 10 | *.txt 11 | *.jar 
-------------------------------------------------------------------------------- /.idea/compiler.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | -------------------------------------------------------------------------------- /.idea/encodings.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | -------------------------------------------------------------------------------- /.idea/inspectionProfiles/Project_Default.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 6 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_belerweb_pinyin4j_2_5_0.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__com_sun_rt_1_5_0_06.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/libraries/Maven__commons_lang_commons_lang_2_6.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | -------------------------------------------------------------------------------- /.idea/misc.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 9 | 14 | 15 | 16 | 17 | 18 | -------------------------------------------------------------------------------- /.idea/modules.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 
-------------------------------------------------------------------------------- /.idea/uiDesigner.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/README.md -------------------------------------------------------------------------------- /SegAndNewWordDiscover.iml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /explain.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/explain.md -------------------------------------------------------------------------------- /pom.xml: 
-------------------------------------------------------------------------------- 1 | 2 | 5 | 4.0.0 6 | 7 | edu.csu.cn 8 | SegAndNewWordDiscover 9 | pom 10 | 1.0-SNAPSHOT 11 | 12 | 13 | 14 | commons-lang 15 | commons-lang 16 | 2.6 17 | 18 | 19 | 24 | 25 | 26 | 27 | com.sun 28 | rt 29 | 1.5.0_06 30 | 31 | 32 | 37 | 38 | com.google.code.gson 39 | gson 40 | 2.8.0 41 | 42 | 43 | 44 | 45 | redis.clients 46 | jedis 47 | 2.10.0 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | org.apache.maven.plugins 56 | maven-compiler-plugin 57 | 58 | 1.8 59 | 1.8 60 | 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /segment.properties: -------------------------------------------------------------------------------- 1 | # 互信息过滤阈值默认1.0f 2 | MI_THRESHOLD_VALUE=0.9f 3 | # 左右信息熵比率阈值默认0.4f 4 | ENTROPY_THETA=0.4f 5 | # 候选串切取最大长度实际为所赋值减一如6,实际的最大长度为5,默认5 6 | MAX_WORD_LEN=6 7 | # 左邻字信息熵最小阈值默认 0.01f 8 | MIN_LEFT_ENTROPY=0.01f 9 | # 右邻字信息熵最小阈值默认 0.01f 10 | MIN_RIGHT_ENTROPY=0.01f 11 | ################################ redis相关 ################################ 12 | # redis 服务地址 13 | REDIS_HOST=localhost 14 | # redis 端口地址 15 | REDIS_PORT= 6379 16 | # redis 密码验证默认为 root 17 | REDIS_AUTH_PASSWORD=root 18 | ############################################################################### 19 | # 语料输入路径,默认为"data\\test-text.txt" 20 | ## CORPUS_INPUT_PATH=data/test-text.txt 21 | NOVEL_INPUT_PATH=data/test.txt 22 | # 抽词结果输出 23 | EXTRACT_OUTPUT=data/result.txt 24 | # 是否开启调试，true会输出调试信息,默认为false 25 | DEBUG_MODE= true 26 | ################################ 多线程并发相关 ################################ 27 | # 并发词频统计线程数 28 | WC_THREAD_NUM=20 29 | COMPUTE_THREAD_NUM=20 30 | # 并发抽词线程数 31 | SEG_THREAD_NUM= 3 32 | -------------------------------------------------------------------------------- /src/main/java/computer/Occurrence.java: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/computer/Occurrence.java -------------------------------------------------------------------------------- /src/main/java/concurrent_compute/ConCalculateUtil.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/concurrent_compute/ConCalculateUtil.java -------------------------------------------------------------------------------- /src/main/java/concurrent_compute/ConCompute.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/concurrent_compute/ConCompute.java -------------------------------------------------------------------------------- /src/main/java/concurrent_compute/ExtractWordsConCompute.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/concurrent_compute/ExtractWordsConCompute.java -------------------------------------------------------------------------------- /src/main/java/concurrent_compute/MIERConCompute.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/concurrent_compute/MIERConCompute.java -------------------------------------------------------------------------------- /src/main/java/concurrent_compute/WordCountConCompute.java: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/concurrent_compute/WordCountConCompute.java -------------------------------------------------------------------------------- /src/main/java/concurrent_compute/extract/queue/Constans.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/concurrent_compute/extract/queue/Constans.java -------------------------------------------------------------------------------- /src/main/java/concurrent_compute/extract/queue/Consumer.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/concurrent_compute/extract/queue/Consumer.java -------------------------------------------------------------------------------- /src/main/java/concurrent_compute/extract/queue/MyBlockingQueue.java: -------------------------------------------------------------------------------- 1 | package concurrent_compute.extract.queue; 2 | 3 | import pojo.LineMsg; 4 | 5 | import java.util.concurrent.LinkedTransferQueue; 6 | 7 | /** 8 | * 消息生产消费的模型 9 | */ 10 | public class MyBlockingQueue { 11 | public static LinkedTransferQueue fairQueue = new LinkedTransferQueue(); 12 | 13 | // 消息生产 14 | public static void produce(LineMsg msg) { 15 | fairQueue.add(msg); 16 | } 17 | 18 | // 消息消费 19 | public static LineMsg consume() { 20 | return fairQueue.poll(); 21 | } 22 | } 23 | 24 | 25 | 26 | -------------------------------------------------------------------------------- /src/main/java/concurrent_compute/extract/queue/Producer.java: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/concurrent_compute/extract/queue/Producer.java -------------------------------------------------------------------------------- /src/main/java/config/CommonValue.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/config/CommonValue.java -------------------------------------------------------------------------------- /src/main/java/config/Config.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/config/Config.java -------------------------------------------------------------------------------- /src/main/java/config/Constants.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/config/Constants.java -------------------------------------------------------------------------------- /src/main/java/config/Logger.java: -------------------------------------------------------------------------------- 1 | package config; 2 | 3 | /** 4 | * Created by bruce_shan on 2018/12/4 16:44. 5 | * Corporation CSU Software 6 | */ 7 | public class Logger { 8 | /** 9 | * 日志组件 10 | */ 11 | public static java.util.logging.Logger logger = java.util.logging.Logger.getLogger("newWordDiscover");; 12 | 13 | 14 | } 15 | -------------------------------------------------------------------------------- /src/main/java/io/ByteArray.java: -------------------------------------------------------------------------------- 1 | /* 2 | *

3 | * He Han 4 | * hankcs.cn@gmail.com 5 | * 2014/10/30 14:33 6 | * 7 | * 8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/ 9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information. 10 | * 11 | */ 12 | package io; 13 | 14 | 15 | import static config.Logger.logger; 16 | 17 | /** 18 | * 对字节数组进行封装，提供方便的读取操作 19 | * 20 | * @author hankcs 21 | */ 22 | public class ByteArray 23 | { 24 | /** 25 | * 当前字节数组，不一定是全部字节，可能只是一个片段 26 | */ 27 | byte[] bytes; 28 | /** 29 | * 当前已读取的字节数，或下一个字节的指针 30 | */ 31 | int offset; 32 | 33 | public ByteArray(byte[] bytes) 34 | { 35 | this.bytes = bytes; 36 | } 37 | 38 | /** 39 | * 从文件读取一个字节数组 40 | * 41 | * @param path 42 | * @return 43 | */ 44 | /* public static ByteArray createByteArray(String path) 45 | { 46 | byte[] bytes = IOUtil.readBytes(path); 47 | if (bytes == null) return null; 48 | return new ByteArray(bytes); 49 | }*/ 50 | 51 | /** 52 | * 获取全部字节 53 | * @return 54 | */ 55 | public byte[] getBytes() 56 | { 57 | return bytes; 58 | } 59 | 60 | /** 61 | * 读取一个int 62 | * 63 | * @return 64 | */ 65 | public int nextInt() 66 | { 67 | int result = ByteUtil.bytesHighFirstToInt(bytes, offset); 68 | offset += 4; 69 | return result; 70 | } 71 | 72 | public double nextDouble() 73 | { 74 | double result = ByteUtil.bytesHighFirstToDouble(bytes, offset); 75 | offset += 8; 76 | return result; 77 | } 78 | 79 | /** 80 | * 读取一个char，对应于writeChar 81 | * 82 | * @return 83 | */ 84 | public char nextChar() 85 | { 86 | char result = ByteUtil.bytesHighFirstToChar(bytes, offset); 87 | offset += 2; 88 | return result; 89 | } 90 | 91 | /** 92 | * 读取一个字节 93 | * 94 | * @return 95 | */ 96 | public byte nextByte() 97 | { 98 | return bytes[offset++]; 99 | } 100 | 101 | /** 102 | * 读取一个布尔值 103 | * @return 104 | */ 105 | public boolean nextBoolean() 106 | { 107 | return nextByte() == 1; 108 | } 109 | 110 | public boolean hasMore() 111 | { 112 | return offset < bytes.length; 113 | 
} 114 | 115 | /** 116 | * 读取一个String，注意这个String是双字节版的，在字符之前有一个整型表示长度 117 | * 118 | * @return 119 | */ 120 | public String nextString() 121 | { 122 | char[] buffer = new char[nextInt()]; 123 | for (int i = 0; i < buffer.length; ++i) 124 | { 125 | buffer[i] = nextChar(); 126 | } 127 | return new String(buffer); 128 | } 129 | 130 | public float nextFloat() 131 | { 132 | float result = ByteUtil.bytesHighFirstToFloat(bytes, offset); 133 | offset += 4; 134 | return result; 135 | } 136 | 137 | /** 138 | * 读取一个无符号短整型 139 | * @return 140 | */ 141 | public int nextUnsignedShort() 142 | { 143 | byte a = nextByte(); 144 | byte b = nextByte(); 145 | return (((a & 0xff) << 8) | (b & 0xff)); 146 | } 147 | 148 | /** 149 | * 读取一个UTF字符串 150 | * @return 151 | */ 152 | public String nextUTF() 153 | { 154 | int utflen = nextUnsignedShort(); 155 | byte[] bytearr = null; 156 | char[] chararr = null; 157 | bytearr = new byte[utflen]; 158 | chararr = new char[utflen]; 159 | 160 | int c, char2, char3; 161 | int count = 0; 162 | int chararr_count = 0; 163 | 164 | for (int i = 0; i < utflen; ++i) 165 | { 166 | bytearr[i] = nextByte(); 167 | } 168 | 169 | while (count < utflen) 170 | { 171 | c = (int) bytearr[count] & 0xff; 172 | if (c > 127) break; 173 | count++; 174 | chararr[chararr_count++] = (char) c; 175 | } 176 | 177 | while (count < utflen) 178 | { 179 | c = (int) bytearr[count] & 0xff; 180 | switch (c >> 4) 181 | { 182 | case 0: 183 | case 1: 184 | case 2: 185 | case 3: 186 | case 4: 187 | case 5: 188 | case 6: 189 | case 7: 190 | /* 0xxxxxxx*/ 191 | count++; 192 | chararr[chararr_count++] = (char) c; 193 | break; 194 | case 12: 195 | case 13: 196 | /* 110x xxxx 10xx xxxx*/ 197 | count += 2; 198 | if (count > utflen) 199 | logger.severe( 200 | "malformed input: partial character at end"); 201 | char2 = (int) bytearr[count - 1]; 202 | if ((char2 & 0xC0) != 0x80) 203 | logger.severe( 204 | "malformed input around byte " + count); 205 | chararr[chararr_count++] = (char) (((c & 0x1F) << 
6) | 206 | (char2 & 0x3F)); 207 | break; 208 | case 14: 209 | /* 1110 xxxx 10xx xxxx 10xx xxxx */ 210 | count += 3; 211 | if (count > utflen) 212 | logger.severe( 213 | "malformed input: partial character at end"); 214 | char2 = (int) bytearr[count - 2]; 215 | char3 = (int) bytearr[count - 1]; 216 | if (((char2 & 0xC0) != 0x80) || ((char3 & 0xC0) != 0x80)) 217 | logger.severe( 218 | "malformed input around byte " + (count - 1)); 219 | chararr[chararr_count++] = (char) (((c & 0x0F) << 12) | 220 | ((char2 & 0x3F) << 6) | 221 | ((char3 & 0x3F) << 0)); 222 | break; 223 | default: 224 | /* 10xx xxxx, 1111 xxxx */ 225 | logger.severe( 226 | "malformed input around byte " + count); 227 | } 228 | } 229 | // The number of chars produced may be less than utflen 230 | return new String(chararr, 0, chararr_count); 231 | } 232 | 233 | public int getOffset() 234 | { 235 | return offset; 236 | } 237 | 238 | public int getLength() 239 | { 240 | return bytes.length; 241 | } 242 | 243 | /** 244 | * 通知执行关闭/销毁操作 245 | */ 246 | public void close() 247 | { 248 | bytes = null; 249 | } 250 | 251 | @Override 252 | protected void finalize() throws Throwable 253 | { 254 | // 如果忘记close，则在垃圾回收器释放内存的时候close，总好过完全不close 255 | close(); 256 | } 257 | } -------------------------------------------------------------------------------- /src/main/java/io/ByteUtil.java: -------------------------------------------------------------------------------- 1 | /* 2 | *

3 | * He Han 4 | * hankcs.cn@gmail.com 5 | * 2014/11/25 17:55 6 | * 7 | * 8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/ 9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information. 10 | * 11 | */ 12 | package io; 13 | 14 | import java.io.DataOutputStream; 15 | import java.io.IOException; 16 | 17 | /** 18 | * 对数字和字节进行转换。
19 | * 基础知识：
20 | * 假设数据存储是以大端模式存储的：
21 | * byte: 字节类型占8位二进制 00000000
22 | * char: 字符类型占2个字节 16位二进制 byte[0] byte[1]
23 | * int : 整数类型占4个字节 32位二进制 byte[0] byte[1] byte[2] byte[3]
24 | * long: 长整数类型占8个字节 64位二进制 byte[0] byte[1] byte[2] byte[3] byte[4] byte[5] 25 | * byte[6] byte[7]
26 | * float: 浮点数(小数) 占4个字节 32位二进制 byte[0] byte[1] byte[2] byte[3]
27 | * double: 双精度浮点数(小数) 占8个字节 64位二进制 byte[0] byte[1] byte[2] byte[3] byte[4] 28 | * byte[5] byte[6] byte[7]
/**
 * Conversions between primitive values and byte arrays.
 *
 * <p>Unless a method says otherwise, the byte order is big-endian: index 0 (or {@code start})
 * holds the most significant byte. None of the methods validate array lengths; passing an
 * array that is too short raises {@link ArrayIndexOutOfBoundsException}.
 */
public class ByteUtil
{

    /**
     * Decodes the first 2 bytes of {@code b} as a char, high byte first.
     *
     * @param b at least 2 bytes
     * @return the decoded char
     */
    public static char bytesToChar(byte[] b)
    {
        return (char) (((b[0] & 0xFF) << 8) | (b[1] & 0xFF));
    }

    /**
     * Decodes the first 8 bytes of {@code b} as a double, high byte first.
     *
     * @param b at least 8 bytes
     * @return the decoded double
     */
    public static double bytesToDouble(byte[] b)
    {
        return Double.longBitsToDouble(bytesToLong(b));
    }

    /**
     * Decodes 8 bytes starting at {@code start} as a double, high byte first.
     *
     * @param bytes source array
     * @param start index of the most significant byte
     * @return the decoded double
     */
    public static double bytesHighFirstToDouble(byte[] bytes, int start)
    {
        return Double.longBitsToDouble(readLongHighFirst(bytes, start));
    }

    /**
     * Decodes the first 4 bytes of {@code b} as a float, high byte first.
     *
     * @param b at least 4 bytes
     * @return the decoded float
     */
    public static float bytesToFloat(byte[] b)
    {
        return Float.intBitsToFloat(bytesToInt(b));
    }

    /**
     * Decodes the first 4 bytes of {@code b} as an int, high byte first.
     *
     * @param b at least 4 bytes
     * @return the decoded int
     */
    public static int bytesToInt(byte[] b)
    {
        int i = (b[0] << 24) & 0xFF000000;
        i |= (b[1] << 16) & 0xFF0000;
        i |= (b[2] << 8) & 0xFF00;
        i |= b[3] & 0xFF;
        return i;
    }

    /**
     * Decodes the first 8 bytes of {@code b} as a long, high byte first.
     *
     * @param b at least 8 bytes
     * @return the decoded long
     */
    public static long bytesToLong(byte[] b)
    {
        return readLongHighFirst(b, 0);
    }

    /**
     * Decodes the first 8 bytes of {@code b} as a long, high byte first.
     * Behaves exactly like {@link #bytesToLong(byte[])}.
     *
     * @param b at least 8 bytes
     * @return the decoded long
     */
    public static long bytesHighFirstToLong(byte[] b)
    {
        return readLongHighFirst(b, 0);
    }

    /**
     * Assembles 8 bytes starting at {@code start} into a long, most significant byte first.
     */
    private static long readLongHighFirst(byte[] bytes, int start)
    {
        long value = 0L;
        for (int i = 0; i < 8; i++)
        {
            // Mask with a long constant so the byte is widened without sign extension.
            value = (value << 8) | (bytes[start + i] & 0xFFL);
        }
        return value;
    }

    /**
     * Encodes a char as 2 bytes, high byte first.
     *
     * @param c the char to encode
     * @return a new 2-byte array
     */
    public static byte[] charToBytes(char c)
    {
        // A char is 2 bytes; the array is sized to match what bytesToChar consumes.
        byte[] b = new byte[2];
        b[0] = (byte) (c >>> 8);
        b[1] = (byte) c;
        return b;
    }

    /**
     * Encodes a double as 8 bytes, high byte first.
     *
     * @param d the double to encode
     * @return a new 8-byte array
     */
    public static byte[] doubleToBytes(double d)
    {
        return longToBytes(Double.doubleToLongBits(d));
    }

    /**
     * Encodes a float as 4 bytes, high byte first.
     *
     * @param f the float to encode
     * @return a new 4-byte array
     */
    public static byte[] floatToBytes(float f)
    {
        return intToBytes(Float.floatToIntBits(f));
    }

    /**
     * Encodes an int as 4 bytes, high byte first.
     *
     * @param i the int to encode
     * @return a new 4-byte array
     */
    public static byte[] intToBytes(int i)
    {
        byte[] b = new byte[4];
        b[0] = (byte) (i >>> 24);
        b[1] = (byte) (i >>> 16);
        b[2] = (byte) (i >>> 8);
        b[3] = (byte) i;
        return b;
    }

    /**
     * Encodes a long as 8 bytes, high byte first.
     *
     * @param l the long to encode
     * @return a new 8-byte array
     */
    public static byte[] longToBytes(long l)
    {
        byte[] b = new byte[8];
        for (int i = 0; i < 8; i++)
        {
            b[i] = (byte) (l >>> (56 - 8 * i));
        }
        return b;
    }

    /**
     * Decodes 4 bytes starting at {@code start} as an int, LOW byte first.
     * Note that this is the opposite byte order from {@link #bytesToInt(byte[])}.
     *
     * @param bytes source array
     * @param start index of the least significant byte
     * @return the decoded int
     */
    public static int bytesToInt(byte[] bytes, int start)
    {
        int num = bytes[start] & 0xFF;
        num |= ((bytes[start + 1] << 8) & 0xFF00);
        num |= ((bytes[start + 2] << 16) & 0xFF0000);
        num |= ((bytes[start + 3] << 24) & 0xFF000000);
        return num;
    }

    /**
     * Decodes 4 bytes starting at {@code start} as an int, high byte first
     * (suitable for data produced by {@code writeInt}).
     *
     * @param bytes source array
     * @param start index of the most significant byte
     * @return the decoded int
     */
    public static int bytesHighFirstToInt(byte[] bytes, int start)
    {
        int num = bytes[start + 3] & 0xFF;
        num |= ((bytes[start + 2] << 8) & 0xFF00);
        num |= ((bytes[start + 1] << 16) & 0xFF0000);
        num |= ((bytes[start] << 24) & 0xFF000000);
        return num;
    }

    /**
     * Decodes 2 bytes starting at {@code start} as a char, high byte first
     * (suitable for data produced by {@code writeChar}).
     *
     * @param bytes source array
     * @param start index of the most significant byte
     * @return the decoded char
     */
    public static char bytesHighFirstToChar(byte[] bytes, int start)
    {
        return (char) (((bytes[start] & 0xFF) << 8) | (bytes[start + 1] & 0xFF));
    }

    /**
     * Decodes 4 bytes starting at {@code start} as a float, high byte first.
     *
     * @param bytes source array
     * @param start index of the most significant byte
     * @return the decoded float
     */
    public static float bytesHighFirstToFloat(byte[] bytes, int start)
    {
        return Float.intBitsToFloat(bytesHighFirstToInt(bytes, start));
    }

    /**
     * Writes the low 16 bits of {@code uint} as two bytes, high byte first.
     * Despite the method name, only 2 bytes are emitted; bits 16..31 are discarded.
     *
     * @param out  destination stream
     * @param uint value whose low 16 bits are written
     * @throws IOException if the underlying stream fails
     */
    public static void writeUnsignedInt(DataOutputStream out, int uint) throws IOException
    {
        out.writeByte((byte) ((uint >>> 8) & 0xFF));
        out.writeByte((byte) ((uint >>> 0) & 0xFF));
    }

    /**
     * Packs two chars into one int: {@code high} in bits 16..31, {@code low} in bits 0..15.
     *
     * @param high char stored in the upper half
     * @param low  char stored in the lower half
     * @return the packed int
     */
    public static int convertTwoCharToInt(char high, char low)
    {
        return (high << 16) | low;
    }

    /**
     * Splits an int into two chars; the inverse of {@link #convertTwoCharToInt(char, char)}.
     *
     * @param n the packed int
     * @return a 2-element array: index 0 holds the upper 16 bits, index 1 the lower 16 bits
     */
    public static char[] convertIntToTwoChar(int n)
    {
        char[] result = new char[2];
        result[0] = (char) (n >>> 16);
        result[1] = (char) (0x0000FFFF & n);
        return result;
    }
}
// --------------------------------------------------------------------------------
// /src/main/java/io/IOUtil.java:
// --------------------------------------------------------------------------------

3 | * He Han 4 | * hankcs.cn@gmail.com 5 | * 2014/9/8 23:04 6 | * 7 | * 8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/ 9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information. 10 | * 11 | */ 12 | package io; 13 | 14 | 15 | import util.TextUtility; 16 | 17 | import java.io.*; 18 | import java.nio.ByteBuffer; 19 | import java.nio.channels.FileChannel; 20 | import java.nio.charset.Charset; 21 | import java.util.*; 22 | 23 | import static config.Logger.logger; 24 | 25 | 26 | /** 27 | * 一些常用的IO操作 28 | * 29 | * @author hankcs 30 | */ 31 | public class IOUtil { 32 | /** 33 | * 序列化对象 34 | * 35 | * @param o 36 | * @param path 37 | * @return 38 | */ 39 | public static boolean saveObjectTo(Object o, String path) { 40 | try { 41 | ObjectOutputStream oos = new ObjectOutputStream(IOUtil.newOutputStream(path)); 42 | oos.writeObject(o); 43 | oos.close(); 44 | } catch (IOException e) { 45 | //logger.warning("在保存对象" + o + "到" + path + "时发生异常" + e); 46 | return false; 47 | } 48 | 49 | return true; 50 | } 51 | 52 | /** 53 | * 反序列化对象 54 | * 55 | * @param path 56 | * @return 57 | */ 58 | public static Object readObjectFrom(String path) { 59 | ObjectInputStream ois = null; 60 | try { 61 | ois = new ObjectInputStream(IOUtil.newInputStream(path)); 62 | Object o = ois.readObject(); 63 | ois.close(); 64 | return o; 65 | } catch (Exception e) { 66 | // logger.warning("在从" + path + "读取对象时发生异常" + e); 67 | } 68 | 69 | return null; 70 | } 71 | 72 | /** 73 | * 一次性读入纯文本 74 | * 75 | * @param path 76 | * @return 77 | */ 78 | public static String readTxt(String path) { 79 | if (path == null) return null; 80 | try { 81 | InputStream in = new FileInputStream(path); 82 | 83 | byte[] fileContent = new byte[in.available()]; 84 | int read = readBytesFromOtherInputStream(in, fileContent); 85 | in.close(); 86 | // 处理 UTF-8 BOM 87 | if (read >= 3 && fileContent[0] == -17 && fileContent[1] == -69 && fileContent[2] 
== -65) 88 | return new String(fileContent, 3, fileContent.length - 3, Charset.forName("UTF-8")); 89 | return new String(fileContent, Charset.forName("UTF-8")); 90 | } catch (FileNotFoundException e) { 91 | //logger.warning("找不到" + path + e); 92 | return null; 93 | } catch (IOException e) { 94 | // logger.warning("读取" + path + "发生IO异常" + e); 95 | return null; 96 | } 97 | } 98 | 99 | public static LinkedList readCsv(String path) { 100 | LinkedList resultList = new LinkedList(); 101 | LinkedList lineList = readLineList(path); 102 | for (String line : lineList) { 103 | resultList.add(line.split(",")); 104 | } 105 | return resultList; 106 | } 107 | 108 | /** 109 | * 快速保存 110 | * 111 | * @param path 112 | * @param content 113 | * @return 114 | */ 115 | public static boolean saveTxt(String path, String content) { 116 | try { 117 | FileChannel fc = new FileOutputStream(path).getChannel(); 118 | fc.write(ByteBuffer.wrap(content.getBytes())); 119 | fc.close(); 120 | } catch (Exception e) { 121 | // logger.throwing("IOUtil", "saveTxt", e); 122 | // logger.warning("IOUtil saveTxt 到" + path + "失败" + e.toString()); 123 | return false; 124 | } 125 | return true; 126 | } 127 | 128 | public static boolean saveTxt(String path, StringBuilder content) { 129 | return saveTxt(path, content.toString()); 130 | } 131 | 132 | public static boolean saveCollectionToTxt(Collection collection, String path) { 133 | StringBuilder sb = new StringBuilder(); 134 | for (Object o : collection) { 135 | sb.append(o); 136 | sb.append('\n'); 137 | } 138 | return saveTxt(path, sb.toString()); 139 | } 140 | 141 | /** 142 | * 将整个文件读取为字节数组 143 | * 144 | * @param path 145 | * @return 146 | */ 147 | public static byte[] readBytes(String path) { 148 | { 149 | try { 150 | return readBytesFromFileInputStream(new FileInputStream(path)); 151 | } catch (Exception e) { 152 | logger.warning("读取" + path + "时发生异常" + e); 153 | } 154 | return null; 155 | } 156 | } 157 | 158 | public static String readTxt(String file, 
String charsetName) throws IOException { 159 | InputStream is = new FileInputStream(file); 160 | byte[] targetArray = new byte[is.available()]; 161 | int len; 162 | int off = 0; 163 | while ((len = is.read(targetArray, off, targetArray.length - off)) != -1 && off < targetArray.length) { 164 | off += len; 165 | } 166 | is.close(); 167 | 168 | return new String(targetArray, charsetName); 169 | } 170 | 171 | public static String baseName(String path) { 172 | if (path == null || path.length() == 0) 173 | return ""; 174 | path = path.replaceAll("[/\\\\]+", "/"); 175 | int len = path.length(), 176 | upCount = 0; 177 | while (len > 0) { 178 | //remove trailing separator 179 | if (path.charAt(len - 1) == '/') { 180 | len--; 181 | if (len == 0) 182 | return ""; 183 | } 184 | int lastInd = path.lastIndexOf('/', len - 1); 185 | String fileName = path.substring(lastInd + 1, len); 186 | if (fileName.equals(".")) { 187 | len--; 188 | } else if (fileName.equals("..")) { 189 | len -= 2; 190 | upCount++; 191 | } else { 192 | if (upCount == 0) 193 | return fileName; 194 | upCount--; 195 | len -= fileName.length(); 196 | } 197 | } 198 | return ""; 199 | } 200 | 201 | private static byte[] readBytesFromFileInputStream(FileInputStream fis) throws IOException { 202 | FileChannel channel = fis.getChannel(); 203 | int fileSize = (int) channel.size(); 204 | ByteBuffer byteBuffer = ByteBuffer.allocate(fileSize); 205 | channel.read(byteBuffer); 206 | byteBuffer.flip(); 207 | byte[] bytes = byteBuffer.array(); 208 | byteBuffer.clear(); 209 | channel.close(); 210 | fis.close(); 211 | return bytes; 212 | } 213 | 214 | /** 215 | * 将非FileInputStream的某InputStream中的全部数据读入到字节数组中 216 | * 217 | * @param is 218 | * @return 219 | * @throws IOException 220 | */ 221 | public static byte[] readBytesFromOtherInputStream(InputStream is) throws IOException { 222 | ByteArrayOutputStream data = new ByteArrayOutputStream(); 223 | 224 | int readBytes; 225 | byte[] buffer = new byte[Math.max(is.available(), 
4096)]; // 最低4KB的缓冲区 226 | 227 | while ((readBytes = is.read(buffer, 0, buffer.length)) != -1) { 228 | data.write(buffer, 0, readBytes); 229 | } 230 | 231 | data.flush(); 232 | 233 | return data.toByteArray(); 234 | } 235 | 236 | /** 237 | * 从InputStream读取指定长度的字节出来 238 | * 239 | * @param is 流 240 | * @param targetArray output 241 | * @return 实际读取了多少字节，返回0表示遇到了文件尾部 242 | * @throws IOException 243 | */ 244 | public static int readBytesFromOtherInputStream(InputStream is, byte[] targetArray) throws IOException { 245 | assert targetArray != null; 246 | if (targetArray.length == 0) return 0; 247 | int len; 248 | int off = 0; 249 | while (off < targetArray.length && (len = is.read(targetArray, off, targetArray.length - off)) != -1) { 250 | off += len; 251 | } 252 | return off; 253 | } 254 | 255 | public static LinkedList readLineList(String path) { 256 | LinkedList result = new LinkedList(); 257 | String txt = readTxt(path); 258 | if (txt == null) return result; 259 | StringTokenizer tokenizer = new StringTokenizer(txt, "\n"); 260 | while (tokenizer.hasMoreTokens()) { 261 | result.add(tokenizer.nextToken()); 262 | } 263 | 264 | return result; 265 | } 266 | 267 | /** 268 | * 用省内存的方式读取大文件 269 | * 270 | * @param path 271 | * @return 272 | */ 273 | public static LinkedList readLineListWithLessMemory(String path) { 274 | LinkedList result = new LinkedList(); 275 | String line = null; 276 | boolean first = true; 277 | try { 278 | BufferedReader bw = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(path), "UTF-8")); 279 | while ((line = bw.readLine()) != null) { 280 | if (first) { 281 | first = false; 282 | if (!line.isEmpty() && line.charAt(0) == '\uFEFF') 283 | line = line.substring(1); 284 | } 285 | result.add(line); 286 | } 287 | bw.close(); 288 | } catch (Exception e) { 289 | logger.warning("加载" + path + "失败，" + e); 290 | } 291 | 292 | return result; 293 | } 294 | 295 | public static boolean saveMapToTxt(Map map, String path) { 296 | return saveMapToTxt(map, 
path, "="); 297 | } 298 | 299 | public static boolean saveMapToTxt(Map map, String path, String separator) { 300 | map = new TreeMap(map); 301 | return saveEntrySetToTxt(map.entrySet(), path, separator); 302 | } 303 | 304 | public static boolean saveEntrySetToTxt(Set> entrySet, String path, String separator) { 305 | StringBuilder sbOut = new StringBuilder(); 306 | for (Map.Entry entry : entrySet) { 307 | sbOut.append(entry.getKey()); 308 | sbOut.append(separator); 309 | sbOut.append(entry.getValue()); 310 | sbOut.append('\n'); 311 | } 312 | return saveTxt(path, sbOut.toString()); 313 | } 314 | 315 | /** 316 | * 获取文件所在目录的路径 317 | * 318 | * @param path 319 | * @return 320 | */ 321 | public static String dirname(String path) { 322 | int index = path.lastIndexOf('/'); 323 | if (index == -1) return path; 324 | return path.substring(0, index + 1); 325 | } 326 | 327 | public static LineIterator readLine(String path) { 328 | return new LineIterator(path); 329 | } 330 | 331 | /** 332 | * 删除本地文件 333 | * 334 | * @param path 335 | * @return 336 | */ 337 | public static boolean deleteFile(String path) { 338 | return new File(path).delete(); 339 | } 340 | 341 | /** 342 | * 去除文件第一行中的UTF8 BOM
343 | * 这是Java的bug，且官方不会修复。参考 https://stackoverflow.com/questions/4897876/reading-utf-8-bom-marker 344 | * 345 | * @param line 文件第一行 346 | * @return 去除BOM的部分 347 | */ 348 | public static String removeUTF8BOM(String line) { 349 | if (line != null && line.startsWith("\uFEFF")) // UTF-8 byte order mark (EF BB BF) 350 | { 351 | line = line.substring(1); 352 | } 353 | return line; 354 | } 355 | 356 | /** 357 | * 递归遍历获取目录下的所有文件 358 | * 359 | * @param path 根目录 360 | * @return 文件列表 361 | */ 362 | public static List fileList(String path) { 363 | List fileList = new LinkedList(); 364 | File folder = new File(path); 365 | if (folder.isDirectory()) 366 | enumerate(folder, fileList); 367 | else 368 | fileList.add(folder); // 兼容路径为文件的情况 369 | return fileList; 370 | } 371 | 372 | /** 373 | * 递归遍历目录 374 | * 375 | * @param folder 目录 376 | * @param fileList 储存文件 377 | */ 378 | private static void enumerate(File folder, List fileList) { 379 | File[] fileArray = folder.listFiles(); 380 | if (fileArray != null) { 381 | for (File file : fileArray) { 382 | if (file.isFile() && !file.getName().startsWith(".")) // 过滤隐藏文件 383 | { 384 | fileList.add(file); 385 | } else { 386 | enumerate(file, fileList); 387 | } 388 | } 389 | } 390 | } 391 | 392 | /** 393 | * 方便读取按行读取大文件 394 | */ 395 | public static class LineIterator implements Iterator, Iterable { 396 | BufferedReader bw; 397 | String line; 398 | 399 | public LineIterator(BufferedReader bw) { 400 | this.bw = bw; 401 | try { 402 | line = bw.readLine(); 403 | line = IOUtil.removeUTF8BOM(line); 404 | } catch (IOException e) { 405 | logger.warning("在读取过程中发生错误" + TextUtility.exceptionToString(e)); 406 | bw = null; 407 | } 408 | } 409 | 410 | public LineIterator(String path) { 411 | try { 412 | bw = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(path), "UTF-8")); 413 | line = bw.readLine(); 414 | line = IOUtil.removeUTF8BOM(line); 415 | } catch (FileNotFoundException e) { 416 | logger.warning("文件" + path + "不存在，接下来的调用会返回null\n" + 
TextUtility.exceptionToString(e)); 417 | bw = null; 418 | } catch (IOException e) { 419 | logger.warning("在读取过程中发生错误" + TextUtility.exceptionToString(e)); 420 | bw = null; 421 | } 422 | } 423 | 424 | public void close() { 425 | if (bw == null) return; 426 | try { 427 | bw.close(); 428 | bw = null; 429 | } catch (IOException e) { 430 | logger.warning("关闭文件失败" + TextUtility.exceptionToString(e)); 431 | } 432 | return; 433 | } 434 | 435 | @Override 436 | public boolean hasNext() { 437 | if (bw == null) return false; 438 | if (line == null) { 439 | try { 440 | bw.close(); 441 | bw = null; 442 | } catch (IOException e) { 443 | logger.warning("关闭文件失败" + TextUtility.exceptionToString(e)); 444 | } 445 | return false; 446 | } 447 | 448 | return true; 449 | } 450 | 451 | @Override 452 | public String next() { 453 | String preLine = line; 454 | try { 455 | if (bw != null) { 456 | line = bw.readLine(); 457 | if (line == null && bw != null) { 458 | try { 459 | bw.close(); 460 | bw = null; 461 | } catch (IOException e) { 462 | logger.warning("关闭文件失败" + TextUtility.exceptionToString(e)); 463 | } 464 | } 465 | } else { 466 | line = null; 467 | } 468 | } catch (IOException e) { 469 | logger.warning("在读取过程中发生错误" + TextUtility.exceptionToString(e)); 470 | } 471 | return preLine; 472 | } 473 | 474 | @Override 475 | public void remove() { 476 | throw new UnsupportedOperationException("只读，不可写！"); 477 | } 478 | 479 | @Override 480 | public Iterator iterator() { 481 | return this; 482 | } 483 | } 484 | 485 | /** 486 | * 创建一个BufferedWriter 487 | * 488 | * @param path 489 | * @return 490 | * @throws FileNotFoundException 491 | * @throws UnsupportedEncodingException 492 | */ 493 | public static BufferedWriter newBufferedWriter(String path) throws IOException { 494 | return new BufferedWriter(new OutputStreamWriter(IOUtil.newOutputStream(path), "UTF-8")); 495 | } 496 | 497 | /** 498 | * 创建一个BufferedReader 499 | * 500 | * @param path 501 | * @return 502 | * @throws FileNotFoundException 503 | 
* @throws UnsupportedEncodingException 504 | */ 505 | public static BufferedReader newBufferedReader(String path) throws IOException { 506 | return new BufferedReader(new InputStreamReader(IOUtil.newInputStream(path), "UTF-8")); 507 | } 508 | 509 | public static BufferedWriter newBufferedWriter(String path, boolean append) throws FileNotFoundException, UnsupportedEncodingException { 510 | return new BufferedWriter(new OutputStreamWriter(new FileOutputStream(path, append), "UTF-8")); 511 | } 512 | 513 | /** 514 | * 创建输入流（经过IO适配器创建） 515 | * 516 | * @param path 517 | * @return 518 | * @throws IOException 519 | */ 520 | public static InputStream newInputStream(String path) throws IOException { 521 | return new FileInputStream(path); 522 | } 523 | 524 | /** 525 | * 创建输出流（经过IO适配器创建） 526 | * 527 | * @param path 528 | * @return 529 | * @throws IOException 530 | */ 531 | public static OutputStream newOutputStream(String path) throws IOException { 532 | return new FileOutputStream(path); 533 | } 534 | 535 | /** 536 | * 获取最后一个分隔符的后缀 537 | * 538 | * @param name 539 | * @param delimiter 540 | * @return 541 | */ 542 | public static String getSuffix(String name, String delimiter) { 543 | return name.substring(name.lastIndexOf(delimiter) + 1); 544 | } 545 | 546 | /** 547 | * 写数组，用制表符分割 548 | * 549 | * @param bw 550 | * @param params 551 | * @throws IOException 552 | */ 553 | public static void writeLine(BufferedWriter bw, String... params) throws IOException { 554 | for (int i = 0; i < params.length - 1; i++) { 555 | bw.write(params[i]); 556 | bw.write('\t'); 557 | } 558 | bw.write(params[params.length - 1]); 559 | } 560 | 561 | /* *//** 562 | * 加载词典，词典必须遵守HanLP核心词典格式 563 | * @param pathArray 词典路径，可以有任意个。每个路径支持用空格表示默认词性，比如“全国地名大全.txt ns” 564 | * @return 一个储存了词条的map 565 | * @throws IOException 异常表示加载失败 566 | *//* 567 | public static TreeMap loadDictionary(String... 
pathArray) throws IOException 568 | { 569 | TreeMap map = new TreeMap(); 570 | for (String path : pathArray) 571 | { 572 | File file = new File(path); 573 | String fileName = file.getName(); 574 | int natureIndex = fileName.lastIndexOf(' '); 575 | Nature defaultNature = Nature.n; 576 | if (natureIndex > 0) 577 | { 578 | String natureString = fileName.substring(natureIndex + 1); 579 | path = file.getParent() + File.separator + fileName.substring(0, natureIndex); 580 | if (natureString.length() > 0 && !natureString.endsWith(".txt") && !natureString.endsWith(".csv")) 581 | { 582 | defaultNature = Nature.create(natureString); 583 | } 584 | } 585 | BufferedReader br = new BufferedReader(new InputStreamReader(IOUtil.newInputStream(path), "UTF-8")); 586 | loadDictionary(br, map, path.endsWith(".csv"), defaultNature); 587 | } 588 | 589 | return map; 590 | }*/ 591 | 592 | /* *//** 593 | * 将一个BufferedReader中的词条加载到词典 594 | * @param br 源 595 | * @param storage 储存位置 596 | * @throws IOException 异常表示加载失败 597 | *//* 598 | public static void loadDictionary(BufferedReader br, TreeMap storage, boolean isCSV, Nature defaultNature) throws IOException 599 | { 600 | String splitter = "\\s"; 601 | if (isCSV) 602 | { 603 | splitter = ","; 604 | } 605 | String line; 606 | boolean firstLine = true; 607 | while ((line = br.readLine()) != null) 608 | { 609 | if (firstLine) 610 | { 611 | line = IOUtil.removeUTF8BOM(line); 612 | firstLine = false; 613 | } 614 | String param[] = line.split(splitter); 615 | 616 | int natureCount = (param.length - 1) / 2; 617 | CoreDictionary.Attribute attribute; 618 | if (natureCount == 0) 619 | { 620 | attribute = new CoreDictionary.Attribute(defaultNature); 621 | } 622 | else 623 | { 624 | attribute = new CoreDictionary.Attribute(natureCount); 625 | for (int i = 0; i < natureCount; ++i) 626 | { 627 | attribute.nature[i] = LexiconUtility.convertStringToNature(param[1 + 2 * i]); 628 | attribute.frequency[i] = Integer.parseInt(param[2 + 2 * i]); 629 | 
attribute.totalFrequency += attribute.frequency[i]; 630 | } 631 | } 632 | storage.put(param[0], attribute); 633 | } 634 | br.close(); 635 | } 636 | 637 | public static void writeCustomNature(DataOutputStream out, LinkedHashSet customNatureCollector) throws IOException 638 | { 639 | if (customNatureCollector.size() == 0) return; 640 | out.writeInt(-customNatureCollector.size()); 641 | for (Nature nature : customNatureCollector) 642 | { 643 | TextUtility.writeString(nature.toString(), out); 644 | } 645 | }*/ 646 | 647 | /** 648 | * 本地文件是否存在 649 | * 650 | * @param path 651 | * @return 652 | */ 653 | public static boolean isFileExisted(String path) { 654 | File file = new File(path); 655 | return file.isFile() && file.exists(); 656 | } 657 | } 658 | -------------------------------------------------------------------------------- /src/main/java/pojo/CharType.java: -------------------------------------------------------------------------------- 1 | /* 2 | *

/**
 * Character type codes plus a lookup table holding one code per UTF-16 char.
 *
 * The loader that filled the table from a file is disabled in this source,
 * so every entry starts at 0 until set(char, byte) is called or a caller
 * assigns a prepared array to type.
 *
 * @author hankcs
 */
public class CharType
{
    /**
     * Single-byte character
     */
    public static final byte CT_SINGLE = 5;

    /**
     * Delimiter such as "!,.?()[]{}+=
     */
    public static final byte CT_DELIMITER = CT_SINGLE + 1;

    /**
     * Chinese character
     */
    public static final byte CT_CHINESE = CT_SINGLE + 2;

    /**
     * Letter
     */
    public static final byte CT_LETTER = CT_SINGLE + 3;

    /**
     * Digit
     */
    public static final byte CT_NUM = CT_SINGLE + 4;

    /**
     * Ordinal / index number
     */
    public static final byte CT_INDEX = CT_SINGLE + 5;

    /**
     * Chinese numeral
     */
    public static final byte CT_CNUM = CT_SINGLE + 6;

    /**
     * Anything else
     */
    public static final byte CT_OTHER = CT_SINGLE + 12;

    /**
     * Type code for every char value, indexed by the char itself.
     * Allocated eagerly: with the loader disabled the field used to stay null,
     * which made get and set fail with NullPointerException.
     */
    public static byte[] type = new byte[Character.MAX_VALUE + 1];

    /**
     * Looks up the type of a character.
     *
     * @param c character to classify
     * @return the stored type code (0 when none has been stored)
     */
    public static byte get(char c)
    {
        return type[(int) c];
    }

    /**
     * Stores the type of a character.
     *
     * @param c character
     * @param t type code
     */
    public static void set(char c, byte t)
    {
        type[c] = t;
    }
}

3 | * He Han 4 | * hankcs.cn@gmail.com 5 | * 2014/9/8 17:42 6 | * 7 | * 8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/ 9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information. 10 | * 11 | */ 12 | package pojo; 13 | 14 | 15 | import java.util.Iterator; 16 | import java.util.LinkedList; 17 | import java.util.List; 18 | 19 | import static util.Predefine.logger; 20 | 21 | /** 22 | * 复合词，由两个或以上的word构成 23 | * @author hankcs 24 | */ 25 | public class CompoundWord implements IWord, Iterable 26 | { 27 | /** 28 | * 由这些词复合而来 29 | */ 30 | public List innerList; 31 | 32 | /** 33 | * 标签，通常是词性 34 | */ 35 | public String label; 36 | 37 | @Override 38 | public String getValue() 39 | { 40 | StringBuilder sb = new StringBuilder(); 41 | for (Word word : innerList) 42 | { 43 | sb.append(word.value); 44 | } 45 | return sb.toString(); 46 | } 47 | 48 | @Override 49 | public String getLabel() 50 | { 51 | return label; 52 | } 53 | 54 | @Override 55 | public void setLabel(String label) 56 | { 57 | this.label = label; 58 | } 59 | 60 | @Override 61 | public void setValue(String value) 62 | { 63 | innerList.clear(); 64 | innerList.add(new Word(value, label)); 65 | } 66 | 67 | @Override 68 | public int length() 69 | { 70 | return getValue().length(); 71 | } 72 | 73 | @Override 74 | public String toString() 75 | { 76 | StringBuilder sb = new StringBuilder(); 77 | sb.append('['); 78 | int i = 1; 79 | for (Word word : innerList) 80 | { 81 | sb.append(word.getValue()); 82 | String label = word.getLabel(); 83 | if (label != null) 84 | { 85 | sb.append('/').append(label); 86 | } 87 | if (i != innerList.size()) 88 | { 89 | sb.append(' '); 90 | } 91 | ++i; 92 | } 93 | sb.append("]/"); 94 | sb.append(label); 95 | return sb.toString(); 96 | } 97 | 98 | /** 99 | * 转换为一个简单词 100 | * @return 101 | */ 102 | public Word toWord() 103 | { 104 | return new Word(getValue(), getLabel()); 105 | } 106 | 107 | public 
CompoundWord(List innerList, String label) 108 | { 109 | this.innerList = innerList; 110 | this.label = label; 111 | } 112 | 113 | public static CompoundWord create(String param) 114 | { 115 | if (param == null) return null; 116 | int cutIndex = param.lastIndexOf(']'); 117 | if (cutIndex <= 2 || cutIndex == param.length() - 1) return null; 118 | String wordParam = param.substring(1, cutIndex); 119 | List wordList = new LinkedList(); 120 | for (String single : wordParam.split("\\s+")) 121 | { 122 | if (single.length() == 0) continue; 123 | Word word = Word.create(single); 124 | if (word == null) 125 | { 126 | logger.warning("使用参数" + single + "构造单词时发生错误"); 127 | return null; 128 | } 129 | wordList.add(word); 130 | } 131 | String labelParam = param.substring(cutIndex + 1); 132 | if (labelParam.startsWith("/")) 133 | { 134 | labelParam = labelParam.substring(1); 135 | } 136 | return new CompoundWord(wordList, labelParam); 137 | } 138 | 139 | @Override 140 | public Iterator iterator() 141 | { 142 | return innerList.iterator(); 143 | } 144 | } 145 | -------------------------------------------------------------------------------- /src/main/java/pojo/IWord.java: -------------------------------------------------------------------------------- 1 | /* 2 | *

/**
 * Plain holder pairing a segment string with a count.
 */
public class SegMsg {
    /** The segment text. */
    public String seg;
    /** The count stored alongside the segment. */
    public int count;

    /** Creates an empty holder: null segment, zero count. */
    public SegMsg() {
    }

    /**
     * @param seg   segment text
     * @param count count to store with it
     */
    public SegMsg(String seg, int count) {
        this.seg = seg;
        this.count = count;
    }

    /**
     * @return "SegMsg{seg='...', count=N}"
     */
    @Override
    public String toString() {
        StringBuilder text = new StringBuilder("SegMsg{seg='");
        text.append(seg).append('\'');
        text.append(", count=").append(count);
        return text.append('}').toString();
    }
}

3 | * He Han 4 | * hankcs.cn@gmail.com 5 | * 2014/9/8 18:04 6 | * 7 | * 8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/ 9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information. 10 | * 11 | */ 12 | package pojo; 13 | 14 | 15 | 16 | import java.io.Serializable; 17 | import java.util.Iterator; 18 | import java.util.LinkedList; 19 | import java.util.List; 20 | import java.util.ListIterator; 21 | import java.util.regex.Matcher; 22 | import java.util.regex.Pattern; 23 | 24 | import static util.Predefine.logger; 25 | 26 | 27 | /** 28 | * 句子，指的是以。！等标点结尾的句子 29 | * 30 | * @author hankcs 31 | */ 32 | public class Sentence implements Serializable, Iterable 33 | { 34 | /** 35 | * 词语列表（复合或简单单词的列表） 36 | */ 37 | public List wordList; 38 | 39 | public Sentence(List wordList) 40 | { 41 | this.wordList = wordList; 42 | } 43 | 44 | @Override 45 | public String toString() 46 | { 47 | StringBuilder sb = new StringBuilder(size() * 4); 48 | int i = 1; 49 | for (IWord word : wordList) 50 | { 51 | sb.append(word); 52 | if (i != wordList.size()) sb.append(' '); 53 | ++i; 54 | } 55 | return sb.toString(); 56 | } 57 | 58 | /** 59 | * 转换为空格分割无标签的String 60 | * 61 | * @return 62 | */ 63 | public String toStringWithoutLabels() 64 | { 65 | StringBuilder sb = new StringBuilder(size() * 4); 66 | int i = 1; 67 | for (IWord word : wordList) 68 | { 69 | if (word instanceof CompoundWord) 70 | { 71 | int j = 0; 72 | for (Word w : ((CompoundWord) word).innerList) 73 | { 74 | sb.append(w.getValue()); 75 | if (++j != ((CompoundWord) word).innerList.size()) 76 | sb.append(' '); 77 | } 78 | } 79 | else 80 | sb.append(word.getValue()); 81 | if (i != wordList.size()) sb.append(' '); 82 | ++i; 83 | } 84 | return sb.toString(); 85 | } 86 | 87 | /** 88 | * brat standoff format
89 | * http://brat.nlplab.org/standoff.html 90 | * 91 | * @return 92 | */ 93 | /* public String toStandoff() 94 | { 95 | return toStandoff(false); 96 | }*/ 97 | 98 | /* *//** 99 | * brat standoff format
100 | * http://brat.nlplab.org/standoff.html 101 | * 102 | * @param withComment 103 | * @return 104 | *//* 105 | public String toStandoff(boolean withComment) 106 | { 107 | StringBuilder sb = new StringBuilder(size() * 4); 108 | String delimiter = " "; 109 | String text = text(delimiter); 110 | sb.append(text).append('\n'); 111 | int i = 1; 112 | int offset = 0; 113 | for (IWord word : wordList) 114 | { 115 | assert text.charAt(offset) == word.getValue().charAt(0); 116 | printWord(word, sb, i, offset, withComment); 117 | ++i; 118 | if (word instanceof CompoundWord) 119 | { 120 | int offsetChild = offset; 121 | for (Word child : ((CompoundWord) word).innerList) 122 | { 123 | printWord(child, sb, i, offsetChild, withComment); 124 | offsetChild += child.length(); 125 | offsetChild += delimiter.length(); 126 | ++i; 127 | } 128 | offset += delimiter.length() * ((CompoundWord) word).innerList.size(); 129 | } 130 | else 131 | { 132 | offset += delimiter.length(); 133 | } 134 | offset += word.length(); 135 | } 136 | return sb.toString(); 137 | }*/ 138 | 139 | /* *//** 140 | * 按照 PartOfSpeechTagDictionary 指定的映射表将词语词性翻译过去 141 | * 142 | * @return 143 | *//* 144 | public Sentence translateLabels() 145 | { 146 | for (IWord word : wordList) 147 | { 148 | word.setLabel(PartOfSpeechTagDictionary.translate(word.getLabel())); 149 | if (word instanceof CompoundWord) 150 | { 151 | for (Word child : ((CompoundWord) word).innerList) 152 | { 153 | child.setLabel(PartOfSpeechTagDictionary.translate(child.getLabel())); 154 | } 155 | } 156 | } 157 | return this; 158 | }*/ 159 | 160 | /** 161 | * 按照 PartOfSpeechTagDictionary 指定的映射表将复合词词语词性翻译过去 162 | * 163 | * @return 164 | */ 165 | /* public Sentence translateCompoundWordLabels() 166 | { 167 | for (IWord word : wordList) 168 | { 169 | if (word instanceof CompoundWord) 170 | word.setLabel(PartOfSpeechTagDictionary.translate(word.getLabel())); 171 | } 172 | return this; 173 | }*/ 174 | 175 | /* private void printWord(IWord word, StringBuilder 
sb, int id, int offset) 176 | { 177 | printWord(word, sb, id, offset, false); 178 | }*/ 179 | 180 | /* private void printWord(IWord word, StringBuilder sb, int id, int offset, boolean withComment) 181 | { 182 | char delimiter = '\t'; 183 | char endLine = '\n'; 184 | sb.append('T').append(id).append(delimiter); 185 | sb.append(word.getLabel()).append(delimiter); 186 | int length = word.length(); 187 | if (word instanceof CompoundWord) 188 | { 189 | length += ((CompoundWord) word).innerList.size() - 1; 190 | } 191 | sb.append(offset).append(delimiter).append(offset + length).append(delimiter); 192 | sb.append(word.getValue()).append(endLine); 193 | String translated = PartOfSpeechTagDictionary.translate(word.getLabel()); 194 | if (withComment && !word.getLabel().equals(translated)) 195 | { 196 | sb.append('#').append(id).append(delimiter).append("AnnotatorNotes").append(delimiter) 197 | .append('T').append(id).append(delimiter).append(translated) 198 | .append(endLine); 199 | } 200 | }*/ 201 | 202 | /** 203 | * 以人民日报2014语料格式的字符串创建一个结构化句子 204 | * 205 | * @param param 206 | * @return 207 | */ 208 | public static Sentence create(String param) 209 | { 210 | if (param == null) 211 | { 212 | return null; 213 | } 214 | param = param.trim(); 215 | if (param.isEmpty()) 216 | { 217 | return null; 218 | } 219 | Pattern pattern = Pattern.compile("(\\[(([^\\s]+/[0-9a-zA-Z]+)\\s+)+?([^\\s]+/[0-9a-zA-Z]+)]/?[0-9a-zA-Z]+)|([^\\s]+/[0-9a-zA-Z]+)"); 220 | Matcher matcher = pattern.matcher(param); 221 | List wordList = new LinkedList(); 222 | while (matcher.find()) 223 | { 224 | String single = matcher.group(); 225 | IWord word = WordFactory.create(single); 226 | if (word == null) 227 | { 228 | logger.warning("在用 " + single + " 构造单词时失败，句子构造参数为 " + param); 229 | return null; 230 | } 231 | wordList.add(word); 232 | } 233 | if (wordList.isEmpty()) // 按照无词性来解析 234 | { 235 | for (String w : param.split("\\s+")) 236 | { 237 | wordList.add(new Word(w, null)); 238 | } 239 | } 240 | 241 | 
return new Sentence(wordList); 242 | } 243 | 244 | /** 245 | * 句子中单词（复合词或简单词）的数量 246 | * 247 | * @return 248 | */ 249 | public int size() 250 | { 251 | return wordList.size(); 252 | } 253 | 254 | /** 255 | * 句子文本长度 256 | * 257 | * @return 258 | */ 259 | public int length() 260 | { 261 | int length = 0; 262 | for (IWord word : this) 263 | { 264 | length += word.getValue().length(); 265 | } 266 | 267 | return length; 268 | } 269 | 270 | /** 271 | * 原始文本形式（无标注，raw text） 272 | * 273 | * @return 274 | */ 275 | public String text() 276 | { 277 | return text(null); 278 | } 279 | 280 | /** 281 | * 原始文本形式（无标注，raw text） 282 | * 283 | * @param delimiter 词语之间的分隔符 284 | * @return 285 | */ 286 | public String text(String delimiter) 287 | { 288 | if (delimiter == null) delimiter = ""; 289 | StringBuilder sb = new StringBuilder(size() * 3); 290 | for (IWord word : this) 291 | { 292 | if (word instanceof CompoundWord) 293 | { 294 | for (Word child : ((CompoundWord) word).innerList) 295 | { 296 | sb.append(child.getValue()).append(delimiter); 297 | } 298 | } 299 | else 300 | { 301 | sb.append(word.getValue()).append(delimiter); 302 | } 303 | } 304 | sb.setLength(sb.length() - delimiter.length()); 305 | 306 | return sb.toString(); 307 | } 308 | 309 | @Override 310 | public Iterator iterator() 311 | { 312 | return wordList.iterator(); 313 | } 314 | 315 | /** 316 | * 找出所有词性为label的单词（不检查复合词内部的简单词） 317 | * 318 | * @param label 319 | * @return 320 | */ 321 | public List findWordsByLabel(String label) 322 | { 323 | List wordList = new LinkedList(); 324 | for (IWord word : this) 325 | { 326 | if (label.equals(word.getLabel())) 327 | { 328 | wordList.add(word); 329 | } 330 | } 331 | return wordList; 332 | } 333 | 334 | /** 335 | * 找出第一个词性为label的单词（不检查复合词内部的简单词） 336 | * 337 | * @param label 338 | * @return 339 | */ 340 | public IWord findFirstWordByLabel(String label) 341 | { 342 | for (IWord word : this) 343 | { 344 | if (label.equals(word.getLabel())) 345 | { 346 | return word; 347 | } 348 
| } 349 | return null; 350 | } 351 | 352 | /** 353 | * 找出第一个词性为label的单词的指针（不检查复合词内部的简单词）
354 | * 若要查看该单词，请调用 previous
355 | * 若要删除该单词，请调用 remove
356 | * 357 | * @param label 358 | * @return 359 | */ 360 | public ListIterator findFirstWordIteratorByLabel(String label) 361 | { 362 | ListIterator listIterator = this.wordList.listIterator(); 363 | while (listIterator.hasNext()) 364 | { 365 | IWord word = listIterator.next(); 366 | if (label.equals(word.getLabel())) 367 | { 368 | return listIterator; 369 | } 370 | } 371 | return null; 372 | } 373 | 374 | /** 375 | * 是否含有词性为label的单词 376 | * 377 | * @param label 378 | * @return 379 | */ 380 | public boolean containsWordWithLabel(String label) 381 | { 382 | return findFirstWordByLabel(label) != null; 383 | } 384 | 385 | /** 386 | * 转换为简单单词列表 387 | * 388 | * @return 389 | */ 390 | public List toSimpleWordList() 391 | { 392 | List wordList = new LinkedList(); 393 | for (IWord word : this.wordList) 394 | { 395 | if (word instanceof CompoundWord) 396 | { 397 | wordList.addAll(((CompoundWord) word).innerList); 398 | } 399 | else 400 | { 401 | wordList.add((Word) word); 402 | } 403 | } 404 | 405 | return wordList; 406 | } 407 | 408 | /** 409 | * 获取所有单词构成的数组 410 | * 411 | * @return 412 | */ 413 | public String[] toWordArray() 414 | { 415 | List wordList = toSimpleWordList(); 416 | String[] wordArray = new String[wordList.size()]; 417 | Iterator iterator = wordList.iterator(); 418 | for (int i = 0; i < wordArray.length; i++) 419 | { 420 | wordArray[i] = iterator.next().value; 421 | } 422 | return wordArray; 423 | } 424 | 425 | /** 426 | * word pos 427 | * 428 | * @return 429 | */ 430 | public String[][] toWordTagArray() 431 | { 432 | List wordList = toSimpleWordList(); 433 | String[][] pair = new String[2][wordList.size()]; 434 | Iterator iterator = wordList.iterator(); 435 | for (int i = 0; i < pair[0].length; i++) 436 | { 437 | Word word = iterator.next(); 438 | pair[0][i] = word.value; 439 | pair[1][i] = word.label; 440 | } 441 | return pair; 442 | } 443 | /* 444 | /** 445 | * word pos ner 446 | * 447 | * @param tagSet 448 | * @return 449 | */ 450 | /* public String[][] 
toWordTagNerArray(NERTagSet tagSet) 451 | { 452 | List tupleList = Utility.convertSentenceToNER(this, tagSet); 453 | String[][] result = new String[3][tupleList.size()]; 454 | Iterator iterator = tupleList.iterator(); 455 | for (int i = 0; i < result[0].length; i++) 456 | { 457 | String[] tuple = iterator.next(); 458 | for (int j = 0; j < 3; ++j) 459 | { 460 | result[j][i] = tuple[j]; 461 | } 462 | } 463 | return result; 464 | }*/ 465 | 466 | public Sentence mergeCompoundWords() 467 | { 468 | ListIterator listIterator = wordList.listIterator(); 469 | while (listIterator.hasNext()) 470 | { 471 | IWord word = listIterator.next(); 472 | if (word instanceof CompoundWord) 473 | { 474 | listIterator.set(new Word(word.getValue(), word.getLabel())); 475 | } 476 | } 477 | return this; 478 | } 479 | 480 | @Override 481 | public boolean equals(Object o) 482 | { 483 | if (this == o) return true; 484 | if (o == null || getClass() != o.getClass()) return false; 485 | 486 | Sentence sentence = (Sentence) o; 487 | return toString().equals(sentence.toString()); 488 | } 489 | 490 | @Override 491 | public int hashCode() 492 | { 493 | return toString().hashCode(); 494 | } 495 | } 496 | -------------------------------------------------------------------------------- /src/main/java/pojo/Term.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/pojo/Term.java -------------------------------------------------------------------------------- /src/main/java/pojo/Word.java: -------------------------------------------------------------------------------- 1 | /* 2 | *

3 | * He Han 4 | * hankcs.cn@gmail.com 5 | * 2014/9/8 17:25 6 | * 7 | * 8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/ 9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information. 10 | * 11 | */ 12 | package pojo; 13 | 14 | 15 | import static config.Logger.logger; 16 | 17 | /** 18 | * 一个单词 19 | * @author hankcs 20 | */ 21 | public class Word implements IWord 22 | { 23 | /** 24 | * 单词的真实值，比如“程序” 25 | */ 26 | public String value; 27 | /** 28 | * 单词的标签，比如“n” 29 | */ 30 | public String label; 31 | 32 | @Override 33 | public String toString() 34 | { 35 | if (label == null) 36 | return value; 37 | return value + '/' + label; 38 | } 39 | 40 | public Word(String value, String label) 41 | { 42 | this.value = value; 43 | this.label = label; 44 | } 45 | 46 | /** 47 | * 通过参数构造一个单词 48 | * @param param 比如人民网/nz 49 | * @return 一个单词 50 | */ 51 | public static Word create(String param) 52 | { 53 | if (param == null) return null; 54 | int cutIndex = param.lastIndexOf('/'); 55 | if (cutIndex <= 0 || cutIndex == param.length() - 1) 56 | { 57 | logger.warning("使用 " + param + "创建单个单词失败"); 58 | return null; 59 | } 60 | 61 | return new Word(param.substring(0, cutIndex), param.substring(cutIndex + 1)); 62 | } 63 | 64 | @Override 65 | public String getValue() 66 | { 67 | return value; 68 | } 69 | 70 | @Override 71 | public String getLabel() 72 | { 73 | return label; 74 | } 75 | 76 | @Override 77 | public void setLabel(String label) 78 | { 79 | this.label = label; 80 | } 81 | 82 | @Override 83 | public void setValue(String value) 84 | { 85 | this.value = value; 86 | } 87 | 88 | @Override 89 | public int length() 90 | { 91 | return value.length(); 92 | } 93 | } 94 | -------------------------------------------------------------------------------- /src/main/java/pojo/WordFactory.java: -------------------------------------------------------------------------------- 1 | /* 2 | *

3 | * He Han 4 | * hankcs.cn@gmail.com 5 | * 2014/9/8 18:49 6 | * 7 | * 8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/ 9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information. 10 | * 11 | */
package pojo;

/**
 * Convenience factory that picks the right word type for a textual description.
 *
 * @author hankcs
 */
public class WordFactory
{
    /**
     * Builds the word described by {@code param}.
     *
     * @param param the textual form of the word
     * @return {@code null} for a {@code null} argument; the result of
     *         {@link CompoundWord#create(String)} when the text starts with
     *         {@code "["} but not with {@code "[/"}; otherwise the result of
     *         {@link Word#create(String)}
     */
    public static IWord create(String param)
    {
        if (param == null)
        {
            return null;
        }

        // "[/x" is a plain word whose value happens to be "[", not a compound word.
        final boolean compound = param.startsWith("[") && !param.startsWith("[/");
        if (compound)
        {
            return CompoundWord.create(param);
        }
        return Word.create(param);
    }
}
-------------------------------------------------------------------------------- /src/main/java/seg/PreProcess.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/seg/PreProcess.java -------------------------------------------------------------------------------- /src/main/java/seg/Segment.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/seg/Segment.java -------------------------------------------------------------------------------- /src/main/java/serilize/JsonSerializationUtil.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/serilize/JsonSerializationUtil.java -------------------------------------------------------------------------------- /src/main/java/serilize/readAndWriteJson.java: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/serilize/readAndWriteJson.java -------------------------------------------------------------------------------- /src/main/java/trie/AhoCorasick/AhoCorasickDoubleArrayTrie.java: -------------------------------------------------------------------------------- 1 | /* 2 | *

3 | * He Han 4 | * hankcs.cn@gmail.com 5 | * 2014/12/22 21:13 6 | * 7 | * 8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/ 9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information. 10 | * 11 | */
package trie.AhoCorasick;


import io.ByteArray;
import java.io.DataOutputStream;
import java.io.IOException;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.util.*;

/**
 * An Aho-Corasick automaton stored in a double-array trie.
 *
 * @param <V> the type of the value attached to each key
 * @author hankcs
 */
public class AhoCorasickDoubleArrayTrie<V>
{
    /**
     * Number of spare slots kept behind the last used slot of {@code base} and
     * {@code check}. A transition probes index {@code base + c + 1}; with
     * {@code base <= size - 1} and {@code c <= Character.MAX_VALUE} the probe can
     * reach {@code size + Character.MAX_VALUE}, so the arrays must be at least
     * {@code size + Character.MAX_VALUE + 1} long.
     */
    private static final int PADDING = Character.MAX_VALUE + 2;

    /**
     * The "check" half of the double array.
     */
    protected int check[];
    /**
     * The "base" half of the double array.
     */
    protected int base[];
    /**
     * Failure table: {@code fail[state]} is the state to fall back to.
     */
    int fail[];
    /**
     * Output table: {@code output[state]} lists the ids of the keys that end in that state.
     */
    int[][] output;
    /**
     * The values, indexed by key id.
     */
    protected V[] v;

    /**
     * The length of every key, indexed by key id.
     */
    protected int[] l;

    /**
     * The used extent of {@code base} and {@code check}.
     */
    protected int size;

    /**
     * Creates an empty, unbuilt trie; call {@link #build(TreeMap)} or one of the
     * {@code load} methods before using it.
     */
    public AhoCorasickDoubleArrayTrie()
    {
    }

    /**
     * Creates a trie from a dictionary.
     *
     * @param dictionary the key/value pairs
     */
    public AhoCorasickDoubleArrayTrie(TreeMap<String, V> dictionary)
    {
        build(dictionary);
    }

    /**
     * Finds every occurrence of every key in the text.
     *
     * @param text the text to scan
     * @return the hits, in the order they are found
     */
    public List<Hit<V>> parseText(String text)
    {
        int position = 1;
        int currentState = 0;
        List<Hit<V>> collectedEmits = new LinkedList<Hit<V>>();
        for (int i = 0; i < text.length(); ++i)
        {
            currentState = getState(currentState, text.charAt(i));
            storeEmits(position, currentState, collectedEmits);
            ++position;
        }

        return collectedEmits;
    }

    /**
     * Scans the text and reports every hit to a callback.
     *
     * @param text      the text to scan
     * @param processor receives each hit as (begin, end, value)
     */
    public void parseText(String text, IHit<V> processor)
    {
        int position = 1;
        int currentState = 0;
        for (int i = 0; i < text.length(); ++i)
        {
            currentState = getState(currentState, text.charAt(i));
            int[] hitArray = output[currentState];
            if (hitArray != null)
            {
                for (int hit : hitArray)
                {
                    processor.hit(position - l[hit], position, v[hit]);
                }
            }
            ++position;
        }
    }

    /**
     * Scans the text and reports every hit to a callback.
     *
     * @param text      the text to scan
     * @param processor receives each hit as (begin, end, value)
     */
    public void parseText(char[] text, IHit<V> processor)
    {
        int position = 1;
        int currentState = 0;
        for (char c : text)
        {
            currentState = getState(currentState, c);
            int[] hitArray = output[currentState];
            if (hitArray != null)
            {
                for (int hit : hitArray)
                {
                    processor.hit(position - l[hit], position, v[hit]);
                }
            }
            ++position;
        }
    }

    /**
     * Scans the text and reports every hit, including the key id, to a callback.
     *
     * @param text      the text to scan
     * @param processor receives each hit as (begin, end, value, key id)
     */
    public void parseText(char[] text, IHitFull<V> processor)
    {
        int position = 1;
        int currentState = 0;
        for (char c : text)
        {
            currentState = getState(currentState, c);
            int[] hitArray = output[currentState];
            if (hitArray != null)
            {
                for (int hit : hitArray)
                {
                    processor.hit(position - l[hit], position, v[hit], hit);
                }
            }
            ++position;
        }
    }

    /**
     * Writes the automaton (not the values) to a stream: {@code size}, then per
     * slot base/check/fail and its output list, then the key lengths.
     *
     * @param out the stream to write to
     * @throws Exception on any I/O failure
     */
    public void save(DataOutputStream out) throws Exception
    {
        out.writeInt(size);
        for (int i = 0; i < size; i++)
        {
            out.writeInt(base[i]);
            out.writeInt(check[i]);
            out.writeInt(fail[i]);
            int[] hits = this.output[i];
            if (hits == null)
            {
                out.writeInt(0);
            }
            else
            {
                out.writeInt(hits.length);
                for (int hit : hits)
                {
                    out.writeInt(hit);
                }
            }
        }
        out.writeInt(l.length);
        for (int length : l)
        {
            out.writeInt(length);
        }
    }

    /**
     * Writes the five tables (not the values and not {@code size}) as serialized objects.
     *
     * @param out the stream to write to
     * @throws IOException on I/O failure
     */
    public void save(ObjectOutputStream out) throws IOException
    {
        out.writeObject(base);
        out.writeObject(check);
        out.writeObject(fail);
        out.writeObject(output);
        out.writeObject(l);
    }

    /**
     * Reads the tables written by {@link #save(ObjectOutputStream)}.
     * NOTE(review): {@code size} is neither stored nor restored on this path, so a
     * later {@link #save(DataOutputStream)} would write the stale value - confirm
     * whether that combination is ever used.
     *
     * @param in    the stream to read from
     * @param value the values (they are not persisted and must be supplied again)
     * @throws IOException            on I/O failure
     * @throws ClassNotFoundException if a serialized class cannot be resolved
     */
    public void load(ObjectInputStream in, V[] value) throws IOException, ClassNotFoundException
    {
        base = (int[]) in.readObject();
        check = (int[]) in.readObject();
        fail = (int[]) in.readObject();
        output = (int[][]) in.readObject();
        l = (int[]) in.readObject();
        v = value;
    }

    /**
     * Reads the layout written by {@link #save(DataOutputStream)}.
     *
     * @param byteArray the bytes to read from
     * @param value     the values (they are not persisted and must be supplied again)
     * @return {@code false} if {@code byteArray} is {@code null}, {@code true} otherwise
     */
    public boolean load(ByteArray byteArray, V[] value)
    {
        if (byteArray == null) return false;
        size = byteArray.nextInt();
        // Over-allocate so that probing base + c + 1 never leaves the arrays.
        base = new int[size + PADDING];
        check = new int[size + PADDING];
        fail = new int[size + PADDING];
        output = new int[size + PADDING][];
        int length;
        for (int i = 0; i < size; ++i)
        {
            base[i] = byteArray.nextInt();
            check[i] = byteArray.nextInt();
            fail[i] = byteArray.nextInt();
            length = byteArray.nextInt();
            if (length == 0) continue;
            output[i] = new int[length];
            for (int j = 0; j < output[i].length; ++j)
            {
                output[i][j] = byteArray.nextInt();
            }
        }
        length = byteArray.nextInt();
        l = new int[length];
        for (int i = 0; i < l.length; ++i)
        {
            l[i] = byteArray.nextInt();
        }
        v = value;
        return true;
    }

    /**
     * Looks up the value of a key.
     *
     * @param key the key
     * @return the value, or {@code null} if the key is not in the trie
     */
    public V get(String key)
    {
        int index = exactMatchSearch(key);
        if (index >= 0)
        {
            return v[index];
        }

        return null;
    }

    /**
     * Replaces the value of an existing key.
     *
     * @param key   the key
     * @param value the new value
     * @return {@code true} on success, {@code false} if the key is not in the trie
     */
    public boolean set(String key, V value)
    {
        int index = exactMatchSearch(key);
        if (index >= 0)
        {
            v[index] = value;
            return true;
        }

        return false;
    }

    /**
     * Returns the value stored under a key id. For speed the index is not validated.
     *
     * @param index the key id
     * @return the value
     */
    public V get(int index)
    {
        return v[index];
    }

    /**
     * Callback invoked for every key found in a text.
     *
     * @param <V> the value type
     */
    public interface IHit<V>
    {
        /**
         * A key was found.
         *
         * @param begin start offset of the key in the text
         * @param end   end offset of the key in the text (exclusive)
         * @param value the value attached to the key
         */
        void hit(int begin, int end, V value);
    }

    /**
     * Callback invoked for every key found in a text; also receives the key id.
     *
     * @param <V> the value type
     */
    public interface IHitFull<V>
    {
        /**
         * A key was found.
         *
         * @param begin start offset of the key in the text
         * @param end   end offset of the key in the text (exclusive)
         * @param value the value attached to the key
         * @param index the id of the key, i.e. the index of its value
         */
        void hit(int begin, int end, V value, int index);
    }

    /**
     * One match result.
     *
     * @param <V> the value type
     */
    public class Hit<V>
    {
        /**
         * Start offset of the key in the text.
         */
        public final int begin;
        /**
         * End offset of the key in the text (exclusive).
         */
        public final int end;
        /**
         * The value attached to the key.
         */
        public final V value;

        public Hit(int begin, int end, V value)
        {
            this.begin = begin;
            this.end = end;
            this.value = value;
        }

        @Override
        public String toString()
        {
            return String.format("[%d:%d]=%s", begin, end, value);
        }
    }

    /**
     * Moves to the next state for a character, following failure links as needed.
     *
     * @param currentState the state to start from
     * @param character    the input character
     * @return the state reached
     */
    private int getState(int currentState, char character)
    {
        int newCurrentState = transitionWithRoot(currentState, character); // try the success edge first
        while (newCurrentState == -1) // no edge: fall back along the failure links
        {
            currentState = fail[currentState];
            newCurrentState = transitionWithRoot(currentState, character);
        }
        return newCurrentState;
    }

    /**
     * Appends a {@link Hit} for every key that ends in the given state.
     *
     * @param position       offset just past the current character
     * @param currentState   the state the automaton is in
     * @param collectedEmits the list to append to
     */
    private void storeEmits(int position, int currentState, List<Hit<V>> collectedEmits)
    {
        int[] hitArray = output[currentState];
        if (hitArray != null)
        {
            for (int hit : hitArray)
            {
                collectedEmits.add(new Hit<V>(position - l[hit], position, v[hit]));
            }
        }
    }

    /**
     * Follows one edge, treating {@code current} as a base value.
     *
     * @param current the base value to start from
     * @param c       the input character
     * @return the base value of the target slot, or -1 if there is no such edge
     */
    protected int transition(int current, char c)
    {
        int b = current;
        int p;

        p = b + c + 1;
        if (b == check[p])
            b = base[p];
        else
            return -1;

        p = b;
        return p;
    }

    /**
     * Follows one edge from a state; the root loops back to itself when it has no such edge.
     *
     * @param nodePos the state to start from
     * @param c       the input character
     * @return the target state, 0 when the root has no such edge, -1 when any
     *         other state has no such edge
     */
    protected int transitionWithRoot(int nodePos, char c)
    {
        int b = base[nodePos];
        int p;

        p = b + c + 1;
        if (b != check[p])
        {
            if (nodePos == 0) return 0;
            return -1;
        }

        return p;
    }


    /**
     * Builds the automaton from a sorted map, replacing any previous content.
     * An empty map is accepted and yields an automaton that matches nothing.
     *
     * @param map the key/value pairs
     */
    public void build(TreeMap<String, V> map)
    {
        new Builder().build(map);
    }

    /**
     * Collects the direct children of a node. An accepting node additionally gets
     * a synthetic child under code 0 that carries its largest key id.
     *
     * @param parent   the node whose children are wanted
     * @param siblings receives (character code + 1, child) pairs
     * @return the number of entries in {@code siblings}
     */
    private int fetch(State parent, List<Map.Entry<Integer, State>> siblings)
    {
        if (parent.isAcceptable())
        {
            State fakeNode = new State(-(parent.getDepth() + 1)); // child of parent that carries parent's output
            fakeNode.addEmit(parent.getLargestValueId());
            siblings.add(new AbstractMap.SimpleEntry<Integer, State>(0, fakeNode));
        }
        for (Map.Entry<Character, State> entry : parent.getSuccess().entrySet())
        {
            siblings.add(new AbstractMap.SimpleEntry<Integer, State>(entry.getKey() + 1, entry.getValue()));
        }
        return siblings.size();
    }

    /**
     * Exact lookup of a key.
     *
     * @param key the key
     * @return the key id, or a negative number if the key is not in the trie
     */
    public int exactMatchSearch(String key)
    {
        return exactMatchSearch(key, 0, 0, 0);
    }

    /**
     * Exact lookup of {@code key[pos, len)} starting from a given slot.
     *
     * @param key     the key
     * @param pos     index of the first char to use
     * @param len     end index (exclusive); values {@code <= 0} mean the whole key
     * @param nodePos slot to start from; values {@code <= 0} mean the root
     * @return the key id, or -1 if there is no match
     */
    private int exactMatchSearch(String key, int pos, int len, int nodePos)
    {
        if (len <= 0)
            len = key.length();
        if (nodePos <= 0)
            nodePos = 0;

        int result = -1;

        char[] keyChars = key.toCharArray();

        int b = base[nodePos];
        int p;

        for (int i = pos; i < len; i++)
        {
            p = b + (int) (keyChars[i]) + 1;
            if (b == check[p])
                b = base[p];
            else
                return result;
        }

        // A key ends here only if the synthetic code-0 child exists; its base holds -(id + 1).
        p = b;
        int n = base[p];
        if (b == check[p] && n < 0)
        {
            result = -n - 1;
        }
        return result;
    }

    /**
     * Exact lookup of {@code keyChars[pos, len)} starting from a given slot.
     *
     * @param keyChars the key as a char array
     * @param pos      index of the first char to use
     * @param len      end index (exclusive)
     * @param nodePos  slot to start from (lookups may start below the root)
     * @return the key id, or a negative number if there is no match
     */
    private int exactMatchSearch(char[] keyChars, int pos, int len, int nodePos)
    {
        int result = -1;

        int b = base[nodePos];
        int p;

        for (int i = pos; i < len; i++)
        {
            p = b + (int) (keyChars[i]) + 1;
            if (b == check[p])
                b = base[p];
            else
                return result;
        }

        p = b;
        int n = base[p];
        if (b == check[p] && n < 0)
        {
            result = -n - 1;
        }
        return result;
    }

    /**
     * Debug helper that prints variable names with their recorded values, in insertion order.
     */
    private static class DebugArray
    {
        Map<String, String> nameValueMap = new LinkedHashMap<String, String>();

        public void add(String name, int value)
        {
            String valueInMap = nameValueMap.get(name);
            if (valueInMap == null)
            {
                valueInMap = "";
            }

            valueInMap += " " + String.format("%5d", value);

            nameValueMap.put(name, valueInMap);
        }

        @Override
        public String toString()
        {
            StringBuilder text = new StringBuilder();
            for (Map.Entry<String, String> entry : nameValueMap.entrySet())
            {
                text.append(String.format("%-5s", entry.getKey()))
                    .append("= ")
                    .append(entry.getValue())
                    .append('\n');
            }

            return text.toString();
        }

        public void println()
        {
            System.out.print(this);
        }
    }

    /**
     * @return the number of keys, i.e. the length of the value array (0 if unbuilt)
     */
    public int size()
    {
        return v == null ? 0 : v.length;
    }

    /**
     * One-shot helper that fills the enclosing trie's tables.
     */
    private class Builder
    {
        /**
         * Root of the temporary object trie; only needed while building.
         */
        private State rootState = new State();
        /**
         * Marks the begin offsets already handed out; only needed while building.
         */
        private boolean used[];
        /**
         * Currently allocated length of base/check/used.
         */
        private int allocSize;
        /**
         * Number of leaves placed so far; drives the array growth factor.
         */
        private int progress;
        /**
         * The search for a free slot of the next insertion starts here.
         */
        private int nextCheckPos;
        /**
         * Number of keys.
         */
        private int keySize;

        /**
         * Builds all tables from a sorted map.
         *
         * @param map the key/value pairs
         */
        @SuppressWarnings("unchecked")
        public void build(TreeMap<String, V> map)
        {
            // Start from scratch so that re-building an instance does not inherit the old extent.
            size = 0;
            // keep the values
            v = (V[]) map.values().toArray();
            l = new int[v.length];
            Set<String> keySet = map.keySet();
            // build the object trie
            addAllKeyword(keySet);
            // lay the object trie out as a double array
            buildDoubleArrayTrie(keySet);
            used = null;
            // build the failure table and merge the output table
            constructFailureStates();
            rootState = null;
            loseWeight();
        }

        /**
         * Adds one key to the object trie and records its length.
         *
         * @param keyword the key
         * @param index   the key id
         */
        private void addKeyword(String keyword, int index)
        {
            State currentState = this.rootState;
            for (char character : keyword.toCharArray())
            {
                currentState = currentState.addState(character);
            }
            currentState.addEmit(index);
            l[index] = keyword.length();
        }

        /**
         * Adds all keys, numbering them in iteration order.
         *
         * @param keywordSet the keys
         */
        private void addAllKeyword(Collection<String> keywordSet)
        {
            int i = 0;
            for (String keyword : keywordSet)
            {
                addKeyword(keyword, i++);
            }
        }

        /**
         * Builds the failure table (breadth first) and the output table.
         */
        private void constructFailureStates()
        {
            // At least two slots: fail[1] is written unconditionally below.
            fail = new int[Math.max(size + 1, 2)];
            // NOTE(review): kept from the original; slot 1 does not appear to be
            // the index of any real state - confirm its purpose.
            fail[1] = base[0];
            output = new int[size + 1][];
            // Plain FIFO; the traversal is single threaded.
            Queue<State> queue = new ArrayDeque<State>();

            // Step 1: depth-1 states fail to the root.
            for (State depthOneState : this.rootState.getStates())
            {
                depthOneState.setFailure(this.rootState, fail);
                queue.add(depthOneState);
                constructOutput(depthOneState);
            }

            // Step 2: breadth-first over the deeper states.
            while (!queue.isEmpty())
            {
                State currentState = queue.remove();

                for (Character transition : currentState.getTransitions())
                {
                    State targetState = currentState.nextState(transition);
                    queue.add(targetState);

                    // Terminates at the root, whose nextState never returns null.
                    State traceFailureState = currentState.failure();
                    while (traceFailureState.nextState(transition) == null)
                    {
                        traceFailureState = traceFailureState.failure();
                    }
                    State newFailureState = traceFailureState.nextState(transition);
                    targetState.setFailure(newFailureState, fail);
                    targetState.addEmit(newFailureState.emit());
                    constructOutput(targetState);
                }
            }
        }

        /**
         * Copies a state's emitted key ids into the output table.
         *
         * @param targetState the state to export
         */
        private void constructOutput(State targetState)
        {
            Collection<Integer> emit = targetState.emit();
            if (emit == null || emit.size() == 0) return;
            int[] ids = new int[emit.size()];
            Iterator<Integer> it = emit.iterator();
            for (int i = 0; i < ids.length; ++i)
            {
                ids[i] = it.next();
            }
            AhoCorasickDoubleArrayTrie.this.output[targetState.getIndex()] = ids;
        }

        /**
         * Lays the object trie out in base/check.
         *
         * @param keySet the keys (only their count is used)
         */
        private void buildDoubleArrayTrie(Set<String> keySet)
        {
            progress = 0;
            keySize = keySet.size();
            resize(65536 * 32);

            base[0] = 1;
            nextCheckPos = 0;

            State root_node = this.rootState;

            List<Map.Entry<Integer, State>> siblings = new ArrayList<Map.Entry<Integer, State>>(root_node.getSuccess().entrySet().size());
            fetch(root_node, siblings);
            if (siblings.isEmpty())
            {
                // Empty dictionary: only the root slot is in use. base[0] == 1 never
                // equals a zeroed check entry, so every transition stays at the root.
                size = 1;
            }
            else
            {
                insert(siblings);
            }
        }

        /**
         * Grows base/check/used, keeping their content.
         *
         * @param newSize the new length
         * @return the new length
         */
        private int resize(int newSize)
        {
            int[] base2 = new int[newSize];
            int[] check2 = new int[newSize];
            boolean used2[] = new boolean[newSize];
            if (allocSize > 0)
            {
                System.arraycopy(base, 0, base2, 0, allocSize);
                System.arraycopy(check, 0, check2, 0, allocSize);
                System.arraycopy(used, 0, used2, 0, allocSize);
            }

            base = base2;
            check = check2;
            used = used2;

            return allocSize = newSize;
        }

        /**
         * Places a group of sibling nodes (and, recursively, their subtrees).
         *
         * @param siblings the non-empty group to place, ordered by code
         * @return the begin offset chosen for the group
         */
        private int insert(List<Map.Entry<Integer, State>> siblings)
        {
            final int firstKey = siblings.get(0).getKey();
            final int lastKey = siblings.get(siblings.size() - 1).getKey();

            int begin = 0;
            int pos = Math.max(firstKey + 1, nextCheckPos) - 1;
            int nonzero_num = 0;
            int first = 0;

            if (allocSize <= pos)
                resize(pos + 1);

            outer:
            // Find a begin such that check[begin + k] is free for every sibling code k.
            while (true)
            {
                pos++;

                if (allocSize <= pos)
                    resize(pos + 1);

                if (check[pos] != 0)
                {
                    nonzero_num++;
                    continue;
                }
                else if (first == 0)
                {
                    nextCheckPos = pos;
                    first = 1;
                }

                begin = pos - firstKey; // offset that puts the first sibling at pos
                if (allocSize <= (begin + lastKey))
                {
                    // progress can be zero, hence the + 1
                    double growth = (1.05 > 1.0 * keySize / (progress + 1)) ? 1.05 : 1.0 * keySize / (progress + 1);
                    resize((int) (allocSize * growth));
                }

                if (used[begin])
                    continue;

                for (int i = 1; i < siblings.size(); i++)
                    if (check[begin + siblings.get(i).getKey()] != 0)
                        continue outer;

                break;
            }

            // Simple heuristic: if at least 95% of the slots between nextCheckPos and
            // pos are taken, the next insertion starts searching at pos.
            if (1.0 * nonzero_num / (pos - nextCheckPos + 1) >= 0.95)
                nextCheckPos = pos;
            used[begin] = true;

            size = (size > begin + lastKey + 1) ? size : begin + lastKey + 1;

            for (Map.Entry<Integer, State> sibling : siblings)
            {
                check[begin + sibling.getKey()] = begin;
            }

            for (Map.Entry<Integer, State> sibling : siblings)
            {
                List<Map.Entry<Integer, State>> new_siblings = new ArrayList<Map.Entry<Integer, State>>(sibling.getValue().getSuccess().entrySet().size() + 1);

                if (fetch(sibling.getValue(), new_siblings) == 0) // a leaf: end of a key that is no prefix of another key
                {
                    base[begin + sibling.getKey()] = (-sibling.getValue().getLargestValueId() - 1);
                    progress++;
                }
                else
                {
                    // Recurse first, assign afterwards: the recursion may replace
                    // the base array through resize().
                    int h = insert(new_siblings); // dfs
                    base[begin + sibling.getKey()] = h;
                }
                sibling.getValue().setIndex(begin + sibling.getKey());
            }
            return begin;
        }

        /**
         * Shrinks base and check to the used extent plus {@link #PADDING} spare slots.
         */
        private void loseWeight()
        {
            int nbase[] = new int[size + PADDING];
            System.arraycopy(base, 0, nbase, 0, size);
            base = nbase;

            int ncheck[] = new int[size + PADDING];
            System.arraycopy(check, 0, ncheck, 0, size);
            check = ncheck;
        }
    }
}
-------------------------------------------------------------------------------- /src/main/java/trie/AhoCorasick/State.java: -------------------------------------------------------------------------------- 1 | package trie.AhoCorasick; 2 | 3 | import java.util.*; 4 | 5 | /** 6 | *

7 | * 一个状态有如下几个功能 8 | *

9 | *

10 | *

success; 成功转移到另一个状态
failure; 不可顺着字符串跳转的话，则跳转到一个浅一点的节点
emits; 命中一个模式串

15 | *

16 | *

17 | * 根节点稍有不同，根节点没有 failure 功能，它的“failure”指的是按照字符串路径转移到下一个状态。其他节点则都有failure状态。 18 | *

19 | * 20 | * @author Robert Bor 21 | */ 22 | public class State 23 | { 24 | 25 | /** 26 | * 模式串的长度，也是这个状态的深度 27 | */ 28 | protected final int depth; 29 | 30 | /** 31 | * fail 函数，如果没有匹配到，则跳转到此状态。 32 | */ 33 | private State failure = null; 34 | 35 | /** 36 | * 只要这个状态可达，则记录模式串 37 | */ 38 | private Set emits = null; 39 | /** 40 | * goto 表，也称转移函数。根据字符串的下一个字符转移到下一个状态 41 | */ 42 | private Map success = new TreeMap(); 43 | 44 | /** 45 | * 在双数组中的对应下标 46 | */ 47 | private int index; 48 | 49 | /** 50 | * 构造深度为0的节点 51 | */ 52 | public State() 53 | { 54 | this(0); 55 | } 56 | 57 | /** 58 | * 构造深度为depth的节点 59 | * @param depth 60 | */ 61 | public State(int depth) 62 | { 63 | this.depth = depth; 64 | } 65 | 66 | /** 67 | * 获取节点深度 68 | * @return 69 | */ 70 | public int getDepth() 71 | { 72 | return this.depth; 73 | } 74 | 75 | /** 76 | * 添加一个匹配到的模式串（这个状态对应着这个模式串) 77 | * @param keyword 78 | */ 79 | public void addEmit(int keyword) 80 | { 81 | if (this.emits == null) 82 | { 83 | this.emits = new TreeSet(Collections.reverseOrder()); 84 | } 85 | this.emits.add(keyword); 86 | } 87 | 88 | /** 89 | * 获取最大的值 90 | * @return 91 | */ 92 | public Integer getLargestValueId() 93 | { 94 | if (emits == null || emits.size() == 0) return null; 95 | 96 | return emits.iterator().next(); 97 | } 98 | 99 | /** 100 | * 添加一些匹配到的模式串 101 | * @param emits 102 | */ 103 | public void addEmit(Collection emits) 104 | { 105 | for (int emit : emits) 106 | { 107 | addEmit(emit); 108 | } 109 | } 110 | 111 | /** 112 | * 获取这个节点代表的模式串（们） 113 | * @return 114 | */ 115 | public Collection emit() 116 | { 117 | return this.emits == null ? 
Collections.emptyList() : this.emits; 118 | } 119 | 120 | /** 121 | * 是否是终止状态 122 | * @return 123 | */ 124 | public boolean isAcceptable() 125 | { 126 | return this.depth > 0 && this.emits != null; 127 | } 128 | 129 | /** 130 | * 获取failure状态 131 | * @return 132 | */ 133 | public State failure() 134 | { 135 | return this.failure; 136 | } 137 | 138 | /** 139 | * 设置failure状态 140 | * @param failState 141 | */ 142 | public void setFailure(State failState, int fail[]) 143 | { 144 | this.failure = failState; 145 | fail[index] = failState.index; 146 | } 147 | 148 | /** 149 | * 转移到下一个状态 150 | * @param character 希望按此字符转移 151 | * @param ignoreRootState 是否忽略根节点，如果是根节点自己调用则应该是true，否则为false 152 | * @return 转移结果 153 | */ 154 | private State nextState(Character character, boolean ignoreRootState) 155 | { 156 | State nextState = this.success.get(character); 157 | if (!ignoreRootState && nextState == null && this.depth == 0) 158 | { 159 | nextState = this; 160 | } 161 | return nextState; 162 | } 163 | 164 | /** 165 | * 按照character转移，根节点转移失败会返回自己（永远不会返回null） 166 | * @param character 167 | * @return 168 | */ 169 | public State nextState(Character character) 170 | { 171 | return nextState(character, false); 172 | } 173 | 174 | /** 175 | * 按照character转移，任何节点转移失败会返回null 176 | * @param character 177 | * @return 178 | */ 179 | public State nextStateIgnoreRootState(Character character) 180 | { 181 | return nextState(character, true); 182 | } 183 | 184 | public State addState(Character character) 185 | { 186 | State nextState = nextStateIgnoreRootState(character); 187 | if (nextState == null) 188 | { 189 | nextState = new State(this.depth + 1); 190 | this.success.put(character, nextState); 191 | } 192 | return nextState; 193 | } 194 | 195 | public Collection getStates() 196 | { 197 | return this.success.values(); 198 | } 199 | 200 | public Collection getTransitions() 201 | { 202 | return this.success.keySet(); 203 | } 204 | 205 | @Override 206 | public String toString() 207 | { 208 | final 
StringBuilder sb = new StringBuilder("State{"); 209 | sb.append("depth=").append(depth); 210 | sb.append(", ID=").append(index); 211 | sb.append(", emits=").append(emits); 212 | sb.append(", success=").append(success.keySet()); 213 | sb.append(", failureID=").append(failure == null ? "-1" : failure.index); 214 | sb.append(", failure=").append(failure); 215 | sb.append('}'); 216 | return sb.toString(); 217 | } 218 | 219 | /** 220 | * 获取goto表 221 | * @return 222 | */ 223 | public Map getSuccess() 224 | { 225 | return success; 226 | } 227 | 228 | public int getIndex() 229 | { 230 | return index; 231 | } 232 | 233 | public void setIndex(int index) 234 | { 235 | this.index = index; 236 | } 237 | } 238 | -------------------------------------------------------------------------------- /src/main/java/trie/ITrie.java: -------------------------------------------------------------------------------- 1 | /* 2 | *

3 | * He Han 4 | * hankcs.cn@gmail.com 5 | * 2015/4/23 0:23 6 | * 7 | * 8 | * Copyright (c) 2003-2015, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/ 9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information. 10 | * 11 | */ 12 | package trie; 13 | 14 | 15 | import io.ByteArray; 16 | import java.io.DataOutputStream; 17 | import java.util.TreeMap; 18 | 19 | /** 20 | * trie树接口 21 | * @author hankcs 22 | */ 23 | public interface ITrie 24 | { 25 | int build(TreeMap keyValueMap); 26 | boolean save(DataOutputStream out); 27 | boolean load(ByteArray byteArray, V[] value); 28 | V get(char[] key); 29 | V get(String key); 30 | V[] getValueArray(V[] a); 31 | boolean containsKey(String key); 32 | int size(); 33 | } 34 | -------------------------------------------------------------------------------- /src/main/java/trie/Trie.java: -------------------------------------------------------------------------------- 1 | package trie; 2 | 3 | /** 4 | * Created by bruce_shan on 2018/12/4 20:33. 5 | * Corporation CSU Software 这是一个字典树demo,并没有在项目中用到 6 | */ 7 | public class Trie { 8 | private TrieNode root; 9 | 10 | public Trie() { 11 | root = new TrieNode(); 12 | } 13 | 14 | // Inserts a word into the trie. 15 | public void insert(String word) { 16 | TrieNode node = root; 17 | int length = word.length(); 18 | int position ; 19 | char c; 20 | for (int i = 0; i < length; i++) { 21 | c = word.charAt(i); 22 | position = c-'a'; 23 | if (node.trieNodes[position] == null) { 24 | node.trieNodes[position] = new TrieNode(); 25 | } 26 | node = node.trieNodes[position]; 27 | node.setCount(node.getCount()+1); 28 | } 29 | node.setExist(true); 30 | } 31 | 32 | // Returns if the word is in the trie. 
33 | public boolean search(String word) { 34 | boolean result = false; 35 | TrieNode node = root; 36 | int length = word.length(); 37 | int position ; 38 | char c; 39 | for (int i = 0; i < length; i++) { 40 | c = word.charAt(i); 41 | position = c - 'a'; 42 | node = node.trieNodes[position]; 43 | if (node == null) { 44 | break; 45 | } 46 | } 47 | if (node != null && node.getExist()) { 48 | result = true; 49 | } 50 | return result; 51 | } 52 | 53 | // Returns if there is any word in the trie 54 | // that starts with the given prefix. 55 | public boolean startsWith(String prefix) { 56 | TrieNode node = root; 57 | int length = prefix.length(); 58 | int position ; 59 | char c; 60 | for (int i = 0; i < length; i++) { 61 | c = prefix.charAt(i); 62 | position = c - 'a'; 63 | node = node.trieNodes[position]; 64 | if (node == null) { 65 | return false; 66 | } 67 | } 68 | return true; 69 | } 70 | 71 | // delete if the word is in the trie. 72 | public boolean doDelete(String word, TrieNode node) { 73 | //树中已匹配的字符串比传入字符串短 74 | if (node == null) { 75 | return false; 76 | } 77 | 78 | //树中已匹配的字符串比传入字符串不短 79 | if (word.length() > 1){ 80 | char c = word.charAt(0); 81 | int position = c - 'a'; 82 | TrieNode trieNode = node.trieNodes[position]; 83 | boolean b = doDelete(word.substring(1), trieNode); 84 | if (b) { 85 | node.setCount(node.getCount() - 1); 86 | if (trieNode.getCount() == 0) { 87 | node.trieNodes[position] = null; 88 | } 89 | return true; 90 | } 91 | } 92 | 93 | if (word.length() == 1) { 94 | char c = word.charAt(0); 95 | int position = c - 'a'; 96 | TrieNode trieNode = node.trieNodes[position]; 97 | //只删除单词如果是前缀不删除 98 | if (trieNode != null && trieNode.getExist()) { 99 | return true; 100 | } 101 | } 102 | return false; 103 | } 104 | 105 | // delete if the word is in the trie. 106 | public boolean delete(String word) { 107 | return this.doDelete(word,root); 108 | } 109 | 110 | class TrieNode { 111 | // Initialize your data structure here. 
112 | int count = 0; 113 | TrieNode[] trieNodes = new TrieNode[26]; 114 | Boolean exist = false; 115 | public TrieNode() { 116 | } 117 | 118 | public TrieNode(int count, Boolean exist) { 119 | this.count = count; 120 | this.exist = exist; 121 | } 122 | 123 | public int getCount() { 124 | return count; 125 | } 126 | 127 | public void setCount(int count) { 128 | this.count = count; 129 | } 130 | 131 | public TrieNode[] getTrieNodes() { 132 | return trieNodes; 133 | } 134 | 135 | public void setTrieNodes(TrieNode[] trieNodes) { 136 | this.trieNodes = trieNodes; 137 | } 138 | 139 | public Boolean getExist() { 140 | return exist; 141 | } 142 | 143 | public void setExist(Boolean exist) { 144 | this.exist = exist; 145 | } 146 | } 147 | 148 | public static void main(String[] args) { 149 | Trie trie = new Trie(); 150 | trie.search("lintcode"); 151 | trie.startsWith("lint"); 152 | trie.insert("lint"); 153 | trie.startsWith("lint"); 154 | 155 | boolean lint = trie.delete("lin"); 156 | //System.out.println("lint = " + lint); 157 | lint = trie.delete("lint"); 158 | // System.out.println("lint = " + lint); 159 | } 160 | } 161 | -------------------------------------------------------------------------------- /src/main/java/trie/bintrie/BaseNode.java: -------------------------------------------------------------------------------- 1 | /* 2 | *

3 | * He Han 4 | * hankcs.cn@gmail.com 5 | * 2014/5/2 20:22 6 | * 7 | * 8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/ 9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information. 10 | * 11 | */ 12 | package trie.bintrie; 13 | 14 | 15 | import io.ByteArray; 16 | 17 | import java.io.DataOutputStream; 18 | import java.io.IOException; 19 | import java.io.ObjectInput; 20 | import java.io.ObjectOutput; 21 | import java.util.AbstractMap; 22 | import java.util.Map; 23 | import java.util.Set; 24 | 25 | /** 26 | * 节点，统一Trie树根和其他节点的基类 27 | * 28 | * @param 值 29 | * @author He Han 30 | */ 31 | public abstract class BaseNode implements Comparable 32 | { 33 | /** 34 | * 状态数组，方便读取的时候用 35 | */ 36 | static final Status[] ARRAY_STATUS = Status.values(); 37 | /** 38 | * 子节点 39 | */ 40 | protected BaseNode[] child; 41 | /** 42 | * 节点状态 43 | */ 44 | protected Status status; 45 | /** 46 | * 节点代表的字符 47 | */ 48 | protected char c; 49 | /** 50 | * 节点代表的值 51 | */ 52 | protected V value; 53 | 54 | public BaseNode transition(String path, int begin) 55 | { 56 | BaseNode cur = this; 57 | for (int i = begin; i < path.length(); ++i) 58 | { 59 | cur = cur.getChild(path.charAt(i)); 60 | if (cur == null || cur.status == Status.UNDEFINED_0) return null; 61 | } 62 | return cur; 63 | } 64 | 65 | public BaseNode transition(char[] path, int begin) 66 | { 67 | BaseNode cur = this; 68 | for (int i = begin; i < path.length; ++i) 69 | { 70 | cur = cur.getChild(path[i]); 71 | if (cur == null || cur.status == Status.UNDEFINED_0) return null; 72 | } 73 | return cur; 74 | } 75 | 76 | /** 77 | * 转移状态 78 | * @param path 79 | * @return 80 | */ 81 | public BaseNode transition(char path) 82 | { 83 | BaseNode cur = this; 84 | cur = cur.getChild(path); 85 | if (cur == null || cur.status == Status.UNDEFINED_0) return null; 86 | return cur; 87 | } 88 | 89 | /** 90 | * 添加子节点 91 | * 92 | * @return true-新增了节点 false-修改了现有节点 93 | */ 
94 | protected abstract boolean addChild(BaseNode node); 95 | 96 | /** 97 | * 是否含有子节点 98 | * 99 | * @param c 子节点的char 100 | * @return 是否含有 101 | */ 102 | protected boolean hasChild(char c) 103 | { 104 | return getChild(c) != null; 105 | } 106 | 107 | protected char getChar() 108 | { 109 | return c; 110 | } 111 | 112 | /** 113 | * 获取子节点 114 | * 115 | * @param c 子节点的char 116 | * @return 子节点 117 | */ 118 | public abstract BaseNode getChild(char c); 119 | 120 | /** 121 | * 获取节点对应的值 122 | * 123 | * @return 值 124 | */ 125 | public final V getValue() 126 | { 127 | return value; 128 | } 129 | 130 | /** 131 | * 设置节点对应的值 132 | * 133 | * @param value 值 134 | */ 135 | public final void setValue(V value) 136 | { 137 | this.value = value; 138 | } 139 | 140 | @Override 141 | public int compareTo(BaseNode other) 142 | { 143 | return compareTo(other.getChar()); 144 | } 145 | 146 | /** 147 | * 重载，与字符的比较 148 | * @param other 149 | * @return 150 | */ 151 | public int compareTo(char other) 152 | { 153 | if (this.c > other) 154 | { 155 | return 1; 156 | } 157 | if (this.c < other) 158 | { 159 | return -1; 160 | } 161 | return 0; 162 | } 163 | 164 | /** 165 | * 获取节点的成词状态 166 | * @return 167 | */ 168 | public Status getStatus() 169 | { 170 | return status; 171 | } 172 | 173 | protected void walk(StringBuilder sb, Set> entrySet) 174 | { 175 | sb.append(c); 176 | if (status == Status.WORD_MIDDLE_2 || status == Status.WORD_END_3) 177 | { 178 | entrySet.add(new TrieEntry(sb.toString(), value)); 179 | } 180 | if (child == null) return; 181 | for (BaseNode node : child) 182 | { 183 | if (node == null) continue; 184 | node.walk(new StringBuilder(sb.toString()), entrySet); 185 | } 186 | } 187 | 188 | protected void walkToSave(DataOutputStream out) throws IOException 189 | { 190 | out.writeChar(c); 191 | out.writeInt(status.ordinal()); 192 | int childSize = 0; 193 | if (child != null) childSize = child.length; 194 | out.writeInt(childSize); 195 | if (child == null) return; 196 | for (BaseNode node 
: child) 197 | { 198 | node.walkToSave(out); 199 | } 200 | } 201 | 202 | protected void walkToSave(ObjectOutput out) throws IOException 203 | { 204 | out.writeChar(c); 205 | out.writeInt(status.ordinal()); 206 | if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) 207 | { 208 | out.writeObject(value); 209 | } 210 | int childSize = 0; 211 | if (child != null) childSize = child.length; 212 | out.writeInt(childSize); 213 | if (child == null) return; 214 | for (BaseNode node : child) 215 | { 216 | node.walkToSave(out); 217 | } 218 | } 219 | 220 | protected void walkToLoad(ByteArray byteArray, _ValueArray valueArray) 221 | { 222 | c = byteArray.nextChar(); 223 | status = ARRAY_STATUS[byteArray.nextInt()]; 224 | if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) 225 | { 226 | value = valueArray.nextValue(); 227 | } 228 | int childSize = byteArray.nextInt(); 229 | child = new BaseNode[childSize]; 230 | for (int i = 0; i < childSize; ++i) 231 | { 232 | child[i] = new Node(); 233 | child[i].walkToLoad(byteArray, valueArray); 234 | } 235 | } 236 | 237 | protected void walkToLoad(ObjectInput byteArray) throws IOException, ClassNotFoundException 238 | { 239 | c = byteArray.readChar(); 240 | status = ARRAY_STATUS[byteArray.readInt()]; 241 | if (status == Status.WORD_END_3 || status == Status.WORD_MIDDLE_2) 242 | { 243 | value = (V) byteArray.readObject(); 244 | } 245 | int childSize = byteArray.readInt(); 246 | child = new BaseNode[childSize]; 247 | for (int i = 0; i < childSize; ++i) 248 | { 249 | child[i] = new Node(); 250 | child[i].walkToLoad(byteArray); 251 | } 252 | } 253 | 254 | public enum Status 255 | { 256 | /** 257 | * 未指定，用于删除词条 258 | */ 259 | UNDEFINED_0, 260 | /** 261 | * 不是词语的结尾 262 | */ 263 | NOT_WORD_1, 264 | /** 265 | * 是个词语的结尾，并且还可以继续 266 | */ 267 | WORD_MIDDLE_2, 268 | /** 269 | * 是个词语的结尾，并且没有继续 270 | */ 271 | WORD_END_3, 272 | } 273 | 274 | public class TrieEntry extends AbstractMap.SimpleEntry implements Comparable 275 | { 276 
| public TrieEntry(String key, V value) 277 | { 278 | super(key, value); 279 | } 280 | @Override 281 | public int compareTo(TrieEntry o) 282 | { 283 | return getKey().compareTo(o.getKey()); 284 | } 285 | } 286 | 287 | @Override 288 | public String toString() 289 | { 290 | if (child == null) 291 | { 292 | return "BaseNode{" + 293 | "status=" + status + 294 | ", c=" + c + 295 | ", value=" + value + 296 | '}'; 297 | } 298 | return "BaseNode{" + 299 | "child=" + child.length + 300 | ", status=" + status + 301 | ", c=" + c + 302 | ", value=" + value + 303 | '}'; 304 | } 305 | } 306 | -------------------------------------------------------------------------------- /src/main/java/trie/bintrie/BinTrie.java: -------------------------------------------------------------------------------- 1 | /* 2 | *

3 | * He Han 4 | * hankcs.cn@gmail.com 5 | * 2014/5/3 11:34 6 | * 7 | * 8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/ 9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information. 10 | * 11 | */ 12 | package trie.bintrie; 13 | 14 | 15 | 16 | import io.ByteArray; 17 | import io.IOUtil; 18 | import trie.AhoCorasick.AhoCorasickDoubleArrayTrie; 19 | import trie.ITrie; 20 | import util.TextUtility; 21 | 22 | import java.io.*; 23 | import java.util.*; 24 | 25 | import static config.Logger.logger; 26 | 27 | /** 28 | * 首字直接分配内存，之后二分动态数组的Trie树，能够平衡时间和空间 29 | * 30 | * @author hankcs 31 | */ 32 | public class BinTrie extends BaseNode implements ITrie, Externalizable 33 | { 34 | private int size; 35 | 36 | public BinTrie() 37 | { 38 | child = new BaseNode[65535 + 1]; // (int)Character.MAX_VALUE 39 | size = 0; 40 | status = Status.NOT_WORD_1; 41 | } 42 | 43 | public BinTrie(Map map) 44 | { 45 | this(); 46 | for (Map.Entry entry : map.entrySet()) 47 | { 48 | put(entry.getKey(), entry.getValue()); 49 | } 50 | } 51 | 52 | /** 53 | * 插入一个词 54 | * 55 | * @param key 56 | * @param value 57 | */ 58 | public void put(String key, V value) 59 | { 60 | if (key.length() == 0) return; // 安全起见 61 | BaseNode branch = this; 62 | char[] chars = key.toCharArray(); 63 | for (int i = 0; i < chars.length - 1; ++i) 64 | { 65 | // 除了最后一个字外，都是继续 66 | branch.addChild(new Node(chars[i], Status.NOT_WORD_1, null)); 67 | branch = branch.getChild(chars[i]); 68 | } 69 | // 最后一个字加入时属性为end 70 | if (branch.addChild(new Node(chars[chars.length - 1], Status.WORD_END_3, value))) 71 | { 72 | ++size; // 维护size 73 | } 74 | } 75 | 76 | public void put(char[] key, V value) 77 | { 78 | BaseNode branch = this; 79 | for (int i = 0; i < key.length - 1; ++i) 80 | { 81 | // 除了最后一个字外，都是继续 82 | branch.addChild(new Node(key[i], Status.NOT_WORD_1, null)); 83 | branch = branch.getChild(key[i]); 84 | } 85 | // 最后一个字加入时属性为end 86 | if 
(branch.addChild(new Node(key[key.length - 1], Status.WORD_END_3, value))) 87 | { 88 | ++size; // 维护size 89 | } 90 | } 91 | 92 | /** 93 | * 设置键值对，当键不存在的时候会自动插入 94 | * @param key 95 | * @param value 96 | */ 97 | public void set(String key, V value) 98 | { 99 | put(key.toCharArray(), value); 100 | } 101 | 102 | /** 103 | * 删除一个词 104 | * 105 | * @param key 106 | */ 107 | public void remove(String key) 108 | { 109 | BaseNode branch = this; 110 | char[] chars = key.toCharArray(); 111 | for (int i = 0; i < chars.length - 1; ++i) 112 | { 113 | if (branch == null) return; 114 | branch = branch.getChild(chars[i]); 115 | } 116 | if (branch == null) return; 117 | // 最后一个字设为undefined 118 | if (branch.addChild(new Node(chars[chars.length - 1], Status.UNDEFINED_0, value))) 119 | { 120 | --size; 121 | } 122 | } 123 | 124 | public boolean containsKey(String key) 125 | { 126 | BaseNode branch = this; 127 | char[] chars = key.toCharArray(); 128 | for (char aChar : chars) 129 | { 130 | if (branch == null) return false; 131 | branch = branch.getChild(aChar); 132 | } 133 | 134 | return branch != null && (branch.status == Status.WORD_END_3 || branch.status == Status.WORD_MIDDLE_2); 135 | } 136 | 137 | public V get(String key) 138 | { 139 | BaseNode branch = this; 140 | char[] chars = key.toCharArray(); 141 | for (char aChar : chars) 142 | { 143 | if (branch == null) return null; 144 | branch = branch.getChild(aChar); 145 | } 146 | 147 | if (branch == null) return null; 148 | // 下面这句可以保证只有成词的节点被返回 149 | if (!(branch.status == Status.WORD_END_3 || branch.status == Status.WORD_MIDDLE_2)) return null; 150 | return (V) branch.getValue(); 151 | } 152 | 153 | public V get(char[] key) 154 | { 155 | BaseNode branch = this; 156 | for (char aChar : key) 157 | { 158 | if (branch == null) return null; 159 | branch = branch.getChild(aChar); 160 | } 161 | 162 | if (branch == null) return null; 163 | // 下面这句可以保证只有成词的节点被返回 164 | if (!(branch.status == Status.WORD_END_3 || branch.status == 
Status.WORD_MIDDLE_2)) return null; 165 | return (V) branch.getValue(); 166 | } 167 | 168 | @Override 169 | public V[] getValueArray(V[] a) 170 | { 171 | if (a.length < size) 172 | a = (V[]) java.lang.reflect.Array.newInstance( 173 | a.getClass().getComponentType(), size); 174 | int i = 0; 175 | for (Map.Entry entry : entrySet()) 176 | { 177 | a[i++] = entry.getValue(); 178 | } 179 | return a; 180 | } 181 | 182 | /** 183 | * 获取键值对集合 184 | * 185 | * @return 186 | */ 187 | public Set> entrySet() 188 | { 189 | Set> entrySet = new TreeSet>(); 190 | StringBuilder sb = new StringBuilder(); 191 | for (BaseNode node : child) 192 | { 193 | if (node == null) continue; 194 | node.walk(new StringBuilder(sb.toString()), entrySet); 195 | } 196 | return entrySet; 197 | } 198 | 199 | /** 200 | * 键集合 201 | * @return 202 | */ 203 | public Set keySet() 204 | { 205 | TreeSet keySet = new TreeSet(); 206 | for (Map.Entry entry : entrySet()) 207 | { 208 | keySet.add(entry.getKey()); 209 | } 210 | 211 | return keySet; 212 | } 213 | 214 | /** 215 | * 前缀查询 216 | * 217 | * @param key 查询串 218 | * @return 键值对 219 | */ 220 | public Set> prefixSearch(String key) 221 | { 222 | Set> entrySet = new TreeSet>(); 223 | StringBuilder sb = new StringBuilder(key.substring(0, key.length() - 1)); 224 | BaseNode branch = this; 225 | char[] chars = key.toCharArray(); 226 | for (char aChar : chars) 227 | { 228 | if (branch == null) return entrySet; 229 | branch = branch.getChild(aChar); 230 | } 231 | 232 | if (branch == null) return entrySet; 233 | branch.walk(sb, entrySet); 234 | return entrySet; 235 | } 236 | 237 | /** 238 | * 前缀查询，包含值 239 | * 240 | * @param key 键 241 | * @return 键值对列表 242 | */ 243 | public LinkedList> commonPrefixSearchWithValue(String key) 244 | { 245 | char[] chars = key.toCharArray(); 246 | return commonPrefixSearchWithValue(chars, 0); 247 | } 248 | 249 | /** 250 | * 前缀查询，通过字符数组来表示字符串可以优化运行速度 251 | * 252 | * @param chars 字符串的字符数组 253 | * @param begin 开始的下标 254 | * @return 255 | */ 256 | 
public LinkedList> commonPrefixSearchWithValue(char[] chars, int begin) 257 | { 258 | LinkedList> result = new LinkedList>(); 259 | StringBuilder sb = new StringBuilder(); 260 | BaseNode branch = this; 261 | for (int i = begin; i < chars.length; ++i) 262 | { 263 | char aChar = chars[i]; 264 | branch = branch.getChild(aChar); 265 | if (branch == null || branch.status == Status.UNDEFINED_0) return result; 266 | sb.append(aChar); 267 | if (branch.status == Status.WORD_MIDDLE_2 || branch.status == Status.WORD_END_3) 268 | { 269 | result.add(new AbstractMap.SimpleEntry(sb.toString(), (V) branch.value)); 270 | } 271 | } 272 | 273 | return result; 274 | } 275 | 276 | @Override 277 | protected boolean addChild(BaseNode node) 278 | { 279 | boolean add = false; 280 | char c = node.getChar(); 281 | BaseNode target = getChild(c); 282 | if (target == null) 283 | { 284 | child[c] = node; 285 | add = true; 286 | } 287 | else 288 | { 289 | switch (node.status) 290 | { 291 | case UNDEFINED_0: 292 | if (target.status != Status.NOT_WORD_1) 293 | { 294 | target.status = Status.NOT_WORD_1; 295 | add = true; 296 | } 297 | break; 298 | case NOT_WORD_1: 299 | if (target.status == Status.WORD_END_3) 300 | { 301 | target.status = Status.WORD_MIDDLE_2; 302 | } 303 | break; 304 | case WORD_END_3: 305 | if (target.status == Status.NOT_WORD_1) 306 | { 307 | target.status = Status.WORD_MIDDLE_2; 308 | } 309 | if (target.getValue() == null) 310 | { 311 | add = true; 312 | } 313 | target.setValue(node.getValue()); 314 | break; 315 | } 316 | } 317 | return add; 318 | } 319 | 320 | public int size() 321 | { 322 | return size; 323 | } 324 | 325 | @Override 326 | protected char getChar() 327 | { 328 | return 0; // 根节点没有char 329 | } 330 | 331 | @Override 332 | public BaseNode getChild(char c) 333 | { 334 | return child[c]; 335 | } 336 | 337 | public boolean save(String path) 338 | { 339 | try 340 | { 341 | DataOutputStream out = new DataOutputStream(IOUtil.newOutputStream(path)); 342 | for (BaseNode 
node : child) 343 | { 344 | if (node == null) 345 | { 346 | out.writeInt(0); 347 | } 348 | else 349 | { 350 | out.writeInt(1); 351 | node.walkToSave(out); 352 | } 353 | } 354 | out.close(); 355 | } 356 | catch (Exception e) 357 | { 358 | logger.warning("保存到" + path + "失败" + TextUtility.exceptionToString(e)); 359 | return false; 360 | } 361 | 362 | return true; 363 | } 364 | 365 | @Override 366 | public int build(TreeMap keyValueMap) 367 | { 368 | for (Map.Entry entry : keyValueMap.entrySet()) 369 | { 370 | put(entry.getKey(), entry.getValue()); 371 | } 372 | return 0; 373 | } 374 | 375 | /** 376 | * 保存到二进制输出流 377 | * 378 | * @param out 379 | * @return 380 | */ 381 | public boolean save(DataOutputStream out) 382 | { 383 | try 384 | { 385 | for (BaseNode node : child) 386 | { 387 | if (node == null) 388 | { 389 | out.writeInt(0); 390 | } 391 | else 392 | { 393 | out.writeInt(1); 394 | node.walkToSave(out); 395 | } 396 | } 397 | } 398 | catch (Exception e) 399 | { 400 | logger.warning("保存到" + out + "失败" + TextUtility.exceptionToString(e)); 401 | return false; 402 | } 403 | 404 | return true; 405 | } 406 | 407 | @Override 408 | public boolean load(ByteArray byteArray, V[] value) { 409 | return false; 410 | } 411 | 412 | /** 413 | * 从磁盘加载二分数组树 414 | * 415 | * @param path 路径 416 | * @param value 额外提供的值数组，按照值的字典序。（之所以要求提供它，是因为泛型的保存不归树管理） 417 | * @return 是否成功 418 | */ 419 | public boolean load(String path, V[] value) 420 | { 421 | byte[] bytes = IOUtil.readBytes(path); 422 | if (bytes == null) return false; 423 | _ValueArray valueArray = new _ValueArray(value); 424 | ByteArray byteArray = new ByteArray(bytes); 425 | for (int i = 0; i < child.length; ++i) 426 | { 427 | int flag = byteArray.nextInt(); 428 | if (flag == 1) 429 | { 430 | child[i] = new Node(); 431 | child[i].walkToLoad(byteArray, valueArray); 432 | } 433 | } 434 | size = value.length; 435 | 436 | return true; 437 | } 438 | public _ValueArray newValueArray() 439 | { 440 | return new _ValueArray(); 441 | } 442 | 
443 | @Override 444 | public void writeExternal(ObjectOutput out) throws IOException 445 | { 446 | out.writeInt(size); 447 | for (BaseNode node : child) 448 | { 449 | if (node == null) 450 | { 451 | out.writeInt(0); 452 | } 453 | else 454 | { 455 | out.writeInt(1); 456 | node.walkToSave(out); 457 | } 458 | } 459 | } 460 | 461 | @Override 462 | public void readExternal(ObjectInput in) throws IOException, ClassNotFoundException 463 | { 464 | size = in.readInt(); 465 | for (int i = 0; i < child.length; ++i) 466 | { 467 | int flag = in.readInt(); 468 | if (flag == 1) 469 | { 470 | child[i] = new Node(); 471 | child[i].walkToLoad(in); 472 | } 473 | } 474 | } 475 | 476 | /** 477 | * 最长匹配 478 | * 479 | * @param text 文本 480 | * @param processor 处理器 481 | */ 482 | public void parseLongestText(String text, AhoCorasickDoubleArrayTrie.IHit processor) 483 | { 484 | int length = text.length(); 485 | for (int i = 0; i < length; ++i) 486 | { 487 | BaseNode state = transition(text.charAt(i)); 488 | if (state != null) 489 | { 490 | int to = i + 1; 491 | int end = to; 492 | V value = state.getValue(); 493 | for (; to < length; ++to) 494 | { 495 | state = state.transition(text.charAt(to)); 496 | if (state == null) break; 497 | if (state.getValue() != null) 498 | { 499 | value = state.getValue(); 500 | end = to + 1; 501 | } 502 | } 503 | if (value != null) 504 | { 505 | processor.hit(i, end, value); 506 | i = end - 1; 507 | } 508 | } 509 | } 510 | } 511 | 512 | /** 513 | * 最长匹配 514 | * 515 | * @param text 文本 516 | * @param processor 处理器 517 | */ 518 | public void parseLongestText(char[] text, AhoCorasickDoubleArrayTrie.IHit processor) 519 | { 520 | int length = text.length; 521 | for (int i = 0; i < length; ++i) 522 | { 523 | BaseNode state = transition(text[i]); 524 | if (state != null) 525 | { 526 | int to = i + 1; 527 | int end = to; 528 | V value = state.getValue(); 529 | for (; to < length; ++to) 530 | { 531 | state = state.transition(text[to]); 532 | if (state == null) break; 533 
| if (state.getValue() != null) 534 | { 535 | value = state.getValue(); 536 | end = to + 1; 537 | } 538 | } 539 | if (value != null) 540 | { 541 | processor.hit(i, end, value); 542 | i = end - 1; 543 | } 544 | } 545 | } 546 | } 547 | 548 | /** 549 | * 匹配文本 550 | * 551 | * @param text 文本 552 | * @param processor 处理器 553 | */ 554 | public void parseText(String text, AhoCorasickDoubleArrayTrie.IHit processor) 555 | { 556 | int length = text.length(); 557 | int begin = 0; 558 | BaseNode state = this; 559 | 560 | for (int i = begin; i < length; ++i) 561 | { 562 | state = state.transition(text.charAt(i)); 563 | if (state != null) 564 | { 565 | V value = state.getValue(); 566 | if (value != null) 567 | { 568 | processor.hit(begin, i + 1, value); 569 | } 570 | } 571 | else 572 | { 573 | i = begin; 574 | ++begin; 575 | state = this; 576 | } 577 | } 578 | } 579 | 580 | /** 581 | * 匹配文本 582 | * 583 | * @param text 文本 584 | * @param processor 处理器 585 | */ 586 | public void parseText(char[] text, AhoCorasickDoubleArrayTrie.IHit processor) 587 | { 588 | int length = text.length; 589 | int begin = 0; 590 | BaseNode state = this; 591 | 592 | for (int i = begin; i < length; ++i) 593 | { 594 | state = state.transition(text[i]); 595 | if (state != null) 596 | { 597 | V value = state.getValue(); 598 | if (value != null) 599 | { 600 | processor.hit(begin, i + 1, value); 601 | } 602 | } 603 | else 604 | { 605 | i = begin; 606 | ++begin; 607 | state = this; 608 | } 609 | } 610 | } 611 | } 612 | -------------------------------------------------------------------------------- /src/main/java/trie/bintrie/Node.java: -------------------------------------------------------------------------------- 1 | /* 2 | *

3 | * He Han 4 | * hankcs.cn@gmail.com 5 | * 2014/5/3 12:27 6 | * 7 | * 8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/ 9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information. 10 | * 11 | */ 12 | package trie.bintrie; 13 | 14 | 15 | import trie.bintrie.util.ArrayTool; 16 | 17 | /** 18 | * 深度大于等于2的子节点 19 | * 20 | * @author He Han 21 | */ 22 | public class Node extends BaseNode 23 | { 24 | @Override 25 | protected boolean addChild(BaseNode node) 26 | { 27 | boolean add = false; 28 | if (child == null) 29 | { 30 | child = new BaseNode[0]; 31 | } 32 | int index = ArrayTool.binarySearch(child, node); 33 | if (index >= 0) 34 | { 35 | BaseNode target = child[index]; 36 | switch (node.status) 37 | { 38 | case UNDEFINED_0: 39 | if (target.status != Status.NOT_WORD_1) 40 | { 41 | target.status = Status.NOT_WORD_1; 42 | target.value = null; 43 | add = true; 44 | } 45 | break; 46 | case NOT_WORD_1: 47 | if (target.status == Status.WORD_END_3) 48 | { 49 | target.status = Status.WORD_MIDDLE_2; 50 | } 51 | break; 52 | case WORD_END_3: 53 | if (target.status != Status.WORD_END_3) 54 | { 55 | target.status = Status.WORD_MIDDLE_2; 56 | } 57 | if (target.getValue() == null) 58 | { 59 | add = true; 60 | } 61 | target.setValue(node.getValue()); 62 | break; 63 | } 64 | } 65 | else 66 | { 67 | BaseNode newChild[] = new BaseNode[child.length + 1]; 68 | int insert = -(index + 1); 69 | System.arraycopy(child, 0, newChild, 0, insert); 70 | System.arraycopy(child, insert, newChild, insert + 1, child.length - insert); 71 | newChild[insert] = node; 72 | child = newChild; 73 | add = true; 74 | } 75 | return add; 76 | } 77 | 78 | /** 79 | * @param c 节点的字符 80 | * @param status 节点状态 81 | * @param value 值 82 | */ 83 | public Node(char c, Status status, V value) 84 | { 85 | this.c = c; 86 | this.status = status; 87 | this.value = value; 88 | } 89 | 90 | public Node() 91 | { 92 | } 93 | 94 | @Override 95 
| public BaseNode getChild(char c) 96 | { 97 | if (child == null) return null; 98 | int index = ArrayTool.binarySearch(child, c); 99 | if (index < 0) return null; 100 | 101 | return child[index]; 102 | } 103 | } 104 | -------------------------------------------------------------------------------- /src/main/java/trie/bintrie/_ValueArray.java: -------------------------------------------------------------------------------- 1 | /* 2 | *

3 | * hankcs 4 | * me@hankcs.com 5 | * 2015/5/15 10:23 6 | * 7 | * 8 | * Copyright (c) 2003-2015, hankcs. All Right Reserved, http://www.hankcs.com/ 9 | * 10 | */ 11 | package trie.bintrie; 12 | 13 | /** 14 | * 对值数组的包装，可以方便地取下一个 15 | * @author hankcs 16 | */ 17 | public class _ValueArray 18 | { 19 | V[] value; 20 | int offset; 21 | 22 | public _ValueArray(V[] value) 23 | { 24 | this.value = value; 25 | } 26 | 27 | public V nextValue() 28 | { 29 | return value[offset++]; 30 | } 31 | 32 | /** 33 | * 仅仅给子类用，不要用 34 | */ 35 | protected _ValueArray() 36 | { 37 | } 38 | 39 | public _ValueArray setValue(V[] value) 40 | { 41 | this.value = value; 42 | return this; 43 | } 44 | } 45 | -------------------------------------------------------------------------------- /src/main/java/trie/bintrie/util/ArrayTool.java: -------------------------------------------------------------------------------- 1 | /* 2 | *

3 | * He Han 4 | * hankcs.cn@gmail.com 5 | * 2014/5/3 12:32 6 | * 7 | * 8 | * Copyright (c) 2003-2014, 上海林原信息科技有限公司. All Right Reserved, http://www.linrunsoft.com/ 9 | * This source is subject to the LinrunSpace License. Please contact 上海林原信息科技有限公司 to get more information. 10 | * 11 | */ 12 | package trie.bintrie.util; 13 | 14 | 15 | import trie.bintrie.BaseNode; 16 | 17 | /** 18 | * @author He Han 19 | */ 20 | public class ArrayTool 21 | { 22 | /** 23 | * 二分查找 24 | * @param branches 数组 25 | * @param node 要查找的node 26 | * @return 数组下标，小于0表示没找到 27 | */ 28 | public static int binarySearch(BaseNode[] branches, BaseNode node) 29 | { 30 | int high = branches.length - 1; 31 | if (branches.length < 1) 32 | { 33 | return high; 34 | } 35 | int low = 0; 36 | while (low <= high) 37 | { 38 | int mid = (low + high) >>> 1; 39 | int cmp = branches[mid].compareTo(node); 40 | 41 | if (cmp < 0) 42 | low = mid + 1; 43 | else if (cmp > 0) 44 | high = mid - 1; 45 | else 46 | return mid; 47 | } 48 | return -(low + 1); 49 | } 50 | 51 | public static int binarySearch(BaseNode[] branches, char node) 52 | { 53 | int high = branches.length - 1; 54 | if (branches.length < 1) 55 | { 56 | return high; 57 | } 58 | int low = 0; 59 | while (low <= high) 60 | { 61 | int mid = (low + high) >>> 1; 62 | int cmp = branches[mid].compareTo(node); 63 | 64 | if (cmp < 0) 65 | low = mid + 1; 66 | else if (cmp > 0) 67 | high = mid - 1; 68 | else 69 | return mid; 70 | } 71 | return -(low + 1); 72 | } 73 | } 74 | -------------------------------------------------------------------------------- /src/main/java/util/FileUtils.java: -------------------------------------------------------------------------------- 1 | package util; 2 | 3 | 4 | import org.apache.commons.lang.StringUtils; 5 | 6 | import java.io.*; 7 | import java.util.HashSet; 8 | import java.util.LinkedHashSet; 9 | import java.util.List; 10 | import java.util.Map; 11 | 12 | public class FileUtils { 13 | 14 | public static void 
writeMapResultToFile(String outputPath, List> list) { 15 | try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputPath), "utf-8"))) { 16 | for (Map.Entry mapping : list) { 17 | if(StringUtils.isBlank(mapping.getValue())) continue; 18 | writer.write(mapping.getValue() + "\n"); 19 | } 20 | } catch (Exception e) { 21 | e.printStackTrace(); 22 | } 23 | } 24 | // 文件读写 25 | public static String readFileToString(String fileName) { 26 | String encoding = "UTF-8"; 27 | File file = new File(fileName); 28 | 29 | if (!file.exists()) 30 | 31 | try { 32 | file.createNewFile(); 33 | } catch (IOException e) { 34 | e.printStackTrace(); 35 | } 36 | 37 | Long filelength = file.length(); 38 | byte[] filecontent = new byte[filelength.intValue()]; 39 | try { 40 | if (!file.exists()) file.createNewFile(); 41 | 42 | 43 | FileInputStream in = new FileInputStream(file); 44 | in.read(filecontent); 45 | in.close(); 46 | } catch (FileNotFoundException e) { 47 | e.printStackTrace(); 48 | } catch (IOException e) { 49 | e.printStackTrace(); 50 | } 51 | try { 52 | return new String(filecontent, encoding); 53 | } catch (UnsupportedEncodingException e) { 54 | System.err.println("The OS does not support " + encoding); 55 | e.printStackTrace(); 56 | return null; 57 | } 58 | } 59 | 60 | 61 | public static void writeFileToPath(String outPutPath, LinkedHashSet stringSet) { 62 | try { 63 | FileOutputStream writer = new FileOutputStream(outPutPath); 64 | OutputStreamWriter bw = new OutputStreamWriter(writer, "UTF-8"); // 以utf-8写结果 65 | stringSet.forEach(it -> { 66 | if (StringUtils.isNotBlank(it)) { 67 | String result = it + " "; 68 | try { 69 | // System.out.println(result); 70 | bw.write(result); 71 | } catch (IOException e) { 72 | e.printStackTrace(); 73 | } 74 | } 75 | }); 76 | 77 | bw.close(); 78 | writer.close(); 79 | } catch (IOException ex) { 80 | ex.printStackTrace(); 81 | } 82 | } 83 | 84 | 85 | public static void writeFileToPath(String outPutPath, List 
stringSet) { 86 | try { 87 | FileOutputStream writer = new FileOutputStream(outPutPath); 88 | OutputStreamWriter bw = new OutputStreamWriter(writer, "UTF-8"); // 以utf-8写结果 89 | stringSet.forEach(it -> { 90 | String result = it + "\n"; 91 | try { 92 | bw.write(result); 93 | } catch (IOException e) { 94 | e.printStackTrace(); 95 | } 96 | }); 97 | 98 | bw.close(); 99 | writer.close(); 100 | } catch (IOException ex) { 101 | ex.printStackTrace(); 102 | } 103 | } 104 | 105 | 106 | public static void writeFileToPath(String outPutPath, List list, Map wcMap) { 107 | try { 108 | FileOutputStream writer = new FileOutputStream(outPutPath); 109 | 110 | OutputStreamWriter bw = new OutputStreamWriter(writer, "UTF-8"); // 以utf-8写结果 111 | list.forEach(it -> { 112 | if (StringUtils.isNotBlank(it) && wcMap.get(it) >= 4) { 113 | try { 114 | bw.write(it + " -> " + wcMap.get(it) + "\n"); 115 | } catch (IOException e) { 116 | e.printStackTrace(); 117 | } 118 | } 119 | }); 120 | 121 | bw.close(); 122 | writer.close(); 123 | } catch (IOException ex) { 124 | ex.printStackTrace(); 125 | } 126 | } 127 | 128 | // 按行读取进集合 129 | public static HashSet readFileByLineToHashSet(String inputFilePath) { 130 | HashSet set = new HashSet(); 131 | try { 132 | // 以utf-8读取文件 133 | FileInputStream fis = new FileInputStream(inputFilePath); 134 | InputStreamReader reader = new InputStreamReader(fis, "UTF-8"); 135 | BufferedReader br = new BufferedReader(reader); 136 | String str = null; 137 | while ((str = br.readLine()) != null) { 138 | set.add(str); 139 | } 140 | br.close(); 141 | reader.close(); 142 | } catch (FileNotFoundException e) { 143 | e.printStackTrace(); 144 | } catch (IOException e) { 145 | e.printStackTrace(); 146 | } 147 | return set; 148 | } 149 | 150 | // 按行读取进集合 151 | public static void writeStringToFile(String outPutPath, String text) { 152 | try { 153 | FileOutputStream writer = new FileOutputStream(outPutPath); 154 | OutputStreamWriter bw = new OutputStreamWriter(writer, "UTF-8"); // 
以utf-8写结果 155 | bw.write(text); 156 | bw.close(); 157 | writer.close(); 158 | } catch (IOException ex) { 159 | ex.printStackTrace(); 160 | } 161 | } 162 | 163 | public static void writeResultToFile(String outputPath, List list) { 164 | try (BufferedWriter writer = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(outputPath), "utf-8"))) { 165 | for (String text : list) { 166 | if (StringUtils.isNotBlank(text)) { 167 | writer.write(text + "\n"); 168 | } 169 | } 170 | } catch (Exception e) { 171 | e.printStackTrace(); 172 | } 173 | } 174 | } 175 | -------------------------------------------------------------------------------- /src/main/java/util/HanUtils.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/main/java/util/HanUtils.java -------------------------------------------------------------------------------- /src/main/java/util/Predefine.java: -------------------------------------------------------------------------------- 1 | /* 2 | *

/**
 * Predefined static global values.
 */
public class Predefine {

    /** Characters that are treated as Chinese numerals. */
    public static final String CHINESE_NUMBERS = "零○〇一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟";

    /**
     * Path of hanlp.properties. It normally lives on the classpath, but in
     * unusual situations (non-standard JVM, etc.) it may be set to an
     * absolute path.
     */
    public static String HANLP_PROPERTIES_PATH;

    public static final double MIN_PROBABILITY = 1e-10;

    /** Regular expression for floating point numbers. */
    public static final Pattern PATTERN_FLOAT_NUMBER = Pattern.compile("^(-?\\d+)(\\.\\d+)?$");

    public static String POSTFIX_SINGLE =
        "坝邦堡城池村单岛道堤店洞渡队峰府冈港阁宫沟国海号河湖环集江礁角街井郡坑口矿里岭楼路门盟庙弄牌派坡铺旗桥区渠泉山省市水寺塔台滩坛堂厅亭屯湾屋溪峡县线乡巷洋窑营屿园苑院闸寨站镇州庄族陂庵町";

    public static final String[] POSTFIX_MUTIPLE = {
        "半岛", "草原", "城区", "大堤", "大公国", "大桥", "地区",
        "帝国", "渡槽", "港口", "高速公路", "高原", "公路", "公园", "共和国", "谷地", "广场",
        "国道", "海峡", "胡同", "机场", "集镇", "教区", "街道", "口岸", "码头", "煤矿",
        "牧场", "农场", "盆地", "平原", "丘陵", "群岛", "沙漠", "沙洲", "山脉", "山丘",
        "水库", "隧道", "特区", "铁路", "新村", "雪峰", "盐场", "盐湖", "渔场", "直辖市",
        "自治区", "自治县", "自治州"
    };

    // Separator types
    public static String SEPERATOR_C_SENTENCE = "。！？：；…";
    public static String SEPERATOR_C_SUB_SENTENCE = "、，（）“”‘’";
    public static String SEPERATOR_E_SENTENCE = "!?:;";
    // Note: the original program used ",()\042'"; "\042" was read as ASCII
    // character 42 (decimal), i.e. '*'.
    public static String SEPERATOR_E_SUB_SENTENCE = ",()*'";
    public static String SEPERATOR_LINK = "\n\r 　";

    // Separator between two words
    public static String WORD_SEGMENTER = "@";

    public static int MAX_SEGMENT_NUM = 10;

    /** Current total word frequency. */
    public static final int MAX_FREQUENCY = 25146057;

    /** Smoothing factor. */
    public static final double dTemp = (double) 1 / MAX_FREQUENCY + 0.00001;

    /** Smoothing parameter. */
    public static final double dSmoothingPara = 0.1;

    /** Place (ns). */
    public static final String TAG_PLACE = "未##地";

    /** Beginning of a sentence. */
    public static final String TAG_BIGIN = "始##始";

    /** Other. */
    public static final String TAG_OTHER = "未##它";

    /** Group / organisation noun (nt). */
    public static final String TAG_GROUP = "未##团";

    /** Numeral (m). */
    public static final String TAG_NUMBER = "未##数";

    /** Quantifier (mq); arguably should be handled the same way as numerals. */
    public static final String TAG_QUANTIFIER = "未##量";

    /** Proper noun (nx). */
    public static final String TAG_PROPER = "未##专";

    /** Time (t). */
    public static final String TAG_TIME = "未##时";

    /** Character string (x). */
    public static final String TAG_CLUSTER = "未##串";

    /** End of a sentence. */
    public static final String TAG_END = "末##末";

    /** Person name (nr). */
    public static final String TAG_PEOPLE = "未##人";

    /** Logging component; its level is set to WARNING when the class is initialised. */
    public static Logger logger = Logger.getLogger("HanLP");

    static {
        logger.setLevel(Level.WARNING);
    }

    /** File extension of trie files. */
    public static final String TRIE_EXT = ".trie.dat";

    /** File extension of value files. */
    public static final String VALUE_EXT = ".value.dat";

    /** File extension of reversed files. */
    public static final String REVERSE_EXT = ".reverse";

    /** File extension of binary files. */
    public static final String BIN_EXT = ".bin";
}
5 | import pojo.Sentence; 6 | import pojo.Word; 7 | 8 | import java.io.*; 9 | import java.util.Collection; 10 | import java.util.Iterator; 11 | import java.util.List; 12 | 13 | import static pojo.CharType.*; 14 | 15 | 16 | /** 17 | * 文本工具类 18 | */ 19 | public class TextUtility 20 | { 21 | 22 | public static int charType(char c) 23 | { 24 | return charType(String.valueOf(c)); 25 | } 26 | 27 | /** 28 | * 判断字符类型 29 | * @param str 30 | * @return 31 | */ 32 | public static int charType(String str) 33 | { 34 | if (str != null && str.length() > 0) 35 | { 36 | if (Predefine.CHINESE_NUMBERS.contains(str)) return CT_CNUM; 37 | byte[] b; 38 | try 39 | { 40 | b = str.getBytes("GBK"); 41 | } 42 | catch (UnsupportedEncodingException e) 43 | { 44 | b = str.getBytes(); 45 | e.printStackTrace(); 46 | } 47 | byte b1 = b[0]; 48 | byte b2 = b.length > 1 ? b[1] : 0; 49 | int ub1 = getUnsigned(b1); 50 | int ub2 = getUnsigned(b2); 51 | if (ub1 < 128) 52 | { 53 | if (ub1 < 32) return CT_DELIMITER; // NON PRINTABLE CHARACTERS 54 | if (' ' == b1) return CT_OTHER; 55 | if ('\n' == b1) return CT_DELIMITER; 56 | if ("*\"!,.?()[]{}+=/\\;:|".indexOf((char) b1) != -1) 57 | return CT_DELIMITER; 58 | if ("0123456789".indexOf((char)b1) != -1) 59 | return CT_NUM; 60 | return CT_SINGLE; 61 | } 62 | else if (ub1 == 162) 63 | return CT_INDEX; 64 | else if (ub1 == 163 && ub2 > 175 && ub2 < 186) 65 | return CT_NUM; 66 | else if (ub1 == 163 67 | && (ub2 >= 193 && ub2 <= 218 || ub2 >= 225 68 | && ub2 <= 250)) 69 | return CT_LETTER; 70 | else if (ub1 == 161 || ub1 == 163) 71 | return CT_DELIMITER; 72 | else if (ub1 >= 176 && ub1 <= 247) 73 | return CT_CHINESE; 74 | 75 | } 76 | return CT_OTHER; 77 | } 78 | 79 | /** 80 | * 是否全是中文 81 | * @param str 82 | * @return 83 | */ 84 | public static boolean isAllChinese(String str) 85 | { 86 | return str.matches("[\\u4E00-\\u9FA5]+"); 87 | } 88 | /** 89 | * 是否全部不是中文 90 | * @param sString 91 | * @return 92 | */ 93 | public static boolean isAllNonChinese(byte[] sString) 94 
| { 95 | int nLen = sString.length; 96 | int i = 0; 97 | 98 | while (i < nLen) 99 | { 100 | if (getUnsigned(sString[i]) < 248 && getUnsigned(sString[i]) > 175) 101 | return false; 102 | if (sString[i] < 0) 103 | i += 2; 104 | else 105 | i += 1; 106 | } 107 | return true; 108 | } 109 | 110 | /** 111 | * 是否全是单字节 112 | * @param str 113 | * @return 114 | */ 115 | public static boolean isAllSingleByte(String str) 116 | { 117 | assert str != null; 118 | for (int i = 0; i < str.length(); i++) 119 | { 120 | if (str.charAt(i) >128) 121 | { 122 | return false; 123 | } 124 | } 125 | return true; 126 | } 127 | 128 | /** 129 | * 把表示数字含义的字符串转成整形 130 | * 131 | * @param str 要转换的字符串 132 | * @return 如果是有意义的整数，则返回此整数值。否则，返回-1。 133 | */ 134 | public static int cint(String str) 135 | { 136 | if (str != null) 137 | try 138 | { 139 | int i = new Integer(str).intValue(); 140 | return i; 141 | } 142 | catch (NumberFormatException e) 143 | { 144 | 145 | } 146 | 147 | return -1; 148 | } 149 | /** 150 | * 是否全是数字 151 | * @param str 152 | * @return 153 | */ 154 | public static boolean isAllNum(String str) 155 | { 156 | if (str == null) 157 | return false; 158 | 159 | int i = 0; 160 | /** 判断开头是否是+-之类的符号 */ 161 | if ("±+-＋－—".indexOf(str.charAt(0)) != -1) 162 | i++; 163 | /** 如果是全角的０１２３４５６７８９字符* */ 164 | while (i < str.length() && "０１２３４５６７８９".indexOf(str.charAt(i)) != -1) 165 | i++; 166 | // Get middle delimiter such as . 167 | if (i > 0 && i < str.length()) 168 | { 169 | char ch = str.charAt(i); 170 | if ("·∶:，,．.／/".indexOf(ch) != -1) 171 | {// 98．1％ 172 | i++; 173 | while (i < str.length() && "０１２３４５６７８９".indexOf(str.charAt(i)) != -1) 174 | i++; 175 | } 176 | } 177 | if (i >= str.length()) 178 | return true; 179 | 180 | /** 如果是半角的0123456789字符* */ 181 | while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) 182 | i++; 183 | // Get middle delimiter such as . 184 | if (i > 0 && i < str.length()) 185 | { 186 | char ch = str.charAt(i); 187 | if (',' == ch || '.' 
== ch || '/' == ch || ':' == ch || "∶·，．／".indexOf(ch) != -1) 188 | {// 98．1％ 189 | i++; 190 | while (i < str.length() && "0123456789".indexOf(str.charAt(i)) != -1) 191 | i++; 192 | } 193 | } 194 | 195 | if (i < str.length()) 196 | { 197 | if ("百千万亿佰仟%％‰".indexOf(str.charAt(i)) != -1) 198 | i++; 199 | } 200 | if (i >= str.length()) 201 | return true; 202 | 203 | return false; 204 | } 205 | 206 | /** 207 | * 是否全是序号 208 | * @param sString 209 | * @return 210 | */ 211 | public static boolean isAllIndex(byte[] sString) 212 | { 213 | int nLen = sString.length; 214 | int i = 0; 215 | 216 | while (i < nLen - 1 && getUnsigned(sString[i]) == 162) 217 | { 218 | i += 2; 219 | } 220 | if (i >= nLen) 221 | return true; 222 | while (i < nLen && (sString[i] > 'A' - 1 && sString[i] < 'Z' + 1) 223 | || (sString[i] > 'a' - 1 && sString[i] < 'z' + 1)) 224 | {// single 225 | // byte 226 | // number 227 | // char 228 | i += 1; 229 | } 230 | 231 | if (i < nLen) 232 | return false; 233 | return true; 234 | 235 | } 236 | 237 | /** 238 | * 是否全为英文 239 | * 240 | * @param text 241 | * @return 242 | */ 243 | public static boolean isAllLetter(String text) 244 | { 245 | for (int i = 0; i < text.length(); ++i) 246 | { 247 | char c = text.charAt(i); 248 | if ((((c < 'a' || c > 'z')) && ((c < 'A' || c > 'Z')))) 249 | { 250 | return false; 251 | } 252 | } 253 | 254 | return true; 255 | } 256 | 257 | /** 258 | * 是否全为英文或字母 259 | * 260 | * @param text 261 | * @return 262 | */ 263 | public static boolean isAllLetterOrNum(String text) 264 | { 265 | for (int i = 0; i < text.length(); ++i) 266 | { 267 | char c = text.charAt(i); 268 | if ((((c < 'a' || c > 'z')) && ((c < 'A' || c > 'Z')) && ((c < '0' || c > '9')))) 269 | { 270 | return false; 271 | } 272 | } 273 | 274 | return true; 275 | } 276 | 277 | /** 278 | * 是否全是分隔符 279 | * @param sString 280 | * @return 281 | */ 282 | public static boolean isAllDelimiter(byte[] sString) 283 | { 284 | int nLen = sString.length; 285 | int i = 0; 286 | 287 | while (i < 
nLen - 1 && (getUnsigned(sString[i]) == 161 || getUnsigned(sString[i]) == 163)) 288 | { 289 | i += 2; 290 | } 291 | if (i < nLen) 292 | return false; 293 | return true; 294 | } 295 | 296 | /** 297 | * 是否全是中国数字 298 | * @param word 299 | * @return 300 | */ 301 | public static boolean isAllChineseNum(String word) 302 | {// 百分之五点六的人早上八点十八分起床 303 | 304 | String chineseNum = "零○一二两三四五六七八九十廿百千万亿壹贰叁肆伍陆柒捌玖拾佰仟∶·．／点";// 305 | String prefix = "几数上第"; 306 | String surfix = "几多余来成倍"; 307 | boolean round = false; 308 | 309 | if (word == null) 310 | return false; 311 | 312 | char[] temp = word.toCharArray(); 313 | for (int i = 0; i < temp.length; i++) 314 | { 315 | if (word.startsWith("分之", i))// 百分之五 316 | { 317 | i += 1; 318 | continue; 319 | } 320 | char tchar = temp[i]; 321 | if (i == 0 && prefix.indexOf(tchar) != -1) 322 | { 323 | round = true; 324 | } 325 | else if (i == temp.length-1 && !round && surfix.indexOf(tchar) != -1) 326 | { 327 | round = true; 328 | } 329 | else if (chineseNum.indexOf(tchar) == -1) 330 | return false; 331 | } 332 | return true; 333 | } 334 | 335 | 336 | /** 337 | * 得到字符集的字符在字符串中出现的次数 338 | * 339 | * @param charSet 340 | * @param word 341 | * @return 342 | */ 343 | public static int getCharCount(String charSet, String word) 344 | { 345 | int nCount = 0; 346 | 347 | if (word != null) 348 | { 349 | String temp = word + " "; 350 | for (int i = 0; i < word.length(); i++) 351 | { 352 | String s = temp.substring(i, i + 1); 353 | if (charSet.indexOf(s) != -1) 354 | nCount++; 355 | } 356 | } 357 | 358 | return nCount; 359 | } 360 | 361 | 362 | /** 363 | * 获取字节对应的无符号整型数 364 | * 365 | * @param b 366 | * @return 367 | */ 368 | public static int getUnsigned(byte b) 369 | { 370 | if (b > 0) 371 | return (int) b; 372 | else 373 | return (b & 0x7F + 128); 374 | } 375 | 376 | /** 377 | * 判断字符串是否是年份 378 | * 379 | * @param snum 380 | * @return 381 | */ 382 | public static boolean isYearTime(String snum) 383 | { 384 | if (snum != null) 385 | { 386 | int len = 
snum.length(); 387 | String first = snum.substring(0, 1); 388 | 389 | // 1992年, 98年,06年 390 | if (isAllSingleByte(snum) 391 | && (len == 4 || len == 2 && (cint(first) > 4 || cint(first) == 0))) 392 | return true; 393 | if (isAllNum(snum) && (len >= 3 || len == 2 && "０５６７８９".indexOf(first) != -1)) 394 | return true; 395 | if (getCharCount("零○一二三四五六七八九壹贰叁肆伍陆柒捌玖", snum) == len && len >= 2) 396 | return true; 397 | if (len == 4 && getCharCount("千仟零○", snum) == 2)// 二仟零二年 398 | return true; 399 | if (len == 1 && getCharCount("千仟", snum) == 1) 400 | return true; 401 | if (len == 2 && getCharCount("甲乙丙丁戊己庚辛壬癸", snum) == 1 402 | && getCharCount("子丑寅卯辰巳午未申酉戌亥", snum.substring(1)) == 1) 403 | return true; 404 | } 405 | return false; 406 | } 407 | 408 | /** 409 | * 判断一个字符串的所有字符是否在另一个字符串集合中 410 | * 411 | * @param aggr 字符串集合 412 | * @param str 需要判断的字符串 413 | * @return 414 | */ 415 | public static boolean isInAggregate(String aggr, String str) 416 | { 417 | if (aggr != null && str != null) 418 | { 419 | str += "1"; 420 | for (int i = 0; i < str.length(); i++) 421 | { 422 | String s = str.substring(i, i + 1); 423 | if (aggr.indexOf(s) == -1) 424 | return false; 425 | } 426 | return true; 427 | } 428 | 429 | return false; 430 | } 431 | 432 | /** 433 | * 判断该字符串是否是半角字符 434 | * 435 | * @param str 436 | * @return 437 | */ 438 | public static boolean isDBCCase(String str) 439 | { 440 | if (str != null) 441 | { 442 | str += " "; 443 | for (int i = 0; i < str.length(); i++) 444 | { 445 | String s = str.substring(i, i + 1); 446 | int length = 0; 447 | try 448 | { 449 | length = s.getBytes("GBK").length; 450 | } 451 | catch (UnsupportedEncodingException e) 452 | { 453 | e.printStackTrace(); 454 | length = s.getBytes().length; 455 | } 456 | if (length != 1) 457 | return false; 458 | } 459 | 460 | return true; 461 | } 462 | 463 | return false; 464 | } 465 | 466 | /** 467 | * 判断该字符串是否是全角字符 468 | * 469 | * @param str 470 | * @return 471 | */ 472 | public static boolean isSBCCase(String str) 
473 | { 474 | if (str != null) 475 | { 476 | str += " "; 477 | for (int i = 0; i < str.length(); i++) 478 | { 479 | String s = str.substring(i, i + 1); 480 | int length = 0; 481 | try 482 | { 483 | length = s.getBytes("GBK").length; 484 | } 485 | catch (UnsupportedEncodingException e) 486 | { 487 | e.printStackTrace(); 488 | length = s.getBytes().length; 489 | } 490 | if (length != 2) 491 | return false; 492 | } 493 | 494 | return true; 495 | } 496 | 497 | return false; 498 | } 499 | 500 | /** 501 | * 判断是否是一个连字符（分隔符） 502 | * 503 | * @param str 504 | * @return 505 | */ 506 | public static boolean isDelimiter(String str) 507 | { 508 | if (str != null && ("-".equals(str) || "－".equals(str))) 509 | return true; 510 | else 511 | return false; 512 | } 513 | 514 | public static boolean isUnknownWord(String word) 515 | { 516 | if (word != null && word.indexOf("未##") == 0) 517 | return true; 518 | else 519 | return false; 520 | } 521 | 522 | /** 523 | * 防止频率为0发生除零错误 524 | * 525 | * @param frequency 526 | * @return 527 | */ 528 | public static double nonZero(double frequency) 529 | { 530 | if (frequency == 0) return 1e-3; 531 | 532 | return frequency; 533 | } 534 | 535 | /** 536 | * 转换long型为char数组 537 | * 538 | * @param x 539 | */ 540 | public static char[] long2char(long x) 541 | { 542 | char[] c = new char[4]; 543 | c[0] = (char) (x >> 48); 544 | c[1] = (char) (x >> 32); 545 | c[2] = (char) (x >> 16); 546 | c[3] = (char) (x); 547 | return c; 548 | } 549 | 550 | /** 551 | * 转换long类型为string 552 | * 553 | * @param x 554 | * @return 555 | */ 556 | public static String long2String(long x) 557 | { 558 | char[] cArray = long2char(x); 559 | StringBuilder sbResult = new StringBuilder(cArray.length); 560 | for (char c : cArray) 561 | { 562 | sbResult.append(c); 563 | } 564 | return sbResult.toString(); 565 | } 566 | 567 | /** 568 | * 将异常转为字符串 569 | * 570 | * @param e 571 | * @return 572 | */ 573 | public static String exceptionToString(Exception e) 574 | { 575 | StringWriter sw = 
new StringWriter(); 576 | PrintWriter pw = new PrintWriter(sw); 577 | e.printStackTrace(pw); 578 | return sw.toString(); 579 | } 580 | 581 | /** 582 | * 判断某个字符是否为汉字 583 | * 584 | * @param c 需要判断的字符 585 | * @return 是汉字返回true，否则返回false 586 | */ 587 | public static boolean isChinese(char c) 588 | { 589 | String regex = "[\\u4e00-\\u9fa5]"; 590 | return String.valueOf(c).matches(regex); 591 | } 592 | 593 | /** 594 | * 统计 keyword 在 srcText 中的出现次数 595 | * 596 | * @param keyword 597 | * @param srcText 598 | * @return 599 | */ 600 | public static int count(String keyword, String srcText) 601 | { 602 | int count = 0; 603 | int leng = srcText.length(); 604 | int j = 0; 605 | for (int i = 0; i < leng; i++) 606 | { 607 | if (srcText.charAt(i) == keyword.charAt(j)) 608 | { 609 | j++; 610 | if (j == keyword.length()) 611 | { 612 | count++; 613 | j = 0; 614 | } 615 | } 616 | else 617 | { 618 | i = i - j;// should rollback when not match 619 | j = 0; 620 | } 621 | } 622 | 623 | return count; 624 | } 625 | 626 | /** 627 | * 简单好用的写String方式 628 | * 629 | * @param s 630 | * @param out 631 | * @throws IOException 632 | */ 633 | public static void writeString(String s, DataOutputStream out) throws IOException 634 | { 635 | out.writeInt(s.length()); 636 | for (char c : s.toCharArray()) 637 | { 638 | out.writeChar(c); 639 | } 640 | } 641 | 642 | /** 643 | * 判断字符串是否为空（null和空格） 644 | * 645 | * @param cs 646 | * @return 647 | */ 648 | public static boolean isBlank(CharSequence cs) 649 | { 650 | int strLen; 651 | if (cs == null || (strLen = cs.length()) == 0) 652 | { 653 | return true; 654 | } 655 | for (int i = 0; i < strLen; i++) 656 | { 657 | if (!Character.isWhitespace(cs.charAt(i))) 658 | { 659 | return false; 660 | } 661 | } 662 | return true; 663 | } 664 | 665 | public static String join(String delimiter, Collection stringCollection) 666 | { 667 | StringBuilder sb = new StringBuilder(stringCollection.size() * (16 + delimiter.length())); 668 | for (String str : stringCollection) 669 | { 
670 | sb.append(str).append(delimiter); 671 | } 672 | 673 | return sb.toString(); 674 | } 675 | 676 | public static String combine(String... termArray) 677 | { 678 | StringBuilder sbSentence = new StringBuilder(); 679 | for (String word : termArray) 680 | { 681 | sbSentence.append(word); 682 | } 683 | return sbSentence.toString(); 684 | } 685 | 686 | public static String join(Iterable s, String delimiter) 687 | { 688 | Iterator iter = s.iterator(); 689 | if (!iter.hasNext()) return ""; 690 | StringBuilder buffer = new StringBuilder(iter.next()); 691 | while (iter.hasNext()) buffer.append(delimiter).append(iter.next()); 692 | return buffer.toString(); 693 | } 694 | 695 | public static String combine(Sentence sentence) 696 | { 697 | StringBuilder sb = new StringBuilder(sentence.wordList.size() * 3); 698 | for (IWord word : sentence.wordList) 699 | { 700 | sb.append(word.getValue()); 701 | } 702 | 703 | return sb.toString(); 704 | } 705 | 706 | public static String combine(List wordList) 707 | { 708 | StringBuilder sb = new StringBuilder(wordList.size() * 3); 709 | for (IWord word : wordList) 710 | { 711 | sb.append(word.getValue()); 712 | } 713 | 714 | return sb.toString(); 715 | } 716 | } 717 | -------------------------------------------------------------------------------- /src/test/java/SegmentTest/SegTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/test/java/SegmentTest/SegTest.java -------------------------------------------------------------------------------- /src/test/java/SegmentTest/WordCountTest.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/test/java/SegmentTest/WordCountTest.java 
-------------------------------------------------------------------------------- /src/test/java/concurrent/SegCountProcess.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/shanjunwei/SegAndNewWordDiscover/610172db9c494bb260444b055fcb107e68c3b995/src/test/java/concurrent/SegCountProcess.java --------------------------------------------------------------------------------