├── .gitignore
├── README.md
└── src
    └── LingoesLd2Reader.java

/.gitignore:
--------------------------------------------------------------------------------
target/
out/
.idea/
Dict.iml
indexFile
*.iml
*.indexFile
\#*\#
*.log
*.log.*
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# LingoesStarDictConverter

This project converts Lingoes LD2 dictionary files into the StarDict format. It is only a small modification of Xiaoyun Zhu's original reader: English and German dictionaries have been tested and work; Russian dictionaries still seem to have encoding problems, which will be addressed in the future.

# Usage

Change `ld2File` in the `main` function of the source to the path of your .ld2 file; four files will then be produced:

## .inflated

The decompressed dictionary data

## .ifo
The StarDict dictionary information file

## .idx
The StarDict dictionary index file

## .dict
The StarDict dictionary definition file

The .ifo, .idx and .dict files are the three files StarDict needs; an example is shown below.
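
For example, converting a dictionary named `Example.ld2` (an illustrative name) yields `Example.ld2.inflated`, `Example.ld2.ifo`, `Example.ld2.idx` and `Example.ld2.dict`. The `.ifo` file is a short text file written by `extract()`; the word count and index size below are illustrative values:

```
StarDict's dict ifo file
version=2.4.2
wordcount=43210
idxfilesize=987654
bookname=Example
```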

--------------------------------------------------------------------------------
/src/LingoesLd2Reader.java:
--------------------------------------------------------------------------------
import java.io.*;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.CharBuffer;
import java.nio.channels.FileChannel;
import java.nio.charset.CharacterCodingException;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.nio.charset.CoderResult;
import java.nio.charset.CodingErrorAction;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.zip.Inflater;
import java.util.zip.InflaterInputStream;

/**
 * Lingoes LD2/LDF File Reader
 *
 * <pre>
 * Lingoes Format overview:
 *
 * General Information:
 * - Dictionary data are stored in deflate streams.
 * - Index group information is stored in an index array in the LD2 file itself.
 * - Numbers use little-endian byte order.
 * - Definitions and XML data are encoded in UTF-8 or UTF-16LE.
 *
 * LD2 file schema:
 * - File Header
 * - File Description
 * - Additional Information (optional)
 * - Index Group (corresponds to definitions in dictionary)
 * - Deflated Dictionary Streams
 * -- Index Data
 * --- Offsets of definitions
 * --- Offsets of translations
 * --- Flags
 * --- References to other translations
 * -- Definitions
 * -- Translations (xml)
 *
 * TODO: find encoding / language fields to replace auto-detect of encodings
 *
 * </pre>
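 *
 * Header fields read by {@code main} (byte offsets into the LD2 file, as interpreted by this
 * reader):
 * <pre>
 * 0x00  4 bytes  file type signature (printed as ASCII)
 * 0x18  int16    format version, major part
 * 0x1A  int16    format version, minor part
 * 0x1C  int64    dictionary id
 * 0x5C  int32    offset of the description block (add 0x60 for the absolute position)
 * </pre>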
 *
 * @author keke
 *
 */
public class LingoesLd2Reader {
    private static final SensitiveStringDecoder[] AVAIL_ENCODINGS = {
            new SensitiveStringDecoder(Charset.forName("UTF-8")),
            new SensitiveStringDecoder(Charset.forName("UTF-16LE")),
            new SensitiveStringDecoder(Charset.forName("UTF-16BE")),
            new SensitiveStringDecoder(Charset.forName("EUC-JP")) };

    public static void main(final String[] args) throws IOException {
        final String ld2File = "E:/FTP/LingoesDict/Vicon Russian-English Dictionary.ld2";
        // read lingoes ld2 into byte array
        final ByteBuffer dataRawBytes;
        RandomAccessFile file = new RandomAccessFile(ld2File, "r");
        FileChannel fChannel = file.getChannel();
        dataRawBytes = ByteBuffer.allocate((int) fChannel.size());
        fChannel.read(dataRawBytes);

        dataRawBytes.order(ByteOrder.LITTLE_ENDIAN);
        dataRawBytes.rewind();

        System.out.println("文件:" + ld2File);
        System.out.println("类型:" + new String(dataRawBytes.array(), 0, 4, "ASCII"));
        System.out.println("版本:" + dataRawBytes.getShort(0x18) + "." + dataRawBytes.getShort(0x1A));
        System.out.println("ID: 0x" + Long.toHexString(dataRawBytes.getLong(0x1C)));

        final int offsetData = dataRawBytes.getInt(0x5C) + 0x60;
        if (dataRawBytes.limit() > offsetData) {
            System.out.println("简介地址:0x" + Integer.toHexString(offsetData));
            final int type = dataRawBytes.getInt(offsetData);
            System.out.println("简介类型:0x" + Integer.toHexString(type));
            final int offsetWithInfo = dataRawBytes.getInt(offsetData + 4) + offsetData + 12;
            if (type == 3) {
                // without additional information
                LingoesLd2Reader.readDictionary(ld2File, dataRawBytes, offsetData);
            } else if (dataRawBytes.limit() > (offsetWithInfo - 0x1C)) {
                LingoesLd2Reader.readDictionary(ld2File, dataRawBytes, offsetWithInfo);
            } else {
                System.err.println("文件不包含字典数据。网上字典?");
            }
        } else {
            System.err.println("文件不包含字典数据。网上字典?");
        }
    }

    private static long decompress(final String inflatedFile, final ByteBuffer data,
            final int offset, final int length, final boolean append) throws IOException {
        final Inflater inflator = new Inflater();
        final InflaterInputStream in = new InflaterInputStream(
                new ByteArrayInputStream(data.array(), offset, length), inflator, 1024 * 8);
        final FileOutputStream out = new FileOutputStream(inflatedFile, append);

        LingoesLd2Reader.writeInputStream(in, out);
        final long bytesRead = inflator.getBytesRead();
        inflator.end();
        return bytesRead;
    }

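    /**
     * Tries every pair of candidate charsets from {@link #AVAIL_ENCODINGS} (one for the headwords,
     * one for the XML definitions) on the first few index entries. The decoders are strict, so a
     * wrong guess throws and the next combination is tried; if none fits, UTF-16LE is assumed for
     * both.
     */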
    private static SensitiveStringDecoder[] detectEncodings(final ByteBuffer inflatedBytes,
            final int offsetWords, final int offsetXml, final int defTotal, final int dataLen,
            final int[] idxData, final String[] defData) {
        final int test = Math.min(defTotal, 10);
        for (int j = 0; j < LingoesLd2Reader.AVAIL_ENCODINGS.length; j++) {
            for (int k = 0; k < LingoesLd2Reader.AVAIL_ENCODINGS.length; k++) {
                try {
                    for (int i = 0; i < test; i++) {
                        LingoesLd2Reader.readDefinitionData(inflatedBytes, offsetWords, offsetXml,
                                dataLen, LingoesLd2Reader.AVAIL_ENCODINGS[j],
                                LingoesLd2Reader.AVAIL_ENCODINGS[k], idxData, defData, i);
                    }
                    System.out.println("词组编码:" + LingoesLd2Reader.AVAIL_ENCODINGS[j].name);
                    System.out.println("XML编码:" + LingoesLd2Reader.AVAIL_ENCODINGS[k].name);
                    return new SensitiveStringDecoder[] { LingoesLd2Reader.AVAIL_ENCODINGS[j],
                            LingoesLd2Reader.AVAIL_ENCODINGS[k] };
                } catch (final Throwable e) {
                    // ignore: a decoding error just means this charset combination is wrong
                }
            }
        }
        System.err.println("自动识别编码失败!选择UTF-16LE继续。");
        return new SensitiveStringDecoder[] { LingoesLd2Reader.AVAIL_ENCODINGS[1],
                LingoesLd2Reader.AVAIL_ENCODINGS[1] };
    }

    private static void extract(final String inflatedFile, final String indexFile,
            final String extractedOutputFile, final String informationFile,
            final int offsetDefs, final int offsetXml)
            throws IOException, FileNotFoundException, UnsupportedEncodingException {
        System.out.println("写入'" + extractedOutputFile + "'。。。");
        int counter = 0;
        // the inflated (decompressed) dictionary data
        RandomAccessFile file = new RandomAccessFile(inflatedFile, "r");
        // the StarDict index (.idx) file
        // final FileWriter indexWriter = new FileWriter(indexFile);
        // final Writer indexWriter = new BufferedWriter(new OutputStreamWriter(new FileOutputStream("indexFile"), "UTF-8"));
        final FileOutputStream indexWriter = new FileOutputStream(indexFile);

        // the StarDict definition (.dict) file
        final FileOutputStream outputWriter = new FileOutputStream(extractedOutputFile);

        // read the inflated file into memory
        final FileChannel fChannel = file.getChannel();
        final ByteBuffer dataRawBytes = ByteBuffer.allocate((int) fChannel.size());
        fChannel.read(dataRawBytes);
        fChannel.close();
        dataRawBytes.order(ByteOrder.LITTLE_ENDIAN);
        dataRawBytes.rewind();

        final int dataLen = 10;

        // total number of entries
        final int defTotal = (offsetDefs / dataLen) - 1;

        final int[] idxData = new int[6];
        final String[] defData = new String[2];

        final SensitiveStringDecoder[] encodings = LingoesLd2Reader.detectEncodings(dataRawBytes,
                offsetDefs, offsetXml, defTotal, dataLen, idxData, defData);

        dataRawBytes.position(8);
        int currDefPosition = 0;
        for (int i = 0; i < defTotal; i++) {
            LingoesLd2Reader.readDefinitionData(dataRawBytes, offsetDefs, offsetXml, dataLen,
                    encodings[0], encodings[1], idxData, defData, i);

            // write the headword to the .idx file
            indexWriter.write(defData[0].getBytes("UTF-8"));
            // followed by a '\0' separator
            indexWriter.write(0);
            // then the offset of the definition in the .dict file (32-bit big-endian)
            byte[] positionIntegerByte = ByteBuffer.allocate(4).putInt(currDefPosition).array();
            indexWriter.write(positionIntegerByte);

            // and the length of the definition in bytes (32-bit big-endian)
            int definitionNumOfBytes = defData[1].getBytes("UTF-8").length;
            byte[] definitionLengthIntegerByte = ByteBuffer.allocate(4).putInt(definitionNumOfBytes).array();
            indexWriter.write(definitionLengthIntegerByte);

            currDefPosition += definitionNumOfBytes;
            outputWriter.write(defData[1].getBytes("UTF-8"));

            // System.out.println(defData[0] + " = " + defData[1]);
            counter++;
        }

        // finally write the .ifo information file
        File idxFile = new File(indexFile);
        long idxFileSize = idxFile.length();
        FileWriter informationFileWriter = new FileWriter(informationFile);
        String fileName = idxFile.getName();
        String dictName = fileName.substring(0, fileName.lastIndexOf("."));
        // the first line must read exactly "StarDict's dict ifo file", otherwise StarDict rejects the dictionary
        informationFileWriter.write(
                "StarDict's dict ifo file\n" +
                "version=2.4.2\n" +
                "wordcount=" + counter + "\n" +
                "idxfilesize=" + idxFileSize + "\n" +
                "bookname=" + dictName.substring(0, dictName.lastIndexOf(".")) + "\n");
        informationFileWriter.flush();
        indexWriter.flush();
        outputWriter.flush();
        System.out.println("成功读出" + counter + "组数据。");
    }

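    /**
     * Reads one index record. As used by this reader, a record occupies 10 bytes: the offset of
     * the headword, the offset of the XML definition (two ints), a flags byte and a
     * reference-count byte. The last two ints read here are the first two fields of the following
     * record; they serve as the end offsets of the current entry, which is why {@code dataLen} is
     * 10 and the entry count is reduced by one.
     */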
    private static void getIdxData(final ByteBuffer dataRawBytes, final int position,
            final int[] wordIdxData) {
        dataRawBytes.position(position);
        wordIdxData[0] = dataRawBytes.getInt();
        wordIdxData[1] = dataRawBytes.getInt();
        wordIdxData[2] = dataRawBytes.get() & 0xff;
        wordIdxData[3] = dataRawBytes.get() & 0xff;
        wordIdxData[4] = dataRawBytes.getInt();
        wordIdxData[5] = dataRawBytes.getInt();
    }

    private static void inflate(final ByteBuffer dataRawBytes, final List<Integer> deflateStreams,
            final String inflatedFile) {
        System.out.println("解压缩'" + deflateStreams.size() + "'个数据流至'" + inflatedFile + "'。。。");
        final int startOffset = dataRawBytes.position();
        int offset = -1;
        int lastOffset = startOffset;
        boolean append = false;
        try {
            for (final Integer offsetRelative : deflateStreams) {
                offset = startOffset + offsetRelative.intValue();
                LingoesLd2Reader.decompress(inflatedFile, dataRawBytes, lastOffset,
                        offset - lastOffset, append);
                append = true;
                lastOffset = offset;
            }
        } catch (final Throwable e) {
            System.err.println("解压缩失败: 0x" + Integer.toHexString(offset) + ": " + e.toString());
        }
    }

    private static void readDefinitionData(final ByteBuffer inflatedBytes, final int offsetWords,
            final int offsetXml, final int dataLen, final SensitiveStringDecoder wordStringDecoder,
            final SensitiveStringDecoder xmlStringDecoder, final int[] idxData,
            final String[] defData, final int i) {
        LingoesLd2Reader.getIdxData(inflatedBytes, dataLen * i, idxData);
        int lastWordPos = idxData[0];
        int lastXmlPos = idxData[1];
        // final int flags = idxData[2];
        int refs = idxData[3];
        final int currentWordOffset = idxData[4];
        int currentXmlOffset = idxData[5];

        String xml = LingoesLd2Reader.strip(new String(xmlStringDecoder.decode(
                inflatedBytes.array(), offsetXml + lastXmlPos, currentXmlOffset - lastXmlPos)));
        // resolve references to other entries and prepend their definitions
        while (refs-- > 0) {
            final int ref = inflatedBytes.getInt(offsetWords + lastWordPos);
            LingoesLd2Reader.getIdxData(inflatedBytes, dataLen * ref, idxData);
            lastXmlPos = idxData[1];
            currentXmlOffset = idxData[5];
            if (xml.isEmpty()) {
                xml = LingoesLd2Reader.strip(new String(xmlStringDecoder.decode(
                        inflatedBytes.array(), offsetXml + lastXmlPos,
                        currentXmlOffset - lastXmlPos)));
            } else {
                xml = LingoesLd2Reader.strip(new String(xmlStringDecoder.decode(
                        inflatedBytes.array(), offsetXml + lastXmlPos,
                        currentXmlOffset - lastXmlPos)))
                        + ", " + xml;
            }
            lastWordPos += 4;
        }
        defData[1] = xml;

        final String word = new String(wordStringDecoder.decode(inflatedBytes.array(),
                offsetWords + lastWordPos, currentWordOffset - lastWordPos));
        defData[0] = word;
    }

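    /**
     * Parses the index group header at {@code offsetWithIndex} (the inflated sizes of the index,
     * headword and XML sections, plus the list of relative deflate-stream offsets), inflates the
     * streams into {@code ld2File + ".inflated"} and then extracts the StarDict .idx/.dict/.ifo
     * files from the inflated data.
     */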
    private static void readDictionary(final String ld2File, final ByteBuffer dataRawBytes,
            final int offsetWithIndex)
            throws IOException, FileNotFoundException, UnsupportedEncodingException {
        System.out.println("词典类型:0x" + Integer.toHexString(dataRawBytes.getInt(offsetWithIndex)));
        final int limit = dataRawBytes.getInt(offsetWithIndex + 4) + offsetWithIndex + 8;
        final int offsetIndex = offsetWithIndex + 0x1C;
        final int offsetCompressedDataHeader = dataRawBytes.getInt(offsetWithIndex + 8) + offsetIndex;
        final int inflatedWordsIndexLength = dataRawBytes.getInt(offsetWithIndex + 12);
        final int inflatedWordsLength = dataRawBytes.getInt(offsetWithIndex + 16);
        final int inflatedXmlLength = dataRawBytes.getInt(offsetWithIndex + 20);
        final int definitions = (offsetCompressedDataHeader - offsetIndex) / 4;
        final List<Integer> deflateStreams = new ArrayList<Integer>();
        dataRawBytes.position(offsetCompressedDataHeader + 8);
        int offset = dataRawBytes.getInt();
        while ((offset + dataRawBytes.position()) < limit) {
            offset = dataRawBytes.getInt();
            deflateStreams.add(Integer.valueOf(offset));
        }
        final int offsetCompressedData = dataRawBytes.position();
        System.out.println("索引词组数目:" + definitions);
        System.out.println("索引地址/大小:0x" + Integer.toHexString(offsetIndex) + " / "
                + (offsetCompressedDataHeader - offsetIndex) + " B");
        System.out.println("压缩数据地址/大小:0x" + Integer.toHexString(offsetCompressedData) + " / "
                + (limit - offsetCompressedData) + " B");
        System.out.println("词组索引地址/大小(解压缩后):0x0 / " + inflatedWordsIndexLength + " B");
        System.out.println("词组地址/大小(解压缩后):0x" + Integer.toHexString(inflatedWordsIndexLength)
                + " / " + inflatedWordsLength + " B");
        System.out.println("XML地址/大小(解压缩后):0x"
                + Integer.toHexString(inflatedWordsIndexLength + inflatedWordsLength) + " / "
                + inflatedXmlLength + " B");
        System.out.println("文件大小(解压缩后):"
                + ((inflatedWordsIndexLength + inflatedWordsLength + inflatedXmlLength) / 1024)
                + " KB");
        final String inflatedFile = ld2File + ".inflated";
        LingoesLd2Reader.inflate(dataRawBytes, deflateStreams, inflatedFile);

        if (new File(inflatedFile).isFile()) {
            final String indexFile = ld2File + ".idx";
            final String extractedOutputFile = ld2File + ".dict";
            final String informationFile = ld2File + ".ifo";
            dataRawBytes.position(offsetIndex);
            final int[] idxArray = new int[definitions];
            for (int i = 0; i < definitions; i++) {
                idxArray[i] = dataRawBytes.getInt();
            }
            LingoesLd2Reader.extract(inflatedFile, indexFile, extractedOutputFile, informationFile,
                    inflatedWordsIndexLength, inflatedWordsIndexLength + inflatedWordsLength);
        }
    }

    /**
     * Strips the surrounding XML from a definition: the content of a CDATA section, the content of
     * an {@code <Ô ...>} element (the wrapper used in Lingoes data, with the markers as in the
     * original reader this project adapts), or, failing both, everything outside of angle brackets.
     */
    private static String strip(final String xml) {
        int open = 0;
        int end = 0;
        if ((open = xml.indexOf("<![CDATA[")) != -1) {
            if ((end = xml.indexOf("]]>", open)) != -1) {
                return xml.substring(open + "<![CDATA[".length(), end).replace('\t', ' ')
                        .replace('\n', ' ').replace('\u001e', ' ').replace('\u001f', ' ');
            }
        } else if ((open = xml.indexOf("<Ô")) != -1) {
            if ((end = xml.indexOf("</Ô", open)) != -1) {
                open = xml.indexOf(">", open + 1);
                return xml.substring(open + 1, end).replace('\t', ' ')
                        .replace('\n', ' ').replace('\u001e', ' ')
                        .replace('\u001f', ' ');
            }
        } else {
            final StringBuilder sb = new StringBuilder();
            end = 0;
            open = xml.indexOf('<');
            do {
                if ((open - end) > 1) {
                    sb.append(xml.substring(end + 1, open));
                }
                open = xml.indexOf('<', open + 1);
                end = xml.indexOf('>', end + 1);
            } while ((open != -1) && (end != -1));
            return sb.toString().replace('\t', ' ').replace('\n', ' ')
                    .replace('\u001e', ' ').replace('\u001f', ' ');
        }
        return "";
    }

    private static void writeInputStream(final InputStream in, final OutputStream out)
            throws IOException {
        final byte[] buffer = new byte[1024 * 8];
        int len;
        while ((len = in.read(buffer)) > 0) {
            out.write(buffer, 0, len);
        }
    }

    private static class SensitiveStringDecoder {
        final String name;
        private final CharsetDecoder cd;

        SensitiveStringDecoder(final Charset cs) {
            this.cd = cs.newDecoder()
                    .onMalformedInput(CodingErrorAction.REPORT)
                    .onUnmappableCharacter(CodingErrorAction.REPORT);
            this.name = cs.name();
        }

        char[] decode(final byte[] ba, final int off, final int len) {
            final int en = (int) (len * (double) this.cd.maxCharsPerByte());
            final char[] ca = new char[en];
            if (len == 0) {
                return ca;
            }
            this.cd.reset();
            final ByteBuffer bb = ByteBuffer.wrap(ba, off, len);
            final CharBuffer cb = CharBuffer.wrap(ca);
            try {
                CoderResult cr = this.cd.decode(bb, cb, true);
                if (!cr.isUnderflow()) {
                    cr.throwException();
                }
                cr = this.cd.flush(cb);
                if (!cr.isUnderflow()) {
                    cr.throwException();
                }
            } catch (final CharacterCodingException x) {
                // REPORT is configured above, so malformed or unmappable input lands here;
                // rethrow so detectEncodings can reject this charset guess and try the next one
                throw new Error(x);
            }
            return SensitiveStringDecoder.safeTrim(ca, cb.position());
        }

        private static char[] safeTrim(final char[] ca, final int len) {
            if (len == ca.length) {
                return ca;
            } else {
                return Arrays.copyOf(ca, len);
            }
        }
    }
}
--------------------------------------------------------------------------------