├── DictParser ├── .classpath ├── .project ├── .settings │ └── org.eclipse.jdt.core.prefs ├── bin │ └── com │ │ └── ck │ │ └── dict │ │ ├── model │ │ ├── CompressedRecord.class │ │ └── Dictionary.class │ │ └── util │ │ ├── DictionaryQuerier.class │ │ ├── MdxFileParser.class │ │ └── Utils.class ├── lib │ ├── lzo-core-1.0.5-sources.jar │ └── lzo-core-1.0.5.jar └── src │ └── com │ └── ck │ └── dict │ ├── model │ ├── CompressedRecord.java │ └── Dictionary.java │ └── util │ ├── DictionaryQuerier.java │ ├── MdxFileParser.java │ └── Utils.java └── README.md /DictParser/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | -------------------------------------------------------------------------------- /DictParser/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | DictParser 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /DictParser/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.8 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.source=1.8 12 | -------------------------------------------------------------------------------- /DictParser/bin/com/ck/dict/model/CompressedRecord.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Grinner2436/mdict-java/99164039c20f6746520ad7a38a513cf588021867/DictParser/bin/com/ck/dict/model/CompressedRecord.class -------------------------------------------------------------------------------- /DictParser/bin/com/ck/dict/model/Dictionary.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Grinner2436/mdict-java/99164039c20f6746520ad7a38a513cf588021867/DictParser/bin/com/ck/dict/model/Dictionary.class -------------------------------------------------------------------------------- /DictParser/bin/com/ck/dict/util/DictionaryQuerier.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Grinner2436/mdict-java/99164039c20f6746520ad7a38a513cf588021867/DictParser/bin/com/ck/dict/util/DictionaryQuerier.class -------------------------------------------------------------------------------- /DictParser/bin/com/ck/dict/util/MdxFileParser.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Grinner2436/mdict-java/99164039c20f6746520ad7a38a513cf588021867/DictParser/bin/com/ck/dict/util/MdxFileParser.class -------------------------------------------------------------------------------- /DictParser/bin/com/ck/dict/util/Utils.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Grinner2436/mdict-java/99164039c20f6746520ad7a38a513cf588021867/DictParser/bin/com/ck/dict/util/Utils.class -------------------------------------------------------------------------------- /DictParser/lib/lzo-core-1.0.5-sources.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Grinner2436/mdict-java/99164039c20f6746520ad7a38a513cf588021867/DictParser/lib/lzo-core-1.0.5-sources.jar -------------------------------------------------------------------------------- /DictParser/lib/lzo-core-1.0.5.jar: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Grinner2436/mdict-java/99164039c20f6746520ad7a38a513cf588021867/DictParser/lib/lzo-core-1.0.5.jar -------------------------------------------------------------------------------- /DictParser/src/com/ck/dict/model/CompressedRecord.java: -------------------------------------------------------------------------------- 1 | package com.ck.dict.model; 2 | 3 | import java.io.ByteArrayInputStream; 4 | import java.io.ByteArrayOutputStream; 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import java.io.UnsupportedEncodingException; 8 | 9 | import com.ck.dict.util.Utils; 10 | 11 | public class CompressedRecord { 12 | private byte[] compressedData; 13 | private String encoding; 14 | public String getString(long position){ 15 | String result=null; 16 | byte[] bytes=getRecordData(position); 17 | try { 18 | result=new String(bytes, encoding); 19 | } catch (UnsupportedEncodingException e) { 20 | e.printStackTrace(); 21 | } 22 | return result; 23 | } 24 | public InputStream getFile(long position){ 25 | return new ByteArrayInputStream(getRecordData(position)); 26 | } 27 | private byte[] getRecordData(long position){ 28 | InputStream record_ds=Utils.decompress(compressedData, compressedData.length); 29 | byte[] result=null; 30 | try { 31 | record_ds.skip(position); 32 | ByteArrayOutputStream baos=new ByteArrayOutputStream(); 33 | int read=0; 34 | while ((read = record_ds.read()) > 0) { 35 | baos.write(read); 36 | } 37 | result=baos.toByteArray(); 38 | } catch (IOException e) { 39 | e.printStackTrace(); 40 | } 41 | return result; 42 | } 43 | public CompressedRecord(byte[] compressedData,String encoding) { 44 | super(); 45 | this.compressedData = compressedData; 46 | this.encoding = encoding; 47 | } 48 | public byte[] getCompressedData() { 49 | return compressedData; 50 | } 51 | public void setCompressedData(byte[] compressedData) { 52 | this.compressedData = compressedData; 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /DictParser/src/com/ck/dict/model/Dictionary.java: -------------------------------------------------------------------------------- 1 | package com.ck.dict.model; 2 | 3 | import java.util.ArrayList; 4 | import java.util.HashMap; 5 | 6 | import javax.xml.crypto.dsig.keyinfo.KeyInfo; 7 | 8 | public class Dictionary { 9 | private ArrayList oriKeys; 10 | private ArrayList first_last_keys; 11 | 12 | private HashMap offsets; 13 | private HashMap records; 14 | public Dictionary(ArrayList oriKeys, ArrayList first_last_keys, HashMap offsetMap, 15 | HashMap records) { 16 | super(); 17 | this.oriKeys = oriKeys; 18 | this.first_last_keys = first_last_keys; 19 | this.offsets = offsetMap; 20 | this.records = records; 21 | } 22 | public ArrayList getOriKeys() { 23 | return oriKeys; 24 | } 25 | public void setOriKeys(ArrayList oriKeys) { 26 | this.oriKeys = oriKeys; 27 | } 28 | public ArrayList getFirst_last_keys() { 29 | return first_last_keys; 30 | } 31 | public void setFirst_last_keys(ArrayList first_last_keys) { 32 | this.first_last_keys = first_last_keys; 33 | } 34 | public HashMap getOffsets() { 35 | return offsets; 36 | } 37 | public void setOffsets(HashMap offsets) { 38 | this.offsets = offsets; 39 | } 40 | public HashMap getRecords() { 41 | return records; 42 | } 43 | public void setRecords(HashMap records) { 44 | this.records = records; 45 | } 46 | 47 | 48 | } 49 | 50 | -------------------------------------------------------------------------------- /DictParser/src/com/ck/dict/util/DictionaryQuerier.java: -------------------------------------------------------------------------------- 1 | package com.ck.dict.util; 2 | 3 | import java.io.FileInputStream; 4 | import java.io.FileNotFoundException; 5 | import java.util.HashMap; 6 | import java.util.List; 7 | import java.util.Set; 8 | 9 | import com.ck.dict.model.CompressedRecord; 10 | import com.ck.dict.model.Dictionary; 11 | 12 | 13 | public class DictionaryQuerier { 14 | private static HashMap dicts=new HashMap(); 15 | static{ 16 | String filePath="D:/MDictPC/doc/牛津高阶8简体.mdx"; 17 | try { 18 | FileInputStream fins=new FileInputStream(filePath); 19 | MdxFileParser parser=new MdxFileParser(); 20 | Dictionary dict=parser.parse(fins); 21 | dicts.put("牛津高阶8简体", dict); 22 | } catch (FileNotFoundException e) { 23 | e.printStackTrace(); 24 | } 25 | } 26 | public static String query(String query) { 27 | return query("牛津高阶8简体",query); 28 | } 29 | public static String query(String dictName,String query) { 30 | String result; 31 | Dictionary dict=dicts.get(dictName); 32 | if(dict==null) { 33 | result="词典不存在!"; 34 | }else { 35 | List keys=dict.getOriKeys(); 36 | HashMap recordsMap=dict.getRecords(); 37 | 38 | //定位到词条或者最相近的词条 39 | int start=0,end=keys.size(),mid; 40 | while(end-start>1) { 41 | mid=(start+end)/2; 42 | if(end>start) { 43 | String midWord=keys.get(mid); 44 | int flag=query.compareTo(midWord); 45 | if(flag>0) { 46 | start=mid; 47 | }else if(flag<0) { 48 | end=mid; 49 | }else { 50 | start=mid; 51 | break; 52 | } 53 | }else{ 54 | break; 55 | } 56 | } 57 | 58 | //确定要显示的词,拿到偏移量 59 | String item=keys.get(start); 60 | Long wordOffset=dict.getOffsets().get(item); 61 | 62 | //根据偏移量定位到块 63 | long pre=0; 64 | Set offSets=dict.getRecords().keySet(); 65 | for(Long offSet:offSets) { 66 | if(wordOffset=offSet) { 69 | pre=offSet; 70 | continue; 71 | } 72 | } 73 | 74 | //拿出记录块,从里面解压出对应的词条 75 | long position=wordOffset-pre; 76 | CompressedRecord record=recordsMap.get(pre); 77 | result=record.getString(position); 78 | } 79 | return result; 80 | } 81 | public static void main(String[] args) { 82 | String record=DictionaryQuerier.query("apple"); 83 | System.out.println(record); 84 | } 85 | } 86 | -------------------------------------------------------------------------------- /DictParser/src/com/ck/dict/util/MdxFileParser.java: -------------------------------------------------------------------------------- 1 | package com.ck.dict.util; 2 | 3 | import java.io.ByteArrayOutputStream; 4 | import java.io.FileNotFoundException; 5 | import java.io.IOException; 6 | import java.io.InputStream; 7 | import java.io.StringReader; 8 | import java.util.ArrayList; 9 | import java.util.HashMap; 10 | import java.util.LinkedHashMap; 11 | 12 | import javax.xml.parsers.DocumentBuilder; 13 | import javax.xml.parsers.DocumentBuilderFactory; 14 | import javax.xml.parsers.ParserConfigurationException; 15 | 16 | import org.w3c.dom.Document; 17 | import org.w3c.dom.Element; 18 | import org.xml.sax.InputSource; 19 | import org.xml.sax.SAXException; 20 | 21 | import com.ck.dict.model.CompressedRecord; 22 | import com.ck.dict.model.Dictionary; 23 | 24 | 25 | public class MdxFileParser { 26 | private ArrayList keyNameList=new ArrayList(); 27 | private ArrayList first_last_keys=new ArrayList(); 28 | 29 | private HashMap offsetMap=new HashMap(); 30 | private LinkedHashMap records=new LinkedHashMap(); 31 | 32 | private ArrayList compressedRecords=new ArrayList(); 33 | public Dictionary parse(InputStream fins) { 34 | byte[]twoBytes=new byte[2]; 35 | byte[]fourBytes=new byte[4]; 36 | byte[]eightBytes=new byte[8]; 37 | Dictionary dict=null; 38 | try { 39 | /*----------------------------------标头部分开始-------------------------------------------------------- 40 | https://github.com/zhansliu/writemdict/blob/master/fileformat.md#header-section 41 | ---------------------------------------------------------------------------------------------------*/ 42 | 43 | //获取Header Section的header_str长度 44 | fins.read(fourBytes); 45 | int headerStrLength=Utils.byteArrayToInt(fourBytes); 46 | 47 | //获取Header Section的header_str。 48 | byte[] header_str_bytes=new byte[headerStrLength]; 49 | fins.read(header_str_bytes); 50 | String headerStr=new String(header_str_bytes, "UTF-16LE").trim(); 51 | 52 | //解析headerStr,获取字典信息。 53 | DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance(); 54 | DocumentBuilder builder = builderFactory.newDocumentBuilder(); 55 | StringReader sr = new StringReader(headerStr.trim()); 56 | InputSource is = new InputSource(sr); 57 | Document document = builder.parse(is); 58 | Element root=document.getDocumentElement(); 59 | String encoding = root.getAttribute("Encoding"); 60 | 61 | //跳过checksum校验信息 62 | fins.skip(4); 63 | /*----------------------------------关键字部分开始-------------------------------------------------------- 64 | https://github.com/zhansliu/writemdict/blob/master/fileformat.md#keyword-section 65 | ---------------------------------------------------------------------------------------------------*/ 66 | fins.read(eightBytes); 67 | long key_num_blocks=Utils.byteArrayToLong(eightBytes);//共有多少个key的压缩块 68 | 69 | fins.read(eightBytes); 70 | long key_sum=Utils.byteArrayToLong(eightBytes);//共有多少个key(单词数目) 71 | 72 | fins.read(eightBytes); 73 | long key_index_decomp_len=Utils.byteArrayToLong(eightBytes);//keyindex信息解压后的大小 74 | 75 | fins.read(eightBytes); 76 | long key_index_comp_len=Utils.byteArrayToLong(eightBytes);//keyindex信息压缩时的大小 77 | 78 | fins.read(eightBytes); 79 | long key_blocks_len=Utils.byteArrayToLong(eightBytes);//key的压缩块总共大小 80 | 81 | fins.skip(4);//跳过checksum校验信息 82 | 83 | /*----------------------------------关键字--索引信息开始-------------------------------------------------------- 84 | https://github.com/zhansliu/writemdict/blob/master/fileformat.md#keyword-index 85 | -------------------------------------------------------------------------------------------*/ 86 | //将压缩的key_index_comp_len长度的信息解压出来。 87 | InputStream index_ds=Utils.decompress(fins,key_index_comp_len); 88 | long[]comp_size=new long[(int) key_num_blocks];//各个压缩块的压缩长度 89 | long[]decomp_size=new long[(int) key_num_blocks];//各个压缩块的解压后长度 90 | long[]num_entries=new long[(int) key_num_blocks];//每个块上存储的key数量 91 | for(int i=0;i 0) { 142 | baos.write(read); 143 | } 144 | String keyword=new String(baos.toByteArray(), encoding); 145 | String keyName=getKeyName(keyword); 146 | offsetMap.put(keyName, off); 147 | keyNameList.add(keyName); 148 | } 149 | } 150 | 151 | /*----------------------------------关键字部分结束-------------------------------------------------------- 152 | ---------------------------------------------------------------------------------------------------*/ 153 | /*----------------------------------记录部分开始-------------------------------------------------------- 154 | https://github.com/zhansliu/writemdict/blob/master/fileformat.md#record-section 155 | ---------------------------------------------------------------------------------------------------*/ 156 | fins.read(eightBytes); 157 | long record_num_blocks=Utils.byteArrayToLong(eightBytes);//记录压缩块总数 158 | 159 | fins.read(eightBytes); 160 | long record_num_entries=Utils.byteArrayToLong(eightBytes);//字典中的记录总数。 161 | 162 | fins.read(eightBytes); 163 | long record_index_len=Utils.byteArrayToLong(eightBytes); 164 | 165 | fins.read(eightBytes); 166 | long record_blocks_len=Utils.byteArrayToLong(eightBytes);//所有记录块的总大小 167 | 168 | //获取每个记录块的压缩后和解压时大小 169 | long[]record_comp_size=new long[(int) record_num_blocks]; 170 | long[]record_decomp_size=new long[(int) record_num_blocks]; 171 | long preBytes=0; 172 | for(int i=0;i1024) { 62 | i=fins.read(temp); 63 | }else { 64 | i=fins.read(temp,0,(int)total); 65 | } 66 | } catch (IOException e) { 67 | e.printStackTrace(); 68 | } 69 | if(i>0) { 70 | baos.write(temp, 0, i); 71 | total-=i; 72 | } 73 | } 74 | byte[] compressedData=baos.toByteArray(); 75 | return compressedData; 76 | } 77 | 78 | //用于估计基于当前词典的编码类型的每个关键字的长度,对于UTF8,标记长度就是存储长度,对于UTF-16标记长度是编码单元长度,存储长度还要乘以2.其它以此类推,但是没有加以实现 79 | private static int lengthOfEncoding(int length,String encoding){ 80 | int result = 0; 81 | switch (encoding) { 82 | case "UTF-8": 83 | result = length; 84 | break; 85 | case "UTF-16": 86 | result = length*2; 87 | break; 88 | } 89 | return result; 90 | } 91 | 92 | public static long byteArrayToLong(byte[] eightBytes) { 93 | ByteBuffer buffer = ByteBuffer.wrap(eightBytes); 94 | long result=buffer.getLong(); 95 | return result; 96 | } 97 | public static int byteArrayToInt(byte[] fourBytes) { 98 | ByteBuffer buffer = ByteBuffer.wrap(fourBytes); 99 | int result=buffer.getInt(); 100 | return result; 101 | } 102 | public static short byteArrayToShort(byte[] twoBytes) { 103 | ByteBuffer buffer = ByteBuffer.wrap(twoBytes); 104 | short result=buffer.getShort(); 105 | return result; 106 | } 107 | } 108 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # mdict-java 2 | 将mdx/mdd词典文件解析为文本内容的工具。java实现。 3 | --------------------------------------------------------------------------------