├── DictParser
├── .classpath
├── .project
├── .settings
│ └── org.eclipse.jdt.core.prefs
├── bin
│ └── com
│ │ └── ck
│ │ └── dict
│ │ ├── model
│ │ ├── CompressedRecord.class
│ │ └── Dictionary.class
│ │ └── util
│ │ ├── DictionaryQuerier.class
│ │ ├── MdxFileParser.class
│ │ └── Utils.class
├── lib
│ ├── lzo-core-1.0.5-sources.jar
│ └── lzo-core-1.0.5.jar
└── src
│ └── com
│ └── ck
│ └── dict
│ ├── model
│ ├── CompressedRecord.java
│ └── Dictionary.java
│ └── util
│ ├── DictionaryQuerier.java
│ ├── MdxFileParser.java
│ └── Utils.java
└── README.md
/DictParser/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
--------------------------------------------------------------------------------
/DictParser/.project:
--------------------------------------------------------------------------------
1 |
2 |
3 | DictParser
4 |
5 |
6 |
7 |
8 |
9 | org.eclipse.jdt.core.javabuilder
10 |
11 |
12 |
13 |
14 |
15 | org.eclipse.jdt.core.javanature
16 |
17 |
18 |
--------------------------------------------------------------------------------
/DictParser/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
5 | org.eclipse.jdt.core.compiler.compliance=1.8
6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.source=1.8
12 |
--------------------------------------------------------------------------------
/DictParser/bin/com/ck/dict/model/CompressedRecord.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Grinner2436/mdict-java/99164039c20f6746520ad7a38a513cf588021867/DictParser/bin/com/ck/dict/model/CompressedRecord.class
--------------------------------------------------------------------------------
/DictParser/bin/com/ck/dict/model/Dictionary.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Grinner2436/mdict-java/99164039c20f6746520ad7a38a513cf588021867/DictParser/bin/com/ck/dict/model/Dictionary.class
--------------------------------------------------------------------------------
/DictParser/bin/com/ck/dict/util/DictionaryQuerier.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Grinner2436/mdict-java/99164039c20f6746520ad7a38a513cf588021867/DictParser/bin/com/ck/dict/util/DictionaryQuerier.class
--------------------------------------------------------------------------------
/DictParser/bin/com/ck/dict/util/MdxFileParser.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Grinner2436/mdict-java/99164039c20f6746520ad7a38a513cf588021867/DictParser/bin/com/ck/dict/util/MdxFileParser.class
--------------------------------------------------------------------------------
/DictParser/bin/com/ck/dict/util/Utils.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Grinner2436/mdict-java/99164039c20f6746520ad7a38a513cf588021867/DictParser/bin/com/ck/dict/util/Utils.class
--------------------------------------------------------------------------------
/DictParser/lib/lzo-core-1.0.5-sources.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Grinner2436/mdict-java/99164039c20f6746520ad7a38a513cf588021867/DictParser/lib/lzo-core-1.0.5-sources.jar
--------------------------------------------------------------------------------
/DictParser/lib/lzo-core-1.0.5.jar:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Grinner2436/mdict-java/99164039c20f6746520ad7a38a513cf588021867/DictParser/lib/lzo-core-1.0.5.jar
--------------------------------------------------------------------------------
/DictParser/src/com/ck/dict/model/CompressedRecord.java:
--------------------------------------------------------------------------------
1 | package com.ck.dict.model;
2 |
3 | import java.io.ByteArrayInputStream;
4 | import java.io.ByteArrayOutputStream;
5 | import java.io.IOException;
6 | import java.io.InputStream;
7 | import java.io.UnsupportedEncodingException;
8 |
9 | import com.ck.dict.util.Utils;
10 |
11 | public class CompressedRecord {
12 | private byte[] compressedData;
13 | private String encoding;
14 | public String getString(long position){
15 | String result=null;
16 | byte[] bytes=getRecordData(position);
17 | try {
18 | result=new String(bytes, encoding);
19 | } catch (UnsupportedEncodingException e) {
20 | e.printStackTrace();
21 | }
22 | return result;
23 | }
24 | public InputStream getFile(long position){
25 | return new ByteArrayInputStream(getRecordData(position));
26 | }
27 | private byte[] getRecordData(long position){
28 | InputStream record_ds=Utils.decompress(compressedData, compressedData.length);
29 | byte[] result=null;
30 | try {
31 | record_ds.skip(position);
32 | ByteArrayOutputStream baos=new ByteArrayOutputStream();
33 | int read=0;
34 | while ((read = record_ds.read()) > 0) {
35 | baos.write(read);
36 | }
37 | result=baos.toByteArray();
38 | } catch (IOException e) {
39 | e.printStackTrace();
40 | }
41 | return result;
42 | }
43 | public CompressedRecord(byte[] compressedData,String encoding) {
44 | super();
45 | this.compressedData = compressedData;
46 | this.encoding = encoding;
47 | }
48 | public byte[] getCompressedData() {
49 | return compressedData;
50 | }
51 | public void setCompressedData(byte[] compressedData) {
52 | this.compressedData = compressedData;
53 | }
54 |
55 | }
56 |
--------------------------------------------------------------------------------
/DictParser/src/com/ck/dict/model/Dictionary.java:
--------------------------------------------------------------------------------
1 | package com.ck.dict.model;
2 |
3 | import java.util.ArrayList;
4 | import java.util.HashMap;
5 |
6 | import javax.xml.crypto.dsig.keyinfo.KeyInfo;
7 |
8 | public class Dictionary {
9 | private ArrayList oriKeys;
10 | private ArrayList first_last_keys;
11 |
12 | private HashMap offsets;
13 | private HashMap records;
14 | public Dictionary(ArrayList oriKeys, ArrayList first_last_keys, HashMap offsetMap,
15 | HashMap records) {
16 | super();
17 | this.oriKeys = oriKeys;
18 | this.first_last_keys = first_last_keys;
19 | this.offsets = offsetMap;
20 | this.records = records;
21 | }
22 | public ArrayList getOriKeys() {
23 | return oriKeys;
24 | }
25 | public void setOriKeys(ArrayList oriKeys) {
26 | this.oriKeys = oriKeys;
27 | }
28 | public ArrayList getFirst_last_keys() {
29 | return first_last_keys;
30 | }
31 | public void setFirst_last_keys(ArrayList first_last_keys) {
32 | this.first_last_keys = first_last_keys;
33 | }
34 | public HashMap getOffsets() {
35 | return offsets;
36 | }
37 | public void setOffsets(HashMap offsets) {
38 | this.offsets = offsets;
39 | }
40 | public HashMap getRecords() {
41 | return records;
42 | }
43 | public void setRecords(HashMap records) {
44 | this.records = records;
45 | }
46 |
47 |
48 | }
49 |
50 |
--------------------------------------------------------------------------------
/DictParser/src/com/ck/dict/util/DictionaryQuerier.java:
--------------------------------------------------------------------------------
1 | package com.ck.dict.util;
2 |
3 | import java.io.FileInputStream;
4 | import java.io.FileNotFoundException;
5 | import java.util.HashMap;
6 | import java.util.List;
7 | import java.util.Set;
8 |
9 | import com.ck.dict.model.CompressedRecord;
10 | import com.ck.dict.model.Dictionary;
11 |
12 |
13 | public class DictionaryQuerier {
14 | private static HashMap dicts=new HashMap();
15 | static{
16 | String filePath="D:/MDictPC/doc/牛津高阶8简体.mdx";
17 | try {
18 | FileInputStream fins=new FileInputStream(filePath);
19 | MdxFileParser parser=new MdxFileParser();
20 | Dictionary dict=parser.parse(fins);
21 | dicts.put("牛津高阶8简体", dict);
22 | } catch (FileNotFoundException e) {
23 | e.printStackTrace();
24 | }
25 | }
26 | public static String query(String query) {
27 | return query("牛津高阶8简体",query);
28 | }
29 | public static String query(String dictName,String query) {
30 | String result;
31 | Dictionary dict=dicts.get(dictName);
32 | if(dict==null) {
33 | result="词典不存在!";
34 | }else {
35 | List keys=dict.getOriKeys();
36 | HashMap recordsMap=dict.getRecords();
37 |
38 | //定位到词条或者最相近的词条
39 | int start=0,end=keys.size(),mid;
40 | while(end-start>1) {
41 | mid=(start+end)/2;
42 | if(end>start) {
43 | String midWord=keys.get(mid);
44 | int flag=query.compareTo(midWord);
45 | if(flag>0) {
46 | start=mid;
47 | }else if(flag<0) {
48 | end=mid;
49 | }else {
50 | start=mid;
51 | break;
52 | }
53 | }else{
54 | break;
55 | }
56 | }
57 |
58 | //确定要显示的词,拿到偏移量
59 | String item=keys.get(start);
60 | Long wordOffset=dict.getOffsets().get(item);
61 |
62 | //根据偏移量定位到块
63 | long pre=0;
64 | Set offSets=dict.getRecords().keySet();
65 | for(Long offSet:offSets) {
66 | if(wordOffset=offSet) {
69 | pre=offSet;
70 | continue;
71 | }
72 | }
73 |
74 | //拿出记录块,从里面解压出对应的词条
75 | long position=wordOffset-pre;
76 | CompressedRecord record=recordsMap.get(pre);
77 | result=record.getString(position);
78 | }
79 | return result;
80 | }
81 | public static void main(String[] args) {
82 | String record=DictionaryQuerier.query("apple");
83 | System.out.println(record);
84 | }
85 | }
86 |
--------------------------------------------------------------------------------
/DictParser/src/com/ck/dict/util/MdxFileParser.java:
--------------------------------------------------------------------------------
1 | package com.ck.dict.util;
2 |
3 | import java.io.ByteArrayOutputStream;
4 | import java.io.FileNotFoundException;
5 | import java.io.IOException;
6 | import java.io.InputStream;
7 | import java.io.StringReader;
8 | import java.util.ArrayList;
9 | import java.util.HashMap;
10 | import java.util.LinkedHashMap;
11 |
12 | import javax.xml.parsers.DocumentBuilder;
13 | import javax.xml.parsers.DocumentBuilderFactory;
14 | import javax.xml.parsers.ParserConfigurationException;
15 |
16 | import org.w3c.dom.Document;
17 | import org.w3c.dom.Element;
18 | import org.xml.sax.InputSource;
19 | import org.xml.sax.SAXException;
20 |
21 | import com.ck.dict.model.CompressedRecord;
22 | import com.ck.dict.model.Dictionary;
23 |
24 |
25 | public class MdxFileParser {
26 | private ArrayList keyNameList=new ArrayList();
27 | private ArrayList first_last_keys=new ArrayList();
28 |
29 | private HashMap offsetMap=new HashMap();
30 | private LinkedHashMap records=new LinkedHashMap();
31 |
32 | private ArrayList compressedRecords=new ArrayList();
33 | public Dictionary parse(InputStream fins) {
34 | byte[]twoBytes=new byte[2];
35 | byte[]fourBytes=new byte[4];
36 | byte[]eightBytes=new byte[8];
37 | Dictionary dict=null;
38 | try {
39 | /*----------------------------------标头部分开始--------------------------------------------------------
40 | https://github.com/zhansliu/writemdict/blob/master/fileformat.md#header-section
41 | ---------------------------------------------------------------------------------------------------*/
42 |
43 | //获取Header Section的header_str长度
44 | fins.read(fourBytes);
45 | int headerStrLength=Utils.byteArrayToInt(fourBytes);
46 |
47 | //获取Header Section的header_str。
48 | byte[] header_str_bytes=new byte[headerStrLength];
49 | fins.read(header_str_bytes);
50 | String headerStr=new String(header_str_bytes, "UTF-16LE").trim();
51 |
52 | //解析headerStr,获取字典信息。
53 | DocumentBuilderFactory builderFactory = DocumentBuilderFactory.newInstance();
54 | DocumentBuilder builder = builderFactory.newDocumentBuilder();
55 | StringReader sr = new StringReader(headerStr.trim());
56 | InputSource is = new InputSource(sr);
57 | Document document = builder.parse(is);
58 | Element root=document.getDocumentElement();
59 | String encoding = root.getAttribute("Encoding");
60 |
61 | //跳过checksum校验信息
62 | fins.skip(4);
63 | /*----------------------------------关键字部分开始--------------------------------------------------------
64 | https://github.com/zhansliu/writemdict/blob/master/fileformat.md#keyword-section
65 | ---------------------------------------------------------------------------------------------------*/
66 | fins.read(eightBytes);
67 | long key_num_blocks=Utils.byteArrayToLong(eightBytes);//共有多少个key的压缩块
68 |
69 | fins.read(eightBytes);
70 | long key_sum=Utils.byteArrayToLong(eightBytes);//共有多少个key(单词数目)
71 |
72 | fins.read(eightBytes);
73 | long key_index_decomp_len=Utils.byteArrayToLong(eightBytes);//keyindex信息解压后的大小
74 |
75 | fins.read(eightBytes);
76 | long key_index_comp_len=Utils.byteArrayToLong(eightBytes);//keyindex信息压缩时的大小
77 |
78 | fins.read(eightBytes);
79 | long key_blocks_len=Utils.byteArrayToLong(eightBytes);//key的压缩块总共大小
80 |
81 | fins.skip(4);//跳过checksum校验信息
82 |
83 | /*----------------------------------关键字--索引信息开始--------------------------------------------------------
84 | https://github.com/zhansliu/writemdict/blob/master/fileformat.md#keyword-index
85 | -------------------------------------------------------------------------------------------*/
86 | //将压缩的key_index_comp_len长度的信息解压出来。
87 | InputStream index_ds=Utils.decompress(fins,key_index_comp_len);
88 | long[]comp_size=new long[(int) key_num_blocks];//各个压缩块的压缩长度
89 | long[]decomp_size=new long[(int) key_num_blocks];//各个压缩块的解压后长度
90 | long[]num_entries=new long[(int) key_num_blocks];//每个块上存储的key数量
91 | for(int i=0;i 0) {
142 | baos.write(read);
143 | }
144 | String keyword=new String(baos.toByteArray(), encoding);
145 | String keyName=getKeyName(keyword);
146 | offsetMap.put(keyName, off);
147 | keyNameList.add(keyName);
148 | }
149 | }
150 |
151 | /*----------------------------------关键字部分结束--------------------------------------------------------
152 | ---------------------------------------------------------------------------------------------------*/
153 | /*----------------------------------记录部分开始--------------------------------------------------------
154 | https://github.com/zhansliu/writemdict/blob/master/fileformat.md#record-section
155 | ---------------------------------------------------------------------------------------------------*/
156 | fins.read(eightBytes);
157 | long record_num_blocks=Utils.byteArrayToLong(eightBytes);//记录压缩块总数
158 |
159 | fins.read(eightBytes);
160 | long record_num_entries=Utils.byteArrayToLong(eightBytes);//字典中的记录总数。
161 |
162 | fins.read(eightBytes);
163 | long record_index_len=Utils.byteArrayToLong(eightBytes);
164 |
165 | fins.read(eightBytes);
166 | long record_blocks_len=Utils.byteArrayToLong(eightBytes);//所有记录块的总大小
167 |
168 | //获取每个记录块的压缩后和解压时大小
169 | long[]record_comp_size=new long[(int) record_num_blocks];
170 | long[]record_decomp_size=new long[(int) record_num_blocks];
171 | long preBytes=0;
172 | for(int i=0;i1024) {
62 | i=fins.read(temp);
63 | }else {
64 | i=fins.read(temp,0,(int)total);
65 | }
66 | } catch (IOException e) {
67 | e.printStackTrace();
68 | }
69 | if(i>0) {
70 | baos.write(temp, 0, i);
71 | total-=i;
72 | }
73 | }
74 | byte[] compressedData=baos.toByteArray();
75 | return compressedData;
76 | }
77 |
78 | //用于估计基于当前词典的编码类型的每个关键字的长度,对于UTF8,标记长度就是存储长度,对于UTF-16标记长度是编码单元长度,存储长度还要乘以2.其它以此类推,但是没有加以实现
79 | private static int lengthOfEncoding(int length,String encoding){
80 | int result = 0;
81 | switch (encoding) {
82 | case "UTF-8":
83 | result = length;
84 | break;
85 | case "UTF-16":
86 | result = length*2;
87 | break;
88 | }
89 | return result;
90 | }
91 |
92 | public static long byteArrayToLong(byte[] eightBytes) {
93 | ByteBuffer buffer = ByteBuffer.wrap(eightBytes);
94 | long result=buffer.getLong();
95 | return result;
96 | }
97 | public static int byteArrayToInt(byte[] fourBytes) {
98 | ByteBuffer buffer = ByteBuffer.wrap(fourBytes);
99 | int result=buffer.getInt();
100 | return result;
101 | }
102 | public static short byteArrayToShort(byte[] twoBytes) {
103 | ByteBuffer buffer = ByteBuffer.wrap(twoBytes);
104 | short result=buffer.getShort();
105 | return result;
106 | }
107 | }
108 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # mdict-java
2 | 将mdx/mdd词典文件解析为文本内容的工具。java实现。
3 |
--------------------------------------------------------------------------------