/**
 * Parses a Baidu IME dictionary file (.bdict) and returns the words it stores.
 */
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.util.*;

public class BaiduBdcitReader
{
    /** File offset at which the word entries start. */
    private static final int DICT_OFFSET = 0x350;

    /** Size of an entry header: two little-endian 16-bit fields. */
    private static final int ENTRY_HEADER_SIZE = 4;

    /**
     * Reads a Baidu dictionary file (bdict) and returns every word found in it.
     *
     * <p>Each entry is read as: a 16-bit little-endian character count {@code n}, a second
     * 16-bit field that is ignored (its meaning is not evident from this code), {@code 2 * n}
     * bytes of pinyin data that are skipped, then {@code 2 * n} bytes of UTF-16LE text.
     *
     * <p>Parsing stops quietly at the first entry that is truncated or has a negative
     * count; the words decoded up to that point are returned.
     *
     * @param bdictFilePath path of the bdict file
     * @return the words of the dictionary in file order, possibly empty
     * @throws IOException if the file cannot be read or is shorter than the 0x350-byte header
     * @throws Exception declared for compatibility with existing callers
     */
    public static List<String> readBdictFile(String bdictFilePath) throws Exception
    {
        byte[] raw;
        // try-with-resources closes the file even when reading fails
        try (RandomAccessFile file = new RandomAccessFile(bdictFilePath, "r"))
        {
            long size = file.length();
            if (size > Integer.MAX_VALUE)
            {
                throw new IOException("bdict file too large: " + bdictFilePath);
            }
            raw = new byte[(int) size];
            file.readFully(raw);
        }
        if (raw.length < DICT_OFFSET)
        {
            throw new IOException("bdict file too short (" + raw.length + " bytes): " + bdictFilePath);
        }

        ByteBuffer data = ByteBuffer.wrap(raw);
        data.order(ByteOrder.LITTLE_ENDIAN);
        data.position(DICT_OFFSET);

        List<String> wordList = new ArrayList<String>();
        // Require a whole header so a dangling 1-3 byte tail ends the loop instead of throwing.
        while (data.remaining() >= ENTRY_HEADER_SIZE)
        {
            int length = data.getShort(); // number of UTF-16 code units in the word
            data.getShort();              // second header field, unused here
            int byteLength = 2 * length;
            if (length < 0 || data.remaining() < 2 * byteLength)
            {
                break; // corrupt or truncated entry: keep what was parsed so far
            }
            int wordStart = data.position() + byteLength; // pinyin block comes first; skip it
            // Decode straight from the backing array so the word length is not capped
            // by a fixed-size scratch buffer.
            wordList.add(new String(raw, wordStart, byteLength, java.nio.charset.StandardCharsets.UTF_16LE));
            data.position(wordStart + byteLength);
        }
        return wordList;
    }
}
import java.io.ByteArrayOutputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.RandomAccessFile;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.channels.Channels;
import java.nio.channels.FileChannel;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.zip.InflaterOutputStream;

/**
 * Parses a QQ IME dictionary file (.qpyd) and returns the words it stores.
 */
public class QQqpydReader
{
    /** Header offset of the 32-bit little-endian start address of the compressed dictionary. */
    private static final int ZIPPED_START_FIELD = 0x38;

    /** Size in bytes of one index entry in the inflated dictionary. */
    private static final int ENTRY_SIZE = 0xa;

    /**
     * Reads a qpyd file and returns every word of its dictionary.
     *
     * <p>The int at header offset 0x38 gives the start of a zlib stream that runs to the
     * end of the file. The inflated data begins with an index of 10-byte entries: byte 0 is
     * the pinyin length, byte 1 the word length in bytes, and bytes 6-9 the little-endian
     * address of the pinyin, which is immediately followed by the UTF-16LE word. The pinyin
     * address of the first entry marks the end of the index.
     *
     * <p>This method no longer writes an {@code <inputPath>.unzipped} debug copy next to
     * the input; that dump leaked a file handle and made parsing fail for inputs located
     * in read-only directories.
     *
     * @param inputPath path of the qpyd file
     * @return the words of the dictionary in index order; empty if the payload is empty
     * @throws IOException if the file cannot be read, cannot be inflated, or is malformed
     * @throws Exception declared for compatibility with existing callers
     */
    public static List<String> readQpydFile(String inputPath) throws Exception
    {
        byte[] dict = inflate(readAll(inputPath), inputPath);

        List<String> wordList = new ArrayList<String>();
        if (dict.length == 0)
        {
            return wordList;
        }

        ByteBuffer index = ByteBuffer.wrap(dict);
        index.order(ByteOrder.LITTLE_ENDIAN);

        int indexEnd = -1; // unknown until the first entry has been read
        for (int idx = 0; indexEnd == -1 || idx < indexEnd; idx += ENTRY_SIZE)
        {
            if (idx + ENTRY_SIZE > dict.length)
            {
                throw new IOException("truncated qpyd index at offset " + idx + ": " + inputPath);
            }
            int pinyinLength = index.get(idx) & 0xff;
            int wordLength = index.get(idx + 0x1) & 0xff;
            int pinyinStartAddr = index.getInt(idx + 0x6);
            if (indexEnd == -1)
            {
                indexEnd = pinyinStartAddr;
            }

            long wordStartAddr = (long) pinyinStartAddr + pinyinLength;
            if (pinyinStartAddr < 0 || wordStartAddr + wordLength > dict.length)
            {
                throw new IOException("qpyd entry at offset " + idx + " points outside the dictionary: "
                        + inputPath);
            }
            wordList.add(new String(dict, (int) wordStartAddr, wordLength, StandardCharsets.UTF_16LE));
        }
        return wordList;
    }

    /** Reads the whole file into memory, closing it even when reading fails. */
    private static byte[] readAll(String path) throws IOException
    {
        try (RandomAccessFile file = new RandomAccessFile(path, "r"))
        {
            long size = file.length();
            if (size > Integer.MAX_VALUE)
            {
                throw new IOException("qpyd file too large: " + path);
            }
            byte[] raw = new byte[(int) size];
            file.readFully(raw);
            return raw;
        }
    }

    /** Locates the compressed dictionary through the header field and inflates it. */
    private static byte[] inflate(byte[] raw, String path) throws IOException
    {
        if (raw.length < ZIPPED_START_FIELD + 4)
        {
            throw new IOException("qpyd file too short (" + raw.length + " bytes): " + path);
        }
        ByteBuffer header = ByteBuffer.wrap(raw);
        header.order(ByteOrder.LITTLE_ENDIAN);
        int start = header.getInt(ZIPPED_START_FIELD);
        if (start < 0 || start > raw.length)
        {
            throw new IOException("invalid compressed-data offset " + start + ": " + path);
        }

        ByteArrayOutputStream inflated = new ByteArrayOutputStream();
        // Closing the stream finishes inflation and releases the Inflater's native memory.
        try (InflaterOutputStream unzip = new InflaterOutputStream(inflated))
        {
            unzip.write(raw, start, raw.length - start);
        }
        return inflated.toByteArray();
    }
}
| */ 4 | 5 | import java.io.File; 6 | import java.io.IOException; 7 | import java.io.RandomAccessFile; 8 | import java.nio.file.Files; 9 | import java.nio.file.LinkOption; 10 | import java.nio.file.Paths; 11 | import java.util.Iterator; 12 | import java.util.List; 13 | import java.util.Map; 14 | import java.util.Set; 15 | import java.util.Map.Entry; 16 | 17 | /** 18 | * 解析sogo词库工具类 19 | * 20 | */ 21 | public class SogouScel2Txt 22 | { 23 | 24 | public static void main(String[] args)throws Exception 25 | { 26 | sogou("G:/各大输入法词库/搜狗/sogou/城市信息大全/安徽/安徽.scel","G:/各大输入法词库/搜狗/sogou/城市信息大全/安徽/安徽.txt",false); 27 | } 28 | 29 | /** 30 | * 读取scel的词库文件,生成txt格式的文件 31 | * @param inputPath 输入路径 32 | * @param outputPath 输出路径 33 | * @param isAppend 是否拼接追加词库内容,true 代表追加,false代表重建 34 | * **/ 35 | 36 | public static void sogou(String inputPath,String outputPath,boolean isAppend) throws IOException 37 | { 38 | File file=new File(inputPath); 39 | if(!isAppend) 40 | { 41 | if(Files.exists(Paths.get(outputPath),LinkOption.values())) 42 | { 43 | System.out.println("存储此文件已经删除"); 44 | Files.deleteIfExists(Paths.get(outputPath)); 45 | 46 | } 47 | } 48 | RandomAccessFile raf=new RandomAccessFile(outputPath, "rw"); 49 | 50 | int count=0; 51 | SougouScelMdel model = new SougouScelReader().read(file); 52 | Map> words = model.getWordMap(); //词<拼音,词> 53 | Set>> set = words.entrySet(); 54 | Iterator>> iter = set.iterator(); 55 | while(iter.hasNext()) 56 | { 57 | Entry> entry = iter.next(); 58 | List list = entry.getValue(); 59 | int size = list.size(); 60 | for(int i = 0; i < size; i++) 61 | { 62 | String word = list.get(i); 63 | //System.out.println(word); 64 | raf.seek(raf.getFilePointer()); 65 | raf.write((word+"\n").getBytes());//写入txt文件 66 | count++; 67 | } 68 | } 69 | raf.close(); 70 | System.out.println(outputPath+"生成成功,总写入"+count+"个词条"); 71 | } 72 | 73 | } 74 | -------------------------------------------------------------------------------- /Sogou/SougouScelReader.java: 
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;

/**
 * Parses Sogou IME dictionary files (.scel) into a {@link SougouScelMdel}.
 *
 * <p>An instance reuses one scratch buffer between calls, so a single instance should
 * not be used from several threads at once.
 */
public class SougouScelReader {

    /** Charset name used to decode every string read from the file. */
    protected static String encoding = "UTF-16LE";

    /** Scratch buffer reused by {@link #readString}. */
    protected ByteArrayOutputStream output = new ByteArrayOutputStream();

    /**
     * Parses the given scel file.
     *
     * @param file the scel file
     * @return the parsed dictionary
     * @throws IOException if the file cannot be opened or read
     */
    public SougouScelMdel read(File file) throws IOException {
        return read(new FileInputStream(file));
    }

    /**
     * Parses the scel data served at the given URL.
     *
     * @param url location of the scel data
     * @return the parsed dictionary
     * @throws IOException if the stream cannot be opened or read
     */
    public SougouScelMdel read(URL url) throws IOException {
        return read(url.openStream());
    }

    /**
     * Reads a NUL-terminated (two zero bytes) string that starts at absolute offset {@code pos}.
     *
     * @param input stream positioned at offset {@code reads[0]}
     * @param pos   absolute offset of the string; must not lie before {@code reads[0]}
     * @param reads one-element array holding the current absolute offset; updated to the
     *              offset just past the terminator
     * @return the decoded string without its terminator
     * @throws EOFException if the stream ends before the terminator is found
     * @throws IOException  on any other read failure
     */
    protected String readString(DataInputStream input,int pos,int[] reads) throws IOException {
        int read = reads[0];
        skipFully(input, pos - read);
        read = pos;
        output.reset();
        while (true) {
            int c1 = input.read();
            int c2 = input.read();
            if (c1 < 0 || c2 < 0) {
                // Without this check a truncated file never yields the 0/0 terminator
                // and the loop would grow the buffer until memory runs out.
                throw new EOFException("unterminated string at offset 0x" + Integer.toHexString(pos));
            }
            read += 2;
            if (c1 == 0 && c2 == 0) {
                break;
            }
            output.write(c1);
            output.write(c2);
        }
        reads[0] = read;
        return new String(output.toByteArray(), encoding);
    }

    /**
     * Parses scel data from a stream and closes the stream when done.
     *
     * <p>Layout as read by this method: an 8-byte header whose fifth byte selects the
     * start of the word table (0x44 gives 0x2628, 0x45 gives 0x26C4); name, type,
     * description and sample strings at 0x130, 0x338, 0x540 and 0xd40; a pinyin table
     * starting at 0x1540 and ending with the syllable "zuo"; then the word table, which
     * runs to the end of the stream. A word table that ends in the middle of an entry is
     * treated as finished; the entries read up to that point are kept.
     *
     * @param in the scel data; always closed before this method returns
     * @return the parsed dictionary
     * @throws IOException      if the stream fails or ends before the word table starts
     * @throws RuntimeException if the header flag is neither 0x44 nor 0x45
     */
    public SougouScelMdel read(InputStream in) throws IOException {
        SougouScelMdel model = new SougouScelMdel();
        // Buffering matters: parsing issues thousands of one- and two-byte reads.
        DataInputStream input = new DataInputStream(new BufferedInputStream(in));
        try {
            byte[] bytes = new byte[4];
            input.readFully(bytes);
            assert (bytes[0] == 0x40 && bytes[1] == 0x15 && bytes[2] == 0 && bytes[3] == 0);
            input.readFully(bytes);
            int flag1 = bytes[0];
            assert (bytes[1] == 0x43 && bytes[2] == 0x53 && bytes[3] == 0x01);

            int[] reads = new int[] {8};
            model.setName(readString(input, 0x130, reads));
            model.setType(readString(input, 0x338, reads));
            model.setDescription(readString(input, 0x540, reads));
            model.setSample(readString(input, 0xd40, reads));

            int read = reads[0];
            skipFully(input, 0x1540 - read);
            read = 0x1540;
            input.readFully(bytes);
            read += 4;
            assert (bytes[0] == (byte) 0x9D && bytes[1] == 0x01 && bytes[2] == 0 && bytes[3] == 0);

            Map<Integer, String> pyMap = new LinkedHashMap<Integer, String>();
            read = readPinyinTable(input, pyMap, read);

            if (flag1 == 0x44) {
                skipFully(input, 0x2628 - read);
            } else if (flag1 == 0x45) {
                skipFully(input, 0x26C4 - read);
            } else {
                // message reads: "something unexpected happened, contact the author"
                throw new RuntimeException("出现意外,联系作者");
            }

            model.setWordMap(readWordTable(input, pyMap));
            return model;
        } finally {
            input.close(); // also closes the caller's stream
        }
    }

    /**
     * Reads pinyin table entries (index, size, text) into {@code pyMap} until "zuo" is read.
     *
     * @return the absolute stream offset after the table, given it started at {@code read}
     */
    private int readPinyinTable(DataInputStream input, Map<Integer, String> pyMap, int read)
            throws IOException {
        byte[] buf = new byte[256]; // size is a single unsigned byte, so this always fits
        while (true) {
            int mark = readUnsignedShort(input);
            int size = input.readUnsignedByte();
            skipFully(input, 1);
            read += 4;
            assert (size > 0 && (size % 2) == 0);
            input.readFully(buf, 0, size);
            read += size;
            String py = new String(buf, 0, size, encoding);
            pyMap.put(mark, py);
            if ("zuo".equals(py)) {
                return read;
            }
        }
    }

    /**
     * Reads word groups until the end of the stream. Each group holds a word count, a list
     * of pinyin indexes, and that many words, each followed by 12 bytes that are skipped.
     *
     * @return words grouped by their pinyin (syllables joined with {@code '}), in file order
     */
    private Map<String, List<String>> readWordTable(DataInputStream input, Map<Integer, String> pyMap)
            throws IOException {
        Map<String, List<String>> wordMap = new LinkedHashMap<String, List<String>>();
        StringBuilder pinyin = new StringBuilder();
        byte[] buf = new byte[128];
        while (true) {
            int size = readUnsignedShort(input);
            if (size < 0) {
                break; // regular end of the table
            }
            int count = readUnsignedShort(input);
            if (count < 0) {
                break; // stream ended inside a group header
            }
            int len = count / 2;
            assert (len * 2 == count);

            pinyin.setLength(0);
            boolean truncated = false;
            for (int i = 0; i < len; i++) {
                int key = readUnsignedShort(input);
                if (key < 0) {
                    truncated = true;
                    break;
                }
                pinyin.append(pyMap.get(key)).append("'");
            }
            if (truncated) {
                break;
            }
            pinyin.setLength(Math.max(pinyin.length() - 1, 0)); // drop the trailing separator
            String py = pinyin.toString();

            List<String> list = wordMap.get(py);
            if (list == null) {
                list = new ArrayList<String>();
                wordMap.put(py, list);
            }
            for (int i = 0; i < size; i++) {
                count = readUnsignedShort(input);
                if (count < 0) {
                    break; // stream ended between words
                }
                if (count > buf.length) {
                    buf = new byte[count];
                }
                try {
                    input.readFully(buf, 0, count);
                } catch (EOFException e) {
                    break; // stream ended inside a word; the outer loop then sees EOF too
                }
                String word = new String(buf, 0, count, encoding);
                // the 12 bytes after a word may hold a frequency or similar data; unused
                skipFully(input, 12);
                list.add(word);
            }
        }
        return wordMap;
    }

    /**
     * Skips {@code n} bytes, retrying when {@link InputStream#skip} skips fewer than asked.
     * Does nothing for {@code n <= 0} and stops silently at end of stream, so the next
     * read reports EOF.
     */
    private static void skipFully(InputStream in, long n) throws IOException {
        while (n > 0) {
            long skipped = in.skip(n);
            if (skipped > 0) {
                n -= skipped;
            } else if (in.read() < 0) {
                return;
            } else {
                n--;
            }
        }
    }

    /**
     * Reads a little-endian unsigned 16-bit value.
     *
     * @return the value, or {@link Integer#MIN_VALUE} if the stream ended first
     */
    protected final int readUnsignedShort(InputStream in) throws IOException {
        int ch1 = in.read();
        int ch2 = in.read();
        if ((ch1 | ch2) < 0) {
            return Integer.MIN_VALUE;
        }
        return (ch2 << 8) + (ch1 << 0);
    }

}

// Move this class into its own file as a public class if it is needed elsewhere.
/**
 * Parsed content of a scel dictionary: descriptive strings plus words grouped by pinyin.
 */
class SougouScelMdel {

    /** Words keyed by pinyin, in the order the reader stored them. */
    private Map<String, List<String>> wordMap;

    private String name;
    private String type;
    private String description;
    private String sample;

    public Map<String, List<String>> getWordMap() {
        return wordMap;
    }

    void setWordMap(Map<String, List<String>> wordMap) {
        this.wordMap = wordMap;
    }

    public String getType() {
        return type;
    }

    public void setType(String type) {
        this.type = type;
    }

    public String getDescription() {
        return description;
    }

    public void setDescription(String description) {
        this.description = description;
    }

    public String getSample() {
        return sample;
    }

    public void setSample(String sample) {
        this.sample = sample;
    }

    public String getName() {
        return name;
    }

    public void setName(String name) {
        this.name = name;
    }

}