├── README.md ├── mdict.py └── tool.py /README.md: -------------------------------------------------------------------------------- 1 | # mdict_reader 2 | Extract data from Octopus mdict (*.mdd, *.mdx) files 3 | 4 | ``` 5 | usage: tool.py [-h] [-l] [-a] [-x EXTRACT] [-d DIR] [-o OUTPUT] [-e TRANSCODE] 6 | mdict_file 7 | 8 | mdict tool 9 | 10 | positional arguments: 11 | mdict_file Input *.mdx or *.mdd file 12 | 13 | optional arguments: 14 | -h, --help show this help message and exit 15 | -l, --list List entry names in MDX or file names in MDD 16 | -a, --dump Dump all files in *.mdd into files in output dir or 17 | all entries in *.mdx into a CSV 18 | -x EXTRACT, --extract EXTRACT 19 | Extract one file or entry content, print to stdout if 20 | -o not specified. Argument should be specified in 21 | UTF-8 22 | -d DIR, --dir DIR Output directory for -a or -o 23 | -o OUTPUT, --output OUTPUT 24 | Output filename for -x 25 | -e TRANSCODE, --transcode TRANSCODE 26 | Transcode data, specified in format of 27 | INPUT_ENC:OUTPUT_ENC 28 | ``` 29 | -------------------------------------------------------------------------------- /mdict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # mdict.py 5 | # 6 | # Trimmed-down, refactored version of Octopus MDict Dictionary File (.mdx) and 7 | # Resource File (.mdd) Analyser by Xiaoquing Wang 8 | # 9 | # 10 | # This package includes ripemd128 and Salsa20 implementation by 11 | # 12 | # 13 | # This program is a free software; you can redistribute it and/or modify 14 | # it under the terms of the GNU General Public License as published by 15 | # the Free Software Foundation, version 3 of the License. 
#
# You can get a copy of the GNU General Public License along with this
# program, or from http://www.gnu.org/licenses/gpl.txt
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.

import json
import math
import re
import sys
import zlib  # zlib compression is used for engine version >= 2.0

from struct import pack, unpack, Struct
from io import BytesIO

# Minimal Python 2/3 compatibility shim.
assert(sys.version_info >= (2, 6))
if sys.version_info >= (3,):
    integer_types = (int,)
    unicode = str  # 2x3 compatible
    python3 = True
else:
    integer_types = (int, long)
    python3 = False

#########
# For LZO decompression
#
class FlexBuffer():
    """Growable byte buffer used as the output sink for LZO decompression.

    `c` is the logical write cursor, `l` the allocated length, `buf` the
    backing bytearray; storage grows in multiples of `blockSize`.
    """
    def __init__(self):
        self.blockSize = None
        self.c = None   # number of bytes claimed so far (write cursor)
        self.l = None   # allocated length of self.buf
        self.buf = None
    def require(self, n):
        """Ensure room for `n` more bytes, growing if needed; return the buffer."""
        r = self.c - self.l + n
        if r > 0:
            # Grow by whole blocks.
            # NOTE(review): on Python 2, math.ceil returns a float, which would
            # make self.l a float — confirm whether 2.x support still matters.
            self.l = self.l + self.blockSize * math.ceil(r / self.blockSize)
            self.buf = self.buf + bytearray(self.l - len(self.buf))
        self.c = self.c + n
        return self.buf
    def alloc(self, initSize, blockSize):
        """Allocate the backing store; `blockSize` falls back to 4096 when falsy."""
        sz = blockSize or 4096
        self.blockSize = self.roundUp(sz)
        self.c = 0
        self.l = self.roundUp(initSize) | 0
        self.l += self.blockSize - (self.l % self.blockSize)
        self.buf = bytearray(self.l)
        return self.buf
    def roundUp(self, n):
        """Round `n` up to the next multiple of 4."""
        r = n % 4
        return n if r == 0 else (n + 4 - r)
    def reset(self):
        """Rewind the write cursor without releasing storage."""
        self.c = 0
        self.l = len(self.buf)
    def pack(self, size):
        """Return the first `size` bytes of the buffer."""
        return self.buf[0:size]

def _decompress(inBuf, outBuf):
    """Decompress an LZO1X stream from bytearray `inBuf` into FlexBuffer `outBuf`.

    Direct port of the C reference decompressor: the original switch/goto
    structure is emulated with a `state` variable and a chain of if-blocks
    (fallthrough is modelled by NOT `continue`-ing after a state change).
    Returns the decompressed data as bytes.
    """
    # state labels as constants
    c_top_loop, c_first_literal_run, c_match, c_copy_match, c_match_done, c_match_next = range(6)

    out = outBuf.buf
    op = ip = m_pos = 0
    t = inBuf[ip]
    state = c_top_loop

    def copy(inbuffer, outbuffer, iptr, optr, counter, k):
        # Byte-by-byte copy: overlapping match copies rely on this ordering.
        for i in range(k):
            outbuffer[optr+i] = inbuffer[iptr+i]
        return iptr+k, optr+k, counter-k

    if t > 17:
        ip = ip + 1
        t = t - 17
        if t < 4:
            state = c_match_next
        else:
            out = outBuf.require(t)
            ip, op, t = copy(inBuf, out, ip, op, t, t)
            state = c_first_literal_run
    while True:
        if_block = False
        # emulate c switch structure by sequences of if statement
        if state == c_top_loop:
            t = inBuf[ip]
            ip = ip + 1
            if t >= 16:
                state = c_match
                continue
            if t == 0:
                while inBuf[ip] == 0:
                    t, ip = t+255, ip+1
                t = t + 15 + inBuf[ip]
                ip = ip + 1
            t = t + 3
            out = outBuf.require(t)
            ip, op, t = copy(inBuf, out, ip, op, t, t)
            state = c_first_literal_run
        if state == c_first_literal_run:
            t = inBuf[ip]
            ip = ip + 1
            if t >= 16:
                state = c_match
                continue
            m_pos = op - 0x801 - (t >> 2) - (inBuf[ip] << 2)
            ip = ip + 1
            out = outBuf.require(3)
            _, op, _ = copy(out, out, m_pos, op, 0, 3)
            state = c_match_done
            continue
        if state == c_match:
            if t >= 64:
                m_pos = op - 1 - ((t >> 2) & 7) - (inBuf[ip] << 3)
                ip = ip + 1
                t = (t >> 5) - 1
                state = c_copy_match
                continue
            elif t >= 32:
                t = t & 31
                if t == 0:
                    while inBuf[ip] == 0:
                        t, ip = t+255, ip+1
                    t = t + 31 + inBuf[ip]
                    ip = ip + 1
                m_pos = op - 1 - ((inBuf[ip] + (inBuf[ip + 1] << 8)) >> 2)
                ip = ip + 2
            elif t >= 16:
                m_pos = op - ((t & 8) << 11)
                t = t & 7
                if t == 0:
                    while inBuf[ip] == 0:
                        t, ip = t+255, ip+1
                    t = t + 7 + inBuf[ip]
                    ip = ip + 1
                m_pos = m_pos - ((inBuf[ip] + (inBuf[ip + 1] << 8)) >> 2)
                ip = ip + 2
                # zero distance marks end of stream
                if m_pos == op:
                    break
                m_pos = m_pos - 0x4000
            else:
                m_pos = op - 1 - (t >> 2) - (inBuf[ip] << 2)
                ip = ip + 1
                out = outBuf.require(2)
                _, op, _ = copy(out, out, m_pos, op, 0, 2)
                state = c_match_done
                continue
            if t >= 6 and (op - m_pos) >= 4:
                if_block = True
                t += 2
                out = outBuf.require(t)
                m_pos, op, t = copy(out, out, m_pos, op, t, t)
                state = c_copy_match
        if state == c_copy_match:
            if not if_block:
                t += 2
                out = outBuf.require(t)
                m_pos, op, t = copy(out, out, m_pos, op, t, t)
            state = c_match_done
        if state == c_match_done:
            # low 2 bits of the previous match byte give trailing literal count
            t = inBuf[ip - 2] & 3
            if t == 0:
                state = c_top_loop
                continue
            state = c_match_next
        if state == c_match_next:
            out = outBuf.require(1)
            ip, op, _ = copy(inBuf, out, ip, op, 0, 1)
            if t > 1:
                out = outBuf.require(1)
                ip, op, _ = copy(inBuf, out, ip, op, 0, 1)
            if t > 2:
                out = outBuf.require(1)
                ip, op, _ = copy(inBuf, out, ip, op, 0, 1)
            t = inBuf[ip]
            ip += 1
            state = c_match
    return bytes(outBuf.pack(op))

def lzo_decompress(input, initSize=16000, blockSize=1308672):
    """Decompress an LZO1X byte string; returns the decompressed bytes."""
    output = FlexBuffer()
    output.alloc(initSize, blockSize)
    return _decompress(bytearray(input), output)

#########
# For RIPEMD128
#
def f(j, x, y, z):
    """RIPEMD-128 round-dependent boolean function (round selected by j)."""
    assert(0 <= j < 64)
    return ((x ^ y ^ z) if j<16 else
            ((x & y) | (z & ~x)) if j<32 else
            ((x | (0xffffffff & ~y)) ^ z) if j<48 else
            ((x & z) | (y & ~z))
           )
def K(j):
    """RIPEMD-128 round constant for the left line."""
    assert(0 <= j < 64)
    return (0x00000000 if j<16 else
            0x5a827999 if j<32 else
            0x6ed9eba1 if j<48 else
            0x8f1bbcdc
           )
def Kp(j):
    """RIPEMD-128 round constant for the right (prime) line."""
    assert(0 <= j < 64)
    return (0x50a28be6 if j<16 else
            0x5c4dd124 if j<32 else
            0x6d703ef3 if j<48 else
            0x00000000
           )
def padandsplit(message):
    # NOTE(review): only the signature and the first docstring line of this
    # function are visible in this chunk of the dump; the remainder of its
    # body continues in the next chunk and is stubbed here.
    """
    returns a two-dimensional array X[i][j] of 32-bit integers,
    """
where j ranges 229 | from 0 to 16. 230 | First pads the message to length in bytes is congruent to 56 (mod 64), 231 | by first adding a byte 0x80, and then padding with 0x00 bytes until the 232 | message length is congruent to 56 (mod 64). Then adds the little-endian 233 | 64-bit representation of the original length. Finally, splits the result 234 | up into 64-byte blocks, which are further parsed as 32-bit integers. 235 | """ 236 | origlen = len(message) 237 | padlength = 64 - ((origlen - 56) % 64) #minimum padding is 1! 238 | message += b"\x80" 239 | message += b"\x00" * (padlength - 1) 240 | message += pack("> (32-s)) & 0xffffffff 254 | r = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, 255 | 7, 4,13, 1,10, 6,15, 3,12, 0, 9, 5, 2,14,11, 8, 256 | 3,10,14, 4, 9,15, 8, 1, 2, 7, 0, 6,13,11, 5,12, 257 | 1, 9,11,10, 0, 8,12, 4,13, 3, 7,15,14, 5, 6, 2] 258 | rp = [ 5,14, 7, 0, 9, 2,11, 4,13, 6,15, 8, 1,10, 3,12, 259 | 6,11, 3, 7, 0,13, 5,10,14,15, 8,12, 4, 9, 1, 2, 260 | 15, 5, 1, 3, 7,14, 6, 9,11, 8,12, 2,10, 0, 4,13, 261 | 8, 6, 4, 1, 3,11,15, 0, 5,12, 2,13, 9, 7,10,14] 262 | s = [11,14,15,12, 5, 8, 7, 9,11,13,14,15, 6, 7, 9, 8, 263 | 7, 6, 8,13,11, 9, 7,15, 7,12,15, 9,11, 7,13,12, 264 | 11,13, 6, 7,14, 9,13,15,14, 8,13, 6, 5,12, 7, 5, 265 | 11,12,14,15,14,15, 9, 8, 9,14, 5, 6, 8, 6, 5,12] 266 | sp = [ 8, 9, 9,11,13,15,15, 5, 7, 7, 8,11,14,14,12, 6, 267 | 9,13,15, 7,12, 8, 9,11, 7, 7,12, 7, 6,15,13,11, 268 | 9, 7,15,11, 8, 6, 6,14,12,13, 5,14,13,13, 7, 5, 269 | 15, 5, 8,11,14,14, 6,14, 6, 9,12, 9,12, 5,15, 8] 270 | def ripemd128(message): 271 | h0 = 0x67452301 272 | h1 = 0xefcdab89 273 | h2 = 0x98badcfe 274 | h3 = 0x10325476 275 | X = padandsplit(message) 276 | for i in range(len(X)): 277 | (A,B,C,D) = (h0,h1,h2,h3) 278 | (Ap,Bp,Cp,Dp) = (h0,h1,h2,h3) 279 | for j in range(64): 280 | T = rol(s[j], add(A, f(j,B,C,D), X[i][r[j]], K(j))) 281 | (A,D,C,B) = (D,C,B,T) 282 | T = rol(sp[j], add(Ap, f(63-j,Bp,Cp,Dp), X[i][rp[j]], Kp(j))) 283 | (Ap,Dp,Cp,Bp) = (Dp,Cp,Bp,T) 
284 | T = add(h1,C,Dp) 285 | h1 = add(h2,D,Ap) 286 | h2 = add(h3,A,Bp) 287 | h3 = add(h0,B,Cp) 288 | h0 = T 289 | return pack("= 2**64" 335 | ctx = self.ctx 336 | ctx[ 8],ctx[ 9] = little2_i32.unpack( little_u64.pack( counter ) ) 337 | def getCounter( self ): 338 | return little_u64.unpack( little2_i32.pack( *self.ctx[ 8:10 ] ) ) [0] 339 | def setRounds(self, rounds, testing=False ): 340 | assert testing or rounds in [8, 12, 20], 'rounds must be 8, 12, 20' 341 | self.rounds = rounds 342 | def encryptBytes(self, data): 343 | assert type(data) == bytes, 'data must be byte string' 344 | assert self._lastChunk64, 'previous chunk not multiple of 64 bytes' 345 | lendata = len(data) 346 | munged = bytearray(lendata) 347 | for i in range( 0, lendata, 64 ): 348 | h = salsa20_wordtobyte( self.ctx, self.rounds, checkRounds=False ) 349 | self.setCounter( ( self.getCounter() + 1 ) % 2**64 ) 350 | # Stopping at 2^70 bytes per nonce is user's responsibility. 351 | for j in range( min( 64, lendata - i ) ): 352 | if python3: 353 | munged[ i+j ] = data[ i+j ] ^ h[j] 354 | else: 355 | munged[ i+j ] = ord(data[ i+j ]) ^ ord(h[j]) 356 | self._lastChunk64 = not lendata % 64 357 | return bytes(munged) 358 | decryptBytes = encryptBytes # encrypt and decrypt use same function 359 | def salsa20_wordtobyte( input, nRounds=20, checkRounds=True ): 360 | """ Do nRounds Salsa20 rounds on a copy of 361 | input: list or tuple of 16 ints treated as little-endian unsigneds. 362 | Returns a 64-byte string. 363 | """ 364 | assert( type(input) in ( list, tuple ) and len(input) == 16 ) 365 | assert( not(checkRounds) or ( nRounds in [ 8, 12, 20 ] ) ) 366 | x = list( input ) 367 | def XOR( a, b ): return a ^ b 368 | ROTATE = rot32 369 | PLUS = add32 370 | for i in range( nRounds // 2 ): 371 | # These ...XOR...ROTATE...PLUS... 
lines are from ecrypt-linux.c 372 | x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 0],x[12]), 7)) 373 | x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[ 4],x[ 0]), 9)) 374 | x[12] = XOR(x[12],ROTATE(PLUS(x[ 8],x[ 4]),13)) 375 | x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[12],x[ 8]),18)) 376 | x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 5],x[ 1]), 7)) 377 | x[13] = XOR(x[13],ROTATE(PLUS(x[ 9],x[ 5]), 9)) 378 | x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[13],x[ 9]),13)) 379 | x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 1],x[13]),18)) 380 | x[14] = XOR(x[14],ROTATE(PLUS(x[10],x[ 6]), 7)) 381 | x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[14],x[10]), 9)) 382 | x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 2],x[14]),13)) 383 | x[10] = XOR(x[10],ROTATE(PLUS(x[ 6],x[ 2]),18)) 384 | x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[15],x[11]), 7)) 385 | x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 3],x[15]), 9)) 386 | x[11] = XOR(x[11],ROTATE(PLUS(x[ 7],x[ 3]),13)) 387 | x[15] = XOR(x[15],ROTATE(PLUS(x[11],x[ 7]),18)) 388 | 389 | x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[ 0],x[ 3]), 7)) 390 | x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[ 1],x[ 0]), 9)) 391 | x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[ 2],x[ 1]),13)) 392 | x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[ 3],x[ 2]),18)) 393 | x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 5],x[ 4]), 7)) 394 | x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 6],x[ 5]), 9)) 395 | x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 7],x[ 6]),13)) 396 | x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 4],x[ 7]),18)) 397 | x[11] = XOR(x[11],ROTATE(PLUS(x[10],x[ 9]), 7)) 398 | x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[11],x[10]), 9)) 399 | x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 8],x[11]),13)) 400 | x[10] = XOR(x[10],ROTATE(PLUS(x[ 9],x[ 8]),18)) 401 | x[12] = XOR(x[12],ROTATE(PLUS(x[15],x[14]), 7)) 402 | x[13] = XOR(x[13],ROTATE(PLUS(x[12],x[15]), 9)) 403 | x[14] = XOR(x[14],ROTATE(PLUS(x[13],x[12]),13)) 404 | x[15] = XOR(x[15],ROTATE(PLUS(x[14],x[13]),18)) 405 | for i in range( len( input ) ): 406 | x[i] = PLUS( x[i], input[i] ) 407 | return little16_i32.pack( *x ) 408 | def trunc32( w ): 409 | "extract bottom 32 bits to a 32-bit word" 410 | w = int( ( w & 0x7fffFFFF ) | -( w & 
0x80000000 ) ) 411 | assert type(w) == int 412 | return w 413 | def add32( a, b ): 414 | "add two 32-bit word and keep retval a 32-bit word by discarding carry" 415 | lo = ( a & 0xFFFF ) + ( b & 0xFFFF ) 416 | hi = ( a >> 16 ) + ( b >> 16 ) + ( lo >> 16 ) 417 | return ( -(hi & 0x8000) | ( hi & 0x7FFF ) ) << 16 | ( lo & 0xFFFF ) 418 | def rot32( w, nLeft ): 419 | "left rotate 32-bit word and keep retval a 32-bit word" 420 | nLeft &= 31 # which makes nLeft >= 0 421 | if nLeft == 0: 422 | return w 423 | # Note: now 1 <= nLeft <= 31. 424 | # RRRsLLLLLL There are nLeft RRR's, (31-nLeft) LLLLLL's, 425 | # => sLLLLLLRRR and one s which becomes the sign bit. 426 | RRR = ( ( ( w >> 1 ) & 0x7fffFFFF ) >> ( 31 - nLeft ) ) 427 | sLLLLLL = -( (1<<(31-nLeft)) & w ) | (0x7fffFFFF>>nLeft) & w 428 | return RRR | ( sLLLLLL << nLeft ) 429 | def _unescape_entities(text): 430 | ' unescape offending tags < > " & ' 431 | text = text.replace(b'<', b'<') 432 | text = text.replace(b'>', b'>') 433 | text = text.replace(b'"', b'"') 434 | text = text.replace(b'&', b'&') 435 | return text 436 | def _fast_decrypt(data, key): 437 | b = bytearray(data) 438 | key = bytearray(key) 439 | previous = 0x36 440 | for i in range(len(b)): 441 | t = (b[i] >> 4 | b[i] << 4) & 0xff 442 | t = t ^ previous ^ (i & 0xff) ^ key[i % len(key)] 443 | previous, b[i] = b[i], t 444 | return bytes(b) 445 | def _mdx_decrypt(comp_block): 446 | key = ripemd128(comp_block[4:8] + pack(b' 505 | """ 506 | taglist = re.findall(b'(\w+)="(.*?)"', header, re.DOTALL) 507 | return {key:_unescape_entities(value) for key, value in taglist} 508 | def get_records(self): 509 | """ 510 | Return a generator for key and value of each record 511 | key is from self._key_list, value is decrypted/decompressed record body 512 | """ 513 | with open(self._fname, 'rb') as f: 514 | f.seek(self._record_block_offset) 515 | # metadata from header 516 | num_record_blocks = self._read_number(f) 517 | num_entries = self._read_number(f) 518 | 
record_block_info_size = self._read_number(f) 519 | record_block_size = self._read_number(f) 520 | assert(num_entries == self._num_entries) 521 | # metadata of each record 522 | record_block_info_list = [] 523 | size_counter = 0 524 | for i in range(num_record_blocks): 525 | compressed_size = self._read_number(f) 526 | decompressed_size = self._read_number(f) 527 | record_block_info_list += [(compressed_size, decompressed_size)] 528 | size_counter += self._number_width * 2 529 | assert(size_counter == record_block_info_size) 530 | # scan each record 531 | offset = i = size_counter = 0 532 | for compressed_size, decompressed_size in record_block_info_list: 533 | current_pos = f.tell() 534 | # the whole record: read `compressed_size` bytes for compressed data 535 | record_block_compressed = f.read(compressed_size) 536 | # first 4 bytes: compression type 537 | # next 4 bytes: adler32 checksum of decompressed record block 538 | # the rest: record data 539 | record_block_type = record_block_compressed[:4] 540 | adler32 = unpack('>I', record_block_compressed[4:8])[0] 541 | record_block = decompress(record_block_type, record_block_compressed[8:], decompressed_size) 542 | compress_type = {PLAIN_MAGIC:0, LZO_MAGIC:1, ZLIB_MAGIC:2}[record_block_type] 543 | assert(adler32 == zlib.adler32(record_block) & 0xffffffff) # adler32 is signed 544 | assert(len(record_block) == decompressed_size) 545 | # split record block according to the offset info from key block 546 | while i < len(self._key_list): 547 | record_start, key_text = self._key_list[i] 548 | # reach the end of current record block 549 | if record_start - offset >= decompressed_size: 550 | break 551 | # record end index 552 | if i < len(self._key_list) - 1: 553 | record_end = self._key_list[i + 1][0] 554 | else: 555 | record_end = decompressed_size + offset 556 | i += 1 557 | yield { 558 | 'file_pos': current_pos 559 | ,'compressed_size': compressed_size 560 | ,'decompressed_size': decompressed_size 561 | 
,'record_block_type': compress_type 562 | ,'record_start': record_start 563 | ,'key_text': key_text 564 | ,'offset': offset 565 | ,'data': record_block[record_start - offset:record_end - offset] 566 | ,'record_end': record_end 567 | } 568 | offset += decompressed_size 569 | size_counter += compressed_size 570 | # verify how much read matches what is specified in header 571 | assert(size_counter == record_block_size) 572 | def _decode_key_block_info(self, key_block_info_compressed): 573 | if self._version >= 2: 574 | # version>=2 must use zlib compression 575 | assert(key_block_info_compressed[:4] == ZLIB_MAGIC) 576 | # decrypt if needed, then decompress 577 | if self._encrypt & 0x02: 578 | key_block_info_compressed = _mdx_decrypt(key_block_info_compressed) 579 | key_block_info = decompress(ZLIB_MAGIC, key_block_info_compressed[8:]) 580 | # verify adler checksum 581 | adler32 = unpack('>I', key_block_info_compressed[4:8])[0] 582 | assert(adler32 == zlib.adler32(key_block_info) & 0xffffffff) 583 | else: 584 | # no compression 585 | key_block_info = key_block_info_compressed 586 | # decode 587 | key_block_info_list = [] 588 | num_entries = i = 0 589 | if self._version >= 2: 590 | byte_format, byte_width, text_term = '>H', 2, 1 591 | else: 592 | byte_format, byte_width, text_term = '>B', 1, 0 593 | while i < len(key_block_info): 594 | # number of entries in current key block 595 | num_entries += unpack(self._number_format, key_block_info[i:i+self._number_width])[0] 596 | i += self._number_width 597 | # text head size 598 | text_head_size = unpack(byte_format, key_block_info[i:i+byte_width])[0] 599 | i += byte_width 600 | # text head 601 | if self._encoding != 'UTF-16': 602 | i += text_head_size + text_term 603 | else: 604 | i += (text_head_size + text_term) * 2 605 | # text tail size 606 | text_tail_size = unpack(byte_format, key_block_info[i:i+byte_width])[0] 607 | i += byte_width 608 | # text tail 609 | if self._encoding != 'UTF-16': 610 | i += text_tail_size + 
text_term 611 | else: 612 | i += (text_tail_size + text_term) * 2 613 | # key block compressed size 614 | key_block_compressed_size = unpack(self._number_format, key_block_info[i:i+self._number_width])[0] 615 | i += self._number_width 616 | # key block decompressed size 617 | key_block_decompressed_size = unpack(self._number_format, key_block_info[i:i+self._number_width])[0] 618 | i += self._number_width 619 | key_block_info_list += [(key_block_compressed_size, key_block_decompressed_size)] 620 | assert(num_entries == self._num_entries) 621 | return key_block_info_list 622 | def _decode_key_block(self, key_block_compressed, key_block_info_list): 623 | key_list = [] 624 | i = 0 625 | for compressed_size, decompressed_size in key_block_info_list: 626 | start = i 627 | i = end = i + compressed_size 628 | # 4 bytes : compression type 629 | key_block_type = key_block_compressed[start:start+4] 630 | # 4 bytes : adler checksum of decompressed key block 631 | adler32 = unpack('>I', key_block_compressed[start+4:start+8])[0] 632 | key_block = decompress(key_block_type, key_block_compressed[start+8:end], decompressed_size) 633 | # extract one single key block into a key list 634 | key_list += self._split_key_block(key_block) 635 | # notice that adler32 returns signed value 636 | assert(adler32 == zlib.adler32(key_block) & 0xffffffff) 637 | return key_list 638 | def _split_key_block(self, key_block): 639 | key_list = [] 640 | key_start_index = 0 641 | while key_start_index < len(key_block): 642 | # the corresponding record's offset in record block 643 | key_id = unpack(self._number_format, 644 | key_block[key_start_index:key_start_index+self._number_width] 645 | )[0] 646 | # key text ends with '\x00' 647 | if self._encoding == 'UTF-16': 648 | delimiter, width = b'\x00\x00', 2 649 | else: 650 | delimiter, width = b'\x00', 1 651 | i = key_start_index + self._number_width 652 | while i < len(key_block): 653 | if key_block[i:i + width] == delimiter: 654 | key_end_index = i 655 | 
break 656 | i += width 657 | key_text = key_block[key_start_index + self._number_width:key_end_index]\ 658 | .decode(self._encoding, errors='ignore').encode('utf-8').strip() 659 | key_start_index = key_end_index + width 660 | key_list += [(key_id, key_text)] 661 | return key_list 662 | def _read_header(self): 663 | with open(self._fname, 'rb') as f: 664 | # number of bytes of header text 665 | header_bytes_size = unpack('>I', f.read(4))[0] 666 | header_bytes = f.read(header_bytes_size) 667 | # 4 bytes: adler32 checksum of header, in little endian 668 | adler32 = unpack('= (3,): 678 | encoding = encoding.decode('utf-8') 679 | # GB18030 is superset of GBK & GB2312 680 | if encoding in ['GBK', 'GB2312']: 681 | encoding = 'GB18030' 682 | self._encoding = encoding 683 | # read title and description 684 | self._title = header_tag[b'Title'].decode('utf-8') if b'Title' in header_tag else '' 685 | self._description = header_tag[b'Description'].decode('utf-8') if b'Description' in header_tag else '' 686 | # encryption flag 687 | # 0x00 - no encryption 688 | # 0x01 - encrypt record block 689 | # 0x02 - encrypt key info block 690 | if b'Encrypted' not in header_tag or header_tag[b'Encrypted'] == b'No': 691 | self._encrypt = 0 692 | elif header_tag[b'Encrypted'] == b'Yes': 693 | self._encrypt = 1 694 | else: 695 | self._encrypt = int(header_tag[b'Encrypted']) 696 | # stylesheet attribute if present takes form of: 697 | # style_number # 1-255 698 | # style_begin # or '' 699 | # style_end # or '' 700 | # store stylesheet in dict in the form of 701 | # {'number' : ('style_begin', 'style_end')} 702 | self._stylesheet = {} 703 | if header_tag.get('StyleSheet'): 704 | lines = header_tag['StyleSheet'].splitlines() 705 | for i in range(0, len(lines), 3): 706 | self._stylesheet[lines[i]] = (lines[i + 1], lines[i + 2]) 707 | # before version 2.0, number is 4 bytes integer 708 | # version 2.0 and above uses 8 bytes 709 | self._version = float(header_tag[b'GeneratedByEngineVersion']) 710 | 
if self._version < 2.0: 711 | self._number_width, self._number_format = 4, '>I' 712 | else: 713 | self._number_width, self._number_format = 8, '>Q' 714 | return header_tag 715 | def _read_keys(self): 716 | with open(self._fname, 'rb') as f: 717 | f.seek(self._key_block_offset) 718 | # the following numbers could be encrypted 719 | num_bytes = (8*5) if self._version >= 2.0 else (4*4) 720 | block = f.read(num_bytes) 721 | if self._encrypt & 1: 722 | if self._passcode is None: 723 | raise RuntimeError('user identification is needed to read encrypted file') 724 | regcode, userid = self._passcode 725 | if isinstance(userid, unicode): 726 | userid = userid.encode('utf8') 727 | if self.header[b'RegisterBy'] == b'EMail': 728 | encrypted_key = _decrypt_regcode_by_email(regcode, userid) 729 | else: 730 | encrypted_key = _decrypt_regcode_by_deviceid(regcode, userid) 731 | block = _salsa_decrypt(block, encrypted_key) 732 | # decode this block 733 | sf = BytesIO(block) 734 | num_key_blocks = self._read_number(sf) 735 | self._num_entries = self._read_number(sf) 736 | # number of bytes of key block info after decompression 737 | if self._version >= 2.0: 738 | _ = self._read_number(sf) # key_block_info_decomp_size, unused here 739 | # number of bytes of key block info 740 | key_block_info_size = self._read_number(sf) 741 | # number of bytes of key block 742 | key_block_size = self._read_number(sf) 743 | # 4 bytes: adler checksum of previous 5 numbers 744 | if self._version >= 2.0: 745 | adler32 = unpack('>I', f.read(4))[0] 746 | assert adler32 == (zlib.adler32(block) & 0xffffffff) 747 | # read key block info, which indicates key block's compressed and decompressed size 748 | key_block_info = f.read(key_block_info_size) 749 | key_block_info_list = self._decode_key_block_info(key_block_info) 750 | assert(num_key_blocks == len(key_block_info_list)) 751 | # read and decompress key block 752 | key_block_compressed = f.read(key_block_size) 753 | key_list = 
self._decode_key_block(key_block_compressed, key_block_info_list) 754 | self._record_block_offset = f.tell() 755 | return key_list 756 | def _read_keys_brutal(self): 757 | with open(self._fname, 'rb') as f: 758 | f.seek(self._key_block_offset) 759 | # the following numbers could be encrypted, disregard them! 760 | if self._version >= 2.0: 761 | num_bytes, key_block_type = (8*5+4), ZLIB_MAGIC 762 | else: 763 | num_bytes, key_block_type = (4*4), LZO_MAGIC 764 | block = f.read(num_bytes) 765 | # key block info: 766 | # - 4 bytes '\x02\x00\x00\x00' 767 | # - 4 bytes adler32 checksum 768 | # - a number of bytes 769 | # - 4 bytes '\x02\x00\x00\x00' marks the beginning of key block 770 | key_block_info = f.read(8) 771 | if self._version >= 2.0: 772 | assert key_block_info[:4] == ZLIB_MAGIC 773 | while True: 774 | fpos = f.tell() 775 | t = f.read(1024) 776 | index = t.find(key_block_type) 777 | if index != -1: 778 | key_block_info += t[:index] 779 | f.seek(fpos + index) 780 | break 781 | else: 782 | key_block_info += t 783 | key_block_info_list = self._decode_key_block_info(key_block_info) 784 | key_block_size = sum(list(zip(*key_block_info_list))[0]) 785 | # read and decompress key block 786 | key_block_compressed = f.read(key_block_size) 787 | key_list = self._decode_key_block(key_block_compressed, key_block_info_list) 788 | self._record_block_offset = f.tell() 789 | self._num_entries = len(key_list) 790 | return key_list 791 | def get_index(self): 792 | index_dict_list = [] # list of dict, each one is index to one record 793 | for record_dict in self.get_records(): 794 | del record_dict['data'] 795 | index_dict_list.append(index_dict) 796 | return index_dict_list 797 | 798 | class MDD(MDict): 799 | """ 800 | MDict resource file format (*.MDD) reader. 801 | >>> mdd = MDD('example.mdd') 802 | >>> len(mdd) 803 | 208 804 | >>> for filename,content in mdd.items(): 805 | ... 
print filename, content[:10] 806 | """ 807 | def __init__(self, fname, passcode=None): 808 | MDict.__init__(self, fname, encoding='UTF-16', passcode=passcode) 809 | def items(self): 810 | """ 811 | Return a generator which in turn produce tuples of (filename, blob), 812 | both in bytestring 813 | """ 814 | for record_dict in self.get_records(): 815 | filename = record_dict['key_text'].decode('utf-8') 816 | blob = record_dict['data'] 817 | yield filename, blob 818 | 819 | class MDX(MDict): 820 | """ 821 | MDict dictionary file format (*.MDD) reader. 822 | >>> mdx = MDX('example.mdx') 823 | >>> len(mdx) 824 | 42481 825 | >>> for key,value in mdx.items(): 826 | ... print key, value[:10] 827 | """ 828 | def __init__(self, fname, encoding='', substyle=False, passcode=None): 829 | MDict.__init__(self, fname, encoding, passcode) 830 | self._substyle = substyle 831 | def _substitute_stylesheet(self, txt): 832 | 'Replace style with loaded stylesheet' 833 | txt_list = re.split('`\d+`', txt) 834 | txt_tag = re.findall('`\d+`', txt) 835 | txt_styled = txt_list[0] 836 | for j, p in enumerate(txt_list[1:]): 837 | style = self._stylesheet[txt_tag[j][1:-1]] 838 | if p and p[-1] == b'\n': 839 | txt_styled = txt_styled + style[0] + p.rstrip() + style[1] + b'\r\n' 840 | else: 841 | txt_styled = txt_styled + style[0] + p + style[1] 842 | return txt_styled 843 | def items(self): 844 | """ 845 | Return a generator which in turn produce tuples in the form of (title, text), 846 | both in unicode string 847 | """ 848 | for record_dict in self.get_records(): 849 | title = record_dict['key_text'] 850 | text = record_dict['data'] \ 851 | .decode(self._encoding, errors='ignore') \ 852 | .strip(u'\x00') \ 853 | .encode('utf-8') 854 | # substitute stylesheet if required 855 | if self._substyle and self._stylesheet: 856 | text = self._substitute_stylesheet(text) 857 | yield title, text 858 | def get_index(self): 859 | index_dict_list = super(MDX,self).get_index() 860 | return { 861 | 
"index_dict_list": index_dict_list 862 | ,"meta": { 863 | 'encoding': self._encoding 864 | ,'stylesheet': json.dumps(self._stylesheet) 865 | ,'title': self._title 866 | ,'description': self._description 867 | } 868 | } 869 | -------------------------------------------------------------------------------- /tool.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | 4 | # MDict *.mdd and *.mdx data extractor 5 | # 6 | # This program is a free software; you can redistribute it and/or modify 7 | # it under the terms of the GNU General Public License as published by 8 | # the Free Software Foundation, version 3 of the License. 9 | # 10 | # You can get a copy of GNU General Public License along this program 11 | # But you can always get it from http://www.gnu.org/licenses/gpl.txt 12 | # 13 | # This program is distributed in the hope that it will be useful, 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 | # GNU General Public License for more details. 
17 | 18 | from __future__ import print_function 19 | 20 | import os 21 | import sys 22 | import contextlib 23 | import argparse 24 | 25 | #from readmdict import MDD, MDX 26 | from mdict import MDD, MDX 27 | 28 | def csvquote(s): 29 | 'Quote a CSV field, as in RFC4180' 30 | if not any(c in s for c in '\n",'): 31 | return s 32 | elif '"' in s: 33 | return '"' + s.replace('"','""') + '"' 34 | else: 35 | return '"' + s + '"' 36 | 37 | @contextlib.contextmanager 38 | def csvopen(filename=None): 39 | 'Combined interface for file and stdout' 40 | if filename and filename != '-': 41 | fh = open(filename, 'wb') 42 | else: 43 | fh = sys.stdout 44 | try: 45 | yield fh 46 | finally: 47 | if fh is not sys.stdout: 48 | fh.close() 49 | 50 | def main(): 51 | # command line argument 52 | parser = argparse.ArgumentParser(description="mdict tool") 53 | parser.add_argument('mdict_file', 54 | help="Input *.mdx or *.mdd file") 55 | parser.add_argument('-l', '--list', default=False, action='store_true', 56 | help='List entry names in MDX or file names in MDD') 57 | parser.add_argument('-a', '--dump', default=False, action='store_true', 58 | help='Dump all files in *.mdd into files in output dir or ' \ 59 | 'all entries in *.mdx into a CSV') 60 | parser.add_argument('-x', '--extract', 61 | help='Extract one file or entry content, print to stdout if -o not specified. 
' \ 62 | 'Argument should be specified in UTF-8') 63 | parser.add_argument('-d', '--dir', 64 | help='Output directory for -a or -o') 65 | parser.add_argument('-o', '--output', 66 | help='Output filename for -x') 67 | parser.add_argument('-e', '--transcode', 68 | help='Transcode data, specified in format of INPUT_ENC:OUTPUT_ENC') 69 | args = parser.parse_args() 70 | 71 | # open file 72 | is_mdd = args.mdict_file.lower().endswith('.mdd') 73 | obj = MDD(args.mdict_file) if is_mdd else MDX(args.mdict_file) 74 | if args.transcode: 75 | in_enc, out_enc = args.transcode.split(':') 76 | assert((in_enc and out_enc) or (not in_enc and not out_enc)) 77 | else: 78 | in_enc, out_enc = None, None 79 | 80 | # operation depends on input 81 | if args.list: 82 | # print all key (entries or filenames) 83 | for key, _ in obj.items(): 84 | if in_enc and out_enc: 85 | print(key.decode(in_enc).encode(out_enc)) 86 | else: 87 | print(key) 88 | elif args.dump and is_mdd: 89 | # dump all resources in *.mdd into files 90 | for filename, blob in obj.items(): 91 | if in_enc: # transcode filename if needed 92 | filename = filename.decode(in_enc) 93 | filename = key # use entry name as filename 94 | if args.dir: 95 | filename = os.path.join(args.dir, filename) 96 | open(filename, 'wb').write(value) 97 | elif args.dump and not is_mdd: 98 | # dump all resources in *.mdx into a CSV file 99 | newline = '\r\n' 100 | if args.output: 101 | filename = args.output 102 | if args.dir: 103 | filename = os.path.join(args.dir, filename) 104 | else: 105 | filename = '-' 106 | with csvopen(filename) as fh: 107 | for key, val in obj.items(): 108 | if in_enc: # transcode key & val if needed 109 | key = key.decode(in_enc) 110 | val = val.decode(in_enc).encode(out_enc) 111 | fh.write(csvquote(key)) 112 | fh.write(',') 113 | fh.write(csvquote(val)) 114 | fh.write(newline) 115 | elif args.extract: 116 | # find entry/filename, write definition/blob 117 | target = args.extract.decode('utf-8') 118 | for key, value in 
obj.items(): 119 | if in_enc: # transcode key if needed 120 | key = key.decode(in_enc) 121 | if key != target: # seek until we find the target 122 | continue 123 | if in_enc and out_enc: # transcode value if needed 124 | value = value.decode(in_enc).encode(out_enc) 125 | filename = args.output 126 | if args.dir and is_mdd:# data from *.mdd has its default filename 127 | filename = key 128 | if not filename: # print to console or save to file 129 | print(value) 130 | else: 131 | filename = args.output 132 | if args.dir: 133 | filename = os.path.join(args.dir, filename) 134 | open(filename, 'wb').write(value) 135 | 136 | if __name__ == '__main__': 137 | main() 138 | --------------------------------------------------------------------------------