├── README.md
├── mdict.py
└── tool.py


/README.md:
--------------------------------------------------------------------------------
 1 | # mdict_reader
 2 | Extract data from Octopus mdict (*.mdd, *.mdx) files
 3 | 
 4 | ```
 5 | usage: tool.py [-h] [-l] [-a] [-x EXTRACT] [-d DIR] [-o OUTPUT] [-e TRANSCODE]
 6 |                mdict_file
 7 | 
 8 | mdict tool
 9 | 
10 | positional arguments:
11 |   mdict_file            Input *.mdx or *.mdd file
12 | 
13 | optional arguments:
14 |   -h, --help            show this help message and exit
15 |   -l, --list            List entry names in MDX or file names in MDD
16 |   -a, --dump            Dump all files in *.mdd into files in output dir or
17 |                         all entries in *.mdx into a CSV
18 |   -x EXTRACT, --extract EXTRACT
19 |                         Extract one file or entry content, print to stdout if
20 |                         -o not specified. Argument should be specified in
21 |                         UTF-8
22 |   -d DIR, --dir DIR     Output directory for -a or -o
23 |   -o OUTPUT, --output OUTPUT
24 |                         Output filename for -x
25 |   -e TRANSCODE, --transcode TRANSCODE
26 |                         Transcode data, specified in format of
27 |                         INPUT_ENC:OUTPUT_ENC
28 | ```
29 | 


--------------------------------------------------------------------------------
/mdict.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # mdict.py
  5 | #
  6 | # Trimmed-down, refactored version of Octopus MDict Dictionary File (.mdx) and
  7 | # Resource File (.mdd) Analyser by Xiaoquing Wang
  8 | # <https://bitbucket.org/xwang/mdict-analysis>
  9 | #
 10 | # This package includes ripemd128 and Salsa20 implementation by
 11 | # <https://github.com/zhansliu/writemdict>
 12 | #
 13 | # This program is a free software; you can redistribute it and/or modify
 14 | # it under the terms of the GNU General Public License as published by
 15 | # the Free Software Foundation, version 3 of the License.
 16 | #
 17 | # You can get a copy of GNU General Public License along this program
 18 | # But you can always get it from http://www.gnu.org/licenses/gpl.txt
 19 | #
 20 | # This program is distributed in the hope that it will be useful,
 21 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 22 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 23 | # GNU General Public License for more details.
 24 | 
 25 | import json
 26 | import math
 27 | import re
 28 | import sys
 29 | import zlib # zlib compression is used for engine version >=2.0
 30 | 
 31 | from struct import pack, unpack, Struct
 32 | from io import BytesIO
 33 | 
 34 | assert(sys.version_info >= (2, 6))
 35 | if sys.version_info >= (3,):
 36 |     integer_types = (int,)
 37 |     unicode = str # 2x3 compatible
 38 |     python3 = True
 39 | else:
 40 |     integer_types = (int, long)
 41 |     python3 = False
 42 | 
 43 | #########
 44 | # For LZO decompression
 45 | #
 46 | class FlexBuffer():
 47 |     def __init__(self):
 48 |         self.blockSize = None
 49 |         self.c = None
 50 |         self.l = None
 51 |         self.buf = None
 52 |     def require(self, n):
 53 |         r = self.c - self.l + n
 54 |         if r > 0:
 55 |             self.l = self.l + self.blockSize * math.ceil(r / self.blockSize)
 56 |             self.buf = self.buf + bytearray(self.l - len(self.buf))
 57 |         self.c = self.c + n
 58 |         return self.buf
 59 |     def alloc(self, initSize, blockSize):
 60 |         sz = blockSize or 4096
 61 |         self.blockSize = self.roundUp(sz)
 62 |         self.c = 0
 63 |         self.l = self.roundUp(initSize) | 0
 64 |         self.l += self.blockSize - (self.l % self.blockSize)
 65 |         self.buf = bytearray(self.l)
 66 |         return self.buf
 67 |     def roundUp(self, n):
 68 |         r = n % 4
 69 |         return n if r==0 else (n+4-r)
 70 |     def reset(self):
 71 |         self.c = 0
 72 |         self.l = len(self.buf)
 73 |     def pack(self, size):
 74 |         return self.buf[0:size]
 75 | 
 76 | def _decompress(inBuf, outBuf):
 77 |     # state label as constants
 78 |     c_top_loop, c_first_literal_run, c_match, c_copy_match, c_match_done, c_match_next = range(6)
 79 | 
 80 |     out = outBuf.buf
 81 |     op = ip = m_pos = 0
 82 |     t = inBuf[ip]
 83 |     state = c_top_loop
 84 | 
 85 |     def copy(inbuffer, outbuffer, iptr, optr, counter, k):
 86 |         for i in range(k):
 87 |             outbuffer[optr+i] = inbuffer[iptr+i]
 88 |         return iptr+k, optr+k, counter-k
 89 | 
 90 |     if t > 17:
 91 |         ip = ip + 1
 92 |         t = t - 17
 93 |         if t < 4:
 94 |             state = c_match_next
 95 |         else:
 96 |             out = outBuf.require(t)
 97 |             ip, op, t = copy(inBuf, out, ip, op, t, t)
 98 |             state = c_first_literal_run
 99 |     while True:
100 |         if_block = False
101 |         # emulate c switch structure by sequences of if statment
102 |         if state == c_top_loop:
103 |             t = inBuf[ip]
104 |             ip = ip + 1
105 |             if t >= 16:
106 |                 state = c_match
107 |                 continue
108 |             if t == 0:
109 |                 while inBuf[ip] == 0:
110 |                     t, ip = t+255, ip+1
111 |                 t = t + 15 + inBuf[ip]
112 |                 ip = ip + 1
113 |             t = t + 3
114 |             out = outBuf.require(t)
115 |             ip, op, t = copy(inBuf, out, ip, op, t, t)
116 |             state = c_first_literal_run
117 |         if state == c_first_literal_run:
118 |             t = inBuf[ip]
119 |             ip = ip + 1
120 |             if t >= 16:
121 |                 state = c_match
122 |                 continue
123 |             m_pos = op - 0x801 - (t >> 2) - (inBuf[ip] << 2)
124 |             ip = ip + 1
125 |             out = outBuf.require(3)
126 |             _, op, _ = copy(out, out, m_pos, op, 0, 3)
127 |             state = c_match_done
128 |             continue
129 |         if state == c_match:
130 |             if t >= 64:
131 |                 m_pos = op - 1 - ((t >> 2) & 7) - (inBuf[ip] << 3)
132 |                 ip = ip + 1
133 |                 t = (t >> 5) - 1
134 |                 state = c_copy_match
135 |                 continue
136 |             elif t >= 32:
137 |                 t = t & 31
138 |                 if t == 0:
139 |                     while inBuf[ip] == 0:
140 |                         t, ip = t+255, ip+1
141 |                     t = t + 31 + inBuf[ip]
142 |                     ip = ip + 1
143 |                 m_pos = op - 1 - ((inBuf[ip] + (inBuf[ip + 1] << 8)) >> 2)
144 |                 ip = ip + 2
145 |             elif t >= 16:
146 |                 m_pos = op - ((t & 8) << 11)
147 |                 t = t & 7
148 |                 if t == 0:
149 |                     while inBuf[ip] == 0:
150 |                         t, ip = t+255, ip+1
151 |                     t = t + 7 + inBuf[ip]
152 |                     ip = ip + 1
153 |                 m_pos = m_pos - ((inBuf[ip] + (inBuf[ip + 1] << 8)) >> 2)
154 |                 ip = ip + 2
155 |                 if m_pos == op:
156 |                     break
157 |                 m_pos = m_pos - 0x4000
158 |             else:
159 |                 m_pos = op - 1 - (t >> 2) - (inBuf[ip] << 2);
160 |                 ip = ip + 1
161 |                 out = outBuf.require(2)
162 |                 _, op, _ = copy(out, out, m_pos, op, 0, 2)
163 |                 state = c_match_done
164 |                 continue
165 |             if t >= 6 and (op - m_pos) >= 4:
166 |                 if_block = True
167 |                 t += 2
168 |                 out = outBuf.require(t)
169 |                 m_pos, op, t = copy(out, out, m_pos, op, t, t)
170 |             state = c_copy_match
171 |         if state == c_copy_match:
172 |             if not if_block:
173 |                 t += 2
174 |                 out = outBuf.require(t)
175 |                 m_pos, op, t = copy(out, out, m_pos, op, t, t)
176 |             state = c_match_done
177 |         if state == c_match_done:
178 |             t = inBuf[ip - 2] & 3
179 |             if t == 0:
180 |                 state = c_top_loop
181 |                 continue
182 |             state = c_match_next
183 |         if state == c_match_next:
184 |             out = outBuf.require(1)
185 |             ip, op, _ = copy(inBuf, out, ip, op, 0, 1)
186 |             if t > 1:
187 |                 out = outBuf.require(1)
188 |                 ip, op, _ = copy(inBuf, out, ip, op, 0, 1)
189 |             if t > 2:
190 |                 out = outBuf.require(1)
191 |                 ip, op, _ = copy(inBuf, out, ip, op, 0, 1)
192 |             t = inBuf[ip]
193 |             ip += 1
194 |             state = c_match
195 |     return bytes(outBuf.pack(op))
196 | 
197 | def lzo_decompress(input, initSize=16000, blockSize=1308672):
198 |     output = FlexBuffer()
199 |     output.alloc(initSize, blockSize)
200 |     return _decompress(bytearray(input), output)
201 | 
202 | #########
203 | # For RIPEMD128
204 | #
205 | def f(j, x, y, z):
206 |     assert(0 <= j < 64)
207 |     return ((x ^ y ^ z)                   if j<16 else
208 |             ((x & y) | (z & ~x))          if j<32 else
209 |             ((x | (0xffffffff & ~y)) ^ z) if j<48 else
210 |             ((x & z) | (y & ~z))
211 |            )
212 | def K(j):
213 |     assert(0 <= j < 64)
214 |     return (0x00000000 if j<16 else
215 |             0x5a827999 if j<32 else
216 |             0x6ed9eba1 if j<48 else
217 |             0x8f1bbcdc
218 |            )
219 | def Kp(j):
220 |     assert(0 <= j < 64)
221 |     return (0x50a28be6 if j<16 else
222 |             0x5c4dd124 if j<32 else
223 |             0x6d703ef3 if j<48 else
224 |             0x00000000
225 |            )
226 | def padandsplit(message):
227 |     """
228 |     returns a two-dimensional array X[i][j] of 32-bit integers, where j ranges
229 |     from 0 to 16.
230 |     First pads the message to length in bytes is congruent to 56 (mod 64), 
231 |     by first adding a byte 0x80, and then padding with 0x00 bytes until the
232 |     message length is congruent to 56 (mod 64). Then adds the little-endian
233 |     64-bit representation of the original length. Finally, splits the result
234 |     up into 64-byte blocks, which are further parsed as 32-bit integers.
235 |     """
236 |     origlen = len(message)
237 |     padlength = 64 - ((origlen - 56) % 64) #minimum padding is 1!
238 |     message += b"\x80"
239 |     message += b"\x00" * (padlength - 1)
240 |     message += pack("<Q", origlen*8)
241 |     assert(len(message) % 64 == 0)
242 |     return [
243 |              [
244 |                unpack("<L", message[i+j:i+j+4])[0]
245 |                for j in range(0, 64, 4)
246 |              ]
247 |              for i in range(0, len(message), 64)
248 |            ]
249 | def add(*args):
250 |     return sum(args) & 0xffffffff
251 | def rol(s,x):
252 |     assert(s < 32)
253 |     return (x << s | x >> (32-s)) & 0xffffffff
254 | r =  [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
255 |        7, 4,13, 1,10, 6,15, 3,12, 0, 9, 5, 2,14,11, 8,
256 |        3,10,14, 4, 9,15, 8, 1, 2, 7, 0, 6,13,11, 5,12,
257 |        1, 9,11,10, 0, 8,12, 4,13, 3, 7,15,14, 5, 6, 2]
258 | rp = [ 5,14, 7, 0, 9, 2,11, 4,13, 6,15, 8, 1,10, 3,12,
259 |        6,11, 3, 7, 0,13, 5,10,14,15, 8,12, 4, 9, 1, 2,
260 |       15, 5, 1, 3, 7,14, 6, 9,11, 8,12, 2,10, 0, 4,13,
261 |        8, 6, 4, 1, 3,11,15, 0, 5,12, 2,13, 9, 7,10,14]
262 | s =  [11,14,15,12, 5, 8, 7, 9,11,13,14,15, 6, 7, 9, 8,
263 |        7, 6, 8,13,11, 9, 7,15, 7,12,15, 9,11, 7,13,12,
264 |       11,13, 6, 7,14, 9,13,15,14, 8,13, 6, 5,12, 7, 5,
265 |       11,12,14,15,14,15, 9, 8, 9,14, 5, 6, 8, 6, 5,12]
266 | sp = [ 8, 9, 9,11,13,15,15, 5, 7, 7, 8,11,14,14,12, 6,
267 |        9,13,15, 7,12, 8, 9,11, 7, 7,12, 7, 6,15,13,11,
268 |        9, 7,15,11, 8, 6, 6,14,12,13, 5,14,13,13, 7, 5,
269 |       15, 5, 8,11,14,14, 6,14, 6, 9,12, 9,12, 5,15, 8]
270 | def ripemd128(message):
271 |     h0 = 0x67452301
272 |     h1 = 0xefcdab89
273 |     h2 = 0x98badcfe
274 |     h3 = 0x10325476
275 |     X = padandsplit(message)
276 |     for i in range(len(X)):
277 |         (A,B,C,D) = (h0,h1,h2,h3)
278 |         (Ap,Bp,Cp,Dp) = (h0,h1,h2,h3)
279 |         for j in range(64):
280 |             T = rol(s[j], add(A, f(j,B,C,D), X[i][r[j]], K(j)))
281 |             (A,D,C,B) = (D,C,B,T)
282 |             T = rol(sp[j], add(Ap, f(63-j,Bp,Cp,Dp), X[i][rp[j]], Kp(j)))
283 |             (Ap,Dp,Cp,Bp) = (Dp,Cp,Bp,T)
284 |         T = add(h1,C,Dp)
285 |         h1 = add(h2,D,Ap)
286 |         h2 = add(h3,A,Bp)
287 |         h3 = add(h0,B,Cp)
288 |         h0 = T
289 |     return pack("<LLLL",h0,h1,h2,h3)
290 | def hexstr(bstr):
291 |     return "".join("{0:02x}".format(b) for b in bstr)
292 | #########
293 | # For Salsa20
294 | #
295 | little_u64 = Struct( "<Q" )      #    little-endian 64-bit unsigned.
296 |                                  #    Unpacks to a tuple of one element!
297 | little16_i32 = Struct( "<16i" )  # 16 little-endian 32-bit signed ints.
298 | little4_i32 = Struct( "<4i" )    #  4 little-endian 32-bit signed ints.
299 | little2_i32 = Struct( "<2i" )    #  2 little-endian 32-bit signed ints.
300 | class Salsa20(object):
301 |     def __init__(self, key=None, IV=None, rounds=20 ):
302 |         self._lastChunk64 = True
303 |         self._IVbitlen = 64             # must be 64 bits
304 |         self.ctx = [ 0 ] * 16
305 |         if key:
306 |             self.setKey(key)
307 |         if IV:
308 |             self.setIV(IV)
309 |         self.setRounds(rounds)
310 |     def setKey(self, key):
311 |         assert type(key) == bytes
312 |         ctx = self.ctx
313 |         if len( key ) == 32:  # recommended
314 |             constants = b"expand 32-byte k"
315 |             ctx[ 1],ctx[ 2],ctx[ 3],ctx[ 4] = little4_i32.unpack(key[0:16])
316 |             ctx[11],ctx[12],ctx[13],ctx[14] = little4_i32.unpack(key[16:32])
317 |         elif len( key ) == 16:
318 |             constants = b"expand 16-byte k"
319 |             ctx[ 1],ctx[ 2],ctx[ 3],ctx[ 4] = little4_i32.unpack(key[0:16])
320 |             ctx[11],ctx[12],ctx[13],ctx[14] = little4_i32.unpack(key[0:16])
321 |         else:
322 |             raise Exception( "key length isn't 32 or 16 bytes." )
323 |         ctx[0],ctx[5],ctx[10],ctx[15] = little4_i32.unpack( constants )
324 |     def setIV(self, IV):
325 |         assert type(IV) == bytes
326 |         assert len(IV)*8 == 64, 'nonce (IV) not 64 bits'
327 |         self.IV = IV
328 |         ctx=self.ctx
329 |         ctx[ 6],ctx[ 7] = little2_i32.unpack( IV )
330 |         ctx[ 8],ctx[ 9] = 0, 0  # Reset the block counter.
331 |     setNonce = setIV            # support an alternate name
332 |     def setCounter( self, counter ):
333 |         assert( type(counter) in integer_types )
334 |         assert( 0 <= counter < 1<<64 ), "counter < 0 or >= 2**64"
335 |         ctx = self.ctx
336 |         ctx[ 8],ctx[ 9] = little2_i32.unpack( little_u64.pack( counter ) )
337 |     def getCounter( self ):
338 |         return little_u64.unpack( little2_i32.pack( *self.ctx[ 8:10 ] ) ) [0]
339 |     def setRounds(self, rounds, testing=False ):
340 |         assert testing or rounds in [8, 12, 20], 'rounds must be 8, 12, 20'
341 |         self.rounds = rounds
342 |     def encryptBytes(self, data):
343 |         assert type(data) == bytes, 'data must be byte string'
344 |         assert self._lastChunk64, 'previous chunk not multiple of 64 bytes'
345 |         lendata = len(data)
346 |         munged = bytearray(lendata)
347 |         for i in range( 0, lendata, 64 ):
348 |             h = salsa20_wordtobyte( self.ctx, self.rounds, checkRounds=False )
349 |             self.setCounter( ( self.getCounter() + 1 ) % 2**64 )
350 |             # Stopping at 2^70 bytes per nonce is user's responsibility.
351 |             for j in range( min( 64, lendata - i ) ):
352 |                 if python3:
353 |                     munged[ i+j ] = data[ i+j ] ^ h[j]
354 |                 else:
355 |                     munged[ i+j ] = ord(data[ i+j ]) ^ ord(h[j])
356 |         self._lastChunk64 = not lendata % 64
357 |         return bytes(munged)
358 |     decryptBytes = encryptBytes # encrypt and decrypt use same function
359 | def salsa20_wordtobyte( input, nRounds=20, checkRounds=True ):
360 |     """ Do nRounds Salsa20 rounds on a copy of 
361 |             input: list or tuple of 16 ints treated as little-endian unsigneds.
362 |         Returns a 64-byte string.
363 |         """
364 |     assert( type(input) in ( list, tuple )  and  len(input) == 16 )
365 |     assert( not(checkRounds) or ( nRounds in [ 8, 12, 20 ] ) )
366 |     x = list( input )
367 |     def XOR( a, b ):  return a ^ b
368 |     ROTATE = rot32
369 |     PLUS   = add32
370 |     for i in range( nRounds // 2 ):
371 |         # These ...XOR...ROTATE...PLUS... lines are from ecrypt-linux.c
372 |         x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 0],x[12]), 7))
373 |         x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[ 4],x[ 0]), 9))
374 |         x[12] = XOR(x[12],ROTATE(PLUS(x[ 8],x[ 4]),13))
375 |         x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[12],x[ 8]),18))
376 |         x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 5],x[ 1]), 7))
377 |         x[13] = XOR(x[13],ROTATE(PLUS(x[ 9],x[ 5]), 9))
378 |         x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[13],x[ 9]),13))
379 |         x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 1],x[13]),18))
380 |         x[14] = XOR(x[14],ROTATE(PLUS(x[10],x[ 6]), 7))
381 |         x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[14],x[10]), 9))
382 |         x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 2],x[14]),13))
383 |         x[10] = XOR(x[10],ROTATE(PLUS(x[ 6],x[ 2]),18))
384 |         x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[15],x[11]), 7))
385 |         x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 3],x[15]), 9))
386 |         x[11] = XOR(x[11],ROTATE(PLUS(x[ 7],x[ 3]),13))
387 |         x[15] = XOR(x[15],ROTATE(PLUS(x[11],x[ 7]),18))
388 | 
389 |         x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[ 0],x[ 3]), 7))
390 |         x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[ 1],x[ 0]), 9))
391 |         x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[ 2],x[ 1]),13))
392 |         x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[ 3],x[ 2]),18))
393 |         x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 5],x[ 4]), 7))
394 |         x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 6],x[ 5]), 9))
395 |         x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 7],x[ 6]),13))
396 |         x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 4],x[ 7]),18))
397 |         x[11] = XOR(x[11],ROTATE(PLUS(x[10],x[ 9]), 7))
398 |         x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[11],x[10]), 9))
399 |         x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 8],x[11]),13))
400 |         x[10] = XOR(x[10],ROTATE(PLUS(x[ 9],x[ 8]),18))
401 |         x[12] = XOR(x[12],ROTATE(PLUS(x[15],x[14]), 7))
402 |         x[13] = XOR(x[13],ROTATE(PLUS(x[12],x[15]), 9))
403 |         x[14] = XOR(x[14],ROTATE(PLUS(x[13],x[12]),13))
404 |         x[15] = XOR(x[15],ROTATE(PLUS(x[14],x[13]),18))
405 |     for i in range( len( input ) ):
406 |         x[i] = PLUS( x[i], input[i] )
407 |     return little16_i32.pack( *x )
408 | def trunc32( w ):
409 |     "extract bottom 32 bits to a 32-bit word"
410 |     w = int( ( w & 0x7fffFFFF ) | -( w & 0x80000000 ) )
411 |     assert type(w) == int
412 |     return w
413 | def add32( a, b ):
414 |     "add two 32-bit word and keep retval a 32-bit word by discarding carry"
415 |     lo = ( a & 0xFFFF ) + ( b & 0xFFFF )
416 |     hi = ( a >> 16 ) + ( b >> 16 ) + ( lo >> 16 )
417 |     return ( -(hi & 0x8000) | ( hi & 0x7FFF ) ) << 16 | ( lo & 0xFFFF )
418 | def rot32( w, nLeft ):
419 |     "left rotate 32-bit word and keep retval a 32-bit word"
420 |     nLeft &= 31  # which makes nLeft >= 0
421 |     if nLeft == 0:
422 |         return w
423 |     # Note: now 1 <= nLeft <= 31.
424 |     #     RRRsLLLLLL   There are nLeft RRR's, (31-nLeft) LLLLLL's,
425 |     # =>  sLLLLLLRRR   and one s which becomes the sign bit.
426 |     RRR = ( ( ( w >> 1 ) & 0x7fffFFFF ) >> ( 31 - nLeft ) )
427 |     sLLLLLL = -( (1<<(31-nLeft)) & w ) | (0x7fffFFFF>>nLeft) & w
428 |     return RRR | ( sLLLLLL << nLeft )
429 | def _unescape_entities(text):
430 |     ' unescape offending tags < > " & '
431 |     text = text.replace(b'&lt;', b'<')
432 |     text = text.replace(b'&gt;', b'>')
433 |     text = text.replace(b'&quot;', b'"')
434 |     text = text.replace(b'&amp;', b'&')
435 |     return text
436 | def _fast_decrypt(data, key):
437 |     b = bytearray(data)
438 |     key = bytearray(key)
439 |     previous = 0x36
440 |     for i in range(len(b)):
441 |         t = (b[i] >> 4 | b[i] << 4) & 0xff
442 |         t = t ^ previous ^ (i & 0xff) ^ key[i % len(key)]
443 |         previous, b[i] = b[i], t
444 |     return bytes(b)
445 | def _mdx_decrypt(comp_block):
446 |     key = ripemd128(comp_block[4:8] + pack(b'<L', 0x3695))
447 |     return comp_block[0:8] + _fast_decrypt(comp_block[8:], key)
448 | def _salsa_decrypt(ciphertext, encrypt_key):
449 |     s20 = Salsa20(key=encrypt_key, IV=b"\x00" * 8, rounds=8)
450 |     return s20.encryptBytes(ciphertext)
451 | def _decrypt_regcode_by_deviceid(reg_code, deviceid):
452 |     deviceid_digest = ripemd128(deviceid)
453 |     s20 = Salsa20(key=deviceid_digest, IV=b"\x00" * 8, rounds=8)
454 |     encrypt_key = s20.encryptBytes(reg_code)
455 |     return encrypt_key
456 | def _decrypt_regcode_by_email(reg_code, email):
457 |     email_digest = ripemd128(email.decode().encode('utf-16-le'))
458 |     s20 = Salsa20(key=email_digest, IV=b"\x00" * 8, rounds=8)
459 |     encrypt_key = s20.encryptBytes(reg_code)
460 |     return encrypt_key
461 | 
462 | #########
463 | # Octopus mdict object classes
464 | #
465 | PLAIN_MAGIC = b'\x00\x00\x00\x00'
466 | LZO_MAGIC = b'\x01\x00\x00\x00'
467 | ZLIB_MAGIC = b'\x02\x00\x00\x00'
468 | 
469 | def decompress(block_type, block_data, decompressed_size=0):
470 |     if block_type == PLAIN_MAGIC: # no compression
471 |         return block_data
472 |     elif block_type == LZO_MAGIC: # LZO compressed
473 |         return lzo_decompress(block_data, decompressed_size)
474 |     elif block_type == ZLIB_MAGIC: # zlib compressed
475 |         return zlib.decompress(block_data)
476 | 
477 | class MDict(object):
478 |     """
479 |     Base class which reads in header and key block.
480 |     It has no public methods and serves only as code sharing base class.
481 |     """
482 |     def __init__(self, fname, encoding='', passcode=None):
483 |         self._fname    = fname
484 |         self._encoding = encoding.upper()
485 |         self._passcode = passcode
486 |         self.header    = self._read_header()
487 |         try:
488 |             self._key_list = self._read_keys()
489 |         except:
490 |             print("Try Brute Force on Encrypted Key Blocks")
491 |             self._key_list = self._read_keys_brutal()
492 |     def __len__(self):
493 |         return self._num_entries
494 |     def __iter__(self):
495 |         return self.keys()
496 |     def keys(self):
497 |         return (key_value for key_id, key_value in self._key_list)
498 |     def items(self):
499 |         raise NotImplementedError
500 |     def _read_number(self, f):
501 |         return unpack(self._number_format, f.read(self._number_width))[0]
502 |     def _parse_header(self, header):
503 |         """
504 |         extract attributes from <Dict attr="value" ... >
505 |         """
506 |         taglist = re.findall(b'(\w+)="(.*?)"', header, re.DOTALL)
507 |         return {key:_unescape_entities(value) for key, value in taglist}
508 |     def get_records(self):
509 |         """
510 |         Return a generator for key and value of each record
511 |         key is from self._key_list, value is decrypted/decompressed record body
512 |         """
513 |         with open(self._fname, 'rb') as f:
514 |             f.seek(self._record_block_offset)
515 |             # metadata from header
516 |             num_record_blocks      = self._read_number(f)
517 |             num_entries            = self._read_number(f)
518 |             record_block_info_size = self._read_number(f)
519 |             record_block_size      = self._read_number(f)
520 |             assert(num_entries == self._num_entries)
521 |             # metadata of each record
522 |             record_block_info_list = []
523 |             size_counter = 0
524 |             for i in range(num_record_blocks):
525 |                 compressed_size        =  self._read_number(f)
526 |                 decompressed_size      =  self._read_number(f)
527 |                 record_block_info_list += [(compressed_size, decompressed_size)]
528 |                 size_counter           += self._number_width * 2
529 |             assert(size_counter == record_block_info_size)
530 |             # scan each record
531 |             offset = i = size_counter = 0
532 |             for compressed_size, decompressed_size in record_block_info_list:
533 |                 current_pos = f.tell()
534 |                 # the whole record: read `compressed_size` bytes for compressed data
535 |                 record_block_compressed = f.read(compressed_size)
536 |                 # first 4 bytes: compression type
537 |                 # next 4 bytes: adler32 checksum of decompressed record block
538 |                 # the rest: record data
539 |                 record_block_type = record_block_compressed[:4]
540 |                 adler32 = unpack('>I', record_block_compressed[4:8])[0]
541 |                 record_block = decompress(record_block_type, record_block_compressed[8:], decompressed_size)
542 |                 compress_type = {PLAIN_MAGIC:0, LZO_MAGIC:1, ZLIB_MAGIC:2}[record_block_type]
543 |                 assert(adler32 == zlib.adler32(record_block) & 0xffffffff) # adler32 is signed
544 |                 assert(len(record_block) == decompressed_size)
545 |                 # split record block according to the offset info from key block
546 |                 while i < len(self._key_list):
547 |                     record_start, key_text = self._key_list[i]
548 |                     # reach the end of current record block
549 |                     if record_start - offset >= decompressed_size:
550 |                         break
551 |                     # record end index
552 |                     if i < len(self._key_list) - 1:
553 |                         record_end = self._key_list[i + 1][0]
554 |                     else:
555 |                         record_end = decompressed_size + offset
556 |                     i += 1
557 |                     yield {
558 |                         'file_pos':          current_pos
559 |                        ,'compressed_size':   compressed_size
560 |                        ,'decompressed_size': decompressed_size
561 |                        ,'record_block_type': compress_type
562 |                        ,'record_start':      record_start
563 |                        ,'key_text':          key_text
564 |                        ,'offset':            offset
565 |                        ,'data':              record_block[record_start - offset:record_end - offset]
566 |                        ,'record_end':        record_end
567 |                     }
568 |                 offset += decompressed_size
569 |                 size_counter += compressed_size
570 |             # verify how much read matches what is specified in header
571 |             assert(size_counter == record_block_size)
572 |     def _decode_key_block_info(self, key_block_info_compressed):
573 |         if self._version >= 2:
574 |             # version>=2 must use zlib compression
575 |             assert(key_block_info_compressed[:4] == ZLIB_MAGIC)
576 |             # decrypt if needed, then decompress
577 |             if self._encrypt & 0x02:
578 |                 key_block_info_compressed = _mdx_decrypt(key_block_info_compressed)
579 |             key_block_info = decompress(ZLIB_MAGIC, key_block_info_compressed[8:])
580 |             # verify adler checksum
581 |             adler32 = unpack('>I', key_block_info_compressed[4:8])[0]
582 |             assert(adler32 == zlib.adler32(key_block_info) & 0xffffffff)
583 |         else:
584 |             # no compression
585 |             key_block_info = key_block_info_compressed
586 |         # decode
587 |         key_block_info_list = []
588 |         num_entries = i = 0
589 |         if self._version >= 2:
590 |             byte_format, byte_width, text_term = '>H', 2, 1
591 |         else:
592 |             byte_format, byte_width, text_term = '>B', 1, 0
593 |         while i < len(key_block_info):
594 |             # number of entries in current key block
595 |             num_entries += unpack(self._number_format, key_block_info[i:i+self._number_width])[0]
596 |             i += self._number_width
597 |             # text head size
598 |             text_head_size = unpack(byte_format, key_block_info[i:i+byte_width])[0]
599 |             i += byte_width
600 |             # text head
601 |             if self._encoding != 'UTF-16':
602 |                 i += text_head_size + text_term
603 |             else:
604 |                 i += (text_head_size + text_term) * 2
605 |             # text tail size
606 |             text_tail_size = unpack(byte_format, key_block_info[i:i+byte_width])[0]
607 |             i += byte_width
608 |             # text tail
609 |             if self._encoding != 'UTF-16':
610 |                 i += text_tail_size + text_term
611 |             else:
612 |                 i += (text_tail_size + text_term) * 2
613 |             # key block compressed size
614 |             key_block_compressed_size = unpack(self._number_format, key_block_info[i:i+self._number_width])[0]
615 |             i += self._number_width
616 |             # key block decompressed size
617 |             key_block_decompressed_size = unpack(self._number_format, key_block_info[i:i+self._number_width])[0]
618 |             i += self._number_width
619 |             key_block_info_list += [(key_block_compressed_size, key_block_decompressed_size)]
620 |         assert(num_entries == self._num_entries)
621 |         return key_block_info_list
622 |     def _decode_key_block(self, key_block_compressed, key_block_info_list):
623 |         key_list = []
624 |         i = 0
625 |         for compressed_size, decompressed_size in key_block_info_list:
626 |             start = i
627 |             i = end = i + compressed_size
628 |             # 4 bytes : compression type
629 |             key_block_type = key_block_compressed[start:start+4]
630 |             # 4 bytes : adler checksum of decompressed key block
631 |             adler32 = unpack('>I', key_block_compressed[start+4:start+8])[0]
632 |             key_block = decompress(key_block_type, key_block_compressed[start+8:end], decompressed_size)
633 |             # extract one single key block into a key list
634 |             key_list += self._split_key_block(key_block)
635 |             # notice that adler32 returns signed value
636 |             assert(adler32 == zlib.adler32(key_block) & 0xffffffff)
637 |         return key_list
638 |     def _split_key_block(self, key_block):
639 |         key_list = []
640 |         key_start_index = 0
641 |         while key_start_index < len(key_block):
642 |             # the corresponding record's offset in record block
643 |             key_id = unpack(self._number_format,
644 |                             key_block[key_start_index:key_start_index+self._number_width]
645 |                            )[0]
646 |             # key text ends with '\x00'
647 |             if self._encoding == 'UTF-16':
648 |                 delimiter, width = b'\x00\x00', 2
649 |             else:
650 |                 delimiter, width = b'\x00', 1
651 |             i = key_start_index + self._number_width
652 |             while i < len(key_block):
653 |                 if key_block[i:i + width] == delimiter:
654 |                     key_end_index = i
655 |                     break
656 |                 i += width
657 |             key_text = key_block[key_start_index + self._number_width:key_end_index]\
658 |                        .decode(self._encoding, errors='ignore').encode('utf-8').strip()
659 |             key_start_index = key_end_index + width
660 |             key_list += [(key_id, key_text)]
661 |         return key_list
662 |     def _read_header(self):
663 |         with open(self._fname, 'rb') as f:
664 |             # number of bytes of header text
665 |             header_bytes_size = unpack('>I', f.read(4))[0]
666 |             header_bytes = f.read(header_bytes_size)
667 |             # 4 bytes: adler32 checksum of header, in little endian
668 |             adler32 = unpack('<I', f.read(4))[0]
669 |             assert(adler32 == zlib.adler32(header_bytes) & 0xffffffff)
670 |             # mark down key block offset
671 |             self._key_block_offset = f.tell()
672 |         # header text in utf-16 encoding ending with '\x00\x00'
673 |         header_text = header_bytes[:-2].decode('utf-16').encode('utf-8')
674 |         header_tag = self._parse_header(header_text)
675 |         if not self._encoding:
676 |             encoding = header_tag[b'Encoding']
677 |             if sys.version_info >= (3,):
678 |                 encoding = encoding.decode('utf-8')
679 |             # GB18030 is superset of  GBK & GB2312
680 |             if encoding in ['GBK', 'GB2312']:
681 |                 encoding = 'GB18030'
682 |             self._encoding = encoding
683 |         # read title and description
684 |         self._title = header_tag[b'Title'].decode('utf-8') if b'Title' in header_tag else ''
685 |         self._description = header_tag[b'Description'].decode('utf-8') if b'Description' in header_tag else ''
686 |         # encryption flag
687 |         #   0x00 - no encryption
688 |         #   0x01 - encrypt record block
689 |         #   0x02 - encrypt key info block
690 |         if b'Encrypted' not in header_tag or header_tag[b'Encrypted'] == b'No':
691 |             self._encrypt = 0
692 |         elif header_tag[b'Encrypted'] == b'Yes':
693 |             self._encrypt = 1
694 |         else:
695 |             self._encrypt = int(header_tag[b'Encrypted'])
696 |         # stylesheet attribute if present takes form of:
697 |         #   style_number # 1-255
698 |         #   style_begin # or ''
699 |         #   style_end # or ''
700 |         # store stylesheet in dict in the form of
701 |         # {'number' : ('style_begin', 'style_end')}
702 |         self._stylesheet = {}
703 |         if header_tag.get('StyleSheet'):
704 |             lines = header_tag['StyleSheet'].splitlines()
705 |             for i in range(0, len(lines), 3):
706 |                 self._stylesheet[lines[i]] = (lines[i + 1], lines[i + 2])
707 |         # before version 2.0, number is 4 bytes integer
708 |         # version 2.0 and above uses 8 bytes
709 |         self._version = float(header_tag[b'GeneratedByEngineVersion'])
710 |         if self._version < 2.0:
711 |             self._number_width, self._number_format = 4, '>I'
712 |         else:
713 |             self._number_width, self._number_format = 8, '>Q'
714 |         return header_tag
715 |     def _read_keys(self):
716 |         with open(self._fname, 'rb') as f:
717 |             f.seek(self._key_block_offset)
718 |             # the following numbers could be encrypted
719 |             num_bytes = (8*5) if self._version >= 2.0 else (4*4)
720 |             block = f.read(num_bytes)
721 |             if self._encrypt & 1:
722 |                 if self._passcode is None:
723 |                     raise RuntimeError('user identification is needed to read encrypted file')
724 |                 regcode, userid = self._passcode
725 |                 if isinstance(userid, unicode):
726 |                     userid = userid.encode('utf8')
727 |                 if self.header[b'RegisterBy'] == b'EMail':
728 |                     encrypted_key = _decrypt_regcode_by_email(regcode, userid)
729 |                 else:
730 |                     encrypted_key = _decrypt_regcode_by_deviceid(regcode, userid)
731 |                 block = _salsa_decrypt(block, encrypted_key)
732 |             # decode this block
733 |             sf = BytesIO(block)
734 |             num_key_blocks = self._read_number(sf)
735 |             self._num_entries = self._read_number(sf)
736 |             # number of bytes of key block info after decompression
737 |             if self._version >= 2.0:
738 |                 _ = self._read_number(sf) # key_block_info_decomp_size, unused here
739 |             # number of bytes of key block info
740 |             key_block_info_size = self._read_number(sf)
741 |             # number of bytes of key block
742 |             key_block_size = self._read_number(sf)
743 |             # 4 bytes: adler checksum of previous 5 numbers
744 |             if self._version >= 2.0:
745 |                 adler32 = unpack('>I', f.read(4))[0]
746 |                 assert adler32 == (zlib.adler32(block) & 0xffffffff)
747 |             # read key block info, which indicates key block's compressed and decompressed size
748 |             key_block_info = f.read(key_block_info_size)
749 |             key_block_info_list = self._decode_key_block_info(key_block_info)
750 |             assert(num_key_blocks == len(key_block_info_list))
751 |             # read and decompress key block
752 |             key_block_compressed = f.read(key_block_size)
753 |             key_list = self._decode_key_block(key_block_compressed, key_block_info_list)
754 |             self._record_block_offset = f.tell()
755 |         return key_list
756 |     def _read_keys_brutal(self):
757 |         with open(self._fname, 'rb') as f:
758 |             f.seek(self._key_block_offset)
759 |             # the following numbers could be encrypted, disregard them!
760 |             if self._version >= 2.0:
761 |                 num_bytes, key_block_type = (8*5+4), ZLIB_MAGIC
762 |             else:
763 |                 num_bytes, key_block_type = (4*4), LZO_MAGIC
764 |             block = f.read(num_bytes)
765 |             # key block info:
766 |             # - 4 bytes '\x02\x00\x00\x00'
767 |             # - 4 bytes adler32 checksum
768 |             # - a number of bytes
769 |             # - 4 bytes '\x02\x00\x00\x00' marks the beginning of key block
770 |             key_block_info = f.read(8)
771 |             if self._version >= 2.0:
772 |                 assert key_block_info[:4] == ZLIB_MAGIC
773 |             while True:
774 |                 fpos = f.tell()
775 |                 t = f.read(1024)
776 |                 index = t.find(key_block_type)
777 |                 if index != -1:
778 |                     key_block_info += t[:index]
779 |                     f.seek(fpos + index)
780 |                     break
781 |                 else:
782 |                     key_block_info += t
783 |             key_block_info_list = self._decode_key_block_info(key_block_info)
784 |             key_block_size = sum(list(zip(*key_block_info_list))[0])
785 |             # read and decompress key block
786 |             key_block_compressed = f.read(key_block_size)
787 |             key_list = self._decode_key_block(key_block_compressed, key_block_info_list)
788 |             self._record_block_offset = f.tell()
789 |         self._num_entries = len(key_list)
790 |         return key_list
791 |     def get_index(self):
792 |         index_dict_list = [] # list of dict, each one is index to one record
793 |         for record_dict in self.get_records():
794 |             del record_dict['data']
795 |             index_dict_list.append(index_dict)
796 |         return index_dict_list
797 | 
798 | class MDD(MDict):
799 |     """
800 |     MDict resource file format (*.MDD) reader.
801 |     >>> mdd = MDD('example.mdd')
802 |     >>> len(mdd)
803 |     208
804 |     >>> for filename,content in mdd.items():
805 |     ... print filename, content[:10]
806 |     """
807 |     def __init__(self, fname, passcode=None):
808 |         MDict.__init__(self, fname, encoding='UTF-16', passcode=passcode)
809 |     def items(self):
810 |         """
811 |         Return a generator which in turn produce tuples of (filename, blob),
812 |         both in bytestring
813 |         """
814 |         for record_dict in self.get_records():
815 |             filename = record_dict['key_text'].decode('utf-8')
816 |             blob = record_dict['data']
817 |             yield filename, blob
818 | 
819 | class MDX(MDict):
820 |     """
821 |     MDict dictionary file format (*.MDD) reader.
822 |     >>> mdx = MDX('example.mdx')
823 |     >>> len(mdx)
824 |     42481
825 |     >>> for key,value in mdx.items():
826 |     ... print key, value[:10]
827 |     """
828 |     def __init__(self, fname, encoding='', substyle=False, passcode=None):
829 |         MDict.__init__(self, fname, encoding, passcode)
830 |         self._substyle = substyle
831 |     def _substitute_stylesheet(self, txt):
832 |         'Replace style with loaded stylesheet'
833 |         txt_list = re.split('`\d+`', txt)
834 |         txt_tag = re.findall('`\d+`', txt)
835 |         txt_styled = txt_list[0]
836 |         for j, p in enumerate(txt_list[1:]):
837 |             style = self._stylesheet[txt_tag[j][1:-1]]
838 |             if p and p[-1] == b'\n':
839 |                 txt_styled = txt_styled + style[0] + p.rstrip() + style[1] + b'\r\n'
840 |             else:
841 |                 txt_styled = txt_styled + style[0] + p + style[1]
842 |         return txt_styled
843 |     def items(self):
844 |         """
845 |         Return a generator which in turn produce tuples in the form of (title, text),
846 |         both in unicode string
847 |         """
848 |         for record_dict in self.get_records():
849 |             title = record_dict['key_text']
850 |             text  = record_dict['data'] \
851 |                     .decode(self._encoding, errors='ignore') \
852 |                     .strip(u'\x00') \
853 |                     .encode('utf-8')
854 |             # substitute stylesheet if required
855 |             if self._substyle and self._stylesheet:
856 |                 text = self._substitute_stylesheet(text)
857 |             yield title, text
858 |     def get_index(self):
859 |         index_dict_list = super(MDX,self).get_index()
860 |         return {
861 |             "index_dict_list": index_dict_list
862 |            ,"meta": {
863 |                 'encoding':    self._encoding
864 |                ,'stylesheet':  json.dumps(self._stylesheet)
865 |                ,'title':       self._title
866 |                ,'description': self._description
867 |             }
868 |         }
869 | 


--------------------------------------------------------------------------------
/tool.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # -*- coding: utf-8 -*-
  3 | 
  4 | # MDict *.mdd and *.mdx data extractor
  5 | #
  6 | # This program is a free software; you can redistribute it and/or modify
  7 | # it under the terms of the GNU General Public License as published by
  8 | # the Free Software Foundation, version 3 of the License.
  9 | #
 10 | # You can get a copy of GNU General Public License along this program
 11 | # But you can always get it from http://www.gnu.org/licenses/gpl.txt
 12 | #
 13 | # This program is distributed in the hope that it will be useful,
 14 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
 15 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 16 | # GNU General Public License for more details.
 17 | 
 18 | from __future__ import print_function
 19 | 
 20 | import os
 21 | import sys
 22 | import contextlib
 23 | import argparse
 24 | 
 25 | #from readmdict import MDD, MDX
 26 | from mdict import MDD, MDX
 27 | 
 28 | def csvquote(s):
 29 |     'Quote a CSV field, as in RFC4180'
 30 |     if not any(c in s for c in '\n",'):
 31 |         return s
 32 |     elif '"' in s:
 33 |         return '"' + s.replace('"','""') + '"'
 34 |     else:
 35 |         return '"' + s + '"'
 36 | 
 37 | @contextlib.contextmanager
 38 | def csvopen(filename=None):
 39 |     'Combined interface for file and stdout'
 40 |     if filename and filename != '-':
 41 |         fh = open(filename, 'wb')
 42 |     else:
 43 |         fh = sys.stdout
 44 |     try:
 45 |         yield fh
 46 |     finally:
 47 |         if fh is not sys.stdout:
 48 |             fh.close()
 49 | 
 50 | def main():
 51 |     # command line argument
 52 |     parser = argparse.ArgumentParser(description="mdict tool")
 53 |     parser.add_argument('mdict_file',
 54 |                         help="Input *.mdx or *.mdd file")
 55 |     parser.add_argument('-l', '--list', default=False, action='store_true',
 56 |                         help='List entry names in MDX or file names in MDD')
 57 |     parser.add_argument('-a', '--dump', default=False, action='store_true',
 58 |                         help='Dump all files in *.mdd into files in output dir or ' \
 59 |                              'all entries in *.mdx into a CSV')
 60 |     parser.add_argument('-x', '--extract',
 61 |                         help='Extract one file or entry content, print to stdout if -o not specified. ' \
 62 |                              'Argument should be specified in UTF-8')
 63 |     parser.add_argument('-d', '--dir',
 64 |                         help='Output directory for -a or -o')
 65 |     parser.add_argument('-o', '--output',
 66 |                         help='Output filename for -x')
 67 |     parser.add_argument('-e', '--transcode',
 68 |                         help='Transcode data, specified in format of INPUT_ENC:OUTPUT_ENC')
 69 |     args = parser.parse_args()
 70 | 
 71 |     # open file
 72 |     is_mdd = args.mdict_file.lower().endswith('.mdd')
 73 |     obj = MDD(args.mdict_file) if is_mdd else MDX(args.mdict_file)
 74 |     if args.transcode:
 75 |         in_enc, out_enc = args.transcode.split(':')
 76 |         assert((in_enc and out_enc) or (not in_enc and not out_enc))
 77 |     else:
 78 |         in_enc, out_enc = None, None
 79 | 
 80 |     # operation depends on input
 81 |     if args.list:
 82 |         # print all key (entries or filenames)
 83 |         for key, _ in obj.items():
 84 |             if in_enc and out_enc:
 85 |                 print(key.decode(in_enc).encode(out_enc))
 86 |             else:
 87 |                 print(key)
 88 |     elif args.dump and is_mdd:
 89 |         # dump all resources in *.mdd into files
 90 |         for filename, blob in obj.items():
 91 |             if in_enc:         # transcode filename if needed
 92 |                 filename = filename.decode(in_enc)
 93 |             filename = key     # use entry name as filename
 94 |             if args.dir:
 95 |                 filename = os.path.join(args.dir, filename)
 96 |             open(filename, 'wb').write(value)
 97 |     elif args.dump and not is_mdd:
 98 |         # dump all resources in *.mdx into a CSV file
 99 |         newline = '\r\n'
100 |         if args.output:
101 |             filename = args.output
102 |             if args.dir:
103 |                 filename = os.path.join(args.dir, filename)
104 | 	else:
105 |             filename = '-'
106 |         with csvopen(filename) as fh:
107 |             for key, val in obj.items():
108 |                 if in_enc:         # transcode key & val if needed
109 |                     key = key.decode(in_enc)
110 |                     val = val.decode(in_enc).encode(out_enc)
111 |                 fh.write(csvquote(key))
112 |                 fh.write(',')
113 |                 fh.write(csvquote(val))
114 |                 fh.write(newline)
115 |     elif args.extract:
116 |         # find entry/filename, write definition/blob
117 |         target = args.extract.decode('utf-8')
118 |         for key, value in obj.items():
119 |             if in_enc:             # transcode key if needed
120 |                 key = key.decode(in_enc)
121 |             if key != target:      # seek until we find the target
122 |                 continue
123 |             if in_enc and out_enc: # transcode value if needed
124 |                 value = value.decode(in_enc).encode(out_enc)
125 |             filename = args.output
126 |             if args.dir and is_mdd:# data from *.mdd has its default filename
127 |                 filename = key
128 |             if not filename:       # print to console or save to file
129 |                 print(value)
130 |             else:
131 |                 filename = args.output
132 |                 if args.dir:
133 |                     filename = os.path.join(args.dir, filename)
134 |                 open(filename, 'wb').write(value)
135 | 
136 | if __name__ == '__main__':
137 |     main()
138 | 


--------------------------------------------------------------------------------