├── .gitattributes
├── .gitignore
├── README.md
├── cache
│   └── o3.js
├── lzo.py
├── mdict-query.pyproj
├── mdict-query.sln
├── mdict_dir.py
├── mdict_query.py
├── mdx
│   └── drop mdict files here.txt
├── pureSalsa20.py
├── readmdict.py
├── ripemd128.py
├── static
│   └── cache here.txt
├── templates
│   ├── all.html
│   ├── dict.html
│   └── entry.html
├── test.py
├── test_lzo.py
├── web.py
├── web.spec
└── wsgi.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 |
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | #################
2 | ## Eclipse
3 | #################
4 |
5 | *.pydevproject
6 | .project
7 | .metadata
8 | bin/
9 | tmp/
10 | *.tmp
11 | *.bak
12 | *.swp
13 | *~.nib
14 | local.properties
15 | .classpath
16 | .settings/
17 | .loadpath
18 | *.mdd
19 | *.mdx
20 | *.db
21 | *.jpg
22 | *.png
23 | *.gif
24 | *.mp3
25 | *.css
26 | mdx/*
27 | static/*
28 |
29 | # External tool builders
30 | .externalToolBuilders/
31 |
32 | # Locally stored "Eclipse launch configurations"
33 | *.launch
34 |
35 | # CDT-specific
36 | .cproject
37 |
38 | # PDT-specific
39 | .buildpath
40 |
41 |
42 | #################
43 | ## Visual Studio
44 | #################
45 |
46 | ## Ignore Visual Studio temporary files, build results, and
47 | ## files generated by popular Visual Studio add-ons.
48 |
49 | # User-specific files
50 | *.suo
51 | *.user
52 | *.sln.docstates
53 |
54 | # Build results
55 |
56 | [Dd]ebug/
57 | [Rr]elease/
58 | x64/
59 | build/
60 | [Bb]in/
61 | [Oo]bj/
62 |
63 | # MSTest test Results
64 | [Tt]est[Rr]esult*/
65 | [Bb]uild[Ll]og.*
66 |
67 | *_i.c
68 | *_p.c
69 | *.ilk
70 | *.meta
71 | *.obj
72 | *.pch
73 | *.pdb
74 | *.pgc
75 | *.pgd
76 | *.rsp
77 | *.sbr
78 | *.tlb
79 | *.tli
80 | *.tlh
81 | *.tmp
82 | *.tmp_proj
83 | *.log
84 | *.vspscc
85 | *.vssscc
86 | .builds
87 | *.pidb
88 | *.log
89 | *.scc
90 |
91 | # Visual C++ cache files
92 | ipch/
93 | *.aps
94 | *.ncb
95 | *.opensdf
96 | *.sdf
97 | *.cachefile
98 |
99 | # Visual Studio profiler
100 | *.psess
101 | *.vsp
102 | *.vspx
103 |
104 | # Guidance Automation Toolkit
105 | *.gpState
106 |
107 | # ReSharper is a .NET coding add-in
108 | _ReSharper*/
109 | *.[Rr]e[Ss]harper
110 |
111 | # TeamCity is a build add-in
112 | _TeamCity*
113 |
114 | # DotCover is a Code Coverage Tool
115 | *.dotCover
116 |
117 | # NCrunch
118 | *.ncrunch*
119 | .*crunch*.local.xml
120 |
121 | # Installshield output folder
122 | [Ee]xpress/
123 |
124 | # DocProject is a documentation generator add-in
125 | DocProject/buildhelp/
126 | DocProject/Help/*.HxT
127 | DocProject/Help/*.HxC
128 | DocProject/Help/*.hhc
129 | DocProject/Help/*.hhk
130 | DocProject/Help/*.hhp
131 | DocProject/Help/Html2
132 | DocProject/Help/html
133 |
134 | # Click-Once directory
135 | publish/
136 |
137 | # Publish Web Output
138 | *.Publish.xml
139 | *.pubxml
140 | *.publishproj
141 |
142 | # NuGet Packages Directory
143 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line
144 | #packages/
145 |
146 | # Windows Azure Build Output
147 | csx
148 | *.build.csdef
149 |
150 | # Windows Store app package directory
151 | AppPackages/
152 |
153 | # Others
154 | sql/
155 | *.Cache
156 | ClientBin/
157 | [Ss]tyle[Cc]op.*
158 | ~$*
159 | *~
160 | *.dbmdl
161 | *.[Pp]ublish.xml
162 | *.pfx
163 | *.publishsettings
164 |
165 | # RIA/Silverlight projects
166 | Generated_Code/
167 |
168 | # Backup & report files from converting an old project file to a newer
169 | # Visual Studio version. Backup files are not needed, because we have git ;-)
170 | _UpgradeReport_Files/
171 | Backup*/
172 | UpgradeLog*.XML
173 | UpgradeLog*.htm
174 |
175 | # SQL Server files
176 | App_Data/*.mdf
177 | App_Data/*.ldf
178 |
179 | #############
180 | ## Windows detritus
181 | #############
182 |
183 | # Windows image file caches
184 | Thumbs.db
185 | ehthumbs.db
186 |
187 | # Folder config file
188 | Desktop.ini
189 |
190 | # Recycle Bin used on file shares
191 | $RECYCLE.BIN/
192 |
193 | # Mac crap
194 | .DS_Store
195 |
196 |
197 | #############
198 | ## Python
199 | #############
200 |
201 | *.py[cod]
202 |
203 | # Packages
204 | *.egg
205 | *.egg-info
206 | dist/
207 | build/
208 | eggs/
209 | parts/
210 | var/
211 | sdist/
212 | develop-eggs/
213 | .installed.cfg
214 |
215 | # Installer logs
216 | pip-log.txt
217 |
218 | # Unit test / coverage reports
219 | .coverage
220 | .tox
221 |
222 | #Translations
223 | *.mo
224 |
225 | #Mr Developer
226 | .mr.developer.cfg
227 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This is a Python module for looking up entries in `mdict` dictionary files (`.mdx` and `.mdd`). A function for converting `mdx` to `sqlite` is also included.
2 |
3 | >>Based on [readmdict](https://bitbucket.org/xwang/mdict-analysis) by [Xiaoqiang Wang](http://bitbucket.org/xwang/).
4 |
5 | While this project is a trivial extension of the [original module](https://bitbucket.org/xwang/mdict-analysis), it adds the ability to look up a single entry in an `.mdx` file, or a single resource in an `.mdd` file, without extracting all content, which may be helpful in other projects that require dictionaries.
6 |
7 | ## Usage
8 |
9 | Construct an `IndexBuilder` object, which builds the sqlite index for the `.mdx` file and for the corresponding `.mdd` file (if it exists).
10 |
11 | from mdict_query import IndexBuilder
12 | builder = IndexBuilder('ode.mdx')
13 |
14 | Convert `mdx` to `sqlite`:
15 | ```
16 | builder.make_sqlite()
17 | # Check the output file `ode.mdx.sqlite.db` near your `ode.mdx`
18 | ```
19 |
20 |
21 | Get all mdx keys:
22 |
23 | builder.get_mdx_keys()
24 | # ==> ['key1', 'key2', 'key3', ...]
25 |
26 | Filter mdx keys by wildcard:
27 |
28 | builder.get_mdx_keys('dedicat*')
29 | # ==> ['dedicate', 'dedication', ...]
30 |
31 | Look up an mdx entry by key:
32 |
33 | result_text = builder.mdx_lookup('dedication')
34 |
35 | There is an option to ignore case:
36 |
37 | result_text = builder.mdx_lookup('Dedication', ignorecase = True)
38 |
39 | Get all mdd keys:
40 |
41 | builder.get_mdd_keys()
42 | # ==> ['key1', 'key2', 'key3', ...]
43 |
44 | Filter mdd keys by wildcard:
45 |
46 | builder.get_mdd_keys('*.css')
47 | # ==> ['/style.css', ...]
48 |
49 | Look up an mdd resource by key:
50 |
51 | bytes_list = builder.mdd_lookup('/style.css')
52 | # bytes_list is a list of bytes objects, one per record stored in the mdd under this key
53 |
54 |
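55 | ## Example
56 | 
57 | Putting it all together (a minimal sketch; it assumes an `ode.mdx`, and optionally an `ode.mdd`, in the working directory):
58 | 
59 | ```
60 | from mdict_query import IndexBuilder
61 | 
62 | builder = IndexBuilder('ode.mdx')
63 | 
64 | # print every entry whose headword starts with "dedicat"
65 | for key in builder.get_mdx_keys('dedicat*'):
66 |     for definition in builder.mdx_lookup(key):
67 |         print(key, definition[:60])
68 | 
69 | # extract a stylesheet stored in the .mdd (if present)
70 | if '/style.css' in builder.get_mdd_keys('*.css'):
71 |     with open('style.css', 'wb') as out:
72 |         out.write(builder.mdd_lookup('/style.css')[0])
73 | ```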
--------------------------------------------------------------------------------
/cache/o3.js:
--------------------------------------------------------------------------------
1 | var o0e=(function(){return{e:function(c,d){var n=d==2?c.nextSibling:c.parentNode.nextSibling;if(!d)n=n.childNodes[0];var s=n.style;if(s.display!="block")s.display="block";else s.display="none";},a:function(c,d,f){c.removeAttribute("onclick");var s=c.style;s.cursor="default";s.outline="1px dotted gray";var m=/([^//]+)$/.exec(f);
2 | if(m){var u="http://audio.oxforddictionaries.com/en/mp3/"+m[0].replace('__','_')+".mp3";var b=function(){s.outline="";s.cursor="pointer";c.setAttribute("onclick","o0e.a(this,"+d+",'"+f+"')");};var t=setTimeout(b,2000);try{with(document.createElement("audio")){setAttribute("src",u);onloadstart=function(){clearTimeout(t);};onended=b;play();}}catch(e){c.style.outline="";}}},x:function(c){var s=c.parentNode.nextSibling.style;if(s.display!="none"){s.display="none";c.className="yuq";}else{s.display="block";c.className="aej";}},p:function(c){if(c.className=="j02")c.className="g4p";else c.className="j02";}}}());
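3 | // Summary of the minified helper above: o0e.e toggles the display of a
4 | // neighbouring block; o0e.a disables the clicked element, builds an mp3 URL
5 | // under audio.oxforddictionaries.com from the last path segment of its
6 | // argument, plays it, and restores the element when playback ends or after a
7 | // 2-second timeout; o0e.x and o0e.p toggle CSS class names.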
--------------------------------------------------------------------------------
/lzo.py:
--------------------------------------------------------------------------------
1 | # Pure-Python LZO1X decompression (decompress only); readmdict.py falls back
2 | # to this module for mdict files created by engine versions < 2.0.
3 | import math
4 | class FlexBuffer():
5 |
6 | def __init__(self):
7 |
8 | self.blockSize = None
9 | self.c = None
10 | self.l = None
11 | self.buf = None
12 |
13 | def require(self, n):
14 |
15 | r = self.c - self.l + n
16 | if r > 0:
17 | self.l = self.l + self.blockSize * math.ceil(r / self.blockSize)
18 | #tmp = bytearray(self.l)
19 | #for i in len(self.buf):
20 | # tmp[i] = self.buf[i]
21 | #self.buf = tmp
22 | self.buf = self.buf + bytearray(self.l - len(self.buf))
23 | self.c = self.c + n
24 | return self.buf
25 |
26 | def alloc(self, initSize, blockSize):
27 |
28 | if blockSize:
29 | sz = blockSize
30 | else:
31 | sz = 4096
32 | self.blockSize = self.roundUp(sz)
33 | self.c = 0
34 |         self.l = self.roundUp(initSize)
35 | self.l += self.blockSize - (self.l % self.blockSize)
36 | self.buf = bytearray(self.l)
37 | return self.buf
38 |
39 | def roundUp(self, n):
40 |
41 | r = n % 4
42 | if r == 0:
43 | return n
44 | else:
45 | return n + 4 - r
46 |
47 | def reset(self):
48 |
49 | self.c = 0
50 | self.l = len(self.buf)
51 |
52 | def pack(self, size):
53 |
54 | return self.buf[0:size]
55 |
56 | def _decompress(inBuf, outBuf):
57 |
58 | c_top_loop = 1
59 | c_first_literal_run = 2
60 | c_match = 3
61 | c_copy_match = 4
62 | c_match_done = 5
63 | c_match_next = 6
64 |
65 | out = outBuf.buf
66 | op = 0
67 | ip = 0
68 | t = inBuf[ip]
69 | state = c_top_loop
70 | m_pos = 0
71 | ip_end = len(inBuf)
72 |
73 | if t > 17:
74 | ip = ip + 1
75 | t = t - 17
76 | if t < 4:
77 | state = c_match_next
78 | else:
79 | out = outBuf.require(t)
80 | while True:
81 | out[op] = inBuf[ip]
82 | op = op + 1
83 | ip = ip + 1
84 | t = t - 1
85 | if not t > 0: break
86 | state = c_first_literal_run
87 |
88 | while True:
89 | if_block = False
90 |
91 | ##
92 | if state == c_top_loop:
93 | t = inBuf[ip]
94 | ip = ip + 1
95 | if t >= 16:
96 | state = c_match
97 | continue
98 | if t == 0:
99 | while inBuf[ip] == 0:
100 | t = t + 255
101 | ip = ip + 1
102 | t = t + 15 + inBuf[ip]
103 | ip = ip + 1
104 |
105 | t = t + 3
106 | out = outBuf.require(t)
107 | while True:
108 | out[op] = inBuf[ip]
109 | op = op + 1
110 | ip = ip + 1
111 | t = t - 1
112 | if not t > 0: break
113 | # emulate c switch
114 | state = c_first_literal_run
115 |
116 | ##
117 | if state == c_first_literal_run:
118 | t = inBuf[ip]
119 | ip = ip + 1
120 | if t >= 16:
121 | state = c_match
122 | continue
123 | m_pos = op - 0x801 - (t >> 2) - (inBuf[ip] << 2)
124 | ip = ip + 1
125 | out = outBuf.require(3)
126 | out[op] = out[m_pos]
127 | op = op + 1
128 | m_pos = m_pos + 1
129 | out[op] = out[m_pos]
130 | op = op + 1
131 | m_pos = m_pos + 1
132 | out[op] = out[m_pos]
133 | op = op + 1
134 |
135 | state = c_match_done
136 | continue
137 |
138 | ##
139 | if state == c_match:
140 | if t >= 64:
141 | m_pos = op - 1 - ((t >> 2) & 7) - (inBuf[ip] << 3)
142 | ip = ip + 1
143 | t = (t >> 5) - 1
144 | state = c_copy_match
145 | continue
146 | elif t >= 32:
147 | t = t & 31
148 | if t == 0:
149 | while inBuf[ip] == 0:
150 | t = t + 255
151 | ip = ip + 1
152 | t = t + 31 + inBuf[ip]
153 | ip = ip + 1
154 | m_pos = op - 1 - ((inBuf[ip] + (inBuf[ip + 1] << 8)) >> 2)
155 | ip = ip + 2
156 | elif t >= 16:
157 | m_pos = op - ((t & 8) << 11)
158 | t = t & 7
159 | if t == 0:
160 | while inBuf[ip] == 0:
161 | t = t + 255
162 | ip = ip + 1
163 | t = t + 7 + inBuf[ip]
164 | ip = ip + 1
165 | m_pos = m_pos - ((inBuf[ip] + (inBuf[ip + 1] << 8)) >> 2)
166 | ip = ip + 2
167 | if m_pos == op:
168 | break
169 | m_pos = m_pos - 0x4000
170 | else:
171 |                 m_pos = op - 1 - (t >> 2) - (inBuf[ip] << 2)
172 | ip = ip + 1
173 | out = outBuf.require(2)
174 | out[op] = out[m_pos]
175 | op = op + 1
176 | m_pos = m_pos + 1
177 | out[op] = out[m_pos]
178 | op = op + 1
179 | state = c_match_done
180 | continue
181 |
182 | if t >= 6 and (op - m_pos) >= 4:
183 | if_block = True
184 | t += 2
185 | out = outBuf.require(t)
186 | while True:
187 | out[op] = out[m_pos]
188 | op += 1
189 | m_pos += 1
190 | t -= 1
191 | if not t > 0: break
192 | #emulate c switch
193 | state = c_copy_match
194 |
195 | ##
196 | if state == c_copy_match:
197 | if not if_block:
198 | t += 2
199 | out = outBuf.require(t)
200 | while True:
201 | out[op] = out[m_pos]
202 | op += 1
203 | m_pos += 1
204 | t -= 1
205 | if not t > 0: break
206 | #emulating c switch
207 | state = c_match_done
208 |
209 | ##
210 | if state == c_match_done:
211 | t = inBuf[ip - 2] & 3
212 | if t == 0:
213 | state = c_top_loop
214 | continue
215 | #emulate c switch
216 | state = c_match_next
217 |
218 | ##
219 | if state == c_match_next:
220 | out = outBuf.require(1)
221 | out[op] = inBuf[ip]
222 | op += 1
223 | ip += 1
224 | if t > 1:
225 | out = outBuf.require(1)
226 | out[op] = inBuf[ip]
227 | op += 1
228 | ip += 1
229 | if t > 2:
230 | out = outBuf.require(1)
231 | out[op] = inBuf[ip]
232 | op += 1
233 | ip += 1
234 | t = inBuf[ip]
235 | ip += 1
236 | state = c_match
237 | continue
238 |
239 | return bytes(outBuf.pack(op))
240 |
241 | def decompress(input, initSize=16000, blockSize=8192):
242 |     """Decompress one LZO1X block. initSize seeds the output buffer, which
243 |     then grows in blockSize increments as needed (readmdict.py passes the
244 |     known decompressed size, so the buffer is usually allocated once)."""
245 |     output = FlexBuffer()
246 |     output.alloc(initSize, blockSize)
247 |     return _decompress(bytearray(input), output)
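248 | # Usage sketch (mirrors the call sites in readmdict.py); `payload` is assumed
249 | # to be the compressed data that follows an mdict block's 8-byte
250 | # type/checksum header:
251 | #
252 | #     import lzo
253 | #     plain = lzo.decompress(payload, initSize=decompressed_size, blockSize=1308672)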
--------------------------------------------------------------------------------
/mdict-query.pyproj:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" ToolsVersion="4.0">
 3 |   <PropertyGroup>
 4 |     <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
 5 |     <SchemaVersion>2.0</SchemaVersion>
 6 |     <ProjectGuid>{f227ad7e-74e4-4364-ad90-d2b3dda5abf6}</ProjectGuid>
 7 |     <ProjectHome />
 8 |     <StartupFile>test_lzo.py</StartupFile>
 9 |     <SearchPath />
10 |     <WorkingDirectory>.</WorkingDirectory>
11 |     <OutputPath>.</OutputPath>
12 |     <ProjectTypeGuids>{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
13 |     <LaunchProvider>Standard Python launcher</LaunchProvider>
14 |     <InterpreterId>{9a7a9026-48c1-4688-9d5d-e5699d47d074}</InterpreterId>
15 |     <InterpreterVersion>3.5</InterpreterVersion>
16 |     <IsWindowsApplication>False</IsWindowsApplication>
17 |   </PropertyGroup>
18 |   <PropertyGroup Condition=" '$(Configuration)' == 'Debug' " />
19 |   <PropertyGroup Condition=" '$(Configuration)' == 'Release' " />
20 |   <PropertyGroup>
21 |     <VisualStudioVersion Condition=" '$(VisualStudioVersion)' == '' ">10.0</VisualStudioVersion>
22 |     <PtvsTargetsFile>$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets</PtvsTargetsFile>
23 |   </PropertyGroup>
24 |   <ItemGroup>
25 |     <Compile Include="lzo.py" />
26 |     <Compile Include="mdict_query.py" />
27 |     <Compile Include="pureSalsa20.py" />
28 |     <Compile Include="readmdict.py" />
29 |     <Compile Include="ripemd128.py" />
30 |     <Compile Include="test_lzo.py">
31 |       <SubType>Code</SubType>
32 |     </Compile>
33 |     <Compile Include="test.py" />
34 |   </ItemGroup>
35 |   <Import Project="$(PtvsTargetsFile)" Condition="Exists($(PtvsTargetsFile))" />
36 |   <Import Project="$(MSBuildToolsPath)\Microsoft.Common.targets" Condition="!Exists($(PtvsTargetsFile))" />
37 | </Project>
--------------------------------------------------------------------------------
/mdict-query.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 14
4 | VisualStudioVersion = 14.0.25420.1
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "mdict-query", "mdict-query.pyproj", "{F227AD7E-74E4-4364-AD90-D2B3DDA5ABF6}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|Any CPU = Debug|Any CPU
11 | Release|Any CPU = Release|Any CPU
12 | EndGlobalSection
13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | {F227AD7E-74E4-4364-AD90-D2B3DDA5ABF6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15 | {F227AD7E-74E4-4364-AD90-D2B3DDA5ABF6}.Release|Any CPU.ActiveCfg = Release|Any CPU
16 | EndGlobalSection
17 | GlobalSection(SolutionProperties) = preSolution
18 | HideSolutionNode = FALSE
19 | EndGlobalSection
20 | EndGlobal
21 |
--------------------------------------------------------------------------------
/mdict_dir.py:
--------------------------------------------------------------------------------
1 | from mdict_query import IndexBuilder
2 | import os
3 | import json
4 |
5 |
6 | class Dir(object):
7 |
8 | def __init__(self, mdict_dir, config_name = 'config.json'):
9 |
10 | assert(os.path.isdir(mdict_dir))
11 | self._mdict_dir = mdict_dir
12 | self._config_file_base_name = config_name
13 | self._config = {}
14 | #check config.json
15 | self._config_file = os.path.join(mdict_dir, self._config_file_base_name)
16 |
17 | if os.path.exists(self._config_file):
18 | self._ensure_config_consistency()
19 | self._load_config()
20 | self._add_builder()
22 | else:
23 | self._build_index()
24 | self._make_config()
25 | self._dump_config()
26 | self._add_builder()
28 |
29 | def _add_builder(self):
30 |
31 |         for dic in self._config['dicts']:
32 |             dic['builder'] = IndexBuilder(dic['mdx_name'])
33 |
34 |
35 | def _load_config(self):
36 |
37 |         with open(self._config_file, 'r', encoding='utf-8') as file_opened:
38 |             self._config = json.load(file_opened)
40 |
41 |
42 | def _build_index(self):
43 |
44 | dict_list = []
45 | files_in_dir = os.listdir(self._mdict_dir)
46 | for item in files_in_dir:
47 | full_name = os.path.join(self._mdict_dir, item)
48 | print(full_name)
49 | if os.path.isfile(full_name):
50 | _filename, _file_extension = os.path.splitext(full_name)
51 | if _file_extension == '.mdx':
52 | _config_single_dic = {
53 | 'title': '',
54 | 'description':'',
55 | 'mdx_name': full_name,
56 | 'has_mdd': os.path.isfile(_filename + '.mdd')
57 | }
58 | try:
59 | ib = IndexBuilder(full_name)
60 | except Exception:
61 | continue
62 | _config_single_dic['title'] = ib._title
63 | _config_single_dic['description'] = ib._description
64 | dict_list.append(_config_single_dic)
65 | self._config['dicts'] = dict_list
66 |
67 | def _make_config(self):
68 | pass
69 |
70 | def _dump_config(self):
71 |
72 |         with open(self._config_file, 'w', encoding='utf-8') as file_opened:
73 |             json.dump(self._config, file_opened, ensure_ascii=False, indent=True)
75 |
76 | #todo: implement ensure consistency
77 | def _ensure_config_consistency(self):
78 | pass
79 |
80 | if __name__ == '__main__':
81 |     Dir('mdx')
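82 | 
83 | # The generated config.json has this shape (field values illustrative; the
84 | # structure follows _build_index above):
85 | #
86 | # {
87 | #     "dicts": [
88 | #         {
89 | #             "title": "Example Dictionary",
90 | #             "description": "",
91 | #             "mdx_name": "mdx/example.mdx",
92 | #             "has_mdd": true
93 | #         }
94 | #     ]
95 | # }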
--------------------------------------------------------------------------------
/mdict_query.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | from .readmdict import MDX, MDD
5 | from struct import pack, unpack
6 | from io import BytesIO
7 | import re
8 | import sys
9 | import os
10 | import sqlite3
11 | import json
12 |
13 | # zlib compression is used for engine version >=2.0
14 | import zlib
15 | # LZO compression is used for engine version < 2.0
16 | try:
17 | import lzo
18 | except ImportError:
19 | lzo = None
20 | #print("LZO compression support is not available")
21 |
22 | # 2x3 compatible
23 | if sys.hexversion >= 0x03000000:
24 | unicode = str
25 |
26 | version = '1.1'
27 |
28 |
29 | class IndexBuilder(object):
30 | #todo: enable history
31 | def __init__(self, fname, encoding = "", passcode = None, force_rebuild = False, enable_history = False, sql_index = True, check = False):
32 | self._mdx_file = fname
33 | self._mdd_file = ""
34 | self._encoding = ''
35 | self._stylesheet = {}
36 | self._title = ''
37 | self._version = ''
38 | self._description = ''
39 | self._sql_index = sql_index
40 | self._check = check
41 | _filename, _file_extension = os.path.splitext(fname)
42 | assert(_file_extension == '.mdx')
43 | assert(os.path.isfile(fname))
44 | self._mdx_db = _filename + ".mdx.db"
45 | # make index anyway
46 | if force_rebuild:
47 | self._make_mdx_index(self._mdx_db)
48 | if os.path.isfile(_filename + '.mdd'):
49 | self._mdd_file = _filename + ".mdd"
50 | self._mdd_db = _filename + ".mdd.db"
51 | self._make_mdd_index(self._mdd_db)
52 |
53 | if os.path.isfile(self._mdx_db):
54 | #read from META table
55 | conn = sqlite3.connect(self._mdx_db)
56 | #cursor = conn.execute("SELECT * FROM META")
57 | cursor = conn.execute("SELECT * FROM META WHERE key = \"version\"")
58 |             # check whether the version info is present
59 | for cc in cursor:
60 | self._version = cc[1]
61 |             ################# if no version info is found #############
62 | if not self._version:
63 | print("version info not found")
64 | conn.close()
65 | self._make_mdx_index(self._mdx_db)
66 | print("mdx.db rebuilt!")
67 | if os.path.isfile(_filename + '.mdd'):
68 | self._mdd_file = _filename + ".mdd"
69 | self._mdd_db = _filename + ".mdd.db"
70 | self._make_mdd_index(self._mdd_db)
71 | print("mdd.db rebuilt!")
72 | return None
73 | cursor = conn.execute("SELECT * FROM META WHERE key = \"encoding\"")
74 | for cc in cursor:
75 | self._encoding = cc[1]
76 | cursor = conn.execute("SELECT * FROM META WHERE key = \"stylesheet\"")
77 | for cc in cursor:
78 | self._stylesheet = json.loads(cc[1])
79 |
80 | cursor = conn.execute("SELECT * FROM META WHERE key = \"title\"")
81 | for cc in cursor:
82 | self._title = cc[1]
83 |
84 | cursor = conn.execute("SELECT * FROM META WHERE key = \"description\"")
85 | for cc in cursor:
86 | self._description = cc[1]
87 |
88 | #for cc in cursor:
89 | # if cc[0] == 'encoding':
90 | # self._encoding = cc[1]
91 | # continue
92 | # if cc[0] == 'stylesheet':
93 | # self._stylesheet = json.loads(cc[1])
94 | # continue
95 | # if cc[0] == 'title':
96 | # self._title = cc[1]
97 | # continue
98 | # if cc[0] == 'title':
99 | # self._description = cc[1]
100 | else:
101 | self._make_mdx_index(self._mdx_db)
102 |
103 | if os.path.isfile(_filename + ".mdd"):
104 | self._mdd_file = _filename + ".mdd"
105 | self._mdd_db = _filename + ".mdd.db"
106 | if not os.path.isfile(self._mdd_db):
107 | self._make_mdd_index(self._mdd_db)
109 |
110 |
111 | def _replace_stylesheet(self, txt):
112 | # substitute stylesheet definition
113 | txt_list = re.split('`\d+`', txt)
114 | txt_tag = re.findall('`\d+`', txt)
115 | txt_styled = txt_list[0]
116 | for j, p in enumerate(txt_list[1:]):
117 | style = self._stylesheet[txt_tag[j][1:-1]]
118 | if p and p[-1] == '\n':
119 | txt_styled = txt_styled + style[0] + p.rstrip() + style[1] + '\r\n'
120 | else:
121 | txt_styled = txt_styled + style[0] + p + style[1]
122 | return txt_styled
123 |
124 |
125 | def make_sqlite(self):
126 | sqlite_file = self._mdx_file + '.sqlite.db'
127 | if os.path.exists(sqlite_file):
128 | os.remove(sqlite_file)
129 | mdx = MDX(self._mdx_file)
130 | conn = sqlite3.connect(sqlite_file)
131 | cursor = conn.cursor()
132 | cursor.execute(
133 | ''' CREATE TABLE MDX_DICT
134 | (key text not null,
135 | value text
136 | )'''
137 | )
138 |
139 |         # strip pinyin transcriptions (words containing tone-marked letters, optionally in parentheses) and stylesheet tags like `1`:
140 | aeiou = 'āáǎàĀÁǍÀēéěèêềếĒÉĚÈÊỀẾīíǐìÍǏÌōóǒòŌÓǑÒūúǔùŪÚǓÙǖǘǚǜǕǗǙǛḾǹňŃŇ'
141 | pattern = r"`\d+`|[(\(]?['a-z%s]*[%s]['a-z%s]*[\))]?"%(aeiou, aeiou, aeiou)
142 | tuple_list = [(key.decode(), re.sub(pattern, '', value.decode()))
143 | for key, value in mdx.items()]
144 |
145 | cursor.executemany('INSERT INTO MDX_DICT VALUES (?,?)', tuple_list)
146 |
147 | returned_index = mdx.get_index(check_block = self._check)
148 | meta = returned_index['meta']
149 | cursor.execute(
150 | '''CREATE TABLE META (key text, value text)''')
151 |
152 | cursor.executemany(
153 | 'INSERT INTO META VALUES (?,?)',
154 | [('encoding', meta['encoding']),
155 | ('stylesheet', meta['stylesheet']),
156 | ('title', meta['title']),
157 | ('description', meta['description']),
158 | ('version', version)
159 | ]
160 | )
161 |
162 | if self._sql_index:
163 | cursor.execute(
164 | '''
165 | CREATE INDEX key_index ON MDX_DICT (key)
166 | '''
167 | )
168 | conn.commit()
169 | conn.close()
170 |
171 |
172 | def _make_mdx_index(self, db_name):
173 | if os.path.exists(db_name):
174 | os.remove(db_name)
175 | mdx = MDX(self._mdx_file)
176 | self._mdx_db = db_name
177 | returned_index = mdx.get_index(check_block = self._check)
178 | index_list = returned_index['index_dict_list']
179 | conn = sqlite3.connect(db_name)
180 | c = conn.cursor()
181 | c.execute(
182 | ''' CREATE TABLE MDX_INDEX
183 | (key_text text not null,
184 | file_pos integer,
185 | compressed_size integer,
186 | decompressed_size integer,
187 | record_block_type integer,
188 | record_start integer,
189 | record_end integer,
190 | offset integer
191 | )'''
192 | )
193 |
194 | tuple_list = [
195 | (item['key_text'],
196 | item['file_pos'],
197 | item['compressed_size'],
198 | item['decompressed_size'],
199 | item['record_block_type'],
200 | item['record_start'],
201 | item['record_end'],
202 | item['offset']
203 | )
204 | for item in index_list
205 | ]
206 | c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)',
207 | tuple_list)
208 | # build the metadata table
209 | meta = returned_index['meta']
210 | c.execute(
211 | '''CREATE TABLE META
212 | (key text,
213 | value text
214 | )''')
215 |
216 | #for k,v in meta:
217 | # c.execute(
218 | # 'INSERT INTO META VALUES (?,?)',
219 | # (k, v)
220 | # )
221 |
222 | c.executemany(
223 | 'INSERT INTO META VALUES (?,?)',
224 | [('encoding', meta['encoding']),
225 | ('stylesheet', meta['stylesheet']),
226 | ('title', meta['title']),
227 | ('description', meta['description']),
228 | ('version', version)
229 | ]
230 | )
231 |
232 | if self._sql_index:
233 | c.execute(
234 | '''
235 | CREATE INDEX key_index ON MDX_INDEX (key_text)
236 | '''
237 | )
238 |
239 | conn.commit()
240 | conn.close()
241 | #set class member
242 | self._encoding = meta['encoding']
243 | self._stylesheet = json.loads(meta['stylesheet'])
244 | self._title = meta['title']
245 | self._description = meta['description']
246 |
247 |
248 | def _make_mdd_index(self, db_name):
249 | if os.path.exists(db_name):
250 | os.remove(db_name)
251 | mdd = MDD(self._mdd_file)
252 | self._mdd_db = db_name
253 | index_list = mdd.get_index(check_block = self._check)
254 | conn = sqlite3.connect(db_name)
255 | c = conn.cursor()
256 | c.execute(
257 | ''' CREATE TABLE MDX_INDEX
258 | (key_text text not null unique,
259 | file_pos integer,
260 | compressed_size integer,
261 | decompressed_size integer,
262 | record_block_type integer,
263 | record_start integer,
264 | record_end integer,
265 | offset integer
266 | )'''
267 | )
268 |
269 | tuple_list = [
270 | (item['key_text'],
271 | item['file_pos'],
272 | item['compressed_size'],
273 | item['decompressed_size'],
274 | item['record_block_type'],
275 | item['record_start'],
276 | item['record_end'],
277 | item['offset']
278 | )
279 | for item in index_list
280 | ]
281 | c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)',
282 | tuple_list)
283 | if self._sql_index:
284 | c.execute(
285 | '''
286 | CREATE UNIQUE INDEX key_index ON MDX_INDEX (key_text)
287 | '''
288 | )
289 |
290 | conn.commit()
291 | conn.close()
292 |
293 | @staticmethod
294 | def get_data_by_index(fmdx, index):
295 | fmdx.seek(index['file_pos'])
296 | record_block_compressed = fmdx.read(index['compressed_size'])
297 |         record_block_type = index['record_block_type']
298 |         decompressed_size = index['decompressed_size']
299 |         #adler32 = unpack('>I', record_block_compressed[4:8])[0]
300 |         # no compression
301 |         if record_block_type == 0:
302 |             _record_block = record_block_compressed[8:]
303 |         # lzo compression
304 |         elif record_block_type == 1:
305 |             if lzo is None:
306 |                 raise RuntimeError("LZO decompression is required but the lzo module is unavailable")
307 |             # decompress, skipping the 8-byte type/checksum header
308 |             _record_block = lzo.decompress(record_block_compressed[8:],
309 |                                            initSize=decompressed_size, blockSize=1308672)
310 | # zlib compression
311 | elif record_block_type == 2:
312 | # decompress
313 | _record_block = zlib.decompress(record_block_compressed[8:])
314 | data = _record_block[index['record_start'] - index['offset']:index['record_end'] - index['offset']]
315 | return data
316 |
317 | def get_mdx_by_index(self, fmdx, index):
318 | data = self.get_data_by_index(fmdx,index)
319 |         # decode to str first; _replace_stylesheet operates on str
320 |         record = data.decode(self._encoding, errors='ignore').strip(u'\x00')
321 |         if self._stylesheet:
322 |             record = self._replace_stylesheet(record)
323 |         return record
324 |
325 | def get_mdd_by_index(self, fmdx, index):
326 | return self.get_data_by_index(fmdx,index)
327 |
328 | @staticmethod
329 | def lookup_indexes(db,keyword,ignorecase=None):
330 | indexes = []
331 |         if ignorecase:
332 |             sql = 'SELECT * FROM MDX_INDEX WHERE lower(key_text) = lower(?)'
333 |         else:
334 |             sql = 'SELECT * FROM MDX_INDEX WHERE key_text = ?'
335 |         with sqlite3.connect(db) as conn:
336 |             cursor = conn.execute(sql, (keyword,))  # parameterized, so quotes in the keyword are safe
337 | for result in cursor:
338 | index = {}
339 | index['file_pos'] = result[1]
340 | index['compressed_size'] = result[2]
341 | index['decompressed_size'] = result[3]
342 | index['record_block_type'] = result[4]
343 | index['record_start'] = result[5]
344 | index['record_end'] = result[6]
345 | index['offset'] = result[7]
346 | indexes.append(index)
347 | return indexes
348 |
349 | def mdx_lookup(self, keyword,ignorecase=None):
350 | lookup_result_list = []
351 | indexes = self.lookup_indexes(self._mdx_db,keyword,ignorecase)
352 | with open(self._mdx_file,'rb') as mdx_file:
353 | for index in indexes:
354 | lookup_result_list.append(self.get_mdx_by_index(mdx_file, index))
355 | return lookup_result_list
356 |
357 | def mdd_lookup(self, keyword,ignorecase=None):
358 | lookup_result_list = []
359 | indexes = self.lookup_indexes(self._mdd_db,keyword,ignorecase)
360 | with open(self._mdd_file,'rb') as mdd_file:
361 | for index in indexes:
362 | lookup_result_list.append(self.get_mdd_by_index(mdd_file, index))
363 | return lookup_result_list
364 |
365 | @staticmethod
366 | def get_keys(db,query = ''):
367 | if not db:
368 | return []
369 |         if query:
370 |             if '*' in query:
371 |                 query = query.replace('*', '%')
372 |             else:
373 |                 query = query + '%'
374 |             sql, args = 'SELECT key_text FROM MDX_INDEX WHERE key_text LIKE ?', (query,)
375 |         else:
376 |             sql, args = 'SELECT key_text FROM MDX_INDEX', ()
377 |         with sqlite3.connect(db) as conn:
378 |             cursor = conn.execute(sql, args)
379 | keys = [item[0] for item in cursor]
380 | return keys
381 |
382 | def get_mdd_keys(self, query = ''):
383 | return self.get_keys(self._mdd_db,query)
384 |
385 | def get_mdx_keys(self, query = ''):
386 | return self.get_keys(self._mdx_db,query)
387 |
388 |
389 |
390 | # mdx_builder = IndexBuilder("oald.mdx")
391 | # text = mdx_builder.mdx_lookup('dedication')
392 | # keys = mdx_builder.get_mdx_keys()
393 | # keys1 = mdx_builder.get_mdx_keys('abstrac')
394 | # keys2 = mdx_builder.get_mdx_keys('*tion')
395 | # for key in keys2:
396 | # text = mdx_builder.mdx_lookup(key)[0]
397 | # pass
398 |
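399 | # Querying the flat export produced by make_sqlite() directly (a sketch; the
400 | # MDX_DICT(key, value) table is the one created in make_sqlite above):
401 | #
402 | #     import sqlite3
403 | #     conn = sqlite3.connect('oald.mdx.sqlite.db')
404 | #     rows = conn.execute('SELECT value FROM MDX_DICT WHERE key = ?',
405 | #                         ('dedication',)).fetchall()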
--------------------------------------------------------------------------------
/mdx/drop mdict files here.txt:
--------------------------------------------------------------------------------
1 | hihaaha
2 |
--------------------------------------------------------------------------------
/pureSalsa20.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | """
5 | Copyright by https://github.com/zhansliu/writemdict
6 |
7 | pureSalsa20.py -- a pure Python implementation of the Salsa20 cipher, ported to Python 3
8 |
9 | v4.0: Added Python 3 support, dropped support for Python <= 2.5.
10 |
11 | // zhansliu
12 |
13 | Original comments below.
14 |
15 | ====================================================================
16 | There are comments here by two authors about three pieces of software:
17 | comments by Larry Bugbee about
18 | Salsa20, the stream cipher by Daniel J. Bernstein
19 | (including comments about the speed of the C version) and
20 | pySalsa20, Bugbee's own Python wrapper for salsa20.c
21 | (including some references), and
22 | comments by Steve Witham about
23 | pureSalsa20, Witham's pure Python 2.5 implementation of Salsa20,
24 | which follows pySalsa20's API, and is in this file.
25 |
26 | Salsa20: a Fast Streaming Cipher (comments by Larry Bugbee)
27 | -----------------------------------------------------------
28 |
29 | Salsa20 is a fast stream cipher written by Daniel Bernstein
30 | that basically uses a hash function and XOR making for fast
31 | encryption. (Decryption uses the same function.) Salsa20
32 | is simple and quick.
33 |
34 | Some Salsa20 parameter values...
35 | design strength 128 bits
36 | key length 128 or 256 bits, exactly
37 | IV, aka nonce 64 bits, always
38 | chunk size must be in multiples of 64 bytes
39 |
40 | Salsa20 has two reduced versions, 8 and 12 rounds each.
41 |
42 | One benchmark (10 MB):
43 | 1.5GHz PPC G4 102/97/89 MB/sec for 8/12/20 rounds
44 | AMD Athlon 2500+ 77/67/53 MB/sec for 8/12/20 rounds
45 | (no I/O and before Python GC kicks in)
46 |
47 | Salsa20 is a Phase 3 finalist in the EU eSTREAM competition
48 | and appears to be one of the fastest ciphers. It is well
49 | documented so I will not attempt any injustice here. Please
50 | see "References" below.
51 |
52 | ...and Salsa20 is "free for any use".
53 |
54 |
55 | pySalsa20: a Python wrapper for Salsa20 (Comments by Larry Bugbee)
56 | ------------------------------------------------------------------
57 |
58 | pySalsa20.py is a simple ctypes Python wrapper. Salsa20 is
59 | as it's name implies, 20 rounds, but there are two reduced
60 | versions, 8 and 12 rounds each. Because the APIs are
61 | identical, pySalsa20 is capable of wrapping all three
62 | versions (number of rounds hardcoded), including a special
63 | version that allows you to set the number of rounds with a
64 | set_rounds() function. Compile the version of your choice
65 | as a shared library (not as a Python extension), name and
66 | install it as libsalsa20.so.
67 |
68 | Sample usage:
69 | from pySalsa20 import Salsa20
70 | s20 = Salsa20(key, IV)
71 | dataout = s20.encryptBytes(datain) # same for decrypt
72 |
73 | This is EXPERIMENTAL software and intended for educational
74 | purposes only. To make experimentation less cumbersome,
75 | pySalsa20 is also free for any use.
76 |
77 | THIS PROGRAM IS PROVIDED WITHOUT WARRANTY OR GUARANTEE OF
78 | ANY KIND. USE AT YOUR OWN RISK.
79 |
80 | Enjoy,
81 |
82 | Larry Bugbee
83 | bugbee@seanet.com
84 | April 2007
85 |
86 |
87 | References:
88 | -----------
89 | http://en.wikipedia.org/wiki/Salsa20
90 | http://en.wikipedia.org/wiki/Daniel_Bernstein
91 | http://cr.yp.to/djb.html
92 | http://www.ecrypt.eu.org/stream/salsa20p3.html
93 | http://www.ecrypt.eu.org/stream/p3ciphers/salsa20/salsa20_p3source.zip
94 |
95 |
96 | Prerequisites for pySalsa20:
97 | ----------------------------
98 | - Python 2.5 (haven't tested in 2.4)
99 |
100 |
101 | pureSalsa20: Salsa20 in pure Python 2.5 (comments by Steve Witham)
102 | ------------------------------------------------------------------
103 |
104 | pureSalsa20 is the stand-alone Python code in this file.
105 | It implements the underlying Salsa20 core algorithm
106 | and emulates pySalsa20's Salsa20 class API (minus a bug(*)).
107 |
108 | pureSalsa20 is MUCH slower than libsalsa20.so wrapped with pySalsa20--
109 | about 1/1000 the speed for Salsa20/20 and 1/500 the speed for Salsa20/8,
110 | when encrypting 64k-byte blocks on my computer.
111 |
112 | pureSalsa20 is for cases where portability is much more important than
113 | speed. I wrote it for use in a "structured" random number generator.
114 |
115 | There are comments about the reasons for this slowness in
116 | http://www.tiac.net/~sw/2010/02/PureSalsa20
117 |
118 | Sample usage:
119 | from pureSalsa20 import Salsa20
120 | s20 = Salsa20(key, IV)
121 | dataout = s20.encryptBytes(datain) # same for decrypt
122 |
123 | I took the test code from pySalsa20, added a bunch of tests including
124 | rough speed tests, and moved them into the file testSalsa20.py.
125 | To test both pySalsa20 and pureSalsa20, type
126 | python testSalsa20.py
127 |
128 | (*)The bug (?) in pySalsa20 is this. The rounds variable is global to the
129 | libsalsa20.so library and not switched when switching between instances
130 | of the Salsa20 class.
131 | s1 = Salsa20( key, IV, 20 )
132 | s2 = Salsa20( key, IV, 8 )
133 | In this example,
134 | with pySalsa20, both s1 and s2 will do 8 rounds of encryption.
135 | with pureSalsa20, s1 will do 20 rounds and s2 will do 8 rounds.
136 | Perhaps giving each instance its own nRounds variable, which
137 | is passed to the salsa20wordtobyte() function, is insecure. I'm not a
138 | cryptographer.
139 |
140 | pureSalsa20.py and testSalsa20.py are EXPERIMENTAL software and
141 | intended for educational purposes only. To make experimentation less
142 | cumbersome, pureSalsa20.py and testSalsa20.py are free for any use.
143 |
144 | Revisions:
145 | ----------
146 | p3.2 Fixed bug that initialized the output buffer with plaintext!
147 | Saner ramping of nreps in speed test.
148 | Minor changes and print statements.
149 | p3.1 Took timing variability out of add32() and rot32().
150 | Made the internals more like pySalsa20/libsalsa .
151 | Put the semicolons back in the main loop!
152 | In encryptBytes(), modify a byte array instead of appending.
153 | Fixed speed calculation bug.
154 | Used subclasses instead of patches in testSalsa20.py .
155 | Added 64k-byte messages to speed test to be fair to pySalsa20.
156 | p3 First version, intended to parallel pySalsa20 version 3.
157 |
158 | More references:
159 | ----------------
160 | http://www.seanet.com/~bugbee/crypto/salsa20/ [pySalsa20]
161 | http://cr.yp.to/snuffle.html [The original name of Salsa20]
162 | http://cr.yp.to/snuffle/salsafamily-20071225.pdf [ Salsa20 design]
163 | http://www.tiac.net/~sw/2010/02/PureSalsa20
164 |
165 | THIS PROGRAM IS PROVIDED WITHOUT WARRANTY OR GUARANTEE OF
166 | ANY KIND. USE AT YOUR OWN RISK.
167 |
168 | Cheers,
169 |
170 | Steve Witham sw at remove-this tiac dot net
171 | February, 2010
172 | """
173 | import sys
174 | assert(sys.version_info >= (2, 6))
175 |
176 | if sys.version_info >= (3,):
177 | integer_types = (int,)
178 | python3 = True
179 | else:
180 | integer_types = (int, long)
181 | python3 = False
182 |
183 | from struct import Struct
184 | little_u64 = Struct( "<Q" )      #    little-endian 64-bit unsigned.
185 |                                  #    Unpacks to a tuple of one element!
186 | 
187 | little16_i32 = Struct( "<16i" )  # 16 little-endian 32-bit signed ints.
188 | little4_i32 = Struct( "<4i" )    #  4 little-endian 32-bit signed ints.
189 | little2_i32 = Struct( "<2i" )    #  2 little-endian 32-bit signed ints.
190 | 
191 | _version = 'p4.0'
192 | 
193 | #--------------------------------------------------------------------------
194 | #----------- the Salsa20 class which emulates pySalsa20.Salsa20 -----------
195 | #--------------------------------------------------------------------------
196 | 
197 | class Salsa20(object):
198 | 
199 |     def __init__(self, key=None, IV=None, rounds=20):
200 |         self._lastChunk64 = True
201 |         self._IVbitlen = 64             # must be 64 bits
202 |         self.ctx = [ 0 ] * 16
203 |         if key:
204 |             self.setKey(key)
205 |         if IV:
206 |             self.setIV(IV)
207 |         self.setRounds(rounds)
208 | 
209 |     def setKey(self, key):
210 |         assert type(key) == bytes
211 |         ctx = self.ctx
212 |         if len(key) == 32:   # recommended
213 |             constants = b"expand 32-byte k"
214 |             ctx[1:5] = little4_i32.unpack( key[0:16] )
215 |             ctx[11:15] = little4_i32.unpack( key[16:32] )
216 |         elif len(key) == 16:
217 |             constants = b"expand 16-byte k"
218 |             ctx[1:5] = little4_i32.unpack( key[0:16] )
219 |             ctx[11:15] = little4_i32.unpack( key[0:16] )
220 |         else:
221 |             raise Exception("key length isn't 32 or 16 bytes.")
222 |         ctx[0], ctx[5], ctx[10], ctx[15] = little4_i32.unpack( constants )
223 | 
224 |     def setIV(self, IV):
225 |         assert type(IV) == bytes
226 |         assert len(IV) * 8 == 64, 'nonce (IV) not 64 bits'
227 |         self.IV = IV
228 |         ctx = self.ctx
229 |         ctx[6], ctx[7] = little2_i32.unpack( IV )
230 |         ctx[8], ctx[9] = 0, 0   # Reset the block counter.
231 | 
232 |     setNonce = setIV            # support an alternate name
233 | 
234 | 
235 |     def setCounter(self, counter):
236 |         assert type(counter) in integer_types
237 |         assert 0 <= counter < 2**64, "counter < 0 or >= 2**64"
238 | ctx = self.ctx
239 | ctx[ 8],ctx[ 9] = little2_i32.unpack( little_u64.pack( counter ) )
240 |
241 | def getCounter( self ):
242 | return little_u64.unpack( little2_i32.pack( *self.ctx[ 8:10 ] ) ) [0]
243 |
244 |
245 | def setRounds(self, rounds, testing=False ):
246 | assert testing or rounds in [8, 12, 20], 'rounds must be 8, 12, 20'
247 | self.rounds = rounds
248 |
249 |
250 | def encryptBytes(self, data):
251 | assert type(data) == bytes, 'data must be byte string'
252 | assert self._lastChunk64, 'previous chunk not multiple of 64 bytes'
253 | lendata = len(data)
254 | munged = bytearray(lendata)
255 | for i in range( 0, lendata, 64 ):
256 | h = salsa20_wordtobyte( self.ctx, self.rounds, checkRounds=False )
257 | self.setCounter( ( self.getCounter() + 1 ) % 2**64 )
258 | # Stopping at 2^70 bytes per nonce is user's responsibility.
259 | for j in range( min( 64, lendata - i ) ):
260 | if python3:
261 | munged[ i+j ] = data[ i+j ] ^ h[j]
262 | else:
263 | munged[ i+j ] = ord(data[ i+j ]) ^ ord(h[j])
264 |
265 | self._lastChunk64 = not lendata % 64
266 | return bytes(munged)
267 |
268 | decryptBytes = encryptBytes # encrypt and decrypt use same function
269 |
270 | #--------------------------------------------------------------------------
271 |
272 | def salsa20_wordtobyte( input, nRounds=20, checkRounds=True ):
273 | """ Do nRounds Salsa20 rounds on a copy of
274 | input: list or tuple of 16 ints treated as little-endian unsigneds.
275 | Returns a 64-byte string.
276 | """
277 |
278 | assert( type(input) in ( list, tuple ) and len(input) == 16 )
279 | assert( not(checkRounds) or ( nRounds in [ 8, 12, 20 ] ) )
280 |
281 | x = list( input )
282 |
283 | def XOR( a, b ): return a ^ b
284 | ROTATE = rot32
285 | PLUS = add32
286 |
287 | for i in range( nRounds // 2 ):
288 | # These ...XOR...ROTATE...PLUS... lines are from ecrypt-linux.c
289 | # unchanged except for indents and the blank line between rounds:
290 | x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 0],x[12]), 7));
291 | x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[ 4],x[ 0]), 9));
292 | x[12] = XOR(x[12],ROTATE(PLUS(x[ 8],x[ 4]),13));
293 | x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[12],x[ 8]),18));
294 | x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 5],x[ 1]), 7));
295 | x[13] = XOR(x[13],ROTATE(PLUS(x[ 9],x[ 5]), 9));
296 | x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[13],x[ 9]),13));
297 | x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 1],x[13]),18));
298 | x[14] = XOR(x[14],ROTATE(PLUS(x[10],x[ 6]), 7));
299 | x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[14],x[10]), 9));
300 | x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 2],x[14]),13));
301 | x[10] = XOR(x[10],ROTATE(PLUS(x[ 6],x[ 2]),18));
302 | x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[15],x[11]), 7));
303 | x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 3],x[15]), 9));
304 | x[11] = XOR(x[11],ROTATE(PLUS(x[ 7],x[ 3]),13));
305 | x[15] = XOR(x[15],ROTATE(PLUS(x[11],x[ 7]),18));
306 |
307 | x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[ 0],x[ 3]), 7));
308 | x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[ 1],x[ 0]), 9));
309 | x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[ 2],x[ 1]),13));
310 | x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[ 3],x[ 2]),18));
311 | x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 5],x[ 4]), 7));
312 | x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 6],x[ 5]), 9));
313 | x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 7],x[ 6]),13));
314 | x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 4],x[ 7]),18));
315 | x[11] = XOR(x[11],ROTATE(PLUS(x[10],x[ 9]), 7));
316 | x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[11],x[10]), 9));
317 | x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 8],x[11]),13));
318 | x[10] = XOR(x[10],ROTATE(PLUS(x[ 9],x[ 8]),18));
319 | x[12] = XOR(x[12],ROTATE(PLUS(x[15],x[14]), 7));
320 | x[13] = XOR(x[13],ROTATE(PLUS(x[12],x[15]), 9));
321 | x[14] = XOR(x[14],ROTATE(PLUS(x[13],x[12]),13));
322 | x[15] = XOR(x[15],ROTATE(PLUS(x[14],x[13]),18));
323 |
324 | for i in range( len( input ) ):
325 | x[i] = PLUS( x[i], input[i] )
326 | return little16_i32.pack( *x )
327 |
328 | #--------------------------- 32-bit ops -------------------------------
329 |
330 | def trunc32( w ):
331 | """ Return the bottom 32 bits of w as a Python int.
332 | This creates longs temporarily, but returns an int. """
333 | w = int( ( w & 0x7fffFFFF ) | -( w & 0x80000000 ) )
334 | assert type(w) == int
335 | return w
336 |
337 |
338 | def add32( a, b ):
339 | """ Add two 32-bit words discarding carry above 32nd bit,
340 | and without creating a Python long.
341 | Timing shouldn't vary.
342 | """
343 | lo = ( a & 0xFFFF ) + ( b & 0xFFFF )
344 | hi = ( a >> 16 ) + ( b >> 16 ) + ( lo >> 16 )
345 | return ( -(hi & 0x8000) | ( hi & 0x7FFF ) ) << 16 | ( lo & 0xFFFF )
346 |
347 |
348 | def rot32( w, nLeft ):
349 | """ Rotate 32-bit word left by nLeft or right by -nLeft
350 | without creating a Python long.
351 | Timing depends on nLeft but not on w.
352 | """
353 | nLeft &= 31 # which makes nLeft >= 0
354 | if nLeft == 0:
355 | return w
356 |
357 | # Note: now 1 <= nLeft <= 31.
358 | # RRRsLLLLLL There are nLeft RRR's, (31-nLeft) LLLLLL's,
359 | # => sLLLLLLRRR and one s which becomes the sign bit.
360 | RRR = ( ( ( w >> 1 ) & 0x7fffFFFF ) >> ( 31 - nLeft ) )
361 | sLLLLLL = -( (1<<(31-nLeft)) & w ) | (0x7fffFFFF>>nLeft) & w
362 | return RRR | ( sLLLLLL << nLeft )
363 |
364 |
365 | # --------------------------------- end -----------------------------------
366 |
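367 | # Round-trip sketch of the API above (key must be exactly 16 or 32 bytes,
368 | # the IV 8 bytes; decryption is the same operation as encryption):
369 | #
370 | #     s20 = Salsa20(key=b'k' * 32, IV=b'\x00' * 8, rounds=8)
371 | #     ct = s20.encryptBytes(b'attack at dawn!!')
372 | #     s20 = Salsa20(key=b'k' * 32, IV=b'\x00' * 8, rounds=8)
373 | #     assert s20.encryptBytes(ct) == b'attack at dawn!!'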
--------------------------------------------------------------------------------
/readmdict.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # readmdict.py
4 | # Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser
5 | #
6 | # Copyright (C) 2012, 2013, 2015 Xiaoqiang Wang
7 | #
8 | # This program is a free software; you can redistribute it and/or modify
9 | # it under the terms of the GNU General Public License as published by
10 | # the Free Software Foundation, version 3 of the License.
11 | #
12 | # You can get a copy of GNU General Public License along this program
13 | # But you can always get it from http://www.gnu.org/licenses/gpl.txt
14 | #
15 | # This program is distributed in the hope that it will be useful,
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | # GNU General Public License for more details.
19 |
20 | from struct import pack, unpack
21 | from io import BytesIO
22 | import re
23 | import sys
24 | import json
25 |
26 | from .ripemd128 import ripemd128
27 | from .pureSalsa20 import Salsa20
28 |
29 | # zlib compression is used for engine version >=2.0
30 | import zlib
31 | # LZO compression is used for engine version < 2.0
32 | try:
33 | import lzo
34 | except ImportError:
35 | lzo = None
36 | print("LZO compression support is not available")
37 |
38 | # 2x3 compatible
39 | if sys.hexversion >= 0x03000000:
40 | unicode = str
41 |
42 |
43 | def _unescape_entities(text):
44 | """
45 | unescape offending tags < > " &
46 | """
47 |     text = text.replace(b'&lt;', b'<')
48 |     text = text.replace(b'&gt;', b'>')
49 |     text = text.replace(b'&quot;', b'"')
50 |     text = text.replace(b'&amp;', b'&')
51 | return text
52 |
53 |
54 | def _fast_decrypt(data, key):
55 | b = bytearray(data)
56 | key = bytearray(key)
57 | previous = 0x36
58 | for i in range(len(b)):
59 |         t = (b[i] >> 4 | b[i] << 4) & 0xff  # swap the byte's two nibbles
60 |         t = t ^ previous ^ (i & 0xff) ^ key[i % len(key)]  # mix in the previous byte, position, and key
61 | previous = b[i]
62 | b[i] = t
63 | return bytes(b)
64 |
65 |
66 | def _mdx_decrypt(comp_block):
 67 |     key = ripemd128(comp_block[4:8] + pack(b'<L', 0x3695))
 68 |     return comp_block[0:8] + _fast_decrypt(comp_block[8:], key)
 69 | 
 70 | 
 71 | def _salsa_decrypt(ciphertext, encrypt_key):
 72 |     s20 = Salsa20(key=encrypt_key, IV=b"\x00" * 8, rounds=8)
 73 |     return s20.encryptBytes(ciphertext)
 74 | 
 75 | def _decrypt_regcode_by_deviceid(reg_code, deviceid):
 76 |     deviceid_digest = ripemd128(deviceid)
 77 |     s20 = Salsa20(key=deviceid_digest, IV=b"\x00" * 8, rounds=8)
 78 |     encrypt_key = s20.encryptBytes(reg_code)
 79 |     return encrypt_key
 80 | 
 81 | 
 82 | def _decrypt_regcode_by_email(reg_code, email):
 83 |     email_digest = ripemd128(email.decode().encode('utf-16-le'))
 84 |     s20 = Salsa20(key=email_digest, IV=b"\x00" * 8, rounds=8)
 85 |     encrypt_key = s20.encryptBytes(reg_code)
 86 |     return encrypt_key
 87 | 
 88 | 
 89 | class MDict(object):
 90 |     """
 91 |     Base class which reads in header and key block.
 92 |     It has no public methods and serves only as code sharing base class.
 93 |     """
 94 |     def __init__(self, fname, encoding='', passcode=None):
 95 |         self._fname = fname
 96 |         self._encoding = encoding.upper()
 97 |         self._passcode = passcode
 98 | 
 99 |         self.header = self._read_header()
100 |         try:
101 |             self._key_list = self._read_keys()
102 |         except Exception:
103 |             print("Try Brutal Force on Encrypted Key Blocks")
104 |             self._key_list = self._read_keys_brutal()
105 | 
106 |     def __len__(self):
107 |         return self._num_entries
108 | 
109 |     def __iter__(self):
110 |         return self.keys()
111 | 
112 |     def keys(self):
113 |         """
114 |         Return an iterator over dictionary keys.
115 |         """
116 |         return (key_value for key_id, key_value in self._key_list)
117 | 
118 |     def _read_number(self, f):
119 |         return unpack(self._number_format, f.read(self._number_width))[0]
120 | 
121 |     @staticmethod
122 |     def _parse_header(header):
123 |         """
124 |         extract attributes from <Dict attr="value" ... >
125 | """
126 | taglist = re.findall(b'(\w+)="(.*?)"', header, re.DOTALL)
127 | tagdict = {}
128 | for key, value in taglist:
129 | tagdict[key] = _unescape_entities(value)
130 | return tagdict
131 |
132 | def _decode_key_block_info(self, key_block_info_compressed):
133 | if self._version >= 2:
134 | # zlib compression
135 | assert(key_block_info_compressed[:4] == b'\x02\x00\x00\x00')
136 | # decrypt if needed
137 | if self._encrypt & 0x02:
138 | key_block_info_compressed = _mdx_decrypt(key_block_info_compressed)
139 | # decompress
140 | key_block_info = zlib.decompress(key_block_info_compressed[8:])
141 | # adler checksum
142 | adler32 = unpack('>I', key_block_info_compressed[4:8])[0]
143 | assert(adler32 == zlib.adler32(key_block_info) & 0xffffffff)
144 | else:
145 | # no compression
146 | key_block_info = key_block_info_compressed
147 | # decode
148 | key_block_info_list = []
149 | num_entries = 0
150 | i = 0
151 | if self._version >= 2:
152 | byte_format = '>H'
153 | byte_width = 2
154 | text_term = 1
155 | else:
156 | byte_format = '>B'
157 | byte_width = 1
158 | text_term = 0
159 |
160 | while i < len(key_block_info):
161 | # number of entries in current key block
162 | num_entries += unpack(self._number_format, key_block_info[i:i + self._number_width])[0]
163 | i += self._number_width
164 | # text head size
165 | text_head_size = unpack(byte_format, key_block_info[i:i + byte_width])[0]
166 | i += byte_width
167 | # text head
168 | if self._encoding != 'UTF-16':
169 | i += text_head_size + text_term
170 | else:
171 | i += (text_head_size + text_term) * 2
172 | # text tail size
173 | text_tail_size = unpack(byte_format, key_block_info[i:i + byte_width])[0]
174 | i += byte_width
175 | # text tail
176 | if self._encoding != 'UTF-16':
177 | i += text_tail_size + text_term
178 | else:
179 | i += (text_tail_size + text_term) * 2
180 | # key block compressed size
181 | key_block_compressed_size = unpack(self._number_format, key_block_info[i:i + self._number_width])[0]
182 | i += self._number_width
183 | # key block decompressed size
184 | key_block_decompressed_size = unpack(self._number_format, key_block_info[i:i + self._number_width])[0]
185 | i += self._number_width
186 | key_block_info_list += [(key_block_compressed_size, key_block_decompressed_size)]
187 |
188 | assert(num_entries == self._num_entries)
189 |
190 | return key_block_info_list
191 |
192 | def _decode_key_block(self, key_block_compressed, key_block_info_list):
193 | key_list = []
194 | i = 0
195 | for compressed_size, decompressed_size in key_block_info_list:
196 | start = i
197 | end = i + compressed_size
198 | # 4 bytes : compression type
199 | key_block_type = key_block_compressed[start:start + 4]
200 | # 4 bytes : adler checksum of decompressed key block
201 | adler32 = unpack('>I', key_block_compressed[start + 4:start + 8])[0]
202 | if key_block_type == b'\x00\x00\x00\x00':
203 | key_block = key_block_compressed[start + 8:end]
204 | elif key_block_type == b'\x01\x00\x00\x00':
205 | if lzo is None:
206 | print("LZO compression is not supported")
207 | break
208 | # decompress key block
209 | header = b'\xf0' + pack('>I', decompressed_size)
210 | key_block = lzo.decompress(key_block_compressed[start + 8:end], initSize = decompressed_size, blockSize=1308672)
211 | elif key_block_type == b'\x02\x00\x00\x00':
212 | # decompress key block
213 | key_block = zlib.decompress(key_block_compressed[start + 8:end])
214 | # extract one single key block into a key list
215 | key_list += self._split_key_block(key_block)
216 | # notice that adler32 returns signed value
217 | assert(adler32 == zlib.adler32(key_block) & 0xffffffff)
218 |
219 | i += compressed_size
220 | return key_list
221 |
222 | def _split_key_block(self, key_block):
223 | key_list = []
224 | key_start_index = 0
225 | while key_start_index < len(key_block):
226 | temp = key_block[key_start_index:key_start_index + self._number_width]
227 | # the corresponding record's offset in record block
228 | key_id = unpack(self._number_format, key_block[key_start_index:key_start_index + self._number_width])[0]
229 | # key text ends with '\x00'
230 | if self._encoding == 'UTF-16':
231 | delimiter = b'\x00\x00'
232 | width = 2
233 | else:
234 | delimiter = b'\x00'
235 | width = 1
236 | i = key_start_index + self._number_width
237 | while i < len(key_block):
238 | if key_block[i:i + width] == delimiter:
239 | key_end_index = i
240 | break
241 | i += width
242 | key_text = key_block[key_start_index + self._number_width:key_end_index]\
243 | .decode(self._encoding, errors='ignore').encode('utf-8').strip()
244 | key_start_index = key_end_index + width
245 | key_list += [(key_id, key_text)]
246 | return key_list
247 |
248 | def _read_header(self):
249 | f = open(self._fname, 'rb')
250 | # number of bytes of header text
251 | header_bytes_size = unpack('>I', f.read(4))[0]
252 | header_bytes = f.read(header_bytes_size)
253 | # 4 bytes: adler32 checksum of header, in little endian
254 |         adler32 = unpack('<I', f.read(4))[0]
255 |         assert adler32 == zlib.adler32(header_bytes) & 0xffffffff
256 |         # mark down key block offset
257 |         self._key_block_offset = f.tell()
258 |         f.close()
259 | 
260 |         # header text in utf-16 encoding ending with '\x00\x00'
261 |         header_text = header_bytes[:-2].decode('utf-16').encode('utf-8')
262 |         header_tag = self._parse_header(header_text)
263 |         if not self._encoding:
264 |             encoding = header_tag[b'Encoding']
265 |             if sys.hexversion >= 0x03000000:
266 | encoding = encoding.decode('utf-8')
267 | # GB18030 > GBK > GB2312
268 | if encoding in ['GBK', 'GB2312']:
269 | encoding = 'GB18030'
270 | self._encoding = encoding
271 |         # read the title and description
272 | if b'Title' in header_tag:
273 | self._title = header_tag[b'Title'].decode('utf-8')
274 | else:
275 | self._title = ''
276 |
277 | if b'Description' in header_tag:
278 | self._description = header_tag[b'Description'].decode('utf-8')
279 | else:
280 | self._description = ''
282 | # encryption flag
283 | # 0x00 - no encryption
284 | # 0x01 - encrypt record block
285 | # 0x02 - encrypt key info block
286 | if b'Encrypted' not in header_tag or header_tag[b'Encrypted'] == b'No':
287 | self._encrypt = 0
288 | elif header_tag[b'Encrypted'] == b'Yes':
289 | self._encrypt = 1
290 | else:
291 | self._encrypt = int(header_tag[b'Encrypted'])
292 |
293 | # stylesheet attribute if present takes form of:
294 | # style_number # 1-255
295 | # style_begin # or ''
296 | # style_end # or ''
297 | # store stylesheet in dict in the form of
298 | # {'number' : ('style_begin', 'style_end')}
299 | self._stylesheet = {}
300 |         if header_tag.get(b'StyleSheet'):
301 |             lines = header_tag[b'StyleSheet'].decode('utf-8').splitlines()
302 | for i in range(0, len(lines), 3):
303 | self._stylesheet[lines[i]] = (lines[i + 1], lines[i + 2])
304 |
305 | # before version 2.0, number is 4 bytes integer
306 | # version 2.0 and above uses 8 bytes
307 | self._version = float(header_tag[b'GeneratedByEngineVersion'])
308 | if self._version < 2.0:
309 | self._number_width = 4
310 | self._number_format = '>I'
311 | else:
312 | self._number_width = 8
313 | self._number_format = '>Q'
314 |
315 | return header_tag
316 |
317 | def _read_keys(self):
318 | f = open(self._fname, 'rb')
319 | f.seek(self._key_block_offset)
320 |
321 | # the following numbers could be encrypted
322 | if self._version >= 2.0:
323 | num_bytes = 8 * 5
324 | else:
325 | num_bytes = 4 * 4
326 | block = f.read(num_bytes)
327 |
328 | if self._encrypt & 1:
329 | if self._passcode is None:
330 | raise RuntimeError('user identification is needed to read encrypted file')
331 | regcode, userid = self._passcode
332 | if isinstance(userid, unicode):
333 | userid = userid.encode('utf8')
334 | if self.header[b'RegisterBy'] == b'EMail':
335 | encrypted_key = _decrypt_regcode_by_email(regcode, userid)
336 | else:
337 | encrypted_key = _decrypt_regcode_by_deviceid(regcode, userid)
338 | block = _salsa_decrypt(block, encrypted_key)
339 |
340 | # decode this block
341 | sf = BytesIO(block)
342 | # number of key blocks
343 | num_key_blocks = self._read_number(sf)
344 | # number of entries
345 | self._num_entries = self._read_number(sf)
346 | # number of bytes of key block info after decompression
347 | if self._version >= 2.0:
348 | key_block_info_decomp_size = self._read_number(sf)
349 | # number of bytes of key block info
350 | key_block_info_size = self._read_number(sf)
351 | # number of bytes of key block
352 | key_block_size = self._read_number(sf)
353 |
354 | # 4 bytes: adler checksum of previous 5 numbers
355 | if self._version >= 2.0:
356 | adler32 = unpack('>I', f.read(4))[0]
357 | assert adler32 == (zlib.adler32(block) & 0xffffffff)
358 |
359 | # read key block info, which indicates key block's compressed and
360 | # decompressed size
361 | key_block_info = f.read(key_block_info_size)
362 | key_block_info_list = self._decode_key_block_info(key_block_info)
363 | assert(num_key_blocks == len(key_block_info_list))
364 |
365 | # read key block
366 | key_block_compressed = f.read(key_block_size)
367 | # extract key block
368 | key_list = self._decode_key_block(key_block_compressed, key_block_info_list)
369 |
370 | self._record_block_offset = f.tell()
371 | f.close()
372 |
373 | return key_list
374 |
375 | def _read_keys_brutal(self):
376 | f = open(self._fname, 'rb')
377 | f.seek(self._key_block_offset)
378 |
379 | # the following numbers could be encrypted, disregard them!
380 | if self._version >= 2.0:
381 | num_bytes = 8 * 5 + 4
382 | key_block_type = b'\x02\x00\x00\x00'
383 | else:
384 | num_bytes = 4 * 4
385 | key_block_type = b'\x01\x00\x00\x00'
386 | block = f.read(num_bytes)
387 |
388 | # key block info
389 | # 4 bytes '\x02\x00\x00\x00'
390 | # 4 bytes adler32 checksum
391 | # unknown number of bytes follows until '\x02\x00\x00\x00' which marks
392 | # the beginning of key block
393 | key_block_info = f.read(8)
394 | if self._version >= 2.0:
395 | assert key_block_info[:4] == b'\x02\x00\x00\x00'
396 | while True:
397 | fpos = f.tell()
398 | t = f.read(1024)
399 | index = t.find(key_block_type)
400 | if index != -1:
401 | key_block_info += t[:index]
402 | f.seek(fpos + index)
403 | break
404 | else:
405 | key_block_info += t
406 |
407 | key_block_info_list = self._decode_key_block_info(key_block_info)
408 | key_block_size = sum(list(zip(*key_block_info_list))[0])
409 |
410 | # read key block
411 | key_block_compressed = f.read(key_block_size)
412 | # extract key block
413 | key_list = self._decode_key_block(key_block_compressed, key_block_info_list)
414 |
415 | self._record_block_offset = f.tell()
416 | f.close()
417 |
418 | self._num_entries = len(key_list)
419 | return key_list
420 |
421 |
422 | class MDD(MDict):
423 | """
424 | MDict resource file format (*.MDD) reader.
425 | >>> mdd = MDD('example.mdd')
426 | >>> len(mdd)
427 | 208
428 | >>> for filename,content in mdd.items():
429 |     ...     print(filename, content[:10])
430 | """
431 | def __init__(self, fname, passcode=None):
432 | MDict.__init__(self, fname, encoding='UTF-16', passcode=passcode)
433 |
434 | def items(self):
435 | """Return a generator which in turn produce tuples in the form of (filename, content)
436 | """
437 | return self._decode_record_block()
438 |
439 | def _decode_record_block(self):
440 | f = open(self._fname, 'rb')
441 | f.seek(self._record_block_offset)
442 |
443 | num_record_blocks = self._read_number(f)
444 | num_entries = self._read_number(f)
445 | assert(num_entries == self._num_entries)
446 | record_block_info_size = self._read_number(f)
447 | record_block_size = self._read_number(f)
448 |
449 | # record block info section
450 | record_block_info_list = []
451 | size_counter = 0
452 | for i in range(num_record_blocks):
453 | compressed_size = self._read_number(f)
454 | decompressed_size = self._read_number(f)
455 | record_block_info_list += [(compressed_size, decompressed_size)]
456 | size_counter += self._number_width * 2
457 | assert(size_counter == record_block_info_size)
458 |
459 | # actual record block
460 | offset = 0
461 | i = 0
462 | size_counter = 0
463 | for compressed_size, decompressed_size in record_block_info_list:
464 | record_block_compressed = f.read(compressed_size)
465 | # 4 bytes: compression type
466 | record_block_type = record_block_compressed[:4]
467 | # 4 bytes: adler32 checksum of decompressed record block
468 | adler32 = unpack('>I', record_block_compressed[4:8])[0]
469 | if record_block_type == b'\x00\x00\x00\x00':
470 | record_block = record_block_compressed[8:]
471 | elif record_block_type == b'\x01\x00\x00\x00':
472 | if lzo is None:
473 | print("LZO compression is not supported")
474 | break
475 | # decompress
476 | header = b'\xf0' + pack('>I', decompressed_size)
477 | record_block = lzo.decompress(record_block_compressed[8:], initSize=decompressed_size, blockSize=1308672)
478 | elif record_block_type == b'\x02\x00\x00\x00':
479 | # decompress
480 | record_block = zlib.decompress(record_block_compressed[8:])
481 |
482 | # notice that adler32 return signed value
483 | assert(adler32 == zlib.adler32(record_block) & 0xffffffff)
484 |
485 | assert(len(record_block) == decompressed_size)
486 | # split record block according to the offset info from key block
487 | while i < len(self._key_list):
488 | record_start, key_text = self._key_list[i]
489 | # reach the end of current record block
490 | if record_start - offset >= len(record_block):
491 | break
492 | # record end index
493 | if i < len(self._key_list) - 1:
494 | record_end = self._key_list[i + 1][0]
495 | else:
496 | record_end = len(record_block) + offset
497 | i += 1
498 | data = record_block[record_start - offset:record_end - offset]
499 | yield key_text, data
500 | offset += len(record_block)
501 | size_counter += compressed_size
502 | assert(size_counter == record_block_size)
503 |
504 | f.close()
505 |
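Both `_decode_record_block` above and `get_index` below parse the same 8-byte record-block header: 4 bytes of compression type followed by a big-endian adler32 of the decompressed payload. A minimal sketch of that shared step, covering only the stored and zlib cases (`decompress_block` is a hypothetical helper, not part of readmdict):

```python
import zlib
from struct import unpack

def decompress_block(block):
    block_type = block[:4]                   # 4-byte compression type
    checksum = unpack('>I', block[4:8])[0]   # adler32 of decompressed data
    if block_type == b'\x00\x00\x00\x00':    # stored, no compression
        data = block[8:]
    elif block_type == b'\x02\x00\x00\x00':  # zlib
        data = zlib.decompress(block[8:])
    else:                                    # b'\x01\x00\x00\x00' means LZO
        raise NotImplementedError('LZO-compressed block')
    # zlib.adler32 can return a signed value on Python 2, hence the mask
    assert checksum == zlib.adler32(data) & 0xffffffff
    return data
```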
506 | ### Build the index list of the mdd file; each entry holds:
507 | ### key_text (the key, also obtainable from the key list)
508 | ### file_pos (position where the record_block starts)
509 | ### compressed_size (size of the record_block before decompression)
510 | ### decompressed_size (size after decompression)
511 | ### record_block_type (compression type of the record_block)
512 | ### record_start (this and the next two are the parameters needed to pull one record out of the record_block; they can be stored as-is)
513 | ### record_end
514 | ### offset
515 | def get_index(self, check_block = True):
516 | f = open(self._fname, 'rb')
517 | index_dict_list = []
518 | f.seek(self._record_block_offset)
519 |
520 | num_record_blocks = self._read_number(f)
521 | num_entries = self._read_number(f)
522 | assert(num_entries == self._num_entries)
523 | record_block_info_size = self._read_number(f)
524 | record_block_size = self._read_number(f)
525 |
526 | # record block info section
527 | record_block_info_list = []
528 | size_counter = 0
529 | for i in range(num_record_blocks):
530 | compressed_size = self._read_number(f)
531 | decompressed_size = self._read_number(f)
532 | record_block_info_list += [(compressed_size, decompressed_size)]
533 | size_counter += self._number_width * 2
534 | # todo: attention!!!
535 | assert(size_counter == record_block_info_size)
536 |
537 | # actual record block
538 | offset = 0
539 | i = 0
540 | size_counter = 0
541 | for compressed_size, decompressed_size in record_block_info_list:
542 | current_pos = f.tell()
543 | record_block_compressed = f.read(compressed_size)
544 | # 4 bytes: compression type
545 | record_block_type = record_block_compressed[:4]
546 | # 4 bytes: adler32 checksum of decompressed record block
547 | adler32 = unpack('>I', record_block_compressed[4:8])[0]
548 | if record_block_type == b'\x00\x00\x00\x00':
549 | _type = 0
550 | if check_block:
551 | record_block = record_block_compressed[8:]
552 | elif record_block_type == b'\x01\x00\x00\x00':
553 | _type = 1
554 | if lzo is None:
555 | print("LZO compression is not supported")
556 | break
557 | # decompress
558 | header = b'\xf0' + pack('>I', decompressed_size)
559 | if check_block:
560 | record_block = lzo.decompress(record_block_compressed[8:], initSize=decompressed_size, blockSize=1308672)
561 | elif record_block_type == b'\x02\x00\x00\x00':
562 | # decompress
563 | _type = 2
564 | if check_block:
565 | record_block = zlib.decompress(record_block_compressed[8:])
566 |
567 | # notice that adler32 return signed value
568 | if check_block:
569 | assert(adler32 == zlib.adler32(record_block) & 0xffffffff)
570 | assert(len(record_block) == decompressed_size)
571 | # split record block according to the offset info from key block
572 | while i < len(self._key_list):
573 | ### empty dict to hold the index info for this record
574 | index_dict = {}
575 | index_dict['file_pos'] = current_pos
576 | index_dict['compressed_size'] = compressed_size
577 | index_dict['decompressed_size'] = decompressed_size
578 | index_dict['record_block_type'] = _type
579 | record_start, key_text = self._key_list[i]
580 | index_dict['record_start'] = record_start
581 | index_dict['key_text'] = key_text.decode("utf-8")
582 | index_dict['offset'] = offset
583 | # reach the end of current record block
584 | if record_start - offset >= decompressed_size:
585 | break
586 | # record end index
587 | if i < len(self._key_list) - 1:
588 | record_end = self._key_list[i + 1][0]
589 | else:
590 | record_end = decompressed_size + offset
591 | index_dict['record_end'] = record_end
592 | i += 1
593 | if check_block:
594 | data = record_block[record_start - offset:record_end - offset]
595 | index_dict_list.append(index_dict)
596 | #yield key_text, data
597 | offset += decompressed_size
598 | size_counter += compressed_size
599 | assert(size_counter == record_block_size)
600 | f.close()
601 | return index_dict_list
602 |
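The point of the index entries built above is random access: with `file_pos`, `compressed_size` and the record offsets saved, a single resource can later be pulled out of the .mdd without re-parsing the whole file. A minimal sketch, under the assumption that the block is zlib-compressed (`record_block_type == 2`); `fetch_record` is hypothetical, not part of readmdict:

```python
import zlib

def fetch_record(fname, index_dict):
    with open(fname, 'rb') as f:
        f.seek(index_dict['file_pos'])
        block = f.read(index_dict['compressed_size'])
    record_block = zlib.decompress(block[8:])  # skip type + adler32 header
    start = index_dict['record_start'] - index_dict['offset']
    end = index_dict['record_end'] - index_dict['offset']
    return record_block[start:end]
```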
603 |
604 | class MDX(MDict):
605 | """
606 | MDict dictionary file format (*.MDX) reader.
607 | >>> mdx = MDX('example.mdx')
608 | >>> len(mdx)
609 | 42481
610 | >>> for key,value in mdx.items():
611 | ... print(key, value[:10])
612 | """
613 | def __init__(self, fname, encoding='', substyle=False, passcode=None):
614 | MDict.__init__(self, fname, encoding, passcode)
615 | self._substyle = substyle
616 |
617 | def items(self):
618 | """Return a generator which in turn produce tuples in the form of (key, value)
619 | """
620 | return self._decode_record_block()
621 |
622 | def _substitute_stylesheet(self, txt):
623 | # substitute stylesheet definition
624 | txt_list = re.split(r'`\d+`', txt)
625 | txt_tag = re.findall(r'`\d+`', txt)
626 | txt_styled = txt_list[0]
627 | for j, p in enumerate(txt_list[1:]):
628 | style = self._stylesheet[txt_tag[j][1:-1]]
629 | if p and p[-1] == '\n':
630 | txt_styled = txt_styled + style[0] + p.rstrip() + style[1] + '\r\n'
631 | else:
632 | txt_styled = txt_styled + style[0] + p + style[1]
633 | return txt_styled
634 |
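To see what `_substitute_stylesheet` does, here is the same logic run standalone on a made-up stylesheet entry (the `{'1': ('<b>', '</b>')}` pair is an assumption for illustration; real pairs come from the dictionary header):

```python
import re

stylesheet = {'1': ('<b>', '</b>')}    # hypothetical style pair
txt = '`1`word\n'
txt_list = re.split(r'`\d+`', txt)     # ['', 'word\n']
txt_tag = re.findall(r'`\d+`', txt)    # ['`1`']
styled = txt_list[0]
for j, p in enumerate(txt_list[1:]):
    opening, closing = stylesheet[txt_tag[j][1:-1]]
    if p and p[-1] == '\n':
        styled += opening + p.rstrip() + closing + '\r\n'
    else:
        styled += opening + p + closing
print(repr(styled))                    # '<b>word</b>\r\n'
```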
635 | def _decode_record_block(self):
636 | f = open(self._fname, 'rb')
637 | f.seek(self._record_block_offset)
638 |
639 | num_record_blocks = self._read_number(f)
640 | num_entries = self._read_number(f)
641 | assert(num_entries == self._num_entries)
642 | record_block_info_size = self._read_number(f)
643 | record_block_size = self._read_number(f)
644 |
645 | # record block info section
646 | record_block_info_list = []
647 | size_counter = 0
648 | for i in range(num_record_blocks):
649 | compressed_size = self._read_number(f)
650 | decompressed_size = self._read_number(f)
651 | record_block_info_list += [(compressed_size, decompressed_size)]
652 | size_counter += self._number_width * 2
653 | assert(size_counter == record_block_info_size)
654 |
655 | # actual record block data
656 | offset = 0
657 | i = 0
658 | size_counter = 0
659 | ### The final index table format:
660 | ### key_text (the key, also obtainable from the key list)
661 | ### file_pos (position where the record_block starts)
662 | ### compressed_size (size of the record_block before decompression)
663 | ### decompressed_size (size after decompression)
664 | ### record_block_type (compression type of the record_block)
665 | ### record_start (this and the next two are the parameters needed to pull one record out of the record_block; they can be stored as-is)
666 | ### record_end
667 | ### offset
668 | for compressed_size, decompressed_size in record_block_info_list:
669 | record_block_compressed = f.read(compressed_size)
670 | ###### recovering record_block_compressed requires compressed_size (which can be stored directly)
671 | ###### plus the current position of the file object f,
672 | ###### taken with f.tell(); rebuilding the index later needs f.seek()
673 | # 4 bytes indicates block compression type
674 | record_block_type = record_block_compressed[:4]
675 | # 4 bytes adler checksum of uncompressed content
676 | adler32 = unpack('>I', record_block_compressed[4:8])[0]
677 | # no compression
678 | if record_block_type == b'\x00\x00\x00\x00':
679 | record_block = record_block_compressed[8:]
680 | # lzo compression
681 | elif record_block_type == b'\x01\x00\x00\x00':
682 | if lzo is None:
683 | print("LZO compression is not supported")
684 | break
685 | # decompress
686 | header = b'\xf0' + pack('>I', decompressed_size)
687 | record_block = lzo.decompress(record_block_compressed[8:], initSize = decompressed_size, blockSize=1308672)
688 | # zlib compression
689 | elif record_block_type == b'\x02\x00\x00\x00':
690 | # decompress
691 | record_block = zlib.decompress(record_block_compressed[8:])
692 | ###### the crucial step is obtaining record_block, which is produced by decompression; there are three decompression paths in total
693 | ###### the information needed: record_block_compressed, decompressed_size,
694 | ###### and record_block_type,
695 | ###### plus the adler32 checksum for verification
696 | # notice that adler32 return signed value
697 | assert(adler32 == zlib.adler32(record_block) & 0xffffffff)
698 |
699 | assert(len(record_block) == decompressed_size)
700 | # split record block according to the offset info from key block
701 | while i < len(self._key_list):
702 | record_start, key_text = self._key_list[i]
703 | # reach the end of current record block
704 | if record_start - offset >= len(record_block):
705 | break
706 | # record end index
707 | if i < len(self._key_list) - 1:
708 | record_end = self._key_list[i + 1][0]
709 | else:
710 | record_end = len(record_block) + offset
711 | i += 1
712 | ############# need record_block, record_start, record_end,
713 | ############# and offset
714 | record = record_block[record_start - offset:record_end - offset]
715 | # convert to utf-8
716 | record = record.decode(self._encoding, errors='ignore').strip(u'\x00').encode('utf-8')
717 | # substitute styles
718 | ############# whether to substitute the stylesheet
719 | if self._substyle and self._stylesheet:
720 | record = self._substitute_stylesheet(record)
721 |
722 | yield key_text, record
723 | offset += len(record_block)
724 | size_counter += compressed_size
725 | assert(size_counter == record_block_size)
726 |
727 | f.close()
728 |
729 | ### Build the index list of the mdx file; each entry holds:
730 | ### key_text (the key, also obtainable from the key list)
731 | ### file_pos (position where the record_block starts)
732 | ### compressed_size (size of the record_block before decompression)
733 | ### decompressed_size (size after decompression)
734 | ### record_block_type (compression type of the record_block)
735 | ### record_start (this and the next two are the parameters needed to pull one record out of the record_block; they can be stored as-is)
736 | ### record_end
737 | ### offset
738 | ### plus the required metadata
739 | ###
740 | def get_index(self, check_block = True):
741 | ### the index list
742 | index_dict_list = []
743 | f = open(self._fname, 'rb')
744 | f.seek(self._record_block_offset)
745 |
746 | num_record_blocks = self._read_number(f)
747 | num_entries = self._read_number(f)
748 | assert(num_entries == self._num_entries)
749 | record_block_info_size = self._read_number(f)
750 | record_block_size = self._read_number(f)
751 |
752 | # record block info section
753 | record_block_info_list = []
754 | size_counter = 0
755 | for i in range(num_record_blocks):
756 | compressed_size = self._read_number(f)
757 | decompressed_size = self._read_number(f)
758 | record_block_info_list += [(compressed_size, decompressed_size)]
759 | size_counter += self._number_width * 2
760 | assert(size_counter == record_block_info_size)
761 |
762 | # actual record block data
763 | offset = 0
764 | i = 0
765 | size_counter = 0
766 | ### The final index table format:
767 | ### key_text (the key, also obtainable from the key list)
768 | ### file_pos (position where the record_block starts)
769 | ### compressed_size (size of the record_block before decompression)
770 | ### decompressed_size (size after decompression)
771 | ### record_block_type (compression type of the record_block)
772 | ### record_start (this and the next two are the parameters needed to pull one record out of the record_block; they can be stored as-is)
773 | ### record_end
774 | ### offset
775 | for compressed_size, decompressed_size in record_block_info_list:
776 | current_pos = f.tell()
777 | record_block_compressed = f.read(compressed_size)
778 | ###### recovering record_block_compressed requires compressed_size (which can be stored directly)
779 | ###### plus the current position of the file object f,
780 | ###### taken with f.tell(); rebuilding the index later needs f.seek()
781 | # 4 bytes indicates block compression type
782 | record_block_type = record_block_compressed[:4]
783 | # 4 bytes adler checksum of uncompressed content
784 | adler32 = unpack('>I', record_block_compressed[4:8])[0]
785 | # no compression
786 | if record_block_type == b'\x00\x00\x00\x00':
787 | _type = 0
788 | record_block = record_block_compressed[8:]
789 | # lzo compression
790 | elif record_block_type == b'\x01\x00\x00\x00':
791 | _type = 1
792 | if lzo is None:
793 | print("LZO compression is not supported")
794 | break
795 | # decompress
796 | header = b'\xf0' + pack('>I', decompressed_size)
797 | if check_block:
798 | record_block = lzo.decompress(record_block_compressed[8:], initSize = decompressed_size, blockSize=1308672)
799 | # zlib compression
800 | elif record_block_type == b'\x02\x00\x00\x00':
801 | # decompress
802 | _type = 2
803 | if check_block:
804 | record_block = zlib.decompress(record_block_compressed[8:])
805 | ###### the crucial step is obtaining record_block, which is produced by decompression; there are three decompression paths in total
806 | ###### the information needed: record_block_compressed, decompressed_size,
807 | ###### and record_block_type,
808 | ###### plus the adler32 checksum for verification
809 | # notice that adler32 return signed value
810 | if check_block:
811 | assert(adler32 == zlib.adler32(record_block) & 0xffffffff)
812 | assert(len(record_block) == decompressed_size)
813 | # split record block according to the offset info from key block
814 | while i < len(self._key_list):
815 | ### empty dict to hold the index info for this record
816 | index_dict = {}
817 | index_dict['file_pos'] = current_pos
818 | index_dict['compressed_size'] = compressed_size
819 | index_dict['decompressed_size'] = decompressed_size
820 | index_dict['record_block_type'] = _type
821 | record_start, key_text = self._key_list[i]
822 | index_dict['record_start'] = record_start
823 | index_dict['key_text'] = key_text.decode('utf-8')
824 | index_dict['offset'] = offset
825 | # reach the end of current record block
826 | if record_start - offset >= decompressed_size:
827 | break
828 | # record end index
829 | if i < len(self._key_list) - 1:
830 | record_end = self._key_list[i + 1][0]
831 | else:
832 | record_end = decompressed_size + offset
833 | index_dict['record_end'] = record_end
834 | i += 1
835 | ############# need record_block, record_start, record_end,
836 | ############# and offset
837 | if check_block:
838 | record = record_block[record_start - offset:record_end - offset]
839 | # convert to utf-8
840 | record = record.decode(self._encoding, errors='ignore').strip(u'\x00').encode('utf-8')
841 | # substitute styles
842 | ############# whether to substitute the stylesheet
843 | if self._substyle and self._stylesheet:
844 | record = self._substitute_stylesheet(record)
845 | index_dict_list.append(index_dict)
846 |
847 | offset += decompressed_size
848 | size_counter += compressed_size
849 | # todo: attention!!!
850 | #assert(size_counter == record_block_size)
851 | f.close()
852 | # slightly different from the mdd part: the encoding and the stylesheet info also need to be passed along
853 | meta = {}
854 | meta['encoding'] = self._encoding
855 | meta['stylesheet'] = json.dumps(self._stylesheet)
856 | meta['title'] = self._title
857 | meta['description'] = self._description
858 |
859 | return {"index_dict_list":index_dict_list, 'meta':meta}
860 | if __name__ == '__main__':
861 | import sys
862 | import os
863 | import os.path
864 | import argparse
865 | import codecs
866 |
867 | def passcode(s):
868 | try:
869 | regcode, userid = s.split(',')
870 | except:
871 | raise argparse.ArgumentTypeError("Passcode must be regcode,userid")
872 | try:
873 | regcode = codecs.decode(regcode, 'hex')
874 | except:
875 | raise argparse.ArgumentTypeError("regcode must be a 32 bytes hexadecimal string")
876 | return regcode, userid
877 |
878 | parser = argparse.ArgumentParser()
879 | parser.add_argument('-x', '--extract', action="store_true",
880 | help='extract mdx to source format and extract files from mdd')
881 | parser.add_argument('-s', '--substyle', action="store_true",
882 | help='substitute style definition if present')
883 | parser.add_argument('-d', '--datafolder', default="data",
884 | help='folder to extract data files from mdd')
885 | parser.add_argument('-e', '--encoding', default="",
886 | help='encoding of the mdx dictionary file')
887 | parser.add_argument('-p', '--passcode', default=None, type=passcode,
888 | help='register_code,email_or_deviceid')
889 | parser.add_argument("filename", nargs='?', help="mdx file name")
890 | args = parser.parse_args()
891 |
892 | # use GUI to select file, default to extract
893 | if not args.filename:
894 | import Tkinter
895 | import tkFileDialog
896 | root = Tkinter.Tk()
897 | root.withdraw()
898 | args.filename = tkFileDialog.askopenfilename(parent=root)
899 | args.extract = True
900 |
901 | if not os.path.exists(args.filename):
902 | print("Please specify a valid MDX/MDD file")
903 |
904 | base, ext = os.path.splitext(args.filename)
905 |
906 | # read mdx file
907 | if ext.lower() == os.path.extsep + 'mdx':
908 | mdx = MDX(args.filename, args.encoding, args.substyle, args.passcode)
909 | if type(args.filename) is unicode:
910 | bfname = args.filename.encode('utf-8')
911 | else:
912 | bfname = args.filename
913 | print('======== %s ========' % bfname)
914 | print(' Number of Entries : %d' % len(mdx))
915 | for key, value in mdx.header.items():
916 | print(' %s : %s' % (key, value))
917 | else:
918 | mdx = None
919 |
920 | # find companion mdd file
921 | mdd_filename = ''.join([base, os.path.extsep, 'mdd'])
922 | if os.path.exists(mdd_filename):
923 | mdd = MDD(mdd_filename, args.passcode)
924 | if type(mdd_filename) is unicode:
925 | bfname = mdd_filename.encode('utf-8')
926 | else:
927 | bfname = mdd_filename
928 | print('======== %s ========' % bfname)
929 | print(' Number of Entries : %d' % len(mdd))
930 | for key, value in mdd.header.items():
931 | print(' %s : %s' % (key, value))
932 | else:
933 | mdd = None
934 |
935 | if args.extract:
936 | # write out glos
937 | if mdx:
938 | output_fname = ''.join([base, os.path.extsep, 'txt'])
939 | tf = open(output_fname, 'wb')
940 | for key, value in mdx.items():
941 | tf.write(key)
942 | tf.write(b'\r\n')
943 | tf.write(value)
944 | if not value.endswith(b'\n'):
945 | tf.write(b'\r\n')
946 | tf.write(b'>\r\n')
947 | tf.close()
948 | # write out style
949 | if mdx.header.get('StyleSheet'):
950 | style_fname = ''.join([base, '_style', os.path.extsep, 'txt'])
951 | sf = open(style_fname, 'wb')
952 | sf.write(b'\r\n'.join(mdx.header['StyleSheet'].splitlines()))
953 | sf.close()
954 | # write out optional data files
955 | if mdd:
956 | datafolder = os.path.join(os.path.dirname(args.filename), args.datafolder)
957 | if not os.path.exists(datafolder):
958 | os.makedirs(datafolder)
959 | for key, value in mdd.items():
960 | fname = key.decode('utf-8').replace('\\', os.path.sep)
961 | dfname = datafolder + fname
962 | if not os.path.exists(os.path.dirname(dfname)):
963 | os.makedirs(os.path.dirname(dfname))
964 | df = open(dfname, 'wb')
965 | df.write(value)
966 | df.close()
967 |
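As a usage note: running, say, `python readmdict.py -x example.mdx` under Python 2 (which the `Tkinter` and `unicode` branches above assume) prints the header fields, dumps the entries to `example.txt`, and extracts any companion `example.mdd` into the folder given by `-d` (default `data`).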
--------------------------------------------------------------------------------
/ripemd128.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright by https://github.com/zhansliu/writemdict
3 |
4 | ripemd128.py - A simple ripemd128 library in pure Python.
5 |
6 | Supports both Python 2 (versions >= 2.6) and Python 3.
7 |
8 | Usage:
9 | from ripemd128 import ripemd128
10 | digest = ripemd128(b"The quick brown fox jumps over the lazy dog")
11 | assert(digest == b"\x3f\xa9\xb5\x7f\x05\x3c\x05\x3f\xbe\x27\x35\xb2\x38\x0d\xb5\x96")
12 |
13 | """
14 |
15 |
16 |
17 | import struct
18 |
19 |
20 | # follows this description: http://homes.esat.kuleuven.be/~bosselae/ripemd/rmd128.txt
21 |
22 | def f(j, x, y, z):
23 | assert(0 <= j and j < 64)
24 | if j < 16:
25 | return x ^ y ^ z
26 | elif j < 32:
27 | return (x & y) | (z & ~x)
28 | elif j < 48:
29 | return (x | (0xffffffff & ~y)) ^ z
30 | else:
31 | return (x & z) | (y & ~z)
32 |
33 | def K(j):
34 | assert(0 <= j and j < 64)
35 | if j < 16:
36 | return 0x00000000
37 | elif j < 32:
38 | return 0x5a827999
39 | elif j < 48:
40 | return 0x6ed9eba1
41 | else:
42 | return 0x8f1bbcdc
43 |
44 | def Kp(j):
45 | assert(0 <= j and j < 64)
46 | if j < 16:
47 | return 0x50a28be6
48 | elif j < 32:
49 | return 0x5c4dd124
50 | elif j < 48:
51 | return 0x6d703ef3
52 | else:
53 | return 0x00000000
54 |
55 | def padandsplit(message):
56 | """
57 | returns a two-dimensional array X[i][j] of 32-bit integers, where j ranges
58 | from 0 to 15.
59 | First pads the message so that its length in bytes is congruent to 56
60 | (mod 64), by adding a byte 0x80 and then padding with 0x00 bytes until
61 | that length is reached. Then appends the little-endian
62 | 64-bit representation of the original length. Finally, splits the result
63 | up into 64-byte blocks, which are further parsed as 32-bit integers.
64 | """
65 | origlen = len(message)
66 | padlength = 64 - ((origlen - 56) % 64) #minimum padding is 1!
67 | message += b"\x80"
68 | message += b"\x00" * (padlength - 1)
69 | message += struct.pack("<Q", 8 * origlen)
70 |
71 | blocks = []
72 | for i in range(0, len(message), 64):
73 | block = []
74 | for j in range(0, 64, 4):
75 | block.append(struct.unpack("<L", message[i+j:i+j+4])[0])
76 | blocks.append(block)
77 | return blocks
78 |
79 |
80 | def add(*args):
81 | return sum(args) & 0xffffffff
82 |
83 |
84 | def rol(s, x):
85 | return (x << s | x >> (32-s)) & 0xffffffff
86 |
87 | r = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
88 | 7, 4,13, 1,10, 6,15, 3,12, 0, 9, 5, 2,14,11, 8,
89 | 3,10,14, 4, 9,15, 8, 1, 2, 7, 0, 6,13,11, 5,12,
90 | 1, 9,11,10, 0, 8,12, 4,13, 3, 7,15,14, 5, 6, 2]
91 | rp = [ 5,14, 7, 0, 9, 2,11, 4,13, 6,15, 8, 1,10, 3,12,
92 | 6,11, 3, 7, 0,13, 5,10,14,15, 8,12, 4, 9, 1, 2,
93 | 15, 5, 1, 3, 7,14, 6, 9,11, 8,12, 2,10, 0, 4,13,
94 | 8, 6, 4, 1, 3,11,15, 0, 5,12, 2,13, 9, 7,10,14]
95 | s = [11,14,15,12, 5, 8, 7, 9,11,13,14,15, 6, 7, 9, 8,
96 | 7, 6, 8,13,11, 9, 7,15, 7,12,15, 9,11, 7,13,12,
97 | 11,13, 6, 7,14, 9,13,15,14, 8,13, 6, 5,12, 7, 5,
98 | 11,12,14,15,14,15, 9, 8, 9,14, 5, 6, 8, 6, 5,12]
99 | sp = [ 8, 9, 9,11,13,15,15, 5, 7, 7, 8,11,14,14,12, 6,
100 | 9,13,15, 7,12, 8, 9,11, 7, 7,12, 7, 6,15,13,11,
101 | 9, 7,15,11, 8, 6, 6,14,12,13, 5,14,13,13, 7, 5,
102 | 15, 5, 8,11,14,14, 6,14, 6, 9,12, 9,12, 5,15, 8]
103 |
104 |
105 | def ripemd128(message):
106 | h0 = 0x67452301
107 | h1 = 0xefcdab89
108 | h2 = 0x98badcfe
109 | h3 = 0x10325476
110 | X = padandsplit(message)
111 | for i in range(len(X)):
112 | (A,B,C,D) = (h0,h1,h2,h3)
113 | (Ap,Bp,Cp,Dp) = (h0,h1,h2,h3)
114 | for j in range(64):
115 | T = rol(s[j], add(A, f(j,B,C,D), X[i][r[j]], K(j)))
116 | (A,D,C,B) = (D,C,B,T)
117 | T = rol(sp[j], add(Ap, f(63-j,Bp,Cp,Dp), X[i][rp[j]], Kp(j)))
118 | (Ap,Dp,Cp,Bp)=(Dp,Cp,Bp,T)
119 | T = add(h1,C,Dp)
120 | h1 = add(h2,D,Ap)
121 | h2 = add(h3,A,Bp)
122 | h3 = add(h0,B,Cp)
123 | h0 = T
124 |
125 |
126 | return struct.pack("<LLLL", h0, h1, h2, h3)
--------------------------------------------------------------------------------
/templates/all.html:
--------------------------------------------------------------------------------
[HTML tags lost in extraction: a Jinja2 page titled "All Available Dictionary" that renders one entry per item via {% for item in dicts %} ... {% endfor %}.]
12 |