├── .gitattributes
├── .gitignore
├── README.md
├── cache
│   └── o3.js
├── lzo.py
├── mdict-query.pyproj
├── mdict-query.sln
├── mdict_dir.py
├── mdict_query.py
├── mdx
│   └── drop mdict files here.txt
├── pureSalsa20.py
├── readmdict.py
├── ripemd128.py
├── static
│   └── cache here.txt
├── templates
│   ├── all.html
│   ├── dict.html
│   └── entry.html
├── test.py
├── test_lzo.py
├── web.py
├── web.spec
└── wsgi.py
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
4 | # Custom for Visual Studio
5 | *.cs diff=csharp
6 |
7 | # Standard to msysgit
8 | *.doc diff=astextplain
9 | *.DOC diff=astextplain
10 | *.docx diff=astextplain
11 | *.DOCX diff=astextplain
12 | *.dot diff=astextplain
13 | *.DOT diff=astextplain
14 | *.pdf diff=astextplain
15 | *.PDF diff=astextplain
16 | *.rtf diff=astextplain
17 | *.RTF diff=astextplain
18 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | #################
2 | ## Eclipse
3 | #################
4 |
5 | *.pydevproject
6 | .project
7 | .metadata
8 | bin/
9 | tmp/
10 | *.tmp
11 | *.bak
12 | *.swp
13 | *~.nib
14 | local.properties
15 | .classpath
16 | .settings/
17 | .loadpath
18 | *.mdd
19 | *.mdx
20 | *.db
21 | *.jpg
22 | *.png
23 | *.gif
24 | *.mp3
25 | *.css
26 | mdx/*
27 | static/*
28 |
29 | # External tool builders
30 | .externalToolBuilders/
31 |
32 | # Locally stored "Eclipse launch configurations"
33 | *.launch
34 |
35 | # CDT-specific
36 | .cproject
37 |
38 | # PDT-specific
39 | .buildpath
40 |
41 |
42 | #################
43 | ## Visual Studio
44 | #################
45 |
46 | ## Ignore Visual Studio temporary files, build results, and
47 | ## files generated by popular Visual Studio add-ons.
48 |
49 | # User-specific files
50 | *.suo
51 | *.user
52 | *.sln.docstates
53 |
54 | # Build results
55 |
56 | [Dd]ebug/
57 | [Rr]elease/
58 | x64/
59 | build/
60 | [Bb]in/
61 | [Oo]bj/
62 |
63 | # MSTest test Results
64 | [Tt]est[Rr]esult*/
65 | [Bb]uild[Ll]og.*
66 |
67 | *_i.c
68 | *_p.c
69 | *.ilk
70 | *.meta
71 | *.obj
72 | *.pch
73 | *.pdb
74 | *.pgc
75 | *.pgd
76 | *.rsp
77 | *.sbr
78 | *.tlb
79 | *.tli
80 | *.tlh
81 | *.tmp
82 | *.tmp_proj
83 | *.log
84 | *.vspscc
85 | *.vssscc
86 | .builds
87 | *.pidb
88 | *.log
89 | *.scc
90 |
91 | # Visual C++ cache files
92 | ipch/
93 | *.aps
94 | *.ncb
95 | *.opensdf
96 | *.sdf
97 | *.cachefile
98 |
99 | # Visual Studio profiler
100 | *.psess
101 | *.vsp
102 | *.vspx
103 |
104 | # Guidance Automation Toolkit
105 | *.gpState
106 |
107 | # ReSharper is a .NET coding add-in
108 | _ReSharper*/
109 | *.[Rr]e[Ss]harper
110 |
111 | # TeamCity is a build add-in
112 | _TeamCity*
113 |
114 | # DotCover is a Code Coverage Tool
115 | *.dotCover
116 |
117 | # NCrunch
118 | *.ncrunch*
119 | .*crunch*.local.xml
120 |
121 | # Installshield output folder
122 | [Ee]xpress/
123 |
124 | # DocProject is a documentation generator add-in
125 | DocProject/buildhelp/
126 | DocProject/Help/*.HxT
127 | DocProject/Help/*.HxC
128 | DocProject/Help/*.hhc
129 | DocProject/Help/*.hhk
130 | DocProject/Help/*.hhp
131 | DocProject/Help/Html2
132 | DocProject/Help/html
133 |
134 | # Click-Once directory
135 | publish/
136 |
137 | # Publish Web Output
138 | *.Publish.xml
139 | *.pubxml
140 | *.publishproj
141 |
142 | # NuGet Packages Directory
143 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line
144 | #packages/
145 |
146 | # Windows Azure Build Output
147 | csx
148 | *.build.csdef
149 |
150 | # Windows Store app package directory
151 | AppPackages/
152 |
153 | # Others
154 | sql/
155 | *.Cache
156 | ClientBin/
157 | [Ss]tyle[Cc]op.*
158 | ~$*
159 | *~
160 | *.dbmdl
161 | *.[Pp]ublish.xml
162 | *.pfx
163 | *.publishsettings
164 |
165 | # RIA/Silverlight projects
166 | Generated_Code/
167 |
168 | # Backup & report files from converting an old project file to a newer
169 | # Visual Studio version. Backup files are not needed, because we have git ;-)
170 | _UpgradeReport_Files/
171 | Backup*/
172 | UpgradeLog*.XML
173 | UpgradeLog*.htm
174 |
175 | # SQL Server files
176 | App_Data/*.mdf
177 | App_Data/*.ldf
178 |
179 | #############
180 | ## Windows detritus
181 | #############
182 |
183 | # Windows image file caches
184 | Thumbs.db
185 | ehthumbs.db
186 |
187 | # Folder config file
188 | Desktop.ini
189 |
190 | # Recycle Bin used on file shares
191 | $RECYCLE.BIN/
192 |
193 | # Mac crap
194 | .DS_Store
195 |
196 |
197 | #############
198 | ## Python
199 | #############
200 |
201 | *.py[cod]
202 |
203 | # Packages
204 | *.egg
205 | *.egg-info
206 | dist/
207 | build/
208 | eggs/
209 | parts/
210 | var/
211 | sdist/
212 | develop-eggs/
213 | .installed.cfg
214 |
215 | # Installer logs
216 | pip-log.txt
217 |
218 | # Unit test / coverage reports
219 | .coverage
220 | .tox
221 |
222 | #Translations
223 | *.mo
224 |
225 | #Mr Developer
226 | .mr.developer.cfg
227 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This is a Python module for looking up entries in `mdict` dictionary files (`.mdx` and `.mdd`). A function for converting `mdx` to `sqlite` is also included.
2 |
3 | >>Based on [readmdict](https://bitbucket.org/xwang/mdict-analysis) by [Xiaoqiang Wang](http://bitbucket.org/xwang/).
4 |
5 | While this project is a trivial extension of the [original module](https://bitbucket.org/xwang/mdict-analysis), it adds the ability to look up a single entry in an `.mdx` file, or a single resource in an `.mdd` file, without extracting all content, which may be helpful in other projects that require dictionaries.
6 |
7 | ## Usage
8 |
9 | Construct an `IndexBuilder` object, which builds the sqlite index for the `.mdx` file and for the corresponding `.mdd` file (if it exists).
10 |
11 | from mdict_query import IndexBuilder
12 | builder = IndexBuilder('ode.mdx')
13 |
14 | Convert `mdx` to `sqlite`:
15 | ```
16 | builder.make_sqlite()
17 | # Check the output file `ode.mdx.sqlite.db` near your `ode.mdx`
18 | ```
19 |
20 |
21 | Get all mdx keys:
22 |
23 | builder.get_mdx_keys()
24 | # ==> ['key1', 'key2', 'key3', ...]
25 |
26 | Filter mdx keys by wildcard:
27 |
28 | builder.get_mdx_keys('dedicat*')
29 | # ==> ['dedicate', 'dedication', ...]
30 |
31 | Look up an mdx entry by key:
32 |
33 | result_text = builder.mdx_lookup('dedication')
34 |
35 | There is an option to ignore case:
36 |
37 | result_text = builder.mdx_lookup('Dedication', ignorecase = True)
38 |
39 | Get all mdd keys:
40 |
41 | builder.get_mdd_keys()
42 | # ==> ['key1', 'key2', 'key3', ...]
43 |
44 | Filter mdd keys by wildcard:
45 |
46 | builder.get_mdd_keys('*.css')
47 | # ==> ['/style.css', ...]
48 |
49 | Look up an mdd resource by key:
50 |
51 | bytes_list = builder.mdd_lookup('/style.css')
52 | # bytes_list is a list of bytes objects, one per record stored in the mdd under this key
53 |
54 |
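55 | ## Example
56 | 
57 | Putting it all together (a minimal sketch; it assumes an `ode.mdx`, and optionally an `ode.mdd`, in the working directory):
58 | 
59 | ```
60 | from mdict_query import IndexBuilder
61 | 
62 | builder = IndexBuilder('ode.mdx')
63 | 
64 | # print every entry whose headword starts with "dedicat"
65 | for key in builder.get_mdx_keys('dedicat*'):
66 |     for definition in builder.mdx_lookup(key):
67 |         print(key, definition[:60])
68 | 
69 | # extract a stylesheet stored in the .mdd (if present)
70 | if '/style.css' in builder.get_mdd_keys('*.css'):
71 |     with open('style.css', 'wb') as out:
72 |         out.write(builder.mdd_lookup('/style.css')[0])
73 | ```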
--------------------------------------------------------------------------------
/cache/o3.js:
--------------------------------------------------------------------------------
1 | var o0e=(function(){return{e:function(c,d){var n=d==2?c.nextSibling:c.parentNode.nextSibling;if(!d)n=n.childNodes[0];var s=n.style;if(s.display!="block")s.display="block";else s.display="none";},a:function(c,d,f){c.removeAttribute("onclick");var s=c.style;s.cursor="default";s.outline="1px dotted gray";var m=/([^//]+)$/.exec(f);
2 | if(m){var u="http://audio.oxforddictionaries.com/en/mp3/"+m[0].replace('__','_')+".mp3";var b=function(){s.outline="";s.cursor="pointer";c.setAttribute("onclick","o0e.a(this,"+d+",'"+f+"')");};var t=setTimeout(b,2000);try{with(document.createElement("audio")){setAttribute("src",u);onloadstart=function(){clearTimeout(t);};onended=b;play();}}catch(e){c.style.outline="";}}},x:function(c){var s=c.parentNode.nextSibling.style;if(s.display!="none"){s.display="none";c.className="yuq";}else{s.display="block";c.className="aej";}},p:function(c){if(c.className=="j02")c.className="g4p";else c.className="j02";}}}());
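3 | // Summary of the minified helper above: o0e.e toggles the display of a
4 | // neighbouring block; o0e.a disables the clicked element, builds an mp3 URL
5 | // under audio.oxforddictionaries.com from the last path segment of its
6 | // argument, plays it, and restores the element when playback ends or after a
7 | // 2-second timeout; o0e.x and o0e.p toggle CSS class names.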
--------------------------------------------------------------------------------
/lzo.py:
--------------------------------------------------------------------------------
1 | # Pure-Python LZO1X decompression (decompress only); readmdict.py falls back
2 | # to this module for mdict files created by engine versions < 2.0.
3 | import math
4 | class FlexBuffer():
5 |
6 | def __init__(self):
7 |
8 | self.blockSize = None
9 | self.c = None
10 | self.l = None
11 | self.buf = None
12 |
13 | def require(self, n):
14 |
15 | r = self.c - self.l + n
16 | if r > 0:
17 | self.l = self.l + self.blockSize * math.ceil(r / self.blockSize)
18 | #tmp = bytearray(self.l)
19 | #for i in len(self.buf):
20 | # tmp[i] = self.buf[i]
21 | #self.buf = tmp
22 | self.buf = self.buf + bytearray(self.l - len(self.buf))
23 | self.c = self.c + n
24 | return self.buf
25 |
26 | def alloc(self, initSize, blockSize):
27 |
28 | if blockSize:
29 | sz = blockSize
30 | else:
31 | sz = 4096
32 | self.blockSize = self.roundUp(sz)
33 | self.c = 0
34 |         self.l = self.roundUp(initSize)
35 | self.l += self.blockSize - (self.l % self.blockSize)
36 | self.buf = bytearray(self.l)
37 | return self.buf
38 |
39 | def roundUp(self, n):
40 |
41 | r = n % 4
42 | if r == 0:
43 | return n
44 | else:
45 | return n + 4 - r
46 |
47 | def reset(self):
48 |
49 | self.c = 0
50 | self.l = len(self.buf)
51 |
52 | def pack(self, size):
53 |
54 | return self.buf[0:size]
55 |
56 | def _decompress(inBuf, outBuf):
57 |
58 | c_top_loop = 1
59 | c_first_literal_run = 2
60 | c_match = 3
61 | c_copy_match = 4
62 | c_match_done = 5
63 | c_match_next = 6
64 |
65 | out = outBuf.buf
66 | op = 0
67 | ip = 0
68 | t = inBuf[ip]
69 | state = c_top_loop
70 | m_pos = 0
71 | ip_end = len(inBuf)
72 |
73 | if t > 17:
74 | ip = ip + 1
75 | t = t - 17
76 | if t < 4:
77 | state = c_match_next
78 | else:
79 | out = outBuf.require(t)
80 | while True:
81 | out[op] = inBuf[ip]
82 | op = op + 1
83 | ip = ip + 1
84 | t = t - 1
85 | if not t > 0: break
86 | state = c_first_literal_run
87 |
88 | while True:
89 | if_block = False
90 |
91 | ##
92 | if state == c_top_loop:
93 | t = inBuf[ip]
94 | ip = ip + 1
95 | if t >= 16:
96 | state = c_match
97 | continue
98 | if t == 0:
99 | while inBuf[ip] == 0:
100 | t = t + 255
101 | ip = ip + 1
102 | t = t + 15 + inBuf[ip]
103 | ip = ip + 1
104 |
105 | t = t + 3
106 | out = outBuf.require(t)
107 | while True:
108 | out[op] = inBuf[ip]
109 | op = op + 1
110 | ip = ip + 1
111 | t = t - 1
112 | if not t > 0: break
113 | # emulate c switch
114 | state = c_first_literal_run
115 |
116 | ##
117 | if state == c_first_literal_run:
118 | t = inBuf[ip]
119 | ip = ip + 1
120 | if t >= 16:
121 | state = c_match
122 | continue
123 | m_pos = op - 0x801 - (t >> 2) - (inBuf[ip] << 2)
124 | ip = ip + 1
125 | out = outBuf.require(3)
126 | out[op] = out[m_pos]
127 | op = op + 1
128 | m_pos = m_pos + 1
129 | out[op] = out[m_pos]
130 | op = op + 1
131 | m_pos = m_pos + 1
132 | out[op] = out[m_pos]
133 | op = op + 1
134 |
135 | state = c_match_done
136 | continue
137 |
138 | ##
139 | if state == c_match:
140 | if t >= 64:
141 | m_pos = op - 1 - ((t >> 2) & 7) - (inBuf[ip] << 3)
142 | ip = ip + 1
143 | t = (t >> 5) - 1
144 | state = c_copy_match
145 | continue
146 | elif t >= 32:
147 | t = t & 31
148 | if t == 0:
149 | while inBuf[ip] == 0:
150 | t = t + 255
151 | ip = ip + 1
152 | t = t + 31 + inBuf[ip]
153 | ip = ip + 1
154 | m_pos = op - 1 - ((inBuf[ip] + (inBuf[ip + 1] << 8)) >> 2)
155 | ip = ip + 2
156 | elif t >= 16:
157 | m_pos = op - ((t & 8) << 11)
158 | t = t & 7
159 | if t == 0:
160 | while inBuf[ip] == 0:
161 | t = t + 255
162 | ip = ip + 1
163 | t = t + 7 + inBuf[ip]
164 | ip = ip + 1
165 | m_pos = m_pos - ((inBuf[ip] + (inBuf[ip + 1] << 8)) >> 2)
166 | ip = ip + 2
167 | if m_pos == op:
168 | break
169 | m_pos = m_pos - 0x4000
170 | else:
171 |                 m_pos = op - 1 - (t >> 2) - (inBuf[ip] << 2)
172 | ip = ip + 1
173 | out = outBuf.require(2)
174 | out[op] = out[m_pos]
175 | op = op + 1
176 | m_pos = m_pos + 1
177 | out[op] = out[m_pos]
178 | op = op + 1
179 | state = c_match_done
180 | continue
181 |
182 | if t >= 6 and (op - m_pos) >= 4:
183 | if_block = True
184 | t += 2
185 | out = outBuf.require(t)
186 | while True:
187 | out[op] = out[m_pos]
188 | op += 1
189 | m_pos += 1
190 | t -= 1
191 | if not t > 0: break
192 | #emulate c switch
193 | state = c_copy_match
194 |
195 | ##
196 | if state == c_copy_match:
197 | if not if_block:
198 | t += 2
199 | out = outBuf.require(t)
200 | while True:
201 | out[op] = out[m_pos]
202 | op += 1
203 | m_pos += 1
204 | t -= 1
205 | if not t > 0: break
206 | #emulating c switch
207 | state = c_match_done
208 |
209 | ##
210 | if state == c_match_done:
211 | t = inBuf[ip - 2] & 3
212 | if t == 0:
213 | state = c_top_loop
214 | continue
215 | #emulate c switch
216 | state = c_match_next
217 |
218 | ##
219 | if state == c_match_next:
220 | out = outBuf.require(1)
221 | out[op] = inBuf[ip]
222 | op += 1
223 | ip += 1
224 | if t > 1:
225 | out = outBuf.require(1)
226 | out[op] = inBuf[ip]
227 | op += 1
228 | ip += 1
229 | if t > 2:
230 | out = outBuf.require(1)
231 | out[op] = inBuf[ip]
232 | op += 1
233 | ip += 1
234 | t = inBuf[ip]
235 | ip += 1
236 | state = c_match
237 | continue
238 |
239 | return bytes(outBuf.pack(op))
240 |
241 | def decompress(input, initSize=16000, blockSize=8192):
242 |     """Decompress one LZO1X block. initSize seeds the output buffer, which
243 |     then grows in blockSize increments as needed (readmdict.py passes the
244 |     known decompressed size, so the buffer is usually allocated once)."""
245 |     output = FlexBuffer()
246 |     output.alloc(initSize, blockSize)
247 |     return _decompress(bytearray(input), output)
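248 | # Usage sketch (mirrors the call sites in readmdict.py); `payload` is assumed
249 | # to be the compressed data that follows an mdict block's 8-byte
250 | # type/checksum header:
251 | #
252 | #     import lzo
253 | #     plain = lzo.decompress(payload, initSize=decompressed_size, blockSize=1308672)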
--------------------------------------------------------------------------------
/mdict-query.pyproj:
--------------------------------------------------------------------------------
 1 | <?xml version="1.0" encoding="utf-8"?>
 2 | <Project DefaultTargets="Build" xmlns="http://schemas.microsoft.com/developer/msbuild/2003" ToolsVersion="4.0">
 3 |   <PropertyGroup>
 4 |     <Configuration Condition=" '$(Configuration)' == '' ">Debug</Configuration>
 5 |     <SchemaVersion>2.0</SchemaVersion>
 6 |     <ProjectGuid>{f227ad7e-74e4-4364-ad90-d2b3dda5abf6}</ProjectGuid>
 7 |     <ProjectHome />
 8 |     <StartupFile>test_lzo.py</StartupFile>
 9 |     <SearchPath />
10 |     <WorkingDirectory>.</WorkingDirectory>
11 |     <OutputPath>.</OutputPath>
12 |     <ProjectTypeGuids>{888888a0-9f3d-457c-b088-3a5042f75d52}</ProjectTypeGuids>
13 |     <LaunchProvider>Standard Python launcher</LaunchProvider>
14 |     <InterpreterId>{9a7a9026-48c1-4688-9d5d-e5699d47d074}</InterpreterId>
15 |     <InterpreterVersion>3.5</InterpreterVersion>
16 |     <IsWindowsApplication>False</IsWindowsApplication>
17 |   </PropertyGroup>
18 |   <PropertyGroup Condition=" '$(Configuration)' == 'Debug' " />
19 |   <PropertyGroup Condition=" '$(Configuration)' == 'Release' " />
20 |   <PropertyGroup>
21 |     <VisualStudioVersion Condition=" '$(VisualStudioVersion)' == '' ">10.0</VisualStudioVersion>
22 |     <PtvsTargetsFile>$(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets</PtvsTargetsFile>
23 |   </PropertyGroup>
24 |   <ItemGroup>
25 |     <Compile Include="lzo.py" />
26 |     <Compile Include="mdict_query.py" />
27 |     <Compile Include="pureSalsa20.py" />
28 |     <Compile Include="readmdict.py" />
29 |     <Compile Include="ripemd128.py" />
30 |     <Compile Include="test_lzo.py">
31 |       <SubType>Code</SubType>
32 |     </Compile>
33 |     <Compile Include="test.py" />
34 |   </ItemGroup>
35 |   <Import Project="$(PtvsTargetsFile)" Condition="Exists($(PtvsTargetsFile))" />
36 |   <Import Project="$(MSBuildToolsPath)\Microsoft.Common.targets" Condition="!Exists($(PtvsTargetsFile))" />
37 | </Project>
--------------------------------------------------------------------------------
/mdict-query.sln:
--------------------------------------------------------------------------------
1 |
2 | Microsoft Visual Studio Solution File, Format Version 12.00
3 | # Visual Studio 14
4 | VisualStudioVersion = 14.0.25420.1
5 | MinimumVisualStudioVersion = 10.0.40219.1
6 | Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "mdict-query", "mdict-query.pyproj", "{F227AD7E-74E4-4364-AD90-D2B3DDA5ABF6}"
7 | EndProject
8 | Global
9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution
10 | Debug|Any CPU = Debug|Any CPU
11 | Release|Any CPU = Release|Any CPU
12 | EndGlobalSection
13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution
14 | {F227AD7E-74E4-4364-AD90-D2B3DDA5ABF6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU
15 | {F227AD7E-74E4-4364-AD90-D2B3DDA5ABF6}.Release|Any CPU.ActiveCfg = Release|Any CPU
16 | EndGlobalSection
17 | GlobalSection(SolutionProperties) = preSolution
18 | HideSolutionNode = FALSE
19 | EndGlobalSection
20 | EndGlobal
21 |
--------------------------------------------------------------------------------
/mdict_dir.py:
--------------------------------------------------------------------------------
1 | from mdict_query import IndexBuilder
2 | import os
3 | import json
4 |
5 |
6 | class Dir(object):
7 |
8 | def __init__(self, mdict_dir, config_name = 'config.json'):
9 |
10 | assert(os.path.isdir(mdict_dir))
11 | self._mdict_dir = mdict_dir
12 | self._config_file_base_name = config_name
13 | self._config = {}
14 | #check config.json
15 | self._config_file = os.path.join(mdict_dir, self._config_file_base_name)
16 |
17 | if os.path.exists(self._config_file):
18 | self._ensure_config_consistency()
19 | self._load_config()
20 | self._add_builder()
22 | else:
23 | self._build_index()
24 | self._make_config()
25 | self._dump_config()
26 | self._add_builder()
28 |
29 | def _add_builder(self):
30 |
31 |         for dic in self._config['dicts']:
32 |             dic['builder'] = IndexBuilder(dic['mdx_name'])
33 |
34 |
35 | def _load_config(self):
36 |
37 |         with open(self._config_file, 'r', encoding='utf-8') as file_opened:
38 |             self._config = json.load(file_opened)
40 |
41 |
42 | def _build_index(self):
43 |
44 | dict_list = []
45 | files_in_dir = os.listdir(self._mdict_dir)
46 | for item in files_in_dir:
47 | full_name = os.path.join(self._mdict_dir, item)
48 | print(full_name)
49 | if os.path.isfile(full_name):
50 | _filename, _file_extension = os.path.splitext(full_name)
51 | if _file_extension == '.mdx':
52 | _config_single_dic = {
53 | 'title': '',
54 | 'description':'',
55 | 'mdx_name': full_name,
56 | 'has_mdd': os.path.isfile(_filename + '.mdd')
57 | }
58 | try:
59 | ib = IndexBuilder(full_name)
60 | except Exception:
61 | continue
62 | _config_single_dic['title'] = ib._title
63 | _config_single_dic['description'] = ib._description
64 | dict_list.append(_config_single_dic)
65 | self._config['dicts'] = dict_list
66 |
67 | def _make_config(self):
68 | pass
69 |
70 | def _dump_config(self):
71 |
72 |         with open(self._config_file, 'w', encoding='utf-8') as file_opened:
73 |             json.dump(self._config, file_opened, ensure_ascii=False, indent=True)
75 |
76 | #todo: implement ensure consistency
77 | def _ensure_config_consistency(self):
78 | pass
79 |
80 | if __name__ == '__main__':
81 |     Dir('mdx')
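82 | 
83 | # The generated config.json has this shape (field values illustrative; the
84 | # structure follows _build_index above):
85 | #
86 | # {
87 | #     "dicts": [
88 | #         {
89 | #             "title": "Example Dictionary",
90 | #             "description": "",
91 | #             "mdx_name": "mdx/example.mdx",
92 | #             "has_mdd": true
93 | #         }
94 | #     ]
95 | # }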
--------------------------------------------------------------------------------
/mdict_query.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | from .readmdict import MDX, MDD
5 | from struct import pack, unpack
6 | from io import BytesIO
7 | import re
8 | import sys
9 | import os
10 | import sqlite3
11 | import json
12 |
13 | # zlib compression is used for engine version >=2.0
14 | import zlib
15 | # LZO compression is used for engine version < 2.0
16 | try:
17 | import lzo
18 | except ImportError:
19 | lzo = None
20 | #print("LZO compression support is not available")
21 |
22 | # 2x3 compatible
23 | if sys.hexversion >= 0x03000000:
24 | unicode = str
25 |
26 | version = '1.1'
27 |
28 |
29 | class IndexBuilder(object):
30 | #todo: enable history
31 | def __init__(self, fname, encoding = "", passcode = None, force_rebuild = False, enable_history = False, sql_index = True, check = False):
32 | self._mdx_file = fname
33 | self._mdd_file = ""
34 | self._encoding = ''
35 | self._stylesheet = {}
36 | self._title = ''
37 | self._version = ''
38 | self._description = ''
39 | self._sql_index = sql_index
40 | self._check = check
41 | _filename, _file_extension = os.path.splitext(fname)
42 | assert(_file_extension == '.mdx')
43 | assert(os.path.isfile(fname))
44 | self._mdx_db = _filename + ".mdx.db"
45 | # make index anyway
46 | if force_rebuild:
47 | self._make_mdx_index(self._mdx_db)
48 | if os.path.isfile(_filename + '.mdd'):
49 | self._mdd_file = _filename + ".mdd"
50 | self._mdd_db = _filename + ".mdd.db"
51 | self._make_mdd_index(self._mdd_db)
52 |
53 | if os.path.isfile(self._mdx_db):
54 | #read from META table
55 | conn = sqlite3.connect(self._mdx_db)
56 | #cursor = conn.execute("SELECT * FROM META")
57 | cursor = conn.execute("SELECT * FROM META WHERE key = \"version\"")
58 |             # check whether the version info is present
59 | for cc in cursor:
60 | self._version = cc[1]
61 |             ################# if no version info is found #############
62 | if not self._version:
63 | print("version info not found")
64 | conn.close()
65 | self._make_mdx_index(self._mdx_db)
66 | print("mdx.db rebuilt!")
67 | if os.path.isfile(_filename + '.mdd'):
68 | self._mdd_file = _filename + ".mdd"
69 | self._mdd_db = _filename + ".mdd.db"
70 | self._make_mdd_index(self._mdd_db)
71 | print("mdd.db rebuilt!")
72 | return None
73 | cursor = conn.execute("SELECT * FROM META WHERE key = \"encoding\"")
74 | for cc in cursor:
75 | self._encoding = cc[1]
76 | cursor = conn.execute("SELECT * FROM META WHERE key = \"stylesheet\"")
77 | for cc in cursor:
78 | self._stylesheet = json.loads(cc[1])
79 |
80 | cursor = conn.execute("SELECT * FROM META WHERE key = \"title\"")
81 | for cc in cursor:
82 | self._title = cc[1]
83 |
84 | cursor = conn.execute("SELECT * FROM META WHERE key = \"description\"")
85 | for cc in cursor:
86 | self._description = cc[1]
87 |
88 | #for cc in cursor:
89 | # if cc[0] == 'encoding':
90 | # self._encoding = cc[1]
91 | # continue
92 | # if cc[0] == 'stylesheet':
93 | # self._stylesheet = json.loads(cc[1])
94 | # continue
95 | # if cc[0] == 'title':
96 | # self._title = cc[1]
97 | # continue
98 | # if cc[0] == 'title':
99 | # self._description = cc[1]
100 | else:
101 | self._make_mdx_index(self._mdx_db)
102 |
103 | if os.path.isfile(_filename + ".mdd"):
104 | self._mdd_file = _filename + ".mdd"
105 | self._mdd_db = _filename + ".mdd.db"
106 | if not os.path.isfile(self._mdd_db):
107 | self._make_mdd_index(self._mdd_db)
109 |
110 |
111 | def _replace_stylesheet(self, txt):
112 | # substitute stylesheet definition
113 | txt_list = re.split('`\d+`', txt)
114 | txt_tag = re.findall('`\d+`', txt)
115 | txt_styled = txt_list[0]
116 | for j, p in enumerate(txt_list[1:]):
117 | style = self._stylesheet[txt_tag[j][1:-1]]
118 | if p and p[-1] == '\n':
119 | txt_styled = txt_styled + style[0] + p.rstrip() + style[1] + '\r\n'
120 | else:
121 | txt_styled = txt_styled + style[0] + p + style[1]
122 | return txt_styled
123 |
124 |
125 | def make_sqlite(self):
126 | sqlite_file = self._mdx_file + '.sqlite.db'
127 | if os.path.exists(sqlite_file):
128 | os.remove(sqlite_file)
129 | mdx = MDX(self._mdx_file)
130 | conn = sqlite3.connect(sqlite_file)
131 | cursor = conn.cursor()
132 | cursor.execute(
133 | ''' CREATE TABLE MDX_DICT
134 | (key text not null,
135 | value text
136 | )'''
137 | )
138 |
139 |         # strip pinyin transcriptions (words containing tone-marked letters, optionally in parentheses) and stylesheet tags like `1`:
140 | aeiou = 'āáǎàĀÁǍÀēéěèêềếĒÉĚÈÊỀẾīíǐìÍǏÌōóǒòŌÓǑÒūúǔùŪÚǓÙǖǘǚǜǕǗǙǛḾǹňŃŇ'
141 | pattern = r"`\d+`|[(\(]?['a-z%s]*[%s]['a-z%s]*[\))]?"%(aeiou, aeiou, aeiou)
142 | tuple_list = [(key.decode(), re.sub(pattern, '', value.decode()))
143 | for key, value in mdx.items()]
144 |
145 | cursor.executemany('INSERT INTO MDX_DICT VALUES (?,?)', tuple_list)
146 |
147 | returned_index = mdx.get_index(check_block = self._check)
148 | meta = returned_index['meta']
149 | cursor.execute(
150 | '''CREATE TABLE META (key text, value text)''')
151 |
152 | cursor.executemany(
153 | 'INSERT INTO META VALUES (?,?)',
154 | [('encoding', meta['encoding']),
155 | ('stylesheet', meta['stylesheet']),
156 | ('title', meta['title']),
157 | ('description', meta['description']),
158 | ('version', version)
159 | ]
160 | )
161 |
162 | if self._sql_index:
163 | cursor.execute(
164 | '''
165 | CREATE INDEX key_index ON MDX_DICT (key)
166 | '''
167 | )
168 | conn.commit()
169 | conn.close()
170 |
171 |
172 | def _make_mdx_index(self, db_name):
173 | if os.path.exists(db_name):
174 | os.remove(db_name)
175 | mdx = MDX(self._mdx_file)
176 | self._mdx_db = db_name
177 | returned_index = mdx.get_index(check_block = self._check)
178 | index_list = returned_index['index_dict_list']
179 | conn = sqlite3.connect(db_name)
180 | c = conn.cursor()
181 | c.execute(
182 | ''' CREATE TABLE MDX_INDEX
183 | (key_text text not null,
184 | file_pos integer,
185 | compressed_size integer,
186 | decompressed_size integer,
187 | record_block_type integer,
188 | record_start integer,
189 | record_end integer,
190 | offset integer
191 | )'''
192 | )
193 |
194 | tuple_list = [
195 | (item['key_text'],
196 | item['file_pos'],
197 | item['compressed_size'],
198 | item['decompressed_size'],
199 | item['record_block_type'],
200 | item['record_start'],
201 | item['record_end'],
202 | item['offset']
203 | )
204 | for item in index_list
205 | ]
206 | c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)',
207 | tuple_list)
208 | # build the metadata table
209 | meta = returned_index['meta']
210 | c.execute(
211 | '''CREATE TABLE META
212 | (key text,
213 | value text
214 | )''')
215 |
216 | #for k,v in meta:
217 | # c.execute(
218 | # 'INSERT INTO META VALUES (?,?)',
219 | # (k, v)
220 | # )
221 |
222 | c.executemany(
223 | 'INSERT INTO META VALUES (?,?)',
224 | [('encoding', meta['encoding']),
225 | ('stylesheet', meta['stylesheet']),
226 | ('title', meta['title']),
227 | ('description', meta['description']),
228 | ('version', version)
229 | ]
230 | )
231 |
232 | if self._sql_index:
233 | c.execute(
234 | '''
235 | CREATE INDEX key_index ON MDX_INDEX (key_text)
236 | '''
237 | )
238 |
239 | conn.commit()
240 | conn.close()
241 | #set class member
242 | self._encoding = meta['encoding']
243 | self._stylesheet = json.loads(meta['stylesheet'])
244 | self._title = meta['title']
245 | self._description = meta['description']
246 |
247 |
248 | def _make_mdd_index(self, db_name):
249 | if os.path.exists(db_name):
250 | os.remove(db_name)
251 | mdd = MDD(self._mdd_file)
252 | self._mdd_db = db_name
253 | index_list = mdd.get_index(check_block = self._check)
254 | conn = sqlite3.connect(db_name)
255 | c = conn.cursor()
256 | c.execute(
257 | ''' CREATE TABLE MDX_INDEX
258 | (key_text text not null unique,
259 | file_pos integer,
260 | compressed_size integer,
261 | decompressed_size integer,
262 | record_block_type integer,
263 | record_start integer,
264 | record_end integer,
265 | offset integer
266 | )'''
267 | )
268 |
269 | tuple_list = [
270 | (item['key_text'],
271 | item['file_pos'],
272 | item['compressed_size'],
273 | item['decompressed_size'],
274 | item['record_block_type'],
275 | item['record_start'],
276 | item['record_end'],
277 | item['offset']
278 | )
279 | for item in index_list
280 | ]
281 | c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)',
282 | tuple_list)
283 | if self._sql_index:
284 | c.execute(
285 | '''
286 | CREATE UNIQUE INDEX key_index ON MDX_INDEX (key_text)
287 | '''
288 | )
289 |
290 | conn.commit()
291 | conn.close()
292 |
293 | @staticmethod
294 | def get_data_by_index(fmdx, index):
295 | fmdx.seek(index['file_pos'])
296 | record_block_compressed = fmdx.read(index['compressed_size'])
297 |         record_block_type = index['record_block_type']
298 |         decompressed_size = index['decompressed_size']
299 |         #adler32 = unpack('>I', record_block_compressed[4:8])[0]
300 |         # no compression
301 |         if record_block_type == 0:
302 |             _record_block = record_block_compressed[8:]
303 |         # lzo compression
304 |         elif record_block_type == 1:
305 |             if lzo is None:
306 |                 raise RuntimeError("LZO decompression is required but the lzo module is unavailable")
307 |             # decompress, skipping the 8-byte type/checksum header
308 |             _record_block = lzo.decompress(record_block_compressed[8:],
309 |                                            initSize=decompressed_size, blockSize=1308672)
310 | # zlib compression
311 | elif record_block_type == 2:
312 | # decompress
313 | _record_block = zlib.decompress(record_block_compressed[8:])
314 | data = _record_block[index['record_start'] - index['offset']:index['record_end'] - index['offset']]
315 | return data
316 |
317 | def get_mdx_by_index(self, fmdx, index):
318 | data = self.get_data_by_index(fmdx,index)
319 |         # decode to str first; _replace_stylesheet operates on str
320 |         record = data.decode(self._encoding, errors='ignore').strip(u'\x00')
321 |         if self._stylesheet:
322 |             record = self._replace_stylesheet(record)
323 |         return record
324 |
325 | def get_mdd_by_index(self, fmdx, index):
326 | return self.get_data_by_index(fmdx,index)
327 |
328 | @staticmethod
329 | def lookup_indexes(db,keyword,ignorecase=None):
330 | indexes = []
331 |         if ignorecase:
332 |             sql = 'SELECT * FROM MDX_INDEX WHERE lower(key_text) = lower(?)'
333 |         else:
334 |             sql = 'SELECT * FROM MDX_INDEX WHERE key_text = ?'
335 |         with sqlite3.connect(db) as conn:
336 |             cursor = conn.execute(sql, (keyword,))  # parameterized, so quotes in the keyword are safe
337 | for result in cursor:
338 | index = {}
339 | index['file_pos'] = result[1]
340 | index['compressed_size'] = result[2]
341 | index['decompressed_size'] = result[3]
342 | index['record_block_type'] = result[4]
343 | index['record_start'] = result[5]
344 | index['record_end'] = result[6]
345 | index['offset'] = result[7]
346 | indexes.append(index)
347 | return indexes
348 |
349 | def mdx_lookup(self, keyword,ignorecase=None):
350 | lookup_result_list = []
351 | indexes = self.lookup_indexes(self._mdx_db,keyword,ignorecase)
352 | with open(self._mdx_file,'rb') as mdx_file:
353 | for index in indexes:
354 | lookup_result_list.append(self.get_mdx_by_index(mdx_file, index))
355 | return lookup_result_list
356 |
357 | def mdd_lookup(self, keyword,ignorecase=None):
358 | lookup_result_list = []
359 | indexes = self.lookup_indexes(self._mdd_db,keyword,ignorecase)
360 | with open(self._mdd_file,'rb') as mdd_file:
361 | for index in indexes:
362 | lookup_result_list.append(self.get_mdd_by_index(mdd_file, index))
363 | return lookup_result_list
364 |
365 | @staticmethod
366 | def get_keys(db,query = ''):
367 | if not db:
368 | return []
369 |         if query:
370 |             if '*' in query:
371 |                 query = query.replace('*', '%')
372 |             else:
373 |                 query = query + '%'
374 |             sql, args = 'SELECT key_text FROM MDX_INDEX WHERE key_text LIKE ?', (query,)
375 |         else:
376 |             sql, args = 'SELECT key_text FROM MDX_INDEX', ()
377 |         with sqlite3.connect(db) as conn:
378 |             cursor = conn.execute(sql, args)
379 | keys = [item[0] for item in cursor]
380 | return keys
381 |
382 | def get_mdd_keys(self, query = ''):
383 | return self.get_keys(self._mdd_db,query)
384 |
385 | def get_mdx_keys(self, query = ''):
386 | return self.get_keys(self._mdx_db,query)
387 |
388 |
389 |
390 | # mdx_builder = IndexBuilder("oald.mdx")
391 | # text = mdx_builder.mdx_lookup('dedication')
392 | # keys = mdx_builder.get_mdx_keys()
393 | # keys1 = mdx_builder.get_mdx_keys('abstrac')
394 | # keys2 = mdx_builder.get_mdx_keys('*tion')
395 | # for key in keys2:
396 | # text = mdx_builder.mdx_lookup(key)[0]
397 | # pass
398 |
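399 | # Querying the flat export produced by make_sqlite() directly (a sketch; the
400 | # MDX_DICT(key, value) table is the one created in make_sqlite above):
401 | #
402 | #     import sqlite3
403 | #     conn = sqlite3.connect('oald.mdx.sqlite.db')
404 | #     rows = conn.execute('SELECT value FROM MDX_DICT WHERE key = ?',
405 | #                         ('dedication',)).fetchall()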
--------------------------------------------------------------------------------
/mdx/drop mdict files here.txt:
--------------------------------------------------------------------------------
1 | hihaaha
2 |
--------------------------------------------------------------------------------
/pureSalsa20.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | """
5 | Copyright by https://github.com/zhansliu/writemdict
6 |
7 | pureSalsa20.py -- a pure Python implementation of the Salsa20 cipher, ported to Python 3
8 |
9 | v4.0: Added Python 3 support, dropped support for Python <= 2.5.
10 |
11 | // zhansliu
12 |
13 | Original comments below.
14 |
15 | ====================================================================
16 | There are comments here by two authors about three pieces of software:
17 | comments by Larry Bugbee about
18 | Salsa20, the stream cipher by Daniel J. Bernstein
19 | (including comments about the speed of the C version) and
20 | pySalsa20, Bugbee's own Python wrapper for salsa20.c
21 | (including some references), and
22 | comments by Steve Witham about
23 | pureSalsa20, Witham's pure Python 2.5 implementation of Salsa20,
24 | which follows pySalsa20's API, and is in this file.
25 |
26 | Salsa20: a Fast Streaming Cipher (comments by Larry Bugbee)
27 | -----------------------------------------------------------
28 |
29 | Salsa20 is a fast stream cipher written by Daniel Bernstein
30 | that basically uses a hash function and XOR making for fast
31 | encryption. (Decryption uses the same function.) Salsa20
32 | is simple and quick.
33 |
34 | Some Salsa20 parameter values...
35 | design strength 128 bits
36 | key length 128 or 256 bits, exactly
37 | IV, aka nonce 64 bits, always
38 | chunk size must be in multiples of 64 bytes
39 |
40 | Salsa20 has two reduced versions, 8 and 12 rounds each.
41 |
42 | One benchmark (10 MB):
43 | 1.5GHz PPC G4 102/97/89 MB/sec for 8/12/20 rounds
44 | AMD Athlon 2500+ 77/67/53 MB/sec for 8/12/20 rounds
45 | (no I/O and before Python GC kicks in)
46 |
47 | Salsa20 is a Phase 3 finalist in the EU eSTREAM competition
48 | and appears to be one of the fastest ciphers. It is well
49 | documented so I will not attempt any injustice here. Please
50 | see "References" below.
51 |
52 | ...and Salsa20 is "free for any use".
53 |
54 |
55 | pySalsa20: a Python wrapper for Salsa20 (Comments by Larry Bugbee)
56 | ------------------------------------------------------------------
57 |
58 | pySalsa20.py is a simple ctypes Python wrapper. Salsa20 is
59 | as it's name implies, 20 rounds, but there are two reduced
60 | versions, 8 and 12 rounds each. Because the APIs are
61 | identical, pySalsa20 is capable of wrapping all three
62 | versions (number of rounds hardcoded), including a special
63 | version that allows you to set the number of rounds with a
64 | set_rounds() function. Compile the version of your choice
65 | as a shared library (not as a Python extension), name and
66 | install it as libsalsa20.so.
67 |
68 | Sample usage:
69 | from pySalsa20 import Salsa20
70 | s20 = Salsa20(key, IV)
71 | dataout = s20.encryptBytes(datain) # same for decrypt
72 |
73 | This is EXPERIMENTAL software and intended for educational
74 | purposes only. To make experimentation less cumbersome,
75 | pySalsa20 is also free for any use.
76 |
77 | THIS PROGRAM IS PROVIDED WITHOUT WARRANTY OR GUARANTEE OF
78 | ANY KIND. USE AT YOUR OWN RISK.
79 |
80 | Enjoy,
81 |
82 | Larry Bugbee
83 | bugbee@seanet.com
84 | April 2007
85 |
86 |
87 | References:
88 | -----------
89 | http://en.wikipedia.org/wiki/Salsa20
90 | http://en.wikipedia.org/wiki/Daniel_Bernstein
91 | http://cr.yp.to/djb.html
92 | http://www.ecrypt.eu.org/stream/salsa20p3.html
93 | http://www.ecrypt.eu.org/stream/p3ciphers/salsa20/salsa20_p3source.zip
94 |
95 |
96 | Prerequisites for pySalsa20:
97 | ----------------------------
98 | - Python 2.5 (haven't tested in 2.4)
99 |
100 |
101 | pureSalsa20: Salsa20 in pure Python 2.5 (comments by Steve Witham)
102 | ------------------------------------------------------------------
103 |
104 | pureSalsa20 is the stand-alone Python code in this file.
105 | It implements the underlying Salsa20 core algorithm
106 | and emulates pySalsa20's Salsa20 class API (minus a bug(*)).
107 |
108 | pureSalsa20 is MUCH slower than libsalsa20.so wrapped with pySalsa20--
109 | about 1/1000 the speed for Salsa20/20 and 1/500 the speed for Salsa20/8,
110 | when encrypting 64k-byte blocks on my computer.
111 |
112 | pureSalsa20 is for cases where portability is much more important than
113 | speed. I wrote it for use in a "structured" random number generator.
114 |
115 | There are comments about the reasons for this slowness in
116 | http://www.tiac.net/~sw/2010/02/PureSalsa20
117 |
118 | Sample usage:
119 | from pureSalsa20 import Salsa20
120 | s20 = Salsa20(key, IV)
121 | dataout = s20.encryptBytes(datain) # same for decrypt
122 |
123 | I took the test code from pySalsa20, added a bunch of tests including
124 | rough speed tests, and moved them into the file testSalsa20.py.
125 | To test both pySalsa20 and pureSalsa20, type
126 | python testSalsa20.py
127 |
128 | (*)The bug (?) in pySalsa20 is this. The rounds variable is global to the
129 | libsalsa20.so library and not switched when switching between instances
130 | of the Salsa20 class.
131 | s1 = Salsa20( key, IV, 20 )
132 | s2 = Salsa20( key, IV, 8 )
133 | In this example,
134 | with pySalsa20, both s1 and s2 will do 8 rounds of encryption.
135 | with pureSalsa20, s1 will do 20 rounds and s2 will do 8 rounds.
136 | Perhaps giving each instance its own nRounds variable, which
137 | is passed to the salsa20wordtobyte() function, is insecure. I'm not a
138 | cryptographer.
139 |
140 | pureSalsa20.py and testSalsa20.py are EXPERIMENTAL software and
141 | intended for educational purposes only. To make experimentation less
142 | cumbersome, pureSalsa20.py and testSalsa20.py are free for any use.
143 |
144 | Revisions:
145 | ----------
146 | p3.2 Fixed bug that initialized the output buffer with plaintext!
147 | Saner ramping of nreps in speed test.
148 | Minor changes and print statements.
149 | p3.1 Took timing variability out of add32() and rot32().
150 | Made the internals more like pySalsa20/libsalsa .
151 | Put the semicolons back in the main loop!
152 | In encryptBytes(), modify a byte array instead of appending.
153 | Fixed speed calculation bug.
154 | Used subclasses instead of patches in testSalsa20.py .
155 | Added 64k-byte messages to speed test to be fair to pySalsa20.
156 | p3 First version, intended to parallel pySalsa20 version 3.
157 |
158 | More references:
159 | ----------------
160 | http://www.seanet.com/~bugbee/crypto/salsa20/ [pySalsa20]
161 | http://cr.yp.to/snuffle.html [The original name of Salsa20]
162 | http://cr.yp.to/snuffle/salsafamily-20071225.pdf [ Salsa20 design]
163 | http://www.tiac.net/~sw/2010/02/PureSalsa20
164 |
165 | THIS PROGRAM IS PROVIDED WITHOUT WARRANTY OR GUARANTEE OF
166 | ANY KIND. USE AT YOUR OWN RISK.
167 |
168 | Cheers,
169 |
170 | Steve Witham sw at remove-this tiac dot net
171 | February, 2010
172 | """
173 | import sys
174 | assert(sys.version_info >= (2, 6))
175 |
176 | if sys.version_info >= (3,):
177 | integer_types = (int,)
178 | python3 = True
179 | else:
180 | integer_types = (int, long)
181 | python3 = False
182 |
183 | from struct import Struct
184 | little_u64 = Struct( "<Q" )      #    little-endian 64-bit unsigned.
185 |                                  #    Unpacks to a tuple of one element!
186 | 
187 | little16_i32 = Struct( "<16i" )  # 16 little-endian 32-bit signed ints.
188 | little4_i32 = Struct( "<4i" )    #  4 little-endian 32-bit signed ints.
189 | little2_i32 = Struct( "<2i" )    #  2 little-endian 32-bit signed ints.
190 | 
191 | _version = 'p4.0'
192 | 
193 | #--------------------------------------------------------------------------
194 | #----------- the Salsa20 class which emulates pySalsa20.Salsa20 -----------
195 | #--------------------------------------------------------------------------
196 | 
197 | class Salsa20(object):
198 | 
199 |     def __init__(self, key=None, IV=None, rounds=20):
200 |         self._lastChunk64 = True
201 |         self._IVbitlen = 64             # must be 64 bits
202 |         self.ctx = [ 0 ] * 16
203 |         if key:
204 |             self.setKey(key)
205 |         if IV:
206 |             self.setIV(IV)
207 |         self.setRounds(rounds)
208 | 
209 |     def setKey(self, key):
210 |         assert type(key) == bytes
211 |         ctx = self.ctx
212 |         if len(key) == 32:   # recommended
213 |             constants = b"expand 32-byte k"
214 |             ctx[1:5] = little4_i32.unpack( key[0:16] )
215 |             ctx[11:15] = little4_i32.unpack( key[16:32] )
216 |         elif len(key) == 16:
217 |             constants = b"expand 16-byte k"
218 |             ctx[1:5] = little4_i32.unpack( key[0:16] )
219 |             ctx[11:15] = little4_i32.unpack( key[0:16] )
220 |         else:
221 |             raise Exception("key length isn't 32 or 16 bytes.")
222 |         ctx[0], ctx[5], ctx[10], ctx[15] = little4_i32.unpack( constants )
223 | 
224 |     def setIV(self, IV):
225 |         assert type(IV) == bytes
226 |         assert len(IV) * 8 == 64, 'nonce (IV) not 64 bits'
227 |         self.IV = IV
228 |         ctx = self.ctx
229 |         ctx[6], ctx[7] = little2_i32.unpack( IV )
230 |         ctx[8], ctx[9] = 0, 0   # Reset the block counter.
231 | 
232 |     setNonce = setIV            # support an alternate name
233 | 
234 | 
235 |     def setCounter(self, counter):
236 |         assert type(counter) in integer_types
237 |         assert 0 <= counter < 2**64, "counter < 0 or >= 2**64"
238 | ctx = self.ctx
239 | ctx[ 8],ctx[ 9] = little2_i32.unpack( little_u64.pack( counter ) )
240 |
241 | def getCounter( self ):
242 | return little_u64.unpack( little2_i32.pack( *self.ctx[ 8:10 ] ) ) [0]
243 |
244 |
245 | def setRounds(self, rounds, testing=False ):
246 | assert testing or rounds in [8, 12, 20], 'rounds must be 8, 12, 20'
247 | self.rounds = rounds
248 |
249 |
250 | def encryptBytes(self, data):
251 | assert type(data) == bytes, 'data must be byte string'
252 | assert self._lastChunk64, 'previous chunk not multiple of 64 bytes'
253 | lendata = len(data)
254 | munged = bytearray(lendata)
255 | for i in range( 0, lendata, 64 ):
256 | h = salsa20_wordtobyte( self.ctx, self.rounds, checkRounds=False )
257 | self.setCounter( ( self.getCounter() + 1 ) % 2**64 )
258 | # Stopping at 2^70 bytes per nonce is user's responsibility.
259 | for j in range( min( 64, lendata - i ) ):
260 | if python3:
261 | munged[ i+j ] = data[ i+j ] ^ h[j]
262 | else:
263 | munged[ i+j ] = ord(data[ i+j ]) ^ ord(h[j])
264 |
265 | self._lastChunk64 = not lendata % 64
266 | return bytes(munged)
267 |
268 | decryptBytes = encryptBytes # encrypt and decrypt use same function
269 |
270 | #--------------------------------------------------------------------------
271 |
272 | def salsa20_wordtobyte( input, nRounds=20, checkRounds=True ):
273 | """ Do nRounds Salsa20 rounds on a copy of
274 | input: list or tuple of 16 ints treated as little-endian unsigneds.
275 | Returns a 64-byte string.
276 | """
277 |
278 | assert( type(input) in ( list, tuple ) and len(input) == 16 )
279 | assert( not(checkRounds) or ( nRounds in [ 8, 12, 20 ] ) )
280 |
281 | x = list( input )
282 |
283 | def XOR( a, b ): return a ^ b
284 | ROTATE = rot32
285 | PLUS = add32
286 |
287 | for i in range( nRounds // 2 ):
288 | # These ...XOR...ROTATE...PLUS... lines are from ecrypt-linux.c
289 | # unchanged except for indents and the blank line between rounds:
290 | x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 0],x[12]), 7));
291 | x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[ 4],x[ 0]), 9));
292 | x[12] = XOR(x[12],ROTATE(PLUS(x[ 8],x[ 4]),13));
293 | x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[12],x[ 8]),18));
294 | x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 5],x[ 1]), 7));
295 | x[13] = XOR(x[13],ROTATE(PLUS(x[ 9],x[ 5]), 9));
296 | x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[13],x[ 9]),13));
297 | x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 1],x[13]),18));
298 | x[14] = XOR(x[14],ROTATE(PLUS(x[10],x[ 6]), 7));
299 | x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[14],x[10]), 9));
300 | x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 2],x[14]),13));
301 | x[10] = XOR(x[10],ROTATE(PLUS(x[ 6],x[ 2]),18));
302 | x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[15],x[11]), 7));
303 | x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 3],x[15]), 9));
304 | x[11] = XOR(x[11],ROTATE(PLUS(x[ 7],x[ 3]),13));
305 | x[15] = XOR(x[15],ROTATE(PLUS(x[11],x[ 7]),18));
306 |
307 | x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[ 0],x[ 3]), 7));
308 | x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[ 1],x[ 0]), 9));
309 | x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[ 2],x[ 1]),13));
310 | x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[ 3],x[ 2]),18));
311 | x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 5],x[ 4]), 7));
312 | x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 6],x[ 5]), 9));
313 | x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 7],x[ 6]),13));
314 | x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 4],x[ 7]),18));
315 | x[11] = XOR(x[11],ROTATE(PLUS(x[10],x[ 9]), 7));
316 | x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[11],x[10]), 9));
317 | x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 8],x[11]),13));
318 | x[10] = XOR(x[10],ROTATE(PLUS(x[ 9],x[ 8]),18));
319 | x[12] = XOR(x[12],ROTATE(PLUS(x[15],x[14]), 7));
320 | x[13] = XOR(x[13],ROTATE(PLUS(x[12],x[15]), 9));
321 | x[14] = XOR(x[14],ROTATE(PLUS(x[13],x[12]),13));
322 | x[15] = XOR(x[15],ROTATE(PLUS(x[14],x[13]),18));
323 |
324 | for i in range( len( input ) ):
325 | x[i] = PLUS( x[i], input[i] )
326 | return little16_i32.pack( *x )
327 |
328 | #--------------------------- 32-bit ops -------------------------------
329 |
330 | def trunc32( w ):
331 | """ Return the bottom 32 bits of w as a Python int.
332 | This creates longs temporarily, but returns an int. """
333 | w = int( ( w & 0x7fffFFFF ) | -( w & 0x80000000 ) )
334 | assert type(w) == int
335 | return w
336 |
337 |
338 | def add32( a, b ):
339 | """ Add two 32-bit words discarding carry above 32nd bit,
340 | and without creating a Python long.
341 | Timing shouldn't vary.
342 | """
343 | lo = ( a & 0xFFFF ) + ( b & 0xFFFF )
344 | hi = ( a >> 16 ) + ( b >> 16 ) + ( lo >> 16 )
345 | return ( -(hi & 0x8000) | ( hi & 0x7FFF ) ) << 16 | ( lo & 0xFFFF )
346 |
347 |
348 | def rot32( w, nLeft ):
349 | """ Rotate 32-bit word left by nLeft or right by -nLeft
350 | without creating a Python long.
351 | Timing depends on nLeft but not on w.
352 | """
353 | nLeft &= 31 # which makes nLeft >= 0
354 | if nLeft == 0:
355 | return w
356 |
357 | # Note: now 1 <= nLeft <= 31.
358 | # RRRsLLLLLL There are nLeft RRR's, (31-nLeft) LLLLLL's,
359 | # => sLLLLLLRRR and one s which becomes the sign bit.
360 | RRR = ( ( ( w >> 1 ) & 0x7fffFFFF ) >> ( 31 - nLeft ) )
361 | sLLLLLL = -( (1<<(31-nLeft)) & w ) | (0x7fffFFFF>>nLeft) & w
362 | return RRR | ( sLLLLLL << nLeft )
363 |
364 |
365 | # --------------------------------- end -----------------------------------
366 |
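367 | # Round-trip sketch of the API above (key must be exactly 16 or 32 bytes,
368 | # the IV 8 bytes; decryption is the same operation as encryption):
369 | #
370 | #     s20 = Salsa20(key=b'k' * 32, IV=b'\x00' * 8, rounds=8)
371 | #     ct = s20.encryptBytes(b'attack at dawn!!')
372 | #     s20 = Salsa20(key=b'k' * 32, IV=b'\x00' * 8, rounds=8)
373 | #     assert s20.encryptBytes(ct) == b'attack at dawn!!'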
--------------------------------------------------------------------------------
/readmdict.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # readmdict.py
4 | # Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser
5 | #
6 | # Copyright (C) 2012, 2013, 2015 Xiaoqiang Wang
7 | #
8 | # This program is a free software; you can redistribute it and/or modify
9 | # it under the terms of the GNU General Public License as published by
10 | # the Free Software Foundation, version 3 of the License.
11 | #
12 | # You can get a copy of GNU General Public License along this program
13 | # But you can always get it from http://www.gnu.org/licenses/gpl.txt
14 | #
15 | # This program is distributed in the hope that it will be useful,
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | # GNU General Public License for more details.
19 |
20 | from struct import pack, unpack
21 | from io import BytesIO
22 | import re
23 | import sys
24 | import json
25 |
26 | from .ripemd128 import ripemd128
27 | from .pureSalsa20 import Salsa20
28 |
29 | # zlib compression is used for engine version >=2.0
30 | import zlib
31 | # LZO compression is used for engine version < 2.0
32 | try:
33 | import lzo
34 | except ImportError:
35 | lzo = None
36 | print("LZO compression support is not available")
37 |
38 | # 2x3 compatible
39 | if sys.hexversion >= 0x03000000:
40 | unicode = str
41 |
42 |
43 | def _unescape_entities(text):
44 | """
45 | unescape offending tags < > " &
46 | """
47 |     text = text.replace(b'&lt;', b'<')
48 |     text = text.replace(b'&gt;', b'>')
49 |     text = text.replace(b'&quot;', b'"')
50 |     text = text.replace(b'&amp;', b'&')
51 | return text
52 |
53 |
54 | def _fast_decrypt(data, key):
55 | b = bytearray(data)
56 | key = bytearray(key)
57 | previous = 0x36
58 | for i in range(len(b)):
59 |         t = (b[i] >> 4 | b[i] << 4) & 0xff  # swap the byte's two nibbles
60 |         t = t ^ previous ^ (i & 0xff) ^ key[i % len(key)]  # mix in the previous byte, position, and key
61 | previous = b[i]
62 | b[i] = t
63 | return bytes(b)
64 |
65 |
66 | def _mdx_decrypt(comp_block):
 67 |     key = ripemd128(comp_block[4:8] + pack(b'<L', 0x3695))
 68 |     return comp_block[0:8] + _fast_decrypt(comp_block[8:], key)
 69 | 
 70 | 
 71 | def _salsa_decrypt(ciphertext, encrypt_key):
 72 |     s20 = Salsa20(key=encrypt_key, IV=b"\x00" * 8, rounds=8)
 73 |     return s20.encryptBytes(ciphertext)
 74 | 
 75 | def _decrypt_regcode_by_deviceid(reg_code, deviceid):
 76 |     deviceid_digest = ripemd128(deviceid)
 77 |     s20 = Salsa20(key=deviceid_digest, IV=b"\x00" * 8, rounds=8)
 78 |     encrypt_key = s20.encryptBytes(reg_code)
 79 |     return encrypt_key
 80 | 
 81 | 
 82 | def _decrypt_regcode_by_email(reg_code, email):
 83 |     email_digest = ripemd128(email.decode().encode('utf-16-le'))
 84 |     s20 = Salsa20(key=email_digest, IV=b"\x00" * 8, rounds=8)
 85 |     encrypt_key = s20.encryptBytes(reg_code)
 86 |     return encrypt_key
 87 | 
 88 | 
 89 | class MDict(object):
 90 |     """
 91 |     Base class which reads in header and key block.
 92 |     It has no public methods and serves only as code sharing base class.
 93 |     """
 94 |     def __init__(self, fname, encoding='', passcode=None):
 95 |         self._fname = fname
 96 |         self._encoding = encoding.upper()
 97 |         self._passcode = passcode
 98 | 
 99 |         self.header = self._read_header()
100 |         try:
101 |             self._key_list = self._read_keys()
102 |         except Exception:
103 |             print("Try Brutal Force on Encrypted Key Blocks")
104 |             self._key_list = self._read_keys_brutal()
105 | 
106 |     def __len__(self):
107 |         return self._num_entries
108 | 
109 |     def __iter__(self):
110 |         return self.keys()
111 | 
112 |     def keys(self):
113 |         """
114 |         Return an iterator over dictionary keys.
115 |         """
116 |         return (key_value for key_id, key_value in self._key_list)
117 | 
118 |     def _read_number(self, f):
119 |         return unpack(self._number_format, f.read(self._number_width))[0]
120 | 
121 |     @staticmethod
122 |     def _parse_header(header):
123 |         """
124 |         extract attributes from <Dict attr="value" ... >
125 | """
126 | taglist = re.findall(b'(\w+)="(.*?)"', header, re.DOTALL)
127 | tagdict = {}
128 | for key, value in taglist:
129 | tagdict[key] = _unescape_entities(value)
130 | return tagdict
131 |
132 | def _decode_key_block_info(self, key_block_info_compressed):
133 | if self._version >= 2:
134 | # zlib compression
135 | assert(key_block_info_compressed[:4] == b'\x02\x00\x00\x00')
136 | # decrypt if needed
137 | if self._encrypt & 0x02:
138 | key_block_info_compressed = _mdx_decrypt(key_block_info_compressed)
139 | # decompress
140 | key_block_info = zlib.decompress(key_block_info_compressed[8:])
141 | # adler checksum
142 | adler32 = unpack('>I', key_block_info_compressed[4:8])[0]
143 | assert(adler32 == zlib.adler32(key_block_info) & 0xffffffff)
144 | else:
145 | # no compression
146 | key_block_info = key_block_info_compressed
147 | # decode
148 | key_block_info_list = []
149 | num_entries = 0
150 | i = 0
151 | if self._version >= 2:
152 | byte_format = '>H'
153 | byte_width = 2
154 | text_term = 1
155 | else:
156 | byte_format = '>B'
157 | byte_width = 1
158 | text_term = 0
159 |
160 | while i < len(key_block_info):
161 | # number of entries in current key block
162 | num_entries += unpack(self._number_format, key_block_info[i:i + self._number_width])[0]
163 | i += self._number_width
164 | # text head size
165 | text_head_size = unpack(byte_format, key_block_info[i:i + byte_width])[0]
166 | i += byte_width
167 | # text head
168 | if self._encoding != 'UTF-16':
169 | i += text_head_size + text_term
170 | else:
171 | i += (text_head_size + text_term) * 2
172 | # text tail size
173 | text_tail_size = unpack(byte_format, key_block_info[i:i + byte_width])[0]
174 | i += byte_width
175 | # text tail
176 | if self._encoding != 'UTF-16':
177 | i += text_tail_size + text_term
178 | else:
179 | i += (text_tail_size + text_term) * 2
180 | # key block compressed size
181 | key_block_compressed_size = unpack(self._number_format, key_block_info[i:i + self._number_width])[0]
182 | i += self._number_width
183 | # key block decompressed size
184 | key_block_decompressed_size = unpack(self._number_format, key_block_info[i:i + self._number_width])[0]
185 | i += self._number_width
186 | key_block_info_list += [(key_block_compressed_size, key_block_decompressed_size)]
187 |
188 | assert(num_entries == self._num_entries)
189 |
190 | return key_block_info_list
191 |
192 | def _decode_key_block(self, key_block_compressed, key_block_info_list):
193 | key_list = []
194 | i = 0
195 | for compressed_size, decompressed_size in key_block_info_list:
196 | start = i
197 | end = i + compressed_size
198 | # 4 bytes : compression type
199 | key_block_type = key_block_compressed[start:start + 4]
200 | # 4 bytes : adler checksum of decompressed key block
201 | adler32 = unpack('>I', key_block_compressed[start + 4:start + 8])[0]
202 | if key_block_type == b'\x00\x00\x00\x00':
203 | key_block = key_block_compressed[start + 8:end]
204 | elif key_block_type == b'\x01\x00\x00\x00':
205 | if lzo is None:
206 | print("LZO compression is not supported")
207 | break
208 | # decompress key block
209 | header = b'\xf0' + pack('>I', decompressed_size)
210 | key_block = lzo.decompress(key_block_compressed[start + 8:end], initSize = decompressed_size, blockSize=1308672)
211 | elif key_block_type == b'\x02\x00\x00\x00':
212 | # decompress key block
213 | key_block = zlib.decompress(key_block_compressed[start + 8:end])
214 | # extract one single key block into a key list
215 | key_list += self._split_key_block(key_block)
216 | # notice that adler32 returns signed value
217 | assert(adler32 == zlib.adler32(key_block) & 0xffffffff)
218 |
219 | i += compressed_size
220 | return key_list
221 |
222 | def _split_key_block(self, key_block):
223 | key_list = []
224 | key_start_index = 0
225 | while key_start_index < len(key_block):
226 | temp = key_block[key_start_index:key_start_index + self._number_width]
227 | # the corresponding record's offset in record block
228 | key_id = unpack(self._number_format, key_block[key_start_index:key_start_index + self._number_width])[0]
229 | # key text ends with '\x00'
230 | if self._encoding == 'UTF-16':
231 | delimiter = b'\x00\x00'
232 | width = 2
233 | else:
234 | delimiter = b'\x00'
235 | width = 1
236 | i = key_start_index + self._number_width
237 | while i < len(key_block):
238 | if key_block[i:i + width] == delimiter:
239 | key_end_index = i
240 | break
241 | i += width
242 | key_text = key_block[key_start_index + self._number_width:key_end_index]\
243 | .decode(self._encoding, errors='ignore').encode('utf-8').strip()
244 | key_start_index = key_end_index + width
245 | key_list += [(key_id, key_text)]
246 | return key_list
247 |
248 | def _read_header(self):
249 | f = open(self._fname, 'rb')
250 | # number of bytes of header text
251 | header_bytes_size = unpack('>I', f.read(4))[0]
252 | header_bytes = f.read(header_bytes_size)
253 | # 4 bytes: adler32 checksum of header, in little endian
254 |         adler32 = unpack('<I', f.read(4))[0]
255 |         assert adler32 == zlib.adler32(header_bytes) & 0xffffffff
256 |         # mark down key block offset
257 |         self._key_block_offset = f.tell()
258 |         f.close()
259 | 
260 |         # header text in utf-16 encoding ending with '\x00\x00'
261 |         header_text = header_bytes[:-2].decode('utf-16').encode('utf-8')
262 |         header_tag = self._parse_header(header_text)
263 |         if not self._encoding:
264 |             encoding = header_tag[b'Encoding']
265 |             if sys.hexversion >= 0x03000000:
266 | encoding = encoding.decode('utf-8')
267 | # GB18030 > GBK > GB2312
268 | if encoding in ['GBK', 'GB2312']:
269 | encoding = 'GB18030'
270 | self._encoding = encoding
271 |         # read the title and description
272 | if b'Title' in header_tag:
273 | self._title = header_tag[b'Title'].decode('utf-8')
274 | else:
275 | self._title = ''
276 |
277 | if b'Description' in header_tag:
278 | self._description = header_tag[b'Description'].decode('utf-8')
279 | else:
280 | self._description = ''
282 | # encryption flag
283 | # 0x00 - no encryption
284 | # 0x01 - encrypt record block
285 | # 0x02 - encrypt key info block
286 | if b'Encrypted' not in header_tag or header_tag[b'Encrypted'] == b'No':
287 | self._encrypt = 0
288 | elif header_tag[b'Encrypted'] == b'Yes':
289 | self._encrypt = 1
290 | else:
291 | self._encrypt = int(header_tag[b'Encrypted'])
292 |
293 | # stylesheet attribute if present takes form of:
294 | # style_number # 1-255
295 | # style_begin # or ''
296 | # style_end # or ''
297 | # store stylesheet in dict in the form of
298 | # {'number' : ('style_begin', 'style_end')}
299 | self._stylesheet = {}
300 |         if header_tag.get(b'StyleSheet'):
301 |             lines = header_tag[b'StyleSheet'].decode('utf-8').splitlines()
302 | for i in range(0, len(lines), 3):
303 | self._stylesheet[lines[i]] = (lines[i + 1], lines[i + 2])
304 |
305 | # before version 2.0, number is 4 bytes integer
306 | # version 2.0 and above uses 8 bytes
307 | self._version = float(header_tag[b'GeneratedByEngineVersion'])
308 | if self._version < 2.0:
309 | self._number_width = 4
310 | self._number_format = '>I'
311 | else:
312 | self._number_width = 8
313 | self._number_format = '>Q'
314 |
315 | return header_tag
316 |
317 | def _read_keys(self):
318 | f = open(self._fname, 'rb')
319 | f.seek(self._key_block_offset)
320 |
321 | # the following numbers could be encrypted
322 | if self._version >= 2.0:
323 | num_bytes = 8 * 5
324 | else:
325 | num_bytes = 4 * 4
326 | block = f.read(num_bytes)
327 |
328 | if self._encrypt & 1:
329 | if self._passcode is None:
330 | raise RuntimeError('user identification is needed to read encrypted file')
331 | regcode, userid = self._passcode
332 | if isinstance(userid, unicode):
333 | userid = userid.encode('utf8')
334 | if self.header[b'RegisterBy'] == b'EMail':
335 | encrypted_key = _decrypt_regcode_by_email(regcode, userid)
336 | else:
337 | encrypted_key = _decrypt_regcode_by_deviceid(regcode, userid)
338 | block = _salsa_decrypt(block, encrypted_key)
339 |
340 | # decode this block
341 | sf = BytesIO(block)
342 | # number of key blocks
343 | num_key_blocks = self._read_number(sf)
344 | # number of entries
345 | self._num_entries = self._read_number(sf)
346 | # number of bytes of key block info after decompression
347 | if self._version >= 2.0:
348 | key_block_info_decomp_size = self._read_number(sf)
349 | # number of bytes of key block info
350 | key_block_info_size = self._read_number(sf)
351 | # number of bytes of key block
352 | key_block_size = self._read_number(sf)
353 |
354 | # 4 bytes: adler checksum of previous 5 numbers
355 | if self._version >= 2.0:
356 | adler32 = unpack('>I', f.read(4))[0]
357 | assert adler32 == (zlib.adler32(block) & 0xffffffff)
358 |
359 | # read key block info, which indicates key block's compressed and
360 | # decompressed size
361 | key_block_info = f.read(key_block_info_size)
362 | key_block_info_list = self._decode_key_block_info(key_block_info)
363 | assert(num_key_blocks == len(key_block_info_list))
364 |
365 | # read key block
366 | key_block_compressed = f.read(key_block_size)
367 | # extract key block
368 | key_list = self._decode_key_block(key_block_compressed, key_block_info_list)
369 |
370 | self._record_block_offset = f.tell()
371 | f.close()
372 |
373 | return key_list
374 |
375 | def _read_keys_brutal(self):
376 | f = open(self._fname, 'rb')
377 | f.seek(self._key_block_offset)
378 |
379 | # the following numbers could be encrypted, disregard them!
380 | if self._version >= 2.0:
381 | num_bytes = 8 * 5 + 4
382 | key_block_type = b'\x02\x00\x00\x00'
383 | else:
384 | num_bytes = 4 * 4
385 | key_block_type = b'\x01\x00\x00\x00'
386 | block = f.read(num_bytes)
387 |
388 | # key block info
389 | # 4 bytes '\x02\x00\x00\x00'
390 | # 4 bytes adler32 checksum
391 | # unknown number of bytes follows until '\x02\x00\x00\x00' which marks
392 | # the beginning of key block
393 | key_block_info = f.read(8)
394 | if self._version >= 2.0:
395 | assert key_block_info[:4] == b'\x02\x00\x00\x00'
396 | while True:
397 | fpos = f.tell()
398 | t = f.read(1024)
399 | index = t.find(key_block_type)
400 | if index != -1:
401 | key_block_info += t[:index]
402 | f.seek(fpos + index)
403 | break
404 | else:
405 | key_block_info += t
406 |
407 | key_block_info_list = self._decode_key_block_info(key_block_info)
408 | key_block_size = sum(list(zip(*key_block_info_list))[0])
409 |
410 | # read key block
411 | key_block_compressed = f.read(key_block_size)
412 | # extract key block
413 | key_list = self._decode_key_block(key_block_compressed, key_block_info_list)
414 |
415 | self._record_block_offset = f.tell()
416 | f.close()
417 |
418 | self._num_entries = len(key_list)
419 | return key_list
420 |
421 |
422 | class MDD(MDict):
423 | """
424 | MDict resource file format (*.MDD) reader.
425 | >>> mdd = MDD('example.mdd')
426 | >>> len(mdd)
427 | 208
428 | >>> for filename,content in mdd.items():
429 |     ...     print(filename, content[:10])
430 | """
431 | def __init__(self, fname, passcode=None):
432 | MDict.__init__(self, fname, encoding='UTF-16', passcode=passcode)
433 |
434 | def items(self):
435 | """Return a generator which in turn produce tuples in the form of (filename, content)
436 | """
437 | return self._decode_record_block()
438 |
439 | def _decode_record_block(self):
440 | f = open(self._fname, 'rb')
441 | f.seek(self._record_block_offset)
442 |
443 | num_record_blocks = self._read_number(f)
444 | num_entries = self._read_number(f)
445 | assert(num_entries == self._num_entries)
446 | record_block_info_size = self._read_number(f)
447 | record_block_size = self._read_number(f)
448 |
449 | # record block info section
450 | record_block_info_list = []
451 | size_counter = 0
452 | for i in range(num_record_blocks):
453 | compressed_size = self._read_number(f)
454 | decompressed_size = self._read_number(f)
455 | record_block_info_list += [(compressed_size, decompressed_size)]
456 | size_counter += self._number_width * 2
457 | assert(size_counter == record_block_info_size)
458 |
459 | # actual record block
460 | offset = 0
461 | i = 0
462 | size_counter = 0
463 | for compressed_size, decompressed_size in record_block_info_list:
464 | record_block_compressed = f.read(compressed_size)
465 | # 4 bytes: compression type
466 | record_block_type = record_block_compressed[:4]
467 | # 4 bytes: adler32 checksum of decompressed record block
468 | adler32 = unpack('>I', record_block_compressed[4:8])[0]
469 | if record_block_type == b'\x00\x00\x00\x00':
470 | record_block = record_block_compressed[8:]
471 | elif record_block_type == b'\x01\x00\x00\x00':
472 | if lzo is None:
473 | print("LZO compression is not supported")
474 | break
475 | # decompress
476 | header = b'\xf0' + pack('>I', decompressed_size)
477 | record_block = lzo.decompress(record_block_compressed[8:], initSize=decompressed_size, blockSize=1308672)
478 | elif record_block_type == b'\x02\x00\x00\x00':
479 | # decompress
480 | record_block = zlib.decompress(record_block_compressed[8:])
481 |
482 | # notice that adler32 return signed value
483 | assert(adler32 == zlib.adler32(record_block) & 0xffffffff)
484 |
485 | assert(len(record_block) == decompressed_size)
486 | # split record block according to the offset info from key block
487 | while i < len(self._key_list):
488 | record_start, key_text = self._key_list[i]
489 | # reach the end of current record block
490 | if record_start - offset >= len(record_block):
491 | break
492 | # record end index
493 | if i < len(self._key_list) - 1:
494 | record_end = self._key_list[i + 1][0]
495 | else:
496 | record_end = len(record_block) + offset
497 | i += 1
498 | data = record_block[record_start - offset:record_end - offset]
499 | yield key_text, data
500 | offset += len(record_block)
501 | size_counter += compressed_size
502 | assert(size_counter == record_block_size)
503 |
504 | f.close()
505 |
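Both `_decode_record_block` above and `get_index` below parse the same 8-byte record-block header: 4 bytes of compression type followed by a big-endian adler32 of the decompressed payload. A minimal sketch of that shared step, covering only the stored and zlib cases (`decompress_block` is a hypothetical helper, not part of readmdict):

```python
import zlib
from struct import unpack

def decompress_block(block):
    block_type = block[:4]                   # 4-byte compression type
    checksum = unpack('>I', block[4:8])[0]   # adler32 of decompressed data
    if block_type == b'\x00\x00\x00\x00':    # stored, no compression
        data = block[8:]
    elif block_type == b'\x02\x00\x00\x00':  # zlib
        data = zlib.decompress(block[8:])
    else:                                    # b'\x01\x00\x00\x00' means LZO
        raise NotImplementedError('LZO-compressed block')
    # zlib.adler32 can return a signed value on Python 2, hence the mask
    assert checksum == zlib.adler32(data) & 0xffffffff
    return data
```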
506 | ### Build the index list of the mdd file; each entry holds:
507 | ### key_text (the key, also obtainable from the key list)
508 | ### file_pos (position where the record_block starts)
509 | ### compressed_size (size of the record_block before decompression)
510 | ### decompressed_size (size after decompression)
511 | ### record_block_type (compression type of the record_block)
512 | ### record_start (this and the next two are the parameters needed to pull one record out of the record_block; they can be stored as-is)
513 | ### record_end
514 | ### offset
515 | def get_index(self, check_block = True):
516 | f = open(self._fname, 'rb')
517 | index_dict_list = []
518 | f.seek(self._record_block_offset)
519 |
520 | num_record_blocks = self._read_number(f)
521 | num_entries = self._read_number(f)
522 | assert(num_entries == self._num_entries)
523 | record_block_info_size = self._read_number(f)
524 | record_block_size = self._read_number(f)
525 |
526 | # record block info section
527 | record_block_info_list = []
528 | size_counter = 0
529 | for i in range(num_record_blocks):
530 | compressed_size = self._read_number(f)
531 | decompressed_size = self._read_number(f)
532 | record_block_info_list += [(compressed_size, decompressed_size)]
533 | size_counter += self._number_width * 2
534 | # todo: attention!!!
535 | assert(size_counter == record_block_info_size)
536 |
537 | # actual record block
538 | offset = 0
539 | i = 0
540 | size_counter = 0
541 | for compressed_size, decompressed_size in record_block_info_list:
542 | current_pos = f.tell()
543 | record_block_compressed = f.read(compressed_size)
544 | # 4 bytes: compression type
545 | record_block_type = record_block_compressed[:4]
546 | # 4 bytes: adler32 checksum of decompressed record block
547 | adler32 = unpack('>I', record_block_compressed[4:8])[0]
548 | if record_block_type == b'\x00\x00\x00\x00':
549 | _type = 0
550 | if check_block:
551 | record_block = record_block_compressed[8:]
552 | elif record_block_type == b'\x01\x00\x00\x00':
553 | _type = 1
554 | if lzo is None:
555 | print("LZO compression is not supported")
556 | break
557 | # decompress
558 | header = b'\xf0' + pack('>I', decompressed_size)
559 | if check_block:
560 | record_block = lzo.decompress(record_block_compressed[8:], initSize=decompressed_size, blockSize=1308672)
561 | elif record_block_type == b'\x02\x00\x00\x00':
562 | # decompress
563 | _type = 2
564 | if check_block:
565 | record_block = zlib.decompress(record_block_compressed[8:])
566 |
567 | # notice that adler32 return signed value
568 | if check_block:
569 | assert(adler32 == zlib.adler32(record_block) & 0xffffffff)
570 | assert(len(record_block) == decompressed_size)
571 | # split record block according to the offset info from key block
572 | while i < len(self._key_list):
573 | ### empty dict to hold the index info for this record
574 | index_dict = {}
575 | index_dict['file_pos'] = current_pos
576 | index_dict['compressed_size'] = compressed_size
577 | index_dict['decompressed_size'] = decompressed_size
578 | index_dict['record_block_type'] = _type
579 | record_start, key_text = self._key_list[i]
580 | index_dict['record_start'] = record_start
581 | index_dict['key_text'] = key_text.decode("utf-8")
582 | index_dict['offset'] = offset
583 | # reach the end of current record block
584 | if record_start - offset >= decompressed_size:
585 | break
586 | # record end index
587 | if i < len(self._key_list) - 1:
588 | record_end = self._key_list[i + 1][0]
589 | else:
590 | record_end = decompressed_size + offset
591 | index_dict['record_end'] = record_end
592 | i += 1
593 | if check_block:
594 | data = record_block[record_start - offset:record_end - offset]
595 | index_dict_list.append(index_dict)
596 | #yield key_text, data
597 | offset += decompressed_size
598 | size_counter += compressed_size
599 | assert(size_counter == record_block_size)
600 | f.close()
601 | return index_dict_list
602 |
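The point of the index entries built above is random access: with `file_pos`, `compressed_size` and the record offsets saved, a single resource can later be pulled out of the .mdd without re-parsing the whole file. A minimal sketch, under the assumption that the block is zlib-compressed (`record_block_type == 2`); `fetch_record` is hypothetical, not part of readmdict:

```python
import zlib

def fetch_record(fname, index_dict):
    with open(fname, 'rb') as f:
        f.seek(index_dict['file_pos'])
        block = f.read(index_dict['compressed_size'])
    record_block = zlib.decompress(block[8:])  # skip type + adler32 header
    start = index_dict['record_start'] - index_dict['offset']
    end = index_dict['record_end'] - index_dict['offset']
    return record_block[start:end]
```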
603 |
604 | class MDX(MDict):
605 | """
606 | MDict dictionary file format (*.MDX) reader.
607 | >>> mdx = MDX('example.mdx')
608 | >>> len(mdx)
609 | 42481
610 | >>> for key,value in mdx.items():
611 | ... print(key, value[:10])
612 | """
613 | def __init__(self, fname, encoding='', substyle=False, passcode=None):
614 | MDict.__init__(self, fname, encoding, passcode)
615 | self._substyle = substyle
616 |
617 | def items(self):
618 | """Return a generator which in turn produce tuples in the form of (key, value)
619 | """
620 | return self._decode_record_block()
621 |
622 | def _substitute_stylesheet(self, txt):
623 | # substitute stylesheet definition
624 | txt_list = re.split(r'`\d+`', txt)
625 | txt_tag = re.findall(r'`\d+`', txt)
626 | txt_styled = txt_list[0]
627 | for j, p in enumerate(txt_list[1:]):
628 | style = self._stylesheet[txt_tag[j][1:-1]]
629 | if p and p[-1] == '\n':
630 | txt_styled = txt_styled + style[0] + p.rstrip() + style[1] + '\r\n'
631 | else:
632 | txt_styled = txt_styled + style[0] + p + style[1]
633 | return txt_styled
634 |
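To see what `_substitute_stylesheet` does, here is the same logic run standalone on a made-up stylesheet entry (the `{'1': ('<b>', '</b>')}` pair is an assumption for illustration; real pairs come from the dictionary header):

```python
import re

stylesheet = {'1': ('<b>', '</b>')}    # hypothetical style pair
txt = '`1`word\n'
txt_list = re.split(r'`\d+`', txt)     # ['', 'word\n']
txt_tag = re.findall(r'`\d+`', txt)    # ['`1`']
styled = txt_list[0]
for j, p in enumerate(txt_list[1:]):
    opening, closing = stylesheet[txt_tag[j][1:-1]]
    if p and p[-1] == '\n':
        styled += opening + p.rstrip() + closing + '\r\n'
    else:
        styled += opening + p + closing
print(repr(styled))                    # '<b>word</b>\r\n'
```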
635 | def _decode_record_block(self):
636 | f = open(self._fname, 'rb')
637 | f.seek(self._record_block_offset)
638 |
639 | num_record_blocks = self._read_number(f)
640 | num_entries = self._read_number(f)
641 | assert(num_entries == self._num_entries)
642 | record_block_info_size = self._read_number(f)
643 | record_block_size = self._read_number(f)
644 |
645 | # record block info section
646 | record_block_info_list = []
647 | size_counter = 0
648 | for i in range(num_record_blocks):
649 | compressed_size = self._read_number(f)
650 | decompressed_size = self._read_number(f)
651 | record_block_info_list += [(compressed_size, decompressed_size)]
652 | size_counter += self._number_width * 2
653 | assert(size_counter == record_block_info_size)
654 |
655 | # actual record block data
656 | offset = 0
657 | i = 0
658 | size_counter = 0
659 | ### The final index table format:
660 | ### key_text (the key, also obtainable from the key list)
661 | ### file_pos (position where the record_block starts)
662 | ### compressed_size (size of the record_block before decompression)
663 | ### decompressed_size (size after decompression)
664 | ### record_block_type (compression type of the record_block)
665 | ### record_start (this and the next two are the parameters needed to pull one record out of the record_block; they can be stored as-is)
666 | ### record_end
667 | ### offset
668 | for compressed_size, decompressed_size in record_block_info_list:
669 | record_block_compressed = f.read(compressed_size)
670 | ###### recovering record_block_compressed requires compressed_size (which can be stored directly)
671 | ###### plus the current position of the file object f,
672 | ###### taken with f.tell(); rebuilding the index later needs f.seek()
673 | # 4 bytes indicates block compression type
674 | record_block_type = record_block_compressed[:4]
675 | # 4 bytes adler checksum of uncompressed content
676 | adler32 = unpack('>I', record_block_compressed[4:8])[0]
677 | # no compression
678 | if record_block_type == b'\x00\x00\x00\x00':
679 | record_block = record_block_compressed[8:]
680 | # lzo compression
681 | elif record_block_type == b'\x01\x00\x00\x00':
682 | if lzo is None:
683 | print("LZO compression is not supported")
684 | break
685 | # decompress
686 | header = b'\xf0' + pack('>I', decompressed_size)
687 | record_block = lzo.decompress(record_block_compressed[8:], initSize = decompressed_size, blockSize=1308672)
688 | # zlib compression
689 | elif record_block_type == b'\x02\x00\x00\x00':
690 | # decompress
691 | record_block = zlib.decompress(record_block_compressed[8:])
692 | ###### the crucial step is obtaining record_block, which is produced by decompression; there are three decompression paths in total
693 | ###### the information needed: record_block_compressed, decompressed_size,
694 | ###### and record_block_type,
695 | ###### plus the adler32 checksum for verification
696 | # notice that adler32 return signed value
697 | assert(adler32 == zlib.adler32(record_block) & 0xffffffff)
698 |
699 | assert(len(record_block) == decompressed_size)
700 | # split record block according to the offset info from key block
701 | while i < len(self._key_list):
702 | record_start, key_text = self._key_list[i]
703 | # reach the end of current record block
704 | if record_start - offset >= len(record_block):
705 | break
706 | # record end index
707 | if i < len(self._key_list) - 1:
708 | record_end = self._key_list[i + 1][0]
709 | else:
710 | record_end = len(record_block) + offset
711 | i += 1
712 | ############# need record_block, record_start, record_end,
713 | ############# and offset
714 | record = record_block[record_start - offset:record_end - offset]
715 | # convert to utf-8
716 | record = record.decode(self._encoding, errors='ignore').strip(u'\x00').encode('utf-8')
717 | # substitute styles
718 | ############# whether to substitute the stylesheet
719 | if self._substyle and self._stylesheet:
720 | record = self._substitute_stylesheet(record)
721 |
722 | yield key_text, record
723 | offset += len(record_block)
724 | size_counter += compressed_size
725 | assert(size_counter == record_block_size)
726 |
727 | f.close()
728 |
729 | ### Build the index list of the mdx file; each entry holds:
730 | ### key_text (the key, also obtainable from the key list)
731 | ### file_pos (position where the record_block starts)
732 | ### compressed_size (size of the record_block before decompression)
733 | ### decompressed_size (size after decompression)
734 | ### record_block_type (compression type of the record_block)
735 | ### record_start (this and the next two are the parameters needed to pull one record out of the record_block; they can be stored as-is)
736 | ### record_end
737 | ### offset
738 | ### plus the required metadata
739 | ###
740 | def get_index(self, check_block = True):
741 | ### the index list
742 | index_dict_list = []
743 | f = open(self._fname, 'rb')
744 | f.seek(self._record_block_offset)
745 |
746 | num_record_blocks = self._read_number(f)
747 | num_entries = self._read_number(f)
748 | assert(num_entries == self._num_entries)
749 | record_block_info_size = self._read_number(f)
750 | record_block_size = self._read_number(f)
751 |
752 | # record block info section
753 | record_block_info_list = []
754 | size_counter = 0
755 | for i in range(num_record_blocks):
756 | compressed_size = self._read_number(f)
757 | decompressed_size = self._read_number(f)
758 | record_block_info_list += [(compressed_size, decompressed_size)]
759 | size_counter += self._number_width * 2
760 | assert(size_counter == record_block_info_size)
761 |
762 | # actual record block data
763 | offset = 0
764 | i = 0
765 | size_counter = 0
766 | ### The final index table format:
767 | ### key_text (the key, also obtainable from the key list)
768 | ### file_pos (position where the record_block starts)
769 | ### compressed_size (size of the record_block before decompression)
770 | ### decompressed_size (size after decompression)
771 | ### record_block_type (compression type of the record_block)
772 | ### record_start (this and the next two are the parameters needed to pull one record out of the record_block; they can be stored as-is)
773 | ### record_end
774 | ### offset
775 | for compressed_size, decompressed_size in record_block_info_list:
776 | current_pos = f.tell()
777 | record_block_compressed = f.read(compressed_size)
778 | ###### recovering record_block_compressed requires compressed_size (which can be stored directly)
779 | ###### plus the current position of the file object f,
780 | ###### taken with f.tell(); rebuilding the index later needs f.seek()
781 | # 4 bytes indicates block compression type
782 | record_block_type = record_block_compressed[:4]
783 | # 4 bytes adler checksum of uncompressed content
784 | adler32 = unpack('>I', record_block_compressed[4:8])[0]
785 | # no compression
786 | if record_block_type == b'\x00\x00\x00\x00':
787 | _type = 0
788 | record_block = record_block_compressed[8:]
789 | # lzo compression
790 | elif record_block_type == b'\x01\x00\x00\x00':
791 | _type = 1
792 | if lzo is None:
793 | print("LZO compression is not supported")
794 | break
795 | # decompress
796 | header = b'\xf0' + pack('>I', decompressed_size)
797 | if check_block:
798 | record_block = lzo.decompress(record_block_compressed[8:], initSize = decompressed_size, blockSize=1308672)
799 | # zlib compression
800 | elif record_block_type == b'\x02\x00\x00\x00':
801 | # decompress
802 | _type = 2
803 | if check_block:
804 | record_block = zlib.decompress(record_block_compressed[8:])
805 | ###### the crucial step is obtaining record_block, which is produced by decompression; there are three decompression paths in total
806 | ###### the information needed: record_block_compressed, decompressed_size,
807 | ###### and record_block_type,
808 | ###### plus the adler32 checksum for verification
809 | # notice that adler32 return signed value
810 | if check_block:
811 | assert(adler32 == zlib.adler32(record_block) & 0xffffffff)
812 | assert(len(record_block) == decompressed_size)
813 | # split record block according to the offset info from key block
814 | while i < len(self._key_list):
815 | ### empty dict to hold the index info for this record
816 | index_dict = {}
817 | index_dict['file_pos'] = current_pos
818 | index_dict['compressed_size'] = compressed_size
819 | index_dict['decompressed_size'] = decompressed_size
820 | index_dict['record_block_type'] = _type
821 | record_start, key_text = self._key_list[i]
822 | index_dict['record_start'] = record_start
823 | index_dict['key_text'] = key_text.decode('utf-8')
824 | index_dict['offset'] = offset
825 | # reach the end of current record block
826 | if record_start - offset >= decompressed_size:
827 | break
828 | # record end index
829 | if i < len(self._key_list) - 1:
830 | record_end = self._key_list[i + 1][0]
831 | else:
832 | record_end = decompressed_size + offset
833 | index_dict['record_end'] = record_end
834 | i += 1
835 | ############# need record_block, record_start, record_end,
836 | ############# and offset
837 | if check_block:
838 | record = record_block[record_start - offset:record_end - offset]
839 | # convert to utf-8
840 | record = record.decode(self._encoding, errors='ignore').strip(u'\x00').encode('utf-8')
841 | # substitute styles
842 | ############# whether to substitute the stylesheet
843 | if self._substyle and self._stylesheet:
844 | record = self._substitute_stylesheet(record)
845 | index_dict_list.append(index_dict)
846 |
847 | offset += decompressed_size
848 | size_counter += compressed_size
849 | # todo: attention!!!
850 | #assert(size_counter == record_block_size)
851 | f.close()
852 | # slightly different from the mdd part: the encoding and the stylesheet info also need to be passed along
853 | meta = {}
854 | meta['encoding'] = self._encoding
855 | meta['stylesheet'] = json.dumps(self._stylesheet)
856 | meta['title'] = self._title
857 | meta['description'] = self._description
858 |
859 | return {"index_dict_list":index_dict_list, 'meta':meta}
860 | if __name__ == '__main__':
861 | import sys
862 | import os
863 | import os.path
864 | import argparse
865 | import codecs
866 |
867 | def passcode(s):
868 | try:
869 | regcode, userid = s.split(',')
870 | except:
871 | raise argparse.ArgumentTypeError("Passcode must be regcode,userid")
872 | try:
873 | regcode = codecs.decode(regcode, 'hex')
874 | except:
875 | raise argparse.ArgumentTypeError("regcode must be a 32 bytes hexadecimal string")
876 | return regcode, userid
877 |
878 | parser = argparse.ArgumentParser()
879 | parser.add_argument('-x', '--extract', action="store_true",
880 | help='extract mdx to source format and extract files from mdd')
881 | parser.add_argument('-s', '--substyle', action="store_true",
882 | help='substitute style definition if present')
883 | parser.add_argument('-d', '--datafolder', default="data",
884 | help='folder to extract data files from mdd')
885 | parser.add_argument('-e', '--encoding', default="",
886 | help='encoding of the mdx dictionary file')
887 | parser.add_argument('-p', '--passcode', default=None, type=passcode,
888 | help='register_code,email_or_deviceid')
889 | parser.add_argument("filename", nargs='?', help="mdx file name")
890 | args = parser.parse_args()
891 |
892 | # use GUI to select file, default to extract
893 | if not args.filename:
894 | import Tkinter
895 | import tkFileDialog
896 | root = Tkinter.Tk()
897 | root.withdraw()
898 | args.filename = tkFileDialog.askopenfilename(parent=root)
899 | args.extract = True
900 |
901 | if not os.path.exists(args.filename):
902 | print("Please specify a valid MDX/MDD file")
903 |
904 | base, ext = os.path.splitext(args.filename)
905 |
906 | # read mdx file
907 | if ext.lower() == os.path.extsep + 'mdx':
908 | mdx = MDX(args.filename, args.encoding, args.substyle, args.passcode)
909 | if type(args.filename) is unicode:
910 | bfname = args.filename.encode('utf-8')
911 | else:
912 | bfname = args.filename
913 | print('======== %s ========' % bfname)
914 | print(' Number of Entries : %d' % len(mdx))
915 | for key, value in mdx.header.items():
916 | print(' %s : %s' % (key, value))
917 | else:
918 | mdx = None
919 |
920 | # find companion mdd file
921 | mdd_filename = ''.join([base, os.path.extsep, 'mdd'])
922 | if os.path.exists(mdd_filename):
923 | mdd = MDD(mdd_filename, args.passcode)
924 | if type(mdd_filename) is unicode:
925 | bfname = mdd_filename.encode('utf-8')
926 | else:
927 | bfname = mdd_filename
928 | print('======== %s ========' % bfname)
929 | print(' Number of Entries : %d' % len(mdd))
930 | for key, value in mdd.header.items():
931 | print(' %s : %s' % (key, value))
932 | else:
933 | mdd = None
934 |
935 | if args.extract:
936 | # write out glos
937 | if mdx:
938 | output_fname = ''.join([base, os.path.extsep, 'txt'])
939 | tf = open(output_fname, 'wb')
940 | for key, value in mdx.items():
941 | tf.write(key)
942 | tf.write(b'\r\n')
943 | tf.write(value)
944 | if not value.endswith(b'\n'):
945 | tf.write(b'\r\n')
946 | tf.write(b'>\r\n')
947 | tf.close()
948 | # write out style
949 | if mdx.header.get('StyleSheet'):
950 | style_fname = ''.join([base, '_style', os.path.extsep, 'txt'])
951 | sf = open(style_fname, 'wb')
952 | sf.write(b'\r\n'.join(mdx.header['StyleSheet'].splitlines()))
953 | sf.close()
954 | # write out optional data files
955 | if mdd:
956 | datafolder = os.path.join(os.path.dirname(args.filename), args.datafolder)
957 | if not os.path.exists(datafolder):
958 | os.makedirs(datafolder)
959 | for key, value in mdd.items():
960 | fname = key.decode('utf-8').replace('\\', os.path.sep)
961 | dfname = datafolder + fname
962 | if not os.path.exists(os.path.dirname(dfname)):
963 | os.makedirs(os.path.dirname(dfname))
964 | df = open(dfname, 'wb')
965 | df.write(value)
966 | df.close()
967 |
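As a usage note: running, say, `python readmdict.py -x example.mdx` under Python 2 (which the `Tkinter` and `unicode` branches above assume) prints the header fields, dumps the entries to `example.txt`, and extracts any companion `example.mdd` into the folder given by `-d` (default `data`).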
--------------------------------------------------------------------------------
/ripemd128.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright by https://github.com/zhansliu/writemdict
3 |
4 | ripemd128.py - A simple ripemd128 library in pure Python.
5 |
6 | Supports both Python 2 (versions >= 2.6) and Python 3.
7 |
8 | Usage:
9 | from ripemd128 import ripemd128
10 | digest = ripemd128(b"The quick brown fox jumps over the lazy dog")
11 | assert(digest == b"\x3f\xa9\xb5\x7f\x05\x3c\x05\x3f\xbe\x27\x35\xb2\x38\x0d\xb5\x96")
12 |
13 | """
14 |
15 |
16 |
17 | import struct
18 |
19 |
20 | # follows this description: http://homes.esat.kuleuven.be/~bosselae/ripemd/rmd128.txt
21 |
22 | def f(j, x, y, z):
23 | assert(0 <= j and j < 64)
24 | if j < 16:
25 | return x ^ y ^ z
26 | elif j < 32:
27 | return (x & y) | (z & ~x)
28 | elif j < 48:
29 | return (x | (0xffffffff & ~y)) ^ z
30 | else:
31 | return (x & z) | (y & ~z)
32 |
33 | def K(j):
34 | assert(0 <= j and j < 64)
35 | if j < 16:
36 | return 0x00000000
37 | elif j < 32:
38 | return 0x5a827999
39 | elif j < 48:
40 | return 0x6ed9eba1
41 | else:
42 | return 0x8f1bbcdc
43 |
44 | def Kp(j):
45 | assert(0 <= j and j < 64)
46 | if j < 16:
47 | return 0x50a28be6
48 | elif j < 32:
49 | return 0x5c4dd124
50 | elif j < 48:
51 | return 0x6d703ef3
52 | else:
53 | return 0x00000000
54 |
55 | def padandsplit(message):
56 | """
57 | returns a two-dimensional array X[i][j] of 32-bit integers, where j ranges
58 | from 0 to 15.
59 | First pads the message so that its length in bytes is congruent to 56
60 | (mod 64), by adding a byte 0x80 and then padding with 0x00 bytes until
61 | that length is reached. Then appends the little-endian
62 | 64-bit representation of the original length. Finally, splits the result
63 | up into 64-byte blocks, which are further parsed as 32-bit integers.
64 | """
65 | origlen = len(message)
66 | padlength = 64 - ((origlen - 56) % 64) #minimum padding is 1!
67 | message += b"\x80"
68 | message += b"\x00" * (padlength - 1)
69 | message += struct.pack("<Q", 8 * origlen)
70 |
71 | blocks = []
72 | for i in range(0, len(message), 64):
73 | block = []
74 | for j in range(0, 64, 4):
75 | block.append(struct.unpack("<L", message[i+j:i+j+4])[0])
76 | blocks.append(block)
77 | return blocks
78 |
79 |
80 | def add(*args):
81 | return sum(args) & 0xffffffff
82 |
83 |
84 | def rol(s, x):
85 | return (x << s | x >> (32-s)) & 0xffffffff
86 |
87 | r = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
88 | 7, 4,13, 1,10, 6,15, 3,12, 0, 9, 5, 2,14,11, 8,
89 | 3,10,14, 4, 9,15, 8, 1, 2, 7, 0, 6,13,11, 5,12,
90 | 1, 9,11,10, 0, 8,12, 4,13, 3, 7,15,14, 5, 6, 2]
91 | rp = [ 5,14, 7, 0, 9, 2,11, 4,13, 6,15, 8, 1,10, 3,12,
92 | 6,11, 3, 7, 0,13, 5,10,14,15, 8,12, 4, 9, 1, 2,
93 | 15, 5, 1, 3, 7,14, 6, 9,11, 8,12, 2,10, 0, 4,13,
94 | 8, 6, 4, 1, 3,11,15, 0, 5,12, 2,13, 9, 7,10,14]
95 | s = [11,14,15,12, 5, 8, 7, 9,11,13,14,15, 6, 7, 9, 8,
96 | 7, 6, 8,13,11, 9, 7,15, 7,12,15, 9,11, 7,13,12,
97 | 11,13, 6, 7,14, 9,13,15,14, 8,13, 6, 5,12, 7, 5,
98 | 11,12,14,15,14,15, 9, 8, 9,14, 5, 6, 8, 6, 5,12]
99 | sp = [ 8, 9, 9,11,13,15,15, 5, 7, 7, 8,11,14,14,12, 6,
100 | 9,13,15, 7,12, 8, 9,11, 7, 7,12, 7, 6,15,13,11,
101 | 9, 7,15,11, 8, 6, 6,14,12,13, 5,14,13,13, 7, 5,
102 | 15, 5, 8,11,14,14, 6,14, 6, 9,12, 9,12, 5,15, 8]
103 |
104 |
105 | def ripemd128(message):
106 | h0 = 0x67452301
107 | h1 = 0xefcdab89
108 | h2 = 0x98badcfe
109 | h3 = 0x10325476
110 | X = padandsplit(message)
111 | for i in range(len(X)):
112 | (A,B,C,D) = (h0,h1,h2,h3)
113 | (Ap,Bp,Cp,Dp) = (h0,h1,h2,h3)
114 | for j in range(64):
115 | T = rol(s[j], add(A, f(j,B,C,D), X[i][r[j]], K(j)))
116 | (A,D,C,B) = (D,C,B,T)
117 | T = rol(sp[j], add(Ap, f(63-j,Bp,Cp,Dp), X[i][rp[j]], Kp(j)))
118 | (Ap,Dp,Cp,Bp)=(Dp,Cp,Bp,T)
119 | T = add(h1,C,Dp)
120 | h1 = add(h2,D,Ap)
121 | h2 = add(h3,A,Bp)
122 | h3 = add(h0,B,Cp)
123 | h0 = T
124 |
125 |
126 | return struct.pack("<LLLL", h0, h1, h2, h3)
--------------------------------------------------------------------------------
/templates/all.html:
--------------------------------------------------------------------------------
[HTML tags lost in extraction: a Jinja2 page titled "All Available Dictionary" that renders one entry per item via {% for item in dicts %} ... {% endfor %}.]
12 |