├── .gitattributes ├── .gitignore ├── README.md ├── cache └── o3.js ├── lzo.py ├── mdict-query.pyproj ├── mdict-query.sln ├── mdict_dir.py ├── mdict_query.py ├── mdx └── drop mdict files here.txt ├── pureSalsa20.py ├── readmdict.py ├── ripemd128.py ├── static └── cache here.txt ├── templates ├── all.html ├── dict.html └── entry.html ├── test.py ├── test_lzo.py ├── web.py ├── web.spec └── wsgi.py /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 4 | # Custom for Visual Studio 5 | *.cs diff=csharp 6 | 7 | # Standard to msysgit 8 | *.doc diff=astextplain 9 | *.DOC diff=astextplain 10 | *.docx diff=astextplain 11 | *.DOCX diff=astextplain 12 | *.dot diff=astextplain 13 | *.DOT diff=astextplain 14 | *.pdf diff=astextplain 15 | *.PDF diff=astextplain 16 | *.rtf diff=astextplain 17 | *.RTF diff=astextplain 18 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | ################# 2 | ## Eclipse 3 | ################# 4 | 5 | *.pydevproject 6 | .project 7 | .metadata 8 | bin/ 9 | tmp/ 10 | *.tmp 11 | *.bak 12 | *.swp 13 | *~.nib 14 | local.properties 15 | .classpath 16 | .settings/ 17 | .loadpath 18 | *.mdd 19 | *.mdx 20 | *.db 21 | *.jpg 22 | *.png 23 | *.gif 24 | *.mp3 25 | *.css 26 | mdx/* 27 | static/* 28 | 29 | # External tool builders 30 | .externalToolBuilders/ 31 | 32 | # Locally stored "Eclipse launch configurations" 33 | *.launch 34 | 35 | # CDT-specific 36 | .cproject 37 | 38 | # PDT-specific 39 | .buildpath 40 | 41 | 42 | ################# 43 | ## Visual Studio 44 | ################# 45 | 46 | ## Ignore Visual Studio temporary files, build results, and 47 | ## files generated by popular Visual Studio add-ons. 
48 |
49 | # User-specific files
50 | *.suo
51 | *.user
52 | *.sln.docstates
53 |
54 | # Build results
55 |
56 | [Dd]ebug/
57 | [Rr]elease/
58 | x64/
59 | build/
60 | [Bb]in/
61 | [Oo]bj/
62 |
63 | # MSTest test Results
64 | [Tt]est[Rr]esult*/
65 | [Bb]uild[Ll]og.*
66 |
67 | *_i.c
68 | *_p.c
69 | *.ilk
70 | *.meta
71 | *.obj
72 | *.pch
73 | *.pdb
74 | *.pgc
75 | *.pgd
76 | *.rsp
77 | *.sbr
78 | *.tlb
79 | *.tli
80 | *.tlh
81 | *.tmp
82 | *.tmp_proj
83 | *.log
84 | *.vspscc
85 | *.vssscc
86 | .builds
87 | *.pidb
88 | *.log
89 | *.scc
90 |
91 | # Visual C++ cache files
92 | ipch/
93 | *.aps
94 | *.ncb
95 | *.opensdf
96 | *.sdf
97 | *.cachefile
98 |
99 | # Visual Studio profiler
100 | *.psess
101 | *.vsp
102 | *.vspx
103 |
104 | # Guidance Automation Toolkit
105 | *.gpState
106 |
107 | # ReSharper is a .NET coding add-in
108 | _ReSharper*/
109 | *.[Rr]e[Ss]harper
110 |
111 | # TeamCity is a build add-in
112 | _TeamCity*
113 |
114 | # DotCover is a Code Coverage Tool
115 | *.dotCover
116 |
117 | # NCrunch
118 | *.ncrunch*
119 | .*crunch*.local.xml
120 |
121 | # Installshield output folder
122 | [Ee]xpress/
123 |
124 | # DocProject is a documentation generator add-in
125 | DocProject/buildhelp/
126 | DocProject/Help/*.HxT
127 | DocProject/Help/*.HxC
128 | DocProject/Help/*.hhc
129 | DocProject/Help/*.hhk
130 | DocProject/Help/*.hhp
131 | DocProject/Help/Html2
132 | DocProject/Help/html
133 |
134 | # Click-Once directory
135 | publish/
136 |
137 | # Publish Web Output
138 | *.Publish.xml
139 | *.pubxml
140 | *.publishproj
141 |
142 | # NuGet Packages Directory
143 | ## TODO: If you have NuGet Package Restore enabled, uncomment the next line
144 | #packages/
145 |
146 | # Windows Azure Build Output
147 | csx
148 | *.build.csdef
149 |
150 | # Windows Store app package directory
151 | AppPackages/
152 |
153 | # Others
154 | sql/
155 | *.Cache
156 | ClientBin/
157 | [Ss]tyle[Cc]op.*
158 | ~$*
159 | *~
160 | *.dbmdl
161 | *.[Pp]ublish.xml
162 | *.pfx
163 | *.publishsettings
164 |
165 | # RIA/Silverlight projects
166 | Generated_Code/
167 |
168 | # Backup & report files from converting an old project file to a newer
169 | # Visual Studio version. Backup files are not needed, because we have git ;-)
170 | _UpgradeReport_Files/
171 | Backup*/
172 | UpgradeLog*.XML
173 | UpgradeLog*.htm
174 |
175 | # SQL Server files
176 | App_Data/*.mdf
177 | App_Data/*.ldf
178 |
179 | #############
180 | ## Windows detritus
181 | #############
182 |
183 | # Windows image file caches
184 | Thumbs.db
185 | ehthumbs.db
186 |
187 | # Folder config file
188 | Desktop.ini
189 |
190 | # Recycle Bin used on file shares
191 | $RECYCLE.BIN/
192 |
193 | # Mac crap
194 | .DS_Store
195 |
196 |
197 | #############
198 | ## Python
199 | #############
200 |
201 | *.py[cod]
202 |
203 | # Packages
204 | *.egg
205 | *.egg-info
206 | dist/
207 | build/
208 | eggs/
209 | parts/
210 | var/
211 | sdist/
212 | develop-eggs/
213 | .installed.cfg
214 |
215 | # Installer logs
216 | pip-log.txt
217 |
218 | # Unit test / coverage reports
219 | .coverage
220 | .tox
221 |
222 | #Translations
223 | *.mo
224 |
225 | #Mr Developer
226 | .mr.developer.cfg
227 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | This is a Python module for looking up `mdict` dictionary files (`.mdx` and `.mdd`). A function for converting `mdx` to `sqlite` is also included.
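The generated sqlite file can also be consumed without this module. A minimal sketch (assuming the `MDX_DICT (key, value)` table layout that `IndexBuilder.make_sqlite` in `mdict_query.py` creates, and a hypothetical `ode.mdx.sqlite.db` built from `ode.mdx`):

```python
import sqlite3

# Open the database written by IndexBuilder.make_sqlite().
conn = sqlite3.connect('ode.mdx.sqlite.db')
# MDX_DICT holds one (key, value) row per headword.
cursor = conn.execute('SELECT value FROM MDX_DICT WHERE key = ?', ('dedication',))
for (definition,) in cursor:
    print(definition)
conn.close()
```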
2 |
3 | > Based on [readmdict](https://bitbucket.org/xwang/mdict-analysis) by [Xiaoqiang Wang](http://bitbucket.org/xwang/).
4 |
5 | While this project is a trivial extension of the [original module](https://bitbucket.org/xwang/mdict-analysis), it adds the ability to look up a single entry in an `.mdx` file, or a single resource in an `.mdd` file, without extracting all the content, which may be helpful in other projects that require dictionaries.
6 |
7 | ## Usage
8 |
9 | Construct an `IndexBuilder` object, which builds the sqlite index for the `.mdx` file and the corresponding `.mdd` file (if it exists).
10 |
11 |     from mdict_query import IndexBuilder
12 |     builder = IndexBuilder('ode.mdx')
13 |
14 | Convert `mdx` to `sqlite`:
15 | ```
16 | builder.make_sqlite()
17 | # Check the output file `ode.mdx.sqlite.db` next to your `ode.mdx`
18 | ```
19 |
20 |
21 | Get all mdx keys:
22 |
23 |     builder.get_mdx_keys()
24 |     # ==> ['key1', 'key2', 'key3', ...]
25 |
26 | Filter mdx keys by wildcard:
27 |
28 |     builder.get_mdx_keys('dedicat*')
29 |     # ==> ['dedicate', 'dedication', ...]
30 |
31 | Look up mdx with a key:
32 |
33 |     result_text = builder.mdx_lookup('dedication')
34 |
35 | There is an option to ignore case:
36 |
37 |     result_text = builder.mdx_lookup('Dedication', ignorecase = True)
38 |
39 | Get all mdd keys:
40 |
41 |     builder.get_mdd_keys()
42 |     # ==> ['key1', 'key2', 'key3', ...]
43 |
44 | Filter mdd keys by wildcard:
45 |
46 |     builder.get_mdd_keys('*.css')
47 |     # ==> ['/style.css', ...]
48 |
49 | Look up mdd with a key:
50 |
51 |     bytes_list = builder.mdd_lookup('/style.css')
52 |     # bytes_list is a list holding the bytes of the file stored in the mdd
53 |
54 |
--------------------------------------------------------------------------------
/cache/o3.js:
--------------------------------------------------------------------------------
1 | var o0e=(function(){return{e:function(c,d){var n=d==2?c.nextSibling:c.parentNode.nextSibling;if(!d)n=n.childNodes[0];var s=n.style;if(s.display!="block")s.display="block";else s.display="none";},a:function(c,d,f){c.removeAttribute("onclick");var s=c.style;s.cursor="default";s.outline="1px dotted gray";var m=/([^//]+)$/.exec(f);
2 | if(m){var u="http://audio.oxforddictionaries.com/en/mp3/"+m[0].replace('__','_')+".mp3";var b=function(){s.outline="";s.cursor="pointer";c.setAttribute("onclick","o0e.a(this,"+d+",'"+f+"')");};var t=setTimeout(b,2000);try{with(document.createElement("audio")){setAttribute("src",u);onloadstart=function(){clearTimeout(t);};onended=b;play();}}catch(e){c.style.outline="";}}},x:function(c){var s=c.parentNode.nextSibling.style;if(s.display!="none"){s.display="none";c.className="yuq";}else{s.display="block";c.className="aej";}},p:function(c){if(c.className=="j02")c.className="g4p";else c.className="j02";}}}());
--------------------------------------------------------------------------------
/lzo.py:
--------------------------------------------------------------------------------
1 | import math
2 |
3 |
4 | class FlexBuffer():
5 |
6 |     def __init__(self):
7 |
8 |         self.blockSize = None
9 |         self.c = None
10 |         self.l = None
11 |         self.buf = None
12 |
13 |     def require(self, n):
14 |
15 |         r = self.c - self.l + n
16 |         if r > 0:
17 |             self.l = self.l + self.blockSize * math.ceil(r / self.blockSize)
18 |             #tmp = bytearray(self.l)
19 |             #for i in len(self.buf):
20 |             #    tmp[i] = self.buf[i]
21 |             #self.buf = tmp
22 |             self.buf = self.buf + bytearray(self.l - len(self.buf))
23 |         self.c = self.c + n
24 |         return self.buf
25 |
26 |     def alloc(self, initSize, blockSize):
27 |
28 |         if blockSize:
29 |             sz =
blockSize 30 | else: 31 | sz = 4096 32 | self.blockSize = self.roundUp(sz) 33 | self.c = 0 34 | self.l = self.roundUp(initSize) | 0 35 | self.l += self.blockSize - (self.l % self.blockSize) 36 | self.buf = bytearray(self.l) 37 | return self.buf 38 | 39 | def roundUp(self, n): 40 | 41 | r = n % 4 42 | if r == 0: 43 | return n 44 | else: 45 | return n + 4 - r 46 | 47 | def reset(self): 48 | 49 | self.c = 0 50 | self.l = len(self.buf) 51 | 52 | def pack(self, size): 53 | 54 | return self.buf[0:size] 55 | 56 | def _decompress(inBuf, outBuf): 57 | 58 | c_top_loop = 1 59 | c_first_literal_run = 2 60 | c_match = 3 61 | c_copy_match = 4 62 | c_match_done = 5 63 | c_match_next = 6 64 | 65 | out = outBuf.buf 66 | op = 0 67 | ip = 0 68 | t = inBuf[ip] 69 | state = c_top_loop 70 | m_pos = 0 71 | ip_end = len(inBuf) 72 | 73 | if t > 17: 74 | ip = ip + 1 75 | t = t - 17 76 | if t < 4: 77 | state = c_match_next 78 | else: 79 | out = outBuf.require(t) 80 | while True: 81 | out[op] = inBuf[ip] 82 | op = op + 1 83 | ip = ip + 1 84 | t = t - 1 85 | if not t > 0: break 86 | state = c_first_literal_run 87 | 88 | while True: 89 | if_block = False 90 | 91 | ## 92 | if state == c_top_loop: 93 | t = inBuf[ip] 94 | ip = ip + 1 95 | if t >= 16: 96 | state = c_match 97 | continue 98 | if t == 0: 99 | while inBuf[ip] == 0: 100 | t = t + 255 101 | ip = ip + 1 102 | t = t + 15 + inBuf[ip] 103 | ip = ip + 1 104 | 105 | t = t + 3 106 | out = outBuf.require(t) 107 | while True: 108 | out[op] = inBuf[ip] 109 | op = op + 1 110 | ip = ip + 1 111 | t = t - 1 112 | if not t > 0: break 113 | # emulate c switch 114 | state = c_first_literal_run 115 | 116 | ## 117 | if state == c_first_literal_run: 118 | t = inBuf[ip] 119 | ip = ip + 1 120 | if t >= 16: 121 | state = c_match 122 | continue 123 | m_pos = op - 0x801 - (t >> 2) - (inBuf[ip] << 2) 124 | ip = ip + 1 125 | out = outBuf.require(3) 126 | out[op] = out[m_pos] 127 | op = op + 1 128 | m_pos = m_pos + 1 129 | out[op] = out[m_pos] 130 | op = op + 1 131 | m_pos = m_pos + 1 132 | out[op] = out[m_pos] 133 | op = op + 1 134 | 135 | state = c_match_done 136 | continue 137 | 138 | ## 139 | if state == c_match: 140 | if t >= 64: 141 | m_pos = op - 1 - ((t >> 2) & 7) - (inBuf[ip] << 3) 142 | ip = ip + 1 143 | t = (t >> 5) - 1 144 | state = c_copy_match 145 | continue 146 | elif t >= 32: 147 | t = t & 31 148 | if t == 0: 149 | while inBuf[ip] == 0: 150 | t = t + 255 151 | ip = ip + 1 152 | t = t + 31 + inBuf[ip] 153 | ip = ip + 1 154 | m_pos = op - 1 - ((inBuf[ip] + (inBuf[ip + 1] << 8)) >> 2) 155 | ip = ip + 2 156 | elif t >= 16: 157 | m_pos = op - ((t & 8) << 11) 158 | t = t & 7 159 | if t == 0: 160 | while inBuf[ip] == 0: 161 | t = t + 255 162 | ip = ip + 1 163 | t = t + 7 + inBuf[ip] 164 | ip = ip + 1 165 | m_pos = m_pos - ((inBuf[ip] + (inBuf[ip + 1] << 8)) >> 2) 166 | ip = ip + 2 167 | if m_pos == op: 168 | break 169 | m_pos = m_pos - 0x4000 170 | else: 171 | m_pos = op - 1 - (t >> 2) - (inBuf[ip] << 2); 172 | ip = ip + 1 173 | out = outBuf.require(2) 174 | out[op] = out[m_pos] 175 | op = op + 1 176 | m_pos = m_pos + 1 177 | out[op] = out[m_pos] 178 | op = op + 1 179 | state = c_match_done 180 | continue 181 | 182 | if t >= 6 and (op - m_pos) >= 4: 183 | if_block = True 184 | t += 2 185 | out = outBuf.require(t) 186 | while True: 187 | out[op] = out[m_pos] 188 | op += 1 189 | m_pos += 1 190 | t -= 1 191 | if not t > 0: break 192 | #emulate c switch 193 | state = c_copy_match 194 | 195 | ## 196 | if state == c_copy_match: 197 | if not if_block: 198 | t += 2 199 | out = 
outBuf.require(t) 200 | while True: 201 | out[op] = out[m_pos] 202 | op += 1 203 | m_pos += 1 204 | t -= 1 205 | if not t > 0: break 206 | #emulating c switch 207 | state = c_match_done 208 | 209 | ## 210 | if state == c_match_done: 211 | t = inBuf[ip - 2] & 3 212 | if t == 0: 213 | state = c_top_loop 214 | continue 215 | #emulate c switch 216 | state = c_match_next 217 | 218 | ## 219 | if state == c_match_next: 220 | out = outBuf.require(1) 221 | out[op] = inBuf[ip] 222 | op += 1 223 | ip += 1 224 | if t > 1: 225 | out = outBuf.require(1) 226 | out[op] = inBuf[ip] 227 | op += 1 228 | ip += 1 229 | if t > 2: 230 | out = outBuf.require(1) 231 | out[op] = inBuf[ip] 232 | op += 1 233 | ip += 1 234 | t = inBuf[ip] 235 | ip += 1 236 | state = c_match 237 | continue 238 | 239 | return bytes(outBuf.pack(op)) 240 | 241 | def decompress(input, initSize = 16000, blockSize = 8192): 242 | output = FlexBuffer() 243 | output.alloc(initSize, blockSize) 244 | return _decompress(bytearray(input), output) 245 | 246 | 247 | -------------------------------------------------------------------------------- /mdict-query.pyproj: -------------------------------------------------------------------------------- 1 |  2 | 3 | 4 | Debug 5 | 2.0 6 | {f227ad7e-74e4-4364-ad90-d2b3dda5abf6} 7 | 8 | test_lzo.py 9 | 10 | . 11 | . 12 | {888888a0-9f3d-457c-b088-3a5042f75d52} 13 | Standard Python launcher 14 | {9a7a9026-48c1-4688-9d5d-e5699d47d074} 15 | 3.5 16 | False 17 | 18 | 19 | 20 | 21 | 10.0 22 | $(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion)\Python Tools\Microsoft.PythonTools.targets 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | Code 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | -------------------------------------------------------------------------------- /mdict-query.sln: -------------------------------------------------------------------------------- 1 |  2 | Microsoft Visual Studio Solution File, Format Version 12.00 3 | # Visual Studio 14 4 | VisualStudioVersion = 14.0.25420.1 5 | MinimumVisualStudioVersion = 10.0.40219.1 6 | Project("{888888A0-9F3D-457C-B088-3A5042F75D52}") = "mdict-query", "mdict-query.pyproj", "{F227AD7E-74E4-4364-AD90-D2B3DDA5ABF6}" 7 | EndProject 8 | Global 9 | GlobalSection(SolutionConfigurationPlatforms) = preSolution 10 | Debug|Any CPU = Debug|Any CPU 11 | Release|Any CPU = Release|Any CPU 12 | EndGlobalSection 13 | GlobalSection(ProjectConfigurationPlatforms) = postSolution 14 | {F227AD7E-74E4-4364-AD90-D2B3DDA5ABF6}.Debug|Any CPU.ActiveCfg = Debug|Any CPU 15 | {F227AD7E-74E4-4364-AD90-D2B3DDA5ABF6}.Release|Any CPU.ActiveCfg = Release|Any CPU 16 | EndGlobalSection 17 | GlobalSection(SolutionProperties) = preSolution 18 | HideSolutionNode = FALSE 19 | EndGlobalSection 20 | EndGlobal 21 | -------------------------------------------------------------------------------- /mdict_dir.py: -------------------------------------------------------------------------------- 1 | from mdict_query import IndexBuilder 2 | import os 3 | import json 4 | 5 | 6 | class Dir(object): 7 | 8 | def __init__(self, mdict_dir, config_name = 'config.json'): 9 | 10 | assert(os.path.isdir(mdict_dir)) 11 | self._mdict_dir = mdict_dir 12 | self._config_file_base_name = config_name 13 | self._config = {} 14 | #check config.json 15 | self._config_file = os.path.join(mdict_dir, self._config_file_base_name) 16 | 17 | if os.path.exists(self._config_file): 18 | self._ensure_config_consistency() 19 | self._load_config() 20 | self._add_builder() 21 | pass 22 | else: 23 | self._build_index() 24 | 
self._make_config()
25 |             self._dump_config()
26 |             self._add_builder()
27 |             pass
28 |
29 |     def _add_builder(self):
30 |
31 |         for dict in self._config['dicts']:
32 |             dict['builder'] = IndexBuilder(dict['mdx_name'])
33 |
34 |
35 |     def _load_config(self):
36 |
37 |         file_opened = open(self._config_file, 'r', encoding = 'utf-8')
38 |         self._config = json.load(file_opened)
39 |         file_opened.close()
40 |
41 |
42 |     def _build_index(self):
43 |
44 |         dict_list = []
45 |         files_in_dir = os.listdir(self._mdict_dir)
46 |         for item in files_in_dir:
47 |             full_name = os.path.join(self._mdict_dir, item)
48 |             print(full_name)
49 |             if os.path.isfile(full_name):
50 |                 _filename, _file_extension = os.path.splitext(full_name)
51 |                 if _file_extension == '.mdx':
52 |                     _config_single_dic = {
53 |                         'title': '',
54 |                         'description':'',
55 |                         'mdx_name': full_name,
56 |                         'has_mdd': os.path.isfile(_filename + '.mdd')
57 |                     }
58 |                     try:
59 |                         ib = IndexBuilder(full_name)
60 |                     except Exception:
61 |                         continue
62 |                     _config_single_dic['title'] = ib._title
63 |                     _config_single_dic['description'] = ib._description
64 |                     dict_list.append(_config_single_dic)
65 |         self._config['dicts'] = dict_list
66 |
67 |     def _make_config(self):
68 |         pass
69 |
70 |     def _dump_config(self):
71 |
72 |         file_opened = open(self._config_file, 'w', encoding = 'utf-8')
73 |         json.dump(self._config, file_opened, ensure_ascii = False, indent = True)
74 |         file_opened.close()
75 |
76 |     #todo: implement ensure consistency
77 |     def _ensure_config_consistency(self):
78 |         pass
79 |
80 | Dir('mdx')
81 |
--------------------------------------------------------------------------------
/mdict_query.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 |
4 | from .readmdict import MDX, MDD
5 | from struct import pack, unpack
6 | from io import BytesIO
7 | import re
8 | import sys
9 | import os
10 | import sqlite3
11 | import json
12 |
13 | # zlib compression is used for engine version >=2.0
14 | import zlib
15 | # LZO compression is used for engine version < 2.0
16 | try:
17 |     import lzo
18 | except ImportError:
19 |     lzo = None
20 |     #print("LZO compression support is not available")
21 |
22 | # 2x3 compatible
23 | if sys.hexversion >= 0x03000000:
24 |     unicode = str
25 |
26 | version = '1.1'
27 |
28 |
29 | class IndexBuilder(object):
30 |     #todo: enable history
31 |     def __init__(self, fname, encoding = "", passcode = None, force_rebuild = False, enable_history = False, sql_index = True, check = False):
32 |         self._mdx_file = fname
33 |         self._mdd_file = ""
34 |         self._encoding = ''
35 |         self._stylesheet = {}
36 |         self._title = ''
37 |         self._version = ''
38 |         self._description = ''
39 |         self._sql_index = sql_index
40 |         self._check = check
41 |         _filename, _file_extension = os.path.splitext(fname)
42 |         assert(_file_extension == '.mdx')
43 |         assert(os.path.isfile(fname))
44 |         self._mdx_db = _filename + ".mdx.db"
45 |         # make index anyway
46 |         if force_rebuild:
47 |             self._make_mdx_index(self._mdx_db)
48 |             if os.path.isfile(_filename + '.mdd'):
49 |                 self._mdd_file = _filename + ".mdd"
50 |                 self._mdd_db = _filename + ".mdd.db"
51 |                 self._make_mdd_index(self._mdd_db)
52 |
53 |         if os.path.isfile(self._mdx_db):
54 |             #read from META table
55 |             conn = sqlite3.connect(self._mdx_db)
56 |             #cursor = conn.execute("SELECT * FROM META")
57 |             cursor = conn.execute("SELECT * FROM META WHERE key = \"version\"")
58 |             # check whether version info exists
59 |             for cc in cursor:
60 |                 self._version = cc[1]
61 |             ################# if no version info #############
62 |             if not
self._version: 63 | print("version info not found") 64 | conn.close() 65 | self._make_mdx_index(self._mdx_db) 66 | print("mdx.db rebuilt!") 67 | if os.path.isfile(_filename + '.mdd'): 68 | self._mdd_file = _filename + ".mdd" 69 | self._mdd_db = _filename + ".mdd.db" 70 | self._make_mdd_index(self._mdd_db) 71 | print("mdd.db rebuilt!") 72 | return None 73 | cursor = conn.execute("SELECT * FROM META WHERE key = \"encoding\"") 74 | for cc in cursor: 75 | self._encoding = cc[1] 76 | cursor = conn.execute("SELECT * FROM META WHERE key = \"stylesheet\"") 77 | for cc in cursor: 78 | self._stylesheet = json.loads(cc[1]) 79 | 80 | cursor = conn.execute("SELECT * FROM META WHERE key = \"title\"") 81 | for cc in cursor: 82 | self._title = cc[1] 83 | 84 | cursor = conn.execute("SELECT * FROM META WHERE key = \"description\"") 85 | for cc in cursor: 86 | self._description = cc[1] 87 | 88 | #for cc in cursor: 89 | # if cc[0] == 'encoding': 90 | # self._encoding = cc[1] 91 | # continue 92 | # if cc[0] == 'stylesheet': 93 | # self._stylesheet = json.loads(cc[1]) 94 | # continue 95 | # if cc[0] == 'title': 96 | # self._title = cc[1] 97 | # continue 98 | # if cc[0] == 'title': 99 | # self._description = cc[1] 100 | else: 101 | self._make_mdx_index(self._mdx_db) 102 | 103 | if os.path.isfile(_filename + ".mdd"): 104 | self._mdd_file = _filename + ".mdd" 105 | self._mdd_db = _filename + ".mdd.db" 106 | if not os.path.isfile(self._mdd_db): 107 | self._make_mdd_index(self._mdd_db) 108 | pass 109 | 110 | 111 | def _replace_stylesheet(self, txt): 112 | # substitute stylesheet definition 113 | txt_list = re.split('`\d+`', txt) 114 | txt_tag = re.findall('`\d+`', txt) 115 | txt_styled = txt_list[0] 116 | for j, p in enumerate(txt_list[1:]): 117 | style = self._stylesheet[txt_tag[j][1:-1]] 118 | if p and p[-1] == '\n': 119 | txt_styled = txt_styled + style[0] + p.rstrip() + style[1] + '\r\n' 120 | else: 121 | txt_styled = txt_styled + style[0] + p + style[1] 122 | return txt_styled 123 | 124 | 125 | def make_sqlite(self): 126 | sqlite_file = self._mdx_file + '.sqlite.db' 127 | if os.path.exists(sqlite_file): 128 | os.remove(sqlite_file) 129 | mdx = MDX(self._mdx_file) 130 | conn = sqlite3.connect(sqlite_file) 131 | cursor = conn.cursor() 132 | cursor.execute( 133 | ''' CREATE TABLE MDX_DICT 134 | (key text not null, 135 | value text 136 | )''' 137 | ) 138 | 139 | # remove '(pīnyīn)', remove `1`: 140 | aeiou = 'āáǎàĀÁǍÀēéěèêềếĒÉĚÈÊỀẾīíǐìÍǏÌōóǒòŌÓǑÒūúǔùŪÚǓÙǖǘǚǜǕǗǙǛḾǹňŃŇ' 141 | pattern = r"`\d+`|[(\(]?['a-z%s]*[%s]['a-z%s]*[\))]?"%(aeiou, aeiou, aeiou) 142 | tuple_list = [(key.decode(), re.sub(pattern, '', value.decode())) 143 | for key, value in mdx.items()] 144 | 145 | cursor.executemany('INSERT INTO MDX_DICT VALUES (?,?)', tuple_list) 146 | 147 | returned_index = mdx.get_index(check_block = self._check) 148 | meta = returned_index['meta'] 149 | cursor.execute( 150 | '''CREATE TABLE META (key text, value text)''') 151 | 152 | cursor.executemany( 153 | 'INSERT INTO META VALUES (?,?)', 154 | [('encoding', meta['encoding']), 155 | ('stylesheet', meta['stylesheet']), 156 | ('title', meta['title']), 157 | ('description', meta['description']), 158 | ('version', version) 159 | ] 160 | ) 161 | 162 | if self._sql_index: 163 | cursor.execute( 164 | ''' 165 | CREATE INDEX key_index ON MDX_DICT (key) 166 | ''' 167 | ) 168 | conn.commit() 169 | conn.close() 170 | 171 | 172 | def _make_mdx_index(self, db_name): 173 | if os.path.exists(db_name): 174 | os.remove(db_name) 175 | mdx = MDX(self._mdx_file) 176 | self._mdx_db = db_name 
177 | returned_index = mdx.get_index(check_block = self._check) 178 | index_list = returned_index['index_dict_list'] 179 | conn = sqlite3.connect(db_name) 180 | c = conn.cursor() 181 | c.execute( 182 | ''' CREATE TABLE MDX_INDEX 183 | (key_text text not null, 184 | file_pos integer, 185 | compressed_size integer, 186 | decompressed_size integer, 187 | record_block_type integer, 188 | record_start integer, 189 | record_end integer, 190 | offset integer 191 | )''' 192 | ) 193 | 194 | tuple_list = [ 195 | (item['key_text'], 196 | item['file_pos'], 197 | item['compressed_size'], 198 | item['decompressed_size'], 199 | item['record_block_type'], 200 | item['record_start'], 201 | item['record_end'], 202 | item['offset'] 203 | ) 204 | for item in index_list 205 | ] 206 | c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)', 207 | tuple_list) 208 | # build the metadata table 209 | meta = returned_index['meta'] 210 | c.execute( 211 | '''CREATE TABLE META 212 | (key text, 213 | value text 214 | )''') 215 | 216 | #for k,v in meta: 217 | # c.execute( 218 | # 'INSERT INTO META VALUES (?,?)', 219 | # (k, v) 220 | # ) 221 | 222 | c.executemany( 223 | 'INSERT INTO META VALUES (?,?)', 224 | [('encoding', meta['encoding']), 225 | ('stylesheet', meta['stylesheet']), 226 | ('title', meta['title']), 227 | ('description', meta['description']), 228 | ('version', version) 229 | ] 230 | ) 231 | 232 | if self._sql_index: 233 | c.execute( 234 | ''' 235 | CREATE INDEX key_index ON MDX_INDEX (key_text) 236 | ''' 237 | ) 238 | 239 | conn.commit() 240 | conn.close() 241 | #set class member 242 | self._encoding = meta['encoding'] 243 | self._stylesheet = json.loads(meta['stylesheet']) 244 | self._title = meta['title'] 245 | self._description = meta['description'] 246 | 247 | 248 | def _make_mdd_index(self, db_name): 249 | if os.path.exists(db_name): 250 | os.remove(db_name) 251 | mdd = MDD(self._mdd_file) 252 | self._mdd_db = db_name 253 | index_list = mdd.get_index(check_block = self._check) 254 | conn = sqlite3.connect(db_name) 255 | c = conn.cursor() 256 | c.execute( 257 | ''' CREATE TABLE MDX_INDEX 258 | (key_text text not null unique, 259 | file_pos integer, 260 | compressed_size integer, 261 | decompressed_size integer, 262 | record_block_type integer, 263 | record_start integer, 264 | record_end integer, 265 | offset integer 266 | )''' 267 | ) 268 | 269 | tuple_list = [ 270 | (item['key_text'], 271 | item['file_pos'], 272 | item['compressed_size'], 273 | item['decompressed_size'], 274 | item['record_block_type'], 275 | item['record_start'], 276 | item['record_end'], 277 | item['offset'] 278 | ) 279 | for item in index_list 280 | ] 281 | c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)', 282 | tuple_list) 283 | if self._sql_index: 284 | c.execute( 285 | ''' 286 | CREATE UNIQUE INDEX key_index ON MDX_INDEX (key_text) 287 | ''' 288 | ) 289 | 290 | conn.commit() 291 | conn.close() 292 | 293 | @staticmethod 294 | def get_data_by_index(fmdx, index): 295 | fmdx.seek(index['file_pos']) 296 | record_block_compressed = fmdx.read(index['compressed_size']) 297 | record_block_type = record_block_compressed[:4] 298 | record_block_type = index['record_block_type'] 299 | decompressed_size = index['decompressed_size'] 300 | #adler32 = unpack('>I', record_block_compressed[4:8])[0] 301 | if record_block_type == 0: 302 | _record_block = record_block_compressed[8:] 303 | # lzo compression 304 | elif record_block_type == 1: 305 | if lzo is None: 306 | print("LZO compression is not supported") 307 | # decompress 
308 | header = b'\xf0' + pack('>I', index['decompressed_size']) 309 | _record_block = lzo.decompress(record_block_compressed[8:], initSize = decompressed_size, blockSize=1308672) 310 | # zlib compression 311 | elif record_block_type == 2: 312 | # decompress 313 | _record_block = zlib.decompress(record_block_compressed[8:]) 314 | data = _record_block[index['record_start'] - index['offset']:index['record_end'] - index['offset']] 315 | return data 316 | 317 | def get_mdx_by_index(self, fmdx, index): 318 | data = self.get_data_by_index(fmdx,index) 319 | record = data.decode(self._encoding, errors='ignore').strip(u'\x00').encode('utf-8') 320 | if self._stylesheet: 321 | record = self._replace_stylesheet(record) 322 | record = record.decode('utf-8') 323 | return record 324 | 325 | def get_mdd_by_index(self, fmdx, index): 326 | return self.get_data_by_index(fmdx,index) 327 | 328 | @staticmethod 329 | def lookup_indexes(db,keyword,ignorecase=None): 330 | indexes = [] 331 | if ignorecase: 332 | sql = 'SELECT * FROM MDX_INDEX WHERE lower(key_text) = lower("{}")'.format(keyword) 333 | else: 334 | sql = 'SELECT * FROM MDX_INDEX WHERE key_text = "{}"'.format(keyword) 335 | with sqlite3.connect(db) as conn: 336 | cursor = conn.execute(sql) 337 | for result in cursor: 338 | index = {} 339 | index['file_pos'] = result[1] 340 | index['compressed_size'] = result[2] 341 | index['decompressed_size'] = result[3] 342 | index['record_block_type'] = result[4] 343 | index['record_start'] = result[5] 344 | index['record_end'] = result[6] 345 | index['offset'] = result[7] 346 | indexes.append(index) 347 | return indexes 348 | 349 | def mdx_lookup(self, keyword,ignorecase=None): 350 | lookup_result_list = [] 351 | indexes = self.lookup_indexes(self._mdx_db,keyword,ignorecase) 352 | with open(self._mdx_file,'rb') as mdx_file: 353 | for index in indexes: 354 | lookup_result_list.append(self.get_mdx_by_index(mdx_file, index)) 355 | return lookup_result_list 356 | 357 | def mdd_lookup(self, keyword,ignorecase=None): 358 | lookup_result_list = [] 359 | indexes = self.lookup_indexes(self._mdd_db,keyword,ignorecase) 360 | with open(self._mdd_file,'rb') as mdd_file: 361 | for index in indexes: 362 | lookup_result_list.append(self.get_mdd_by_index(mdd_file, index)) 363 | return lookup_result_list 364 | 365 | @staticmethod 366 | def get_keys(db,query = ''): 367 | if not db: 368 | return [] 369 | if query: 370 | if '*' in query: 371 | query = query.replace('*','%') 372 | else: 373 | query = query + '%' 374 | sql = 'SELECT key_text FROM MDX_INDEX WHERE key_text LIKE \"' + query + '\"' 375 | else: 376 | sql = 'SELECT key_text FROM MDX_INDEX' 377 | with sqlite3.connect(db) as conn: 378 | cursor = conn.execute(sql) 379 | keys = [item[0] for item in cursor] 380 | return keys 381 | 382 | def get_mdd_keys(self, query = ''): 383 | return self.get_keys(self._mdd_db,query) 384 | 385 | def get_mdx_keys(self, query = ''): 386 | return self.get_keys(self._mdx_db,query) 387 | 388 | 389 | 390 | # mdx_builder = IndexBuilder("oald.mdx") 391 | # text = mdx_builder.mdx_lookup('dedication') 392 | # keys = mdx_builder.get_mdx_keys() 393 | # keys1 = mdx_builder.get_mdx_keys('abstrac') 394 | # keys2 = mdx_builder.get_mdx_keys('*tion') 395 | # for key in keys2: 396 | # text = mdx_builder.mdx_lookup(key)[0] 397 | # pass 398 | -------------------------------------------------------------------------------- /mdx/drop mdict files here.txt: -------------------------------------------------------------------------------- 1 | hihaaha 2 | 
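Complementing the commented-out example at the bottom of `mdict_query.py` above, here is a hedged end-to-end sketch of the lookup flow; the `mdx/oald.mdx` path is hypothetical, and the mdd key format follows the `'/style.css'` example from the README:

```python
from mdict_query import IndexBuilder

builder = IndexBuilder('mdx/oald.mdx')  # hypothetical dictionary file

# Headword lookup: returns a list of definition strings (one per match).
for record in builder.mdx_lookup('dedication', ignorecase=True):
    print(record[:80])

# Resource lookup: copy every stylesheet out of the companion .mdd
# into the static directory.
for key in builder.get_mdd_keys('*.css'):   # e.g. ['/style.css', ...]
    content = builder.mdd_lookup(key)[0]    # raw bytes of the stored file
    with open('static/' + key.split('/')[-1], 'wb') as out:
        out.write(content)
```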
--------------------------------------------------------------------------------
/pureSalsa20.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | """
5 | Copyright by https://github.com/zhansliu/writemdict
6 |
7 | pureSalsa20.py -- a pure Python implementation of the Salsa20 cipher, ported to Python 3
8 |
9 | v4.0: Added Python 3 support, dropped support for Python <= 2.5.
10 |
11 | // zhansliu
12 |
13 | Original comments below.
14 |
15 | ====================================================================
16 | There are comments here by two authors about three pieces of software:
17 |   comments by Larry Bugbee about
18 |     Salsa20, the stream cipher by Daniel J. Bernstein
19 |     (including comments about the speed of the C version) and
20 |     pySalsa20, Bugbee's own Python wrapper for salsa20.c
21 |     (including some references), and
22 |   comments by Steve Witham about
23 |     pureSalsa20, Witham's pure Python 2.5 implementation of Salsa20,
24 |     which follows pySalsa20's API, and is in this file.
25 |
26 | Salsa20: a Fast Streaming Cipher (comments by Larry Bugbee)
27 | -----------------------------------------------------------
28 |
29 | Salsa20 is a fast stream cipher written by Daniel Bernstein
30 | that basically uses a hash function and XOR making for fast
31 | encryption. (Decryption uses the same function.) Salsa20
32 | is simple and quick.
33 |
34 | Some Salsa20 parameter values...
35 |     design strength 128 bits
36 |     key length 128 or 256 bits, exactly
37 |     IV, aka nonce 64 bits, always
38 |     chunk size must be in multiples of 64 bytes
39 |
40 | Salsa20 has two reduced versions, 8 and 12 rounds each.
41 |
42 | One benchmark (10 MB):
43 |     1.5GHz PPC G4 102/97/89 MB/sec for 8/12/20 rounds
44 |     AMD Athlon 2500+ 77/67/53 MB/sec for 8/12/20 rounds
45 |     (no I/O and before Python GC kicks in)
46 |
47 | Salsa20 is a Phase 3 finalist in the EU eSTREAM competition
48 | and appears to be one of the fastest ciphers. It is well
49 | documented so I will not attempt any injustice here. Please
50 | see "References" below.
51 |
52 | ...and Salsa20 is "free for any use".
53 |
54 |
55 | pySalsa20: a Python wrapper for Salsa20 (Comments by Larry Bugbee)
56 | ------------------------------------------------------------------
57 |
58 | pySalsa20.py is a simple ctypes Python wrapper. Salsa20 is,
59 | as its name implies, 20 rounds, but there are two reduced
60 | versions, 8 and 12 rounds each. Because the APIs are
61 | identical, pySalsa20 is capable of wrapping all three
62 | versions (number of rounds hardcoded), including a special
63 | version that allows you to set the number of rounds with a
64 | set_rounds() function. Compile the version of your choice
65 | as a shared library (not as a Python extension), name and
66 | install it as libsalsa20.so.
67 |
68 | Sample usage:
69 |     from pySalsa20 import Salsa20
70 |     s20 = Salsa20(key, IV)
71 |     dataout = s20.encryptBytes(datain) # same for decrypt
72 |
73 | This is EXPERIMENTAL software and intended for educational
74 | purposes only. To make experimentation less cumbersome,
75 | pySalsa20 is also free for any use.
76 |
77 | THIS PROGRAM IS PROVIDED WITHOUT WARRANTY OR GUARANTEE OF
78 | ANY KIND. USE AT YOUR OWN RISK.
79 | 80 | Enjoy, 81 | 82 | Larry Bugbee 83 | bugbee@seanet.com 84 | April 2007 85 | 86 | 87 | References: 88 | ----------- 89 | http://en.wikipedia.org/wiki/Salsa20 90 | http://en.wikipedia.org/wiki/Daniel_Bernstein 91 | http://cr.yp.to/djb.html 92 | http://www.ecrypt.eu.org/stream/salsa20p3.html 93 | http://www.ecrypt.eu.org/stream/p3ciphers/salsa20/salsa20_p3source.zip 94 | 95 | 96 | Prerequisites for pySalsa20: 97 | ---------------------------- 98 | - Python 2.5 (haven't tested in 2.4) 99 | 100 | 101 | pureSalsa20: Salsa20 in pure Python 2.5 (comments by Steve Witham) 102 | ------------------------------------------------------------------ 103 | 104 | pureSalsa20 is the stand-alone Python code in this file. 105 | It implements the underlying Salsa20 core algorithm 106 | and emulates pySalsa20's Salsa20 class API (minus a bug(*)). 107 | 108 | pureSalsa20 is MUCH slower than libsalsa20.so wrapped with pySalsa20-- 109 | about 1/1000 the speed for Salsa20/20 and 1/500 the speed for Salsa20/8, 110 | when encrypting 64k-byte blocks on my computer. 111 | 112 | pureSalsa20 is for cases where portability is much more important than 113 | speed. I wrote it for use in a "structured" random number generator. 114 | 115 | There are comments about the reasons for this slowness in 116 | http://www.tiac.net/~sw/2010/02/PureSalsa20 117 | 118 | Sample usage: 119 | from pureSalsa20 import Salsa20 120 | s20 = Salsa20(key, IV) 121 | dataout = s20.encryptBytes(datain) # same for decrypt 122 | 123 | I took the test code from pySalsa20, added a bunch of tests including 124 | rough speed tests, and moved them into the file testSalsa20.py. 125 | To test both pySalsa20 and pureSalsa20, type 126 | python testSalsa20.py 127 | 128 | (*)The bug (?) in pySalsa20 is this. The rounds variable is global to the 129 | libsalsa20.so library and not switched when switching between instances 130 | of the Salsa20 class. 131 | s1 = Salsa20( key, IV, 20 ) 132 | s2 = Salsa20( key, IV, 8 ) 133 | In this example, 134 | with pySalsa20, both s1 and s2 will do 8 rounds of encryption. 135 | with pureSalsa20, s1 will do 20 rounds and s2 will do 8 rounds. 136 | Perhaps giving each instance its own nRounds variable, which 137 | is passed to the salsa20wordtobyte() function, is insecure. I'm not a 138 | cryptographer. 139 | 140 | pureSalsa20.py and testSalsa20.py are EXPERIMENTAL software and 141 | intended for educational purposes only. To make experimentation less 142 | cumbersome, pureSalsa20.py and testSalsa20.py are free for any use. 143 | 144 | Revisions: 145 | ---------- 146 | p3.2 Fixed bug that initialized the output buffer with plaintext! 147 | Saner ramping of nreps in speed test. 148 | Minor changes and print statements. 149 | p3.1 Took timing variability out of add32() and rot32(). 150 | Made the internals more like pySalsa20/libsalsa . 151 | Put the semicolons back in the main loop! 152 | In encryptBytes(), modify a byte array instead of appending. 153 | Fixed speed calculation bug. 154 | Used subclasses instead of patches in testSalsa20.py . 155 | Added 64k-byte messages to speed test to be fair to pySalsa20. 156 | p3 First version, intended to parallel pySalsa20 version 3. 
157 |
158 | More references:
159 | ----------------
160 | http://www.seanet.com/~bugbee/crypto/salsa20/ [pySalsa20]
161 | http://cr.yp.to/snuffle.html [The original name of Salsa20]
162 | http://cr.yp.to/snuffle/salsafamily-20071225.pdf [ Salsa20 design]
163 | http://www.tiac.net/~sw/2010/02/PureSalsa20
164 |
165 | THIS PROGRAM IS PROVIDED WITHOUT WARRANTY OR GUARANTEE OF
166 | ANY KIND. USE AT YOUR OWN RISK.
167 |
168 | Cheers,
169 |
170 | Steve Witham sw at remove-this tiac dot net
171 | February, 2010
172 | """
173 | import sys
174 | assert(sys.version_info >= (2, 6))
175 |
176 | if sys.version_info >= (3,):
177 |     integer_types = (int,)
178 |     python3 = True
179 | else:
180 |     integer_types = (int, long)
181 |     python3 = False
182 |
183 | from struct import Struct
184 | little_u64 = Struct( "<Q" )      #    little-endian     64-bit unsigned.
185 |                                  #    Unpacks to a tuple of one element!
186 | little16_i32 = Struct( "<16i" )  # 16 little-endian 32-bit signed ints.
187 | little4_i32 = Struct( "<4i" )    #  4 little-endian 32-bit signed ints.
188 | little2_i32 = Struct( "<2i" )    #  2 little-endian 32-bit signed ints.
189 |
190 | _version = 'p4.0'
191 |
192 | #--------------------------------------------------------------------------
193 | # Salsa20 class which emulates pySalsa20.Salsa20 .
194 | # It takes the same parameters and offers the same methods.
195 |
196 | class Salsa20(object):
197 |     def __init__(self, key=None, IV=None, rounds=20):
198 |         self._lastChunk64 = True
199 |         self._IVbitlen = 64             # must be 64 bits
200 |         self.ctx = [ 0 ] * 16
201 |         if key:
202 |             self.setKey(key)
203 |         if IV:
204 |             self.setIV(IV)
205 |         self.setRounds(rounds)
206 |
207 |     def setKey(self, key):
208 |         assert type(key) == bytes
209 |         ctx = self.ctx
210 |         if len(key) == 32:  # recommended
211 |             constants = b"expand 32-byte k"
212 |             ctx[ 1:5 ]   = little4_i32.unpack( key[ 0:16] )
213 |             ctx[ 11:15 ] = little4_i32.unpack( key[16:32] )
214 |         elif len(key) == 16:
215 |             constants = b"expand 16-byte k"
216 |             ctx[ 1:5 ]   = little4_i32.unpack( key[ 0:16] )
217 |             ctx[ 11:15 ] = little4_i32.unpack( key[ 0:16] )
218 |         else:
219 |             raise Exception( "key length isn't 32 or 16 bytes." )
220 |         ctx[ 0 ]  = little4_i32.unpack( constants[ 0: 4] )[0]
221 |         ctx[ 5 ]  = little4_i32.unpack( constants[ 4: 8] )[0]
222 |         ctx[ 10 ] = little4_i32.unpack( constants[ 8:12] )[0]
223 |         ctx[ 15 ] = little4_i32.unpack( constants[12:16] )[0]
224 |
225 |     def setIV(self, IV):
226 |         assert type(IV) == bytes
227 |         assert len(IV)*8 == 64, 'nonce (IV) not 64 bits'
228 |         self.IV = IV
229 |         ctx = self.ctx
230 |         ctx[ 6:8 ] = little2_i32.unpack( IV )
231 |         ctx[ 8:10 ] = [ 0, 0 ]  # Reset the block counter.
232 |
233 |     setNonce = setIV            # support an alternate name
234 |
235 |     def setCounter( self, counter ):
236 |         assert( type(counter) in integer_types )
237 |         assert( 0 <= counter < 1<<64 ), "counter < 0 or >= 2**64"
238 |         ctx = self.ctx
239 |         ctx[ 8],ctx[ 9] = little2_i32.unpack( little_u64.pack( counter ) )
240 |
241 |     def getCounter( self ):
242 |         return little_u64.unpack( little2_i32.pack( *self.ctx[ 8:10 ] ) ) [0]
243 |
244 |
245 |     def setRounds(self, rounds, testing=False ):
246 |         assert testing or rounds in [8, 12, 20], 'rounds must be 8, 12, 20'
247 |         self.rounds = rounds
248 |
249 |
250 |     def encryptBytes(self, data):
251 |         assert type(data) == bytes, 'data must be byte string'
252 |         assert self._lastChunk64, 'previous chunk not multiple of 64 bytes'
253 |         lendata = len(data)
254 |         munged = bytearray(lendata)
255 |         for i in range( 0, lendata, 64 ):
256 |             h = salsa20_wordtobyte( self.ctx, self.rounds, checkRounds=False )
257 |             self.setCounter( ( self.getCounter() + 1 ) % 2**64 )
258 |             # Stopping at 2^70 bytes per nonce is user's responsibility.
259 |             for j in range( min( 64, lendata - i ) ):
260 |                 if python3:
261 |                     munged[ i+j ] = data[ i+j ] ^ h[j]
262 |                 else:
263 |                     munged[ i+j ] = ord(data[ i+j ]) ^ ord(h[j])
264 |
265 |         self._lastChunk64 = not lendata % 64
266 |         return bytes(munged)
267 |
268 |     decryptBytes = encryptBytes # encrypt and decrypt use same function
269 |
270 | #--------------------------------------------------------------------------
271 |
272 | def salsa20_wordtobyte( input, nRounds=20, checkRounds=True ):
273 |     """ Do nRounds Salsa20 rounds on a copy of
274 |         input: list or tuple of 16 ints treated as little-endian unsigneds.
275 |     Returns a 64-byte string.
276 |     """
277 |
278 |     assert( type(input) in ( list, tuple ) and len(input) == 16 )
279 |     assert( not(checkRounds) or ( nRounds in [ 8, 12, 20 ] ) )
280 |
281 |     x = list( input )
282 |
283 |     def XOR( a, b ): return a ^ b
284 |     ROTATE = rot32
285 |     PLUS = add32
286 |
287 |     for i in range( nRounds // 2 ):
288 |         # These ...XOR...ROTATE...PLUS...
lines are from ecrypt-linux.c 289 | # unchanged except for indents and the blank line between rounds: 290 | x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 0],x[12]), 7)); 291 | x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[ 4],x[ 0]), 9)); 292 | x[12] = XOR(x[12],ROTATE(PLUS(x[ 8],x[ 4]),13)); 293 | x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[12],x[ 8]),18)); 294 | x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 5],x[ 1]), 7)); 295 | x[13] = XOR(x[13],ROTATE(PLUS(x[ 9],x[ 5]), 9)); 296 | x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[13],x[ 9]),13)); 297 | x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 1],x[13]),18)); 298 | x[14] = XOR(x[14],ROTATE(PLUS(x[10],x[ 6]), 7)); 299 | x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[14],x[10]), 9)); 300 | x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 2],x[14]),13)); 301 | x[10] = XOR(x[10],ROTATE(PLUS(x[ 6],x[ 2]),18)); 302 | x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[15],x[11]), 7)); 303 | x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 3],x[15]), 9)); 304 | x[11] = XOR(x[11],ROTATE(PLUS(x[ 7],x[ 3]),13)); 305 | x[15] = XOR(x[15],ROTATE(PLUS(x[11],x[ 7]),18)); 306 | 307 | x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[ 0],x[ 3]), 7)); 308 | x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[ 1],x[ 0]), 9)); 309 | x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[ 2],x[ 1]),13)); 310 | x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[ 3],x[ 2]),18)); 311 | x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 5],x[ 4]), 7)); 312 | x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 6],x[ 5]), 9)); 313 | x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 7],x[ 6]),13)); 314 | x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 4],x[ 7]),18)); 315 | x[11] = XOR(x[11],ROTATE(PLUS(x[10],x[ 9]), 7)); 316 | x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[11],x[10]), 9)); 317 | x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 8],x[11]),13)); 318 | x[10] = XOR(x[10],ROTATE(PLUS(x[ 9],x[ 8]),18)); 319 | x[12] = XOR(x[12],ROTATE(PLUS(x[15],x[14]), 7)); 320 | x[13] = XOR(x[13],ROTATE(PLUS(x[12],x[15]), 9)); 321 | x[14] = XOR(x[14],ROTATE(PLUS(x[13],x[12]),13)); 322 | x[15] = XOR(x[15],ROTATE(PLUS(x[14],x[13]),18)); 323 | 324 | for i in range( len( input ) ): 325 | x[i] = PLUS( x[i], input[i] ) 326 | return little16_i32.pack( *x ) 327 | 328 | #--------------------------- 32-bit ops ------------------------------- 329 | 330 | def trunc32( w ): 331 | """ Return the bottom 32 bits of w as a Python int. 332 | This creates longs temporarily, but returns an int. """ 333 | w = int( ( w & 0x7fffFFFF ) | -( w & 0x80000000 ) ) 334 | assert type(w) == int 335 | return w 336 | 337 | 338 | def add32( a, b ): 339 | """ Add two 32-bit words discarding carry above 32nd bit, 340 | and without creating a Python long. 341 | Timing shouldn't vary. 342 | """ 343 | lo = ( a & 0xFFFF ) + ( b & 0xFFFF ) 344 | hi = ( a >> 16 ) + ( b >> 16 ) + ( lo >> 16 ) 345 | return ( -(hi & 0x8000) | ( hi & 0x7FFF ) ) << 16 | ( lo & 0xFFFF ) 346 | 347 | 348 | def rot32( w, nLeft ): 349 | """ Rotate 32-bit word left by nLeft or right by -nLeft 350 | without creating a Python long. 351 | Timing depends on nLeft but not on w. 352 | """ 353 | nLeft &= 31 # which makes nLeft >= 0 354 | if nLeft == 0: 355 | return w 356 | 357 | # Note: now 1 <= nLeft <= 31. 358 | # RRRsLLLLLL There are nLeft RRR's, (31-nLeft) LLLLLL's, 359 | # => sLLLLLLRRR and one s which becomes the sign bit. 
360 |     RRR = ( ( ( w >> 1 ) & 0x7fffFFFF ) >> ( 31 - nLeft ) )
361 |     sLLLLLL = -( (1<<(31-nLeft)) & w ) | (0x7fffFFFF>>nLeft) & w
362 |     return RRR | ( sLLLLLL << nLeft )
363 |
364 |
365 | # --------------------------------- end -----------------------------------
366 |
--------------------------------------------------------------------------------
/readmdict.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # readmdict.py
4 | # Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser
5 | #
6 | # Copyright (C) 2012, 2013, 2015 Xiaoqiang Wang
7 | #
8 | # This program is a free software; you can redistribute it and/or modify
9 | # it under the terms of the GNU General Public License as published by
10 | # the Free Software Foundation, version 3 of the License.
11 | #
12 | # You can get a copy of GNU General Public License along this program
13 | # But you can always get it from http://www.gnu.org/licenses/gpl.txt
14 | #
15 | # This program is distributed in the hope that it will be useful,
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | # GNU General Public License for more details.
19 |
20 | from struct import pack, unpack
21 | from io import BytesIO
22 | import re
23 | import sys
24 | import json
25 |
26 | from .ripemd128 import ripemd128
27 | from .pureSalsa20 import Salsa20
28 |
29 | # zlib compression is used for engine version >=2.0
30 | import zlib
31 | # LZO compression is used for engine version < 2.0
32 | try:
33 |     import lzo
34 | except ImportError:
35 |     lzo = None
36 |     print("LZO compression support is not available")
37 |
38 | # 2x3 compatible
39 | if sys.hexversion >= 0x03000000:
40 |     unicode = str
41 |
42 |
43 | def _unescape_entities(text):
44 |     """
45 |     unescape offending tags &lt; &gt; &quot; &amp;
46 |     """
47 |     text = text.replace(b'&lt;', b'<')
48 |     text = text.replace(b'&gt;', b'>')
49 |     text = text.replace(b'&quot;', b'"')
50 |     text = text.replace(b'&amp;', b'&')
51 |     return text
52 |
53 |
54 | def _fast_decrypt(data, key):
55 |     b = bytearray(data)
56 |     key = bytearray(key)
57 |     previous = 0x36
58 |     for i in range(len(b)):
59 |         t = (b[i] >> 4 | b[i] << 4) & 0xff
60 |         t = t ^ previous ^ (i & 0xff) ^ key[i % len(key)]
61 |         previous = b[i]
62 |         b[i] = t
63 |     return bytes(b)
64 |
65 |
66 | def _mdx_decrypt(comp_block):
67 |     key = ripemd128(comp_block[4:8] + pack(b'<L', 0x3695))
68 |     return comp_block[0:8] + _fast_decrypt(comp_block[8:], key)
69 |
70 |
71 | def _salsa_decrypt(comp_block, key):
72 |     s20 = Salsa20(key=key, IV=b"\x00"*8, rounds=8)
73 |     return comp_block[0:8] + s20.encryptBytes(comp_block[8:])
74 |
75 |
76 | def _decrypt_regcode_by_deviceid(reg_code, deviceid):
77 |     deviceid_digest = ripemd128(deviceid)
78 |     s20 = Salsa20(key=deviceid_digest, IV=b"\x00"*8, rounds=8)
79 |     encrypted_key = s20.encryptBytes(reg_code)
80 |     return encrypted_key
81 |
82 |
83 | def _decrypt_regcode_by_email(reg_code, email):
84 |     email_digest = ripemd128(email.decode().encode('utf-16-le'))
85 |     s20 = Salsa20(key=email_digest, IV=b"\x00"*8, rounds=8)
86 |     encrypted_key = s20.encryptBytes(reg_code)
87 |     return encrypted_key
88 |
89 |
90 | class MDict(object):
91 |     """
92 |     Base class which reads in header and key block.
93 |     It has no public methods and serves only as code sharing base class.
94 |     """
95 |     def __init__(self, fname, encoding='', passcode=None):
96 |         self._fname = fname
97 |         self._encoding = encoding.upper()
98 |         self._passcode = passcode
99 |
100 |         self.header = self._read_header()
101 |         try:
102 |             self._key_list = self._read_keys()
103 |         except:
104 |             print("Try Brutal Force on Encrypted Key Blocks")
105 |             self._key_list = self._read_keys_brutal()
106 |
107 |     def __len__(self):
108 |         return self._num_entries
109 |
110 |     def __iter__(self):
111 |         return self.keys()
112 |
113 |     def keys(self):
114 |         """
115 |         Return an iterator over dictionary keys.
116 |         """
117 |         return (key_value for key_id, key_value in self._key_list)
118 |
119 |     def _read_number(self, f):
120 |         return unpack(self._number_format, f.read(self._number_width))[0]
121 |
122 |     def _parse_header(self, header):
123 |         """
124 |         extract attributes from <Dict attrs="..."/>
125 |         """
126 |         taglist = re.findall(b'(\w+)="(.*?)"', header, re.DOTALL)
127 |         tagdict = {}
128 |         for key, value in taglist:
129 |             tagdict[key] = _unescape_entities(value)
130 |         return tagdict
131 |
132 |     def _decode_key_block_info(self, key_block_info_compressed):
133 |         if self._version >= 2:
134 |             # zlib compression
135 |             assert(key_block_info_compressed[:4] == b'\x02\x00\x00\x00')
136 |             # decrypt if needed
137 |             if self._encrypt & 0x02:
138 |                 key_block_info_compressed = _mdx_decrypt(key_block_info_compressed)
139 |             # decompress
140 |             key_block_info = zlib.decompress(key_block_info_compressed[8:])
141 |             # adler checksum
142 |             adler32 = unpack('>I', key_block_info_compressed[4:8])[0]
143 |             assert(adler32 == zlib.adler32(key_block_info) & 0xffffffff)
144 |         else:
145 |             # no compression
146 |             key_block_info = key_block_info_compressed
147 |         # decode
148 |         key_block_info_list = []
149 |         num_entries = 0
150 |         i = 0
151 |         if self._version >= 2:
152 |             byte_format = '>H'
153 |             byte_width = 2
154 |             text_term = 1
155 |         else:
156 |             byte_format = '>B'
157
| byte_width = 1 158 | text_term = 0 159 | 160 | while i < len(key_block_info): 161 | # number of entries in current key block 162 | num_entries += unpack(self._number_format, key_block_info[i:i + self._number_width])[0] 163 | i += self._number_width 164 | # text head size 165 | text_head_size = unpack(byte_format, key_block_info[i:i + byte_width])[0] 166 | i += byte_width 167 | # text head 168 | if self._encoding != 'UTF-16': 169 | i += text_head_size + text_term 170 | else: 171 | i += (text_head_size + text_term) * 2 172 | # text tail size 173 | text_tail_size = unpack(byte_format, key_block_info[i:i + byte_width])[0] 174 | i += byte_width 175 | # text tail 176 | if self._encoding != 'UTF-16': 177 | i += text_tail_size + text_term 178 | else: 179 | i += (text_tail_size + text_term) * 2 180 | # key block compressed size 181 | key_block_compressed_size = unpack(self._number_format, key_block_info[i:i + self._number_width])[0] 182 | i += self._number_width 183 | # key block decompressed size 184 | key_block_decompressed_size = unpack(self._number_format, key_block_info[i:i + self._number_width])[0] 185 | i += self._number_width 186 | key_block_info_list += [(key_block_compressed_size, key_block_decompressed_size)] 187 | 188 | assert(num_entries == self._num_entries) 189 | 190 | return key_block_info_list 191 | 192 | def _decode_key_block(self, key_block_compressed, key_block_info_list): 193 | key_list = [] 194 | i = 0 195 | for compressed_size, decompressed_size in key_block_info_list: 196 | start = i 197 | end = i + compressed_size 198 | # 4 bytes : compression type 199 | key_block_type = key_block_compressed[start:start + 4] 200 | # 4 bytes : adler checksum of decompressed key block 201 | adler32 = unpack('>I', key_block_compressed[start + 4:start + 8])[0] 202 | if key_block_type == b'\x00\x00\x00\x00': 203 | key_block = key_block_compressed[start + 8:end] 204 | elif key_block_type == b'\x01\x00\x00\x00': 205 | if lzo is None: 206 | print("LZO compression is not supported") 207 | break 208 | # decompress key block 209 | header = b'\xf0' + pack('>I', decompressed_size) 210 | key_block = lzo.decompress(key_block_compressed[start + 8:end], initSize = decompressed_size, blockSize=1308672) 211 | elif key_block_type == b'\x02\x00\x00\x00': 212 | # decompress key block 213 | key_block = zlib.decompress(key_block_compressed[start + 8:end]) 214 | # extract one single key block into a key list 215 | key_list += self._split_key_block(key_block) 216 | # notice that adler32 returns signed value 217 | assert(adler32 == zlib.adler32(key_block) & 0xffffffff) 218 | 219 | i += compressed_size 220 | return key_list 221 | 222 | def _split_key_block(self, key_block): 223 | key_list = [] 224 | key_start_index = 0 225 | while key_start_index < len(key_block): 226 | temp = key_block[key_start_index:key_start_index + self._number_width] 227 | # the corresponding record's offset in record block 228 | key_id = unpack(self._number_format, key_block[key_start_index:key_start_index + self._number_width])[0] 229 | # key text ends with '\x00' 230 | if self._encoding == 'UTF-16': 231 | delimiter = b'\x00\x00' 232 | width = 2 233 | else: 234 | delimiter = b'\x00' 235 | width = 1 236 | i = key_start_index + self._number_width 237 | while i < len(key_block): 238 | if key_block[i:i + width] == delimiter: 239 | key_end_index = i 240 | break 241 | i += width 242 | key_text = key_block[key_start_index + self._number_width:key_end_index]\ 243 | .decode(self._encoding, errors='ignore').encode('utf-8').strip() 244 | 
key_start_index = key_end_index + width
245 |             key_list += [(key_id, key_text)]
246 |         return key_list
247 |
248 |     def _read_header(self):
249 |         f = open(self._fname, 'rb')
250 |         # number of bytes of header text
251 |         header_bytes_size = unpack('>I', f.read(4))[0]
252 |         header_bytes = f.read(header_bytes_size)
253 |         # 4 bytes: adler32 checksum of header, in little endian
254 |         adler32 = unpack('<I', f.read(4))[0]
255 |         assert adler32 == zlib.adler32(header_bytes) & 0xffffffff
256 |
257 |         # mark down key block offset
258 |         self._key_block_offset = f.tell()
259 |         f.close()
260 |         # header text in utf-16 encoding ending with '\x00\x00'
261 |         header_text = header_bytes[:-4].decode('utf-16').encode('utf-8')
262 |         header_tag = self._parse_header(header_text)
263 |         if not self._encoding:
264 |             encoding = header_tag[b'Encoding']
265 |             if sys.hexversion >= 0x03000000:
266 |                 encoding = encoding.decode('utf-8')
267 |             # GB18030 > GBK > GB2312
268 |             if encoding in ['GBK', 'GB2312']:
269 |                 encoding = 'GB18030'
270 |             self._encoding = encoding
271 |         # read the title and description
272 |         if b'Title' in header_tag:
273 |             self._title = header_tag[b'Title'].decode('utf-8')
274 |         else:
275 |             self._title = ''
276 |
277 |         if b'Description' in header_tag:
278 |             self._description = header_tag[b'Description'].decode('utf-8')
279 |         else:
280 |             self._description = ''
281 |         pass
282 |         # encryption flag
283 |         #   0x00 - no encryption
284 |         #   0x01 - encrypt record block
285 |         #   0x02 - encrypt key info block
286 |         if b'Encrypted' not in header_tag or header_tag[b'Encrypted'] == b'No':
287 |             self._encrypt = 0
288 |         elif header_tag[b'Encrypted'] == b'Yes':
289 |             self._encrypt = 1
290 |         else:
291 |             self._encrypt = int(header_tag[b'Encrypted'])
292 |
293 |         # stylesheet attribute if present takes form of:
294 |         #   style_number # 1-255
295 |         #   style_begin  # or ''
296 |         #   style_end    # or ''
297 |         # store stylesheet in dict in the form of
298 |         # {'number' : ('style_begin', 'style_end')}
299 |         self._stylesheet = {}
300 |         if header_tag.get(b'StyleSheet'):
301 |             lines = header_tag[b'StyleSheet'].decode('utf-8').splitlines()  # header_tag keys and values are bytes
302 |             for i in range(0, len(lines), 3):
303 |                 self._stylesheet[lines[i]] = (lines[i + 1], lines[i + 2])
304 |
305 |         # before version 2.0, number is 4 bytes integer
306 |         # version 2.0 and above uses 8 bytes
307 |         self._version = float(header_tag[b'GeneratedByEngineVersion'])
308 |         if self._version < 2.0:
309 |             self._number_width = 4
310 |             self._number_format = '>I'
311 |         else:
312 |             self._number_width = 8
313 |             self._number_format = '>Q'
314 |
315 |         return header_tag
316 |
317 |     def _read_keys(self):
318 |         f = open(self._fname, 'rb')
319 |         f.seek(self._key_block_offset)
320 |
321 |         # the following numbers could be encrypted
322 |         if self._version >= 2.0:
323 |             num_bytes = 8 * 5
324 |         else:
325 |             num_bytes = 4 * 4
326 |         block = f.read(num_bytes)
327 |
328 |         if self._encrypt & 1:
329 |             if self._passcode is None:
330 |                 raise RuntimeError('user identification is needed to read encrypted file')
331 |             regcode, userid = self._passcode
332 |             if isinstance(userid, unicode):
333 |                 userid = userid.encode('utf8')
334 |             if self.header[b'RegisterBy'] == b'EMail':
335 |                 encrypted_key = _decrypt_regcode_by_email(regcode, userid)
336 |             else:
337 |                 encrypted_key = _decrypt_regcode_by_deviceid(regcode, userid)
338 |             block = _salsa_decrypt(block, encrypted_key)
339 |
340 |         # decode this block
341 |         sf = BytesIO(block)
342 |         # number of key blocks
343 |         num_key_blocks = self._read_number(sf)
344 |         # number of entries
345 |         self._num_entries = self._read_number(sf)
346 |         # number of bytes of key block info after decompression
347 |         if self._version >= 2.0:
348 |             key_block_info_decomp_size = self._read_number(sf)
349 |         # number of bytes of key block info
350 |         key_block_info_size = self._read_number(sf)
351 |         # number of bytes of key block
352 |         key_block_size = self._read_number(sf)
353 |
354 |         # 4 bytes: adler checksum of previous 5 numbers
355 |         if self._version >= 2.0:
356 |             adler32 = unpack('>I', f.read(4))[0]
357 |             assert
adler32 == (zlib.adler32(block) & 0xffffffff) 358 | 359 | # read key block info, which indicates key block's compressed and 360 | # decompressed size 361 | key_block_info = f.read(key_block_info_size) 362 | key_block_info_list = self._decode_key_block_info(key_block_info) 363 | assert(num_key_blocks == len(key_block_info_list)) 364 | 365 | # read key block 366 | key_block_compressed = f.read(key_block_size) 367 | # extract key block 368 | key_list = self._decode_key_block(key_block_compressed, key_block_info_list) 369 | 370 | self._record_block_offset = f.tell() 371 | f.close() 372 | 373 | return key_list 374 | 375 | def _read_keys_brutal(self): 376 | f = open(self._fname, 'rb') 377 | f.seek(self._key_block_offset) 378 | 379 | # the following numbers could be encrypted, disregard them! 380 | if self._version >= 2.0: 381 | num_bytes = 8 * 5 + 4 382 | key_block_type = b'\x02\x00\x00\x00' 383 | else: 384 | num_bytes = 4 * 4 385 | key_block_type = b'\x01\x00\x00\x00' 386 | block = f.read(num_bytes) 387 | 388 | # key block info 389 | # 4 bytes '\x02\x00\x00\x00' 390 | # 4 bytes adler32 checksum 391 | # unknown number of bytes follows until '\x02\x00\x00\x00' which marks 392 | # the beginning of key block 393 | key_block_info = f.read(8) 394 | if self._version >= 2.0: 395 | assert key_block_info[:4] == b'\x02\x00\x00\x00' 396 | while True: 397 | fpos = f.tell() 398 | t = f.read(1024) 399 | index = t.find(key_block_type) 400 | if index != -1: 401 | key_block_info += t[:index] 402 | f.seek(fpos + index) 403 | break 404 | else: 405 | key_block_info += t 406 | 407 | key_block_info_list = self._decode_key_block_info(key_block_info) 408 | key_block_size = sum(list(zip(*key_block_info_list))[0]) 409 | 410 | # read key block 411 | key_block_compressed = f.read(key_block_size) 412 | # extract key block 413 | key_list = self._decode_key_block(key_block_compressed, key_block_info_list) 414 | 415 | self._record_block_offset = f.tell() 416 | f.close() 417 | 418 | self._num_entries = len(key_list) 419 | return key_list 420 | 421 | 422 | class MDD(MDict): 423 | """ 424 | MDict resource file format (*.MDD) reader. 425 | >>> mdd = MDD('example.mdd') 426 | >>> len(mdd) 427 | 208 428 | >>> for filename,content in mdd.items(): 429 | ... 
print filename, content[:10]
430 |     """
431 |     def __init__(self, fname, passcode=None):
432 |         MDict.__init__(self, fname, encoding='UTF-16', passcode=passcode)
433 |
434 |     def items(self):
435 |         """Return a generator which in turn produce tuples in the form of (filename, content)
436 |         """
437 |         return self._decode_record_block()
438 |
439 |     def _decode_record_block(self):
440 |         f = open(self._fname, 'rb')
441 |         f.seek(self._record_block_offset)
442 |
443 |         num_record_blocks = self._read_number(f)
444 |         num_entries = self._read_number(f)
445 |         assert(num_entries == self._num_entries)
446 |         record_block_info_size = self._read_number(f)
447 |         record_block_size = self._read_number(f)
448 |
449 |         # record block info section
450 |         record_block_info_list = []
451 |         size_counter = 0
452 |         for i in range(num_record_blocks):
453 |             compressed_size = self._read_number(f)
454 |             decompressed_size = self._read_number(f)
455 |             record_block_info_list += [(compressed_size, decompressed_size)]
456 |             size_counter += self._number_width * 2
457 |         assert(size_counter == record_block_info_size)
458 |
459 |         # actual record block
460 |         offset = 0
461 |         i = 0
462 |         size_counter = 0
463 |         for compressed_size, decompressed_size in record_block_info_list:
464 |             record_block_compressed = f.read(compressed_size)
465 |             # 4 bytes: compression type
466 |             record_block_type = record_block_compressed[:4]
467 |             # 4 bytes: adler32 checksum of decompressed record block
468 |             adler32 = unpack('>I', record_block_compressed[4:8])[0]
469 |             if record_block_type == b'\x00\x00\x00\x00':
470 |                 record_block = record_block_compressed[8:]
471 |             elif record_block_type == b'\x01\x00\x00\x00':
472 |                 if lzo is None:
473 |                     print("LZO compression is not supported")
474 |                     break
475 |                 # decompress
476 |                 header = b'\xf0' + pack('>I', decompressed_size)
477 |                 record_block = lzo.decompress(record_block_compressed[8:], initSize = decompressed_size, blockSize=1308672)
478 |             elif record_block_type == b'\x02\x00\x00\x00':
479 |                 # decompress
480 |                 record_block = zlib.decompress(record_block_compressed[8:])
481 |
482 |             # notice that adler32 return signed value
483 |             assert(adler32 == zlib.adler32(record_block) & 0xffffffff)
484 |
485 |             assert(len(record_block) == decompressed_size)
486 |             # split record block according to the offset info from key block
487 |             while i < len(self._key_list):
488 |                 record_start, key_text = self._key_list[i]
489 |                 # reach the end of current record block
490 |                 if record_start - offset >= len(record_block):
491 |                     break
492 |                 # record end index
493 |                 if i < len(self._key_list) - 1:
494 |                     record_end = self._key_list[i + 1][0]
495 |                 else:
496 |                     record_end = len(record_block) + offset
497 |                 i += 1
498 |                 data = record_block[record_start - offset:record_end - offset]
499 |                 yield key_text, data
500 |             offset += len(record_block)
501 |             size_counter += compressed_size
502 |         assert(size_counter == record_block_size)
503 |
504 |         f.close()
505 |
506 |     ### Get the index list of the mdx file; the format is
507 |     ### key_text (the keyword; can be obtained from the key list below)
508 |     ### file_pos (position where the record_block starts)
509 |     ### compressed_size (compressed size of the record_block)
510 |     ### decompressed_size (size after decompression)
511 |     ### record_block_type (compression type of the record_block)
512 |     ### record_start (this and the next two are the parameters needed to extract a single record from the record_block; they can be stored directly)
513 |     ### record_end
514 |     ### offset
515 |     def get_index(self, check_block = True):
516 |         f = open(self._fname, 'rb')
517 |         index_dict_list = []
518 |         f.seek(self._record_block_offset)
519 |
520 |         num_record_blocks = self._read_number(f)
521 |         num_entries = self._read_number(f)
522 |
    def get_index(self, check_block=True):
        f = open(self._fname, 'rb')
        index_dict_list = []
        f.seek(self._record_block_offset)

        num_record_blocks = self._read_number(f)
        num_entries = self._read_number(f)
        assert(num_entries == self._num_entries)
        record_block_info_size = self._read_number(f)
        record_block_size = self._read_number(f)

        # record block info section
        record_block_info_list = []
        size_counter = 0
        for i in range(num_record_blocks):
            compressed_size = self._read_number(f)
            decompressed_size = self._read_number(f)
            record_block_info_list += [(compressed_size, decompressed_size)]
            size_counter += self._number_width * 2
        # TODO: caution!!!
        assert(size_counter == record_block_info_size)

        # actual record block
        offset = 0
        i = 0
        size_counter = 0
        for compressed_size, decompressed_size in record_block_info_list:
            current_pos = f.tell()
            record_block_compressed = f.read(compressed_size)
            # 4 bytes: compression type
            record_block_type = record_block_compressed[:4]
            # 4 bytes: adler32 checksum of the decompressed record block
            adler32 = unpack('>I', record_block_compressed[4:8])[0]
            if record_block_type == b'\x00\x00\x00\x00':
                _type = 0
                if check_block:
                    record_block = record_block_compressed[8:]
            elif record_block_type == b'\x01\x00\x00\x00':
                _type = 1
                if lzo is None:
                    print("LZO compression is not supported")
                    break
                # decompress (the first 8 bytes are the type marker and checksum)
                if check_block:
                    record_block = lzo.decompress(record_block_compressed[8:], initSize=decompressed_size, blockSize=1308672)
            elif record_block_type == b'\x02\x00\x00\x00':
                # decompress
                _type = 2
                if check_block:
                    record_block = zlib.decompress(record_block_compressed[8:])

            # notice that adler32 returns a signed value
            if check_block:
                assert(adler32 == zlib.adler32(record_block) & 0xffffffff)
                assert(len(record_block) == decompressed_size)
            # split record block according to the offset info from the key block
            while i < len(self._key_list):
                ### empty dict used to hold the index info for this record
                index_dict = {}
                index_dict['file_pos'] = current_pos
                index_dict['compressed_size'] = compressed_size
                index_dict['decompressed_size'] = decompressed_size
                index_dict['record_block_type'] = _type
                record_start, key_text = self._key_list[i]
                index_dict['record_start'] = record_start
                index_dict['key_text'] = key_text.decode("utf-8")
                index_dict['offset'] = offset
                # reached the end of the current record block
                if record_start - offset >= decompressed_size:
                    break
                # record end index
                if i < len(self._key_list) - 1:
                    record_end = self._key_list[i + 1][0]
                else:
                    record_end = decompressed_size + offset
                index_dict['record_end'] = record_end
                i += 1
                if check_block:
                    data = record_block[record_start - offset:record_end - offset]
                index_dict_list.append(index_dict)
                #yield key_text, data
            offset += decompressed_size
            size_counter += compressed_size
        assert(size_counter == record_block_size)
        f.close()
        return index_dict_list

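### A minimal sketch of how a stored index entry can be used later to fetch one
### record without re-scanning the whole file (names are illustrative; `d` is
### one dict produced by get_index above, and the example assumes a
### zlib-compressed block, i.e. record_block_type == 2):
###
###     with open(fname, 'rb') as f:
###         f.seek(d['file_pos'])
###         block = f.read(d['compressed_size'])
###         record_block = zlib.decompress(block[8:])
###         data = record_block[d['record_start'] - d['offset']:
###                             d['record_end'] - d['offset']]
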
class MDX(MDict):
    """
    MDict dictionary file format (*.MDX) reader.
    >>> mdx = MDX('example.mdx')
    >>> len(mdx)
    42481
    >>> for key, value in mdx.items():
    ...     print(key, value[:10])
    """
    def __init__(self, fname, encoding='', substyle=False, passcode=None):
        MDict.__init__(self, fname, encoding, passcode)
        self._substyle = substyle

    def items(self):
        """Return a generator which in turn produces tuples in the form of (key, value)
        """
        return self._decode_record_block()

    def _substitute_stylesheet(self, txt):
        # substitute stylesheet definitions: a tag like `1` is replaced by the
        # begin/end pair stored under key '1' in self._stylesheet
        # (e.g. with stylesheet {'1': ('<b>', '</b>')}, '`1`word' becomes '<b>word</b>')
        txt_list = re.split(r'`\d+`', txt)
        txt_tag = re.findall(r'`\d+`', txt)
        txt_styled = txt_list[0]
        for j, p in enumerate(txt_list[1:]):
            style = self._stylesheet[txt_tag[j][1:-1]]
            if p and p[-1] == '\n':
                txt_styled = txt_styled + style[0] + p.rstrip() + style[1] + '\r\n'
            else:
                txt_styled = txt_styled + style[0] + p + style[1]
        return txt_styled

    def _decode_record_block(self):
        f = open(self._fname, 'rb')
        f.seek(self._record_block_offset)

        num_record_blocks = self._read_number(f)
        num_entries = self._read_number(f)
        assert(num_entries == self._num_entries)
        record_block_info_size = self._read_number(f)
        record_block_size = self._read_number(f)

        # record block info section
        record_block_info_list = []
        size_counter = 0
        for i in range(num_record_blocks):
            compressed_size = self._read_number(f)
            decompressed_size = self._read_number(f)
            record_block_info_list += [(compressed_size, decompressed_size)]
            size_counter += self._number_width * 2
        assert(size_counter == record_block_info_size)

        # actual record block data
        offset = 0
        i = 0
        size_counter = 0
        ### the final index table has the format
        ### key_text           (the headword; also available from the key list)
        ### file_pos           (position where the record block starts)
        ### compressed_size    (size of the record block before decompression)
        ### decompressed_size  (size after decompression)
        ### record_block_type  (compression type of the record block)
        ### record_start       (these three are the parameters needed to extract a
        ### record_end          single record from the record block, and can be
        ### offset              stored directly)
        for compressed_size, decompressed_size in record_block_info_list:
            record_block_compressed = f.read(compressed_size)
            ###### to get record_block_compressed we need compressed_size (which can
            ###### be recorded directly) plus the current position of the file
            ###### object f, obtained via f.tell(); f.seek() is then used when
            ###### reading back through the index
            # 4 bytes indicate the block compression type
            record_block_type = record_block_compressed[:4]
            # 4 bytes: adler checksum of the uncompressed content
            adler32 = unpack('>I', record_block_compressed[4:8])[0]
            # no compression
            if record_block_type == b'\x00\x00\x00\x00':
                record_block = record_block_compressed[8:]
            # lzo compression
            elif record_block_type == b'\x01\x00\x00\x00':
                if lzo is None:
                    print("LZO compression is not supported")
                    break
                # decompress
                record_block = lzo.decompress(record_block_compressed[8:], initSize=decompressed_size, blockSize=1308672)
            # zlib compression
            elif record_block_type == b'\x02\x00\x00\x00':
                # decompress
                record_block = zlib.decompress(record_block_compressed[8:])
            ###### the important part here is to obtain record_block, which comes
            ###### from decompression; there are three decompression methods in
            ###### total. The information needed is record_block_compressed,
            ###### decompressed_size and record_block_type, plus adler32 for
            ###### verification.
            # notice that adler32 returns a signed value
            assert(adler32 == zlib.adler32(record_block) & 0xffffffff)

            assert(len(record_block) == decompressed_size)
            # split record block according to the offset info from the key block
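            # For illustration (numbers are hypothetical): if the key list holds
            # record offsets [0, 300, 850] and this decompressed block starts at
            # global offset `offset`, then the second entry is the slice
            # record_block[300 - offset:850 - offset].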
            while i < len(self._key_list):
                record_start, key_text = self._key_list[i]
                # reached the end of the current record block
                if record_start - offset >= len(record_block):
                    break
                # record end index
                if i < len(self._key_list) - 1:
                    record_end = self._key_list[i + 1][0]
                else:
                    record_end = len(record_block) + offset
                i += 1
                ############# record_block, record_start, record_end and offset
                ############# are what is needed here
                record = record_block[record_start - offset:record_end - offset]
                # decode to text (substitution must happen on str, not bytes)
                record = record.decode(self._encoding, errors='ignore').strip(u'\x00')
                # substitute styles
                ############# whether to substitute the stylesheet
                if self._substyle and self._stylesheet:
                    record = self._substitute_stylesheet(record)

                # convert to utf-8 on the way out
                yield key_text, record.encode('utf-8')
            offset += len(record_block)
            size_counter += compressed_size
        assert(size_counter == record_block_size)

        f.close()

    ### Build the index list for the mdx file; each entry records
    ### key_text           (the headword; also available from the key list)
    ### file_pos           (position where the record block starts)
    ### compressed_size    (size of the record block before decompression)
    ### decompressed_size  (size after decompression)
    ### record_block_type  (compression type of the record block)
    ### record_start       (these three are the parameters needed to extract a
    ### record_end          single record from the record block, and can be
    ### offset              stored directly)
    ### plus the required metadata
    ###
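    ### For illustration only -- the return value might look like this
    ### (all values are hypothetical, not taken from a real dictionary):
    ###   {'index_dict_list': [{'key_text': 'apple', 'file_pos': 1024, ...}, ...],
    ###    'meta': {'encoding': 'UTF-8', 'stylesheet': '{}',
    ###             'title': '...', 'description': '...'}}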
    def get_index(self, check_block=True):
        ### the index list
        index_dict_list = []
        f = open(self._fname, 'rb')
        f.seek(self._record_block_offset)

        num_record_blocks = self._read_number(f)
        num_entries = self._read_number(f)
        assert(num_entries == self._num_entries)
        record_block_info_size = self._read_number(f)
        record_block_size = self._read_number(f)

        # record block info section
        record_block_info_list = []
        size_counter = 0
        for i in range(num_record_blocks):
            compressed_size = self._read_number(f)
            decompressed_size = self._read_number(f)
            record_block_info_list += [(compressed_size, decompressed_size)]
            size_counter += self._number_width * 2
        assert(size_counter == record_block_info_size)

        # actual record block data
        offset = 0
        i = 0
        size_counter = 0
        for compressed_size, decompressed_size in record_block_info_list:
            current_pos = f.tell()
            record_block_compressed = f.read(compressed_size)
            ###### to get record_block_compressed we need compressed_size (which can
            ###### be recorded directly) plus the current position of the file
            ###### object f, obtained via f.tell(); f.seek() is then used when
            ###### reading back through the index
            # 4 bytes indicate the block compression type
            record_block_type = record_block_compressed[:4]
            # 4 bytes: adler checksum of the uncompressed content
            adler32 = unpack('>I', record_block_compressed[4:8])[0]
            # no compression
            if record_block_type == b'\x00\x00\x00\x00':
                _type = 0
                record_block = record_block_compressed[8:]
            # lzo compression
            elif record_block_type == b'\x01\x00\x00\x00':
                _type = 1
                if lzo is None:
                    print("LZO compression is not supported")
                    break
                # decompress
                if check_block:
                    record_block = lzo.decompress(record_block_compressed[8:], initSize=decompressed_size, blockSize=1308672)
            # zlib compression
            elif record_block_type == b'\x02\x00\x00\x00':
                # decompress
                _type = 2
                if check_block:
                    record_block = zlib.decompress(record_block_compressed[8:])
            ###### the important part here is to obtain record_block, which comes
            ###### from decompression; there are three decompression methods in
            ###### total. The information needed is record_block_compressed,
            ###### decompressed_size and record_block_type, plus adler32 for
            ###### verification.
            # notice that adler32 returns a signed value
            if check_block:
                assert(adler32 == zlib.adler32(record_block) & 0xffffffff)
                assert(len(record_block) == decompressed_size)
            # split record block according to the offset info from the key block
            while i < len(self._key_list):
                ### empty dict used to hold the index info for this record
                index_dict = {}
                index_dict['file_pos'] = current_pos
                index_dict['compressed_size'] = compressed_size
                index_dict['decompressed_size'] = decompressed_size
                index_dict['record_block_type'] = _type
                record_start, key_text = self._key_list[i]
                index_dict['record_start'] = record_start
                index_dict['key_text'] = key_text.decode('utf-8')
                index_dict['offset'] = offset
                # reached the end of the current record block
                if record_start - offset >= decompressed_size:
                    break
                # record end index
                if i < len(self._key_list) - 1:
                    record_end = self._key_list[i + 1][0]
                else:
                    record_end = decompressed_size + offset
                index_dict['record_end'] = record_end
                i += 1
                ############# record_block, record_start, record_end and offset
                ############# are what is needed here
                if check_block:
                    record = record_block[record_start - offset:record_end - offset]
                    # decode to text (substitution must happen on str, not bytes)
                    record = record.decode(self._encoding, errors='ignore').strip(u'\x00')
                    # substitute styles
                    ############# whether to substitute the stylesheet
                    if self._substyle and self._stylesheet:
                        record = self._substitute_stylesheet(record)
                index_dict_list.append(index_dict)

            offset += decompressed_size
            size_counter += compressed_size
        # TODO: caution!!!
        #assert(size_counter == record_block_size)
        f.close()
        # slightly different from the MDD part here: the encoding and the
        # stylesheet information also need to be passed along
        meta = {}
        meta['encoding'] = self._encoding
        meta['stylesheet'] = json.dumps(self._stylesheet)
        meta['title'] = self._title
        meta['description'] = self._description

        return {"index_dict_list": index_dict_list, 'meta': meta}


if __name__ == '__main__':
    import sys
    import os
    import os.path
    import argparse
    import codecs

    def passcode(s):
        try:
            regcode, userid = s.split(',')
        except:
            raise argparse.ArgumentTypeError("Passcode must be regcode,userid")
        try:
            regcode = codecs.decode(regcode, 'hex')
        except:
            raise argparse.ArgumentTypeError("regcode must be a 32 bytes hexadecimal string")
        return regcode, userid

    parser = argparse.ArgumentParser()
    parser.add_argument('-x', '--extract', action="store_true",
                        help='extract mdx to source format and extract files from mdd')
    parser.add_argument('-s', '--substyle', action="store_true",
                        help='substitute style definition if present')
    parser.add_argument('-d', '--datafolder', default="data",
                        help='folder to extract data files from mdd')
    parser.add_argument('-e', '--encoding', default="",
                        help='override the encoding specified in the dictionary header')
    parser.add_argument('-p', '--passcode', default=None, type=passcode,
                        help='register_code,email_or_deviceid')
    parser.add_argument("filename", nargs='?', help="mdx file name")
    args = parser.parse_args()

    # map the Python 2 unicode type onto str so the checks below also work on Python 3
    try:
        unicode
    except NameError:
        unicode = str

    # use GUI to select file, default to extract
    if not args.filename:
        try:
            import Tkinter
            import tkFileDialog
        except ImportError:  # the Tk modules were renamed in Python 3
            import tkinter as Tkinter
            import tkinter.filedialog as tkFileDialog
        root = Tkinter.Tk()
        root.withdraw()
        args.filename = tkFileDialog.askopenfilename(parent=root)
        args.extract = True

    if not os.path.exists(args.filename):
        print("Please specify a valid MDX/MDD file")
        sys.exit(1)

    base, ext = os.path.splitext(args.filename)

    # read mdx file
    if ext.lower() == os.path.extsep + 'mdx':
        mdx = MDX(args.filename, args.encoding, args.substyle, args.passcode)
        if type(args.filename) is unicode:
            bfname = args.filename.encode('utf-8')
        else:
            bfname = args.filename
        print('======== %s ========' % bfname)
        print('  Number of Entries : %d' % len(mdx))
        for key, value in mdx.header.items():
            print('  %s : %s' % (key, value))
    else:
        mdx = None

    # find companion mdd file
    mdd_filename = ''.join([base, os.path.extsep, 'mdd'])
    if os.path.exists(mdd_filename):
        mdd = MDD(mdd_filename, args.passcode)
        if type(mdd_filename) is unicode:
            bfname = mdd_filename.encode('utf-8')
        else:
            bfname = mdd_filename
        print('======== %s ========' % bfname)
        print('  Number of Entries : %d' % len(mdd))
        for key, value in mdd.header.items():
            print('  %s : %s' % (key, value))
    else:
        mdd = None

    if args.extract:
        # write out glossary
        if mdx:
            output_fname = ''.join([base, os.path.extsep, 'txt'])
            tf = open(output_fname, 'wb')
            for key, value in mdx.items():
                tf.write(key)
                tf.write(b'\r\n')
                tf.write(value)
                if not value.endswith(b'\n'):
                    tf.write(b'\r\n')
                tf.write(b'\r\n')
            tf.close()
            # write out style
            if mdx.header.get('StyleSheet'):
                style_fname = ''.join([base, '_style', os.path.extsep, 'txt'])
                sf = open(style_fname, 'wb')
                sf.write(b'\r\n'.join(mdx.header['StyleSheet'].splitlines()))
                sf.close()
        # write out optional data files
        if mdd:
            datafolder = os.path.join(os.path.dirname(args.filename), args.datafolder)
            if not os.path.exists(datafolder):
                os.makedirs(datafolder)
            for key, value in mdd.items():
                fname = key.decode('utf-8').replace('\\', os.path.sep)
                dfname = datafolder + fname
                if not os.path.exists(os.path.dirname(dfname)):
                    os.makedirs(os.path.dirname(dfname))
                df = open(dfname, 'wb')
                df.write(value)
                df.close()

--------------------------------------------------------------------------------
/ripemd128.py:
--------------------------------------------------------------------------------
"""
Copyright by https://github.com/zhansliu/writemdict

ripemd128.py - A simple ripemd128 library in pure Python.

Supports both Python 2 (versions >= 2.6) and Python 3.

Usage:
    from ripemd128 import ripemd128
    digest = ripemd128(b"The quick brown fox jumps over the lazy dog")
    assert(digest == b"\x3f\xa9\xb5\x7f\x05\x3c\x05\x3f\xbe\x27\x35\xb2\x38\x0d\xb5\x96")

"""



import struct


# follows this description: http://homes.esat.kuleuven.be/~bosselae/ripemd/rmd128.txt

def f(j, x, y, z):
    assert(0 <= j and j < 64)
    if j < 16:
        return x ^ y ^ z
    elif j < 32:
        return (x & y) | (z & ~x)
    elif j < 48:
        return (x | (0xffffffff & ~y)) ^ z
    else:
        return (x & z) | (y & ~z)

def K(j):
    assert(0 <= j and j < 64)
    if j < 16:
        return 0x00000000
    elif j < 32:
        return 0x5a827999
    elif j < 48:
        return 0x6ed9eba1
    else:
        return 0x8f1bbcdc

def Kp(j):
    assert(0 <= j and j < 64)
    if j < 16:
        return 0x50a28be6
    elif j < 32:
        return 0x5c4dd124
    elif j < 48:
        return 0x6d703ef3
    else:
        return 0x00000000

def padandsplit(message):
    """
    returns a two-dimensional array X[i][j] of 32-bit integers, where j ranges
    from 0 to 15.
    First pads the message so that its length in bytes is congruent to
    56 (mod 64), by adding a byte 0x80 and then padding with 0x00 bytes.
    Then adds the little-endian 64-bit representation of the original length.
    Finally, splits the result up into 64-byte blocks, which are further
    parsed as 32-bit integers.
    """
    origlen = len(message)
    padlength = 64 - ((origlen - 56) % 64)  # minimum padding is 1!
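    # Worked example of the formula above: for origlen = 0 the padding is
    # 64 - ((0 - 56) % 64) = 64 - 8 = 56 bytes, so 0 + 56 + 8 length bytes fill
    # exactly one 64-byte block; for origlen = 56 it is a full 64 bytes, since
    # at least the one 0x80 byte must always be added.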
    message += b"\x80"
    message += b"\x00" * (padlength - 1)
    message += struct.pack("<Q", origlen * 8)

    blocks = []
    for i in range(0, len(message), 64):
        block = []
        for j in range(0, 64, 4):
            block.append(struct.unpack("<L", message[i + j:i + j + 4])[0])
        blocks.append(block)
    return blocks


def add(*args):
    return sum(args) & 0xffffffff

def rol(s, x):
    return ((x << s) | (x >> (32 - s))) & 0xffffffff

r = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
      7, 4,13, 1,10, 6,15, 3,12, 0, 9, 5, 2,14,11, 8,
      3,10,14, 4, 9,15, 8, 1, 2, 7, 0, 6,13,11, 5,12,
      1, 9,11,10, 0, 8,12, 4,13, 3, 7,15,14, 5, 6, 2]
rp = [ 5,14, 7, 0, 9, 2,11, 4,13, 6,15, 8, 1,10, 3,12,
       6,11, 3, 7, 0,13, 5,10,14,15, 8,12, 4, 9, 1, 2,
      15, 5, 1, 3, 7,14, 6, 9,11, 8,12, 2,10, 0, 4,13,
       8, 6, 4, 1, 3,11,15, 0, 5,12, 2,13, 9, 7,10,14]
s = [11,14,15,12, 5, 8, 7, 9,11,13,14,15, 6, 7, 9, 8,
      7, 6, 8,13,11, 9, 7,15, 7,12,15, 9,11, 7,13,12,
     11,13, 6, 7,14, 9,13,15,14, 8,13, 6, 5,12, 7, 5,
     11,12,14,15,14,15, 9, 8, 9,14, 5, 6, 8, 6, 5,12]
sp = [ 8, 9, 9,11,13,15,15, 5, 7, 7, 8,11,14,14,12, 6,
       9,13,15, 7,12, 8, 9,11, 7, 7,12, 7, 6,15,13,11,
       9, 7,15,11, 8, 6, 6,14,12,13, 5,14,13,13, 7, 5,
      15, 5, 8,11,14,14, 6,14, 6, 9,12, 9,12, 5,15, 8]


def ripemd128(message):
    h0 = 0x67452301
    h1 = 0xefcdab89
    h2 = 0x98badcfe
    h3 = 0x10325476
    X = padandsplit(message)
    for i in range(len(X)):
        (A, B, C, D) = (h0, h1, h2, h3)
        (Ap, Bp, Cp, Dp) = (h0, h1, h2, h3)
        for j in range(64):
            T = rol(s[j], add(A, f(j, B, C, D), X[i][r[j]], K(j)))
            (A, D, C, B) = (D, C, B, T)
            T = rol(sp[j], add(Ap, f(63 - j, Bp, Cp, Dp), X[i][rp[j]], Kp(j)))
            (Ap, Dp, Cp, Bp) = (Dp, Cp, Bp, T)
        T = add(h1, C, Dp)
        h1 = add(h2, D, Ap)
        h2 = add(h3, A, Bp)
        h3 = add(h0, B, Cp)
        h0 = T


    return struct.pack("<LLLL", h0, h1, h2, h3)
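
# Example of viewing a digest as hex; the expected value below is just the
# hex spelling of the byte string already asserted in the module docstring:
#     from binascii import hexlify
#     hexlify(ripemd128(b"The quick brown fox jumps over the lazy dog"))
#     # -> b'3fa9b57f053c053fbe2735b2380db596'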

--------------------------------------------------------------------------------
/templates/all.html:
--------------------------------------------------------------------------------
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>All Available Dictionary</title>
</head>
<body>
    {% for item in dicts %}
    <p><a href="{{item.url}}">{{item.title}}</a></p>
    {% endfor %}
</body>
</html>

--------------------------------------------------------------------------------
/templates/dict.html:
--------------------------------------------------------------------------------
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>{{title}}</title>
</head>
<body>
    <h1>{{title}}</h1>
    <div>
        {{description|safe}}
    </div>
</body>
</html>

--------------------------------------------------------------------------------
/templates/entry.html:
--------------------------------------------------------------------------------
<!DOCTYPE html>
<html>
<head>
    <meta charset="utf-8">
    <title>{{title}}/{{entry}}</title>
</head>
<body>
    {{content|safe}}
</body>
</html>

--------------------------------------------------------------------------------
/test.py:
--------------------------------------------------------------------------------
from mdict_query import IndexBuilder
import unittest
import os
import glob
import time
from timeit import timeit

class TestMdict(unittest.TestCase):

    _mdx_file = glob.glob("mdx/Vocabulary*.mdx")[0]
    _repeat = 100
    # remove existing db
    for f in glob.glob("mdx/Vocabulary*.db"):
        os.remove(f)

    def test_builder_noindex(self):
        '''test basic function'''
        for f in glob.glob("mdx/Vocabulary*.db"):
            os.remove(f)
        print("***without sql index***\n")
        start = time.time()
        bd = IndexBuilder(self._mdx_file, sql_index=False, check=True)
        print("takes {0} seconds to build without sql index\n".format(time.time() - start))

        start = time.time()
        word = 'dedicate'
        for i in range(self._repeat):
            self.assertTrue(bd.mdx_lookup(word))
        print("takes {0} seconds to lookup {1} {2} times\n".format(time.time() - start, word, self._repeat))

        start = time.time()  # reset the timer so the wildcard timing is not cumulative
        for i in range(self._repeat):
            bd.get_mdx_keys("dedi*")
        print("takes {0} seconds to lookup {1} {2} times\n".format(time.time() - start, "dedi*", self._repeat))

    def test_builder_index(self):
        '''test basic function'''
        for f in glob.glob("mdx/Vocabulary*.db"):
            os.remove(f)
        print("***with sql index***\n")
        start = time.time()
        bd = IndexBuilder(self._mdx_file, sql_index=True, check=False)
        print("takes {0} seconds to build with sql index\n".format(time.time() - start))

        start = time.time()
        word = 'dedicate'
        for i in range(self._repeat):
            bd.mdx_lookup(word)
        print("takes {0} seconds to lookup {1} {2} times\n".format(time.time() - start, word, self._repeat))

        start = time.time()  # reset the timer so the wildcard timing is not cumulative
        for i in range(self._repeat):
            bd.get_mdx_keys("dedi*")
        print("takes {0} seconds to lookup {1} {2} times\n".format(time.time() - start, "dedi*", self._repeat))


if __name__ == '__main__':
    unittest.main()

--------------------------------------------------------------------------------
/test_lzo.py:
--------------------------------------------------------------------------------
from mdict_query import IndexBuilder

bd = IndexBuilder("mdx\\oed.mdx")
keys = bd.get_mdx_keys("ded*")
result = bd.mdx_lookup('a')
pass

--------------------------------------------------------------------------------
/web.py:
--------------------------------------------------------------------------------
from flask import Flask, send_from_directory, abort, render_template, jsonify, Response
from mdict_dir import Dir
#from mdict_query import IndexBuilder
import os
import re
import sys
import json
#IndexBuilder('vocab.mdx')
#pass
app = Flask(__name__)

# add regex support for URL routing
from werkzeug.routing import BaseConverter

class RegexConverter(BaseConverter):
    def __init__(self, url_map, *items):
        super(RegexConverter, self).__init__(url_map)
        self.regex = items[0]

app.url_map.converters['regex'] = RegexConverter
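# For illustration (hypothetical URL): with the converter registered above, a
# rule such as '<regex(".+?\."):base><regex("css|png|jpg|gif|mp3|js|wav|ogg"):ext>'
# (used by getFile below) matches 'logo.png' with base='logo.' and ext='png'.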
#################
# flatten a multi-level path into a single file name
def path2file(path):
    return path.replace('/', '_')
# convert a dictionary title into a form usable in a URL
def title2url(title):
    return re.sub(r"。|,|?|\s|,|\.|/|\\|(|)|\(|\)", "", title.lower())
# init app
mdict_dir = 'mdx'  # directory holding the mdx/mdd files
mdd_cache_dir = 'cache'

if not os.path.isdir(mdict_dir):
    print('no mdx directory\n', file=sys.stderr)
    os.makedirs(mdict_dir)

if not os.path.isdir(mdd_cache_dir):
    os.makedirs(mdd_cache_dir)


mdict = Dir(mdict_dir)
#config = mdict._config['dicts'][0]
mdx_map = {}
for dic in mdict._config['dicts']:
    mdx_map[title2url(dic['title'])] = dic['builder']
##########
@app.route('/')
def hello_world():
    return 'Hello World'


@app.route('/dict/')
def all_dicts():
    dicts = []
    for dic in mdict._config['dicts']:
        title = dic['title']
        dicts.append({
            'title': title,
            'url': '/dict/{0}/'.format(title2url(title))
        })
    return render_template('all.html', dicts=dicts)

@app.route('/dict/<title>/')
def description(title):
    if title not in mdx_map:
        return "Dictionary not found"
    for dic in mdict._config['dicts']:
        if title2url(dic['title']) == title:
            return render_template("dict.html", title=dic['title'], description=dic['description'], url_title=title)


@app.route('/dict/search/<query>/')
def search(query):
    result = []
    for dic in mdict._config['dicts']:
        bd = dic['builder']
        result.append([title2url(dic['title']), bd.get_mdx_keys(query)])
    dat = json.dumps(result, ensure_ascii=False)
    resp = Response(response=dat,  # standard way to return json
                    status=200,
                    mimetype="application/json")
    return resp


@app.route('/dict/<title>/<regex(".+?\."):base><regex("css|png|jpg|gif|mp3|js|wav|ogg"):ext>')
def getFile(title, base, ext):
    #print(base + ext, file=sys.stderr)
    if title not in mdx_map:
        return "Dictionary not found"
    builder = mdx_map[title]
    # is it an external (loose) file next to the dictionary?
    external_file = os.path.join(mdict_dir, base + ext)
    if os.path.isfile(external_file):
        return send_from_directory(mdict_dir, base + ext)

    # is it a file stored inside the mdd?
    cache_name = path2file(base + ext)
    cache_full = os.path.join(mdd_cache_dir, cache_name)
    if not os.path.isfile(cache_full):
        mdd_key = '\\{0}{1}'.format(base, ext).replace("/", "\\")
        byte = builder.mdd_lookup(mdd_key)
        if not byte:  # the requested file was not found inside the mdd
            abort(404)  # return 404
        file = open(cache_full, 'wb')
        file.write(byte[0])
        file.close()
    return send_from_directory(mdd_cache_dir, cache_name)


@app.route('/dict/<title>/<hwd>')
def getEntry(title, hwd):
    if title not in mdx_map:
        return "Dictionary not found"
    builder = mdx_map[title]
    result = builder.mdx_lookup(hwd)
    if result:
        text = result[0]
    else:
        return "<p>{1} was not found in dictionary {0}</p>".format(title, hwd)

    #return
    #text.replace("\r\n","").replace("entry://","").replace("sound://","")
    return render_template("entry.html", content=text, title=title, entry=hwd)

if __name__ == '__main__':
    app.run('127.0.0.1', 5000, debug=True)

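# Quick manual checks once the server is running (URLs are illustrative and
# depend on the dictionaries actually dropped into the mdx folder):
#   http://127.0.0.1:5000/dict/             -- list all loaded dictionaries
#   http://127.0.0.1:5000/dict/oald/        -- show one dictionary's description page
#   http://127.0.0.1:5000/dict/oald/apple   -- render the entry for "apple"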

--------------------------------------------------------------------------------
/web.spec:
--------------------------------------------------------------------------------
# -*- mode: python -*-

block_cipher = None


a = Analysis(['web.py'],
             pathex=['D:\\Users\\Maple\\Documents\\GitHub\\mdict-query'],
             binaries=None,
             datas=None,
             hiddenimports=[],
             hookspath=[],
             runtime_hooks=[],
             excludes=[],
             win_no_prefer_redirects=False,
             win_private_assemblies=False,
             cipher=block_cipher)
pyz = PYZ(a.pure, a.zipped_data,
          cipher=block_cipher)
exe = EXE(pyz,
          a.scripts,
          exclude_binaries=True,
          name='web',
          debug=False,
          strip=False,
          upx=True,
          console=True)
coll = COLLECT(exe,
               a.binaries,
               a.zipfiles,
               a.datas,
               strip=False,
               upx=True,
               name='web')

--------------------------------------------------------------------------------
/wsgi.py:
--------------------------------------------------------------------------------

from web import app

if __name__ == "__main__":
    app.run()

--------------------------------------------------------------------------------