├── .gitattributes ├── .gitignore ├── GD2Anki.py ├── GD2Anki_setting.py ├── LICENSE ├── README.md ├── mdict_query.py ├── pureSalsa20.py ├── readmdict.py ├── ripemd128.py ├── setting in GoldenDict -2 .png ├── setting in GoldenDict.png └── 升级说明-202308.md /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # celery beat schedule file 95 | celerybeat-schedule 96 | 97 | # SageMath parsed files 98 | *.sage.py 99 | 100 | # Environments 101 | .env 102 | .venv 103 | env/ 104 | venv/ 105 | ENV/ 106 | env.bak/ 107 | venv.bak/ 108 | 109 | # Spyder project settings 110 | .spyderproject 111 | .spyproject 112 | 113 | # Rope project settings 114 | .ropeproject 115 | 116 | # mkdocs documentation 117 | /site 118 | 119 | # mypy 120 | .mypy_cache/ 121 | .dmypy.json 122 | dmypy.json 123 | 124 | # Pyre type checker 125 | .pyre/ 126 | -------------------------------------------------------------------------------- /GD2Anki.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | updated on Sun Augst 19th 20:00:15 2023 4 | 1. support multiple user dicts 5 | 2. support user to set deck 6 | 3. can load seperate CSS into line, so that it shows good in Anki. 
7 | 8 | Created on Mon Oct 28 00:00:15 2019 9 | 10 | @author: valuex 11 | """ 12 | 13 | import json 14 | import urllib.request 15 | import re 16 | from mdict_query import IndexBuilder 17 | import configparser 18 | import os 19 | import css_inline 20 | from lxml import html, etree 21 | import sys,io 22 | import bs4 #BeautifulSoup 23 | import tkinter.messagebox 24 | import copy 25 | # sys.stdout = io.TextIOWrapper(sys.stdout.buffer,encoding='gb18030') 26 | 27 | 28 | def request(action, **params): 29 | return {'action': action, 'params': params, 'version': 6} 30 | 31 | def invoke(action, **params): 32 | requestJson = json.dumps(request(action, **params)).encode('utf-8') 33 | response = json.load(urllib.request.urlopen(urllib.request.Request('http://localhost:8765', requestJson))) 34 | if len(response) != 2: 35 | raise Exception('response has an unexpected number of fields') 36 | if 'error' not in response: 37 | raise Exception('response is missing required error field') 38 | if 'result' not in response: 39 | raise Exception('response is missing required result field') 40 | if response['error'] is not None: 41 | raise Exception(response['error']) 42 | return response['result'] 43 | 44 | def NoteContent(dic_deck,FrontStr, BackStr): 45 | BackStr=BackStr.replace('"','\\"') 46 | BackStr=BackStr.replace(u'\xa0',u' ') 47 | BackStr = re.sub(r'', '', BackStr) 48 | # BackStr=BackStr.replace('','
') 49 | newnote=""" 50 | { 51 | "deckName": "%s", 52 | "modelName": "%s", 53 | "fields": { 54 | "%s": "%s", 55 | "%s": "%s" 56 | }, 57 | "options": { 58 | "allowDuplicate": false 59 | }, 60 | "tags": ["GoldenDict"] 61 | } 62 | """ % (dic_deck["dname"],dic_deck["mname"],dic_deck["cfname"],FrontStr,dic_deck["cbname"],BackStr) 63 | 64 | return newnote 65 | 66 | def consolidate_CSS_into_HTML(strHtml): 67 | soup = bs4.BeautifulSoup(strHtml,features="lxml") 68 | stylesheets = soup.findAll("link", {"rel": "stylesheet"}) 69 | for s in stylesheets: 70 | t = soup.new_tag('style') 71 | if(os.path.isfile(s["href"])): 72 | c = bs4.element.NavigableString(open(s["href"]).read()) 73 | t.insert(0,c) 74 | t['type'] = 'text/css' 75 | s.replaceWith(t) 76 | else: 77 | print("css file" + s["href"] + "not found") 78 | continue 79 | return str(soup) 80 | 81 | def merge_html(origin_doc, new_doc): 82 | soup_orgin_doc = get_soup(origin_doc) 83 | soup_newdoc = get_soup(new_doc) 84 | b = soup_orgin_doc.new_tag('hr') 85 | # b.append('----') 86 | soup_orgin_doc.body.append(b) 87 | for element in soup_newdoc.body: 88 | soup_orgin_doc.body.append(copy.copy(element)) 89 | return str(soup_orgin_doc) 90 | def get_soup(strInput): 91 | b_soup = bs4.BeautifulSoup(strInput,features="lxml") 92 | try: 93 | b_soup_body=b_soup.body 94 | except: 95 | html_content=str_to_html(strInput) 96 | b_soup = bs4.BeautifulSoup(html_content,features="lxml") 97 | return b_soup 98 | 99 | def str_to_html(strInput): 100 | html_body="" 101 | list_txt=strInput.splitlines() 102 | for line in list_txt: 103 | body_line="
<p>" + line + "</p>
\n" 104 | html_body=html_body + body_line 105 | html_prefix="<html><body>" 106 | html_suffix="</body></html>" 107 | html_doc=html_prefix+html_body+html_suffix 108 | return html_doc 109 | 110 | if __name__ == '__main__': 111 | All_Meanings="" 112 | config = configparser.ConfigParser() 113 | # get absolute path 114 | fp_dir = os.path.dirname(os.path.realpath(sys.argv[0])) 115 | iniFile = os.path.join(fp_dir, "Config.ini") 116 | if(not os.path.isfile(iniFile)): 117 | tkinter.messagebox.showerror(title="Error", message="No ini file found") 118 | exit() 119 | config.read(iniFile, encoding='utf-8') 120 | mdict_files=config['Dicts'] 121 | for dict_file in mdict_files: 122 | mdict=mdict_files[dict_file] 123 | # https://note.nkmk.me/en/python-os-basename-dirname-split-splitext/#get-the-extension-without-dot-period 124 | ext_without_dot = os.path.splitext(mdict)[1][1:] 125 | if(ext_without_dot!="mdx"): 126 | tkinter.messagebox.showerror(title="Ini setting Error", message="Dict file is not mdx") 127 | exit() 128 | mdict_short_name= os.path.splitext(os.path.basename(mdict))[0] 129 | builder = IndexBuilder(mdict) 130 | # Word="count" 131 | Word=sys.argv[1] 132 | Meanings_In_This_Dict=builder.mdx_lookup(Word, ignorecase = True) 133 | # str_content = Meanings.decode('utf-8') 134 | Meanings='\n'.join(Meanings_In_This_Dict)
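# prepend the dictionary's short name as a heading, then pull its linked
# CSS into the page and inline it so the entry keeps its styling in Anki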
135 | Meanings="<h3>"+mdict_short_name+"</h3>
"+Meanings 136 | Meanings_with_CSS=consolidate_CSS_into_HTML(Meanings) 137 | Meanings_with_CSS_inlined = css_inline.inline(Meanings_with_CSS) 138 | if (All_Meanings==""): 139 | All_Meanings=Meanings_with_CSS_inlined 140 | else: 141 | All_Meanings=merge_html(All_Meanings, Meanings_with_CSS_inlined) 142 | 143 | # fp = open("2.html","w", encoding='utf-8') 144 | # fp.write(All_Meanings) 145 | # for item in Meanings: 146 | # fp.write(item) 147 | # fp.write("\n") 148 | # fp.close() 149 | try: 150 | deck_name=config['Deck']['DeckName'] 151 | modle_name=config['Deck']['ModelName'] 152 | card_front_name=config['Deck']['CardFrontName'] 153 | card_back_name=config['Deck']['CardBackName'] 154 | except: 155 | tkinter.messagebox.showerror(title="Ini setting Error", message="Missing Deck/Model/CardFront/CardBack setting") 156 | exit() 157 | user_card_template={"dname":deck_name,"mname":modle_name,"cfname":card_front_name,"cbname":card_back_name} 158 | CardNote=NoteContent(user_card_template,Word, All_Meanings) 159 | newnote=json.loads(CardNote,strict=False) 160 | 161 | try: 162 | result = invoke('addNote',note=newnote) 163 | print(result) 164 | except: 165 | AlertWhenFails=config['Config']['AlertWhenFails'] 166 | if(AlertWhenFails): 167 | tkinter.messagebox.showerror(title="Oops", message="Something went wrong...") 168 | exit() 169 | 170 | 171 | -------------------------------------------------------------------------------- /GD2Anki_setting.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Mar 5 20:24:57 2021 4 | 5 | @author: wei_x 6 | """ 7 | 8 | 9 | import urllib.request 10 | from socket import timeout 11 | import json 12 | import psutil 13 | import os 14 | from pathlib import Path 15 | import configparser 16 | from tkinter import filedialog 17 | from tkinter import Tk 18 | 19 | config = configparser.ConfigParser() 20 | fp_dir = os.path.dirname(os.path.realpath(__file__)) 21 | iniFile = os.path.join(fp_dir, "Config.ini") 22 | def check_url( url, timeout1=5 ): 23 | try: 24 | if(urllib.request.urlopen(url,timeout=timeout1).getcode() == 200): 25 | return 1 26 | except urllib.error.URLError as e: 27 | return 2 28 | except timeout: 29 | return "Time out" 30 | 31 | def process_running(prcsName): 32 | if(prcsName in (p.name() for p in psutil.process_iter())): 33 | return True 34 | else: 35 | return False 36 | def request(action, **params): 37 | return {'action': action, 'params': params, 'version': 6} 38 | 39 | def invoke(action, **params): 40 | requestJson = json.dumps(request(action, **params)).encode('utf-8') 41 | response = json.load(urllib.request.urlopen(urllib.request.Request('http://localhost:8765', requestJson))) 42 | if len(response) != 2: 43 | raise Exception('response has an unexpected number of fields') 44 | if 'error' not in response: 45 | raise Exception('response is missing required error field') 46 | if 'result' not in response: 47 | raise Exception('response is missing required result field') 48 | if response['error'] is not None: 49 | raise Exception(response['error']) 50 | return response['result'] 51 | 52 | def select_mdx_file(): 53 | root = Tk() 54 | root.filename = filedialog.askopenfilename(initialdir = "/",title = "Select file",filetypes = (("mdx files","*.mdx"),("all files","*.*"))) 55 | if(root.filename): 56 | print (root.filename) 57 | mdxfilepath_win=root.filename.replace("/", "\\") 58 | config['Default'] = {} 59 | config['Default']['mdxfile'] = mdxfilepath_win # update 60 | with 
open(iniFile, 'w',encoding='utf-8') as configfile: # save 61 | config.write(configfile) 62 | root.withdraw() 63 | return mdxfilepath_win 64 | else: 65 | root.withdraw() 66 | return False 67 | def check_mdxdb_file(mdx_file): 68 | mdxdb_file=Path(mdx_file+'.db') 69 | if(mdxdb_file.is_file()): 70 | print('mdx.db file exists') 71 | if(mdxdb_file.stat().st_size>1): 72 | print('mdx.db file is valid') 73 | return True 74 | else: 75 | os.rename(mdx_file+'.db', mdx_file+'.db.bk') # rename the zero-size .db file as a backup 76 | run_mdx_server() 77 | else: 78 | run_mdx_server() 79 | def check_config_mdx_db_3files(): 80 | 81 | # print(iniFile) 82 | 83 | my_file = Path(iniFile) 84 | # file exists 85 | if my_file.is_file(): 86 | print("Config.ini exists") 87 | try: 88 | config.read(iniFile, encoding='utf-8') 89 | mdict=config['Default']['mdxfile'] 90 | if(Path(mdict).is_file()): 91 | print('mdx file exists') 92 | check_mdxdb_file(mdict) 93 | except: 94 | print('can NOT find the mdx file specified in Config.ini') 95 | SelectedFile=select_mdx_file() 96 | if(SelectedFile): 97 | check_mdxdb_file(SelectedFile) 98 | else: 99 | print('user abort, no mdx file selected') 100 | 101 | else: 102 | print('Config.ini is missing') 103 | SelectedFile=select_mdx_file() 104 | if(SelectedFile): 105 | check_mdxdb_file(SelectedFile) 106 | else: 107 | print('user abort, no mdx file selected') 108 | 109 | def run_mdx_server(): 110 | fp_dir = os.path.dirname(os.path.realpath(__file__)) 111 | mdx_server= os.path.join(fp_dir, "mdx_server.exe") 112 | if(Path(mdx_server).is_file()): 113 | print(mdx_server) 114 | os.startfile(mdx_server) 115 | else: 116 | print('can NOT find mdx_server.exe') 117 | def run_GD2Anki(EngWord): 118 | fp_dir = os.path.dirname(os.path.realpath(__file__)) 119 | GD2Anki= os.path.join(fp_dir, "GD2Anki.exe") 120 | if(Path(GD2Anki).is_file()): 121 | GD2Anki= GD2Anki+' ' + EngWord 122 | # os.startfile(GD2Anki) 123 | print(GD2Anki) 124 | os.system(GD2Anki) 125 | else: 126 | print('can NOT find GD2Anki.exe') 127 | print(' put GD2Anki_setting.exe and GD2Anki.exe under the same folder') 128 | 129 | 130 | def check_model_name(): 131 | results=invoke('modelNames') 132 | if('NewWordsType' in results): 133 | print('NewWordsType is set correctly') 134 | check_model_field_name() 135 | return True 136 | else: 137 | create_model() 138 | return False 139 | def check_model_field_name(): 140 | results=invoke('modelFieldNames',modelName='NewWordsType') 141 | if(('Front' in results) and ('Back' in results)): 142 | print('Model fields are set correctly') 143 | else: 144 | print('manually update the field names of the NewWordsType model to Front and Back') 145 | print(' 1- Anki main window, click <Tools>') 146 | print(' 2- <Anki> window, click <Tools> menu-->select <Manage Note Types>') 147 | print(' 3- <Manage Note Types> window, click drop down combo box-->select <NewWordsType>-->Click <Fields...> button') 148 | print(' 4- <Fields> window, click <Rename> button-->Rename the field names as Front and Back') 149 | # update_model_field_name() 150 | def update_model_field_name(): 151 | NewModel= { 152 | "name": "NewWordsType", 153 | "templates": { 154 | "My Card 1": { 155 | "Front": "{{Front}}", 156 | "Back": "{{Back}}" 157 | } 158 | } 159 | } 160 | results=invoke('updateModelTemplates',model=NewModel) 161 | print(results) 162 | print('Model fields updated') 163 | def create_model(): 164 | mname="NewWordsType" 165 | order=["Front", "Back"] 166 | css_style="Optional CSS with default to builtin css" 167 | cardTemp=[ 168 | { 169 | "Name": "My Card 1", 170 | "Front": "{{Front}}", 171 | "Back": "{{Back}}" 172 | } 173 | ] 174 | 
175 | result=invoke('createModel',modelName=mname,inOrderFields=order,css=css_style,cardTemplates=cardTemp) 176 | print(result) 177 | print('Model created') 178 | 179 | 180 | if __name__ == '__main__': 181 | if(process_running('anki.exe')): 182 | print('Anki is running') 183 | else: 184 | print('you should run Anki first') 185 | os._exit(0) 186 | a=check_url('http://localhost:8765/') # check AnkiConnect 187 | if(a==1): 188 | print("AnkiConnect is successfully running") 189 | elif(a==2): 190 | print("AnkiConnect is NOT running or NOT installed") 191 | os._exit(0) 192 | else: 193 | print(a) 194 | os._exit(0) 195 | # check whether the NewWords deck exists 196 | result = invoke('deckNames') 197 | if('NewWords' in result): 198 | print("deck--NewWords-- exists") 199 | else: 200 | invoke('createDeck', deck='NewWords') 201 | print("deck--NewWords-- created") 202 | 203 | 204 | # check whether the Config.ini file exists 205 | check_config_mdx_db_3files() 206 | 207 | check_model_name() 208 | run_GD2Anki("apple") -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 valuex 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # GoldenDict2Anki 2 | [Instructions in Chinese (中文说明)](https://zhuanlan.zhihu.com/p/104513438) 3 | Automatically saves the words you look up in GoldenDict to Anki 4 | 5 | When you look up a word in GoldenDict's main window or in its small pop-up window, the word along with its explanation will be saved into Anki simultaneously. 6 | 7 | # Usage: 8 | ## Requirements 9 | 1. Anki & AnkiConnect [link](https://ankiweb.net/shared/info/2055492159) 10 | 2. GoldenDict [link](https://sourceforge.net/projects/goldendict/files/early%20access%20builds/) 11 | 3. a local dictionary (*.mdx) file 12 | 13 | ## Setting in GoldenDict 14 | 1. Open GoldenDict, click [Edit]--[Dictionaries]--[Source]--[Programs]--[Add], set it as follows and click [Apply]: 15 | - Type: HTML 16 | - Name: whatever you wish; let's call it **Ak** here 17 | - Command Line: "**YourLocalPath**\GD2Anki.exe" %GDWORD% 18 | - please note that the path to GD2Anki.exe is double-quoted and that there is a `space` before %GDWORD% 19 | - Icon: any local icon file 20 | 21 | 2. It is suggested to move the **Ak** dictionary to the last position, because the saving-to-Anki process takes about 3 seconds, which is not fast enough. 22 | - Click [Edit]--[Dictionaries]--[Dictionaries], select the **Ak** dictionary, drag it to the last position, and click [OK]
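Before relying on the GoldenDict hook, it can help to confirm that AnkiConnect is actually reachable. A minimal sanity check, assuming AnkiConnect's default port 8765 (this snippet is only an illustration, not part of the repository):
```
import json, urllib.request

payload = json.dumps({'action': 'version', 'version': 6}).encode('utf-8')
request = urllib.request.Request('http://localhost:8765', payload)
print(json.load(urllib.request.urlopen(request)))  # e.g. {'result': 6, 'error': None}
```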
23 | ## Setting in **Config.ini** 24 | 1. **Make sure you have a deck set up like the image below shows** 25 | - **DeckName**: the deck the word will be saved into 26 | - **ModelName**: the model name of the deck 27 | - **CardFrontName**: the field for the **word** you looked up 28 | - **CardBackName**: the field for the **explanation** retrieved from the local dictionaries (*.mdx) set in **Config.ini** 29 | - **Note**: in **Config.ini**, there is no need for quotation marks around the mdxfile paths. Just like this: 30 | ``` 31 | [Deck] 32 | DeckName=Default 33 | ModelName=Basic 34 | CardFrontName=Front 35 | CardBackName=Back 36 | [Dicts] 37 | mdxfile1 = D:\Downloads\GoldenDict2Anki-master\朗文6中英双解.mdx 38 | mdxfile2 = D:\Downloads\GoldenDict2Anki-master\简明英汉字典增强版.mdx 39 | [Config] 40 | AlertWhenFails=1 41 | ``` 42 | ![image](https://github.com/valuex/GoldenDict2Anki/assets/3627812/45dcd576-a7b0-4cb2-a759-90979225505b) 43 | 44 | 45 | # Based on: 46 | 1. mdict_query: [link](https://github.com/mmjang/mdict-query) 47 | -------------------------------------------------------------------------------- /mdict_query.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | from struct import pack, unpack 3 | from io import BytesIO 4 | import re 5 | import sys 6 | import os 7 | import sqlite3 8 | import json 9 | #from aqt.utils import showInfo, showText, tooltip 10 | from readmdict import MDX, MDD 11 | 12 | # zlib compression is used for engine version >=2.0 13 | import zlib 14 | # LZO compression is used for engine version < 2.0 15 | try: 16 | import lzo 17 | except ImportError: 18 | lzo = None 19 | #print("LZO compression support is not available") 20 | 21 | # 2x3 compatible: str()[in Python 3] = unicode()[only in Python 2] 22 | if sys.hexversion >= 0x03000000: 23 | unicode = str 24 | 25 | version = '1.1' 26 | 27 | 28 | class IndexBuilder(object): 29 | # todo: enable history 30 | 31 | def __init__(self, fname, encoding="", passcode=None, force_rebuild=False, 32 | enable_history=False, sql_index=True, check=False): 33 | self._mdx_file = fname 34 | self._encoding = 'utf-8' 35 | self._stylesheet = {} 36 | self._title = '' 37 | self._version = '' 38 | self._description = '' 39 | self._sql_index = sql_index 40 | self._check = check 41 | self._force_rebuild = force_rebuild 42 | _filename, _file_extension = os.path.splitext(fname) 43 | # assert(_file_extension == '.mdx') 44 | # assert(os.path.isfile(fname)) 45 | self._mdx_db = _filename + ".mdx.db" 46 | self._mdd_db = _filename + ".mdd.db" 47 | self._mdd_file = _filename + ".mdd" 48 | self.header_build_flag = False 49 | 50 | def get_header(self): 51 | 52 | def _(): 53 | self.header_build_flag = True 54 | mdx = MDX(self._mdx_file, only_header=True) 55 | self._encoding = mdx.meta['encoding'] 56 | self._stylesheet = json.loads(mdx.meta['stylesheet']) 57 | self._title = mdx.meta['title'] 58 | self._description = mdx.meta['description'] 59 | 60 | if os.path.isfile(self._mdx_db): 61 | # read from META table 62 | try: 63 | conn = sqlite3.connect(self._mdx_db) 64 | #cursor = conn.execute("SELECT * FROM META") 65 | cursor = conn.execute( 66 | 'SELECT value FROM META WHERE key IN ("encoding","stylesheet","title","description","version")') 67 | self._encoding, stylesheet,\ 68 | self._title, 
self._description, self._version = ( 69 | each[0] for each in cursor) 70 | self._stylesheet = json.loads(stylesheet) 71 | conn.close() 72 | if not self._version: 73 | _() 74 | except: 75 | _() 76 | else: 77 | _() 78 | 79 | def rebuild(self): 80 | self._make_mdx_index() 81 | if os.path.isfile(self._mdd_file): 82 | self._make_mdd_index() 83 | 84 | def check_build(self): 85 | # check if the mdx.db and mdd.db file is available 86 | if self.header_build_flag or not os.path.isfile(self._mdx_db): 87 | self._make_mdx_index() 88 | if os.path.isfile(self._mdd_file) and not os.path.isfile(self._mdd_db): 89 | self._make_mdd_index() 90 | self.header_build_flag = False 91 | 92 | @property 93 | def meta(self): 94 | return {'title': self._title, 'description': self._description, 95 | 'encoding': self._encoding, 'version': self._version, 96 | 'stylesheet': self._stylesheet} 97 | 98 | def _replace_stylesheet(self, txt): 99 | # substitute stylesheet definition 100 | txt_list = re.split('`\d+`', txt) 101 | txt_tag = re.findall('`\d+`', txt) 102 | txt_styled = txt_list[0] 103 | for j, p in enumerate(txt_list[1:]): 104 | style = self._stylesheet[txt_tag[j][1:-1]] 105 | if p and p[-1] == '\n': 106 | txt_styled = txt_styled + \ 107 | style[0].encode('utf-8') + p.rstrip() + \ 108 | style[1].encode('utf-8') + '\r\n' 109 | else: 110 | txt_styled = txt_styled + \ 111 | style[0].encode('utf-8') + p + style[1].encode('utf-8') 112 | return txt_styled 113 | 114 | def _make_mdx_index(self): 115 | if os.path.exists(self._mdx_db): 116 | os.remove(self._mdx_db) 117 | mdx = MDX(self._mdx_file, only_header=False) 118 | index_list = mdx.get_index(check_block=self._check) 119 | conn = sqlite3.connect(self._mdx_db) 120 | c = conn.cursor() 121 | c.execute( 122 | ''' CREATE TABLE MDX_INDEX 123 | (key_text text not null, 124 | file_pos integer, 125 | compressed_size integer, 126 | decompressed_size integer, 127 | record_block_type integer, 128 | record_start integer, 129 | record_end integer, 130 | offset integer 131 | )''' 132 | ) 133 | 134 | tuple_list = [ 135 | (item['key_text'], 136 | item['file_pos'], 137 | item['compressed_size'], 138 | item['decompressed_size'], 139 | item['record_block_type'], 140 | item['record_start'], 141 | item['record_end'], 142 | item['offset'] 143 | ) 144 | for item in index_list 145 | ] 146 | c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)', 147 | tuple_list) 148 | # build the metadata table 149 | c.execute( 150 | '''CREATE TABLE META 151 | (key text, 152 | value text 153 | )''') 154 | c.executemany( 155 | 'INSERT INTO META VALUES (?,?)', 156 | [('encoding', self.meta['encoding']), 157 | ('stylesheet', json.dumps(self.meta['stylesheet'])), 158 | ('title', self.meta['title']), 159 | ('description', self.meta['description']), 160 | ('version', version) 161 | ] 162 | ) 163 | 164 | if self._sql_index: 165 | c.execute( 166 | ''' 167 | CREATE INDEX key_index ON MDX_INDEX (key_text) 168 | ''' 169 | ) 170 | 171 | conn.commit() 172 | conn.close() 173 | 174 | def _make_mdd_index(self): 175 | if os.path.exists(self._mdd_db): 176 | os.remove(self._mdd_db) 177 | mdd = MDD(self._mdd_file) 178 | index_list = mdd.get_index(check_block=self._check) 179 | conn = sqlite3.connect(self._mdd_db) 180 | c = conn.cursor() 181 | c.execute( 182 | ''' CREATE TABLE MDX_INDEX 183 | (key_text text not null unique, 184 | file_pos integer, 185 | compressed_size integer, 186 | decompressed_size integer, 187 | record_block_type integer, 188 | record_start integer, 189 | record_end integer, 190 | offset integer 191 | )''' 
192 | ) 193 | 194 | tuple_list = [ 195 | (item['key_text'], 196 | item['file_pos'], 197 | item['compressed_size'], 198 | item['decompressed_size'], 199 | item['record_block_type'], 200 | item['record_start'], 201 | item['record_end'], 202 | item['offset'] 203 | ) 204 | for item in index_list 205 | ] 206 | c.executemany('INSERT INTO MDX_INDEX VALUES (?,?,?,?,?,?,?,?)', 207 | tuple_list) 208 | if self._sql_index: 209 | c.execute( 210 | ''' 211 | CREATE UNIQUE INDEX key_index ON MDX_INDEX (key_text) 212 | ''' 213 | ) 214 | 215 | conn.commit() 216 | conn.close() 217 | 218 | @staticmethod 219 | def get_data_by_index(fmdx, index): 220 | fmdx.seek(index['file_pos']) 221 | record_block_compressed = fmdx.read(index['compressed_size']) 222 | record_block_type = record_block_compressed[:4] 223 | record_block_type = index['record_block_type'] 224 | decompressed_size = index['decompressed_size'] 225 | #adler32 = unpack('>I', record_block_compressed[4:8])[0] 226 | if record_block_type == 0: 227 | _record_block = record_block_compressed[8:] 228 | # lzo compression 229 | elif record_block_type == 1: 230 | if lzo is None: 231 | #print("LZO compression is not supported") 232 | pass 233 | # decompress 234 | header = b'\xf0' + pack('>I', index['decompressed_size']) 235 | _record_block = lzo.decompress(record_block_compressed[ 236 | 8:], initSize=decompressed_size, blockSize=1308672) 237 | # zlib compression 238 | elif record_block_type == 2: 239 | # decompress 240 | _record_block = zlib.decompress(record_block_compressed[8:]) 241 | data = _record_block[index['record_start'] - 242 | index['offset']:index['record_end'] - index['offset']] 243 | return data 244 | 245 | def get_mdx_by_index(self, fmdx, index): 246 | data = self.get_data_by_index(fmdx, index) 247 | record = data.decode(self._encoding, errors='ignore').strip( 248 | u'\x00').encode('utf-8') 249 | if self._stylesheet: 250 | record = self._replace_stylesheet(record) 251 | record = record.decode('utf-8') 252 | return record 253 | 254 | def get_mdd_by_index(self, fmdx, index): 255 | return self.get_data_by_index(fmdx, index) 256 | 257 | @staticmethod 258 | def lookup_indexes(db, keyword, ignorecase=None): 259 | indexes = [] 260 | if ignorecase: 261 | sql = u'SELECT * FROM MDX_INDEX WHERE lower(key_text) = lower("{}")'.format( 262 | keyword) 263 | else: 264 | sql = u'SELECT * FROM MDX_INDEX WHERE key_text = "{}"'.format( 265 | keyword) 266 | with sqlite3.connect(db) as conn: 267 | cursor = conn.execute(sql) 268 | for result in cursor: 269 | index = {} 270 | index['file_pos'] = result[1] 271 | index['compressed_size'] = result[2] 272 | index['decompressed_size'] = result[3] 273 | index['record_block_type'] = result[4] 274 | index['record_start'] = result[5] 275 | index['record_end'] = result[6] 276 | index['offset'] = result[7] 277 | indexes.append(index) 278 | return indexes 279 | 280 | def mdx_lookup(self, keyword, ignorecase=None): 281 | lookup_result_list = [] 282 | indexes = self.lookup_indexes(self._mdx_db, keyword, ignorecase) 283 | with open(self._mdx_file, 'rb') as mdx_file: 284 | for index in indexes: 285 | lookup_result_list.append( 286 | self.get_mdx_by_index(mdx_file, index)) 287 | return lookup_result_list 288 | 289 | def mdd_lookup(self, keyword, ignorecase=None): 290 | lookup_result_list = [] 291 | indexes = self.lookup_indexes(self._mdd_db, keyword, ignorecase) 292 | with open(self._mdd_file, 'rb') as mdd_file: 293 | for index in indexes: 294 | lookup_result_list.append( 295 | self.get_mdd_by_index(mdd_file, index)) 296 | return 
lookup_result_list 297 | 298 | @staticmethod 299 | def get_keys(db, query=''): 300 | if not db: 301 | return [] 302 | if query: 303 | if '*' in query: 304 | query = query.replace('*', '%') 305 | else: 306 | query = query + '%' 307 | sql = 'SELECT key_text FROM MDX_INDEX WHERE key_text LIKE \"' + query + '\"' 308 | else: 309 | sql = 'SELECT key_text FROM MDX_INDEX' 310 | with sqlite3.connect(db) as conn: 311 | cursor = conn.execute(sql) 312 | keys = [item[0] for item in cursor] 313 | return keys 314 | 315 | def get_mdd_keys(self, query=''): 316 | try: 317 | return self.get_keys(self._mdd_db, query) 318 | except: 319 | return [] 320 | 321 | def get_mdx_keys(self, query=''): 322 | try: 323 | return self.get_keys(self._mdx_db, query) 324 | except: 325 | return [] 326 | 327 | 328 | # mdx_builder = IndexBuilder("oald.mdx") 329 | # text = mdx_builder.mdx_lookup('dedication') 330 | # keys = mdx_builder.get_mdx_keys() 331 | # keys1 = mdx_builder.get_mdx_keys('abstrac') 332 | # keys2 = mdx_builder.get_mdx_keys('*tion') 333 | # for key in keys2: 334 | # text = mdx_builder.mdx_lookup(key)[0] 335 | # pass 336 | -------------------------------------------------------------------------------- /pureSalsa20.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """ 5 | Copyright by https://github.com/zhansliu/writemdict 6 | 7 | pureSalsa20.py -- a pure Python implementation of the Salsa20 cipher, ported to Python 3 8 | 9 | v4.0: Added Python 3 support, dropped support for Python <= 2.5. 10 | 11 | // zhansliu 12 | 13 | Original comments below. 14 | 15 | ==================================================================== 16 | There are comments here by two authors about three pieces of software: 17 | comments by Larry Bugbee about 18 | Salsa20, the stream cipher by Daniel J. Bernstein 19 | (including comments about the speed of the C version) and 20 | pySalsa20, Bugbee's own Python wrapper for salsa20.c 21 | (including some references), and 22 | comments by Steve Witham about 23 | pureSalsa20, Witham's pure Python 2.5 implementation of Salsa20, 24 | which follows pySalsa20's API, and is in this file. 25 | 26 | Salsa20: a Fast Streaming Cipher (comments by Larry Bugbee) 27 | ----------------------------------------------------------- 28 | 29 | Salsa20 is a fast stream cipher written by Daniel Bernstein 30 | that basically uses a hash function and XOR making for fast 31 | encryption. (Decryption uses the same function.) Salsa20 32 | is simple and quick. 33 | 34 | Some Salsa20 parameter values... 35 | design strength 128 bits 36 | key length 128 or 256 bits, exactly 37 | IV, aka nonce 64 bits, always 38 | chunk size must be in multiples of 64 bytes 39 | 40 | Salsa20 has two reduced versions, 8 and 12 rounds each. 41 | 42 | One benchmark (10 MB): 43 | 1.5GHz PPC G4 102/97/89 MB/sec for 8/12/20 rounds 44 | AMD Athlon 2500+ 77/67/53 MB/sec for 8/12/20 rounds 45 | (no I/O and before Python GC kicks in) 46 | 47 | Salsa20 is a Phase 3 finalist in the EU eSTREAM competition 48 | and appears to be one of the fastest ciphers. It is well 49 | documented so I will not attempt any injustice here. Please 50 | see "References" below. 51 | 52 | ...and Salsa20 is "free for any use". 53 | 54 | 55 | pySalsa20: a Python wrapper for Salsa20 (Comments by Larry Bugbee) 56 | ------------------------------------------------------------------ 57 | 58 | pySalsa20.py is a simple ctypes Python wrapper. 
Salsa20 is 59 | as it's name implies, 20 rounds, but there are two reduced 60 | versions, 8 and 12 rounds each. Because the APIs are 61 | identical, pySalsa20 is capable of wrapping all three 62 | versions (number of rounds hardcoded), including a special 63 | version that allows you to set the number of rounds with a 64 | set_rounds() function. Compile the version of your choice 65 | as a shared library (not as a Python extension), name and 66 | install it as libsalsa20.so. 67 | 68 | Sample usage: 69 | from pySalsa20 import Salsa20 70 | s20 = Salsa20(key, IV) 71 | dataout = s20.encryptBytes(datain) # same for decrypt 72 | 73 | This is EXPERIMENTAL software and intended for educational 74 | purposes only. To make experimentation less cumbersome, 75 | pySalsa20 is also free for any use. 76 | 77 | THIS PROGRAM IS PROVIDED WITHOUT WARRANTY OR GUARANTEE OF 78 | ANY KIND. USE AT YOUR OWN RISK. 79 | 80 | Enjoy, 81 | 82 | Larry Bugbee 83 | bugbee@seanet.com 84 | April 2007 85 | 86 | 87 | References: 88 | ----------- 89 | http://en.wikipedia.org/wiki/Salsa20 90 | http://en.wikipedia.org/wiki/Daniel_Bernstein 91 | http://cr.yp.to/djb.html 92 | http://www.ecrypt.eu.org/stream/salsa20p3.html 93 | http://www.ecrypt.eu.org/stream/p3ciphers/salsa20/salsa20_p3source.zip 94 | 95 | 96 | Prerequisites for pySalsa20: 97 | ---------------------------- 98 | - Python 2.5 (haven't tested in 2.4) 99 | 100 | 101 | pureSalsa20: Salsa20 in pure Python 2.5 (comments by Steve Witham) 102 | ------------------------------------------------------------------ 103 | 104 | pureSalsa20 is the stand-alone Python code in this file. 105 | It implements the underlying Salsa20 core algorithm 106 | and emulates pySalsa20's Salsa20 class API (minus a bug(*)). 107 | 108 | pureSalsa20 is MUCH slower than libsalsa20.so wrapped with pySalsa20-- 109 | about 1/1000 the speed for Salsa20/20 and 1/500 the speed for Salsa20/8, 110 | when encrypting 64k-byte blocks on my computer. 111 | 112 | pureSalsa20 is for cases where portability is much more important than 113 | speed. I wrote it for use in a "structured" random number generator. 114 | 115 | There are comments about the reasons for this slowness in 116 | http://www.tiac.net/~sw/2010/02/PureSalsa20 117 | 118 | Sample usage: 119 | from pureSalsa20 import Salsa20 120 | s20 = Salsa20(key, IV) 121 | dataout = s20.encryptBytes(datain) # same for decrypt 122 | 123 | I took the test code from pySalsa20, added a bunch of tests including 124 | rough speed tests, and moved them into the file testSalsa20.py. 125 | To test both pySalsa20 and pureSalsa20, type 126 | python testSalsa20.py 127 | 128 | (*)The bug (?) in pySalsa20 is this. The rounds variable is global to the 129 | libsalsa20.so library and not switched when switching between instances 130 | of the Salsa20 class. 131 | s1 = Salsa20( key, IV, 20 ) 132 | s2 = Salsa20( key, IV, 8 ) 133 | In this example, 134 | with pySalsa20, both s1 and s2 will do 8 rounds of encryption. 135 | with pureSalsa20, s1 will do 20 rounds and s2 will do 8 rounds. 136 | Perhaps giving each instance its own nRounds variable, which 137 | is passed to the salsa20wordtobyte() function, is insecure. I'm not a 138 | cryptographer. 139 | 140 | pureSalsa20.py and testSalsa20.py are EXPERIMENTAL software and 141 | intended for educational purposes only. To make experimentation less 142 | cumbersome, pureSalsa20.py and testSalsa20.py are free for any use. 143 | 144 | Revisions: 145 | ---------- 146 | p3.2 Fixed bug that initialized the output buffer with plaintext! 
147 | Saner ramping of nreps in speed test. 148 | Minor changes and print statements. 149 | p3.1 Took timing variability out of add32() and rot32(). 150 | Made the internals more like pySalsa20/libsalsa . 151 | Put the semicolons back in the main loop! 152 | In encryptBytes(), modify a byte array instead of appending. 153 | Fixed speed calculation bug. 154 | Used subclasses instead of patches in testSalsa20.py . 155 | Added 64k-byte messages to speed test to be fair to pySalsa20. 156 | p3 First version, intended to parallel pySalsa20 version 3. 157 | 158 | More references: 159 | ---------------- 160 | http://www.seanet.com/~bugbee/crypto/salsa20/ [pySalsa20] 161 | http://cr.yp.to/snuffle.html [The original name of Salsa20] 162 | http://cr.yp.to/snuffle/salsafamily-20071225.pdf [ Salsa20 design] 163 | http://www.tiac.net/~sw/2010/02/PureSalsa20 164 | 165 | THIS PROGRAM IS PROVIDED WITHOUT WARRANTY OR GUARANTEE OF 166 | ANY KIND. USE AT YOUR OWN RISK. 167 | 168 | Cheers, 169 | 170 | Steve Witham sw at remove-this tiac dot net 171 | February, 2010 172 | """ 173 | import sys 174 | assert(sys.version_info >= (2, 6)) 175 | 176 | if sys.version_info >= (3,): 177 | integer_types = (int,) 178 | python3 = True 179 | else: 180 | integer_types = (int, long) 181 | python3 = False 182 | 183 | from struct import Struct 184 | little_u64 = Struct( "= 2**64" 238 | ctx = self.ctx 239 | ctx[ 8],ctx[ 9] = little2_i32.unpack( little_u64.pack( counter ) ) 240 | 241 | def getCounter( self ): 242 | return little_u64.unpack( little2_i32.pack( *self.ctx[ 8:10 ] ) ) [0] 243 | 244 | 245 | def setRounds(self, rounds, testing=False ): 246 | assert testing or rounds in [8, 12, 20], 'rounds must be 8, 12, 20' 247 | self.rounds = rounds 248 | 249 | 250 | def encryptBytes(self, data): 251 | assert type(data) == bytes, 'data must be byte string' 252 | assert self._lastChunk64, 'previous chunk not multiple of 64 bytes' 253 | lendata = len(data) 254 | munged = bytearray(lendata) 255 | for i in range( 0, lendata, 64 ): 256 | h = salsa20_wordtobyte( self.ctx, self.rounds, checkRounds=False ) 257 | self.setCounter( ( self.getCounter() + 1 ) % 2**64 ) 258 | # Stopping at 2^70 bytes per nonce is user's responsibility. 259 | for j in range( min( 64, lendata - i ) ): 260 | if python3: 261 | munged[ i+j ] = data[ i+j ] ^ h[j] 262 | else: 263 | munged[ i+j ] = ord(data[ i+j ]) ^ ord(h[j]) 264 | 265 | self._lastChunk64 = not lendata % 64 266 | return bytes(munged) 267 | 268 | decryptBytes = encryptBytes # encrypt and decrypt use same function 269 | 270 | #-------------------------------------------------------------------------- 271 | 272 | def salsa20_wordtobyte( input, nRounds=20, checkRounds=True ): 273 | """ Do nRounds Salsa20 rounds on a copy of 274 | input: list or tuple of 16 ints treated as little-endian unsigneds. 275 | Returns a 64-byte string. 276 | """ 277 | 278 | assert( type(input) in ( list, tuple ) and len(input) == 16 ) 279 | assert( not(checkRounds) or ( nRounds in [ 8, 12, 20 ] ) ) 280 | 281 | x = list( input ) 282 | 283 | def XOR( a, b ): return a ^ b 284 | ROTATE = rot32 285 | PLUS = add32 286 | 287 | for i in range( nRounds // 2 ): 288 | # These ...XOR...ROTATE...PLUS... 
lines are from ecrypt-linux.c 289 | # unchanged except for indents and the blank line between rounds: 290 | x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 0],x[12]), 7)); 291 | x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[ 4],x[ 0]), 9)); 292 | x[12] = XOR(x[12],ROTATE(PLUS(x[ 8],x[ 4]),13)); 293 | x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[12],x[ 8]),18)); 294 | x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 5],x[ 1]), 7)); 295 | x[13] = XOR(x[13],ROTATE(PLUS(x[ 9],x[ 5]), 9)); 296 | x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[13],x[ 9]),13)); 297 | x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 1],x[13]),18)); 298 | x[14] = XOR(x[14],ROTATE(PLUS(x[10],x[ 6]), 7)); 299 | x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[14],x[10]), 9)); 300 | x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 2],x[14]),13)); 301 | x[10] = XOR(x[10],ROTATE(PLUS(x[ 6],x[ 2]),18)); 302 | x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[15],x[11]), 7)); 303 | x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 3],x[15]), 9)); 304 | x[11] = XOR(x[11],ROTATE(PLUS(x[ 7],x[ 3]),13)); 305 | x[15] = XOR(x[15],ROTATE(PLUS(x[11],x[ 7]),18)); 306 | 307 | x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[ 0],x[ 3]), 7)); 308 | x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[ 1],x[ 0]), 9)); 309 | x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[ 2],x[ 1]),13)); 310 | x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[ 3],x[ 2]),18)); 311 | x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 5],x[ 4]), 7)); 312 | x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 6],x[ 5]), 9)); 313 | x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 7],x[ 6]),13)); 314 | x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 4],x[ 7]),18)); 315 | x[11] = XOR(x[11],ROTATE(PLUS(x[10],x[ 9]), 7)); 316 | x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[11],x[10]), 9)); 317 | x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 8],x[11]),13)); 318 | x[10] = XOR(x[10],ROTATE(PLUS(x[ 9],x[ 8]),18)); 319 | x[12] = XOR(x[12],ROTATE(PLUS(x[15],x[14]), 7)); 320 | x[13] = XOR(x[13],ROTATE(PLUS(x[12],x[15]), 9)); 321 | x[14] = XOR(x[14],ROTATE(PLUS(x[13],x[12]),13)); 322 | x[15] = XOR(x[15],ROTATE(PLUS(x[14],x[13]),18)); 323 | 324 | for i in range( len( input ) ): 325 | x[i] = PLUS( x[i], input[i] ) 326 | return little16_i32.pack( *x ) 327 | 328 | #--------------------------- 32-bit ops ------------------------------- 329 | 330 | def trunc32( w ): 331 | """ Return the bottom 32 bits of w as a Python int. 332 | This creates longs temporarily, but returns an int. """ 333 | w = int( ( w & 0x7fffFFFF ) | -( w & 0x80000000 ) ) 334 | assert type(w) == int 335 | return w 336 | 337 | 338 | def add32( a, b ): 339 | """ Add two 32-bit words discarding carry above 32nd bit, 340 | and without creating a Python long. 341 | Timing shouldn't vary. 342 | """ 343 | lo = ( a & 0xFFFF ) + ( b & 0xFFFF ) 344 | hi = ( a >> 16 ) + ( b >> 16 ) + ( lo >> 16 ) 345 | return ( -(hi & 0x8000) | ( hi & 0x7FFF ) ) << 16 | ( lo & 0xFFFF ) 346 | 347 | 348 | def rot32( w, nLeft ): 349 | """ Rotate 32-bit word left by nLeft or right by -nLeft 350 | without creating a Python long. 351 | Timing depends on nLeft but not on w. 352 | """ 353 | nLeft &= 31 # which makes nLeft >= 0 354 | if nLeft == 0: 355 | return w 356 | 357 | # Note: now 1 <= nLeft <= 31. 358 | # RRRsLLLLLL There are nLeft RRR's, (31-nLeft) LLLLLL's, 359 | # => sLLLLLLRRR and one s which becomes the sign bit. 
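# Worked example of the decomposition above (value chosen for illustration):
#   rot32(1, 31) == -2147483648 == trunc32(0x80000000), i.e. the low
#   bit rotates into the sign position of the signed 32-bit result.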
360 | RRR = ( ( ( w >> 1 ) & 0x7fffFFFF ) >> ( 31 - nLeft ) ) 361 | sLLLLLL = -( (1<<(31-nLeft)) & w ) | (0x7fffFFFF>>nLeft) & w 362 | return RRR | ( sLLLLLL << nLeft ) 363 | 364 | 365 | # --------------------------------- end ----------------------------------- 366 | -------------------------------------------------------------------------------- /readmdict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # readmdict.py 4 | # Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser 5 | # 6 | # Copyright (C) 2012, 2013, 2015 Xiaoqiang Wang 7 | # 8 | # This program is a free software; you can redistribute it and/or modify 9 | # it under the terms of the GNU General Public License as published by 10 | # the Free Software Foundation, version 3 of the License. 11 | # 12 | # You can get a copy of GNU General Public License along this program 13 | # But you can always get it from http://www.gnu.org/licenses/gpl.txt 14 | # 15 | # This program is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | # GNU General Public License for more details. 19 | 20 | from struct import pack, unpack 21 | from io import BytesIO 22 | import re 23 | import sys 24 | import json 25 | 26 | from ripemd128 import ripemd128 27 | from pureSalsa20 import Salsa20 28 | #from aqt.utils import showInfo, showText, tooltip 29 | 30 | # zlib compression is used for engine version >=2.0 31 | import zlib 32 | # LZO compression is used for engine version < 2.0 33 | try: 34 | import lzo 35 | except ImportError: 36 | lzo = None 37 | #print("LZO compression support is not available") 38 | 39 | # 2x3 compatible 40 | if sys.hexversion >= 0x03000000: 41 | unicode = str 42 | 43 | 44 | def _unescape_entities(text): 45 | """ 46 | unescape offending tags < > " & 47 | """ 48 | text = text.replace(b'<', b'<') 49 | text = text.replace(b'>', b'>') 50 | text = text.replace(b'"', b'"') 51 | text = text.replace(b'&', b'&') 52 | return text 53 | 54 | 55 | def _fast_decrypt(data, key): 56 | b = bytearray(data) 57 | key = bytearray(key) 58 | previous = 0x36 59 | for i in range(len(b)): 60 | t = (b[i] >> 4 | b[i] << 4) & 0xff 61 | t = t ^ previous ^ (i & 0xff) ^ key[i % len(key)] 62 | previous = b[i] 63 | b[i] = t 64 | return bytes(b) 65 | 66 | 67 | def _mdx_decrypt(comp_block): 68 | key = ripemd128(comp_block[4:8] + pack(b' 129 | """ 130 | taglist = re.findall(b'(\w+)="(.*?)"', header, re.DOTALL) 131 | tagdict = {} 132 | for key, value in taglist: 133 | tagdict[key] = _unescape_entities(value) 134 | return tagdict 135 | 136 | def _decode_key_block_info(self, key_block_info_compressed): 137 | if self._version >= 2: 138 | # zlib compression 139 | assert(key_block_info_compressed[:4] == b'\x02\x00\x00\x00') 140 | # decrypt if needed 141 | if self._encrypt & 0x02: 142 | key_block_info_compressed = _mdx_decrypt( 143 | key_block_info_compressed) 144 | # decompress 145 | key_block_info = zlib.decompress(key_block_info_compressed[8:]) 146 | # adler checksum 147 | adler32 = unpack('>I', key_block_info_compressed[4:8])[0] 148 | assert(adler32 == zlib.adler32(key_block_info) & 0xffffffff) 149 | else: 150 | # no compression 151 | key_block_info = key_block_info_compressed 152 | # decode 153 | key_block_info_list = [] 154 | num_entries = 0 155 | i = 0 156 | if self._version >= 2: 157 | byte_format = '>H' 158 | byte_width = 
2 159 | text_term = 1 160 | else: 161 | byte_format = '>B' 162 | byte_width = 1 163 | text_term = 0 164 | 165 | while i < len(key_block_info): 166 | # number of entries in current key block 167 | num_entries += unpack(self._number_format, 168 | key_block_info[i:i + self._number_width])[0] 169 | i += self._number_width 170 | # text head size 171 | text_head_size = unpack(byte_format, key_block_info[ 172 | i:i + byte_width])[0] 173 | i += byte_width 174 | # text head 175 | if self._encoding != 'UTF-16': 176 | i += text_head_size + text_term 177 | else: 178 | i += (text_head_size + text_term) * 2 179 | # text tail size 180 | text_tail_size = unpack(byte_format, key_block_info[ 181 | i:i + byte_width])[0] 182 | i += byte_width 183 | # text tail 184 | if self._encoding != 'UTF-16': 185 | i += text_tail_size + text_term 186 | else: 187 | i += (text_tail_size + text_term) * 2 188 | # key block compressed size 189 | key_block_compressed_size = unpack(self._number_format, key_block_info[ 190 | i:i + self._number_width])[0] 191 | i += self._number_width 192 | # key block decompressed size 193 | key_block_decompressed_size = unpack(self._number_format, key_block_info[ 194 | i:i + self._number_width])[0] 195 | i += self._number_width 196 | key_block_info_list += [(key_block_compressed_size, 197 | key_block_decompressed_size)] 198 | 199 | assert(num_entries == self._num_entries) 200 | 201 | return key_block_info_list 202 | 203 | def _decode_key_block(self, key_block_compressed, key_block_info_list): 204 | key_list = [] 205 | i = 0 206 | for compressed_size, decompressed_size in key_block_info_list: 207 | start = i 208 | end = i + compressed_size 209 | # 4 bytes : compression type 210 | key_block_type = key_block_compressed[start:start + 4] 211 | # 4 bytes : adler checksum of decompressed key block 212 | adler32 = unpack('>I', key_block_compressed[ 213 | start + 4:start + 8])[0] 214 | if key_block_type == b'\x00\x00\x00\x00': 215 | key_block = key_block_compressed[start + 8:end] 216 | elif key_block_type == b'\x01\x00\x00\x00': 217 | if lzo is None: 218 | print("LZO compression is not supported") 219 | break 220 | # decompress key block 221 | header = b'\xf0' + pack('>I', decompressed_size) 222 | key_block = lzo.decompress(key_block_compressed[ 223 | start + 8:end], initSize=decompressed_size, blockSize=1308672) 224 | elif key_block_type == b'\x02\x00\x00\x00': 225 | # decompress key block 226 | key_block = zlib.decompress( 227 | key_block_compressed[start + 8:end]) 228 | # extract one single key block into a key list 229 | key_list += self._split_key_block(key_block) 230 | # notice that adler32 returns signed value 231 | assert(adler32 == zlib.adler32(key_block) & 0xffffffff) 232 | 233 | i += compressed_size 234 | return key_list 235 | 236 | def _split_key_block(self, key_block): 237 | key_list = [] 238 | key_start_index = 0 239 | while key_start_index < len(key_block): 240 | temp = key_block[ 241 | key_start_index:key_start_index + self._number_width] 242 | # the corresponding record's offset in record block 243 | key_id = unpack(self._number_format, key_block[ 244 | key_start_index:key_start_index + self._number_width])[0] 245 | # key text ends with '\x00' 246 | if self._encoding == 'UTF-16': 247 | delimiter = b'\x00\x00' 248 | width = 2 249 | else: 250 | delimiter = b'\x00' 251 | width = 1 252 | i = key_start_index + self._number_width 253 | while i < len(key_block): 254 | if key_block[i:i + width] == delimiter: 255 | key_end_index = i 256 | break 257 | i += width 258 | key_text = 
key_block[key_start_index + self._number_width:key_end_index]\ 259 | .decode(self._encoding, errors='ignore').encode('utf-8').strip() 260 | key_start_index = key_end_index + width 261 | key_list += [(key_id, key_text)] 262 | return key_list 263 | 264 | @property 265 | def meta(self): 266 | return {'title': self._title, 'description': self._description, 267 | 'encoding': self._encoding, 'version': self._version, 268 | 'stylesheet': json.dumps(self._stylesheet)} 269 | 270 | def _read_header(self): 271 | f = open(self._fname, 'rb') 272 | # number of bytes of header text 273 | header_bytes_size = unpack('>I', f.read(4))[0] 274 | header_bytes = f.read(header_bytes_size) 275 | # 4 bytes: adler32 checksum of header, in little endian 276 | adler32 = unpack('= 0x03000000: 288 | encoding = encoding.decode('utf-8') 289 | # GB18030 > GBK > GB2312 290 | if encoding in ['GBK', 'GB2312']: 291 | encoding = 'GB18030' 292 | self._encoding = encoding 293 | # 读取标题和描述 294 | if b'Title' in header_tag: 295 | self._title = header_tag[b'Title'].decode('utf-8') 296 | else: 297 | self._title = '' 298 | 299 | if b'Description' in header_tag: 300 | self._description = header_tag[b'Description'].decode('utf-8') 301 | else: 302 | self._description = '' 303 | pass 304 | # encryption flag 305 | # 0x00 - no encryption 306 | # 0x01 - encrypt record block 307 | # 0x02 - encrypt key info block 308 | if b'Encrypted' not in header_tag or header_tag[b'Encrypted'] == b'No': 309 | self._encrypt = 0 310 | elif header_tag[b'Encrypted'] == b'Yes': 311 | self._encrypt = 1 312 | else: 313 | self._encrypt = int(header_tag[b'Encrypted']) 314 | 315 | # stylesheet attribute if present takes form of: 316 | # style_number # 1-255 317 | # style_begin # or '' 318 | # style_end # or '' 319 | # store stylesheet in dict in the form of 320 | # {'number' : ('style_begin', 'style_end')} 321 | self._stylesheet = {} 322 | if header_tag.get('StyleSheet'): 323 | lines = header_tag['StyleSheet'].splitlines() 324 | for i in range(0, len(lines), 3): 325 | self._stylesheet[lines[i]] = (lines[i + 1], lines[i + 2]) 326 | 327 | # before version 2.0, number is 4 bytes integer 328 | # version 2.0 and above uses 8 bytes 329 | self._version = float(header_tag[b'GeneratedByEngineVersion']) 330 | if self._version < 2.0: 331 | self._number_width = 4 332 | self._number_format = '>I' 333 | else: 334 | self._number_width = 8 335 | self._number_format = '>Q' 336 | 337 | return header_tag 338 | 339 | def _read_keys(self): 340 | f = open(self._fname, 'rb') 341 | f.seek(self._key_block_offset) 342 | 343 | # the following numbers could be encrypted 344 | if self._version >= 2.0: 345 | num_bytes = 8 * 5 346 | else: 347 | num_bytes = 4 * 4 348 | block = f.read(num_bytes) 349 | 350 | if self._encrypt & 1: 351 | if self._passcode is None: 352 | raise RuntimeError( 353 | 'user identification is needed to read encrypted file') 354 | regcode, userid = self._passcode 355 | if isinstance(userid, unicode): 356 | userid = userid.encode('utf8') 357 | if self.header[b'RegisterBy'] == b'EMail': 358 | encrypted_key = _decrypt_regcode_by_email(regcode, userid) 359 | else: 360 | encrypted_key = _decrypt_regcode_by_deviceid(regcode, userid) 361 | block = _salsa_decrypt(block, encrypted_key) 362 | 363 | # decode this block 364 | sf = BytesIO(block) 365 | # number of key blocks 366 | num_key_blocks = self._read_number(sf) 367 | # number of entries 368 | self._num_entries = self._read_number(sf) 369 | # number of bytes of key block info after decompression 370 | if self._version >= 2.0: 371 | 
key_block_info_decomp_size = self._read_number(sf) 372 | # number of bytes of key block info 373 | key_block_info_size = self._read_number(sf) 374 | # number of bytes of key block 375 | key_block_size = self._read_number(sf) 376 | 377 | # 4 bytes: adler checksum of previous 5 numbers 378 | if self._version >= 2.0: 379 | adler32 = unpack('>I', f.read(4))[0] 380 | assert adler32 == (zlib.adler32(block) & 0xffffffff) 381 | 382 | # read key block info, which indicates key block's compressed and 383 | # decompressed size 384 | key_block_info = f.read(key_block_info_size) 385 | key_block_info_list = self._decode_key_block_info(key_block_info) 386 | assert(num_key_blocks == len(key_block_info_list)) 387 | 388 | # read key block 389 | key_block_compressed = f.read(key_block_size) 390 | # extract key block 391 | key_list = self._decode_key_block( 392 | key_block_compressed, key_block_info_list) 393 | 394 | self._record_block_offset = f.tell() 395 | f.close() 396 | 397 | return key_list 398 | 399 | def _read_keys_brutal(self): 400 | f = open(self._fname, 'rb') 401 | f.seek(self._key_block_offset) 402 | 403 | # the following numbers could be encrypted, disregard them! 404 | if self._version >= 2.0: 405 | num_bytes = 8 * 5 + 4 406 | key_block_type = b'\x02\x00\x00\x00' 407 | else: 408 | num_bytes = 4 * 4 409 | key_block_type = b'\x01\x00\x00\x00' 410 | block = f.read(num_bytes) 411 | 412 | # key block info 413 | # 4 bytes '\x02\x00\x00\x00' 414 | # 4 bytes adler32 checksum 415 | # unknown number of bytes follows until '\x02\x00\x00\x00' which marks 416 | # the beginning of key block 417 | key_block_info = f.read(8) 418 | if self._version >= 2.0: 419 | assert key_block_info[:4] == b'\x02\x00\x00\x00' 420 | while True: 421 | fpos = f.tell() 422 | t = f.read(1024) 423 | index = t.find(key_block_type) 424 | if index != -1: 425 | key_block_info += t[:index] 426 | f.seek(fpos + index) 427 | break 428 | else: 429 | key_block_info += t 430 | 431 | key_block_info_list = self._decode_key_block_info(key_block_info) 432 | key_block_size = sum(list(zip(*key_block_info_list))[0]) 433 | 434 | # read key block 435 | key_block_compressed = f.read(key_block_size) 436 | # extract key block 437 | key_list = self._decode_key_block( 438 | key_block_compressed, key_block_info_list) 439 | 440 | self._record_block_offset = f.tell() 441 | f.close() 442 | 443 | self._num_entries = len(key_list) 444 | return key_list 445 | 446 | 447 | class MDD(MDict): 448 | """ 449 | MDict resource file format (*.MDD) reader. 450 | >>> mdd = MDD('example.mdd') 451 | >>> len(mdd) 452 | 208 453 | >>> for filename,content in mdd.items(): 454 | ... 
print filename, content[:10] 455 | """ 456 | 457 | def __init__(self, fname, passcode=None): 458 | MDict.__init__(self, fname, encoding='UTF-16', passcode=passcode) 459 | 460 | def items(self): 461 | """Return a generator which in turn produce tuples in the form of (filename, content) 462 | """ 463 | return self._decode_record_block() 464 | 465 | def _decode_record_block(self): 466 | f = open(self._fname, 'rb') 467 | f.seek(self._record_block_offset) 468 | 469 | num_record_blocks = self._read_number(f) 470 | num_entries = self._read_number(f) 471 | assert(num_entries == self._num_entries) 472 | record_block_info_size = self._read_number(f) 473 | record_block_size = self._read_number(f) 474 | 475 | # record block info section 476 | record_block_info_list = [] 477 | size_counter = 0 478 | for i in range(num_record_blocks): 479 | compressed_size = self._read_number(f) 480 | decompressed_size = self._read_number(f) 481 | record_block_info_list += [(compressed_size, decompressed_size)] 482 | size_counter += self._number_width * 2 483 | assert(size_counter == record_block_info_size) 484 | 485 | # actual record block 486 | offset = 0 487 | i = 0 488 | size_counter = 0 489 | for compressed_size, decompressed_size in record_block_info_list: 490 | record_block_compressed = f.read(compressed_size) 491 | # 4 bytes: compression type 492 | record_block_type = record_block_compressed[:4] 493 | # 4 bytes: adler32 checksum of decompressed record block 494 | adler32 = unpack('>I', record_block_compressed[4:8])[0] 495 | if record_block_type == b'\x00\x00\x00\x00': 496 | record_block = record_block_compressed[8:] 497 | elif record_block_type == b'\x01\x00\x00\x00': 498 | if lzo is None: 499 | print("LZO compression is not supported") 500 | break 501 | # decompress 502 | header = b'\xf0' + pack('>I', decompressed_size) 503 | record_block = lzo.decompress(record_block_compressed[ 504 | start + 8:end], initSize=decompressed_size, blockSize=1308672) 505 | elif record_block_type == b'\x02\x00\x00\x00': 506 | # decompress 507 | record_block = zlib.decompress(record_block_compressed[8:]) 508 | 509 | # notice that adler32 return signed value 510 | assert(adler32 == zlib.adler32(record_block) & 0xffffffff) 511 | 512 | assert(len(record_block) == decompressed_size) 513 | # split record block according to the offset info from key block 514 | while i < len(self._key_list): 515 | record_start, key_text = self._key_list[i] 516 | # reach the end of current record block 517 | if record_start - offset >= len(record_block): 518 | break 519 | # record end index 520 | if i < len(self._key_list) - 1: 521 | record_end = self._key_list[i + 1][0] 522 | else: 523 | record_end = len(record_block) + offset 524 | i += 1 525 | data = record_block[record_start - offset:record_end - offset] 526 | yield key_text, data 527 | offset += len(record_block) 528 | size_counter += compressed_size 529 | assert(size_counter == record_block_size) 530 | 531 | f.close() 532 | 533 | # 获取 mdx 文件的索引列表,格式为 534 | # key_text(关键词,可以由后面的 keylist 得到) 535 | # file_pos(record_block开始的位置) 536 | # compressed_size(record_block压缩前的大小) 537 | # decompressed_size(解压后的大小) 538 | # record_block_type(record_block 的压缩类型) 539 | # record_start (以下三个为从 record_block 中提取某一调记录需要的参数,可以直接保存) 540 | # record_end 541 | # offset 542 | def get_index(self, check_block=True): 543 | f = open(self._fname, 'rb') 544 | index_dict_list = [] 545 | f.seek(self._record_block_offset) 546 | 547 | num_record_blocks = self._read_number(f) 548 | num_entries = self._read_number(f) 549 | 
633 | 
634 | class MDX(MDict):
635 |     """
636 |     MDict dictionary file format (*.MDX) reader.
637 |     >>> mdx = MDX('example.mdx')
638 |     >>> len(mdx)
639 |     42481
640 |     >>> for key, value in mdx.items():
641 |     ...     print(key, value[:10])
642 |     """
643 | 
644 |     def __init__(self, fname, encoding='', substyle=False, passcode=None, only_header=False):
645 |         MDict.__init__(self, fname, encoding, passcode, only_header)
646 |         self._substyle = substyle
647 | 
648 |     def items(self):
649 |         """Return a generator which in turn produces tuples in the form of (key, value)
650 |         """
651 |         return self._decode_record_block()
652 | 
653 |     def _substitute_stylesheet(self, txt):
654 |         # substitute stylesheet definitions
655 |         txt_list = re.split(r'`\d+`', txt)
656 |         txt_tag = re.findall(r'`\d+`', txt)
657 |         txt_styled = txt_list[0]
658 |         for j, p in enumerate(txt_list[1:]):
659 |             style = self._stylesheet[txt_tag[j][1:-1]]
660 |             if p and p[-1] == '\n':
661 |                 txt_styled = txt_styled + \
662 |                     style[0] + p.rstrip() + style[1] + '\r\n'
663 |             else:
664 |                 txt_styled = txt_styled + style[0] + p + style[1]
665 |         return txt_styled
666 | 
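    # A worked example of the substitution above, assuming the MDX header
    # carried a stylesheet entry {'1': ('<b>', '</b>')}: the stored text
    # '`1`headword' splits into the tag list ['`1`'] and the piece list
    # ['', 'headword'], so the method returns '<b>headword</b>'.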
667 |     def _decode_record_block(self):
668 |         f = open(self._fname, 'rb')
669 |         f.seek(self._record_block_offset)
670 | 
671 |         num_record_blocks = self._read_number(f)
672 |         num_entries = self._read_number(f)
673 |         assert(num_entries == self._num_entries)
674 |         record_block_info_size = self._read_number(f)
675 |         record_block_size = self._read_number(f)
676 | 
677 |         # record block info section
678 |         record_block_info_list = []
679 |         size_counter = 0
680 |         for i in range(num_record_blocks):
681 |             compressed_size = self._read_number(f)
682 |             decompressed_size = self._read_number(f)
683 |             record_block_info_list += [(compressed_size, decompressed_size)]
684 |             size_counter += self._number_width * 2
685 |         assert(size_counter == record_block_info_size)
686 | 
687 |         # actual record block data
688 |         offset = 0
689 |         i = 0
690 |         size_counter = 0
691 |         # the final index table entries have the form:
692 |         # key_text (the keyword; also obtainable from the key list)
693 |         # file_pos (position in the file where the record_block starts)
694 |         # compressed_size (size of the record_block before decompression)
695 |         # decompressed_size (size after decompression)
696 |         # record_block_type (compression type of the record_block)
697 |         # record_start (this and the next two are the parameters needed to
698 |         # record_end    extract a single record from the record_block and
699 |         # offset        can be stored directly)
700 |         for compressed_size, decompressed_size in record_block_info_list:
701 |             record_block_compressed = f.read(compressed_size)
702 |             # to get record_block_compressed we need compressed_size (which can
703 |             # be recorded directly) plus the current position of the file object f,
704 |             # obtained with f.tell(); rebuilding from the index then only needs f.seek()
705 |             # 4 bytes indicate the block compression type
706 |             record_block_type = record_block_compressed[:4]
707 |             # 4 bytes: adler checksum of the uncompressed content
708 |             adler32 = unpack('>I', record_block_compressed[4:8])[0]
709 |             # no compression
710 |             if record_block_type == b'\x00\x00\x00\x00':
711 |                 record_block = record_block_compressed[8:]
712 |             # lzo compression
713 |             elif record_block_type == b'\x01\x00\x00\x00':
714 |                 if lzo is None:
715 |                     print("LZO compression is not supported")
716 |                     break
717 |                 # decompress
718 |                 header = b'\xf0' + pack('>I', decompressed_size)
719 |                 record_block = lzo.decompress(record_block_compressed[
720 |                     8:], initSize=decompressed_size, blockSize=1308672)
721 |             # zlib compression
722 |             elif record_block_type == b'\x02\x00\x00\x00':
723 |                 # decompress
724 |                 record_block = zlib.decompress(record_block_compressed[8:])
725 |             # the key point here is to obtain record_block first; it comes from
726 |             # decompression, and there are three decompression methods in total.
727 |             # The information needed is record_block_compressed, decompressed_size
728 |             # and record_block_type, plus the adler32 checksum for verification.
729 |             # notice that adler32 returns a signed value
730 |             assert(adler32 == zlib.adler32(record_block) & 0xffffffff)
731 | 
732 |             assert(len(record_block) == decompressed_size)
733 |             # split record block according to the offset info from key block
734 |             while i < len(self._key_list):
735 |                 record_start, key_text = self._key_list[i]
736 |                 # reached the end of the current record block
737 |                 if record_start - offset >= len(record_block):
738 |                     break
739 |                 # record end index
740 |                 if i < len(self._key_list) - 1:
741 |                     record_end = self._key_list[i + 1][0]
742 |                 else:
743 |                     record_end = len(record_block) + offset
744 |                 i += 1
745 |                 # we need record_block, record_start, record_end,
746 |                 # and offset
747 |                 record = record_block[
748 |                     record_start - offset:record_end - offset]
749 |                 # decode before substitution (the stylesheet code works on str)
750 |                 record = record.decode(
751 |                     self._encoding, errors='ignore').strip(u'\x00')
752 |                 # substitute styles
753 |                 # (i.e. whether to apply the stylesheet replacements)
754 |                 if self._substyle and self._stylesheet:
755 |                     record = self._substitute_stylesheet(record)
756 |                 record = record.encode('utf-8')
757 |                 yield key_text, record
758 |             offset += len(record_block)
759 |             size_counter += compressed_size
760 |         assert(size_counter == record_block_size)
761 | 
762 |         f.close()
763 | 
764 |     # Returns the index list of the mdx file, with entries of the form:
765 |     # key_text (the keyword; also obtainable from the key list)
766 |     # file_pos (position in the file where the record_block starts)
767 |     # compressed_size (size of the record_block before decompression)
768 |     # decompressed_size (size after decompression)
769 |     # record_block_type (compression type of the record_block)
770 |     # record_start (this and the next two are the parameters needed to
771 |     # record_end    extract a single record from the record_block and
772 |     # offset        can be stored directly)
773 |     # i.e. the metadata needed for later lookups; see the lookup sketch below
774 |     ###
775 |     def get_index(self, check_block=True):
776 |         # the index list
777 |         index_dict_list = []
778 |         f = open(self._fname, 'rb')
779 |         f.seek(self._record_block_offset)
780 | 
781 |         num_record_blocks = self._read_number(f)
782 |         num_entries = self._read_number(f)
783 |         assert(num_entries == self._num_entries)
784 |         record_block_info_size = self._read_number(f)
785 |         record_block_size = self._read_number(f)
786 | 
787 |         # record block info section
788 |         record_block_info_list = []
789 |         size_counter = 0
790 |         for i in range(num_record_blocks):
791 |             compressed_size = self._read_number(f)
792 |             decompressed_size = self._read_number(f)
793 |             record_block_info_list += [(compressed_size, decompressed_size)]
794 |             size_counter += self._number_width * 2
795 |         assert(size_counter == record_block_info_size)
796 | 
797 |         # actual record block data
798 |         offset = 0
799 |         i = 0
800 |         size_counter = 0
801 |         # the final index table entries have the form:
802 |         # key_text (the keyword; also obtainable from the key list)
803 |         # file_pos (position in the file where the record_block starts)
804 |         # compressed_size (size of the record_block before decompression)
805 |         # decompressed_size (size after decompression)
806 |         # record_block_type (compression type of the record_block)
807 |         # record_start (this and the next two are the parameters needed to
808 |         # record_end    extract a single record from the record_block and
809 |         # offset        can be stored directly)
810 |         for compressed_size, decompressed_size in record_block_info_list:
811 |             current_pos = f.tell()
812 |             record_block_compressed = f.read(compressed_size)
813 |             # to get record_block_compressed we need compressed_size (which can
814 |             # be recorded directly) plus the current position of the file object f,
815 |             # obtained with f.tell(); rebuilding from the index then only needs f.seek()
816 |             # 4 bytes indicate the block compression type
817 |             record_block_type = record_block_compressed[:4]
818 |             # 4 bytes: adler checksum of the uncompressed content
819 |             adler32 = unpack('>I', record_block_compressed[4:8])[0]
820 |             # no compression
821 |             if record_block_type == b'\x00\x00\x00\x00':
822 |                 _type = 0
823 |                 record_block = record_block_compressed[8:]
824 |             # lzo compression
825 |             elif record_block_type == b'\x01\x00\x00\x00':
826 |                 _type = 1
827 |                 if lzo is None:
828 |                     print("LZO compression is not supported")
829 |                     break
830 |                 # decompress
831 |                 header = b'\xf0' + pack('>I', decompressed_size)
832 |                 if check_block:
833 |                     record_block = lzo.decompress(record_block_compressed[
834 |                         8:], initSize=decompressed_size, blockSize=1308672)
835 |             # zlib compression
836 |             elif record_block_type == b'\x02\x00\x00\x00':
837 |                 # decompress
838 |                 _type = 2
839 |                 if check_block:
840 |                     record_block = zlib.decompress(record_block_compressed[8:])
841 |             # the key point here is to obtain record_block first; it comes from
842 |             # decompression, and there are three decompression methods in total.
843 |             # The information needed is record_block_compressed, decompressed_size
844 |             # and record_block_type, plus the adler32 checksum for verification.
845 |             # notice that adler32 returns a signed value
846 |             if check_block:
847 |                 assert(adler32 == zlib.adler32(record_block) & 0xffffffff)
848 |                 assert(len(record_block) == decompressed_size)
849 |             # split record block according to the offset info from key block
850 |             while i < len(self._key_list):
851 |                 # empty dict used to hold the index info
852 |                 index_dict = {}
853 |                 index_dict['file_pos'] = current_pos
854 |                 index_dict['compressed_size'] = compressed_size
855 |                 index_dict['decompressed_size'] = decompressed_size
856 |                 index_dict['record_block_type'] = _type
857 |                 record_start, key_text = self._key_list[i]
858 |                 index_dict['record_start'] = record_start
859 |                 index_dict['key_text'] = key_text.decode(
860 |                     'utf-8', errors='ignore')
861 |                 index_dict['offset'] = offset
862 |                 # reached the end of the current record block
863 |                 if record_start - offset >= decompressed_size:
864 |                     break
865 |                 # record end index
866 |                 if i < len(self._key_list) - 1:
867 |                     record_end = self._key_list[i + 1][0]
868 |                 else:
869 |                     record_end = decompressed_size + offset
870 |                 index_dict['record_end'] = record_end
871 |                 i += 1
872 |                 # we need record_block, record_start, record_end,
873 |                 # and offset
874 |                 if check_block:
875 |                     record = record_block[
876 |                         record_start - offset:record_end - offset]
877 |                     # decode before substitution (the stylesheet code works on str)
878 |                     record = record.decode(
879 |                         self._encoding, errors='ignore').strip(u'\x00')
880 |                     # substitute styles
881 |                     # (i.e. whether to apply the stylesheet replacements)
882 |                     if self._substyle and self._stylesheet:
883 |                         record = self._substitute_stylesheet(record)
884 |                     record = record.encode('utf-8')
885 |                 index_dict_list.append(index_dict)
886 |             offset += decompressed_size
887 |             size_counter += compressed_size
888 |             # todo: careful here!!!
889 |             # assert(size_counter == record_block_size)
890 |         f.close()
891 |         return index_dict_list
892 | 
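# A minimal end-to-end sketch, assuming the read_record_from_index() helper
# outlined after the MDD class above: build the index once, then fetch and
# decode one definition. lookup_sketch is an illustrative name; it reads the
# private _encoding attribute purely for demonstration.
def lookup_sketch(mdx_path, word):
    mdx = MDX(mdx_path)
    for entry in mdx.get_index(check_block=False):
        if entry['key_text'] == word:
            raw = read_record_from_index(mdx_path, entry)
            return raw.decode(mdx._encoding, errors='ignore').strip(u'\x00')
    return None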
893 | if __name__ == '__main__':
894 |     import sys
895 |     import os
896 |     import os.path
897 |     import argparse
898 |     import codecs
899 | 
900 |     def passcode(s):
901 |         try:
902 |             regcode, userid = s.split(',')
903 |         except ValueError:
904 |             raise argparse.ArgumentTypeError("Passcode must be regcode,userid")
905 |         try:
906 |             regcode = codecs.decode(regcode, 'hex')
907 |         except ValueError:
908 |             raise argparse.ArgumentTypeError(
909 |                 "regcode must be a 32 bytes hexadecimal string")
910 |         return regcode, userid
911 | 
912 |     parser = argparse.ArgumentParser()
913 |     parser.add_argument('-x', '--extract', action="store_true",
914 |                         help='extract mdx to source format and extract files from mdd')
915 |     parser.add_argument('-s', '--substyle', action="store_true",
916 |                         help='substitute style definition if present')
917 |     parser.add_argument('-d', '--datafolder', default="data",
918 |                         help='folder to extract data files from mdd')
919 |     parser.add_argument('-e', '--encoding', default="",
920 |                         help='override the encoding declared in the mdx header')
921 |     parser.add_argument('-p', '--passcode', default=None, type=passcode,
922 |                         help='register_code,email_or_deviceid')
923 |     parser.add_argument("filename", nargs='?', help="mdx file name")
924 |     args = parser.parse_args()
925 | 
926 |     # use GUI to select file, default to extract
927 |     if not args.filename:
928 |         import tkinter
929 |         from tkinter import filedialog
930 |         root = tkinter.Tk()
931 |         root.withdraw()
932 |         args.filename = filedialog.askopenfilename(parent=root)
933 |         args.extract = True
934 | 
935 |     if not os.path.exists(args.filename):
936 |         print("Please specify a valid MDX/MDD file")
937 |         sys.exit(1)
938 |     base, ext = os.path.splitext(args.filename)
939 | 
940 |     # read mdx file
941 |     if ext.lower() == os.path.extsep + 'mdx':
942 |         mdx = MDX(args.filename, args.encoding, args.substyle, args.passcode)
943 |         if isinstance(args.filename, bytes):
944 |             bfname = args.filename.decode('utf-8')
945 |         else:
946 |             bfname = args.filename
947 |         print('======== %s ========' % bfname)
948 |         print('  Number of Entries : %d' % len(mdx))
949 |         for key, value in mdx.header.items():
950 |             print('  %s : %s' % (key, value))
951 |     else:
952 |         mdx = None
953 | 
954 |     # find companion mdd file
955 |     mdd_filename = ''.join([base, os.path.extsep, 'mdd'])
956 |     if os.path.exists(mdd_filename):
957 |         mdd = MDD(mdd_filename, args.passcode)
958 |         if isinstance(mdd_filename, bytes):
959 |             bfname = mdd_filename.decode('utf-8')
960 |         else:
961 |             bfname = mdd_filename
962 |         print('======== %s ========' % bfname)
963 |         print('  Number of Entries : %d' % len(mdd))
964 |         for key, value in mdd.header.items():
965 |             print('  %s : %s' % (key, value))
966 |     else:
967 |         mdd = None
968 | 
969 |     if args.extract:
970 |         # write out glossary
971 |         if mdx:
972 |             output_fname = ''.join([base, os.path.extsep, 'txt'])
973 |             tf = open(output_fname, 'wb')
974 |             for key, value in mdx.items():
975 |                 tf.write(key)
976 |                 tf.write(b'\r\n')
977 |                 tf.write(value)
978 |                 if not value.endswith(b'\n'):
979 |                     tf.write(b'\r\n')
980 |                 tf.write(b'\r\n')
981 |             tf.close()
982 |             # write out style
983 |             if mdx.header.get('StyleSheet'):
984 |                 style_fname = ''.join([base, '_style', os.path.extsep, 'txt'])
985 |                 sf = open(style_fname, 'wb')
986 |                 sf.write(b'\r\n'.join(mdx.header['StyleSheet'].splitlines()))
987 |                 sf.close()
988 |         # write out optional data files
989 |         if mdd:
990 |             datafolder = os.path.join(
991 |                 os.path.dirname(args.filename), args.datafolder)
992 |             if not os.path.exists(datafolder):
993 |                 os.makedirs(datafolder)
994 |             for key, value in mdd.items():
995 |                 fname = key.decode('utf-8').replace('\\', os.path.sep)
996 |                 dfname = datafolder + fname
997 |                 if not os.path.exists(os.path.dirname(dfname)):
998 |                     os.makedirs(os.path.dirname(dfname))
999 |                 df = open(dfname, 'wb')
1000 |                 df.write(value)
1001 |                 df.close()
1002 | 
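# A usage sketch for the command line above, assuming an example.mdx with a
# companion example.mdd next to it:
#
#     python readmdict.py -x -d data example.mdx
#
# prints the header fields of both files, writes the glossary source to
# example.txt (plus example_style.txt when a StyleSheet header is present),
# and unpacks the mdd resources into a "data" folder beside the mdx file.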
--------------------------------------------------------------------------------
/ripemd128.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright by https://github.com/zhansliu/writemdict
3 | 
4 | ripemd128.py - A simple ripemd128 library in pure Python.
5 | 
6 | Supports both Python 2 (versions >= 2.6) and Python 3.
7 | 
8 | Usage:
9 |     from ripemd128 import ripemd128
10 |     digest = ripemd128(b"The quick brown fox jumps over the lazy dog")
11 |     assert(digest == b"\x3f\xa9\xb5\x7f\x05\x3c\x05\x3f\xbe\x27\x35\xb2\x38\x0d\xb5\x96")
12 | 
13 | """
14 | 
15 | 
16 | 
17 | import struct
18 | 
19 | 
20 | # follows this description: http://homes.esat.kuleuven.be/~bosselae/ripemd/rmd128.txt
21 | 
22 | def f(j, x, y, z):
23 |     assert(0 <= j and j < 64)
24 |     if j < 16:
25 |         return x ^ y ^ z
26 |     elif j < 32:
27 |         return (x & y) | (z & ~x)
28 |     elif j < 48:
29 |         return (x | (0xffffffff & ~y)) ^ z
30 |     else:
31 |         return (x & z) | (y & ~z)
32 | 
33 | def K(j):
34 |     assert(0 <= j and j < 64)
35 |     if j < 16:
36 |         return 0x00000000
37 |     elif j < 32:
38 |         return 0x5a827999
39 |     elif j < 48:
40 |         return 0x6ed9eba1
41 |     else:
42 |         return 0x8f1bbcdc
43 | 
44 | def Kp(j):
45 |     assert(0 <= j and j < 64)
46 |     if j < 16:
47 |         return 0x50a28be6
48 |     elif j < 32:
49 |         return 0x5c4dd124
50 |     elif j < 48:
51 |         return 0x6d703ef3
52 |     else:
53 |         return 0x00000000
54 | 
55 | def padandsplit(message):
56 |     """
57 |     returns a two-dimensional array X[i][j] of 32-bit integers, where j ranges
58 |     from 0 to 15.
59 |     First pads the message by adding a byte 0x80, and then padding with 0x00
60 |     bytes until the message length in bytes is congruent to 56 (mod 64). Then
61 |     adds the little-endian 64-bit representation of the original length.
62 |     Finally, splits the result up into 64-byte blocks, which are further
63 |     parsed as 32-bit integers.
64 |     """
65 |     origlen = len(message)
66 |     padlength = 64 - ((origlen - 56) % 64) #minimum padding is 1!
67 |     message += b"\x80"
68 |     message += b"\x00" * (padlength - 1)
69 |     message += struct.pack("<Q", origlen * 8)
70 |     # split into 64-byte blocks of sixteen little-endian 32-bit words
71 |     return [
72 |         [struct.unpack("<L", message[i + j:i + j + 4])[0]
73 |          for j in range(0, 64, 4)]
74 |         for i in range(0, len(message), 64)
75 |     ]
76 | 
77 | 
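# A worked example of the padding arithmetic above for an empty message:
#     padlength = 64 - ((0 - 56) % 64) = 64 - 8 = 56
# so the padded message is 1 (0x80) + 55 (0x00) + 8 (length) = 64 bytes,
# i.e. exactly one block of sixteen 32-bit words.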
78 | def add(*args):
79 |     return sum(args) & 0xffffffff
80 | 
81 | 
82 | 
83 | 
84 | def rol(s, x):
85 |     return (x << s | x >> (32 - s)) & 0xffffffff
86 | 
87 | r = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
88 |       7, 4,13, 1,10, 6,15, 3,12, 0, 9, 5, 2,14,11, 8,
89 |       3,10,14, 4, 9,15, 8, 1, 2, 7, 0, 6,13,11, 5,12,
90 |       1, 9,11,10, 0, 8,12, 4,13, 3, 7,15,14, 5, 6, 2]
91 | rp = [ 5,14, 7, 0, 9, 2,11, 4,13, 6,15, 8, 1,10, 3,12,
92 |        6,11, 3, 7, 0,13, 5,10,14,15, 8,12, 4, 9, 1, 2,
93 |       15, 5, 1, 3, 7,14, 6, 9,11, 8,12, 2,10, 0, 4,13,
94 |        8, 6, 4, 1, 3,11,15, 0, 5,12, 2,13, 9, 7,10,14]
95 | s = [11,14,15,12, 5, 8, 7, 9,11,13,14,15, 6, 7, 9, 8,
96 |       7, 6, 8,13,11, 9, 7,15, 7,12,15, 9,11, 7,13,12,
97 |      11,13, 6, 7,14, 9,13,15,14, 8,13, 6, 5,12, 7, 5,
98 |      11,12,14,15,14,15, 9, 8, 9,14, 5, 6, 8, 6, 5,12]
99 | sp = [ 8, 9, 9,11,13,15,15, 5, 7, 7, 8,11,14,14,12, 6,
100 |        9,13,15, 7,12, 8, 9,11, 7, 7,12, 7, 6,15,13,11,
101 |        9, 7,15,11, 8, 6, 6,14,12,13, 5,14,13,13, 7, 5,
102 |       15, 5, 8,11,14,14, 6,14, 6, 9,12, 9,12, 5,15, 8]
103 | 
104 | 
105 | def ripemd128(message):
106 |     h0 = 0x67452301
107 |     h1 = 0xefcdab89
108 |     h2 = 0x98badcfe
109 |     h3 = 0x10325476
110 |     X = padandsplit(message)
111 |     for i in range(len(X)):
112 |         (A, B, C, D) = (h0, h1, h2, h3)
113 |         (Ap, Bp, Cp, Dp) = (h0, h1, h2, h3)
114 |         for j in range(64):
115 |             T = rol(s[j], add(A, f(j, B, C, D), X[i][r[j]], K(j)))
116 |             (A, D, C, B) = (D, C, B, T)
117 |             T = rol(sp[j], add(Ap, f(63 - j, Bp, Cp, Dp), X[i][rp[j]], Kp(j)))
118 |             (Ap, Dp, Cp, Bp) = (Dp, Cp, Bp, T)
119 |         T = add(h1, C, Dp)
120 |         h1 = add(h2, D, Ap)
121 |         h2 = add(h3, A, Bp)
122 |         h3 = add(h0, B, Cp)
123 |         h0 = T
124 | 
125 | 
126 |     return struct.pack("<LLLL", h0, h1, h2, h3)
127 | 
--------------------------------------------------------------------------------
50 | View the web page source -> find the corresponding css file, and save that css file into the same directory as the mdx file.
--------------------------------------------------------------------------------