├── .idea ├── dictionaries │ └── xiaohang.xml └── vcs.xml ├── CreateAnkiImport_GrePhrase.py ├── CreateAnkiImport_GreWord.py ├── README.md ├── add_similar_word.py ├── anki_import.ipynb ├── anki_import.md ├── base_data ├── GREGao Fen Bi Bei Duan Yu Da Pe - Yan Yu Zhen ,Gao Yu ,Chen Qi.txt ├── GREHe Xin Ci Hui Kao Fa Jing Xi (Xin Dong Fang Da Yu Ying Yu Xue Xi Cong Shu ) - Chen Qi.txt ├── GREHe Xin Ci Hui Zhu Ji Yu Jing - Cao Tian Cheng.txt └── bzsdbdc_dic.txt ├── convert_duanyu.py ├── convert_new3000.py ├── convert_zhuji.py ├── example_usage.apkg ├── explore_all_in_one.ipynb ├── explore_all_in_one.md ├── my_helpers.py ├── pureSalsa20.py ├── readmdict.py ├── ripemd128.py ├── sync_to_file_magic_command.py └── wagnerfischerpp.py /.idea/dictionaries/xiaohang.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | jupyter 5 | 6 | 7 | -------------------------------------------------------------------------------- /.idea/vcs.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | -------------------------------------------------------------------------------- /CreateAnkiImport_GrePhrase.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import json 3 | import codecs 4 | import os 5 | from my_helpers import * 6 | file_name_duanyu = 'duanyu_base_d.txt' 7 | duanyu_base_d = is_file_and_json_load(file_name_duanyu) 8 | output_file_GrePhrase = 'AnkiImportData_GrePhrase.txt' 9 | def convert_to_GrePhrase(): 10 | with codecs.open(output_file_GrePhrase, 'w', encoding='utf-8') as f: 11 | my_notes = '' 12 | for phrase_uid, phrase_dict in duanyu_base_d.iteritems(): 13 | one_line = [phrase_uid, phrase_dict['phrase'], phrase_dict['usage_index'], my_notes, 14 | phrase_dict['en_exp'], phrase_dict['cn_exp'], 15 | phrase_dict['example'], phrase_dict['gre_example_cn'], 16 | phrase_dict['gre_example_en']] 17 | one_line = '\t'.join(one_line) + '\n' 18 | f.write(one_line) 19 | if __name__ == '__main__': 20 | if not (duanyu_base_d is None): 21 | convert_to_GrePhrase() 22 | -------------------------------------------------------------------------------- /CreateAnkiImport_GreWord.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import json 3 | import codecs 4 | import os 5 | from my_helpers import * 6 | file_name_new3000 = 'new3000_base_d.txt' 7 | file_name_zhuji = 'zhuji_base_d.txt' 8 | file_name_bzsdbdc = 'base_data\\bzsdbdc_dic.txt' 9 | output_file_GreWord = 'AnkiImportData_GreWord.txt' 10 | new3000_base_d = None 11 | zhuji3000_base_d = None 12 | bzsdbdc_data = None 13 | new3000_base_d = is_file_and_json_load(file_name_new3000) 14 | zhuji3000_base_d = is_file_and_json_load(file_name_zhuji) 15 | bzsdbdc_data = is_file_and_json_load(file_name_bzsdbdc) 16 | no_data_new3000 = new3000_base_d is None 17 | no_data_zhuji = zhuji3000_base_d is None 18 | no_data_bzsdbdc = bzsdbdc_data is None 19 | def add_extra_fields(): 20 | if no_data_new3000: 21 | print 'New3000 data file does not exists! Nothing can be done...' 22 | return 23 | iter_path = [('all','',True), ('key','usages',False),('all','',False)] 24 | for word, usage_d in iter_through_general(new3000_base_d, iter_path): 25 | usage_d['audio'] = '' 26 | usage_d['mynotes'] = '' 27 | for word_d in new3000_base_d.itervalues(): 28 | word_d['similar_form'] = '' 29 | def convert_to_GreWord(): 30 | if no_data_new3000: 31 | print 'New3000 data file does not exists! 
Nothing can be done...' 32 | return 33 | if no_data_zhuji: 34 | print 'No data of zhuji!' 35 | if no_data_bzsdbdc: 36 | print 'No data of bzsdbdc!' 37 | output_list = [] 38 | None_repr = u'' 39 | join_by_line_break = u'
<br>'.join 40 | replace_with_br = lambda _str: _str.replace('\n', '<br>
') 41 | tag_pos_prefix = ' in_' 42 | for word in new3000_base_d: 43 | # new 3000 part 44 | """ 45 | the structure of a word of new3000_base_d.txt 46 | 47 | {'phon': u"[\u02cc\xe6d'l\u026ab]", 48 | 'pos': (1, 6), 49 | 'usages': [{'ants': u'\u53cd\u3000considered, planned, premeditated, rehearsed \u9884\u5148\u8ba1\u5212\u7684', 50 | 'ants_d': {'cn': u'\u9884\u5148\u8ba1\u5212\u7684', 51 | 'en': u'considered, planned, premeditated, rehearsed ', 52 | 'en_cn': u'considered, planned, premeditated, rehearsed \u9884\u5148\u8ba1\u5212\u7684'}, 53 | 'der': '', 54 | 'examples': u'content...', 55 | 'en': u'not bad for an ad-lib comedy routine', 56 | 'en_cn': u'content...'}, 57 | 'exp': u'*adj.* \u5373\u5174\u7684\uff1amade or done **without previous thought or preparation**', 58 | 'exp_d': {'cn': u'\u5373\u5174\u7684', 59 | 'en': u'made or done **without previous thought or preparation**', 60 | 'en_cn': u'\u5373\u5174\u7684\uff1amade or done **without previous thought or preparation**'}, 61 | 'ph_symbl': u"[\u02cc\xe6d'l\u026ab]", 62 | 'pspeech': u'adj.', 63 | 'syns': u'content...'} 64 | """ 65 | one_new3000_word_d = new3000_base_d[word] 66 | word_pos_L, word_pos_U = one_new3000_word_d['pos'] 67 | word_pos = u'L' + unicode(word_pos_L) + u' U' + unicode(word_pos_U) 68 | num_usages = len(one_new3000_word_d['usages']) 69 | usages_tag = unicode(num_usages) + u'_usage' 70 | 71 | for usage_index, usage in enumerate(one_new3000_word_d['usages']): 72 | word_phs = usage['ph_symbl'] 73 | word_tags = usages_tag + tag_pos_prefix + 'zaiyaoniming3000' 74 | if not no_data_zhuji: 75 | if word in zhuji3000_base_d: 76 | word_tags += tag_pos_prefix + 'zhuji3000' 77 | if not no_data_bzsdbdc: 78 | if word in bzsdbdc_data: 79 | word_tags += tag_pos_prefix + 'bzsdbdc' 80 | usage_index = unicode(usage_index+1) 81 | word_uid = unicode(word) + usage_index 82 | ph_symbl = usage['ph_symbl'] 83 | word_Audio = usage['audio'] 84 | pspeech = usage['pspeech'] 85 | exp_en = usage['exp_d']['en'] 86 | exp_cn = usage['exp_d']['cn'] 87 | exp_en_cn = usage['exp_d']['en_cn'] 88 | # combine other explanation 89 | #usage_index_l = range(num_usages) 90 | #usage_index_l.remove(usage_index) 91 | #exp_other = ['**考法%d**:'%(i+1) + one_new3000_word_d['usages'][i]['exp_d']['en_cn'] +'\n' for i in usage_index_l] 92 | # use word_block_str as all explanation 93 | exp_all = one_new3000_word_d['word_block_str'] 94 | examples_en = usage['examples_d']['en'] 95 | examples_cn = usage['examples_d']['cn'] 96 | examples_en_cn = usage['examples_d']['en_cn'] 97 | examples_others = '' 98 | ants_en = usage['ants_d']['en'] 99 | ants_cn = usage['ants_d']['cn'] 100 | ants_en_cn = usage['ants_d']['en_cn'] 101 | syns = usage['syns'] 102 | # der from the book zaiyaoniming3000 103 | der_new3000 = usage['der'] 104 | 105 | # bzsdbdc part 106 | how_to_mem_bzsdbdc = None_repr 107 | if not no_data_bzsdbdc: 108 | if word in bzsdbdc_data: 109 | how_to_mem_bzsdbdc = bzsdbdc_data[word]['combined'] 110 | 111 | # zhuji3000 part 112 | how_to_mem_zhuji3000, eytma_gr, eytma_gr_exp, eytma_cognates = None_repr, None_repr, None_repr, None_repr 113 | ''' 114 | the structure of a word of zhuji3000_base_d 115 | {'content': u'[\u6839] per- [through] + vad [go] + -e [v.], go through, \u904d\u5e03 \u2192 vt. 
\u5f25\u6f2b\uff0c\u5145\u6ee1\n', 116 | 'ety': 'vad, vag, ced', 117 | 'etyma_cognates_l': u'pervade, evasive, extravagant, vague, cessation, incessant', 118 | 'etyma_group_explanation': u'group explanation content', 119 | 'phon': u"[p\u0259r've\u026ad]", 120 | 'pos': u'6, 7', 121 | 'summary': u'summary content', 122 | 'word': u'pervade'} 123 | ''' 124 | if not no_data_zhuji: 125 | if word in zhuji3000_base_d: 126 | how_to_mem_zhuji3000 = zhuji3000_base_d[word]['content'] 127 | eytma_gr = zhuji3000_base_d[word]['ety'] 128 | eytma_gr_exp = zhuji3000_base_d[word]['etyma_group_explanation'] 129 | eytma_cognates = zhuji3000_base_d[word]['etyma_cognates_l'] 130 | # extra fields 131 | mynotes = usage['mynotes'] 132 | similar_form = one_new3000_word_d['similar_form'] 133 | """ 134 | Anki GreWord Structure 135 | word_uid word usage_index ph_symbl word_audio pspeech mynotes 136 | exp_en exp_cn exp_en_cn exp_all 137 | examples_en examples_cn examples_encn examples_others 138 | ants_en ants_cn ants_encn 139 | syns der_new3000 140 | how_to_mem_bzsdbdc how_to_mem_zhuji3000 141 | etyma_group etyma_group_exp etyma_cognates 142 | position similar_form tags 143 | """ 144 | one_line = [word_uid, word, usage_index, ph_symbl, word_Audio, pspeech, mynotes, 145 | exp_en, exp_cn, exp_en_cn, exp_all, 146 | examples_en, examples_cn, examples_en_cn, examples_others, 147 | ants_en, ants_cn, ants_en_cn] +\ 148 | [syns, der_new3000, how_to_mem_bzsdbdc, how_to_mem_zhuji3000, 149 | eytma_gr, eytma_gr_exp, eytma_cognates, word_pos, similar_form, word_tags] 150 | for index, _str in enumerate(one_line): 151 | _str = replace_with_br(collapse_blank_line(_str).strip(' \n')) 152 | one_line[index] = custom_html_element(_str) 153 | output_list.append(one_line) 154 | output_list.sort(key=lambda x: x[0]) 155 | return output_list 156 | def main(): 157 | add_field_audio_and_mynotes() 158 | output_list = convert_to_GreWord() 159 | if output_list is None: 160 | return 161 | with codecs.open(output_file_GreWord, 'w', encoding='utf-8') as f: 162 | for one_line in output_list: 163 | one_string = u'\t'.join(one_line) + '\n' 164 | f.write(one_string) 165 | del output_list 166 | if __name__ == '__main__': 167 | main() 168 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Intro 2 | 3 | Some scripts that convert a series of GRE Vocabulary books in Chinese to Anki notes. 4 | 5 | The markdown cells of the jupyter notebooks are written in Chinese, but contain 6 | detailed explanation and walk through. 7 | 8 | `explore_all_in_one.ipynb` converts the txt source file to a structured dict object. 9 | 10 | `anki_import.ipynb` uses the converted dict object to generate import file for Anki. 11 | 12 | `readmdict.py` `ripemd128.py` `pureSalsa20.py` are the tools to unzip mdd and mdx file. 13 | See https://bitbucket.org/xwang/mdict-analysis/overview for more details. 14 | 15 | I share the ready to use import txt and audio files on baiduyun. You can use the 16 | example_usage.apkg to build the note type. 
17 | 18 | http://pan.baidu.com/s/1pJ5W9uF password:xgif 19 | 20 | 21 | -------------------------------------------------------------------------------- /add_similar_word.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | from multiprocessing import Pool 4 | from wagnerfischerpp import WagnerFischer 5 | import codecs 6 | import json 7 | 8 | 9 | with codecs.open('new3000_base_d.txt') as f: 10 | new3000_base_d = json.load(f,encoding='utf-8') 11 | 12 | 13 | def get_similar_word(word_a, threshold=2): 14 | distance_l = [] 15 | for word_b in new3000_base_d: 16 | if word_b == word_a: 17 | continue 18 | cost_a_b = WagnerFischer(word_a, word_b).cost 19 | if cost_a_b <= threshold: 20 | distance_l.append((cost_a_b, word_b)) 21 | distance_l.sort() 22 | return distance_l 23 | 24 | 25 | def gen_brief_exp(word): 26 | brief_exp_l = [] 27 | for usage_d in new3000_base_d[word]['usages']: 28 | brief_exp_l.append(usage_d['exp_d']['cn']) 29 | return word + ': ' + u';'.join(brief_exp_l) 30 | 31 | 32 | def add_similar_word_single_word(word): 33 | similar_word_l = get_similar_word(word) 34 | exp_l = [] 35 | for cost, similar_word in similar_word_l: 36 | exp_l.append(gen_brief_exp(similar_word)) 37 | #new3000_base_d[word]['similar_word'] = ' | '.join(exp_l) 38 | print '+', 39 | return word, ' | '.join(exp_l) 40 | 41 | 42 | def add_similar_word_multiprocessing(): 43 | pool = Pool(4) 44 | result = pool.map(add_similar_word_single_word, new3000_base_d.iterkeys()) 45 | pool.close() 46 | with codecs.open('similar_word.txt', 'w', encoding='utf-8') as f: 47 | json.dump(result, f) 48 | 49 | if __name__ == '__main__': 50 | add_similar_word_multiprocessing() -------------------------------------------------------------------------------- /anki_import.md: -------------------------------------------------------------------------------- 1 | 2 | # 说明 3 | 4 | 这个notebook展示了如何将一个json对象转换为可导入Anki的文件。重点在于Anki中NoteType的设计。内容上承接explore_all_in_one.ipynb。 5 | 6 | 《GRE核心词汇考法精析》、《GRE核心词汇助记与精练》以及从网上找到的《不择手段背单词》对应NoteType为GreWord。 7 | 《GRE高分必备短语搭配》对应NoteTpye为GrePhrase。 8 | 9 | notebook执行完后,会自动生成两个脚本,名字参见变量file_name_greword,file_name_grephrase。单独运行两个脚本也可完成转换,只要有python就可使用。 10 | 11 | 子章节《处理发音文件》和《添加文件》,需要许多定制文件。所以没有导出到转换脚本。如果没有对应文件的话,直接运行这个notebook而会报错。所以如果只想得到无发音无笔记版本的导入文件,请运行那两个转换脚本。 12 | 13 | 转换出的Anki导入文件,名字参见变量output_file_GreWord,output_file_GrePhrase。 14 | 15 | 16 | ```python 17 | %run sync_to_file_magic_command.py 18 | ``` 19 | 20 | 21 | ```python 22 | file_name_greword = 'CreateAnkiImport_GreWord.py' 23 | file_name_grephrase = 'CreateAnkiImport_GrePhrase.py' 24 | configCreAnkiImpGreWord = file_name_greword 25 | configCreAnkiImpGrePhrase = file_name_grephrase 26 | configMyHelpers = 'my_helpers.py' 27 | ``` 28 | 29 | # 补充两个辅助函数 30 | 31 | 32 | ```python 33 | %%sync_to_file $configMyHelpers 34 | def custom_html_element(_str): 35 | """ 36 | convert the markdown notations in a string to html tags 37 | currently, only two kinds of markdown notation exist in all the strings 38 | ** and * 39 | """ 40 | formatted_str = _str 41 | # format double asterisk 42 | match_double_asterisk_re = re.compile(u'\*\*(.*?)\*\*') 43 | # replace **...** with ... 44 | #formatted_str = match_double_asterisk_re.sub(r'\1', formatted_str) 45 | # replace **...** with ... 46 | formatted_str = match_double_asterisk_re.sub(r'\1', formatted_str) 47 | # format single asterisk 48 | # replace *...* with ... 
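# presumably the substitutions should wrap the captures in HTML tags (e.g. r'<b>\1</b>' for ** and r'<i>\1</i>' for *), since the docstring says the markdown notation is converted to html tags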
49 | match_single_asterisk_re = re.compile(u'\*(.*?)\*') 50 | formatted_str = match_single_asterisk_re.sub(r'\1', formatted_str) 51 | return formatted_str 52 | ``` 53 | 54 | 55 | ```python 56 | %%sync_to_file $configMyHelpers 57 | def is_file_and_json_load(file_name_str): 58 | if os.path.isfile(file_name_str): 59 | with codecs.open(file_name_str, 'r', encoding='utf-8') as f: 60 | json_d = json.load(f) 61 | return json_d 62 | ``` 63 | 64 | 65 | ```python 66 | %%sync_to_file $configCreAnkiImpGreWord $configCreAnkiImpGrePhrase -m o 67 | 68 | # coding:utf-8 69 | import json 70 | import codecs 71 | import os 72 | from my_helpers import * 73 | ``` 74 | 75 | # GreWord 76 | 77 | 78 | ```python 79 | # example 80 | test_str = 'to **put an end to**(something planned or previously agreed to)' 81 | print custom_html_element(test_str) 82 | del test_str 83 | ``` 84 | 85 | to put an end to(something planned or previously agreed to) 86 | 87 | 88 | 89 | ```python 90 | %%sync_to_file $configCreAnkiImpGreWord 91 | file_name_new3000 = 'new3000_base_d.txt' 92 | file_name_zhuji = 'zhuji_base_d.txt' 93 | file_name_bzsdbdc = 'base_data\\bzsdbdc_dic.txt' 94 | output_file_GreWord = 'AnkiImportData_GreWord.txt' 95 | new3000_base_d = None 96 | zhuji3000_base_d = None 97 | bzsdbdc_data = None 98 | new3000_base_d = is_file_and_json_load(file_name_new3000) 99 | zhuji3000_base_d = is_file_and_json_load(file_name_zhuji) 100 | bzsdbdc_data = is_file_and_json_load(file_name_bzsdbdc) 101 | ``` 102 | 103 | 104 | ```python 105 | %%sync_to_file $configCreAnkiImpGreWord 106 | no_data_new3000 = new3000_base_d is None 107 | no_data_zhuji = zhuji3000_base_d is None 108 | no_data_bzsdbdc = bzsdbdc_data is None 109 | ``` 110 | 111 | ## 核心转换函数 112 | 113 | 114 | ```python 115 | %%sync_to_file $configCreAnkiImpGreWord 116 | def add_extra_fields(): 117 | if no_data_new3000: 118 | print 'New3000 data file does not exists! Nothing can be done...' 119 | return 120 | iter_path = [('all','',True), ('key','usages',False),('all','',False)] 121 | for word, usage_d in iter_through_general(new3000_base_d, iter_path): 122 | usage_d['audio'] = '' 123 | usage_d['mynotes'] = '' 124 | for word_d in new3000_base_d.itervalues(): 125 | word_d['similar_form'] = '' 126 | ``` 127 | 128 | 129 | ```python 130 | add_extra_fields() 131 | ``` 132 | 133 | 134 | ```python 135 | # test 136 | #pprint(new3000_base_d['abandon']) 137 | ``` 138 | 139 | 140 | ```python 141 | %%sync_to_file $configCreAnkiImpGreWord 142 | def convert_to_GreWord(): 143 | if no_data_new3000: 144 | print 'New3000 data file does not exists! Nothing can be done...' 145 | return 146 | if no_data_zhuji: 147 | print 'No data of zhuji!' 148 | if no_data_bzsdbdc: 149 | print 'No data of bzsdbdc!' 150 | output_list = [] 151 | None_repr = u'' 152 | join_by_line_break = u'
<br>'.join 153 | replace_with_br = lambda _str: _str.replace('\n', '<br>
') 154 | tag_pos_prefix = ' in_' 155 | for word in new3000_base_d: 156 | # new 3000 part 157 | """ 158 | the structure of a word of new3000_base_d.txt 159 | 160 | {'phon': u"[\u02cc\xe6d'l\u026ab]", 161 | 'pos': (1, 6), 162 | 'usages': [{'ants': u'\u53cd\u3000considered, planned, premeditated, rehearsed \u9884\u5148\u8ba1\u5212\u7684', 163 | 'ants_d': {'cn': u'\u9884\u5148\u8ba1\u5212\u7684', 164 | 'en': u'considered, planned, premeditated, rehearsed ', 165 | 'en_cn': u'considered, planned, premeditated, rehearsed \u9884\u5148\u8ba1\u5212\u7684'}, 166 | 'der': '', 167 | 'examples': u'content...', 168 | 'en': u'not bad for an ad-lib comedy routine', 169 | 'en_cn': u'content...'}, 170 | 'exp': u'*adj.* \u5373\u5174\u7684\uff1amade or done **without previous thought or preparation**', 171 | 'exp_d': {'cn': u'\u5373\u5174\u7684', 172 | 'en': u'made or done **without previous thought or preparation**', 173 | 'en_cn': u'\u5373\u5174\u7684\uff1amade or done **without previous thought or preparation**'}, 174 | 'ph_symbl': u"[\u02cc\xe6d'l\u026ab]", 175 | 'pspeech': u'adj.', 176 | 'syns': u'content...'} 177 | """ 178 | one_new3000_word_d = new3000_base_d[word] 179 | word_pos_L, word_pos_U = one_new3000_word_d['pos'] 180 | word_pos = u'L' + unicode(word_pos_L) + u' U' + unicode(word_pos_U) 181 | num_usages = len(one_new3000_word_d['usages']) 182 | usages_tag = unicode(num_usages) + u'_usage' 183 | 184 | for usage_index, usage in enumerate(one_new3000_word_d['usages']): 185 | word_phs = usage['ph_symbl'] 186 | word_tags = usages_tag + tag_pos_prefix + 'zaiyaoniming3000' 187 | if not no_data_zhuji: 188 | if word in zhuji3000_base_d: 189 | word_tags += tag_pos_prefix + 'zhuji3000' 190 | if not no_data_bzsdbdc: 191 | if word in bzsdbdc_data: 192 | word_tags += tag_pos_prefix + 'bzsdbdc' 193 | usage_index = unicode(usage_index+1) 194 | word_uid = unicode(word) + usage_index 195 | ph_symbl = usage['ph_symbl'] 196 | word_Audio = usage['audio'] 197 | pspeech = usage['pspeech'] 198 | exp_en = usage['exp_d']['en'] 199 | exp_cn = usage['exp_d']['cn'] 200 | exp_en_cn = usage['exp_d']['en_cn'] 201 | # combine other explanation 202 | #usage_index_l = range(num_usages) 203 | #usage_index_l.remove(usage_index) 204 | #exp_other = ['**考法%d**:'%(i+1) + one_new3000_word_d['usages'][i]['exp_d']['en_cn'] +'\n' for i in usage_index_l] 205 | # use word_block_str as all explanation 206 | exp_all = one_new3000_word_d['word_block_str'] 207 | examples_en = usage['examples_d']['en'] 208 | examples_cn = usage['examples_d']['cn'] 209 | examples_en_cn = usage['examples_d']['en_cn'] 210 | examples_others = '' 211 | ants_en = usage['ants_d']['en'] 212 | ants_cn = usage['ants_d']['cn'] 213 | ants_en_cn = usage['ants_d']['en_cn'] 214 | syns = usage['syns'] 215 | # der from the book zaiyaoniming3000 216 | der_new3000 = usage['der'] 217 | 218 | # bzsdbdc part 219 | how_to_mem_bzsdbdc = None_repr 220 | if not no_data_bzsdbdc: 221 | if word in bzsdbdc_data: 222 | how_to_mem_bzsdbdc = bzsdbdc_data[word]['combined'] 223 | 224 | # zhuji3000 part 225 | how_to_mem_zhuji3000, eytma_gr, eytma_gr_exp, eytma_cognates = None_repr, None_repr, None_repr, None_repr 226 | ''' 227 | the structure of a word of zhuji3000_base_d 228 | {'content': u'[\u6839] per- [through] + vad [go] + -e [v.], go through, \u904d\u5e03 \u2192 vt. 
\u5f25\u6f2b\uff0c\u5145\u6ee1\n', 229 | 'ety': 'vad, vag, ced', 230 | 'etyma_cognates_l': u'pervade, evasive, extravagant, vague, cessation, incessant', 231 | 'etyma_group_explanation': u'group explanation content', 232 | 'phon': u"[p\u0259r've\u026ad]", 233 | 'pos': u'6, 7', 234 | 'summary': u'summary content', 235 | 'word': u'pervade'} 236 | ''' 237 | if not no_data_zhuji: 238 | if word in zhuji3000_base_d: 239 | how_to_mem_zhuji3000 = zhuji3000_base_d[word]['content'] 240 | eytma_gr = zhuji3000_base_d[word]['ety'] 241 | eytma_gr_exp = zhuji3000_base_d[word]['etyma_group_explanation'] 242 | eytma_cognates = zhuji3000_base_d[word]['etyma_cognates_l'] 243 | # extra fields 244 | mynotes = usage['mynotes'] 245 | similar_form = one_new3000_word_d['similar_form'] 246 | """ 247 | Anki GreWord Structure 248 | word_uid word usage_index ph_symbl word_audio pspeech mynotes 249 | exp_en exp_cn exp_en_cn exp_all 250 | examples_en examples_cn examples_encn examples_others 251 | ants_en ants_cn ants_encn 252 | syns der_new3000 253 | how_to_mem_bzsdbdc how_to_mem_zhuji3000 254 | etyma_group etyma_group_exp etyma_cognates 255 | position similar_form tags 256 | """ 257 | one_line = [word_uid, word, usage_index, ph_symbl, word_Audio, pspeech, mynotes, 258 | exp_en, exp_cn, exp_en_cn, exp_all, 259 | examples_en, examples_cn, examples_en_cn, examples_others, 260 | ants_en, ants_cn, ants_en_cn] +\ 261 | [syns, der_new3000, how_to_mem_bzsdbdc, how_to_mem_zhuji3000, 262 | eytma_gr, eytma_gr_exp, eytma_cognates, word_pos, similar_form, word_tags] 263 | for index, _str in enumerate(one_line): 264 | _str = replace_with_br(collapse_blank_line(_str).strip(' \n')) 265 | one_line[index] = custom_html_element(_str) 266 | output_list.append(one_line) 267 | output_list.sort(key=lambda x: x[0]) 268 | return output_list 269 | ``` 270 | 271 | 上面的函数构建了基本的Anki导入文件。现在还需要将发音文件的指针添加进去。 272 | 如果是更新原有的note,那么还需要将原有note的mynotes字段取出来,放到output_list的对应位置。 273 | 所以先不执行下面的函数。等到数据补充齐全后再运行。 274 | 275 | 276 | ```python 277 | %%sync_to_file $configCreAnkiImpGreWord 278 | def main(): 279 | add_field_audio_and_mynotes() 280 | output_list = convert_to_GreWord() 281 | if output_list is None: 282 | return 283 | with codecs.open(output_file_GreWord, 'w', encoding='utf-8') as f: 284 | for one_line in output_list: 285 | one_string = u'\t'.join(one_line) + '\n' 286 | f.write(one_string) 287 | del output_list 288 | ``` 289 | 290 | 291 | ```python 292 | %%sync_to_file $configCreAnkiImpGreWord -p 293 | if __name__ == '__main__': 294 | main() 295 | ``` 296 | 297 | ## 处理发音文件的思路 298 | 299 | Anki中,添加发音文件的语法是`[sound:发音文件指针]`。发音文件指针即发音文件的文件名。所有相关文件必须放在Anki自己的`collection.media`文件夹里。所以路径应该使用相对引用。 300 | 301 | 接下来,从各个发音库抽取文件指针,并且将相应文件拷贝到Anki的`collection.media`文件夹下,同时将指针添加到new3000_base_d中。 302 | 303 | ## 再要你命3000中的多音词 304 | 305 | 306 | ```python 307 | print new3000_base_d['addict']['usages'][0].keys() 308 | ``` 309 | 310 | [u'exp_d', u'pspeech', u'ph_symbl', u'ants_d', u'der', u'ants', 'mynotes', u'examples', u'examples_d', u'exp', 'audio', u'syns'] 311 | 312 | 313 | 314 | ```python 315 | path_to_pron = [('all','',True), ('key','usages',False), ('all','',True),('key','ph_symbl',False)] 316 | pre_word_pron = None 317 | multi_pron_word_set = set() 318 | for word, usage_index, word_pron in iter_through_general(new3000_base_d, deepcopy(path_to_pron)): 319 | if usage_index > 0: 320 | if word_pron != pre_word_pron: 321 | multi_pron_word_set.add(word) 322 | else: 323 | pre_word_pron = word_pron 324 | ``` 325 | 326 | 327 | ```python 328 | print multi_pron_word_set 329 | 
``` 330 | 331 | set([u'incarnate', u'articulate', u'appropriate', u'incense', u'subordinate', u'animate', u'surmise', u'content', u'duplicate', u'escort', u'moderate', u'compliment', u'entrance', u'intimate', u'addict', u'compound', u'aggregate', u'discharge', u'diffuse', u'convert', u'elaborate', u'exploit', u'contract', u'project', u'initiate', u'ally', u'alloy', u'intrigue']) 332 | 333 | 334 | ## 来源:dsl格式字典 335 | 336 | dsl格式的Longman Pronunciation Dictionary 3rd Ed. 337 | 338 | 关于处理dsl的基本知识,参考[Full Text Search in GoldenDict](https://lisok3ajr.wordpress.com/2012/09/18/full-text-search-in-goldendict/) 339 | 340 | ### 读取数据 341 | 342 | 343 | ```python 344 | import gzip 345 | ``` 346 | 347 | 348 | ```python 349 | file_pronunciation = 'D:\Eudict\dsl\En-En_Longman_Pronunciation3\En-En-Longman_Pronunciation.dsl.dz' 350 | dsl_str = gzip.open(file_pronunciation, mode='r').read().decode('utf-16') 351 | print dsl_str[100:400] 352 | ``` 353 | 354 | sh" 355 | 356 | A 357 | [m1][b]A, a[/b] [i] name of letter[/i] [p]BrE[/p] [s]uk_ld44a.wav[/s] [p]AmE[/p] [s]us_l3a-2.wav[/s] [c mediumblue]eɪ[/c][/m] 358 | [m1]▷ [b]A's, As, a's[/b] [c mediumblue]eɪz[/c][i] —Communications code name:[/i][c darkmagenta] Alfa[/c][/m] 359 | [m1]▶[b][c blue]ˌ[/c]A[c blue]ˈ[/c]1[c blue]◂[/c], [c 360 | 361 | 362 | 363 | ```python 364 | match_word_fun = lambda word: re.search('^(%s)[ \t]*$(.*?)(?=^[^ \t])'%word, dsl_str, re.M|re.S) 365 | findall_word_fun = lambda word: re.findall('^(%s)[ \t]*$(.*?)(?=^[^ \t])'%word, dsl_str, re.M|re.S) 366 | match_us_pron_re = re.compile('\[s\](us.*?)\[/s\]') 367 | ``` 368 | 369 | 有的单词,其下属派生词,以▷标识,也有自己的音标。这部分中可能出现斜体字,以[i]..[/i]标识。只有主释义单词后面的斜体才是音标。 370 | 371 | 372 | ```python 373 | match_pspeech_re = re.compile('^[ \t]*?\[m1\]\[b\].*?\[/b\] \[i\] ([a-z, ]+).*?\[/i\]', re.M) 374 | ``` 375 | 376 | 377 | ```python 378 | # test 379 | def unit_test(): 380 | result = match_word_fun('content') 381 | result_str = result.group() 382 | print result_str 383 | print 'All pronunciation files: ', match_us_pron_re.findall(result_str) 384 | print 'All part of speech: ', match_pspeech_re.findall(result_str) 385 | #unit_test() 386 | del unit_test 387 | ``` 388 | 389 | ### 将dsl_str转换为dict 390 | 391 | 392 | ```python 393 | extract_word_block_re = re.compile(ur'^([a-z-]+)[ \t]*$(.*?)(?=^[^ \t])', re.M|re.S|re.I) 394 | ``` 395 | 396 | 397 | ```python 398 | # test 399 | #extract_word_block_re.findall(dsl_str[0:5000]) 400 | ``` 401 | 402 | 403 | ```python 404 | dsl_pron_d = {} 405 | for one_match_obj in extract_word_block_re.finditer(dsl_str): 406 | word = one_match_obj.group(1) 407 | if word in dsl_pron_d: 408 | print '%s already exists!'%word 409 | one_word_d = {} 410 | word_block = one_match_obj.group(2) 411 | one_word_d['word_block'] = word_block 412 | one_word_d['pspeech_l'] = match_pspeech_re.findall(word_block) 413 | one_word_d['ph_symbol_l'] = match_us_pron_re.findall(word_block) 414 | if word in multi_pron_word_set: 415 | #print 'check pspeech' 416 | #print word, one_word_d['pspeech_l'] 417 | pass 418 | dsl_pron_d[word] = one_word_d 419 | ``` 420 | 421 | 422 | ```python 423 | # example 424 | iter_print(dsl_pron_d['content']) 425 | ``` 426 | 427 | word_block 428 | {{Roman}}I{{/Roman}} 429 | [m1][b]con|tent[/b] [i] adjective, verb, noun 'contentment'[/i] [p]BrE[/p] [s]uk_ld44content.wav[/s] [p]AmE[/p] [s]us_l3content2.wav[/s] [c mediumblue]kən |ˈtent[/c] [p]§[/p]\ [sub]([/sub]ˌ[sub])[/sub]kɒn-[/m] 430 | [m1]▷ [b]con|tented[/b] [c mediumblue]ˈtent ɪd[/c] -əd [p]AmE[/p]\ [c mediumblue]ˈten[i]t̬[/i] əd[/c][/m] 431 | 
[m1]▷ [b]con|tenting[/b] [c mediumblue]ˈtent ɪŋ[/c] [p]AmE[/p]\ [c mediumblue]ˈten[i]t̬[/i] ɪŋ[/c][/m] 432 | [m1]▷ [b]con|tents[/b] [c mediumblue]ˈten[i]t[/i]s[/c][/m] 433 | {{Roman}}II{{/Roman}} 434 | [m1][b]content[/b] [i] noun 'matter contained'[/i] [p]BrE[/p] [s]uk_content2.wav[/s] [p]AmE[/p] [s]us_l3content.wav[/s] [c mediumblue]ˈkɒn tent[/c] [p]AmE[/p]\ [c mediumblue]ˈkɑːn-[/c][/m] 435 | [m1]▷ [b]content|s[/b] [c mediumblue]s[/c][/m] 436 | ph_symbol_l 437 | 0 438 | us_l3content2.wav 439 | 1 440 | us_l3content.wav 441 | pspeech_l 442 | 0 443 | adjective, verb, noun 444 | 1 445 | noun 446 | 447 | 448 | ### 统计词性对应关系 449 | 450 | 451 | ```python 452 | def summary_pspeech(): 453 | #dsl 454 | dsl_pspeech_set = set() 455 | for word, word_d in dsl_pron_d.iteritems(): 456 | dsl_pspeech_l = word_d['pspeech_l'] 457 | for pspeech in dsl_pspeech_l: 458 | dsl_pspeech_set.add(pspeech) 459 | # new3000 460 | new3000_pspeech_set = set() 461 | path_to_pspeech = path_to_pron = [('all','',True), ('key','usages',False), ('all','',False),('key','pspeech',False)] 462 | for word, pspeech in iter_through_general(new3000_base_d, path_to_pspeech): 463 | for sub_pspeech in pspeech.split('/'): 464 | new3000_pspeech_set.add(sub_pspeech) 465 | stripped_pspeech = pspeech.strip('.') 466 | if word in dsl_pron_d: 467 | for dsl_pspeech in dsl_pron_d[word]['pspeech_l']: 468 | if dsl_pspeech.startswith(stripped_pspeech): 469 | break 470 | else: 471 | if len(dsl_pron_d[word]['ph_symbol_l']) > 1: 472 | #print 'pspeech of %s in new3000 not match with dsl'%word 473 | # a lot! 474 | pass 475 | print dsl_pspeech_set 476 | print new3000_pspeech_set 477 | ``` 478 | 479 | 480 | ```python 481 | # summary_pspeech() 482 | ``` 483 | 484 | dsl_pron_n中的有效词性类别:adjective verb pronoun preposition adverb 485 | 所以,只要看看dsl_pron_n中的词性是不是以new3000_base_d中的开头就可以。 486 | 487 | ## 再要你命3000同dsl_d比较 488 | 489 | 将西欧字符转换为普通字符,即éï转为ei 490 | 491 | 492 | ```python 493 | def check_pron_in_new3000_and_dsl(word, print_only_bad_result = True): 494 | word_converted = word.replace(u'é', 'e').replace(u'ï', 'i').split('/')[0] 495 | return_message_l = [] 496 | not_found = False 497 | if not (word_converted in dsl_pron_d): 498 | return_message_l.append('**%s** not found in dsl'%word) 499 | not_found = True 500 | else: 501 | pron_in_dsl_l = dsl_pron_d[word_converted]['ph_symbol_l'] 502 | pspeech_in_dsl_l = dsl_pron_d[word_converted]['pspeech_l'] 503 | pron_new3000_l = [] 504 | pspeech_new3000_l = [] 505 | for usage_d in new3000_base_d[word]['usages']: 506 | pron_new3000_l.append(usage_d['ph_symbl']) 507 | pspeech_new3000_l.append(usage_d['pspeech']) 508 | diff_pron_new3000_set = set(pron_new3000_l) 509 | if len(pron_in_dsl_l) < len(diff_pron_new3000_set): 510 | message = '**%s** in dsl has less pron'%word 511 | message += '\n' + str(len(pron_in_dsl_l)) + ', ' + str(len(diff_pron_new3000_set)) 512 | message += '\n' + ','.join(pron_in_dsl_l) 513 | message += '\n' + ','.join(pron_new3000_l) 514 | return_message_l.append(message) 515 | else: 516 | if not print_only_bad_result: 517 | return_message_l.append('**%s** in dsl has enough pron'%word) 518 | return '\n'.join(return_message_l), not_found 519 | ``` 520 | 521 | 522 | ```python 523 | result_l = [] 524 | not_found_word_l = [] 525 | for word in new3000_base_d.iterkeys(): 526 | message_str, not_found = check_pron_in_new3000_and_dsl(word) 527 | if message_str != '': 528 | result_l.append(message_str) 529 | if not_found: 530 | not_found_word_l.append(word) 531 | if word in multi_pron_word_set: 532 | print 'Warning! 
**%s** in multi_pron_word_set'%word 533 | ``` 534 | 535 | 536 | ```python 537 | with codecs.open('temp_check_pron_log.txt', 'w', encoding='utf-8') as f: 538 | json.dump(result_l, f, indent=5) 539 | json.dump(not_found_word_l, f, indent=2) 540 | ``` 541 | 542 | 543 | ```python 544 | print '%d words not found'%len(not_found_word_l) 545 | ``` 546 | 547 | 153 words not found 548 | 549 | 550 | 虽然还有153个没找到,但注意到,多音词都在其中。 551 | 552 | ## 用韦氏发音库补充 553 | 554 | 从网上找的韦氏发音库,网址:http://blog.emagic.org.cn/content/i1931.html 555 | 556 | ed2k链接 557 | 558 | ed2k://|file|%E9%9F%A6%E6%B0%8F%E5%B8%B8%E7%94%A8%E5%8D%95%E8%AF%8D%E8%AF%AD%E9%9F%B3%E5%BA%93.rar|315458082|88b70fe90a6658cec689352f66a7af6c|h=4rblspftuskt5gfvmpbnfkdvhi2ey3fn|/ 559 | 560 | 561 | ```python 562 | path_of_media_source = 'D:\\mvoice\\' 563 | word_list_file = 'word_list.txt' 564 | ``` 565 | 566 | 567 | ```python 568 | media_path_dict = {} 569 | match_word = r'([a-z1-9 ~]+)\.mp3' 570 | match_word_re = re.compile(match_word, re.I|re.M) 571 | with codecs.open(path_of_media_source + word_list_file, encoding='utf-8') as f: 572 | for line in f: 573 | result = match_word_re.search(line) 574 | if not (result is None): 575 | media_path_dict[result.group(1)] = line.strip() 576 | else: 577 | #print line 578 | pass 579 | ``` 580 | 581 | 582 | ```python 583 | print media_path_dict['habit'] 584 | ``` 585 | 586 | D:\mvoice\h\habit.mp3 587 | 588 | 589 | 590 | ```python 591 | count = 0 592 | still_not_found_word_l = [] 593 | for word in not_found_word_l: 594 | word_converted = word.replace(u'é', 'e').replace(u'ï', 'i').split('/')[0] 595 | if word_converted in media_path_dict: 596 | count += 1 597 | #print 'found', word 598 | else: 599 | still_not_found_word_l.append(word) 600 | print 'found %d of %d'%(count, len(not_found_word_l)) 601 | ``` 602 | 603 | found 57 of 153 604 | 605 | 606 | ## 用mdict补充 607 | 608 | 使用朗文当代第5版的mdx和mdd文件 609 | 610 | 使用插件 https://bitbucket.org/xwang/mdict-analysis 611 | 612 | 613 | 614 | ```python 615 | from readmdict import MDX, MDD 616 | from bs4 import BeautifulSoup 617 | file_to_longman_mdx = "D:\Eudict\Frequent\Longman Dictionary of Contemporary English.mdx" 618 | mdx = MDX(file_to_longman_mdx) 619 | longman_mdx_iter = mdx.items() 620 | longman_in_new3000_d = {} 621 | for word, word_block in longman_mdx_iter: 622 | if word in new3000_base_d: 623 | longman_in_new3000_d[word] = word_block 624 | print 'In longman, found %d words of new3000 (%d in total)'%(len(longman_in_new3000_d), len(new3000_base_d)) 625 | ``` 626 | 627 | In longman, found 2954 words of new3000 (3145 in total) 628 | 629 | 630 | 抽取音频地址 631 | 632 | 633 | ```python 634 | # this is the pattern we gonna use 635 | soup = BeautifulSoup(longman_in_new3000_d['abandon'],"lxml") 636 | print soup.find_all(href=re.compile('sound.*?US'))[0]['href'][8:] 637 | ``` 638 | 639 | US_abandon1.spx 640 | 641 | 642 | 643 | ```python 644 | count = 0 645 | still_still_not_found_word_l = [] 646 | longman_found_word_d = {} 647 | for word in still_not_found_word_l: 648 | founded = False 649 | word_converted = word.replace(u'é', 'e').replace(u'ï', 'i').split('/')[0] 650 | if word_converted in longman_in_new3000_d: 651 | soup = BeautifulSoup(longman_in_new3000_d[word_converted],"lxml") 652 | find_result = soup.find_all(href=re.compile('sound.*?US')) 653 | if len(find_result) != 0: 654 | count += 1 655 | #print word 656 | founded = True 657 | longman_found_word_d[word] = find_result[0]['href'][8:] 658 | if not founded: 659 | still_still_not_found_word_l.append(word) 660 | print 'found %d of 
%d'%(count, len(still_not_found_word_l)) 661 | ``` 662 | 663 | found 52 of 96 664 | 665 | 666 | 667 | ```python 668 | # example 669 | longman_found_word_d['ingratiating'] 670 | ``` 671 | 672 | 673 | 674 | 675 | 'US_ingratiating.spx' 676 | 677 | 678 | 679 | 680 | ```python 681 | # unzip the mdd mdx file. 682 | # Warning! This take a lot of time. I have already unpacked it, so commend the next line 683 | #! python readmdict.py -x "D:\Eudict\Frequent\Longman Dictionary of Contemporary English.mdx" 684 | ``` 685 | 686 | ## 添加音频指针 687 | 688 | 689 | ```python 690 | import shutil 691 | ``` 692 | 693 | 694 | ```python 695 | anki_media_collection = os.path.expanduser('~\\Documents\\Anki\\xiaohang\\collection.media') 696 | dsl_source_media_path = 'D:\Eudict\dsl\En-En_Longman_Pronunciation3\En-En-Longman_Pronunciation.dsl.dz.files' 697 | longman_source_media_path = 'D:\Eudict\Frequent\data' 698 | ``` 699 | 700 | 701 | ```python 702 | def add_audio_pointer(word): 703 | word_converted = word.replace(u'é', 'e').replace(u'ï', 'i').split('/')[0] 704 | word_d = new3000_base_d[word] 705 | for usage_d in word_d['usages']: 706 | usage_d['audio'] = '' 707 | source_audio_file_name = None 708 | first_pspeech_match_obj = re.search('^([a-z]+)\.', usage_d['pspeech']) 709 | if first_pspeech_match_obj is None: 710 | print '%s has no pspeech'%word 711 | new3000_pspeech = '' 712 | else: 713 | new3000_pspeech = first_pspeech_match_obj.group(1) 714 | if new3000_pspeech in ['vt', 'vi']: 715 | new3000_pspeech = 'v' 716 | new_audio_pointer_without_ext = word_converted + '_' + new3000_pspeech 717 | new_audio_file_name_without_ext = anki_media_collection + '\\' + new_audio_pointer_without_ext 718 | new_audio_pointer_without_ext = '[sound:' + new_audio_pointer_without_ext 719 | existed = False 720 | for file_ext in ['.wav', '.mp3', '.spx']: 721 | if os.path.isfile(new_audio_file_name_without_ext + file_ext): 722 | # print 'existed!' 
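# an audio file with this stem is already in collection.media (e.g. copied on a previous run or for another usage of this word), so reuse the pointer instead of copying again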
723 | existed = True 724 | usage_d['audio'] = new_audio_pointer_without_ext + file_ext + ']' 725 | break 726 | if existed: 727 | continue 728 | if word_converted in dsl_pron_d: 729 | dsl_word_d = dsl_pron_d[word_converted] 730 | if word in multi_pron_word_set: 731 | # check pspeech 732 | for index, dsl_pspeech in enumerate(dsl_word_d['pspeech_l']): 733 | for dsl_sub_pspeech in dsl_pspeech.split(','): 734 | if dsl_sub_pspeech.strip().startswith(new3000_pspeech): 735 | source_audio_file_name = dsl_source_media_path + '\\' + dsl_word_d['ph_symbol_l'][index] 736 | break 737 | else: 738 | print 'no match of pspeech, word %s'%word 739 | print dsl_word_d['pspeech_l'], new3000_pspeech 740 | pass 741 | else: 742 | # use the first audio pointer 743 | source_audio_file_name = dsl_source_media_path + '\\' + dsl_word_d['ph_symbol_l'][0] 744 | if not (source_audio_file_name is None): 745 | new_audio_pointer = new_audio_pointer_without_ext + '.wav]' 746 | new_audio_file_name = new_audio_file_name_without_ext + '.wav' 747 | else: 748 | # the not found word 749 | if word_converted in media_path_dict: 750 | # try webster 751 | source_audio_file_name = media_path_dict[word_converted] 752 | new_audio_pointer = new_audio_pointer_without_ext + '.mp3]' 753 | new_audio_file_name = new_audio_file_name_without_ext + '.mp3' 754 | elif word in longman_found_word_d: 755 | # try longman 756 | source_audio_file_name = longman_source_media_path + '\\' + longman_found_word_d[word] 757 | new_audio_pointer = new_audio_pointer_without_ext + '.spx]' 758 | new_audio_file_name = new_audio_file_name_without_ext + '.spx' 759 | if not (source_audio_file_name is None): 760 | usage_d['audio'] = new_audio_pointer 761 | shutil.copy(source_audio_file_name, new_audio_file_name) 762 | ``` 763 | 764 | 765 | ```python 766 | for word in new3000_base_d: 767 | add_audio_pointer(word) 768 | ``` 769 | 770 | 771 | ```python 772 | # example 773 | word = 'compendium' 774 | for index, usage_d in enumerate(new3000_base_d[word]['usages']): 775 | print usage_d['audio'] 776 | ``` 777 | 778 | [sound:compendium_n.mp3] 779 | [sound:compendium_n.mp3] 780 | 781 | 782 | ## 转换为mp3 783 | 784 | 到这里,电脑上已经可以发音了。但手机只支持mp3格式,所以要将collection.media中的wav和spx转换为mp3。 785 | 786 | 使用pydub+ffmpeg 787 | 788 | 参考[Pydub ](https://github.com/jiaaro/pydub/) 789 | 790 | 791 | ```python 792 | from pydub import AudioSegment 793 | import glob 794 | ``` 795 | 796 | 797 | ```python 798 | def convert_to_mp3(): 799 | owd = os.getcwd() 800 | os.chdir(anki_media_collection) 801 | extension_list = ('*.wav', '*.spx') 802 | for extension in extension_list: 803 | for audio in glob.glob(extension): 804 | mp3_filename = os.path.splitext(os.path.basename(audio))[0] + '.mp3' 805 | if not os.path.isfile(mp3_filename): 806 | AudioSegment.from_file(audio).export(mp3_filename, format='mp3') 807 | os.chdir(owd) 808 | ``` 809 | 810 | 811 | ```python 812 | convert_to_mp3() 813 | ``` 814 | 815 | 816 | ```python 817 | def modify_audio_pointer(): 818 | path_to_usage_d = path_to_pron = [('all','',False), ('key','usages',False), ('all','',False)] 819 | for usage_d, in iter_through_general(new3000_base_d, path_to_usage_d): 820 | old_audio_name = usage_d['audio'] 821 | if old_audio_name != '': 822 | new_audio_name = os.path.splitext(os.path.basename(old_audio_name))[0] + '.mp3]' 823 | usage_d['audio'] = new_audio_name 824 | ``` 825 | 826 | 827 | ```python 828 | modify_audio_pointer() 829 | ``` 830 | 831 | 832 | ```python 833 | # test 834 | #iter_print(new3000_base_d['chaperone']) 835 | ``` 836 | 837 | ## 添加笔记 
838 | 839 | 840 | ```python 841 | old_anki_GreWord_file_name = 'old_anki_greword.txt' 842 | ``` 843 | 844 | 845 | ```python 846 | def add_mynotes(): 847 | if not os.path.isfile(old_anki_GreWord_file_name): 848 | return 849 | old_data_line_l = codecs_open_r_utf8(old_anki_GreWord_file_name).split('\n') 850 | for line in old_data_line_l: 851 | field_l = line.split('\t') 852 | word = field_l[1] 853 | usage_index = int(field_l[2]) 854 | my_note = field_l[6] 855 | if my_note != '': 856 | new3000_base_d[word]['usages'][usage_index-1]['mynotes'] = my_note 857 | ``` 858 | 859 | 860 | ```python 861 | add_mynotes() 862 | ``` 863 | 864 | ## 添加形近词 865 | 866 | 第一遍背诵时,由于不会精确的记忆词形,经常出现各种误识别。所以加入一个环节,根据[Wagner–Fischer算法](https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm)自动生成形近词,并将形近词的简单中文释义添加到其后。计算单词的Levenshtein distance时,使用的别人写好的[算法](https://gist.github.com/kylebgorman/8034009),并没做优化,所以可能慢一些。另外,也没有缓存距离,即计算完A、B间的距离后,B、A间还会再计算一次。 867 | 868 | 由于ipython执行multiprocess有bug,[multiprocessing](https://docs.python.org/2/library/multiprocessing.html)提到: 869 | 870 | This means that some examples, such as the Pool examples will not work in the interactive interpreter. 871 | 872 | 所以,具体程序参见脚本`add_similar_word.py`。该脚本生成`similar_word.txt`,里面有需要的数据。 873 | 874 | 875 | ```python 876 | with open('similar_word.txt') as f: 877 | similar_word_l = json.load(f) 878 | ``` 879 | 880 | 881 | ```python 882 | similar_word_d = {pair[0]:pair[1] for pair in similar_word_l} 883 | ``` 884 | 885 | 886 | ```python 887 | def add_similar_word_to_new3000_base_d(): 888 | for word in similar_word_d: 889 | new3000_base_d[word]['similar_form'] = similar_word_d[word] 890 | ``` 891 | 892 | ## 生成文件 893 | 894 | 895 | ```python 896 | greword_import_data_l = convert_to_GreWord() 897 | with codecs.open(output_file_GreWord, 'w', encoding='utf-8') as f: 898 | for one_line in greword_import_data_l: 899 | one_string = u'\t'.join(one_line) + '\n' 900 | f.write(one_string) 901 | ``` 902 | 903 | 904 | ```python 905 | # test 906 | #iter_print(new3000_base_d['hike']) 907 | ``` 908 | 909 | # GrePhrase 910 | 911 | 912 | ```python 913 | %%sync_to_file $configCreAnkiImpGrePhrase 914 | file_name_duanyu = 'duanyu_base_d.txt' 915 | duanyu_base_d = is_file_and_json_load(file_name_duanyu) 916 | output_file_GrePhrase = 'AnkiImportData_GrePhrase.txt' 917 | ``` 918 | 919 | 920 | ```python 921 | print 'The structure of duanyu_base_d' 922 | pprint(duanyu_base_d['under one\'s control1']) 923 | ``` 924 | 925 | The structure of duanyu_base_d 926 | {u'cn_exp': u'\u5728\u2026\u2026\u7684\u63a7\u5236\u4e4b\u4e0b', 927 | u'en_exp': u'If something is **under** your **control**, you have the **power to make** all the important **decisions** about the way that it is run.', 928 | u'example': u'The current protest doesn\u2019t look likely to be brought under government\u2019s control any time soon.', 929 | u'gre_example_cn': u'\u5f53\u5fb7\u514b\u8428\u65af\u5dde\u8fd8\u5904\u4e8e\u58a8\u897f\u54e5\u7684\u7ba1\u8f96\u4e2d\u65f6\uff0c\u5c3d\u7ba1\u58a8\u897f\u54e5\u653f\u5e9c\u6781\u529b\u529d\u963b\u6765\u81ea\u7f8e\u56fd\u7684\u79fb\u6c11\uff0c\u5fb7\u5dde\u7684\u4eba\u53e3\u8fd8\u662f\u7ffb\u4e86\u4e24\u756a\u3002', 930 | u'gre_example_en': u'While Texas was under Mexican control, the population of Texas quadrupled, in spite of the fact that Mexico discouraged immigration from the United States.', 931 | u'phrase': u"under one's control", 932 | u'pos': 7, 933 | u'usage_index': u'1'} 934 | 935 | 936 | 937 | ```python 938 | %%sync_to_file $configCreAnkiImpGrePhrase 939 | def 
convert_to_GrePhrase(): 940 | with codecs.open(output_file_GrePhrase, 'w', encoding='utf-8') as f: 941 | my_notes = '' 942 | for phrase_uid, phrase_dict in duanyu_base_d.iteritems(): 943 | one_line = [phrase_uid, phrase_dict['phrase'], phrase_dict['usage_index'], my_notes, 944 | phrase_dict['en_exp'], phrase_dict['cn_exp'], 945 | phrase_dict['example'], phrase_dict['gre_example_cn'], 946 | phrase_dict['gre_example_en']] 947 | one_line = '\t'.join(one_line) + '\n' 948 | f.write(one_line) 949 | ``` 950 | 951 | 952 | ```python 953 | convert_to_GrePhrase() 954 | ``` 955 | 956 | 957 | ```python 958 | %%sync_to_file $file_name_grephrase -p 959 | 960 | if __name__ == '__main__': 961 | if not (duanyu_base_d is None): 962 | convert_to_GrePhrase() 963 | ``` 964 | 965 | 966 | ```python 967 | 968 | ``` 969 | 970 | 971 | ```python 972 | ! jupyter nbconvert anki_import.ipynb --to markdown 973 | ! jupyter nbconvert anki_import.ipynb -- to html 974 | ``` 975 | 976 | [NbConvertApp] WARNING | Collisions detected in jupyter_nbconvert_config.py and jupyter_nbconvert_config.json config files. jupyter_nbconvert_config.json has higher priority: { 977 | "Exporter": { 978 | "template_path": "['.', 'C:\\\\Users\\\\xiaohang\\\\AppData\\\\Roaming\\\\jupyter\\\\templates'] ignored, using [u'C:\\\\Users\\\\xiaohang\\\\AppData\\\\Roaming\\\\jupyter\\\\templates']" 979 | } 980 | } 981 | C:\Users\xiaohang\Anaconda\lib\site-packages\IPython\nbconvert.py:13: ShimWarning: The `IPython.nbconvert` package has been deprecated. You should import from ipython_nbconvert instead. 982 | "You should import from ipython_nbconvert instead.", ShimWarning) 983 | [NbConvertApp] Converting notebook anki_import.ipynb to markdown 984 | [NbConvertApp] Writing 30598 bytes to anki_import.md 985 | [NbConvertApp] WARNING | Collisions detected in jupyter_nbconvert_config.py and jupyter_nbconvert_config.json config files. jupyter_nbconvert_config.json has higher priority: { 986 | "Exporter": { 987 | "template_path": "['.', 'C:\\\\Users\\\\xiaohang\\\\AppData\\\\Roaming\\\\jupyter\\\\templates'] ignored, using [u'C:\\\\Users\\\\xiaohang\\\\AppData\\\\Roaming\\\\jupyter\\\\templates']" 988 | } 989 | } 990 | [NbConvertApp] WARNING | pattern u'to' matched no files 991 | [NbConvertApp] WARNING | pattern u'html' matched no files 992 | C:\Users\xiaohang\Anaconda\lib\site-packages\IPython\nbconvert.py:13: ShimWarning: The `IPython.nbconvert` package has been deprecated. You should import from ipython_nbconvert instead. 
993 | "You should import from ipython_nbconvert instead.", ShimWarning) 994 | [NbConvertApp] Converting notebook anki_import.ipynb to html 995 | [NbConvertApp] Writing 298264 bytes to anki_import.html 996 | 997 | -------------------------------------------------------------------------------- /convert_duanyu.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import re 3 | import json 4 | import codecs 5 | import functools 6 | import os.path 7 | from random import random 8 | from random import randint 9 | from pprint import pprint 10 | from copy import deepcopy 11 | 12 | from my_helpers import * 13 | file_duanyu = "base_data\GREGao Fen Bi Bei Duan Yu Da Pe - Yan Yu Zhen ,Gao Yu ,Chen Qi.txt" 14 | def extract_dy_unit(base_str): 15 | base_str = base_str.split(u"# 索引\n")[0] 16 | match_dy_unit_start_re = re.compile(ur'^# Unit \d+', re.M) 17 | base_unit_str_l = extract_content_between(base_str, match_dy_unit_start_re) 18 | base_unit_str_l = [base_unit_str.split(u'## 检测练习\n')[0] for base_unit_str in base_unit_str_l] 19 | return base_unit_str_l 20 | def extract_dy_index_content(base_str): 21 | match_dy_index_cn_start_re = re.compile(u' (?=[\u4e00-\u9fa5\u3010])') 22 | index_str = base_str.split(u"# 索引\n")[1] 23 | index_d = {} 24 | for line_str in index_str.split('\n'): 25 | if line_str == '': 26 | continue 27 | line_str = strF2H(line_str) 28 | en_cn = match_dy_index_cn_start_re.split(line_str) 29 | if len(en_cn) == 2: 30 | index_d[en_cn[0]] = en_cn[1] 31 | else: 32 | print 'Warning, no en or no cn:', en_cn 33 | return index_d 34 | def extract_dy_phrase_d(base_unit_str_l): 35 | base_phrase_d = {} 36 | for unit_index, base_unit_str in enumerate(base_unit_str_l): 37 | match_phrase_start_re = re.compile(ur'^\*\*([a-z].*?)([\u3000\u4e00-\u9fa5].*)?\*\*$', 38 | re.M|re.I) 39 | phrase_block_str_l = extract_content_between(base_unit_str, match_phrase_start_re) 40 | for phrase_block_str in phrase_block_str_l: 41 | match_result = match_phrase_start_re.match(phrase_block_str) 42 | if match_result is None: 43 | print phrase_block_str 44 | phrase_en = match_result.group(1) 45 | phrase_exp_cn = match_result.group(2) 46 | if phrase_exp_cn is None: 47 | phrase_exp_cn = '' 48 | else: 49 | phrase_exp_cn = phrase_exp_cn.strip(u'\u3000 ') 50 | phrase_block_str = phrase_block_str[match_result.end():].strip('\n ') 51 | base_phrase_d[phrase_en] = {'exp_cn': phrase_exp_cn, 52 | 'phrase_block_str': phrase_block_str, 53 | 'pos': unit_index} 54 | return base_phrase_d 55 | def process_dy_phrase_block_str(base_d): 56 | processed_phrase_d = {} 57 | for phrase, base_phrase_d in base_d.iteritems(): 58 | phrase_block_str = base_phrase_d['phrase_block_str'] 59 | has_multiple_cn_exp = base_phrase_d['exp_cn'] == '' 60 | match_dy_multi_cn_exp_start_re = re.compile(ur'^\*\*\d+\. 
(.*)\*\*$', re.M) 61 | if has_multiple_cn_exp: 62 | exp_cn_l = match_dy_multi_cn_exp_start_re.findall(phrase_block_str) 63 | phrase_block_str_l = extract_content_between(phrase_block_str, 64 | match_dy_multi_cn_exp_start_re) 65 | else: 66 | exp_cn_l = [base_phrase_d['exp_cn']] 67 | phrase_block_str_l = [phrase_block_str] 68 | 69 | match_en_exp_re = re.compile(ur'^\*\*释\*\* (.*)$', re.M) 70 | match_example_re = re.compile(ur'^\*\*例\*\* (.*)$', re.M) 71 | match_gre_example = re.compile(ur'\*\*题\*\* (.*)$', re.S) 72 | 73 | for usage_index, phrase_block_str in enumerate(phrase_block_str_l): 74 | 75 | phrase_detailed_d = {} 76 | exp_en = match_en_exp_re.search(phrase_block_str).group(1) 77 | example = match_example_re.search(phrase_block_str).group(1) 78 | gre_example_en_cn = match_gre_example.search(phrase_block_str).group(1).split('\n') 79 | gre_example_en = gre_example_en_cn[0] 80 | gre_example_cn = gre_example_en_cn[2] 81 | phrase_detailed_d = {'en_exp': exp_en, 82 | 'cn_exp': exp_cn_l[usage_index], 83 | 'example': example, 84 | 'gre_example_en': gre_example_en, 85 | 'gre_example_cn': gre_example_cn, 86 | 'pos': base_phrase_d['pos'], 87 | 'usage_index': unicode(usage_index + 1), 88 | 'phrase': phrase 89 | } 90 | phrase_uid = phrase + unicode(usage_index+1) 91 | processed_phrase_d[phrase_uid] = phrase_detailed_d 92 | return processed_phrase_d 93 | def main(file_name=None): 94 | if file_name is None: 95 | file_name = file_duanyu 96 | # for module call 97 | if not os.path.isfile(file_name): 98 | return 99 | dy_base_str = codecs_open_r_utf8(file_duanyu) 100 | match_escape_char_re = re.compile(r'\\(?=[\[\]()*+])') 101 | dy_base_str = match_escape_char_re.sub('', dy_base_str) 102 | dy_base_unit_str_l = extract_dy_unit(dy_base_str) 103 | dy_index_d = extract_dy_index_content(dy_base_str) 104 | dy_phrase_d = extract_dy_phrase_d(dy_base_unit_str_l) 105 | # revise ’' 106 | dy_phrase_d['under one\'s control'] = dy_phrase_d[u'under one’s control'] 107 | dy_phrase_d['on one\'s own'] = dy_phrase_d[u'on one’s own'] 108 | del dy_phrase_d[u'under one’s control'], dy_phrase_d[u'on one’s own'] 109 | dy_phrase_processed_d = process_dy_phrase_block_str(dy_phrase_d) 110 | with codecs.open('duanyu_base_d.txt', 'w', encoding='utf-8') as f: 111 | json.dump(dy_phrase_processed_d, f) 112 | if __name__ == '__main__': 113 | main() 114 | -------------------------------------------------------------------------------- /convert_new3000.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import re 3 | import json 4 | import codecs 5 | import functools 6 | import os.path 7 | from random import random 8 | from random import randint 9 | from pprint import pprint 10 | from copy import deepcopy 11 | 12 | from my_helpers import * 13 | file_new_3000 = "base_data\GREHe Xin Ci Hui Kao Fa Jing Xi (Xin Dong Fang Da Yu Ying Yu Xue Xi Cong Shu ) - Chen Qi.txt" 14 | match_new3000_list_start_re = re.compile(ur'^# List \d+', re.M) 15 | def strip_last_list(list_data): 16 | strip_start_re = re.compile(ur'# Word List 1 与说有关的词根构成的单词(.|\n)*$') 17 | return strip_start_re.sub('', list_data) 18 | match_unit_start_re = re.compile(ur'^## Unit \d+', re.M) 19 | match_word_block_start = re.compile(ur'^\*\*(?P[a-z\-éï]+)\*\*(?P[.+])?', re.U|re.M) 20 | # phon represent phonetic symbol 21 | def get_word_of_one_unit(unit_block_str, list_index, unit_index): 22 | returned_words_d_d = {} 23 | word_block_str_l = extract_content_between(unit_block_str, match_word_block_start) 24 | for word_block_str in 
word_block_str_l: 25 | first_line_match = match_word_block_start.match(word_block_str) 26 | word = first_line_match.group('word') 27 | phon = first_line_match.group('phon') 28 | one_word_d = {'word_block_str': match_word_block_start.sub('', word_block_str), 29 | 'phon': strF2H(phon) if phon else u'', 30 | 'pos':(list_index, unit_index)} 31 | returned_words_d_d[word] = one_word_d 32 | return returned_words_d_d 33 | def get_new3000_base_d(base_unit_data_l_l): 34 | _new3000_base_d = {} 35 | for list_index, unit_data_l in enumerate(base_unit_data_l_l): 36 | for unit_index, unit_data in enumerate(unit_data_l): 37 | _new3000_base_d.update(get_word_of_one_unit(unit_data, list_index+1, unit_index+1)) 38 | return _new3000_base_d 39 | # revise 40 | def revise_word_base_data(word_d): 41 | # revise anarchist 42 | word_block_str = 'word_block_str' 43 | to_revise_word_d = word_d['anarchist'] 44 | to_revise_str = to_revise_word_d[word_block_str] 45 | to_revise_word_d[word_block_str] = to_revise_str.replace(u'同', u'近') 46 | # revise compliment 47 | to_revise_word_d = word_d['compliment'] 48 | to_revise_str = to_revise_word_d[word_block_str] 49 | to_revise_word_d['phon'] = [strF2H(phon) for phon in re.findall(ur'[.+?]', to_revise_str)] 50 | to_revise_word_d[word_block_str] = '\n'.join(to_revise_str.split('\n')[1:]) 51 | # reviseantediluvian, revise anecdote 52 | for to_revise_word in ['antediluvian', 'anecdote']: 53 | to_revise_word_d = word_d[to_revise_word] 54 | to_revise_str = to_revise_word_d[word_block_str] 55 | temp_index = 0 56 | for match_result in re.finditer(ur'\n\n', to_revise_str): 57 | if temp_index == 2: 58 | to_revise_str = to_revise_str[0:match_result.start()] + u'‖' + to_revise_str[match_result.end():] 59 | break 60 | temp_index += 1 61 | to_revise_word_d[word_block_str] = to_revise_str 62 | return word_d 63 | character_start = {'examples': '例', 64 | 'syns': '近', 65 | 'ants': '反', 66 | 'der': '派'} 67 | is_str_start_with_character_fun_d = {} 68 | for key, value in character_start.iteritems(): 69 | def gen_match_fun_closure(_value): 70 | return lambda s: s[0] == _value.decode('utf-8') 71 | is_str_start_with_character_fun_d[key] = gen_match_fun_closure(value) 72 | def revise_entry_name(words_d): 73 | # revise random 74 | words_d['random']['word_block_str'] = words_d['random']['word_block_str'].replace(u'例 aimless', 75 | u'近 aimless') 76 | # revise sordid 77 | words_d['sordid']['word_block_str'] = words_d['random']['word_block_str'].replace(u'近 Behind his generous', 78 | u'例 Behind his generous') 79 | # revise clan 80 | words_d['clan']['word_block_str'] = words_d['clan']['word_block_str'] .replace(u'反 clannish', 81 | u'派 clannish') 82 | match_usage_start_re = re.compile(ur'^【考(?:法|点)\d?】(.*)$', re.M|re.U) 83 | match_der = re.compile(ur'^') 84 | def wb_str_2_usages_d_l(word_block_str): 85 | ''' 86 | convert word block (string) to usages like structure 87 | input: the 'word_block_str' attribute of a word dictionary 88 | return: two lists, 89 | the first with its 'i'th element indicating whether 90 | the 'i'th usage has a complex der 91 | the second is the list of usages 92 | ''' 93 | usage_template = {'exp': '', 94 | 'examples': '', 95 | 'syns': '', 96 | 'ants': '', 97 | 'der': ''} 98 | usages_str_l = extract_content_between(word_block_str, match_usage_start_re) 99 | usages_d_l = [] 100 | is_complex_der_l = [] 101 | 102 | for one_usage_str in usages_str_l: 103 | one_usage_d = deepcopy(usage_template) 104 | is_complex_der = False 105 | has_der = False 106 | one_usage_lines = 
one_usage_str.split('\n') 107 | one_usage_d['exp'] = match_usage_start_re.match(one_usage_lines[0]).group(1) 108 | 109 | for line in one_usage_lines[1:]: 110 | has_been_matched = False 111 | 112 | if line == '' or line == '\n': 113 | continue 114 | # match "例" "反", etc. 115 | for field_name, match_func in is_str_start_with_character_fun_d.iteritems(): 116 | if match_func(line): 117 | has_been_matched = True 118 | if has_der: 119 | one_usage_d['der'] += '\n' + line.strip() 120 | is_complex_der = True 121 | else: 122 | # test 123 | if one_usage_d[field_name] != '': 124 | print '****Multi line field!****' 125 | print word_block_str 126 | pass 127 | one_usage_d[field_name] = line.strip() 128 | if field_name == 'der': 129 | # test 130 | if has_der: 131 | # print 'Warning! der in der!' 132 | # print one_usage_str 133 | pass 134 | has_der = True 135 | break 136 | if not has_been_matched: 137 | # after printed out, it can be seen that these lines are all aphorisms 138 | # so, useless for our purpose 139 | #print line 140 | pass 141 | usages_d_l.append(one_usage_d) 142 | is_complex_der_l.append(is_complex_der) 143 | return is_complex_der_l, usages_d_l 144 | def gen_usages_for_all_words(words_d): 145 | match_der_word = re.compile(ur'^派 ([a-z,/\-éï]+)', re.M) 146 | complex_ders_d = {} 147 | for word in words_d: 148 | if words_d[word]['word_block_str'] == '': 149 | print 'Empty word:', word 150 | continue 151 | is_complex_der_l, words_d[word]['usages'] = wb_str_2_usages_d_l(words_d[word]['word_block_str']) 152 | if True in is_complex_der_l: 153 | for i, one_usage in enumerate(words_d[word]['usages']): 154 | # revise plumb 155 | if i == 2 and word == u'plumb': 156 | one_usage['example'] = one_usage['der'] 157 | one_usage['der'] = '' 158 | continue 159 | if is_complex_der_l[i]: 160 | whole_der_block_str = strF2H(one_usage['der']) 161 | der_block_str_l = extract_content_between(whole_der_block_str, match_der_word) 162 | for der_block_str in der_block_str_l: 163 | # revise daunt 164 | if word == 'daunt': 165 | der_block_str = der_block_str.replace(', ', '/') 166 | der_word = match_der_word.match(der_block_str).group(1) 167 | der_block_str = match_der_word.sub(ur'【考法】', der_block_str) 168 | complex_ders_d[der_word] = {} 169 | _, complex_ders_d[der_word]['usages'] = wb_str_2_usages_d_l(der_block_str) 170 | if len(complex_ders_d[der_word]['usages']) != 1: 171 | print 'Warning! Not unqiue explanation!' 172 | continue 173 | complex_ders_d[der_word]['usages'][0]['der'] = u'源 ' + word 174 | complex_ders_d[der_word]['phon'] = u'' 175 | complex_ders_d[der_word]['pos'] = words_d[word]['pos'] 176 | complex_ders_d[der_word]['word_block_str'] = u'' 177 | # test 178 | #print der_word 179 | #iter_print(complex_ders_d[der_word]['usages']) 180 | #del words_d[word]['word_block_str'] 181 | return complex_ders_d, words_d 182 | match_phon_re = re.compile(ur'[.*]', re.U) 183 | match_pspeech_re = re.compile(ur'\*([a-z\/.]+\.)\*') 184 | has_cn_char_fun = lambda _str: re.compile(ur'[\u4e00-\u9fa5]').search(_str) is not None 185 | def process_exp(exp_field_str): 186 | ''' 187 | input: a unicode object corresponding the explanation line of the word 188 | return: dict {exp, pspeech, ph_symbl} 189 | ''' 190 | if exp_field_str == '': 191 | print 'Warning! No explanation!' 
192 | return 193 | returned_d = {'exp': {'en': '', 'cn': '', 'en_cn': ''}, 194 | 'pspeech': '', 195 | 'ph_symbl': ''} 196 | 197 | result = match_pspeech_re.search(exp_field_str) 198 | if result: 199 | returned_d['pspeech'] = result.group(1) 200 | exp_field_str = match_pspeech_re.sub('', exp_field_str, 1) 201 | 202 | result = match_phon_re.search(exp_field_str) 203 | if result: 204 | returned_d['ph_symbl'] = result.group() 205 | exp_field_str = match_phon_re.sub('', exp_field_str, 1).strip() 206 | 207 | returned_d['exp']['en_cn'] = exp_field_str.strip() 208 | 209 | # seperate en and cn 210 | spered_str_l = [_str.strip() for _str in strF2H(exp_field_str).split(u':')] 211 | seperator_count = len(spered_str_l) - 1 212 | if seperator_count == 0: 213 | # test whether no seperator guarantees no chinese explanation 214 | # print 'No sep', spered_str_l 215 | returned_d['exp']['cn'] = spered_str_l[0] 216 | elif seperator_count == 1: 217 | returned_d['exp']['cn'], returned_d['exp']['en'] = spered_str_l 218 | elif seperator_count == 2: 219 | # test 220 | # print 'Two sep: ', spered_str_l 221 | has_char_cn_boo_l = map(has_cn_char_fun, spered_str_l) 222 | returned_d['exp']['cn'] = u':'.join([spered_str_l[i] for i in range(seperator_count+1) if has_char_cn_boo_l[i]]) 223 | returned_d['exp']['en'] = u':'.join([spered_str_l[i] for i in range(seperator_count+1) if not has_char_cn_boo_l[i]]) 224 | # test 225 | #iter_print(returned_d['exp']) 226 | else: 227 | # test 228 | #print 'More than two sep: ', exp_field_str 229 | pass 230 | return returned_d 231 | def process_exp_field_for_all_words(words_d): 232 | for word, usage_index, exp_str in iter_value_of_key_through_d_l_d_d(words_d, 'usages', 'exp', 233 | yield_top_key=True, yield_list_index=True): 234 | base_exp_d = None 235 | # get base_exp_d 236 | # revise abuse 237 | if word == 'abuse' and usage_index == 1: 238 | exp_str_l = exp_str.split(';') 239 | base_exp_d, extra_exp_d = map(process_exp, exp_str_l) 240 | base_exp_d['exp']['en'] = base_exp_d['exp']['en'] + ';' + extra_exp_d['exp']['en'] 241 | base_exp_d['exp']['cn'] = base_exp_d['exp']['cn'] + ';' + extra_exp_d['exp']['cn'] 242 | # test 243 | #iter_print(base_exp_d) 244 | 245 | # revise disaffected 246 | if word == 'disaffect': 247 | base_exp_d = process_exp(exp_str.split(';')[0]) 248 | # test 249 | #iter_print(base_exp_d) 250 | 251 | else: 252 | base_exp_d = process_exp(exp_str) 253 | 254 | # get phonic symbol from parent field 255 | if base_exp_d['ph_symbl'] == u'': 256 | # revise compliment 257 | if word == 'compliment': 258 | if usage_index == 0: 259 | base_exp_d['ph_symbl'] = 'n. ' + words_d[word]['phon'][0] + \ 260 | ' v. 
' + words_d[word]['phon'][1] 261 | else: 262 | base_exp_d['ph_symbl'] = words_d[word]['phon'][0] 263 | else: 264 | # test 265 | if usage_index > 2: 266 | #print word 267 | pass 268 | base_exp_d['ph_symbl'] = words_d[word]['phon'] 269 | one_usage = words_d[word]['usages'][usage_index] 270 | one_usage['ph_symbl'] = base_exp_d['ph_symbl'] 271 | del base_exp_d['ph_symbl'] 272 | one_usage['pspeech'] = base_exp_d['pspeech'] 273 | del base_exp_d['pspeech'] 274 | one_usage['exp_d'] = base_exp_d['exp'] 275 | return words_d 276 | match_all_cn_re = ur' ?[a-z0-9:。;,“”()、?《》]*?[\u4e00-\u9fa5]+.*?(?=$|[a-z]+ [a-z]+)' 277 | match_all_cn_re = re.compile(match_all_cn_re, re.I) 278 | match_cn_punc_with_en_char_fun = lambda _str: re.search(ur'[。?]( )?(?=[a-z])', _str, re.I) 279 | match_cn_char_with_en_char_fun = lambda _str: re.search(ur'[\u4e00-\u9fa5](?=[a-z])', _str, re.I) 280 | # revise 281 | def revise_no_sep(words_d): 282 | path_to_example = [('all', '', True), ('key', 'usages', False), ('all','',True),('key','examples',False)] 283 | example_iter = iter_through_general(words_d, path_to_example) 284 | for word, usage_index, example_str in example_iter: 285 | if example_str == '': 286 | continue 287 | example_str = example_str[2:] 288 | if u'\u2016' not in example_str: 289 | results = match_all_cn_re.findall(example_str) 290 | if len(results) > 1: 291 | index_to_add_sep = None 292 | one_result = match_cn_punc_with_en_char_fun(example_str) 293 | if one_result: 294 | index_to_add_sep = one_result.end() 295 | elif word in [u'heckle', u'carefree']: 296 | one_result = match_cn_char_with_en_char_fun(example_str) 297 | index_to_add_sep = one_result.end() 298 | elif word == 'clarify': 299 | example_str = example_str.replace(u';', u'\u2016') 300 | if index_to_add_sep: 301 | example_str = example_str[:index_to_add_sep] + u'\u2016' + example_str[index_to_add_sep:] 302 | words_d[word]['usages'][usage_index]['examples'] = u'例 ' + example_str 303 | return words_d 304 | match_sentence_en_part_re = re.compile(ur'[a-z0-9éï\'";:,?!%()$ⅠⅡ.*/\- — ‘’“”()]+(?=[<《〈\u4e00-\u9fa5])', re.I) 305 | def sep_en_cn_sentence(sentences_str): 306 | if sentences_str == '': 307 | return '', '', '', 308 | sentences_str = sentences_str[2:].replace(u'\!', u'!') 309 | is_number_fun = lambda _str: re.match('\d', _str) 310 | en_str_l = [] 311 | cn_str_l = [] 312 | en_cn_str_l= [] 313 | for sentence in sentences_str.split(u'\u2016'): 314 | sentence = sentence.strip(u'  \n') 315 | en_cn_str_l.append(sentence) 316 | result = match_sentence_en_part_re.match(sentence) 317 | if result: 318 | en_str = result.group() 319 | # test 320 | if not (en_str[-1] in [' ', '.', u')', u'”']): 321 | if en_str[-1] == u'“': 322 | #print en_str 323 | en_str = en_str[:-1] 324 | #print en_str 325 | elif is_number_fun(en_str[-1]) or (en_str[-2:] in ['RE', 'IT', 'on', 'NA']): 326 | #print en_str 327 | last_blank_space = len(en_str) - 1 328 | while en_str[last_blank_space] != ' ': 329 | last_blank_space -= 1 330 | en_str = en_str[:last_blank_space] 331 | #print en_str 332 | elif en_str[-2:] == u'“‘': 333 | #print en_str 334 | en_str = en_str[:-2] 335 | #print en_str 336 | else: 337 | #print en_str 338 | #print sentence 339 | pass 340 | en_str_l.append(strF2H(en_str).strip()) 341 | cn_str_l.append(sentence.replace(en_str, '')) 342 | else: 343 | print sentence 344 | raise ValueError('Warning! 
No en part!') 345 | return new_line_join(en_str_l), new_line_join(cn_str_l), new_line_join(en_cn_str_l) 346 | def process_examples(words_d): 347 | path_to_example = [('all', '', True), ('key', 'usages', False), ('all','',True),('key','examples',False)] 348 | example_iter = iter_through_general(words_d, path_to_example) 349 | for word, usage_index, example_str in example_iter: 350 | examples_en, examples_cn, examples_encn = sep_en_cn_sentence(example_str) 351 | words_d[word]['usages'][usage_index]['examples_d'] = {'en': examples_en, 'cn': examples_cn, 'en_cn': examples_encn} 352 | return words_d 353 | match_ants_en_part_re = re.compile(ur'[a-zéï][a-zéï ,-/]+(?=[ \u4e00-\u9fa5(]|$)', re.I) 354 | def sep_en_cn_ants(ants_str): 355 | if ants_str == '': 356 | return '', '', '', 0 357 | ants_str = ants_str[2:] 358 | num_ants_of_explanations = 0 359 | en_str_l = match_ants_en_part_re.findall(ants_str) 360 | num_ants_of_explanations = len(en_str_l) 361 | # test 362 | if num_ants_of_explanations == 0: 363 | print 'Warning! No en part!', ants_str 364 | cn_str = match_ants_en_part_re.sub('', ants_str).strip(' \n') 365 | search_en_fun = lambda _str: re.search(r'[a-z]', _str, re.I) 366 | if search_en_fun(cn_str): 367 | print 'Warning! en in cn part!', cn_str 368 | en_cn = ants_str.strip(' \n') 369 | return '; '.join(en_str_l), cn_str, en_cn, num_ants_of_explanations 370 | def process_all_ants(words_d): 371 | path_to_ants = [('all','',True),('key','usages',False),('all','',True),('key','ants',False)] 372 | ants_iter = iter_through_general(words_d, path_to_ants) 373 | for word, usage_index, ant_str in ants_iter: 374 | en_str, cn_str, en_cn_str, num_exps = sep_en_cn_ants(ant_str) 375 | words_d[word]['usages'][usage_index]['ants_d'] = {'en': en_str, 'cn': cn_str, 'en_cn': en_cn_str} 376 | # test 377 | if num_exps > 1: 378 | #print word 379 | pass 380 | return words_d 381 | strip_first_two_chars_fun = lambda _str: _str[2:] 382 | def process_all_syns(words_d): 383 | path_to_syns = [('all','',True),('key','usages',False),('all','',True),('key','syns',False)] 384 | for word, usage_index, syns_str in iter_through_general(words_d, path_to_syns): 385 | usage_d = words_d[word]['usages'][usage_index] 386 | usage_d['syns'] = strip_first_two_chars_fun(syns_str) 387 | return words_d 388 | def supplement_word_ph_symbl(words_d): 389 | path_to_phsymb = [('all','',True),('key','usages',False),('all','',True),('key','ph_symbl',False)] 390 | for word, usage_index, ph_symbl in iter_through_general(words_d, path_to_phsymb): 391 | usage_d = words_d[word]['usages'][usage_index] 392 | if usage_d['ph_symbl'] == '': 393 | cur_pspeech = usage_d['pspeech'] 394 | if usage_index == 0: 395 | # uncommend print if you want to check 396 | #print 'Word %s has no phonetic symbol, maybe it is a derivative.'%word 397 | continue 398 | pre_usage_d = words_d[word]['usages'][usage_index-1] 399 | pre_pspeech = pre_usage_d['pspeech'] 400 | pre_phsymbl = pre_usage_d['ph_symbl'] 401 | if pre_pspeech != cur_pspeech: 402 | if not cur_pspeech.startswith('v'): 403 | # already check the v. vi. vt. case 404 | print 'Previous pspeech is different. Please check! 
Word %s'%word 405 | iter_print(usage_d) 406 | continue 407 | usage_d['ph_symbl'] = pre_phsymbl 408 | return words_d 409 | def main(file_name=None): 410 | if file_name is None: 411 | file_name = file_new_3000 412 | # for module call 413 | if not os.path.isfile(file_name): 414 | return 415 | new3000_base_str = codecs_open_r_utf8(file_new_3000) 416 | new3000_base_list_data_l = extract_content_between(new3000_base_str, match_new3000_list_start_re) 417 | new3000_base_list_data_l[30] = strip_last_list(new3000_base_list_data_l[30]) 418 | new3000_base_unit_data_l_l = map(functools.partial(extract_content_between, 419 | match_re=match_unit_start_re), 420 | new3000_base_list_data_l) 421 | new3000_base_d = get_new3000_base_d(new3000_base_unit_data_l_l) 422 | # revise 423 | subset_to_revise_d = {word:deepcopy(new3000_base_d[word]) for word in ['anarchist', 'compliment', 'antediluvian', 'anecdote']} 424 | subset_to_revise_d = revise_word_base_data(subset_to_revise_d) 425 | new3000_base_d.update(subset_to_revise_d) 426 | del subset_to_revise_d, new3000_base_list_data_l, new3000_base_unit_data_l_l, new3000_base_str 427 | revise_entry_name(new3000_base_d) 428 | complex_ders_d, new3000_base_d = gen_usages_for_all_words(new3000_base_d) 429 | new3000_base_d.update(complex_ders_d) 430 | del complex_ders_d 431 | new3000_base_d = process_exp_field_for_all_words(new3000_base_d) 432 | new3000_base_d = revise_no_sep(new3000_base_d) 433 | new3000_base_d = process_examples(new3000_base_d) 434 | new3000_base_d['enfranchise']['usages'][1]['ants'] = new3000_base_d['enfranchise']['usages'][1]['ants'].replace(u'subdue; enthrall', u'subdue, enthrall') 435 | new3000_base_d = process_all_ants(new3000_base_d) 436 | new3000_base_d = process_all_syns(new3000_base_d) 437 | # revise compendium 438 | new3000_base_d['compendium']['usages'][1]['pspeech'] = 'n.' 
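    # Presumably the compendium fix above must run before supplement_word_ph_symbl,
    # since that function only copies the previous usage's phonetic symbol when the
    # parts of speech agree (or the current one is a verb form).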
439 | new3000_base_d = supplement_word_ph_symbl(new3000_base_d) 440 | with codecs.open('new3000_base_d.txt', 'w', encoding='utf-8') as f: 441 | json.dump(new3000_base_d, f) 442 | 443 | if __name__ == '__main__': 444 | main() -------------------------------------------------------------------------------- /convert_zhuji.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import re 3 | import json 4 | import codecs 5 | import functools 6 | import os.path 7 | from random import random 8 | from random import randint 9 | from pprint import pprint 10 | from copy import deepcopy 11 | 12 | from my_helpers import * 13 | file_zhuji = "base_data\GREHe Xin Ci Hui Zhu Ji Yu Jing - Cao Tian Cheng.txt" 14 | match_escape_char_re = re.compile(r'\\(?=[\[\]()*+])') 15 | match_zhuji_list_start_re = re.compile(ur'### List \d+', re.M) 16 | def get_etyma_block_d_l_l(list_data_l): 17 | match_etyma_block_start = re.compile(r'^\d+\.(.*)$\n|^Unit \d+$', re.M) 18 | etyma_block_d_l_l = [] 19 | for list_index, base_list_str in enumerate(list_data_l): 20 | if list_index > 38: 21 | break 22 | etyma_block_d_l = [] 23 | base_list_str = base_list_str.split(u'小结&复习')[0] 24 | etyma_block_str_l = extract_content_between(base_list_str, match_etyma_block_start) 25 | for etyma_index, etyma_block_str in enumerate(etyma_block_str_l): 26 | ety_str = match_etyma_block_start.search(etyma_block_str).group(1) 27 | if ety_str is None: 28 | ety_str = '' 29 | ety_str = ety_str.strip() 30 | if ety_str == u'其他': 31 | #print u'词根是其他' 32 | ety_str = '' 33 | if list_index == 36-1: 34 | ety_str = u'与动物有关的单词,' + ety_str 35 | ety_str = ety_str.strip() 36 | etyma_block_str_and_summary_str = etyma_block_str.split(u'小结') 37 | summary_str = etyma_block_str_and_summary_str[1] if len(etyma_block_str_and_summary_str) == 2 else '' 38 | etyma_block_str = match_etyma_block_start.sub('', etyma_block_str_and_summary_str[0]) 39 | # revise surg, cit 40 | if ety_str == 'surg, cit': 41 | temp_str_l = etyma_block_str.split('\n') 42 | #iter_print(temp_str_l) 43 | # insert line 5 after line 0 44 | modified_str_l = [temp_str_l[0], temp_str_l[5]] + temp_str_l[1:5] + temp_str_l[6:] 45 | etyma_block_str = '\n'.join(modified_str_l) 46 | #print etyma_block_str 47 | # revise rejoice 48 | if ety_str == u'欢乐与喜悦': 49 | temp_str_l = etyma_block_str.split('\n') 50 | #iter_print(temp_str_l) 51 | modified_str_l = [temp_str_l[0], temp_str_l[9]] + temp_str_l[1:9] 52 | etyma_block_str = '\n'.join(modified_str_l) 53 | #print etyma_block_str 54 | etyma_block_d = {'pos':(list_index+1, etyma_index+1), 55 | 'ety': ety_str, 56 | 'ety_block_str': etyma_block_str, 57 | 'summary': summary_str} 58 | etyma_block_d_l.append(etyma_block_d) 59 | etyma_block_d_l_l.append(etyma_block_d_l) 60 | return etyma_block_d_l_l 61 | def revise_miss_etyma(base_d_l_l): 62 | # revise list 25 etyma 3 revise tum 63 | base_d_l_l[25-1][3-1]['ety'] = 'tum' 64 | # revise list 5 etyma 4 revise post, pound 65 | base_d_l_l[5-1][4-1]['ety'] = 'post, pound' 66 | # revise list 6 etyma 7 revise vad, vag, ced 67 | base_d_l_l[6-1][7-1]['ety'] = 'vad, vag, ced' 68 | match_cognate_block_start_re = re.compile(ur'^([a-zéï-]+)(.*?)(\[.*\])$', re.M|re.I) 69 | def process_ety_block_str(base_d_l_l): 70 | path_to_ety_block_str = [('all','',True),('all','',True),('key','ety_block_str',False)] 71 | for list_index, ety_index, ety_block_str in iter_through_general(base_d_l_l, 72 | path_to_ety_block_str): 73 | etyma_block_d = base_d_l_l[list_index][ety_index] 74 | returned_l = 
extract_content_between(ety_block_str, match_cognate_block_start_re, True) 75 | ety_group_exp = returned_l.pop(0).strip() 76 | etyma_block_d['etyma_group_explanation'] = ety_group_exp 77 | etyma_block_d['cognate_block_str_l'] = returned_l 78 | # revise List 13, ety 3 revise scru 79 | def revise_scru(base_d_l_l): 80 | ''' 81 | please only call it one time 82 | or re-run the code cells starting from 83 | "zhuji_base_d_l_l = get_etyma_block_d_l_l(zhuji_base_list_l)" 84 | ''' 85 | to_revise_l = base_d_l_l[13-1][3-1]['cognate_block_str_l'] 86 | #iter_print(to_revise_l) 87 | # remove element 3-5 and build new dict 88 | new_l = to_revise_l[3:] 89 | to_revise_l[2] = to_revise_l[2].replace(u'以下的4个单词可以将scru按照读音联想成“四顾”,表示“ (顾虑地) 看”。', '') 90 | to_revise_l = to_revise_l[:3] 91 | new_ety = 'scru' 92 | new_ety_group_exp = u'将scru按照读音联想成“四顾”,表示“ (顾虑地) 看”' 93 | new_ety_d = {'cognate_block_str_l': new_l, 'pos': (13, 3), 94 | 'ety': new_ety, 95 | 'etyma_group_explanation': new_ety_group_exp, 96 | 'summary':'', 'ety_block_str':''} 97 | base_d_l_l[13-1].append(new_ety_d) 98 | def process_cognate_block(cognate_block_str): 99 | cognate_dict = {} 100 | cognate_lines_l = cognate_block_str.split('\n') 101 | first_line_match = match_cognate_block_start_re.match(cognate_lines_l.pop(0)) 102 | word = first_line_match.group(1) 103 | if (word == '') or (word is None): 104 | print 'Warning!' 105 | cognate_dict['word'] = word 106 | phon = first_line_match.group(3) 107 | cognate_dict['phon'] = phon if not (phon is None) else '' 108 | 109 | modified_cognate_lines_l = [] 110 | for cognate_line in cognate_lines_l: 111 | cognate_line = cognate_line.strip() 112 | if cognate_line == '': 113 | pass 114 | elif cognate_line.startswith(u'源'): 115 | # revise 源 116 | cognate_line = cognate_line.replace(u'源', u'[源]') 117 | # print cognate_line 118 | elif cognate_dict['word'] == u'facilitate': 119 | pass 120 | elif cognate_dict['word'] in ['jocular', 'jocund', 'jovial', 'rejoice']: 121 | pass 122 | elif cognate_line.startswith(u'以下两个单词中'): 123 | pass 124 | elif not cognate_line.startswith(u'['): 125 | # test 126 | print 'current line:', cognate_line, '\ncurrent block\n', cognate_block_str 127 | break 128 | else: 129 | pass 130 | modified_cognate_lines_l.append(cognate_line) 131 | cognate_dict['content'] = '\n'.join(modified_cognate_lines_l) 132 | return cognate_dict 133 | def process_all_cognate_block(base_data_d_l_l): 134 | base_word_d = {} 135 | path_to_cognate_block_str = [('all','',True),('all','',True), 136 | ('key','cognate_block_str_l',False),('all','',True)] 137 | for list_index, eytma_index, cognate_index, cognate_block_str in iter_through_general(base_data_d_l_l, 138 | path_to_cognate_block_str): 139 | one_word_d = process_cognate_block(cognate_block_str) 140 | word = one_word_d['word'] 141 | for _key in ['pos', 'ety', 'etyma_group_explanation', 'summary']: 142 | one_word_d[_key] = base_data_d_l_l[list_index][eytma_index][_key] 143 | one_word_d['pos'] = ', '.join([unicode(i) for i in one_word_d['pos']]) 144 | one_word_d['etyma_cognates_l'] = '' # waiting to be filled later 145 | if word in base_word_d: 146 | print 'Warning! 
word already exists!', word 147 | base_word_d[word] = one_word_d 148 | return base_word_d 149 | def add_etyma_cognates_l(base_word_d, base_d_l_l): 150 | path_to_etyma_d = [('all','',False),('all','',False)] 151 | for etyma_d, in iter_through_general(base_d_l_l, path_to_etyma_d): 152 | ety_str = etyma_d['ety'] 153 | ety_group_exp = etyma_d['etyma_group_explanation'] 154 | if ety_str != '' or ety_group_exp != '': 155 | if ety_str == '': 156 | # test 157 | print ety_group_exp 158 | etyma_cognates_l = [] 159 | for cognate_block_str in etyma_d['cognate_block_str_l']: 160 | word = match_cognate_block_start_re.match(cognate_block_str).group(1) 161 | etyma_cognates_l.append(word) 162 | for word in etyma_cognates_l: 163 | base_word_d[word]['etyma_cognates_l'] = ', '.join(etyma_cognates_l) 164 | def main(file_name=None): 165 | if file_name is None: 166 | file_name = file_zhuji 167 | # for module call 168 | if not os.path.isfile(file_name): 169 | return 170 | zhuji_base_str = codecs_open_r_utf8(file_zhuji) 171 | zhuji_base_str = match_escape_char_re.sub('', zhuji_base_str) 172 | zhuji_base_str = collapse_blank_line(zhuji_base_str) 173 | with codecs.open('temp_zhuji_base_str.txt', 'w', encoding='utf-8') as f: 174 | f.write(zhuji_base_str) 175 | zhuji_base_str = zhuji_base_str.split(u'# 第二篇 核心词汇练习')[0] 176 | zhuji_base_list_l = extract_content_between(zhuji_base_str, match_zhuji_list_start_re) 177 | zhuji_base_d_l_l = get_etyma_block_d_l_l(zhuji_base_list_l) 178 | revise_miss_etyma(zhuji_base_d_l_l) 179 | process_ety_block_str(zhuji_base_d_l_l) 180 | revise_scru(zhuji_base_d_l_l) 181 | zhuji_base_word_d = process_all_cognate_block(zhuji_base_d_l_l) 182 | add_etyma_cognates_l(zhuji_base_word_d, zhuji_base_d_l_l) 183 | with codecs.open('zhuji_base_d.txt', 'w', encoding='utf-8') as f: 184 | json.dump(zhuji_base_word_d, f) 185 | 186 | if __name__ == '__main__': 187 | main() -------------------------------------------------------------------------------- /example_usage.apkg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/empenguinxh/Anki-CreateImportFile/9950194a50145fa3e1b84a535d7942136b28c418/example_usage.apkg -------------------------------------------------------------------------------- /my_helpers.py: -------------------------------------------------------------------------------- 1 | # coding:utf-8 2 | import re 3 | import json 4 | import codecs 5 | import functools 6 | import os.path 7 | from random import random 8 | from random import randint 9 | from pprint import pprint 10 | from copy import deepcopy 11 | def strF2H(ustring): 12 | ''' 13 | convert full width character to half width 14 | input: a unicode object 15 | return: a unicode object 16 | ''' 17 | h_ustring = u"" 18 | assert isinstance(ustring, unicode) 19 | for uchar in ustring: 20 | inside_code = ord(uchar) 21 | if inside_code == 12288: 22 | # white space 23 | inside_code = 32 24 | elif 65281 <= inside_code <= 65374: 25 | # other characters 26 | inside_code -= 65248 27 | 28 | h_ustring += unichr(inside_code) 29 | return h_ustring 30 | # pretty print the embedded unicode of list or dict 31 | def iter_print(obj_iter, indent=0, increment=2, max_top_level_print=None, 32 | top_level=True, top_level_extra_line_feed=False, print_list_index=True): 33 | if not hasattr(obj_iter, '__iter__'): 34 | if isinstance(obj_iter, basestring): 35 | if obj_iter == u'': 36 | pass 37 | elif '\n' in obj_iter: 38 | for line in obj_iter.split('\n'): 39 | if line: 40 | print ' '*indent, line 41 | 
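            # string without embedded newlines: print it whole at the current indent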
else: 42 | print ' '*indent, obj_iter 43 | else: 44 | print ' '*indent, obj_iter 45 | return 46 | print_count = 0 47 | if isinstance(obj_iter, dict): 48 | for key, iter_sub_obj in obj_iter.iteritems(): 49 | print ' '*indent, key 50 | iter_print(iter_sub_obj, indent+increment, increment, None, False, False, print_list_index) 51 | if top_level: 52 | print_count += 1 53 | if max_top_level_print: 54 | if print_count >= max_top_level_print: 55 | break 56 | if top_level_extra_line_feed: 57 | print '\n' 58 | else: 59 | for list_index, sub_obj_iter in enumerate(obj_iter): 60 | if print_list_index: 61 | print ' '*indent, list_index 62 | iter_print(sub_obj_iter, indent+increment, increment, None, False, False, print_list_index) 63 | if top_level: 64 | print_count += 1 65 | if max_top_level_print: 66 | if print_count >= max_top_level_print: 67 | break 68 | if top_level_extra_line_feed: 69 | print '\n' 70 | def extract_content_between(obj_str, match_re, return_str_before_first_match=False): 71 | ''' 72 | extract content between the start of two equal pattern found in a str, 73 | also extract the content after the last match 74 | input: obj_str, the string to extract content from, must be a unicode object 75 | match_re, the pattern to be matched 76 | return: a list of str 77 | return_str_before_first_match: whether to return the str before the first match of the given patter 78 | ''' 79 | assert isinstance(obj_str, unicode) 80 | retype = type(re.compile(r'a str')) 81 | assert isinstance(match_re, retype) 82 | 83 | match_results_iter = match_re.finditer(obj_str) 84 | returned_str_l = [] 85 | start_index = None 86 | end_index = None 87 | first_start_index = None 88 | for match_result in match_results_iter: 89 | if first_start_index is None: 90 | first_start_index = match_result.start() 91 | if not (start_index is None): 92 | end_index = match_result.start() 93 | returned_str_l.append(obj_str[start_index:end_index]) 94 | start_index = match_result.start() 95 | returned_str_l.append(obj_str[start_index:]) 96 | if return_str_before_first_match: 97 | returned_str_l = [obj_str[:first_start_index]] + returned_str_l 98 | return returned_str_l 99 | def iter_value_of_key_through_d_l_d_d(obj_d_l_d_d, key_2nd_level, key_4th_level, 100 | expected_draw=1.0, yield_top_key=False, yield_list_index=False): 101 | ''' 102 | a function that return a generator 103 | it will iter through all the values of the first level dict with every value being themself a dict 104 | for every such value dict, 105 | a key specified by key_2nd_level is used to access a list 106 | for every elment of the list 107 | a key specified by key_4th_level is used to access the corresponding value 108 | so in total it is a two level nested loop 109 | 110 | key_2nd_level: what it points to must be a list 111 | 112 | expected_draw: roughly control the proportion of the innermost values to be sampled 113 | can be an integar, which will be converted to the corresponding probability 114 | 115 | yield_top_key: whether to include the top key 116 | yield_list_index: whether to include the list index 117 | note that (yield_top_key=False, yield_list_index=True) is a useless combination, so raise an ValueError 118 | ''' 119 | if isinstance(expected_draw, int): 120 | expected_draw = float(expected_draw)/len(obj_d_l_d_d) 121 | assert isinstance(expected_draw, float) 122 | for top_key, value_d_l_d in obj_d_l_d_d.iteritems(): 123 | assert isinstance(value_d_l_d[key_2nd_level], list) 124 | for _list_index, value_d in enumerate(value_d_l_d[key_2nd_level]): 125 | if 
random() <= expected_draw: 126 | if (not yield_top_key) and (not yield_list_index): 127 | yield value_d[key_4th_level] 128 | elif yield_top_key and (not yield_list_index): 129 | yield top_key, value_d[key_4th_level] 130 | elif yield_top_key and yield_list_index: 131 | yield top_key, _list_index, value_d[key_4th_level] 132 | else: 133 | raise ValueError('Invalid Combination of yield_top_key and yield_list_index') 134 | def iter_through_general(obj_iter, path, yield_flags=True, final_yield_object=None): 135 | ''' 136 | iter through an object following the given path 137 | yield_flags: control whether to yield the flags indicating the path at the global level 138 | final_yield_object: internal parameter, don't modify 139 | obj_iter: an iterable variable 140 | path: a sequence, each element has the following structure 141 | (how_to_iter, what_to_iter, yield_flag) 142 | how_to_iter: a str, accept the following values 143 | 'all' or 'all_values': iter through key-value pair for dict, and all elements for other type 144 | if yield_flag is True, attach key or index to the final yield object 145 | 'all_keys', only iter through the keys of a dict 146 | obj_iter must be a dict 147 | 'key', iter through the value of a given key 148 | what_to_iter must be a str representing a key in obj_iter 149 | if yield_flag is True, attach key to the final yield object 150 | ignored when obj_iter is not dict 151 | 'keys', iter through the values of a given set of keys 152 | what_to_iter must be a tuple with elements reprenting keys in obj_iter 153 | if yield_flag is True, attach key to the final yield object 154 | ignored when obj_iter is not dict 155 | 'index', iter through a given element 156 | what_to_iter must be an int within bound 157 | if yield_flag is True, attach index to the final yield object 158 | ignored when obj_iter is dict 159 | 'indexes', iter through the elements with given indexes 160 | what_to_iter must be an list of int within bound 161 | if yield_flag is True, attach key to the final yield object 162 | ignored when obj_iter is dict 163 | what_to_iter: content decided by how_to_iter 164 | ignored for the following values of how_to_iter 165 | all, all_values, all_keys 166 | yield_flag: True or False 167 | True: depending on how_to_iter, attch different flags to the final result 168 | False: no flag wil be yield 169 | ignored for the following values of how_to_iter 170 | all_keys 171 | ''' 172 | is_dict = isinstance(obj_iter, dict) 173 | if final_yield_object is None: 174 | final_yield_object = [] 175 | if len(path) == 0: 176 | if yield_flags: 177 | final_yield_object.append(obj_iter) 178 | yield final_yield_object 179 | else: 180 | yield obj_iter 181 | else: 182 | how_to_iter, what_to_iter, yield_flag = path.pop(0) 183 | assert isinstance(how_to_iter, basestring) 184 | if how_to_iter in [u'all', u'all_values', u'keys', u'indexes']: 185 | if how_to_iter in [u'keys', u'indexes']: 186 | assert hasattr(what_to_iter, '__iter__') 187 | for item in what_to_iter: 188 | if is_dict: 189 | assert how_to_iter == u'keys' 190 | assert isinstance(item, basestring) 191 | assert item in obj_iter 192 | else: 193 | assert how_to_iter == u'indexes' 194 | assert isinstance(item, int) 195 | assert item < len(obj_iter) 196 | temp_iterator = ((item, obj_iter[item]) for item in what_to_iter) 197 | else: 198 | temp_iterator = obj_iter.iteritems() if is_dict else enumerate(obj_iter) 199 | for flag, sub_obj_iter in temp_iterator: 200 | final_yield_object_copy = deepcopy(final_yield_object) 201 | if yield_flag: 202 | 
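                    # record the key or index of this level so the caller can see where the value came from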
final_yield_object_copy.append(flag) 203 | for value in iter_through_general(sub_obj_iter, deepcopy(path), yield_flags, final_yield_object_copy): 204 | yield value 205 | elif how_to_iter == u'all_keys': 206 | assert is_dict 207 | for key in obj_iter.iterkeys(): 208 | if yield_flags: 209 | final_yield_object.append(key) 210 | yield final_yield_object 211 | else: 212 | yield key 213 | elif how_to_iter in [u'key', u'index']: 214 | if is_dict: 215 | assert how_to_iter == u'key' 216 | assert isinstance(what_to_iter, basestring) 217 | assert what_to_iter in obj_iter 218 | else: 219 | assert how_to_iter == u'index' 220 | assert isinstance(what_to_iter, int) 221 | assert what_to_iter < len(obj_iter) 222 | sub_obj_iter = obj_iter[what_to_iter] 223 | if yield_flag: 224 | final_yield_object.append(what_to_iter) 225 | for value in iter_through_general(sub_obj_iter, deepcopy(path), yield_flags, final_yield_object): 226 | yield value 227 | else: 228 | raise ValueError('Invalid path') 229 | def reservoir_sample_k(obj_iter, k): 230 | assert isinstance(k, int) 231 | assert hasattr(obj_iter, '__iter__') 232 | # fit into k items 233 | sampled_l = [] 234 | for _ in range(k): 235 | sampled_l.append(obj_iter.next()) 236 | i = k 237 | for item in obj_iter: 238 | i += 1 239 | j = randint(1, i) 240 | if j <= k: 241 | sampled_l[j-1] = item 242 | return sampled_l 243 | def iter_through_and_sample_k(obj_iter, k, path): 244 | obj_iter_follow_path = iter_through_general(obj_iter, path) 245 | return reservoir_sample_k(obj_iter_follow_path, k) 246 | strip_white_space = lambda _str: _str.replace(' ', '') 247 | new_line_join = lambda str_l: '\n'.join(str_l) 248 | def codecs_open_r_utf8(file_path): 249 | with codecs.open(file_path, 'r', 'utf-8') as f: 250 | returned_str = f.read() 251 | return returned_str 252 | # merge blank lines 253 | def collapse_blank_line(base_str): 254 | match_double_line_feed_re = re.compile(r'\n\n') 255 | while match_double_line_feed_re.search(base_str): 256 | base_str = match_double_line_feed_re.sub(r'\n', base_str) 257 | return base_str 258 | def custom_html_element(_str): 259 | """ 260 | convert the markdown notations in a string to html tags 261 | currently, only two kinds of markdown notation exist in all the strings 262 | ** and * 263 | """ 264 | formatted_str = _str 265 | # format double asterisk 266 | match_double_asterisk_re = re.compile(u'\*\*(.*?)\*\*') 267 | # replace **...** with ... 268 | #formatted_str = match_double_asterisk_re.sub(r'\1', formatted_str) 269 | # replace **...** with ... 270 | formatted_str = match_double_asterisk_re.sub(r'\1', formatted_str) 271 | # format single asterisk 272 | # replace *...* with ... 273 | match_single_asterisk_re = re.compile(u'\*(.*?)\*') 274 | formatted_str = match_single_asterisk_re.sub(r'\1', formatted_str) 275 | return formatted_str 276 | def is_file_and_json_load(file_name_str): 277 | if os.path.isfile(file_name_str): 278 | with codecs.open(file_name_str, 'r', encoding='utf-8') as f: 279 | json_d = json.load(f) 280 | return json_d 281 | -------------------------------------------------------------------------------- /pureSalsa20.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | """ 5 | Copyright by https://github.com/zhansliu/writemdict 6 | 7 | pureSalsa20.py -- a pure Python implementation of the Salsa20 cipher, ported to Python 3 8 | 9 | v4.0: Added Python 3 support, dropped support for Python <= 2.5. 
10 | 11 | // zhansliu 12 | 13 | Original comments below. 14 | 15 | ==================================================================== 16 | There are comments here by two authors about three pieces of software: 17 | comments by Larry Bugbee about 18 | Salsa20, the stream cipher by Daniel J. Bernstein 19 | (including comments about the speed of the C version) and 20 | pySalsa20, Bugbee's own Python wrapper for salsa20.c 21 | (including some references), and 22 | comments by Steve Witham about 23 | pureSalsa20, Witham's pure Python 2.5 implementation of Salsa20, 24 | which follows pySalsa20's API, and is in this file. 25 | 26 | Salsa20: a Fast Streaming Cipher (comments by Larry Bugbee) 27 | ----------------------------------------------------------- 28 | 29 | Salsa20 is a fast stream cipher written by Daniel Bernstein 30 | that basically uses a hash function and XOR making for fast 31 | encryption. (Decryption uses the same function.) Salsa20 32 | is simple and quick. 33 | 34 | Some Salsa20 parameter values... 35 | design strength 128 bits 36 | key length 128 or 256 bits, exactly 37 | IV, aka nonce 64 bits, always 38 | chunk size must be in multiples of 64 bytes 39 | 40 | Salsa20 has two reduced versions, 8 and 12 rounds each. 41 | 42 | One benchmark (10 MB): 43 | 1.5GHz PPC G4 102/97/89 MB/sec for 8/12/20 rounds 44 | AMD Athlon 2500+ 77/67/53 MB/sec for 8/12/20 rounds 45 | (no I/O and before Python GC kicks in) 46 | 47 | Salsa20 is a Phase 3 finalist in the EU eSTREAM competition 48 | and appears to be one of the fastest ciphers. It is well 49 | documented so I will not attempt any injustice here. Please 50 | see "References" below. 51 | 52 | ...and Salsa20 is "free for any use". 53 | 54 | 55 | pySalsa20: a Python wrapper for Salsa20 (Comments by Larry Bugbee) 56 | ------------------------------------------------------------------ 57 | 58 | pySalsa20.py is a simple ctypes Python wrapper. Salsa20 is 59 | as it's name implies, 20 rounds, but there are two reduced 60 | versions, 8 and 12 rounds each. Because the APIs are 61 | identical, pySalsa20 is capable of wrapping all three 62 | versions (number of rounds hardcoded), including a special 63 | version that allows you to set the number of rounds with a 64 | set_rounds() function. Compile the version of your choice 65 | as a shared library (not as a Python extension), name and 66 | install it as libsalsa20.so. 67 | 68 | Sample usage: 69 | from pySalsa20 import Salsa20 70 | s20 = Salsa20(key, IV) 71 | dataout = s20.encryptBytes(datain) # same for decrypt 72 | 73 | This is EXPERIMENTAL software and intended for educational 74 | purposes only. To make experimentation less cumbersome, 75 | pySalsa20 is also free for any use. 76 | 77 | THIS PROGRAM IS PROVIDED WITHOUT WARRANTY OR GUARANTEE OF 78 | ANY KIND. USE AT YOUR OWN RISK. 79 | 80 | Enjoy, 81 | 82 | Larry Bugbee 83 | bugbee@seanet.com 84 | April 2007 85 | 86 | 87 | References: 88 | ----------- 89 | http://en.wikipedia.org/wiki/Salsa20 90 | http://en.wikipedia.org/wiki/Daniel_Bernstein 91 | http://cr.yp.to/djb.html 92 | http://www.ecrypt.eu.org/stream/salsa20p3.html 93 | http://www.ecrypt.eu.org/stream/p3ciphers/salsa20/salsa20_p3source.zip 94 | 95 | 96 | Prerequisites for pySalsa20: 97 | ---------------------------- 98 | - Python 2.5 (haven't tested in 2.4) 99 | 100 | 101 | pureSalsa20: Salsa20 in pure Python 2.5 (comments by Steve Witham) 102 | ------------------------------------------------------------------ 103 | 104 | pureSalsa20 is the stand-alone Python code in this file. 
105 | It implements the underlying Salsa20 core algorithm 106 | and emulates pySalsa20's Salsa20 class API (minus a bug(*)). 107 | 108 | pureSalsa20 is MUCH slower than libsalsa20.so wrapped with pySalsa20-- 109 | about 1/1000 the speed for Salsa20/20 and 1/500 the speed for Salsa20/8, 110 | when encrypting 64k-byte blocks on my computer. 111 | 112 | pureSalsa20 is for cases where portability is much more important than 113 | speed. I wrote it for use in a "structured" random number generator. 114 | 115 | There are comments about the reasons for this slowness in 116 | http://www.tiac.net/~sw/2010/02/PureSalsa20 117 | 118 | Sample usage: 119 | from pureSalsa20 import Salsa20 120 | s20 = Salsa20(key, IV) 121 | dataout = s20.encryptBytes(datain) # same for decrypt 122 | 123 | I took the test code from pySalsa20, added a bunch of tests including 124 | rough speed tests, and moved them into the file testSalsa20.py. 125 | To test both pySalsa20 and pureSalsa20, type 126 | python testSalsa20.py 127 | 128 | (*)The bug (?) in pySalsa20 is this. The rounds variable is global to the 129 | libsalsa20.so library and not switched when switching between instances 130 | of the Salsa20 class. 131 | s1 = Salsa20( key, IV, 20 ) 132 | s2 = Salsa20( key, IV, 8 ) 133 | In this example, 134 | with pySalsa20, both s1 and s2 will do 8 rounds of encryption. 135 | with pureSalsa20, s1 will do 20 rounds and s2 will do 8 rounds. 136 | Perhaps giving each instance its own nRounds variable, which 137 | is passed to the salsa20wordtobyte() function, is insecure. I'm not a 138 | cryptographer. 139 | 140 | pureSalsa20.py and testSalsa20.py are EXPERIMENTAL software and 141 | intended for educational purposes only. To make experimentation less 142 | cumbersome, pureSalsa20.py and testSalsa20.py are free for any use. 143 | 144 | Revisions: 145 | ---------- 146 | p3.2 Fixed bug that initialized the output buffer with plaintext! 147 | Saner ramping of nreps in speed test. 148 | Minor changes and print statements. 149 | p3.1 Took timing variability out of add32() and rot32(). 150 | Made the internals more like pySalsa20/libsalsa . 151 | Put the semicolons back in the main loop! 152 | In encryptBytes(), modify a byte array instead of appending. 153 | Fixed speed calculation bug. 154 | Used subclasses instead of patches in testSalsa20.py . 155 | Added 64k-byte messages to speed test to be fair to pySalsa20. 156 | p3 First version, intended to parallel pySalsa20 version 3. 157 | 158 | More references: 159 | ---------------- 160 | http://www.seanet.com/~bugbee/crypto/salsa20/ [pySalsa20] 161 | http://cr.yp.to/snuffle.html [The original name of Salsa20] 162 | http://cr.yp.to/snuffle/salsafamily-20071225.pdf [ Salsa20 design] 163 | http://www.tiac.net/~sw/2010/02/PureSalsa20 164 | 165 | THIS PROGRAM IS PROVIDED WITHOUT WARRANTY OR GUARANTEE OF 166 | ANY KIND. USE AT YOUR OWN RISK. 
167 | 168 | Cheers, 169 | 170 | Steve Witham sw at remove-this tiac dot net 171 | February, 2010 172 | """ 173 | import sys 174 | assert(sys.version_info >= (2, 6)) 175 | 176 | if sys.version_info >= (3,): 177 | integer_types = (int,) 178 | python3 = True 179 | else: 180 | integer_types = (int, long) 181 | python3 = False 182 | 183 | from struct import Struct 184 | little_u64 = Struct( "= 2**64" 238 | ctx = self.ctx 239 | ctx[ 8],ctx[ 9] = little2_i32.unpack( little_u64.pack( counter ) ) 240 | 241 | def getCounter( self ): 242 | return little_u64.unpack( little2_i32.pack( *self.ctx[ 8:10 ] ) ) [0] 243 | 244 | 245 | def setRounds(self, rounds, testing=False ): 246 | assert testing or rounds in [8, 12, 20], 'rounds must be 8, 12, 20' 247 | self.rounds = rounds 248 | 249 | 250 | def encryptBytes(self, data): 251 | assert type(data) == bytes, 'data must be byte string' 252 | assert self._lastChunk64, 'previous chunk not multiple of 64 bytes' 253 | lendata = len(data) 254 | munged = bytearray(lendata) 255 | for i in range( 0, lendata, 64 ): 256 | h = salsa20_wordtobyte( self.ctx, self.rounds, checkRounds=False ) 257 | self.setCounter( ( self.getCounter() + 1 ) % 2**64 ) 258 | # Stopping at 2^70 bytes per nonce is user's responsibility. 259 | for j in range( min( 64, lendata - i ) ): 260 | if python3: 261 | munged[ i+j ] = data[ i+j ] ^ h[j] 262 | else: 263 | munged[ i+j ] = ord(data[ i+j ]) ^ ord(h[j]) 264 | 265 | self._lastChunk64 = not lendata % 64 266 | return bytes(munged) 267 | 268 | decryptBytes = encryptBytes # encrypt and decrypt use same function 269 | 270 | #-------------------------------------------------------------------------- 271 | 272 | def salsa20_wordtobyte( input, nRounds=20, checkRounds=True ): 273 | """ Do nRounds Salsa20 rounds on a copy of 274 | input: list or tuple of 16 ints treated as little-endian unsigneds. 275 | Returns a 64-byte string. 276 | """ 277 | 278 | assert( type(input) in ( list, tuple ) and len(input) == 16 ) 279 | assert( not(checkRounds) or ( nRounds in [ 8, 12, 20 ] ) ) 280 | 281 | x = list( input ) 282 | 283 | def XOR( a, b ): return a ^ b 284 | ROTATE = rot32 285 | PLUS = add32 286 | 287 | for i in range( nRounds // 2 ): 288 | # These ...XOR...ROTATE...PLUS... 
lines are from ecrypt-linux.c 289 | # unchanged except for indents and the blank line between rounds: 290 | x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 0],x[12]), 7)); 291 | x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[ 4],x[ 0]), 9)); 292 | x[12] = XOR(x[12],ROTATE(PLUS(x[ 8],x[ 4]),13)); 293 | x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[12],x[ 8]),18)); 294 | x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 5],x[ 1]), 7)); 295 | x[13] = XOR(x[13],ROTATE(PLUS(x[ 9],x[ 5]), 9)); 296 | x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[13],x[ 9]),13)); 297 | x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 1],x[13]),18)); 298 | x[14] = XOR(x[14],ROTATE(PLUS(x[10],x[ 6]), 7)); 299 | x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[14],x[10]), 9)); 300 | x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 2],x[14]),13)); 301 | x[10] = XOR(x[10],ROTATE(PLUS(x[ 6],x[ 2]),18)); 302 | x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[15],x[11]), 7)); 303 | x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 3],x[15]), 9)); 304 | x[11] = XOR(x[11],ROTATE(PLUS(x[ 7],x[ 3]),13)); 305 | x[15] = XOR(x[15],ROTATE(PLUS(x[11],x[ 7]),18)); 306 | 307 | x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[ 0],x[ 3]), 7)); 308 | x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[ 1],x[ 0]), 9)); 309 | x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[ 2],x[ 1]),13)); 310 | x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[ 3],x[ 2]),18)); 311 | x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 5],x[ 4]), 7)); 312 | x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 6],x[ 5]), 9)); 313 | x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 7],x[ 6]),13)); 314 | x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 4],x[ 7]),18)); 315 | x[11] = XOR(x[11],ROTATE(PLUS(x[10],x[ 9]), 7)); 316 | x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[11],x[10]), 9)); 317 | x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 8],x[11]),13)); 318 | x[10] = XOR(x[10],ROTATE(PLUS(x[ 9],x[ 8]),18)); 319 | x[12] = XOR(x[12],ROTATE(PLUS(x[15],x[14]), 7)); 320 | x[13] = XOR(x[13],ROTATE(PLUS(x[12],x[15]), 9)); 321 | x[14] = XOR(x[14],ROTATE(PLUS(x[13],x[12]),13)); 322 | x[15] = XOR(x[15],ROTATE(PLUS(x[14],x[13]),18)); 323 | 324 | for i in range( len( input ) ): 325 | x[i] = PLUS( x[i], input[i] ) 326 | return little16_i32.pack( *x ) 327 | 328 | #--------------------------- 32-bit ops ------------------------------- 329 | 330 | def trunc32( w ): 331 | """ Return the bottom 32 bits of w as a Python int. 332 | This creates longs temporarily, but returns an int. """ 333 | w = int( ( w & 0x7fffFFFF ) | -( w & 0x80000000 ) ) 334 | assert type(w) == int 335 | return w 336 | 337 | 338 | def add32( a, b ): 339 | """ Add two 32-bit words discarding carry above 32nd bit, 340 | and without creating a Python long. 341 | Timing shouldn't vary. 342 | """ 343 | lo = ( a & 0xFFFF ) + ( b & 0xFFFF ) 344 | hi = ( a >> 16 ) + ( b >> 16 ) + ( lo >> 16 ) 345 | return ( -(hi & 0x8000) | ( hi & 0x7FFF ) ) << 16 | ( lo & 0xFFFF ) 346 | 347 | 348 | def rot32( w, nLeft ): 349 | """ Rotate 32-bit word left by nLeft or right by -nLeft 350 | without creating a Python long. 351 | Timing depends on nLeft but not on w. 352 | """ 353 | nLeft &= 31 # which makes nLeft >= 0 354 | if nLeft == 0: 355 | return w 356 | 357 | # Note: now 1 <= nLeft <= 31. 358 | # RRRsLLLLLL There are nLeft RRR's, (31-nLeft) LLLLLL's, 359 | # => sLLLLLLRRR and one s which becomes the sign bit. 
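    # Example behaviour (illustrative): rot32(1, 1) == 2, and
    # rot32(trunc32(0x80000000), 1) == 1, i.e. the old sign bit wraps around into
    # bit 0 while the result stays a plain signed 32-bit int.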
360 | RRR = ( ( ( w >> 1 ) & 0x7fffFFFF ) >> ( 31 - nLeft ) ) 361 | sLLLLLL = -( (1<<(31-nLeft)) & w ) | (0x7fffFFFF>>nLeft) & w 362 | return RRR | ( sLLLLLL << nLeft ) 363 | 364 | 365 | # --------------------------------- end ----------------------------------- 366 | -------------------------------------------------------------------------------- /readmdict.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # readmdict.py 4 | # Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser 5 | # 6 | # Copyright (C) 2012, 2013, 2015 Xiaoqiang Wang 7 | # 8 | # This program is a free software; you can redistribute it and/or modify 9 | # it under the terms of the GNU General Public License as published by 10 | # the Free Software Foundation, version 3 of the License. 11 | # 12 | # You can get a copy of GNU General Public License along this program 13 | # But you can always get it from http://www.gnu.org/licenses/gpl.txt 14 | # 15 | # This program is distributed in the hope that it will be useful, 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 | # GNU General Public License for more details. 19 | 20 | from struct import pack, unpack 21 | from io import BytesIO 22 | import re 23 | 24 | from ripemd128 import ripemd128 25 | from pureSalsa20 import Salsa20 26 | 27 | # zlib compression is used for engine version >=2.0 28 | import zlib 29 | # LZO compression is used for engine version < 2.0 30 | try: 31 | import lzo 32 | except ImportError: 33 | lzo = None 34 | print("LZO compression support is not available") 35 | 36 | 37 | def _unescape_entities(text): 38 | """ 39 | unescape offending tags < > " & 40 | """ 41 | text = text.replace(b'<', b'<') 42 | text = text.replace(b'>', b'>') 43 | text = text.replace(b'"', b'"') 44 | text = text.replace(b'&', b'&') 45 | return text 46 | 47 | 48 | def _fast_decrypt(data, key): 49 | b = bytearray(data) 50 | key = bytearray(key) 51 | previous = 0x36 52 | for i in range(len(b)): 53 | t = (b[i] >> 4 | b[i] << 4) & 0xff 54 | t = t ^ previous ^ (i & 0xff) ^ key[i % len(key)] 55 | previous = b[i] 56 | b[i] = t 57 | return bytes(b) 58 | 59 | 60 | def _mdx_decrypt(comp_block): 61 | key = ripemd128(comp_block[4:8] + pack(b' 116 | """ 117 | taglist = re.findall(b'(\w+)="(.*?)"', header, re.DOTALL) 118 | tagdict = {} 119 | for key, value in taglist: 120 | tagdict[key] = _unescape_entities(value) 121 | return tagdict 122 | 123 | def _decode_key_block_info(self, key_block_info_compressed): 124 | if self._version >= 2: 125 | # zlib compression 126 | assert(key_block_info_compressed[:4] == b'\x02\x00\x00\x00') 127 | # decrypt if needed 128 | if self._encrypt & 0x02: 129 | key_block_info_compressed = _mdx_decrypt(key_block_info_compressed) 130 | # decompress 131 | key_block_info = zlib.decompress(key_block_info_compressed[8:]) 132 | # adler checksum 133 | adler32 = unpack('>I', key_block_info_compressed[4:8])[0] 134 | assert(adler32 == zlib.adler32(key_block_info) & 0xffffffff) 135 | else: 136 | # no compression 137 | key_block_info = key_block_info_compressed 138 | # decode 139 | key_block_info_list = [] 140 | num_entries = 0 141 | i = 0 142 | if self._version >= 2: 143 | byte_format = '>H' 144 | byte_width = 2 145 | text_term = 1 146 | else: 147 | byte_format = '>B' 148 | byte_width = 1 149 | text_term = 0 150 | 151 | while i < len(key_block_info): 152 | # number of entries in current 
key block 153 | num_entries += unpack(self._number_format, key_block_info[i:i+self._number_width])[0] 154 | i += self._number_width 155 | # text head size 156 | text_head_size = unpack(byte_format, key_block_info[i:i+byte_width])[0] 157 | i += byte_width 158 | # text head 159 | if self._encoding != 'UTF-16': 160 | i += text_head_size + text_term 161 | else: 162 | i += (text_head_size + text_term) * 2 163 | # text tail size 164 | text_tail_size = unpack(byte_format, key_block_info[i:i+byte_width])[0] 165 | i += byte_width 166 | # text tail 167 | if self._encoding != 'UTF-16': 168 | i += text_tail_size + text_term 169 | else: 170 | i += (text_tail_size + text_term) * 2 171 | # key block compressed size 172 | key_block_compressed_size = unpack(self._number_format, key_block_info[i:i+self._number_width])[0] 173 | i += self._number_width 174 | # key block decompressed size 175 | key_block_decompressed_size = unpack(self._number_format, key_block_info[i:i+self._number_width])[0] 176 | i += self._number_width 177 | key_block_info_list += [(key_block_compressed_size, key_block_decompressed_size)] 178 | 179 | #assert(num_entries == self._num_entries) 180 | 181 | return key_block_info_list 182 | 183 | def _decode_key_block(self, key_block_compressed, key_block_info_list): 184 | key_list = [] 185 | i = 0 186 | for compressed_size, decompressed_size in key_block_info_list: 187 | start = i 188 | end = i + compressed_size 189 | # 4 bytes : compression type 190 | key_block_type = key_block_compressed[start:start+4] 191 | # 4 bytes : adler checksum of decompressed key block 192 | adler32 = unpack('>I', key_block_compressed[start+4:start+8])[0] 193 | if key_block_type == b'\x00\x00\x00\x00': 194 | key_block = key_block_compressed[start+8:end] 195 | elif key_block_type == b'\x01\x00\x00\x00': 196 | if lzo is None: 197 | print("LZO compression is not supported") 198 | break 199 | # decompress key block 200 | header = b'\xf0' + pack('>I', decompressed_size) 201 | key_block = lzo.decompress(header + key_block_compressed[start+8:end]) 202 | elif key_block_type == b'\x02\x00\x00\x00': 203 | # decompress key block 204 | key_block = zlib.decompress(key_block_compressed[start+8:end]) 205 | # extract one single key block into a key list 206 | key_list += self._split_key_block(key_block) 207 | # notice that adler32 returns signed value 208 | assert(adler32 == zlib.adler32(key_block) & 0xffffffff) 209 | 210 | i += compressed_size 211 | return key_list 212 | 213 | def _split_key_block(self, key_block): 214 | key_list = [] 215 | key_start_index = 0 216 | while key_start_index < len(key_block): 217 | # the corresponding record's offset in record block 218 | key_id = unpack(self._number_format, key_block[key_start_index:key_start_index+self._number_width])[0] 219 | # key text ends with '\x00' 220 | if self._encoding == 'UTF-16': 221 | delimiter = b'\x00\x00' 222 | width = 2 223 | else: 224 | delimiter = b'\x00' 225 | width = 1 226 | i = key_start_index + self._number_width 227 | while i < len(key_block): 228 | if key_block[i:i+width] == delimiter: 229 | key_end_index = i 230 | break 231 | i += width 232 | key_text = key_block[key_start_index+self._number_width:key_end_index]\ 233 | .decode(self._encoding, errors='ignore').encode('utf-8').strip() 234 | key_start_index = key_end_index + width 235 | key_list += [(key_id, key_text)] 236 | return key_list 237 | 238 | def _read_header(self): 239 | import sys 240 | f = open(self._fname, 'rb') 241 | # number of bytes of header text 242 | header_bytes_size = unpack('>I', 
f.read(4))[0] 243 | header_bytes = f.read(header_bytes_size) 244 | # 4 bytes: adler32 checksum of header, in little endian 245 | adler32 = unpack('= 0x03000000: 257 | encoding = encoding.decode('utf-8') 258 | # GB18030 > GBK > GB2312 259 | if encoding in ['GBK', 'GB2312']: 260 | encoding = 'GB18030' 261 | self._encoding = encoding 262 | # encryption flag 263 | # 0x00 - no encryption 264 | # 0x01 - encrypt record block 265 | # 0x02 - encrypt key info block 266 | if header_tag[b'Encrypted'] == b'No': 267 | self._encrypt = 0 268 | elif header_tag[b'Encrypted'] == b'Yes': 269 | self._encrypt = 1 270 | else: 271 | self._encrypt = int(header_tag[b'Encrypted']) 272 | 273 | # stylesheet attribute if present takes form of: 274 | # style_number # 1-255 275 | # style_begin # or '' 276 | # style_end # or '' 277 | # store stylesheet in dict in the form of 278 | # {'number' : ('style_begin', 'style_end')} 279 | self._stylesheet = {} 280 | if header_tag.get('StyleSheet'): 281 | lines = header_tag['StyleSheet'].splitlines() 282 | for i in range(0, len(lines), 3): 283 | self._stylesheet[lines[i]] = (lines[i+1], lines[i+2]) 284 | 285 | # before version 2.0, number is 4 bytes integer 286 | # version 2.0 and above uses 8 bytes 287 | self._version = float(header_tag[b'GeneratedByEngineVersion']) 288 | if self._version < 2.0: 289 | self._number_width = 4 290 | self._number_format = '>I' 291 | else: 292 | self._number_width = 8 293 | self._number_format = '>Q' 294 | 295 | return header_tag 296 | 297 | def _read_keys(self): 298 | f = open(self._fname, 'rb') 299 | f.seek(self._key_block_offset) 300 | 301 | # the following numbers could be encrypted 302 | if self._version >= 2.0: 303 | num_bytes = 8 * 5 304 | else: 305 | num_bytes = 4 * 4 306 | block = f.read(num_bytes) 307 | 308 | if self._encrypt & 1: 309 | if self._passcode is None: 310 | raise RuntimeError('user identification is needed to read encrypted file') 311 | regcode, userid = self._passcode 312 | if self.header['RegisterBy'] == 'EMail': 313 | encrypted_key = _decrypt_regcode_by_email(regcode.decode('hex'), userid) 314 | else: 315 | encrypted_key = _decrypt_regcode_by_deviceid(regcode.decode('hex'), userid) 316 | block = _salsa_decrypt(block, encrypted_key) 317 | 318 | # decode this block 319 | sf = BytesIO(block) 320 | # number of key blocks 321 | num_key_blocks = self._read_number(sf) 322 | # number of entries 323 | self._num_entries = self._read_number(sf) 324 | # number of bytes of key block info after decompression 325 | if self._version >= 2.0: 326 | key_block_info_decomp_size = self._read_number(sf) 327 | # number of bytes of key block info 328 | key_block_info_size = self._read_number(sf) 329 | # number of bytes of key block 330 | key_block_size = self._read_number(sf) 331 | 332 | # 4 bytes: adler checksum of previous 5 numbers 333 | if self._version >= 2.0: 334 | adler32 = unpack('>I', f.read(4))[0] 335 | assert adler32 == (zlib.adler32(block) & 0xffffffff) 336 | 337 | # read key block info, which indicates key block's compressed and decompressed size 338 | key_block_info = f.read(key_block_info_size) 339 | key_block_info_list = self._decode_key_block_info(key_block_info) 340 | assert(num_key_blocks == len(key_block_info_list)) 341 | 342 | # read key block 343 | key_block_compressed = f.read(key_block_size) 344 | # extract key block 345 | key_list = self._decode_key_block(key_block_compressed, key_block_info_list) 346 | 347 | self._record_block_offset = f.tell() 348 | f.close() 349 | 350 | return key_list 351 | 352 | def 
_read_keys_brutal(self): 353 | f = open(self._fname, 'rb') 354 | f.seek(self._key_block_offset) 355 | 356 | # the following numbers could be encrypted, disregard them! 357 | if self._version >= 2.0: 358 | num_bytes = 8 * 5 + 4 359 | key_block_type = b'\x02\x00\x00\x00' 360 | else: 361 | num_bytes = 4 * 4 362 | key_block_type = b'\x01\x00\x00\x00' 363 | block = f.read(num_bytes) 364 | 365 | # key block info 366 | # 4 bytes '\x02\x00\x00\x00' 367 | # 4 bytes adler32 checksum 368 | # unknown number of bytes follows until '\x02\x00\x00\x00' which marks the beginning of key block 369 | key_block_info = f.read(8) 370 | if self._version >= 2.0: 371 | assert key_block_info[:4] == b'\x02\x00\x00\x00' 372 | while True: 373 | fpos = f.tell() 374 | t = f.read(1024) 375 | index = t.find(key_block_type) 376 | if index != -1: 377 | key_block_info += t[:index] 378 | f.seek(fpos + index) 379 | break 380 | else: 381 | key_block_info += t 382 | 383 | key_block_info_list = self._decode_key_block_info(key_block_info) 384 | key_block_size = sum(list(zip(*key_block_info_list))[0]) 385 | 386 | # read key block 387 | key_block_compressed = f.read(key_block_size) 388 | # extract key block 389 | key_list = self._decode_key_block(key_block_compressed, key_block_info_list) 390 | 391 | self._record_block_offset = f.tell() 392 | f.close() 393 | 394 | self._num_entries = len(key_list) 395 | return key_list 396 | 397 | 398 | class MDD(MDict): 399 | """ 400 | MDict resource file format (*.MDD) reader. 401 | >>> mdd = MDD('example.mdd') 402 | >>> len(mdd) 403 | 208 404 | >>> for filename,content in mdd.items(): 405 | ... print filename, content[:10] 406 | """ 407 | def __init__(self, fname, passcode=None): 408 | MDict.__init__(self, fname, encoding='UTF-16', passcode=passcode) 409 | 410 | def items(self): 411 | """Return a generator which in turn produce tuples in the form of (filename, content) 412 | """ 413 | return self._decode_record_block() 414 | 415 | def _decode_record_block(self): 416 | f = open(self._fname, 'rb') 417 | f.seek(self._record_block_offset) 418 | 419 | num_record_blocks = self._read_number(f) 420 | num_entries = self._read_number(f) 421 | assert(num_entries == self._num_entries) 422 | record_block_info_size = self._read_number(f) 423 | record_block_size = self._read_number(f) 424 | 425 | # record block info section 426 | record_block_info_list = [] 427 | size_counter = 0 428 | for i in range(num_record_blocks): 429 | compressed_size = self._read_number(f) 430 | decompressed_size = self._read_number(f) 431 | record_block_info_list += [(compressed_size, decompressed_size)] 432 | size_counter += self._number_width * 2 433 | assert(size_counter == record_block_info_size) 434 | 435 | # actual record block 436 | offset = 0 437 | i = 0 438 | size_counter = 0 439 | for compressed_size, decompressed_size in record_block_info_list: 440 | record_block_compressed = f.read(compressed_size) 441 | # 4 bytes: compression type 442 | record_block_type = record_block_compressed[:4] 443 | # 4 bytes: adler32 checksum of decompressed record block 444 | adler32 = unpack('>I', record_block_compressed[4:8])[0] 445 | if record_block_type == '\x00\x00\x00\x00': 446 | record_block = record_block_compressed[8:] 447 | elif record_block_type == '\x01\x00\x00\x00': 448 | if lzo is None: 449 | print("LZO compression is not supported") 450 | break 451 | # decompress 452 | header = '\xf0' + pack('>I', decompressed_size) 453 | record_block = lzo.decompress(header + record_block_compressed[8:]) 454 | elif record_block_type == 
b'\x02\x00\x00\x00': 455 | # decompress 456 | record_block = zlib.decompress(record_block_compressed[8:]) 457 | 458 | # notice that adler32 return signed value 459 | assert(adler32 == zlib.adler32(record_block) & 0xffffffff) 460 | 461 | assert(len(record_block) == decompressed_size) 462 | # split record block according to the offset info from key block 463 | while i < len(self._key_list): 464 | record_start, key_text = self._key_list[i] 465 | # reach the end of current record block 466 | if record_start - offset >= len(record_block): 467 | break 468 | # record end index 469 | if i < len(self._key_list)-1: 470 | record_end = self._key_list[i+1][0] 471 | else: 472 | record_end = len(record_block) + offset 473 | i += 1 474 | data = record_block[record_start-offset:record_end-offset] 475 | yield key_text, data 476 | offset += len(record_block) 477 | size_counter += compressed_size 478 | assert(size_counter == record_block_size) 479 | 480 | f.close() 481 | 482 | 483 | class MDX(MDict): 484 | """ 485 | MDict dictionary file format (*.MDD) reader. 486 | >>> mdx = MDX('example.mdx') 487 | >>> len(mdx) 488 | 42481 489 | >>> for key,value in mdx.items(): 490 | ... print key, value[:10] 491 | """ 492 | def __init__(self, fname, encoding='', substyle=False, passcode=None): 493 | MDict.__init__(self, fname, encoding, passcode) 494 | self._substyle = substyle 495 | 496 | def items(self): 497 | """Return a generator which in turn produce tuples in the form of (key, value) 498 | """ 499 | return self._decode_record_block() 500 | 501 | def _substitute_stylesheet(self, txt): 502 | # substitute stylesheet definition 503 | txt_list = re.split('`\d+`', txt) 504 | txt_tag = re.findall('`\d+`', txt) 505 | txt_styled = txt_list[0] 506 | for j, p in enumerate(txt_list[1:]): 507 | style = self._stylesheet[txt_tag[j][1:-1]] 508 | if p and p[-1] == '\n': 509 | txt_styled = txt_styled + style[0] + p.rstrip() + style[1] + '\r\n' 510 | else: 511 | txt_styled = txt_styled + style[0] + p + style[1] 512 | return txt_styled 513 | 514 | def _decode_record_block(self): 515 | f = open(self._fname, 'rb') 516 | f.seek(self._record_block_offset) 517 | 518 | num_record_blocks = self._read_number(f) 519 | num_entries = self._read_number(f) 520 | assert(num_entries == self._num_entries) 521 | record_block_info_size = self._read_number(f) 522 | record_block_size = self._read_number(f) 523 | 524 | # record block info section 525 | record_block_info_list = [] 526 | size_counter = 0 527 | for i in range(num_record_blocks): 528 | compressed_size = self._read_number(f) 529 | decompressed_size = self._read_number(f) 530 | record_block_info_list += [(compressed_size, decompressed_size)] 531 | size_counter += self._number_width * 2 532 | assert(size_counter == record_block_info_size) 533 | 534 | # actual record block data 535 | offset = 0 536 | i = 0 537 | size_counter = 0 538 | for compressed_size, decompressed_size in record_block_info_list: 539 | record_block_compressed = f.read(compressed_size) 540 | # 4 bytes indicates block compression type 541 | record_block_type = record_block_compressed[:4] 542 | # 4 bytes adler checksum of uncompressed content 543 | adler32 = unpack('>I', record_block_compressed[4:8])[0] 544 | # no compression 545 | if record_block_type == b'\x00\x00\x00\x00': 546 | record_block = record_block_compressed[8:] 547 | # lzo compression 548 | elif record_block_type == b'\x01\x00\x00\x00': 549 | if lzo is None: 550 | print("LZO compression is not supported") 551 | break 552 | # decompress 553 | header = b'\xf0' + 
pack('>I', decompressed_size) 554 | record_block = lzo.decompress(header + record_block_compressed[8:]) 555 | # zlib compression 556 | elif record_block_type == b'\x02\x00\x00\x00': 557 | # decompress 558 | record_block = zlib.decompress(record_block_compressed[8:]) 559 | 560 | # notice that adler32 return signed value 561 | assert(adler32 == zlib.adler32(record_block) & 0xffffffff) 562 | 563 | assert(len(record_block) == decompressed_size) 564 | # split record block according to the offset info from key block 565 | while i < len(self._key_list): 566 | record_start, key_text = self._key_list[i] 567 | # reach the end of current record block 568 | if record_start - offset >= len(record_block): 569 | break 570 | # record end index 571 | if i < len(self._key_list)-1: 572 | record_end = self._key_list[i+1][0] 573 | else: 574 | record_end = len(record_block) + offset 575 | i += 1 576 | record = record_block[record_start-offset:record_end-offset] 577 | # convert to utf-8 578 | record = record.decode(self._encoding, errors='ignore').strip(u'\x00').encode('utf-8') 579 | # substitute styles 580 | if self._substyle and self._stylesheet: 581 | record = self._substitute_stylesheet(record) 582 | 583 | yield key_text, record 584 | offset += len(record_block) 585 | size_counter += compressed_size 586 | assert(size_counter == record_block_size) 587 | 588 | f.close() 589 | 590 | 591 | if __name__ == '__main__': 592 | import sys 593 | import os 594 | import os.path 595 | import argparse 596 | 597 | # 2x3 compatible 598 | if sys.hexversion >= 0x03000000: 599 | unicode = str 600 | 601 | def passcode(s): 602 | try: 603 | regcode, userid = s.split(',') 604 | except: 605 | raise argparse.ArgumentTypeError("Passcode must be regcode,userid") 606 | try: 607 | regcode.decode('hex') 608 | except: 609 | raise argparse.ArgumentTypeError("regcode must be a 32 bytes hexadecimal string") 610 | return regcode, userid 611 | 612 | parser = argparse.ArgumentParser() 613 | parser.add_argument('-x', '--extract', action="store_true", 614 | help='extract mdx to source format and extract files from mdd') 615 | parser.add_argument('-s', '--substyle', action="store_true", 616 | help='substitute style definition if present') 617 | parser.add_argument('-d', '--datafolder', default="data", 618 | help='folder to extract data files from mdd') 619 | parser.add_argument('-e', '--encoding', default="", 620 | help='folder to extract data files from mdd') 621 | parser.add_argument('-p', '--passcode', default=None, type=passcode, 622 | help='register_code,email_or_deviceid') 623 | parser.add_argument("filename", nargs='?', help="mdx file name") 624 | args = parser.parse_args() 625 | 626 | # use GUI to select file, default to extract 627 | if not args.filename: 628 | import Tkinter 629 | import tkFileDialog 630 | root = Tkinter.Tk() 631 | root.withdraw() 632 | args.filename = tkFileDialog.askopenfilename(parent=root) 633 | args.extract = True 634 | 635 | if not os.path.exists(args.filename): 636 | print("Please specify a valid MDX/MDD file") 637 | 638 | base, ext = os.path.splitext(args.filename) 639 | 640 | # read mdx file 641 | if ext.lower() == os.path.extsep + 'mdx': 642 | mdx = MDX(args.filename, args.encoding, args.substyle, args.passcode) 643 | if type(args.filename) is unicode: 644 | bfname = args.filename.encode('utf-8') 645 | else: 646 | bfname = args.filename 647 | print('======== %s ========' % bfname) 648 | print(' Number of Entries : %d' % len(mdx)) 649 | for key, value in mdx.header.items(): 650 | print(' %s : %s' % (key, value)) 
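        # A minimal sketch of using the MDX class directly rather than through this
        # command-line block, kept as comments; 'some_dict.mdx' is a placeholder,
        # not a file shipped with this repository:
        #     mdx = MDX('some_dict.mdx')
        #     print(len(mdx))                  # number of entries
        #     for key, value in mdx.items():   # both are byte strings
        #         pass                         # value is the utf-8 encoded record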
651 | else: 652 | mdx = None 653 | 654 | # find companion mdd file 655 | mdd_filename = ''.join([base, os.path.extsep, 'mdd']) 656 | if os.path.exists(mdd_filename): 657 | mdd = MDD(mdd_filename, args.passcode) 658 | if type(mdd_filename) is unicode: 659 | bfname = mdd_filename.encode('utf-8') 660 | else: 661 | bfname = mdd_filename 662 | print('======== %s ========' % bfname) 663 | print(' Number of Entries : %d' % len(mdd)) 664 | for key, value in mdd.header.items(): 665 | print(' %s : %s' % (key, value)) 666 | else: 667 | mdd = None 668 | 669 | if args.extract: 670 | # write out glos 671 | if mdx: 672 | output_fname = ''.join([base, os.path.extsep, 'txt']) 673 | tf = open(output_fname, 'wb') 674 | for key, value in mdx.items(): 675 | tf.write(key) 676 | tf.write(b'\r\n') 677 | tf.write(value) 678 | tf.write(b'\r\n') 679 | tf.write(b'\r\n') 680 | tf.close() 681 | # write out style 682 | if mdx.header.get('StyleSheet'): 683 | style_fname = ''.join([base, '_style', os.path.extsep, 'txt']) 684 | sf = open(style_fname, 'wb') 685 | sf.write(b'\r\n'.join(mdx.header['StyleSheet'].splitlines())) 686 | sf.close() 687 | # write out optional data files 688 | if mdd: 689 | datafolder = os.path.join(os.path.dirname(args.filename), args.datafolder) 690 | if not os.path.exists(datafolder): 691 | os.makedirs(datafolder) 692 | for key, value in mdd.items(): 693 | dfname = ''.join([datafolder, key.replace('\\', os.path.sep).decode('utf-8')]) 694 | if not os.path.exists(os.path.dirname(dfname)): 695 | os.makedirs(os.path.dirname(dfname)) 696 | df = open(dfname, 'wb') 697 | df.write(value) 698 | df.close() 699 | -------------------------------------------------------------------------------- /ripemd128.py: -------------------------------------------------------------------------------- 1 | """ 2 | Copyright by https://github.com/zhansliu/writemdict 3 | 4 | ripemd128.py - A simple ripemd128 library in pure Python. 5 | 6 | Supports both Python 2 (versions >= 2.6) and Python 3. 7 | 8 | Usage: 9 | from ripemd128 import ripemd128 10 | digest = ripemd128(b"The quick brown fox jumps over the lazy dog") 11 | assert(digest == b"\x3f\xa9\xb5\x7f\x05\x3c\x05\x3f\xbe\x27\x35\xb2\x38\x0d\xb5\x96") 12 | 13 | """ 14 | 15 | 16 | 17 | import struct 18 | 19 | 20 | # follows this description: http://homes.esat.kuleuven.be/~bosselae/ripemd/rmd128.txt 21 | 22 | def f(j, x, y, z): 23 | assert(0 <= j and j < 64) 24 | if j < 16: 25 | return x ^ y ^ z 26 | elif j < 32: 27 | return (x & y) | (z & ~x) 28 | elif j < 48: 29 | return (x | (0xffffffff & ~y)) ^ z 30 | else: 31 | return (x & z) | (y & ~z) 32 | 33 | def K(j): 34 | assert(0 <= j and j < 64) 35 | if j < 16: 36 | return 0x00000000 37 | elif j < 32: 38 | return 0x5a827999 39 | elif j < 48: 40 | return 0x6ed9eba1 41 | else: 42 | return 0x8f1bbcdc 43 | 44 | def Kp(j): 45 | assert(0 <= j and j < 64) 46 | if j < 16: 47 | return 0x50a28be6 48 | elif j < 32: 49 | return 0x5c4dd124 50 | elif j < 48: 51 | return 0x6d703ef3 52 | else: 53 | return 0x00000000 54 | 55 | def padandsplit(message): 56 | """ 57 | returns a two-dimensional array X[i][j] of 32-bit integers, where j ranges 58 | from 0 to 16. 59 | First pads the message to length in bytes is congruent to 56 (mod 64), 60 | by first adding a byte 0x80, and then padding with 0x00 bytes until the 61 | message length is congruent to 56 (mod 64). Then adds the little-endian 62 | 64-bit representation of the original length. Finally, splits the result 63 | up into 64-byte blocks, which are further parsed as 32-bit integers. 
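    For example, a 9-byte message gets the byte 0x80 plus 46 zero bytes
    (reaching 56 bytes), followed by the 8-byte little-endian bit length
    (72), which yields exactly one 64-byte block.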
64 | """ 65 | origlen = len(message) 66 | padlength = 64 - ((origlen - 56) % 64) #minimum padding is 1! 67 | message += b"\x80" 68 | message += b"\x00" * (padlength - 1) 69 | message += struct.pack("> (32-s)) & 0xffffffff 86 | 87 | r = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15, 88 | 7, 4,13, 1,10, 6,15, 3,12, 0, 9, 5, 2,14,11, 8, 89 | 3,10,14, 4, 9,15, 8, 1, 2, 7, 0, 6,13,11, 5,12, 90 | 1, 9,11,10, 0, 8,12, 4,13, 3, 7,15,14, 5, 6, 2] 91 | rp = [ 5,14, 7, 0, 9, 2,11, 4,13, 6,15, 8, 1,10, 3,12, 92 | 6,11, 3, 7, 0,13, 5,10,14,15, 8,12, 4, 9, 1, 2, 93 | 15, 5, 1, 3, 7,14, 6, 9,11, 8,12, 2,10, 0, 4,13, 94 | 8, 6, 4, 1, 3,11,15, 0, 5,12, 2,13, 9, 7,10,14] 95 | s = [11,14,15,12, 5, 8, 7, 9,11,13,14,15, 6, 7, 9, 8, 96 | 7, 6, 8,13,11, 9, 7,15, 7,12,15, 9,11, 7,13,12, 97 | 11,13, 6, 7,14, 9,13,15,14, 8,13, 6, 5,12, 7, 5, 98 | 11,12,14,15,14,15, 9, 8, 9,14, 5, 6, 8, 6, 5,12] 99 | sp = [ 8, 9, 9,11,13,15,15, 5, 7, 7, 8,11,14,14,12, 6, 100 | 9,13,15, 7,12, 8, 9,11, 7, 7,12, 7, 6,15,13,11, 101 | 9, 7,15,11, 8, 6, 6,14,12,13, 5,14,13,13, 7, 5, 102 | 15, 5, 8,11,14,14, 6,14, 6, 9,12, 9,12, 5,15, 8] 103 | 104 | 105 | def ripemd128(message): 106 | h0 = 0x67452301 107 | h1 = 0xefcdab89 108 | h2 = 0x98badcfe 109 | h3 = 0x10325476 110 | X = padandsplit(message) 111 | for i in range(len(X)): 112 | (A,B,C,D) = (h0,h1,h2,h3) 113 | (Ap,Bp,Cp,Dp) = (h0,h1,h2,h3) 114 | for j in range(64): 115 | T = rol(s[j], add(A, f(j,B,C,D), X[i][r[j]], K(j))) 116 | (A,D,C,B) = (D,C,B,T) 117 | T = rol(sp[j], add(Ap, f(63-j,Bp,Cp,Dp), X[i][rp[j]], Kp(j))) 118 | (Ap,Dp,Cp,Bp)=(Dp,Cp,Bp,T) 119 | T = add(h1,C,Dp) 120 | h1 = add(h2,D,Ap) 121 | h2 = add(h3,A,Bp) 122 | h3 = add(h0,B,Cp) 123 | h0 = T 124 | 125 | 126 | return struct.pack(" 1: 270 | self.log_match_result(_log_message_l, search_scope_re_str, 'm') 271 | else: 272 | self.log_match_result(_log_message_l, search_scope_re_str, 'u') 273 | if has_both_arg: 274 | search_start_index_l[target_index] = first_match_obj.end(1) 275 | search_end_index_l[target_index] = first_match_obj.start(3) 276 | elif has_after_arg: 277 | search_start_index_l[target_index] = first_match_obj.end() 278 | else: # has_before_arg 279 | search_end_index_l[target_index] = first_match_obj.start() 280 | if try_before_arg_flag: 281 | # no -a and -b pattern, try the -b pattern 282 | _log_message_l.append('Restricted to the before pattern') 283 | n_match, first_match_obj = search_target_str(args_d['before'], target_str, 284 | scope_re_flag) 285 | if n_match == 0: 286 | self.log_match_result(args_d['before'], 'n') 287 | try_after_arg_flag = True 288 | else: 289 | if n_match > 1: 290 | self.log_match_result(args_d['before'], 'm') 291 | else: 292 | self.log_match_result(args_d['before'], 'u') 293 | search_end_index_l[target_index] = first_match_obj.start() 294 | if try_after_arg_flag: 295 | # no -a and -b pattern, no -b pattern, last try -a pattern 296 | _log_message_l.append('Last try the after pattern') 297 | n_match, first_match_obj = search_target_str(args_d['after'], target_str, 298 | scope_re_flag) 299 | if n_match == 0: 300 | self.log_match_result(_log_message_l, args_d['after'], 'n') 301 | _log_message_l.append('All tries failed! The search scope remains as default') 302 | else: 303 | if n_match > 1: 304 | self.log_match_result(_log_message_l, args_d['after'], 'm') 305 | else: 306 | self.log_match_result(_log_message_l, args_d['after'], 'u') 307 | search_start_index_l[target_index] = first_match_obj.end() 308 | _log_message_l.append('-- Finished. 
File: ' + file_path_l[target_index]) 309 | else: 310 | _log_message_l.append('No argument is provided. The search scope remains as default') 311 | _par_d['search_start_index_l'], _par_d['search_end_index_l'] = \ 312 | search_start_index_l, search_end_index_l 313 | 314 | @staticmethod 315 | def modify_target_str(_log_message_l, _par_d): 316 | _log_message_l.append('# Begin to modify the target str') 317 | file_path_l = _par_d['file_path_l'] 318 | n_target_str = _par_d['n_target_str'] 319 | target_str_l = _par_d['target_str_l'] 320 | cell = _par_d['cell'] 321 | args_d = _par_d['args_d'] 322 | search_start_index_l = _par_d['search_start_index_l'] 323 | search_end_index_l = _par_d['search_end_index_l'] 324 | indent = args_d['indent'] 325 | indented_cell_l = [] 326 | # pre process the cell: skip lines that are blank 327 | # before the first non blank line or after the last non blank line 328 | cell = cell.strip() 329 | cell_line_l = [] if cell == '' else cell.split('\n') 330 | n_cell_line = len(cell_line_l) 331 | modified_target_str_l = [target_str_l[i] for i in range(n_target_str)] 332 | update_d = {'cell': cell, 'cell_line_l': cell_line_l, 333 | 'n_cell_line': n_cell_line, 334 | 'modified_target_str_l': modified_target_str_l} 335 | _par_d.update(update_d) 336 | if n_cell_line == 0: 337 | # nothing to write 338 | _log_message_l.append('!! Empty cell. Nothing to write.') 339 | return 340 | # indent cell for writing 341 | for cell_line in cell_line_l: 342 | indented_cell_l.append(' '*indent + cell_line) 343 | indented_cell = '\n'.join(indented_cell_l) 344 | # log writing mode 345 | append_message_d = {'o': '!! Writing mode is overwrite.', 346 | 'i': '!! Writing mode is insert.', 347 | 'a': '!! Writing mode is append.', 348 | 'di': '!! Writing mode is different and insert.', 349 | 'da': '!! Writing mode is different and append.'} 350 | _log_message_l.append(append_message_d[args_d['mode']]) 351 | # begin to build modified str 352 | for target_index, target_str in enumerate(target_str_l): 353 | file_path = file_path_l[target_index] 354 | _log_message_l.append('++ Deal with file ' + file_path) 355 | start_index = search_start_index_l[target_index] 356 | end_index = search_end_index_l[target_index] 357 | if target_str == '': 358 | _log_message_l.append('Target file is empty.') 359 | modified_target_str_l[target_index] = indented_cell 360 | else: 361 | left_segment_end = None 362 | right_segment_start = None 363 | if args_d['mode'] == 'o': 364 | left_segment_end = start_index 365 | right_segment_start = end_index 366 | elif args_d['mode'] in ['i', 'di']: 367 | left_segment_end = start_index 368 | right_segment_start = start_index 369 | else: 370 | # args_d['mode'] in ['a', 'da']: 371 | left_segment_end = end_index 372 | right_segment_start = end_index 373 | if args_d['mode'] in ['da', 'di']: 374 | # try to match the cell as whole 375 | _log_message_l.append('Try to match the cell as whole.') 376 | cell_re = construct_indent_line_re(cell) 377 | n_match, _ = search_target_str(cell_re, target_str, re.M, 378 | start_index, end_index) 379 | if n_match > 0: 380 | _log_message_l.append('Whole cell matched. 
No need to update.') 381 | left_segment_end = None 382 | right_segment_start = None 383 | if not (left_segment_end is None): 384 | modified_str = target_str[:left_segment_end] 385 | if modified_str != '': 386 | if modified_str[-1] != '\n': 387 | modified_str += '\n' 388 | modified_str += indented_cell 389 | if modified_str[-1] != '\n': 390 | modified_str += '\n' 391 | modified_str += target_str[right_segment_start:] 392 | modified_target_str_l[target_index] = modified_str 393 | _log_message_l.append('Target str is modified') 394 | _log_message_l.append('-- Finished. File: ' + file_path) 395 | 396 | @staticmethod 397 | def log_match_result(_log_l, pattern_str, key_word='u'): 398 | _log_l.append(SyncToFile.match_result_prefix_d[key_word]) 399 | formatted_pattern_str = '-wrap2 ' + pattern_str 400 | _log_l.append(formatted_pattern_str) 401 | 402 | 403 | # In order to actually use these magics, you must register them with a 404 | # running IPython. This code must be placed in a file that is loaded once 405 | # IPython is up and running: 406 | ip = get_ipython() 407 | # You can register the class itself without instantiating it. IPython will 408 | # call the default constructor on it. 409 | ip.register_magics(SyncToFile) -------------------------------------------------------------------------------- /wagnerfischerpp.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # 3 | # Copyright (c) 2013-2014 Kyle Gorman 4 | # 5 | # Permission is hereby granted, free of charge, to any person obtaining a 6 | # copy of this software and associated documentation files (the 7 | # "Software"), to deal in the Software without restriction, including 8 | # without limitation the rights to use, copy, modify, merge, publish, 9 | # distribute, sublicense, and/or sell copies of the Software, and to 10 | # permit persons to whom the Software is furnished to do so, subject to 11 | # the following conditions: 12 | # 13 | # The above copyright notice and this permission notice shall be included 14 | # in all copies or substantial portions of the Software. 15 | # 16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS 17 | # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF 18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 19 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY 20 | # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, 21 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE 22 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 23 | # 24 | # wagnerfischerpp.py: efficient computation of Levenshtein distance and 25 | # all optimal alignments with arbitrary edit costs. The algorithm for 26 | # computing the dynamic programming table used has been discovered many 27 | # times, but most notably by Wagner & Fischer: 28 | # 29 | # R.A. Wagner & M.J. Fischer. 1974. The string-to-string correction 30 | # problem. Journal of the ACM, 21(1): 168-173. 31 | # 32 | # Wagner & Fischer also describe an algorithm ("Algorithm Y") to find the 33 | # alignment path (i.e., list of edit operations involved in the optimal 34 | # alignment), but it it is specified such that in fact it only generates 35 | # one such path, whereas many such paths may exist, particularly when 36 | # multiple edit operations have the same cost. 
For example, when all edit
37 | # operations have the same cost, there are two equal-cost alignments of
38 | # "TGAC" and "GCAC":
39 | #
40 | # TGAC TGxAC
41 | # ss== d=i==
42 | # GCAC xGCAC
43 | #
44 | # However, all such paths can be generated efficiently, as follows. First,
45 | # the dynamic programming table "cells" are defined as tuples of (partial
46 | # cost, set of all operations reaching this cell with minimal cost). As a
47 | # result, the completed table can be thought of as an unweighted, directed
48 | # graph (or FSA). The bottom right cell (the one containing the Levenshtein
49 | # distance) is the start state and the origin cell is the end state. The arcs
50 | # are the operations stored in each cell. (Many of the cells of the
51 | # table, those which are not visited by any optimal alignment, are under
52 | # the graph interpretation unconnected vertices, and can be ignored.) Every
53 | # path between the bottom right cell and the origin cell is an optimal
54 | # alignment. These paths can be efficiently enumerated using breadth-first
55 | # traversal. The trick here is that elements in the deque must not only contain
56 | # indices but also partial paths. Averaging over all such paths, we can
57 | # come up with an estimate of the number of insertions, deletions, and
58 | # substitutions involved as well; in the example above, we say S = 1 and
59 | # D, I = 0.5.
60 |
61 | from __future__ import division
62 |
63 | from pprint import PrettyPrinter
64 | from collections import deque, namedtuple, Counter
65 |
66 | # default costs
67 |
68 | INSERTION = 1
69 | DELETION = 1
70 | SUBSTITUTION = 1
71 |
72 | Trace = namedtuple("Trace", ["cost", "ops"])
73 |
74 |
75 | class WagnerFischer(object):
76 |
77 |     """
78 |     An object representing a (set of) Levenshtein alignments between two
79 |     iterable objects (they need not be strings). The cost of the optimal
80 |     alignment is stored in `self.cost`, and all Levenshtein alignments can
81 |     be generated using `self.alignments()`.
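    Typical use is to build the object once, then read `self.cost`, iterate
    over `self.alignments()` (depth-first by default, breadth-first when
    called with `bfirst=True`), or call `self.IDS()` for averaged
    per-operation counts.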
82 | 83 | Basic tests: 84 | 85 | >>> WagnerFischer("god", "gawd").cost 86 | 2 87 | >>> WagnerFischer("sitting", "kitten").cost 88 | 3 89 | >>> WagnerFischer("bana", "banananana").cost 90 | 6 91 | >>> WagnerFischer("bana", "bana").cost 92 | 0 93 | >>> WagnerFischer("banana", "angioplastical").cost 94 | 11 95 | >>> WagnerFischer("angioplastical", "banana").cost 96 | 11 97 | >>> WagnerFischer("Saturday", "Sunday").cost 98 | 3 99 | 100 | IDS tests: 101 | 102 | >>> WagnerFischer("doytauvab", "doyvautab").IDS() == {"S": 2.0} 103 | True 104 | >>> WagnerFischer("kitten", "sitting").IDS() == {"I": 1.0, "S": 2.0} 105 | True 106 | """ 107 | 108 | # initialize pretty printer (shared across all class instances) 109 | pprint = PrettyPrinter(width=75) 110 | 111 | def __init__(self, A, B, insertion=INSERTION, deletion=DELETION, 112 | substitution=SUBSTITUTION): 113 | # score operation costs in a dictionary, for programmatic access 114 | self.costs = {"I": insertion, "D": deletion, "S": substitution} 115 | # initialize table 116 | self.asz = len(A) 117 | self.bsz = len(B) 118 | self._table = [[None for _ in xrange(self.bsz + 1)] for 119 | _ in xrange(self.asz + 1)] 120 | # from now on, all indexing done using self.__getitem__ 121 | ## fill in edges 122 | self[0][0] = Trace(0, {"O"}) # start cell 123 | for i in xrange(1, self.asz + 1): 124 | self[i][0] = Trace(i * self.costs["D"], {"D"}) 125 | for j in xrange(1, self.bsz + 1): 126 | self[0][j] = Trace(j * self.costs["I"], {"I"}) 127 | ## fill in rest 128 | for i in xrange(len(A)): 129 | for j in xrange(len(B)): 130 | # clean it up in case there are more than one 131 | # check for match first, always cheapest option 132 | if A[i] == B[j]: 133 | self[i + 1][j + 1] = Trace(self[i][j].cost, {"M"}) 134 | # check for other types 135 | else: 136 | costI = self[i + 1][j].cost + self.costs["I"] 137 | costD = self[i][j + 1].cost + self.costs["D"] 138 | costS = self[i][j].cost + self.costs["S"] 139 | # determine min of three 140 | min_val = min(costI, costD, costS) 141 | # write that much in 142 | trace = Trace(min_val, set()) 143 | # add _all_ operations matching minimum value 144 | if costI == min_val: 145 | trace.ops.add("I") 146 | if costD == min_val: 147 | trace.ops.add("D") 148 | if costS == min_val: 149 | trace.ops.add("S") 150 | # write to table 151 | self[i + 1][j + 1] = trace 152 | # store optimum cost as a property 153 | self.cost = self[-1][-1].cost 154 | 155 | def __repr__(self): 156 | return self.pprint.pformat(self._table) 157 | 158 | def __iter__(self): 159 | for row in self._table: 160 | yield row 161 | 162 | def __getitem__(self, i): 163 | """ 164 | Returns the i-th row of the table, which is a list and so 165 | can be indexed. 
Therefore, e.g., self[2][3] == self._table[2][3] 166 | """ 167 | return self._table[i] 168 | 169 | # stuff for generating alignments 170 | 171 | def _stepback(self, i, j, trace, path_back): 172 | """ 173 | Given a cell location (i, j) and a Trace object trace, generate 174 | all traces they point back to in the table 175 | """ 176 | for op in trace.ops: 177 | if op == "M": 178 | yield i - 1, j - 1, self[i - 1][j - 1], path_back + ["M"] 179 | elif op == "I": 180 | yield i, j - 1, self[i][j - 1], path_back + ["I"] 181 | elif op == "D": 182 | yield i - 1, j, self[i - 1][j], path_back + ["D"] 183 | elif op == "S": 184 | yield i - 1, j - 1, self[i - 1][j - 1], path_back + ["S"] 185 | elif op == "O": 186 | return # origin cell, we"re done iterating 187 | else: 188 | raise ValueError("Unknown op '{}'".format(op)) 189 | 190 | def alignments(self, bfirst=False): 191 | """ 192 | Generate all alignments with optimal cost by traversing the 193 | an implicit graph on the dynamic programming table. By default, 194 | depth-first traversal is used, since users seem to get tired 195 | waiting for their first results. 196 | """ 197 | # each cell of the queue is a tuple of (i, j, trace, path_back) 198 | # where i, j is the current index, trace is the trace object at 199 | # this cell 200 | if bfirst: 201 | return self._bfirst_alignments() 202 | else: 203 | return self._dfirst_alignments() 204 | 205 | def _dfirst_alignments(self): 206 | """ 207 | Generate alignments via depth-first traversal. 208 | """ 209 | stack = list(self._stepback(self.asz, self.bsz, self[-1][-1], [])) 210 | while stack: 211 | (i, j, trace, path_back) = stack.pop() 212 | if trace.ops == {"O"}: 213 | path_back.reverse() 214 | yield path_back 215 | continue 216 | stack.extend(self._stepback(i, j, trace, path_back)) 217 | 218 | def _bfirst_alignments(self): 219 | """ 220 | Generate alignments via breadth-first traversal. 221 | """ 222 | queue = deque(self._stepback(self.asz, self.bsz, self[-1][-1], [])) 223 | while queue: 224 | (i, j, trace, path_back) = queue.popleft() 225 | if trace.ops == {"O"}: 226 | path_back.reverse() 227 | yield path_back 228 | continue 229 | queue.extend(self._stepback(i, j, trace, path_back)) 230 | 231 | def IDS(self): 232 | """ 233 | Estimate insertions, deletions, and substitution _count_ (not 234 | costs). Non-integer values arise when there are multiple possible 235 | alignments with the same cost. 236 | """ 237 | npaths = 0 238 | opcounts = Counter() 239 | for alignment in self.alignments(): 240 | # count edit types for this path, ignoring "M" (which is free) 241 | opcounts += Counter(op for op in alignment if op != "M") 242 | npaths += 1 243 | # average over all paths 244 | return Counter({o: c / npaths for (o, c) in opcounts.iteritems()}) 245 | 246 | 247 | if __name__ == "__main__": 248 | import doctest 249 | doctest.testmod() --------------------------------------------------------------------------------
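A minimal usage sketch for wagnerfischerpp.py (Python 2, which this module targets through its use of xrange/iteritems); the sample strings are illustrative and the printed values follow the doctests above:

    from wagnerfischerpp import WagnerFischer

    wf = WagnerFischer("kitten", "sitting")
    print(wf.cost)         # optimal Levenshtein cost: 3
    print(dict(wf.IDS()))  # averaged over all optimal alignments: {'I': 1.0, 'S': 2.0}
    for path in wf.alignments():
        print(path)        # one optimal edit script, e.g. ['S', 'M', 'M', 'M', 'S', 'M', 'I']
        break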