├── .idea
│   ├── dictionaries
│   │   └── xiaohang.xml
│   └── vcs.xml
├── CreateAnkiImport_GrePhrase.py
├── CreateAnkiImport_GreWord.py
├── README.md
├── add_similar_word.py
├── anki_import.ipynb
├── anki_import.md
├── base_data
│   ├── GREGao Fen Bi Bei Duan Yu Da Pe - Yan Yu Zhen ,Gao Yu ,Chen Qi.txt
│   ├── GREHe Xin Ci Hui Kao Fa Jing Xi (Xin Dong Fang Da Yu Ying Yu Xue Xi Cong Shu ) - Chen Qi.txt
│   ├── GREHe Xin Ci Hui Zhu Ji Yu Jing - Cao Tian Cheng.txt
│   └── bzsdbdc_dic.txt
├── convert_duanyu.py
├── convert_new3000.py
├── convert_zhuji.py
├── example_usage.apkg
├── explore_all_in_one.ipynb
├── explore_all_in_one.md
├── my_helpers.py
├── pureSalsa20.py
├── readmdict.py
├── ripemd128.py
├── sync_to_file_magic_command.py
└── wagnerfischerpp.py
/.idea/dictionaries/xiaohang.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 | jupyter
5 |
6 |
7 |
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
--------------------------------------------------------------------------------
/CreateAnkiImport_GrePhrase.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import json
3 | import codecs
4 | import os
5 | from my_helpers import *
6 | file_name_duanyu = 'duanyu_base_d.txt'
7 | duanyu_base_d = is_file_and_json_load(file_name_duanyu)
8 | output_file_GrePhrase = 'AnkiImportData_GrePhrase.txt'
9 | def convert_to_GrePhrase():
10 | with codecs.open(output_file_GrePhrase, 'w', encoding='utf-8') as f:
11 | my_notes = ''
12 | for phrase_uid, phrase_dict in duanyu_base_d.iteritems():
13 | one_line = [phrase_uid, phrase_dict['phrase'], phrase_dict['usage_index'], my_notes,
14 | phrase_dict['en_exp'], phrase_dict['cn_exp'],
15 | phrase_dict['example'], phrase_dict['gre_example_cn'],
16 | phrase_dict['gre_example_en']]
17 | one_line = '\t'.join(one_line) + '\n'
18 | f.write(one_line)
19 | if __name__ == '__main__':
20 | if not (duanyu_base_d is None):
21 | convert_to_GrePhrase()
22 |
--------------------------------------------------------------------------------
/CreateAnkiImport_GreWord.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import json
3 | import codecs
4 | import os
5 | from my_helpers import *
6 | file_name_new3000 = 'new3000_base_d.txt'
7 | file_name_zhuji = 'zhuji_base_d.txt'
8 | file_name_bzsdbdc = 'base_data\\bzsdbdc_dic.txt'
9 | output_file_GreWord = 'AnkiImportData_GreWord.txt'
10 | new3000_base_d = None
11 | zhuji3000_base_d = None
12 | bzsdbdc_data = None
13 | new3000_base_d = is_file_and_json_load(file_name_new3000)
14 | zhuji3000_base_d = is_file_and_json_load(file_name_zhuji)
15 | bzsdbdc_data = is_file_and_json_load(file_name_bzsdbdc)
16 | no_data_new3000 = new3000_base_d is None
17 | no_data_zhuji = zhuji3000_base_d is None
18 | no_data_bzsdbdc = bzsdbdc_data is None
19 | def add_extra_fields():
20 | if no_data_new3000:
21 |         print 'New3000 data file does not exist! Nothing can be done...'
22 | return
23 | iter_path = [('all','',True), ('key','usages',False),('all','',False)]
24 | for word, usage_d in iter_through_general(new3000_base_d, iter_path):
25 | usage_d['audio'] = ''
26 | usage_d['mynotes'] = ''
27 | for word_d in new3000_base_d.itervalues():
28 | word_d['similar_form'] = ''
29 | def convert_to_GreWord():
30 | if no_data_new3000:
31 |         print 'New3000 data file does not exist! Nothing can be done...'
32 | return
33 | if no_data_zhuji:
34 | print 'No data of zhuji!'
35 | if no_data_bzsdbdc:
36 | print 'No data of bzsdbdc!'
37 | output_list = []
38 | None_repr = u''
39 |     join_by_line_break = u'<br>'.join
40 |     replace_with_br = lambda _str: _str.replace('\n', '<br>')
41 | tag_pos_prefix = ' in_'
42 | for word in new3000_base_d:
43 | # new 3000 part
44 | """
45 | the structure of a word of new3000_base_d.txt
46 |
47 | {'phon': u"[\u02cc\xe6d'l\u026ab]",
48 | 'pos': (1, 6),
49 | 'usages': [{'ants': u'\u53cd\u3000considered, planned, premeditated, rehearsed \u9884\u5148\u8ba1\u5212\u7684',
50 | 'ants_d': {'cn': u'\u9884\u5148\u8ba1\u5212\u7684',
51 | 'en': u'considered, planned, premeditated, rehearsed ',
52 | 'en_cn': u'considered, planned, premeditated, rehearsed \u9884\u5148\u8ba1\u5212\u7684'},
53 | 'der': '',
54 | 'examples': u'content...',
55 | 'en': u'not bad for an ad-lib comedy routine',
56 | 'en_cn': u'content...'},
57 | 'exp': u'*adj.* \u5373\u5174\u7684\uff1amade or done **without previous thought or preparation**',
58 | 'exp_d': {'cn': u'\u5373\u5174\u7684',
59 | 'en': u'made or done **without previous thought or preparation**',
60 | 'en_cn': u'\u5373\u5174\u7684\uff1amade or done **without previous thought or preparation**'},
61 | 'ph_symbl': u"[\u02cc\xe6d'l\u026ab]",
62 | 'pspeech': u'adj.',
63 | 'syns': u'content...'}
64 | """
65 | one_new3000_word_d = new3000_base_d[word]
66 | word_pos_L, word_pos_U = one_new3000_word_d['pos']
67 | word_pos = u'L' + unicode(word_pos_L) + u' U' + unicode(word_pos_U)
68 | num_usages = len(one_new3000_word_d['usages'])
69 | usages_tag = unicode(num_usages) + u'_usage'
70 |
71 | for usage_index, usage in enumerate(one_new3000_word_d['usages']):
72 | word_phs = usage['ph_symbl']
73 | word_tags = usages_tag + tag_pos_prefix + 'zaiyaoniming3000'
74 | if not no_data_zhuji:
75 | if word in zhuji3000_base_d:
76 | word_tags += tag_pos_prefix + 'zhuji3000'
77 | if not no_data_bzsdbdc:
78 | if word in bzsdbdc_data:
79 | word_tags += tag_pos_prefix + 'bzsdbdc'
80 | usage_index = unicode(usage_index+1)
81 | word_uid = unicode(word) + usage_index
82 | ph_symbl = usage['ph_symbl']
83 | word_Audio = usage['audio']
84 | pspeech = usage['pspeech']
85 | exp_en = usage['exp_d']['en']
86 | exp_cn = usage['exp_d']['cn']
87 | exp_en_cn = usage['exp_d']['en_cn']
88 | # combine other explanation
89 | #usage_index_l = range(num_usages)
90 | #usage_index_l.remove(usage_index)
91 | #exp_other = ['**考法%d**:'%(i+1) + one_new3000_word_d['usages'][i]['exp_d']['en_cn'] +'\n' for i in usage_index_l]
92 | # use word_block_str as all explanation
93 | exp_all = one_new3000_word_d['word_block_str']
94 | examples_en = usage['examples_d']['en']
95 | examples_cn = usage['examples_d']['cn']
96 | examples_en_cn = usage['examples_d']['en_cn']
97 | examples_others = ''
98 | ants_en = usage['ants_d']['en']
99 | ants_cn = usage['ants_d']['cn']
100 | ants_en_cn = usage['ants_d']['en_cn']
101 | syns = usage['syns']
102 | # der from the book zaiyaoniming3000
103 | der_new3000 = usage['der']
104 |
105 | # bzsdbdc part
106 | how_to_mem_bzsdbdc = None_repr
107 | if not no_data_bzsdbdc:
108 | if word in bzsdbdc_data:
109 | how_to_mem_bzsdbdc = bzsdbdc_data[word]['combined']
110 |
111 | # zhuji3000 part
112 | how_to_mem_zhuji3000, eytma_gr, eytma_gr_exp, eytma_cognates = None_repr, None_repr, None_repr, None_repr
113 | '''
114 | the structure of a word of zhuji3000_base_d
115 | {'content': u'[\u6839] per- [through] + vad [go] + -e [v.], go through, \u904d\u5e03 \u2192 vt. \u5f25\u6f2b\uff0c\u5145\u6ee1\n',
116 | 'ety': 'vad, vag, ced',
117 | 'etyma_cognates_l': u'pervade, evasive, extravagant, vague, cessation, incessant',
118 | 'etyma_group_explanation': u'group explanation content',
119 | 'phon': u"[p\u0259r've\u026ad]",
120 | 'pos': u'6, 7',
121 | 'summary': u'summary content',
122 | 'word': u'pervade'}
123 | '''
124 | if not no_data_zhuji:
125 | if word in zhuji3000_base_d:
126 | how_to_mem_zhuji3000 = zhuji3000_base_d[word]['content']
127 | eytma_gr = zhuji3000_base_d[word]['ety']
128 | eytma_gr_exp = zhuji3000_base_d[word]['etyma_group_explanation']
129 | eytma_cognates = zhuji3000_base_d[word]['etyma_cognates_l']
130 | # extra fields
131 | mynotes = usage['mynotes']
132 | similar_form = one_new3000_word_d['similar_form']
133 | """
134 | Anki GreWord Structure
135 | word_uid word usage_index ph_symbl word_audio pspeech mynotes
136 | exp_en exp_cn exp_en_cn exp_all
137 | examples_en examples_cn examples_encn examples_others
138 | ants_en ants_cn ants_encn
139 | syns der_new3000
140 | how_to_mem_bzsdbdc how_to_mem_zhuji3000
141 | etyma_group etyma_group_exp etyma_cognates
142 | position similar_form tags
143 | """
144 | one_line = [word_uid, word, usage_index, ph_symbl, word_Audio, pspeech, mynotes,
145 | exp_en, exp_cn, exp_en_cn, exp_all,
146 | examples_en, examples_cn, examples_en_cn, examples_others,
147 | ants_en, ants_cn, ants_en_cn] +\
148 | [syns, der_new3000, how_to_mem_bzsdbdc, how_to_mem_zhuji3000,
149 | eytma_gr, eytma_gr_exp, eytma_cognates, word_pos, similar_form, word_tags]
150 | for index, _str in enumerate(one_line):
151 | _str = replace_with_br(collapse_blank_line(_str).strip(' \n'))
152 | one_line[index] = custom_html_element(_str)
153 | output_list.append(one_line)
154 | output_list.sort(key=lambda x: x[0])
155 | return output_list
156 | def main():
157 |     add_extra_fields()
158 | output_list = convert_to_GreWord()
159 | if output_list is None:
160 | return
161 | with codecs.open(output_file_GreWord, 'w', encoding='utf-8') as f:
162 | for one_line in output_list:
163 | one_string = u'\t'.join(one_line) + '\n'
164 | f.write(one_string)
165 | del output_list
166 | if __name__ == '__main__':
167 | main()
168 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Intro
2 |
3 | Some scripts that convert a series of Chinese GRE vocabulary books into Anki notes.
4 |
5 | The markdown cells of the Jupyter notebooks are written in Chinese, but contain
6 | detailed explanations and a walkthrough.
7 |
8 | `explore_all_in_one.ipynb` converts the txt source files into structured dict objects.
9 |
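The intermediate dict objects are stored as plain JSON text files, so they can be inspected directly. A minimal sketch (assuming `new3000_base_d.txt` has already been produced by the notebook; Python 2, like the rest of the scripts):

```python
# -*- coding: utf-8 -*-
# Load the converted dict produced by explore_all_in_one.ipynb and peek at one entry.
import codecs
import json

with codecs.open('new3000_base_d.txt', encoding='utf-8') as f:
    new3000_base_d = json.load(f)

print len(new3000_base_d)               # number of headwords
print new3000_base_d['abandon'].keys()  # fields of a single word entry
```
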
10 | `anki_import.ipynb` uses the converted dict objects to generate the import files for Anki.
11 |
12 | `readmdict.py`, `ripemd128.py` and `pureSalsa20.py` are the tools used to unpack mdd and mdx files.
13 | See https://bitbucket.org/xwang/mdict-analysis/overview for more details.
14 |
15 | I share the ready-to-use import txt and audio files on Baiduyun. You can use the
16 | example_usage.apkg to build the note type.
17 |
18 | http://pan.baidu.com/s/1pJ5W9uF password:xgif
19 |
20 |
21 |
--------------------------------------------------------------------------------
/add_similar_word.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 |
3 | from multiprocessing import Pool
4 | from wagnerfischerpp import WagnerFischer
5 | import codecs
6 | import json
7 |
8 |
9 | with codecs.open('new3000_base_d.txt') as f:
10 | new3000_base_d = json.load(f,encoding='utf-8')
11 |
12 |
13 | def get_similar_word(word_a, threshold=2):
14 | distance_l = []
15 | for word_b in new3000_base_d:
16 | if word_b == word_a:
17 | continue
18 | cost_a_b = WagnerFischer(word_a, word_b).cost
19 | if cost_a_b <= threshold:
20 | distance_l.append((cost_a_b, word_b))
21 | distance_l.sort()
22 | return distance_l
23 |
24 |
25 | def gen_brief_exp(word):
26 | brief_exp_l = []
27 | for usage_d in new3000_base_d[word]['usages']:
28 | brief_exp_l.append(usage_d['exp_d']['cn'])
29 | return word + ': ' + u';'.join(brief_exp_l)
30 |
31 |
32 | def add_similar_word_single_word(word):
33 | similar_word_l = get_similar_word(word)
34 | exp_l = []
35 | for cost, similar_word in similar_word_l:
36 | exp_l.append(gen_brief_exp(similar_word))
37 | #new3000_base_d[word]['similar_word'] = ' | '.join(exp_l)
38 | print '+',
39 | return word, ' | '.join(exp_l)
40 |
41 |
42 | def add_similar_word_multiprocessing():
43 | pool = Pool(4)
44 | result = pool.map(add_similar_word_single_word, new3000_base_d.iterkeys())
45 | pool.close()
46 | with codecs.open('similar_word.txt', 'w', encoding='utf-8') as f:
47 | json.dump(result, f)
48 |
49 | if __name__ == '__main__':
50 | add_similar_word_multiprocessing()
--------------------------------------------------------------------------------
/anki_import.md:
--------------------------------------------------------------------------------
1 |
2 | # Overview
3 |
4 | This notebook shows how to turn a JSON object into files that Anki can import. The focus is on the design of the Anki note types. It carries on from explore_all_in_one.ipynb.
5 |
6 | 《GRE核心词汇考法精析》 (zaiyaoniming3000), 《GRE核心词汇助记与精练》 (zhuji3000) and 《不择手段背单词》 (bzsdbdc, found online) are mapped to the note type GreWord.
7 | 《GRE高分必备短语搭配》 is mapped to the note type GrePhrase.
8 |
9 | After this notebook has been run, two scripts are generated automatically; their names are given by the variables file_name_greword and file_name_grephrase. Running those two scripts on their own also performs the conversion, and only a plain Python installation is required.
10 |
11 | The subsections on handling the pronunciation files and on adding notes depend on a number of custom files and are therefore not exported into the conversion scripts. Without those files, running this notebook directly will raise errors, so if you only need an import file without audio and without notes, run the two conversion scripts instead.
12 |
13 | The names of the generated Anki import files are given by the variables output_file_GreWord and output_file_GrePhrase.
14 |
15 |
16 | ```python
17 | %run sync_to_file_magic_command.py
18 | ```
19 |
20 |
21 | ```python
22 | file_name_greword = 'CreateAnkiImport_GreWord.py'
23 | file_name_grephrase = 'CreateAnkiImport_GrePhrase.py'
24 | configCreAnkiImpGreWord = file_name_greword
25 | configCreAnkiImpGrePhrase = file_name_grephrase
26 | configMyHelpers = 'my_helpers.py'
27 | ```
28 |
29 | # Two additional helper functions
30 |
31 |
32 | ```python
33 | %%sync_to_file $configMyHelpers
34 | def custom_html_element(_str):
35 | """
36 | convert the markdown notations in a string to html tags
37 | currently, only two kinds of markdown notation exist in all the strings
38 | ** and *
39 | """
40 | formatted_str = _str
41 | # format double asterisk
42 | match_double_asterisk_re = re.compile(u'\*\*(.*?)\*\*')
43 |     # replace **...** with <b>...</b>
44 |     #formatted_str = match_double_asterisk_re.sub(r'<b>\1</b>', formatted_str)
45 |     # replace **...** with <b>...</b>
46 |     formatted_str = match_double_asterisk_re.sub(r'<b>\1</b>', formatted_str)
47 |     # format single asterisk
48 |     # replace *...* with <i>...</i>
49 |     match_single_asterisk_re = re.compile(u'\*(.*?)\*')
50 |     formatted_str = match_single_asterisk_re.sub(r'<i>\1</i>', formatted_str)
51 | return formatted_str
52 | ```
53 |
54 |
55 | ```python
56 | %%sync_to_file $configMyHelpers
57 | def is_file_and_json_load(file_name_str):
58 | if os.path.isfile(file_name_str):
59 | with codecs.open(file_name_str, 'r', encoding='utf-8') as f:
60 | json_d = json.load(f)
61 | return json_d
62 | ```
63 |
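If a file is missing, the helper simply falls through and returns `None`; the scripts generated below rely on that through the `no_data_*` flags. A quick check (the file name here is hypothetical):

```python
print is_file_and_json_load('no_such_file.txt')  # prints None when the file does not exist
```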
64 |
65 | ```python
66 | %%sync_to_file $configCreAnkiImpGreWord $configCreAnkiImpGrePhrase -m o
67 |
68 | # coding:utf-8
69 | import json
70 | import codecs
71 | import os
72 | from my_helpers import *
73 | ```
74 |
75 | # GreWord
76 |
77 |
78 | ```python
79 | # example
80 | test_str = 'to **put an end to**(something planned or previously agreed to)'
81 | print custom_html_element(test_str)
82 | del test_str
83 | ```
84 |
85 |     to <b>put an end to</b>(something planned or previously agreed to)
86 |
87 |
88 |
89 | ```python
90 | %%sync_to_file $configCreAnkiImpGreWord
91 | file_name_new3000 = 'new3000_base_d.txt'
92 | file_name_zhuji = 'zhuji_base_d.txt'
93 | file_name_bzsdbdc = 'base_data\\bzsdbdc_dic.txt'
94 | output_file_GreWord = 'AnkiImportData_GreWord.txt'
95 | new3000_base_d = None
96 | zhuji3000_base_d = None
97 | bzsdbdc_data = None
98 | new3000_base_d = is_file_and_json_load(file_name_new3000)
99 | zhuji3000_base_d = is_file_and_json_load(file_name_zhuji)
100 | bzsdbdc_data = is_file_and_json_load(file_name_bzsdbdc)
101 | ```
102 |
103 |
104 | ```python
105 | %%sync_to_file $configCreAnkiImpGreWord
106 | no_data_new3000 = new3000_base_d is None
107 | no_data_zhuji = zhuji3000_base_d is None
108 | no_data_bzsdbdc = bzsdbdc_data is None
109 | ```
110 |
111 | ## Core conversion function
112 |
113 |
114 | ```python
115 | %%sync_to_file $configCreAnkiImpGreWord
116 | def add_extra_fields():
117 | if no_data_new3000:
118 |         print 'New3000 data file does not exist! Nothing can be done...'
119 | return
120 | iter_path = [('all','',True), ('key','usages',False),('all','',False)]
121 | for word, usage_d in iter_through_general(new3000_base_d, iter_path):
122 | usage_d['audio'] = ''
123 | usage_d['mynotes'] = ''
124 | for word_d in new3000_base_d.itervalues():
125 | word_d['similar_form'] = ''
126 | ```
127 |
128 |
129 | ```python
130 | add_extra_fields()
131 | ```
132 |
133 |
134 | ```python
135 | # test
136 | #pprint(new3000_base_d['abandon'])
137 | ```
138 |
139 |
140 | ```python
141 | %%sync_to_file $configCreAnkiImpGreWord
142 | def convert_to_GreWord():
143 | if no_data_new3000:
144 |         print 'New3000 data file does not exist! Nothing can be done...'
145 | return
146 | if no_data_zhuji:
147 | print 'No data of zhuji!'
148 | if no_data_bzsdbdc:
149 | print 'No data of bzsdbdc!'
150 | output_list = []
151 | None_repr = u''
152 |     join_by_line_break = u'<br>'.join
153 |     replace_with_br = lambda _str: _str.replace('\n', '<br>')
154 | tag_pos_prefix = ' in_'
155 | for word in new3000_base_d:
156 | # new 3000 part
157 | """
158 | the structure of a word of new3000_base_d.txt
159 |
160 | {'phon': u"[\u02cc\xe6d'l\u026ab]",
161 | 'pos': (1, 6),
162 | 'usages': [{'ants': u'\u53cd\u3000considered, planned, premeditated, rehearsed \u9884\u5148\u8ba1\u5212\u7684',
163 | 'ants_d': {'cn': u'\u9884\u5148\u8ba1\u5212\u7684',
164 | 'en': u'considered, planned, premeditated, rehearsed ',
165 | 'en_cn': u'considered, planned, premeditated, rehearsed \u9884\u5148\u8ba1\u5212\u7684'},
166 | 'der': '',
167 | 'examples': u'content...',
168 | 'en': u'not bad for an ad-lib comedy routine',
169 | 'en_cn': u'content...'},
170 | 'exp': u'*adj.* \u5373\u5174\u7684\uff1amade or done **without previous thought or preparation**',
171 | 'exp_d': {'cn': u'\u5373\u5174\u7684',
172 | 'en': u'made or done **without previous thought or preparation**',
173 | 'en_cn': u'\u5373\u5174\u7684\uff1amade or done **without previous thought or preparation**'},
174 | 'ph_symbl': u"[\u02cc\xe6d'l\u026ab]",
175 | 'pspeech': u'adj.',
176 | 'syns': u'content...'}
177 | """
178 | one_new3000_word_d = new3000_base_d[word]
179 | word_pos_L, word_pos_U = one_new3000_word_d['pos']
180 | word_pos = u'L' + unicode(word_pos_L) + u' U' + unicode(word_pos_U)
181 | num_usages = len(one_new3000_word_d['usages'])
182 | usages_tag = unicode(num_usages) + u'_usage'
183 |
184 | for usage_index, usage in enumerate(one_new3000_word_d['usages']):
185 | word_phs = usage['ph_symbl']
186 | word_tags = usages_tag + tag_pos_prefix + 'zaiyaoniming3000'
187 | if not no_data_zhuji:
188 | if word in zhuji3000_base_d:
189 | word_tags += tag_pos_prefix + 'zhuji3000'
190 | if not no_data_bzsdbdc:
191 | if word in bzsdbdc_data:
192 | word_tags += tag_pos_prefix + 'bzsdbdc'
193 | usage_index = unicode(usage_index+1)
194 | word_uid = unicode(word) + usage_index
195 | ph_symbl = usage['ph_symbl']
196 | word_Audio = usage['audio']
197 | pspeech = usage['pspeech']
198 | exp_en = usage['exp_d']['en']
199 | exp_cn = usage['exp_d']['cn']
200 | exp_en_cn = usage['exp_d']['en_cn']
201 | # combine other explanation
202 | #usage_index_l = range(num_usages)
203 | #usage_index_l.remove(usage_index)
204 | #exp_other = ['**考法%d**:'%(i+1) + one_new3000_word_d['usages'][i]['exp_d']['en_cn'] +'\n' for i in usage_index_l]
205 | # use word_block_str as all explanation
206 | exp_all = one_new3000_word_d['word_block_str']
207 | examples_en = usage['examples_d']['en']
208 | examples_cn = usage['examples_d']['cn']
209 | examples_en_cn = usage['examples_d']['en_cn']
210 | examples_others = ''
211 | ants_en = usage['ants_d']['en']
212 | ants_cn = usage['ants_d']['cn']
213 | ants_en_cn = usage['ants_d']['en_cn']
214 | syns = usage['syns']
215 | # der from the book zaiyaoniming3000
216 | der_new3000 = usage['der']
217 |
218 | # bzsdbdc part
219 | how_to_mem_bzsdbdc = None_repr
220 | if not no_data_bzsdbdc:
221 | if word in bzsdbdc_data:
222 | how_to_mem_bzsdbdc = bzsdbdc_data[word]['combined']
223 |
224 | # zhuji3000 part
225 | how_to_mem_zhuji3000, eytma_gr, eytma_gr_exp, eytma_cognates = None_repr, None_repr, None_repr, None_repr
226 | '''
227 | the structure of a word of zhuji3000_base_d
228 | {'content': u'[\u6839] per- [through] + vad [go] + -e [v.], go through, \u904d\u5e03 \u2192 vt. \u5f25\u6f2b\uff0c\u5145\u6ee1\n',
229 | 'ety': 'vad, vag, ced',
230 | 'etyma_cognates_l': u'pervade, evasive, extravagant, vague, cessation, incessant',
231 | 'etyma_group_explanation': u'group explanation content',
232 | 'phon': u"[p\u0259r've\u026ad]",
233 | 'pos': u'6, 7',
234 | 'summary': u'summary content',
235 | 'word': u'pervade'}
236 | '''
237 | if not no_data_zhuji:
238 | if word in zhuji3000_base_d:
239 | how_to_mem_zhuji3000 = zhuji3000_base_d[word]['content']
240 | eytma_gr = zhuji3000_base_d[word]['ety']
241 | eytma_gr_exp = zhuji3000_base_d[word]['etyma_group_explanation']
242 | eytma_cognates = zhuji3000_base_d[word]['etyma_cognates_l']
243 | # extra fields
244 | mynotes = usage['mynotes']
245 | similar_form = one_new3000_word_d['similar_form']
246 | """
247 | Anki GreWord Structure
248 | word_uid word usage_index ph_symbl word_audio pspeech mynotes
249 | exp_en exp_cn exp_en_cn exp_all
250 | examples_en examples_cn examples_encn examples_others
251 | ants_en ants_cn ants_encn
252 | syns der_new3000
253 | how_to_mem_bzsdbdc how_to_mem_zhuji3000
254 | etyma_group etyma_group_exp etyma_cognates
255 | position similar_form tags
256 | """
257 | one_line = [word_uid, word, usage_index, ph_symbl, word_Audio, pspeech, mynotes,
258 | exp_en, exp_cn, exp_en_cn, exp_all,
259 | examples_en, examples_cn, examples_en_cn, examples_others,
260 | ants_en, ants_cn, ants_en_cn] +\
261 | [syns, der_new3000, how_to_mem_bzsdbdc, how_to_mem_zhuji3000,
262 | eytma_gr, eytma_gr_exp, eytma_cognates, word_pos, similar_form, word_tags]
263 | for index, _str in enumerate(one_line):
264 | _str = replace_with_br(collapse_blank_line(_str).strip(' \n'))
265 | one_line[index] = custom_html_element(_str)
266 | output_list.append(one_line)
267 | output_list.sort(key=lambda x: x[0])
268 | return output_list
269 | ```
270 |
271 | The function above builds the basic Anki import data. The pointers to the pronunciation files still have to be added.
272 | When updating existing notes, the mynotes field of the old notes also has to be extracted and put back into the corresponding position of output_list.
273 | So do not run the function below yet; run it once all of that data has been filled in.
274 |
275 |
276 | ```python
277 | %%sync_to_file $configCreAnkiImpGreWord
278 | def main():
279 |     add_extra_fields()
280 | output_list = convert_to_GreWord()
281 | if output_list is None:
282 | return
283 | with codecs.open(output_file_GreWord, 'w', encoding='utf-8') as f:
284 | for one_line in output_list:
285 | one_string = u'\t'.join(one_line) + '\n'
286 | f.write(one_string)
287 | del output_list
288 | ```
289 |
290 |
291 | ```python
292 | %%sync_to_file $configCreAnkiImpGreWord -p
293 | if __name__ == '__main__':
294 | main()
295 | ```
296 |
297 | ## How the pronunciation files are handled
298 |
299 | In Anki, the syntax for attaching an audio file is `[sound:pointer]`; the pointer is simply the audio file's name. All referenced files must live in Anki's own `collection.media` folder, so the pointer should be a relative reference (just the file name).
300 |
301 | The next steps extract the file pointers from the various pronunciation libraries, copy the corresponding files into Anki's `collection.media` folder, and add the pointers to new3000_base_d.
302 |
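As a minimal sketch of that idea (the names here are illustrative; the full logic, including the part-of-speech handling, is in `add_audio_pointer` below):

```python
import os
import shutil

def attach_audio(usage_d, source_audio_file, media_dir):
    # copy the source audio into Anki's collection.media and store a [sound:...] pointer
    file_name = os.path.basename(source_audio_file)
    shutil.copy(source_audio_file, os.path.join(media_dir, file_name))
    usage_d['audio'] = '[sound:%s]' % file_name
```
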
303 | ## Words with multiple pronunciations in 再要你命3000
304 |
305 |
306 | ```python
307 | print new3000_base_d['addict']['usages'][0].keys()
308 | ```
309 |
310 | [u'exp_d', u'pspeech', u'ph_symbl', u'ants_d', u'der', u'ants', 'mynotes', u'examples', u'examples_d', u'exp', 'audio', u'syns']
311 |
312 |
313 |
314 | ```python
315 | path_to_pron = [('all','',True), ('key','usages',False), ('all','',True),('key','ph_symbl',False)]
316 | pre_word_pron = None
317 | multi_pron_word_set = set()
318 | for word, usage_index, word_pron in iter_through_general(new3000_base_d, deepcopy(path_to_pron)):
319 | if usage_index > 0:
320 | if word_pron != pre_word_pron:
321 | multi_pron_word_set.add(word)
322 | else:
323 | pre_word_pron = word_pron
324 | ```
325 |
326 |
327 | ```python
328 | print multi_pron_word_set
329 | ```
330 |
331 | set([u'incarnate', u'articulate', u'appropriate', u'incense', u'subordinate', u'animate', u'surmise', u'content', u'duplicate', u'escort', u'moderate', u'compliment', u'entrance', u'intimate', u'addict', u'compound', u'aggregate', u'discharge', u'diffuse', u'convert', u'elaborate', u'exploit', u'contract', u'project', u'initiate', u'ally', u'alloy', u'intrigue'])
332 |
333 |
334 | ## Source: a DSL-format dictionary
335 |
336 | The Longman Pronunciation Dictionary 3rd Ed. in DSL format.
337 |
338 | For the basics of handling dsl files, see [Full Text Search in GoldenDict](https://lisok3ajr.wordpress.com/2012/09/18/full-text-search-in-goldendict/)
339 |
340 | ### Reading the data
341 |
342 |
343 | ```python
344 | import gzip
345 | ```
346 |
347 |
348 | ```python
349 | file_pronunciation = 'D:\Eudict\dsl\En-En_Longman_Pronunciation3\En-En-Longman_Pronunciation.dsl.dz'
350 | dsl_str = gzip.open(file_pronunciation, mode='r').read().decode('utf-16')
351 | print dsl_str[100:400]
352 | ```
353 |
354 | sh"
355 |
356 | A
357 | [m1][b]A, a[/b] [i] name of letter[/i] [p]BrE[/p] [s]uk_ld44a.wav[/s] [p]AmE[/p] [s]us_l3a-2.wav[/s] [c mediumblue]eɪ[/c][/m]
358 | [m1]▷ [b]A's, As, a's[/b] [c mediumblue]eɪz[/c][i] —Communications code name:[/i][c darkmagenta] Alfa[/c][/m]
359 | [m1]▶[b][c blue]ˌ[/c]A[c blue]ˈ[/c]1[c blue]◂[/c], [c
360 |
361 |
362 |
363 | ```python
364 | match_word_fun = lambda word: re.search('^(%s)[ \t]*$(.*?)(?=^[^ \t])'%word, dsl_str, re.M|re.S)
365 | findall_word_fun = lambda word: re.findall('^(%s)[ \t]*$(.*?)(?=^[^ \t])'%word, dsl_str, re.M|re.S)
366 | match_us_pron_re = re.compile('\[s\](us.*?)\[/s\]')
367 | ```
368 |
369 | Some headwords have derived forms, marked with ▷, that carry their own transcriptions. Italic text, marked with [i]..[/i], can appear in those parts as well; only the italics immediately after the main headword are the ones we want to match (they hold the part-of-speech labels used below).
370 |
371 |
372 | ```python
373 | match_pspeech_re = re.compile('^[ \t]*?\[m1\]\[b\].*?\[/b\] \[i\] ([a-z, ]+).*?\[/i\]', re.M)
374 | ```
375 |
376 |
377 | ```python
378 | # test
379 | def unit_test():
380 | result = match_word_fun('content')
381 | result_str = result.group()
382 | print result_str
383 | print 'All pronunciation files: ', match_us_pron_re.findall(result_str)
384 | print 'All part of speech: ', match_pspeech_re.findall(result_str)
385 | #unit_test()
386 | del unit_test
387 | ```
388 |
389 | ### Converting dsl_str into a dict
390 |
391 |
392 | ```python
393 | extract_word_block_re = re.compile(ur'^([a-z-]+)[ \t]*$(.*?)(?=^[^ \t])', re.M|re.S|re.I)
394 | ```
395 |
396 |
397 | ```python
398 | # test
399 | #extract_word_block_re.findall(dsl_str[0:5000])
400 | ```
401 |
402 |
403 | ```python
404 | dsl_pron_d = {}
405 | for one_match_obj in extract_word_block_re.finditer(dsl_str):
406 | word = one_match_obj.group(1)
407 | if word in dsl_pron_d:
408 | print '%s already exists!'%word
409 | one_word_d = {}
410 | word_block = one_match_obj.group(2)
411 | one_word_d['word_block'] = word_block
412 | one_word_d['pspeech_l'] = match_pspeech_re.findall(word_block)
413 | one_word_d['ph_symbol_l'] = match_us_pron_re.findall(word_block)
414 | if word in multi_pron_word_set:
415 | #print 'check pspeech'
416 | #print word, one_word_d['pspeech_l']
417 | pass
418 | dsl_pron_d[word] = one_word_d
419 | ```
420 |
421 |
422 | ```python
423 | # example
424 | iter_print(dsl_pron_d['content'])
425 | ```
426 |
427 | word_block
428 | {{Roman}}I{{/Roman}}
429 | [m1][b]con|tent[/b] [i] adjective, verb, noun 'contentment'[/i] [p]BrE[/p] [s]uk_ld44content.wav[/s] [p]AmE[/p] [s]us_l3content2.wav[/s] [c mediumblue]kən |ˈtent[/c] [p]§[/p]\ [sub]([/sub]ˌ[sub])[/sub]kɒn-[/m]
430 | [m1]▷ [b]con|tented[/b] [c mediumblue]ˈtent ɪd[/c] -əd [p]AmE[/p]\ [c mediumblue]ˈten[i]t̬[/i] əd[/c][/m]
431 | [m1]▷ [b]con|tenting[/b] [c mediumblue]ˈtent ɪŋ[/c] [p]AmE[/p]\ [c mediumblue]ˈten[i]t̬[/i] ɪŋ[/c][/m]
432 | [m1]▷ [b]con|tents[/b] [c mediumblue]ˈten[i]t[/i]s[/c][/m]
433 | {{Roman}}II{{/Roman}}
434 | [m1][b]content[/b] [i] noun 'matter contained'[/i] [p]BrE[/p] [s]uk_content2.wav[/s] [p]AmE[/p] [s]us_l3content.wav[/s] [c mediumblue]ˈkɒn tent[/c] [p]AmE[/p]\ [c mediumblue]ˈkɑːn-[/c][/m]
435 | [m1]▷ [b]content|s[/b] [c mediumblue]s[/c][/m]
436 | ph_symbol_l
437 | 0
438 | us_l3content2.wav
439 | 1
440 | us_l3content.wav
441 | pspeech_l
442 | 0
443 | adjective, verb, noun
444 | 1
445 | noun
446 |
447 |
448 | ### Checking how the parts of speech correspond
449 |
450 |
451 | ```python
452 | def summary_pspeech():
453 | #dsl
454 | dsl_pspeech_set = set()
455 | for word, word_d in dsl_pron_d.iteritems():
456 | dsl_pspeech_l = word_d['pspeech_l']
457 | for pspeech in dsl_pspeech_l:
458 | dsl_pspeech_set.add(pspeech)
459 | # new3000
460 | new3000_pspeech_set = set()
461 | path_to_pspeech = path_to_pron = [('all','',True), ('key','usages',False), ('all','',False),('key','pspeech',False)]
462 | for word, pspeech in iter_through_general(new3000_base_d, path_to_pspeech):
463 | for sub_pspeech in pspeech.split('/'):
464 | new3000_pspeech_set.add(sub_pspeech)
465 | stripped_pspeech = pspeech.strip('.')
466 | if word in dsl_pron_d:
467 | for dsl_pspeech in dsl_pron_d[word]['pspeech_l']:
468 | if dsl_pspeech.startswith(stripped_pspeech):
469 | break
470 | else:
471 | if len(dsl_pron_d[word]['ph_symbol_l']) > 1:
472 | #print 'pspeech of %s in new3000 not match with dsl'%word
473 | # a lot!
474 | pass
475 | print dsl_pspeech_set
476 | print new3000_pspeech_set
477 | ```
478 |
479 |
480 | ```python
481 | # summary_pspeech()
482 | ```
483 |
484 | The meaningful part-of-speech labels in dsl_pron_d are plain words such as adjective, verb, pronoun, preposition, adverb.
485 | So it is enough to check whether a part of speech in dsl_pron_d starts with the abbreviated one from new3000_base_d.
486 |
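In code the check is just a prefix comparison against each comma-separated dsl label (a small sketch of the rule used later in `add_audio_pointer`):

```python
def pspeech_matches(new3000_pspeech, dsl_pspeech):
    # 'v.' from new3000 matches 'adjective, verb, noun' from dsl because
    # one of the comma-separated dsl labels starts with 'v'
    prefix = new3000_pspeech.strip('.')
    return any(label.strip().startswith(prefix) for label in dsl_pspeech.split(','))

print pspeech_matches(u'adj.', u'adjective, verb, noun')   # True
print pspeech_matches(u'v.', u'adjective, verb, noun')     # True
print pspeech_matches(u'prep.', u'adjective, verb, noun')  # False
```
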
487 | ## Comparing 再要你命3000 with the dsl dictionary
488 |
489 | Western European characters are converted to plain characters first, i.e. é and ï become e and i.
490 |
491 |
492 | ```python
493 | def check_pron_in_new3000_and_dsl(word, print_only_bad_result = True):
494 | word_converted = word.replace(u'é', 'e').replace(u'ï', 'i').split('/')[0]
495 | return_message_l = []
496 | not_found = False
497 | if not (word_converted in dsl_pron_d):
498 | return_message_l.append('**%s** not found in dsl'%word)
499 | not_found = True
500 | else:
501 | pron_in_dsl_l = dsl_pron_d[word_converted]['ph_symbol_l']
502 | pspeech_in_dsl_l = dsl_pron_d[word_converted]['pspeech_l']
503 | pron_new3000_l = []
504 | pspeech_new3000_l = []
505 | for usage_d in new3000_base_d[word]['usages']:
506 | pron_new3000_l.append(usage_d['ph_symbl'])
507 | pspeech_new3000_l.append(usage_d['pspeech'])
508 | diff_pron_new3000_set = set(pron_new3000_l)
509 | if len(pron_in_dsl_l) < len(diff_pron_new3000_set):
510 | message = '**%s** in dsl has less pron'%word
511 | message += '\n' + str(len(pron_in_dsl_l)) + ', ' + str(len(diff_pron_new3000_set))
512 | message += '\n' + ','.join(pron_in_dsl_l)
513 | message += '\n' + ','.join(pron_new3000_l)
514 | return_message_l.append(message)
515 | else:
516 | if not print_only_bad_result:
517 | return_message_l.append('**%s** in dsl has enough pron'%word)
518 | return '\n'.join(return_message_l), not_found
519 | ```
520 |
521 |
522 | ```python
523 | result_l = []
524 | not_found_word_l = []
525 | for word in new3000_base_d.iterkeys():
526 | message_str, not_found = check_pron_in_new3000_and_dsl(word)
527 | if message_str != '':
528 | result_l.append(message_str)
529 | if not_found:
530 | not_found_word_l.append(word)
531 | if word in multi_pron_word_set:
532 | print 'Warning! **%s** in multi_pron_word_set'%word
533 | ```
534 |
535 |
536 | ```python
537 | with codecs.open('temp_check_pron_log.txt', 'w', encoding='utf-8') as f:
538 | json.dump(result_l, f, indent=5)
539 | json.dump(not_found_word_l, f, indent=2)
540 | ```
541 |
542 |
543 | ```python
544 | print '%d words not found'%len(not_found_word_l)
545 | ```
546 |
547 | 153 words not found
548 |
549 |
550 | Although 153 words are still not found, note that all of the words with multiple pronunciations are covered by the dsl dictionary.
551 |
552 | ## Supplementing with a Webster pronunciation library
553 |
554 | A Webster pronunciation library found online: http://blog.emagic.org.cn/content/i1931.html
555 |
556 | ed2k link:
557 |
558 | ed2k://|file|%E9%9F%A6%E6%B0%8F%E5%B8%B8%E7%94%A8%E5%8D%95%E8%AF%8D%E8%AF%AD%E9%9F%B3%E5%BA%93.rar|315458082|88b70fe90a6658cec689352f66a7af6c|h=4rblspftuskt5gfvmpbnfkdvhi2ey3fn|/
559 |
560 |
561 | ```python
562 | path_of_media_source = 'D:\\mvoice\\'
563 | word_list_file = 'word_list.txt'
564 | ```
565 |
566 |
567 | ```python
568 | media_path_dict = {}
569 | match_word = r'([a-z1-9 ~]+)\.mp3'
570 | match_word_re = re.compile(match_word, re.I|re.M)
571 | with codecs.open(path_of_media_source + word_list_file, encoding='utf-8') as f:
572 | for line in f:
573 | result = match_word_re.search(line)
574 | if not (result is None):
575 | media_path_dict[result.group(1)] = line.strip()
576 | else:
577 | #print line
578 | pass
579 | ```
580 |
581 |
582 | ```python
583 | print media_path_dict['habit']
584 | ```
585 |
586 | D:\mvoice\h\habit.mp3
587 |
588 |
589 |
590 | ```python
591 | count = 0
592 | still_not_found_word_l = []
593 | for word in not_found_word_l:
594 | word_converted = word.replace(u'é', 'e').replace(u'ï', 'i').split('/')[0]
595 | if word_converted in media_path_dict:
596 | count += 1
597 | #print 'found', word
598 | else:
599 | still_not_found_word_l.append(word)
600 | print 'found %d of %d'%(count, len(not_found_word_l))
601 | ```
602 |
603 | found 57 of 153
604 |
605 |
606 | ## Supplementing with mdict
607 |
608 | This uses the mdx and mdd files of the Longman Dictionary of Contemporary English, 5th edition.
609 |
610 | It relies on the tool at https://bitbucket.org/xwang/mdict-analysis
611 |
612 |
613 |
614 | ```python
615 | from readmdict import MDX, MDD
616 | from bs4 import BeautifulSoup
617 | file_to_longman_mdx = "D:\Eudict\Frequent\Longman Dictionary of Contemporary English.mdx"
618 | mdx = MDX(file_to_longman_mdx)
619 | longman_mdx_iter = mdx.items()
620 | longman_in_new3000_d = {}
621 | for word, word_block in longman_mdx_iter:
622 | if word in new3000_base_d:
623 | longman_in_new3000_d[word] = word_block
624 | print 'In longman, found %d words of new3000 (%d in total)'%(len(longman_in_new3000_d), len(new3000_base_d))
625 | ```
626 |
627 | In longman, found 2954 words of new3000 (3145 in total)
628 |
629 |
630 | Extract the audio file references
631 |
632 |
633 | ```python
634 | # this is the pattern we gonna use
635 | soup = BeautifulSoup(longman_in_new3000_d['abandon'],"lxml")
636 | print soup.find_all(href=re.compile('sound.*?US'))[0]['href'][8:]
637 | ```
638 |
639 | US_abandon1.spx
640 |
641 |
642 |
643 | ```python
644 | count = 0
645 | still_still_not_found_word_l = []
646 | longman_found_word_d = {}
647 | for word in still_not_found_word_l:
648 | founded = False
649 | word_converted = word.replace(u'é', 'e').replace(u'ï', 'i').split('/')[0]
650 | if word_converted in longman_in_new3000_d:
651 | soup = BeautifulSoup(longman_in_new3000_d[word_converted],"lxml")
652 | find_result = soup.find_all(href=re.compile('sound.*?US'))
653 | if len(find_result) != 0:
654 | count += 1
655 | #print word
656 | founded = True
657 | longman_found_word_d[word] = find_result[0]['href'][8:]
658 | if not founded:
659 | still_still_not_found_word_l.append(word)
660 | print 'found %d of %d'%(count, len(still_not_found_word_l))
661 | ```
662 |
663 | found 52 of 96
664 |
665 |
666 |
667 | ```python
668 | # example
669 | longman_found_word_d['ingratiating']
670 | ```
671 |
672 |
673 |
674 |
675 | 'US_ingratiating.spx'
676 |
677 |
678 |
679 |
680 | ```python
681 | # unzip the mdd mdx file.
682 | # Warning! This takes a lot of time. I have already unpacked it, so the next line is commented out
683 | #! python readmdict.py -x "D:\Eudict\Frequent\Longman Dictionary of Contemporary English.mdx"
684 | ```
685 |
686 | ## Adding the audio pointers
687 |
688 |
689 | ```python
690 | import shutil
691 | ```
692 |
693 |
694 | ```python
695 | anki_media_collection = os.path.expanduser('~\\Documents\\Anki\\xiaohang\\collection.media')
696 | dsl_source_media_path = 'D:\Eudict\dsl\En-En_Longman_Pronunciation3\En-En-Longman_Pronunciation.dsl.dz.files'
697 | longman_source_media_path = 'D:\Eudict\Frequent\data'
698 | ```
699 |
700 |
701 | ```python
702 | def add_audio_pointer(word):
703 | word_converted = word.replace(u'é', 'e').replace(u'ï', 'i').split('/')[0]
704 | word_d = new3000_base_d[word]
705 | for usage_d in word_d['usages']:
706 | usage_d['audio'] = ''
707 | source_audio_file_name = None
708 | first_pspeech_match_obj = re.search('^([a-z]+)\.', usage_d['pspeech'])
709 | if first_pspeech_match_obj is None:
710 | print '%s has no pspeech'%word
711 | new3000_pspeech = ''
712 | else:
713 | new3000_pspeech = first_pspeech_match_obj.group(1)
714 | if new3000_pspeech in ['vt', 'vi']:
715 | new3000_pspeech = 'v'
716 | new_audio_pointer_without_ext = word_converted + '_' + new3000_pspeech
717 | new_audio_file_name_without_ext = anki_media_collection + '\\' + new_audio_pointer_without_ext
718 | new_audio_pointer_without_ext = '[sound:' + new_audio_pointer_without_ext
719 | existed = False
720 | for file_ext in ['.wav', '.mp3', '.spx']:
721 | if os.path.isfile(new_audio_file_name_without_ext + file_ext):
722 | # print 'existed!'
723 | existed = True
724 | usage_d['audio'] = new_audio_pointer_without_ext + file_ext + ']'
725 | break
726 | if existed:
727 | continue
728 | if word_converted in dsl_pron_d:
729 | dsl_word_d = dsl_pron_d[word_converted]
730 | if word in multi_pron_word_set:
731 | # check pspeech
732 | for index, dsl_pspeech in enumerate(dsl_word_d['pspeech_l']):
733 | for dsl_sub_pspeech in dsl_pspeech.split(','):
734 | if dsl_sub_pspeech.strip().startswith(new3000_pspeech):
735 | source_audio_file_name = dsl_source_media_path + '\\' + dsl_word_d['ph_symbol_l'][index]
736 | break
737 | else:
738 | print 'no match of pspeech, word %s'%word
739 | print dsl_word_d['pspeech_l'], new3000_pspeech
740 | pass
741 | else:
742 | # use the first audio pointer
743 | source_audio_file_name = dsl_source_media_path + '\\' + dsl_word_d['ph_symbol_l'][0]
744 | if not (source_audio_file_name is None):
745 | new_audio_pointer = new_audio_pointer_without_ext + '.wav]'
746 | new_audio_file_name = new_audio_file_name_without_ext + '.wav'
747 | else:
748 | # the not found word
749 | if word_converted in media_path_dict:
750 | # try webster
751 | source_audio_file_name = media_path_dict[word_converted]
752 | new_audio_pointer = new_audio_pointer_without_ext + '.mp3]'
753 | new_audio_file_name = new_audio_file_name_without_ext + '.mp3'
754 | elif word in longman_found_word_d:
755 | # try longman
756 | source_audio_file_name = longman_source_media_path + '\\' + longman_found_word_d[word]
757 | new_audio_pointer = new_audio_pointer_without_ext + '.spx]'
758 | new_audio_file_name = new_audio_file_name_without_ext + '.spx'
759 | if not (source_audio_file_name is None):
760 | usage_d['audio'] = new_audio_pointer
761 | shutil.copy(source_audio_file_name, new_audio_file_name)
762 | ```
763 |
764 |
765 | ```python
766 | for word in new3000_base_d:
767 | add_audio_pointer(word)
768 | ```
769 |
770 |
771 | ```python
772 | # example
773 | word = 'compendium'
774 | for index, usage_d in enumerate(new3000_base_d[word]['usages']):
775 | print usage_d['audio']
776 | ```
777 |
778 | [sound:compendium_n.mp3]
779 | [sound:compendium_n.mp3]
780 |
781 |
782 | ## Converting to mp3
783 |
784 | At this point playback already works on the desktop, but the phone only supports mp3, so the wav and spx files in collection.media have to be converted to mp3.
785 |
786 | This is done with pydub plus ffmpeg.
787 |
788 | See [Pydub](https://github.com/jiaaro/pydub/)
789 |
790 |
791 | ```python
792 | from pydub import AudioSegment
793 | import glob
794 | ```
795 |
796 |
797 | ```python
798 | def convert_to_mp3():
799 | owd = os.getcwd()
800 | os.chdir(anki_media_collection)
801 | extension_list = ('*.wav', '*.spx')
802 | for extension in extension_list:
803 | for audio in glob.glob(extension):
804 | mp3_filename = os.path.splitext(os.path.basename(audio))[0] + '.mp3'
805 | if not os.path.isfile(mp3_filename):
806 | AudioSegment.from_file(audio).export(mp3_filename, format='mp3')
807 | os.chdir(owd)
808 | ```
809 |
810 |
811 | ```python
812 | convert_to_mp3()
813 | ```
814 |
815 |
816 | ```python
817 | def modify_audio_pointer():
818 | path_to_usage_d = path_to_pron = [('all','',False), ('key','usages',False), ('all','',False)]
819 | for usage_d, in iter_through_general(new3000_base_d, path_to_usage_d):
820 | old_audio_name = usage_d['audio']
821 | if old_audio_name != '':
822 | new_audio_name = os.path.splitext(os.path.basename(old_audio_name))[0] + '.mp3]'
823 | usage_d['audio'] = new_audio_name
824 | ```
825 |
826 |
827 | ```python
828 | modify_audio_pointer()
829 | ```
830 |
831 |
832 | ```python
833 | # test
834 | #iter_print(new3000_base_d['chaperone'])
835 | ```
836 |
837 | ## Adding notes (mynotes)
838 |
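The old file is assumed to be a tab-separated export of the GreWord notes, so the fields sit at fixed positions in the same order that convert_to_GreWord writes them: index 1 is the word, index 2 the usage index and index 6 mynotes. A tiny sketch with a made-up line:

```python
# hypothetical exported line, fields in the GreWord order
# (word_uid, word, usage_index, ph_symbl, audio, pspeech, mynotes, ...)
line = u"abandon1\tabandon\t1\t[phon]\t[sound:abandon_v.mp3]\tvt.\tmy note"
field_l = line.split('\t')
print field_l[1], field_l[2], field_l[6]  # -> abandon 1 my note
```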
839 |
840 | ```python
841 | old_anki_GreWord_file_name = 'old_anki_greword.txt'
842 | ```
843 |
844 |
845 | ```python
846 | def add_mynotes():
847 | if not os.path.isfile(old_anki_GreWord_file_name):
848 | return
849 | old_data_line_l = codecs_open_r_utf8(old_anki_GreWord_file_name).split('\n')
850 | for line in old_data_line_l:
851 | field_l = line.split('\t')
852 | word = field_l[1]
853 | usage_index = int(field_l[2])
854 | my_note = field_l[6]
855 | if my_note != '':
856 | new3000_base_d[word]['usages'][usage_index-1]['mynotes'] = my_note
857 | ```
858 |
859 |
860 | ```python
861 | add_mynotes()
862 | ```
863 |
864 | ## Adding similar-looking words
865 |
866 | During the first pass of memorisation the exact word forms are not yet remembered precisely, so similar-looking words get misread all the time. A step is therefore added that automatically finds similar word forms with the [Wagner–Fischer algorithm](https://en.wikipedia.org/wiki/Wagner%E2%80%93Fischer_algorithm) and appends each similar word's short Chinese gloss after it. The Levenshtein distance is computed with an existing [implementation](https://gist.github.com/kylebgorman/8034009); it is not optimised, so it can be slow. Distances are not cached either: after computing the distance between A and B, the distance between B and A is computed again.
867 |
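A tiny illustration of the distance check used in `add_similar_word.py` (the script keeps words whose cost is at most 2):

```python
from wagnerfischerpp import WagnerFischer

print WagnerFischer(u'alloy', u'ally').cost     # 1  -> counted as a similar form
print WagnerFischer(u'alloy', u'abandon').cost  # well above the threshold -> ignored
```
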
868 | Because IPython has trouble running multiprocessing code interactively, as the [multiprocessing](https://docs.python.org/2/library/multiprocessing.html) documentation notes:
869 |
870 | This means that some examples, such as the Pool examples will not work in the interactive interpreter.
871 |
872 | The actual program therefore lives in the script `add_similar_word.py`, which generates `similar_word.txt` containing the data we need.
873 |
874 |
875 | ```python
876 | with open('similar_word.txt') as f:
877 | similar_word_l = json.load(f)
878 | ```
879 |
880 |
881 | ```python
882 | similar_word_d = {pair[0]:pair[1] for pair in similar_word_l}
883 | ```
884 |
885 |
886 | ```python
887 | def add_similar_word_to_new3000_base_d():
888 | for word in similar_word_d:
889 | new3000_base_d[word]['similar_form'] = similar_word_d[word]
890 | ```
891 |
892 | ## Generating the import file
893 |
894 |
895 | ```python
896 | greword_import_data_l = convert_to_GreWord()
897 | with codecs.open(output_file_GreWord, 'w', encoding='utf-8') as f:
898 | for one_line in greword_import_data_l:
899 | one_string = u'\t'.join(one_line) + '\n'
900 | f.write(one_string)
901 | ```
902 |
903 |
904 | ```python
905 | # test
906 | #iter_print(new3000_base_d['hike'])
907 | ```
908 |
909 | # GrePhrase
910 |
911 |
912 | ```python
913 | %%sync_to_file $configCreAnkiImpGrePhrase
914 | file_name_duanyu = 'duanyu_base_d.txt'
915 | duanyu_base_d = is_file_and_json_load(file_name_duanyu)
916 | output_file_GrePhrase = 'AnkiImportData_GrePhrase.txt'
917 | ```
918 |
919 |
920 | ```python
921 | print 'The structure of duanyu_base_d'
922 | pprint(duanyu_base_d['under one\'s control1'])
923 | ```
924 |
925 | The structure of duanyu_base_d
926 | {u'cn_exp': u'\u5728\u2026\u2026\u7684\u63a7\u5236\u4e4b\u4e0b',
927 | u'en_exp': u'If something is **under** your **control**, you have the **power to make** all the important **decisions** about the way that it is run.',
928 | u'example': u'The current protest doesn\u2019t look likely to be brought under government\u2019s control any time soon.',
929 | u'gre_example_cn': u'\u5f53\u5fb7\u514b\u8428\u65af\u5dde\u8fd8\u5904\u4e8e\u58a8\u897f\u54e5\u7684\u7ba1\u8f96\u4e2d\u65f6\uff0c\u5c3d\u7ba1\u58a8\u897f\u54e5\u653f\u5e9c\u6781\u529b\u529d\u963b\u6765\u81ea\u7f8e\u56fd\u7684\u79fb\u6c11\uff0c\u5fb7\u5dde\u7684\u4eba\u53e3\u8fd8\u662f\u7ffb\u4e86\u4e24\u756a\u3002',
930 | u'gre_example_en': u'While Texas was under Mexican control, the population of Texas quadrupled, in spite of the fact that Mexico discouraged immigration from the United States.',
931 | u'phrase': u"under one's control",
932 | u'pos': 7,
933 | u'usage_index': u'1'}
934 |
935 |
936 |
937 | ```python
938 | %%sync_to_file $configCreAnkiImpGrePhrase
939 | def convert_to_GrePhrase():
940 | with codecs.open(output_file_GrePhrase, 'w', encoding='utf-8') as f:
941 | my_notes = ''
942 | for phrase_uid, phrase_dict in duanyu_base_d.iteritems():
943 | one_line = [phrase_uid, phrase_dict['phrase'], phrase_dict['usage_index'], my_notes,
944 | phrase_dict['en_exp'], phrase_dict['cn_exp'],
945 | phrase_dict['example'], phrase_dict['gre_example_cn'],
946 | phrase_dict['gre_example_en']]
947 | one_line = '\t'.join(one_line) + '\n'
948 | f.write(one_line)
949 | ```
950 |
951 |
952 | ```python
953 | convert_to_GrePhrase()
954 | ```
955 |
956 |
957 | ```python
958 | %%sync_to_file $file_name_grephrase -p
959 |
960 | if __name__ == '__main__':
961 | if not (duanyu_base_d is None):
962 | convert_to_GrePhrase()
963 | ```
964 |
965 |
966 | ```python
967 |
968 | ```
969 |
970 |
971 | ```python
972 | ! jupyter nbconvert anki_import.ipynb --to markdown
973 | ! jupyter nbconvert anki_import.ipynb -- to html
974 | ```
975 |
976 | [NbConvertApp] WARNING | Collisions detected in jupyter_nbconvert_config.py and jupyter_nbconvert_config.json config files. jupyter_nbconvert_config.json has higher priority: {
977 | "Exporter": {
978 | "template_path": "['.', 'C:\\\\Users\\\\xiaohang\\\\AppData\\\\Roaming\\\\jupyter\\\\templates'] ignored, using [u'C:\\\\Users\\\\xiaohang\\\\AppData\\\\Roaming\\\\jupyter\\\\templates']"
979 | }
980 | }
981 | C:\Users\xiaohang\Anaconda\lib\site-packages\IPython\nbconvert.py:13: ShimWarning: The `IPython.nbconvert` package has been deprecated. You should import from ipython_nbconvert instead.
982 | "You should import from ipython_nbconvert instead.", ShimWarning)
983 | [NbConvertApp] Converting notebook anki_import.ipynb to markdown
984 | [NbConvertApp] Writing 30598 bytes to anki_import.md
985 | [NbConvertApp] WARNING | Collisions detected in jupyter_nbconvert_config.py and jupyter_nbconvert_config.json config files. jupyter_nbconvert_config.json has higher priority: {
986 | "Exporter": {
987 | "template_path": "['.', 'C:\\\\Users\\\\xiaohang\\\\AppData\\\\Roaming\\\\jupyter\\\\templates'] ignored, using [u'C:\\\\Users\\\\xiaohang\\\\AppData\\\\Roaming\\\\jupyter\\\\templates']"
988 | }
989 | }
990 | [NbConvertApp] WARNING | pattern u'to' matched no files
991 | [NbConvertApp] WARNING | pattern u'html' matched no files
992 | C:\Users\xiaohang\Anaconda\lib\site-packages\IPython\nbconvert.py:13: ShimWarning: The `IPython.nbconvert` package has been deprecated. You should import from ipython_nbconvert instead.
993 | "You should import from ipython_nbconvert instead.", ShimWarning)
994 | [NbConvertApp] Converting notebook anki_import.ipynb to html
995 | [NbConvertApp] Writing 298264 bytes to anki_import.html
996 |
997 |
--------------------------------------------------------------------------------
/convert_duanyu.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import re
3 | import json
4 | import codecs
5 | import functools
6 | import os.path
7 | from random import random
8 | from random import randint
9 | from pprint import pprint
10 | from copy import deepcopy
11 |
12 | from my_helpers import *
13 | file_duanyu = "base_data\GREGao Fen Bi Bei Duan Yu Da Pe - Yan Yu Zhen ,Gao Yu ,Chen Qi.txt"
14 | def extract_dy_unit(base_str):
15 | base_str = base_str.split(u"# 索引\n")[0]
16 | match_dy_unit_start_re = re.compile(ur'^# Unit \d+', re.M)
17 | base_unit_str_l = extract_content_between(base_str, match_dy_unit_start_re)
18 | base_unit_str_l = [base_unit_str.split(u'## 检测练习\n')[0] for base_unit_str in base_unit_str_l]
19 | return base_unit_str_l
20 | def extract_dy_index_content(base_str):
21 | match_dy_index_cn_start_re = re.compile(u' (?=[\u4e00-\u9fa5\u3010])')
22 | index_str = base_str.split(u"# 索引\n")[1]
23 | index_d = {}
24 | for line_str in index_str.split('\n'):
25 | if line_str == '':
26 | continue
27 | line_str = strF2H(line_str)
28 | en_cn = match_dy_index_cn_start_re.split(line_str)
29 | if len(en_cn) == 2:
30 | index_d[en_cn[0]] = en_cn[1]
31 | else:
32 | print 'Warning, no en or no cn:', en_cn
33 | return index_d
34 | def extract_dy_phrase_d(base_unit_str_l):
35 | base_phrase_d = {}
36 | for unit_index, base_unit_str in enumerate(base_unit_str_l):
37 | match_phrase_start_re = re.compile(ur'^\*\*([a-z].*?)([\u3000\u4e00-\u9fa5].*)?\*\*$',
38 | re.M|re.I)
39 | phrase_block_str_l = extract_content_between(base_unit_str, match_phrase_start_re)
40 | for phrase_block_str in phrase_block_str_l:
41 | match_result = match_phrase_start_re.match(phrase_block_str)
42 | if match_result is None:
43 | print phrase_block_str
44 | phrase_en = match_result.group(1)
45 | phrase_exp_cn = match_result.group(2)
46 | if phrase_exp_cn is None:
47 | phrase_exp_cn = ''
48 | else:
49 | phrase_exp_cn = phrase_exp_cn.strip(u'\u3000 ')
50 | phrase_block_str = phrase_block_str[match_result.end():].strip('\n ')
51 | base_phrase_d[phrase_en] = {'exp_cn': phrase_exp_cn,
52 | 'phrase_block_str': phrase_block_str,
53 | 'pos': unit_index}
54 | return base_phrase_d
55 | def process_dy_phrase_block_str(base_d):
56 | processed_phrase_d = {}
57 | for phrase, base_phrase_d in base_d.iteritems():
58 | phrase_block_str = base_phrase_d['phrase_block_str']
59 | has_multiple_cn_exp = base_phrase_d['exp_cn'] == ''
60 | match_dy_multi_cn_exp_start_re = re.compile(ur'^\*\*\d+\. (.*)\*\*$', re.M)
61 | if has_multiple_cn_exp:
62 | exp_cn_l = match_dy_multi_cn_exp_start_re.findall(phrase_block_str)
63 | phrase_block_str_l = extract_content_between(phrase_block_str,
64 | match_dy_multi_cn_exp_start_re)
65 | else:
66 | exp_cn_l = [base_phrase_d['exp_cn']]
67 | phrase_block_str_l = [phrase_block_str]
68 |
69 | match_en_exp_re = re.compile(ur'^\*\*释\*\* (.*)$', re.M)
70 | match_example_re = re.compile(ur'^\*\*例\*\* (.*)$', re.M)
71 | match_gre_example = re.compile(ur'\*\*题\*\* (.*)$', re.S)
72 |
73 | for usage_index, phrase_block_str in enumerate(phrase_block_str_l):
74 |
75 | phrase_detailed_d = {}
76 | exp_en = match_en_exp_re.search(phrase_block_str).group(1)
77 | example = match_example_re.search(phrase_block_str).group(1)
78 | gre_example_en_cn = match_gre_example.search(phrase_block_str).group(1).split('\n')
79 | gre_example_en = gre_example_en_cn[0]
80 | gre_example_cn = gre_example_en_cn[2]
81 | phrase_detailed_d = {'en_exp': exp_en,
82 | 'cn_exp': exp_cn_l[usage_index],
83 | 'example': example,
84 | 'gre_example_en': gre_example_en,
85 | 'gre_example_cn': gre_example_cn,
86 | 'pos': base_phrase_d['pos'],
87 | 'usage_index': unicode(usage_index + 1),
88 | 'phrase': phrase
89 | }
90 | phrase_uid = phrase + unicode(usage_index+1)
91 | processed_phrase_d[phrase_uid] = phrase_detailed_d
92 | return processed_phrase_d
93 | def main(file_name=None):
94 | if file_name is None:
95 | file_name = file_duanyu
96 | # for module call
97 | if not os.path.isfile(file_name):
98 | return
99 | dy_base_str = codecs_open_r_utf8(file_duanyu)
100 | match_escape_char_re = re.compile(r'\\(?=[\[\]()*+])')
101 | dy_base_str = match_escape_char_re.sub('', dy_base_str)
102 | dy_base_unit_str_l = extract_dy_unit(dy_base_str)
103 | dy_index_d = extract_dy_index_content(dy_base_str)
104 | dy_phrase_d = extract_dy_phrase_d(dy_base_unit_str_l)
105 | # revise ’'
106 | dy_phrase_d['under one\'s control'] = dy_phrase_d[u'under one’s control']
107 | dy_phrase_d['on one\'s own'] = dy_phrase_d[u'on one’s own']
108 | del dy_phrase_d[u'under one’s control'], dy_phrase_d[u'on one’s own']
109 | dy_phrase_processed_d = process_dy_phrase_block_str(dy_phrase_d)
110 | with codecs.open('duanyu_base_d.txt', 'w', encoding='utf-8') as f:
111 | json.dump(dy_phrase_processed_d, f)
112 | if __name__ == '__main__':
113 | main()
114 |
--------------------------------------------------------------------------------
/convert_new3000.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import re
3 | import json
4 | import codecs
5 | import functools
6 | import os.path
7 | from random import random
8 | from random import randint
9 | from pprint import pprint
10 | from copy import deepcopy
11 |
12 | from my_helpers import *
13 | file_new_3000 = "base_data\GREHe Xin Ci Hui Kao Fa Jing Xi (Xin Dong Fang Da Yu Ying Yu Xue Xi Cong Shu ) - Chen Qi.txt"
14 | match_new3000_list_start_re = re.compile(ur'^# List \d+', re.M)
15 | def strip_last_list(list_data):
16 | strip_start_re = re.compile(ur'# Word List 1 与说有关的词根构成的单词(.|\n)*$')
17 | return strip_start_re.sub('', list_data)
18 | match_unit_start_re = re.compile(ur'^## Unit \d+', re.M)
19 | match_word_block_start = re.compile(ur'^\*\*(?P<word>[a-z\-éï]+)\*\*(?P<phon>\[.+\])?', re.U|re.M)
20 | # phon represent phonetic symbol
21 | def get_word_of_one_unit(unit_block_str, list_index, unit_index):
22 | returned_words_d_d = {}
23 | word_block_str_l = extract_content_between(unit_block_str, match_word_block_start)
24 | for word_block_str in word_block_str_l:
25 | first_line_match = match_word_block_start.match(word_block_str)
26 | word = first_line_match.group('word')
27 | phon = first_line_match.group('phon')
28 | one_word_d = {'word_block_str': match_word_block_start.sub('', word_block_str),
29 | 'phon': strF2H(phon) if phon else u'',
30 | 'pos':(list_index, unit_index)}
31 | returned_words_d_d[word] = one_word_d
32 | return returned_words_d_d
33 | def get_new3000_base_d(base_unit_data_l_l):
34 | _new3000_base_d = {}
35 | for list_index, unit_data_l in enumerate(base_unit_data_l_l):
36 | for unit_index, unit_data in enumerate(unit_data_l):
37 | _new3000_base_d.update(get_word_of_one_unit(unit_data, list_index+1, unit_index+1))
38 | return _new3000_base_d
39 | # revise
40 | def revise_word_base_data(word_d):
41 | # revise anarchist
42 | word_block_str = 'word_block_str'
43 | to_revise_word_d = word_d['anarchist']
44 | to_revise_str = to_revise_word_d[word_block_str]
45 | to_revise_word_d[word_block_str] = to_revise_str.replace(u'同', u'近')
46 | # revise compliment
47 | to_revise_word_d = word_d['compliment']
48 | to_revise_str = to_revise_word_d[word_block_str]
49 |     to_revise_word_d['phon'] = [strF2H(phon) for phon in re.findall(ur'\[.+?\]', to_revise_str)]
50 | to_revise_word_d[word_block_str] = '\n'.join(to_revise_str.split('\n')[1:])
51 |     # revise antediluvian, revise anecdote
52 | for to_revise_word in ['antediluvian', 'anecdote']:
53 | to_revise_word_d = word_d[to_revise_word]
54 | to_revise_str = to_revise_word_d[word_block_str]
55 | temp_index = 0
56 | for match_result in re.finditer(ur'\n\n', to_revise_str):
57 | if temp_index == 2:
58 | to_revise_str = to_revise_str[0:match_result.start()] + u'‖' + to_revise_str[match_result.end():]
59 | break
60 | temp_index += 1
61 | to_revise_word_d[word_block_str] = to_revise_str
62 | return word_d
63 | character_start = {'examples': '例',
64 | 'syns': '近',
65 | 'ants': '反',
66 | 'der': '派'}
67 | is_str_start_with_character_fun_d = {}
68 | for key, value in character_start.iteritems():
69 | def gen_match_fun_closure(_value):
70 | return lambda s: s[0] == _value.decode('utf-8')
71 | is_str_start_with_character_fun_d[key] = gen_match_fun_closure(value)
72 | def revise_entry_name(words_d):
73 | # revise random
74 | words_d['random']['word_block_str'] = words_d['random']['word_block_str'].replace(u'例 aimless',
75 | u'近 aimless')
76 | # revise sordid
77 |     words_d['sordid']['word_block_str'] = words_d['sordid']['word_block_str'].replace(u'近 Behind his generous',
78 | u'例 Behind his generous')
79 | # revise clan
80 | words_d['clan']['word_block_str'] = words_d['clan']['word_block_str'] .replace(u'反 clannish',
81 | u'派 clannish')
82 | match_usage_start_re = re.compile(ur'^【考(?:法|点)\d?】(.*)$', re.M|re.U)
83 | match_der = re.compile(ur'^')
84 | def wb_str_2_usages_d_l(word_block_str):
85 | '''
86 | convert word block (string) to usages like structure
87 | input: the 'word_block_str' attribute of a word dictionary
88 | return: two lists,
89 | the first with its 'i'th element indicating whether
90 | the 'i'th usage has a complex der
91 | the second is the list of usages
92 | '''
93 | usage_template = {'exp': '',
94 | 'examples': '',
95 | 'syns': '',
96 | 'ants': '',
97 | 'der': ''}
98 | usages_str_l = extract_content_between(word_block_str, match_usage_start_re)
99 | usages_d_l = []
100 | is_complex_der_l = []
101 |
102 | for one_usage_str in usages_str_l:
103 | one_usage_d = deepcopy(usage_template)
104 | is_complex_der = False
105 | has_der = False
106 | one_usage_lines = one_usage_str.split('\n')
107 | one_usage_d['exp'] = match_usage_start_re.match(one_usage_lines[0]).group(1)
108 |
109 | for line in one_usage_lines[1:]:
110 | has_been_matched = False
111 |
112 | if line == '' or line == '\n':
113 | continue
114 | # match "例" "反", etc.
115 | for field_name, match_func in is_str_start_with_character_fun_d.iteritems():
116 | if match_func(line):
117 | has_been_matched = True
118 | if has_der:
119 | one_usage_d['der'] += '\n' + line.strip()
120 | is_complex_der = True
121 | else:
122 | # test
123 | if one_usage_d[field_name] != '':
124 | print '****Multi line field!****'
125 | print word_block_str
126 | pass
127 | one_usage_d[field_name] = line.strip()
128 | if field_name == 'der':
129 | # test
130 | if has_der:
131 | # print 'Warning! der in der!'
132 | # print one_usage_str
133 | pass
134 | has_der = True
135 | break
136 | if not has_been_matched:
137 | # after printed out, it can be seen that these lines are all aphorisms
138 | # so, useless for our purpose
139 | #print line
140 | pass
141 | usages_d_l.append(one_usage_d)
142 | is_complex_der_l.append(is_complex_der)
143 | return is_complex_der_l, usages_d_l
144 | def gen_usages_for_all_words(words_d):
145 | match_der_word = re.compile(ur'^派 ([a-z,/\-éï]+)', re.M)
146 | complex_ders_d = {}
147 | for word in words_d:
148 | if words_d[word]['word_block_str'] == '':
149 | print 'Empty word:', word
150 | continue
151 | is_complex_der_l, words_d[word]['usages'] = wb_str_2_usages_d_l(words_d[word]['word_block_str'])
152 | if True in is_complex_der_l:
153 | for i, one_usage in enumerate(words_d[word]['usages']):
154 | # revise plumb
155 | if i == 2 and word == u'plumb':
156 | one_usage['examples'] = one_usage['der']
157 | one_usage['der'] = ''
158 | continue
159 | if is_complex_der_l[i]:
160 | whole_der_block_str = strF2H(one_usage['der'])
161 | der_block_str_l = extract_content_between(whole_der_block_str, match_der_word)
162 | for der_block_str in der_block_str_l:
163 | # revise daunt
164 | if word == 'daunt':
165 | der_block_str = der_block_str.replace(', ', '/')
166 | der_word = match_der_word.match(der_block_str).group(1)
167 | der_block_str = match_der_word.sub(ur'【考法】', der_block_str)
168 | complex_ders_d[der_word] = {}
169 | _, complex_ders_d[der_word]['usages'] = wb_str_2_usages_d_l(der_block_str)
170 | if len(complex_ders_d[der_word]['usages']) != 1:
171 | print 'Warning! Not unique explanation!'
172 | continue
173 | complex_ders_d[der_word]['usages'][0]['der'] = u'源 ' + word
174 | complex_ders_d[der_word]['phon'] = u''
175 | complex_ders_d[der_word]['pos'] = words_d[word]['pos']
176 | complex_ders_d[der_word]['word_block_str'] = u''
177 | # test
178 | #print der_word
179 | #iter_print(complex_ders_d[der_word]['usages'])
180 | #del words_d[word]['word_block_str']
181 | return complex_ders_d, words_d
182 | match_phon_re = re.compile(ur'\[.*?\]', re.U)
183 | match_pspeech_re = re.compile(ur'\*([a-z\/.]+\.)\*')
184 | has_cn_char_fun = lambda _str: re.compile(ur'[\u4e00-\u9fa5]').search(_str) is not None
185 | def process_exp(exp_field_str):
186 | '''
187 | input: a unicode object corresponding the explanation line of the word
188 | return: dict {exp, pspeech, ph_symbl}
189 | '''
190 | if exp_field_str == '':
191 | print 'Warning! No explanation!'
192 | return
193 | returned_d = {'exp': {'en': '', 'cn': '', 'en_cn': ''},
194 | 'pspeech': '',
195 | 'ph_symbl': ''}
196 |
197 | result = match_pspeech_re.search(exp_field_str)
198 | if result:
199 | returned_d['pspeech'] = result.group(1)
200 | exp_field_str = match_pspeech_re.sub('', exp_field_str, 1)
201 |
202 | result = match_phon_re.search(exp_field_str)
203 | if result:
204 | returned_d['ph_symbl'] = result.group()
205 | exp_field_str = match_phon_re.sub('', exp_field_str, 1).strip()
206 |
207 | returned_d['exp']['en_cn'] = exp_field_str.strip()
208 |
209 | # separate en and cn
210 | spered_str_l = [_str.strip() for _str in strF2H(exp_field_str).split(u':')]
211 | seperator_count = len(spered_str_l) - 1
212 | if seperator_count == 0:
213 | # test whether no separator guarantees no Chinese explanation
214 | # print 'No sep', spered_str_l
215 | returned_d['exp']['cn'] = spered_str_l[0]
216 | elif seperator_count == 1:
217 | returned_d['exp']['cn'], returned_d['exp']['en'] = spered_str_l
218 | elif seperator_count == 2:
219 | # test
220 | # print 'Two sep: ', spered_str_l
221 | has_char_cn_boo_l = map(has_cn_char_fun, spered_str_l)
222 | returned_d['exp']['cn'] = u':'.join([spered_str_l[i] for i in range(seperator_count+1) if has_char_cn_boo_l[i]])
223 | returned_d['exp']['en'] = u':'.join([spered_str_l[i] for i in range(seperator_count+1) if not has_char_cn_boo_l[i]])
224 | # test
225 | #iter_print(returned_d['exp'])
226 | else:
227 | # test
228 | #print 'More than two sep: ', exp_field_str
229 | pass
230 | return returned_d
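   | # Editorial sketch (hypothetical entry, for illustration): given the conventions above
   | # (part of speech wrapped in *...*, optional [...] phonetics, Chinese/English split by a colon),
   | #   process_exp(u'*v.* 提高:to raise')
   | # returns {'exp': {'en': u'to raise', 'cn': u'提高', 'en_cn': u'提高:to raise'},
   | #          'pspeech': u'v.', 'ph_symbl': u''}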
231 | def process_exp_field_for_all_words(words_d):
232 | for word, usage_index, exp_str in iter_value_of_key_through_d_l_d_d(words_d, 'usages', 'exp',
233 | yield_top_key=True, yield_list_index=True):
234 | base_exp_d = None
235 | # get base_exp_d
236 | # revise abuse
237 | if word == 'abuse' and usage_index == 1:
238 | exp_str_l = exp_str.split(';')
239 | base_exp_d, extra_exp_d = map(process_exp, exp_str_l)
240 | base_exp_d['exp']['en'] = base_exp_d['exp']['en'] + ';' + extra_exp_d['exp']['en']
241 | base_exp_d['exp']['cn'] = base_exp_d['exp']['cn'] + ';' + extra_exp_d['exp']['cn']
242 | # test
243 | #iter_print(base_exp_d)
244 |
245 | # revise disaffected
246 | elif word == 'disaffect':
247 | base_exp_d = process_exp(exp_str.split(';')[0])
248 | # test
249 | #iter_print(base_exp_d)
250 |
251 | else:
252 | base_exp_d = process_exp(exp_str)
253 |
254 | # get phonic symbol from parent field
255 | if base_exp_d['ph_symbl'] == u'':
256 | # revise compliment
257 | if word == 'compliment':
258 | if usage_index == 0:
259 | base_exp_d['ph_symbl'] = 'n. ' + words_d[word]['phon'][0] + \
260 | ' v. ' + words_d[word]['phon'][1]
261 | else:
262 | base_exp_d['ph_symbl'] = words_d[word]['phon'][0]
263 | else:
264 | # test
265 | if usage_index > 2:
266 | #print word
267 | pass
268 | base_exp_d['ph_symbl'] = words_d[word]['phon']
269 | one_usage = words_d[word]['usages'][usage_index]
270 | one_usage['ph_symbl'] = base_exp_d['ph_symbl']
271 | del base_exp_d['ph_symbl']
272 | one_usage['pspeech'] = base_exp_d['pspeech']
273 | del base_exp_d['pspeech']
274 | one_usage['exp_d'] = base_exp_d['exp']
275 | return words_d
276 | match_all_cn_re = ur' ?[a-z0-9:。;,“”()、?《》]*?[\u4e00-\u9fa5]+.*?(?=$|[a-z]+ [a-z]+)'
277 | match_all_cn_re = re.compile(match_all_cn_re, re.I)
278 | match_cn_punc_with_en_char_fun = lambda _str: re.search(ur'[。?]( )?(?=[a-z])', _str, re.I)
279 | match_cn_char_with_en_char_fun = lambda _str: re.search(ur'[\u4e00-\u9fa5](?=[a-z])', _str, re.I)
280 | # revise
281 | def revise_no_sep(words_d):
282 | path_to_example = [('all', '', True), ('key', 'usages', False), ('all','',True),('key','examples',False)]
283 | example_iter = iter_through_general(words_d, path_to_example)
284 | for word, usage_index, example_str in example_iter:
285 | if example_str == '':
286 | continue
287 | example_str = example_str[2:]
288 | if u'\u2016' not in example_str:
289 | results = match_all_cn_re.findall(example_str)
290 | if len(results) > 1:
291 | index_to_add_sep = None
292 | one_result = match_cn_punc_with_en_char_fun(example_str)
293 | if one_result:
294 | index_to_add_sep = one_result.end()
295 | elif word in [u'heckle', u'carefree']:
296 | one_result = match_cn_char_with_en_char_fun(example_str)
297 | index_to_add_sep = one_result.end()
298 | elif word == 'clarify':
299 | example_str = example_str.replace(u';', u'\u2016')
300 | if index_to_add_sep:
301 | example_str = example_str[:index_to_add_sep] + u'\u2016' + example_str[index_to_add_sep:]
302 | words_d[word]['usages'][usage_index]['examples'] = u'例 ' + example_str
303 | return words_d
304 | match_sentence_en_part_re = re.compile(ur'[a-z0-9éï\'";:,?!%()$ⅠⅡ.*/\- — ‘’“”()]+(?=[<《〈\u4e00-\u9fa5])', re.I)
305 | def sep_en_cn_sentence(sentences_str):
306 | if sentences_str == '':
307 | return '', '', '',
308 | sentences_str = sentences_str[2:].replace(u'\!', u'!')
309 | is_number_fun = lambda _str: re.match('\d', _str)
310 | en_str_l = []
311 | cn_str_l = []
312 | en_cn_str_l= []
313 | for sentence in sentences_str.split(u'\u2016'):
314 | sentence = sentence.strip(u' \n')
315 | en_cn_str_l.append(sentence)
316 | result = match_sentence_en_part_re.match(sentence)
317 | if result:
318 | en_str = result.group()
319 | # test
320 | if not (en_str[-1] in [' ', '.', u')', u'”']):
321 | if en_str[-1] == u'“':
322 | #print en_str
323 | en_str = en_str[:-1]
324 | #print en_str
325 | elif is_number_fun(en_str[-1]) or (en_str[-2:] in ['RE', 'IT', 'on', 'NA']):
326 | #print en_str
327 | last_blank_space = len(en_str) - 1
328 | while en_str[last_blank_space] != ' ':
329 | last_blank_space -= 1
330 | en_str = en_str[:last_blank_space]
331 | #print en_str
332 | elif en_str[-2:] == u'“‘':
333 | #print en_str
334 | en_str = en_str[:-2]
335 | #print en_str
336 | else:
337 | #print en_str
338 | #print sentence
339 | pass
340 | en_str_l.append(strF2H(en_str).strip())
341 | cn_str_l.append(sentence.replace(en_str, ''))
342 | else:
343 | print sentence
344 | raise ValueError('Warning! No en part!')
345 | return new_line_join(en_str_l), new_line_join(cn_str_l), new_line_join(en_cn_str_l)
346 | def process_examples(words_d):
347 | path_to_example = [('all', '', True), ('key', 'usages', False), ('all','',True),('key','examples',False)]
348 | example_iter = iter_through_general(words_d, path_to_example)
349 | for word, usage_index, example_str in example_iter:
350 | examples_en, examples_cn, examples_encn = sep_en_cn_sentence(example_str)
351 | words_d[word]['usages'][usage_index]['examples_d'] = {'en': examples_en, 'cn': examples_cn, 'en_cn': examples_encn}
352 | return words_d
353 | match_ants_en_part_re = re.compile(ur'[a-zéï][a-zéï ,-/]+(?=[ \u4e00-\u9fa5(]|$)', re.I)
354 | def sep_en_cn_ants(ants_str):
355 | if ants_str == '':
356 | return '', '', '', 0
357 | ants_str = ants_str[2:]
358 | num_ants_of_explanations = 0
359 | en_str_l = match_ants_en_part_re.findall(ants_str)
360 | num_ants_of_explanations = len(en_str_l)
361 | # test
362 | if num_ants_of_explanations == 0:
363 | print 'Warning! No en part!', ants_str
364 | cn_str = match_ants_en_part_re.sub('', ants_str).strip(' \n')
365 | search_en_fun = lambda _str: re.search(r'[a-z]', _str, re.I)
366 | if search_en_fun(cn_str):
367 | print 'Warning! en in cn part!', cn_str
368 | en_cn = ants_str.strip(' \n')
369 | return '; '.join(en_str_l), cn_str, en_cn, num_ants_of_explanations
370 | def process_all_ants(words_d):
371 | path_to_ants = [('all','',True),('key','usages',False),('all','',True),('key','ants',False)]
372 | ants_iter = iter_through_general(words_d, path_to_ants)
373 | for word, usage_index, ant_str in ants_iter:
374 | en_str, cn_str, en_cn_str, num_exps = sep_en_cn_ants(ant_str)
375 | words_d[word]['usages'][usage_index]['ants_d'] = {'en': en_str, 'cn': cn_str, 'en_cn': en_cn_str}
376 | # test
377 | if num_exps > 1:
378 | #print word
379 | pass
380 | return words_d
381 | strip_first_two_chars_fun = lambda _str: _str[2:]
382 | def process_all_syns(words_d):
383 | path_to_syns = [('all','',True),('key','usages',False),('all','',True),('key','syns',False)]
384 | for word, usage_index, syns_str in iter_through_general(words_d, path_to_syns):
385 | usage_d = words_d[word]['usages'][usage_index]
386 | usage_d['syns'] = strip_first_two_chars_fun(syns_str)
387 | return words_d
388 | def supplement_word_ph_symbl(words_d):
389 | path_to_phsymb = [('all','',True),('key','usages',False),('all','',True),('key','ph_symbl',False)]
390 | for word, usage_index, ph_symbl in iter_through_general(words_d, path_to_phsymb):
391 | usage_d = words_d[word]['usages'][usage_index]
392 | if usage_d['ph_symbl'] == '':
393 | cur_pspeech = usage_d['pspeech']
394 | if usage_index == 0:
395 | # uncomment the print if you want to check
396 | #print 'Word %s has no phonetic symbol, maybe it is a derivative.'%word
397 | continue
398 | pre_usage_d = words_d[word]['usages'][usage_index-1]
399 | pre_pspeech = pre_usage_d['pspeech']
400 | pre_phsymbl = pre_usage_d['ph_symbl']
401 | if pre_pspeech != cur_pspeech:
402 | if not cur_pspeech.startswith('v'):
403 | # already checked the v. vi. vt. case
404 | print 'Previous pspeech is different. Please check! Word %s'%word
405 | iter_print(usage_d)
406 | continue
407 | usage_d['ph_symbl'] = pre_phsymbl
408 | return words_d
409 | def main(file_name=None):
410 | if file_name is None:
411 | file_name = file_new_3000
412 | # for module call
413 | if not os.path.isfile(file_name):
414 | return
415 | new3000_base_str = codecs_open_r_utf8(file_name)
416 | new3000_base_list_data_l = extract_content_between(new3000_base_str, match_new3000_list_start_re)
417 | new3000_base_list_data_l[30] = strip_last_list(new3000_base_list_data_l[30])
418 | new3000_base_unit_data_l_l = map(functools.partial(extract_content_between,
419 | match_re=match_unit_start_re),
420 | new3000_base_list_data_l)
421 | new3000_base_d = get_new3000_base_d(new3000_base_unit_data_l_l)
422 | # revise
423 | subset_to_revise_d = {word:deepcopy(new3000_base_d[word]) for word in ['anarchist', 'compliment', 'antediluvian', 'anecdote']}
424 | subset_to_revise_d = revise_word_base_data(subset_to_revise_d)
425 | new3000_base_d.update(subset_to_revise_d)
426 | del subset_to_revise_d, new3000_base_list_data_l, new3000_base_unit_data_l_l, new3000_base_str
427 | revise_entry_name(new3000_base_d)
428 | complex_ders_d, new3000_base_d = gen_usages_for_all_words(new3000_base_d)
429 | new3000_base_d.update(complex_ders_d)
430 | del complex_ders_d
431 | new3000_base_d = process_exp_field_for_all_words(new3000_base_d)
432 | new3000_base_d = revise_no_sep(new3000_base_d)
433 | new3000_base_d = process_examples(new3000_base_d)
434 | new3000_base_d['enfranchise']['usages'][1]['ants'] = new3000_base_d['enfranchise']['usages'][1]['ants'].replace(u'subdue; enthrall', u'subdue, enthrall')
435 | new3000_base_d = process_all_ants(new3000_base_d)
436 | new3000_base_d = process_all_syns(new3000_base_d)
437 | # revise compendium
438 | new3000_base_d['compendium']['usages'][1]['pspeech'] = 'n.'
439 | new3000_base_d = supplement_word_ph_symbl(new3000_base_d)
440 | with codecs.open('new3000_base_d.txt', 'w', encoding='utf-8') as f:
441 | json.dump(new3000_base_d, f)
442 |
443 | if __name__ == '__main__':
444 | main()
--------------------------------------------------------------------------------
/convert_zhuji.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import re
3 | import json
4 | import codecs
5 | import functools
6 | import os.path
7 | from random import random
8 | from random import randint
9 | from pprint import pprint
10 | from copy import deepcopy
11 |
12 | from my_helpers import *
13 | file_zhuji = "base_data\\GREHe Xin Ci Hui Zhu Ji Yu Jing - Cao Tian Cheng.txt"
14 | match_escape_char_re = re.compile(r'\\(?=[\[\]()*+])')
15 | match_zhuji_list_start_re = re.compile(ur'### List \d+', re.M)
16 | def get_etyma_block_d_l_l(list_data_l):
17 | match_etyma_block_start = re.compile(r'^\d+\.(.*)$\n|^Unit \d+$', re.M)
18 | etyma_block_d_l_l = []
19 | for list_index, base_list_str in enumerate(list_data_l):
20 | if list_index > 38:
21 | break
22 | etyma_block_d_l = []
23 | base_list_str = base_list_str.split(u'小结&复习')[0]
24 | etyma_block_str_l = extract_content_between(base_list_str, match_etyma_block_start)
25 | for etyma_index, etyma_block_str in enumerate(etyma_block_str_l):
26 | ety_str = match_etyma_block_start.search(etyma_block_str).group(1)
27 | if ety_str is None:
28 | ety_str = ''
29 | ety_str = ety_str.strip()
30 | if ety_str == u'其他':
31 | #print u'词根是其他'
32 | ety_str = ''
33 | if list_index == 36-1:
34 | ety_str = u'与动物有关的单词,' + ety_str
35 | ety_str = ety_str.strip()
36 | etyma_block_str_and_summary_str = etyma_block_str.split(u'小结')
37 | summary_str = etyma_block_str_and_summary_str[1] if len(etyma_block_str_and_summary_str) == 2 else ''
38 | etyma_block_str = match_etyma_block_start.sub('', etyma_block_str_and_summary_str[0])
39 | # revise surg, cit
40 | if ety_str == 'surg, cit':
41 | temp_str_l = etyma_block_str.split('\n')
42 | #iter_print(temp_str_l)
43 | # insert line 5 after line 0
44 | modified_str_l = [temp_str_l[0], temp_str_l[5]] + temp_str_l[1:5] + temp_str_l[6:]
45 | etyma_block_str = '\n'.join(modified_str_l)
46 | #print etyma_block_str
47 | # revise rejoice
48 | if ety_str == u'欢乐与喜悦':
49 | temp_str_l = etyma_block_str.split('\n')
50 | #iter_print(temp_str_l)
51 | modified_str_l = [temp_str_l[0], temp_str_l[9]] + temp_str_l[1:9]
52 | etyma_block_str = '\n'.join(modified_str_l)
53 | #print etyma_block_str
54 | etyma_block_d = {'pos':(list_index+1, etyma_index+1),
55 | 'ety': ety_str,
56 | 'ety_block_str': etyma_block_str,
57 | 'summary': summary_str}
58 | etyma_block_d_l.append(etyma_block_d)
59 | etyma_block_d_l_l.append(etyma_block_d_l)
60 | return etyma_block_d_l_l
61 | def revise_miss_etyma(base_d_l_l):
62 | # revise list 25 etyma 3 revise tum
63 | base_d_l_l[25-1][3-1]['ety'] = 'tum'
64 | # revise list 5 etyma 4 revise post, pound
65 | base_d_l_l[5-1][4-1]['ety'] = 'post, pound'
66 | # revise list 6 etyma 7 revise vad, vag, ced
67 | base_d_l_l[6-1][7-1]['ety'] = 'vad, vag, ced'
68 | match_cognate_block_start_re = re.compile(ur'^([a-zéï-]+)(.*?)(\[.*\])$', re.M|re.I)
69 | def process_ety_block_str(base_d_l_l):
70 | path_to_ety_block_str = [('all','',True),('all','',True),('key','ety_block_str',False)]
71 | for list_index, ety_index, ety_block_str in iter_through_general(base_d_l_l,
72 | path_to_ety_block_str):
73 | etyma_block_d = base_d_l_l[list_index][ety_index]
74 | returned_l = extract_content_between(ety_block_str, match_cognate_block_start_re, True)
75 | ety_group_exp = returned_l.pop(0).strip()
76 | etyma_block_d['etyma_group_explanation'] = ety_group_exp
77 | etyma_block_d['cognate_block_str_l'] = returned_l
78 | # revise List 13, ety 3 revise scru
79 | def revise_scru(base_d_l_l):
80 | '''
81 | please call this only once;
82 | otherwise re-run the code cells starting from
83 | "zhuji_base_d_l_l = get_etyma_block_d_l_l(zhuji_base_list_l)"
84 | '''
85 | to_revise_l = base_d_l_l[13-1][3-1]['cognate_block_str_l']
86 | #iter_print(to_revise_l)
87 | # remove element 3-5 and build new dict
88 | new_l = to_revise_l[3:]
89 | to_revise_l[2] = to_revise_l[2].replace(u'以下的4个单词可以将scru按照读音联想成“四顾”,表示“ (顾虑地) 看”。', '')
90 | to_revise_l = to_revise_l[:3]
91 | new_ety = 'scru'
92 | new_ety_group_exp = u'将scru按照读音联想成“四顾”,表示“ (顾虑地) 看”'
93 | new_ety_d = {'cognate_block_str_l': new_l, 'pos': (13, 3),
94 | 'ety': new_ety,
95 | 'etyma_group_explanation': new_ety_group_exp,
96 | 'summary':'', 'ety_block_str':''}
97 | base_d_l_l[13-1].append(new_ety_d)
98 | def process_cognate_block(cognate_block_str):
99 | cognate_dict = {}
100 | cognate_lines_l = cognate_block_str.split('\n')
101 | first_line_match = match_cognate_block_start_re.match(cognate_lines_l.pop(0))
102 | word = first_line_match.group(1)
103 | if (word == '') or (word is None):
104 | print 'Warning!'
105 | cognate_dict['word'] = word
106 | phon = first_line_match.group(3)
107 | cognate_dict['phon'] = phon if not (phon is None) else ''
108 |
109 | modified_cognate_lines_l = []
110 | for cognate_line in cognate_lines_l:
111 | cognate_line = cognate_line.strip()
112 | if cognate_line == '':
113 | pass
114 | elif cognate_line.startswith(u'源'):
115 | # revise 源
116 | cognate_line = cognate_line.replace(u'源', u'[源]')
117 | # print cognate_line
118 | elif cognate_dict['word'] == u'facilitate':
119 | pass
120 | elif cognate_dict['word'] in ['jocular', 'jocund', 'jovial', 'rejoice']:
121 | pass
122 | elif cognate_line.startswith(u'以下两个单词中'):
123 | pass
124 | elif not cognate_line.startswith(u'['):
125 | # test
126 | print 'current line:', cognate_line, '\ncurrent block\n', cognate_block_str
127 | break
128 | else:
129 | pass
130 | modified_cognate_lines_l.append(cognate_line)
131 | cognate_dict['content'] = '\n'.join(modified_cognate_lines_l)
132 | return cognate_dict
133 | def process_all_cognate_block(base_data_d_l_l):
134 | base_word_d = {}
135 | path_to_cognate_block_str = [('all','',True),('all','',True),
136 | ('key','cognate_block_str_l',False),('all','',True)]
137 | for list_index, eytma_index, cognate_index, cognate_block_str in iter_through_general(base_data_d_l_l,
138 | path_to_cognate_block_str):
139 | one_word_d = process_cognate_block(cognate_block_str)
140 | word = one_word_d['word']
141 | for _key in ['pos', 'ety', 'etyma_group_explanation', 'summary']:
142 | one_word_d[_key] = base_data_d_l_l[list_index][eytma_index][_key]
143 | one_word_d['pos'] = ', '.join([unicode(i) for i in one_word_d['pos']])
144 | one_word_d['etyma_cognates_l'] = '' # waiting to be filled later
145 | if word in base_word_d:
146 | print 'Warning! word already exists!', word
147 | base_word_d[word] = one_word_d
148 | return base_word_d
149 | def add_etyma_cognates_l(base_word_d, base_d_l_l):
150 | path_to_etyma_d = [('all','',False),('all','',False)]
151 | for etyma_d, in iter_through_general(base_d_l_l, path_to_etyma_d):
152 | ety_str = etyma_d['ety']
153 | ety_group_exp = etyma_d['etyma_group_explanation']
154 | if ety_str != '' or ety_group_exp != '':
155 | if ety_str == '':
156 | # test
157 | print ety_group_exp
158 | etyma_cognates_l = []
159 | for cognate_block_str in etyma_d['cognate_block_str_l']:
160 | word = match_cognate_block_start_re.match(cognate_block_str).group(1)
161 | etyma_cognates_l.append(word)
162 | for word in etyma_cognates_l:
163 | base_word_d[word]['etyma_cognates_l'] = ', '.join(etyma_cognates_l)
164 | def main(file_name=None):
165 | if file_name is None:
166 | file_name = file_zhuji
167 | # for module call
168 | if not os.path.isfile(file_name):
169 | return
170 | zhuji_base_str = codecs_open_r_utf8(file_name)
171 | zhuji_base_str = match_escape_char_re.sub('', zhuji_base_str)
172 | zhuji_base_str = collapse_blank_line(zhuji_base_str)
173 | with codecs.open('temp_zhuji_base_str.txt', 'w', encoding='utf-8') as f:
174 | f.write(zhuji_base_str)
175 | zhuji_base_str = zhuji_base_str.split(u'# 第二篇 核心词汇练习')[0]
176 | zhuji_base_list_l = extract_content_between(zhuji_base_str, match_zhuji_list_start_re)
177 | zhuji_base_d_l_l = get_etyma_block_d_l_l(zhuji_base_list_l)
178 | revise_miss_etyma(zhuji_base_d_l_l)
179 | process_ety_block_str(zhuji_base_d_l_l)
180 | revise_scru(zhuji_base_d_l_l)
181 | zhuji_base_word_d = process_all_cognate_block(zhuji_base_d_l_l)
182 | add_etyma_cognates_l(zhuji_base_word_d, zhuji_base_d_l_l)
183 | with codecs.open('zhuji_base_d.txt', 'w', encoding='utf-8') as f:
184 | json.dump(zhuji_base_word_d, f)
185 |
186 | if __name__ == '__main__':
187 | main()
--------------------------------------------------------------------------------
/example_usage.apkg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/empenguinxh/Anki-CreateImportFile/9950194a50145fa3e1b84a535d7942136b28c418/example_usage.apkg
--------------------------------------------------------------------------------
/my_helpers.py:
--------------------------------------------------------------------------------
1 | # coding:utf-8
2 | import re
3 | import json
4 | import codecs
5 | import functools
6 | import os.path
7 | from random import random
8 | from random import randint
9 | from pprint import pprint
10 | from copy import deepcopy
11 | def strF2H(ustring):
12 | '''
13 | convert full width character to half width
14 | input: a unicode object
15 | return: a unicode object
16 | '''
17 | h_ustring = u""
18 | assert isinstance(ustring, unicode)
19 | for uchar in ustring:
20 | inside_code = ord(uchar)
21 | if inside_code == 12288:
22 | # full-width space (U+3000) -> ASCII space
23 | inside_code = 32
24 | elif 65281 <= inside_code <= 65374:
25 | # other characters
26 | inside_code -= 65248
27 |
28 | h_ustring += unichr(inside_code)
29 | return h_ustring
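   | # Editorial example: full-width characters map onto their ASCII counterparts, e.g.
   | #   strF2H(u'\uff21\uff22\uff23\uff1a\u3000\uff11') == u'ABC: 1'
   | # (U+FF21..FF23 -> 'ABC', full-width colon U+FF1A -> ':', ideographic space U+3000 -> ' ', U+FF11 -> '1')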
30 | # pretty print the embedded unicode of list or dict
31 | def iter_print(obj_iter, indent=0, increment=2, max_top_level_print=None,
32 | top_level=True, top_level_extra_line_feed=False, print_list_index=True):
33 | if not hasattr(obj_iter, '__iter__'):
34 | if isinstance(obj_iter, basestring):
35 | if obj_iter == u'':
36 | pass
37 | elif '\n' in obj_iter:
38 | for line in obj_iter.split('\n'):
39 | if line:
40 | print ' '*indent, line
41 | else:
42 | print ' '*indent, obj_iter
43 | else:
44 | print ' '*indent, obj_iter
45 | return
46 | print_count = 0
47 | if isinstance(obj_iter, dict):
48 | for key, iter_sub_obj in obj_iter.iteritems():
49 | print ' '*indent, key
50 | iter_print(iter_sub_obj, indent+increment, increment, None, False, False, print_list_index)
51 | if top_level:
52 | print_count += 1
53 | if max_top_level_print:
54 | if print_count >= max_top_level_print:
55 | break
56 | if top_level_extra_line_feed:
57 | print '\n'
58 | else:
59 | for list_index, sub_obj_iter in enumerate(obj_iter):
60 | if print_list_index:
61 | print ' '*indent, list_index
62 | iter_print(sub_obj_iter, indent+increment, increment, None, False, False, print_list_index)
63 | if top_level:
64 | print_count += 1
65 | if max_top_level_print:
66 | if print_count >= max_top_level_print:
67 | break
68 | if top_level_extra_line_feed:
69 | print '\n'
70 | def extract_content_between(obj_str, match_re, return_str_before_first_match=False):
71 | '''
72 | extract the content between the starts of two consecutive matches of the same pattern in a str,
73 | also extract the content after the last match
74 | input: obj_str, the string to extract content from, must be a unicode object
75 | match_re, the compiled pattern to be matched
76 | return: a list of str
77 | return_str_before_first_match: whether to also return the str before the first match of the given pattern
78 | '''
79 | assert isinstance(obj_str, unicode)
80 | retype = type(re.compile(r'a str'))
81 | assert isinstance(match_re, retype)
82 |
83 | match_results_iter = match_re.finditer(obj_str)
84 | returned_str_l = []
85 | start_index = None
86 | end_index = None
87 | first_start_index = None
88 | for match_result in match_results_iter:
89 | if first_start_index is None:
90 | first_start_index = match_result.start()
91 | if not (start_index is None):
92 | end_index = match_result.start()
93 | returned_str_l.append(obj_str[start_index:end_index])
94 | start_index = match_result.start()
95 | returned_str_l.append(obj_str[start_index:])
96 | if return_str_before_first_match:
97 | returned_str_l = [obj_str[:first_start_index]] + returned_str_l
98 | return returned_str_l
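   | # Editorial example of the splitting behaviour (made-up text):
   | #   s = u'intro\n## a\nfoo\n## b\nbar'
   | #   extract_content_between(s, re.compile(ur'^## ', re.M))
   | #   -> [u'## a\nfoo\n', u'## b\nbar']
   | #   extract_content_between(s, re.compile(ur'^## ', re.M), return_str_before_first_match=True)
   | #   -> [u'intro\n', u'## a\nfoo\n', u'## b\nbar']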
99 | def iter_value_of_key_through_d_l_d_d(obj_d_l_d_d, key_2nd_level, key_4th_level,
100 | expected_draw=1.0, yield_top_key=False, yield_list_index=False):
101 | '''
102 | a function that returns a generator
103 | it iterates through all the values of the first-level dict, with every value itself being a dict
104 | for every such value dict,
105 | a key specified by key_2nd_level is used to access a list
106 | for every element of the list
107 | a key specified by key_4th_level is used to access the corresponding value
108 | so in total it is a two-level nested loop
109 |
110 | key_2nd_level: what it points to must be a list
111 |
112 | expected_draw: roughly control the proportion of the innermost values to be sampled
113 | can be an integer, which will be converted to the corresponding probability
114 |
115 | yield_top_key: whether to include the top key
116 | yield_list_index: whether to include the list index
117 | note that (yield_top_key=False, yield_list_index=True) is a useless combination, so a ValueError is raised
118 | '''
119 | if isinstance(expected_draw, int):
120 | expected_draw = float(expected_draw)/len(obj_d_l_d_d)
121 | assert isinstance(expected_draw, float)
122 | for top_key, value_d_l_d in obj_d_l_d_d.iteritems():
123 | assert isinstance(value_d_l_d[key_2nd_level], list)
124 | for _list_index, value_d in enumerate(value_d_l_d[key_2nd_level]):
125 | if random() <= expected_draw:
126 | if (not yield_top_key) and (not yield_list_index):
127 | yield value_d[key_4th_level]
128 | elif yield_top_key and (not yield_list_index):
129 | yield top_key, value_d[key_4th_level]
130 | elif yield_top_key and yield_list_index:
131 | yield top_key, _list_index, value_d[key_4th_level]
132 | else:
133 | raise ValueError('Invalid Combination of yield_top_key and yield_list_index')
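   | # Editorial note: with the default expected_draw=1.0 every innermost value is yielded;
   | # the call in process_exp_field_for_all_words (convert_new3000.py) uses it as
   | #   for word, i, exp_str in iter_value_of_key_through_d_l_d_d(words_d, 'usages', 'exp',
   | #                                                             yield_top_key=True, yield_list_index=True)
   | # Passing an int for expected_draw converts it to int/len(obj_d_l_d_d), so only a random
   | # subset of values is yielded -- handy for spot-checking the parsed data.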
134 | def iter_through_general(obj_iter, path, yield_flags=True, final_yield_object=None):
135 | '''
136 | iter through an object following the given path
137 | yield_flags: control whether to yield the flags indicating the path at the global level
138 | final_yield_object: internal parameter, don't modify
139 | obj_iter: an iterable variable
140 | path: a sequence, each element has the following structure
141 | (how_to_iter, what_to_iter, yield_flag)
142 | how_to_iter: a str, accept the following values
143 | 'all' or 'all_values': iter through key-value pairs for a dict, and all elements for other types
144 | if yield_flag is True, attach key or index to the final yield object
145 | 'all_keys', only iter through the keys of a dict
146 | obj_iter must be a dict
147 | 'key', iter through the value of a given key
148 | what_to_iter must be a str representing a key in obj_iter
149 | if yield_flag is True, attach key to the final yield object
150 | ignored when obj_iter is not dict
151 | 'keys', iter through the values of a given set of keys
152 | what_to_iter must be a tuple with elements representing keys in obj_iter
153 | if yield_flag is True, attach key to the final yield object
154 | ignored when obj_iter is not dict
155 | 'index', iter through a given element
156 | what_to_iter must be an int within bound
157 | if yield_flag is True, attach index to the final yield object
158 | ignored when obj_iter is dict
159 | 'indexes', iter through the elements with given indexes
160 | what_to_iter must be a list of ints within bounds
161 | if yield_flag is True, attach index to the final yield object
162 | ignored when obj_iter is dict
163 | what_to_iter: content decided by how_to_iter
164 | ignored for the following values of how_to_iter
165 | all, all_values, all_keys
166 | yield_flag: True or False
167 | True: depending on how_to_iter, attach different flags to the final result
168 | False: no flag will be yielded
169 | ignored for the following values of how_to_iter
170 | all_keys
171 | '''
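   | # Editorial example (toy data): a path mirrors the nesting level by level, e.g.
   | #   d = {'w1': {'usages': [{'exp': 'e1'}, {'exp': 'e2'}]}}
   | #   path = [('all', '', True), ('key', 'usages', False), ('all', '', True), ('key', 'exp', False)]
   | #   list(iter_through_general(d, path))
   | #   -> [['w1', 0, 'e1'], ['w1', 1, 'e2']]
   | # Each yielded item is [flag, flag, ..., value], which callers such as process_examples
   | # in convert_new3000.py unpack as `for word, usage_index, example_str in ...`.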
172 | is_dict = isinstance(obj_iter, dict)
173 | if final_yield_object is None:
174 | final_yield_object = []
175 | if len(path) == 0:
176 | if yield_flags:
177 | final_yield_object.append(obj_iter)
178 | yield final_yield_object
179 | else:
180 | yield obj_iter
181 | else:
182 | how_to_iter, what_to_iter, yield_flag = path.pop(0)
183 | assert isinstance(how_to_iter, basestring)
184 | if how_to_iter in [u'all', u'all_values', u'keys', u'indexes']:
185 | if how_to_iter in [u'keys', u'indexes']:
186 | assert hasattr(what_to_iter, '__iter__')
187 | for item in what_to_iter:
188 | if is_dict:
189 | assert how_to_iter == u'keys'
190 | assert isinstance(item, basestring)
191 | assert item in obj_iter
192 | else:
193 | assert how_to_iter == u'indexes'
194 | assert isinstance(item, int)
195 | assert item < len(obj_iter)
196 | temp_iterator = ((item, obj_iter[item]) for item in what_to_iter)
197 | else:
198 | temp_iterator = obj_iter.iteritems() if is_dict else enumerate(obj_iter)
199 | for flag, sub_obj_iter in temp_iterator:
200 | final_yield_object_copy = deepcopy(final_yield_object)
201 | if yield_flag:
202 | final_yield_object_copy.append(flag)
203 | for value in iter_through_general(sub_obj_iter, deepcopy(path), yield_flags, final_yield_object_copy):
204 | yield value
205 | elif how_to_iter == u'all_keys':
206 | assert is_dict
207 | for key in obj_iter.iterkeys():
208 | if yield_flags:
209 | final_yield_object.append(key)
210 | yield final_yield_object
211 | else:
212 | yield key
213 | elif how_to_iter in [u'key', u'index']:
214 | if is_dict:
215 | assert how_to_iter == u'key'
216 | assert isinstance(what_to_iter, basestring)
217 | assert what_to_iter in obj_iter
218 | else:
219 | assert how_to_iter == u'index'
220 | assert isinstance(what_to_iter, int)
221 | assert what_to_iter < len(obj_iter)
222 | sub_obj_iter = obj_iter[what_to_iter]
223 | if yield_flag:
224 | final_yield_object.append(what_to_iter)
225 | for value in iter_through_general(sub_obj_iter, deepcopy(path), yield_flags, final_yield_object):
226 | yield value
227 | else:
228 | raise ValueError('Invalid path')
229 | def reservoir_sample_k(obj_iter, k):
230 | assert isinstance(k, int)
231 | assert hasattr(obj_iter, '__iter__')
232 | # fit into k items
233 | sampled_l = []
234 | for _ in range(k):
235 | sampled_l.append(obj_iter.next())
236 | i = k
237 | for item in obj_iter:
238 | i += 1
239 | j = randint(1, i)
240 | if j <= k:
241 | sampled_l[j-1] = item
242 | return sampled_l
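   | # Editorial note: this is standard reservoir sampling -- after the first k items fill the
   | # reservoir, the i-th item replaces a random slot with probability k/i, so every element of
   | # the (possibly unsized) iterator ends up in the sample with equal probability, e.g.
   | #   reservoir_sample_k(iter(xrange(1000)), 3)  # three integers drawn uniformly, no replacement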
243 | def iter_through_and_sample_k(obj_iter, k, path):
244 | obj_iter_follow_path = iter_through_general(obj_iter, path)
245 | return reservoir_sample_k(obj_iter_follow_path, k)
246 | strip_white_space = lambda _str: _str.replace(' ', '')
247 | new_line_join = lambda str_l: '\n'.join(str_l)
248 | def codecs_open_r_utf8(file_path):
249 | with codecs.open(file_path, 'r', 'utf-8') as f:
250 | returned_str = f.read()
251 | return returned_str
252 | # merge blank lines
253 | def collapse_blank_line(base_str):
254 | match_double_line_feed_re = re.compile(r'\n\n')
255 | while match_double_line_feed_re.search(base_str):
256 | base_str = match_double_line_feed_re.sub(r'\n', base_str)
257 | return base_str
258 | def custom_html_element(_str):
259 | """
260 | convert the markdown notations in a string to html tags
261 | currently, only two kinds of markdown notation exist in all the strings
262 | ** and *
263 | """
264 | formatted_str = _str
265 | # format double asterisk
266 | match_double_asterisk_re = re.compile(u'\*\*(.*?)\*\*')
267 | # replace **...** with <b>...</b>
268 | #formatted_str = match_double_asterisk_re.sub(r'<b>\1</b>', formatted_str)
269 | # replace **...** with <b>...</b>
270 | formatted_str = match_double_asterisk_re.sub(r'<b>\1</b>', formatted_str)
271 | # format single asterisk
272 | # replace *...* with <i>...</i>
273 | match_single_asterisk_re = re.compile(u'\*(.*?)\*')
274 | formatted_str = match_single_asterisk_re.sub(r'<i>\1</i>', formatted_str)
275 | return formatted_str
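   | # Editorial example (assuming the <b>/<i> tags restored above):
   | #   custom_html_element(u'a **bold** and *italic* word')
   | #   -> u'a <b>bold</b> and <i>italic</i> word'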
276 | def is_file_and_json_load(file_name_str):
277 | if os.path.isfile(file_name_str):
278 | with codecs.open(file_name_str, 'r', encoding='utf-8') as f:
279 | json_d = json.load(f)
280 | return json_d
281 |
--------------------------------------------------------------------------------
/pureSalsa20.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding: utf-8
3 |
4 | """
5 | Copyright by https://github.com/zhansliu/writemdict
6 |
7 | pureSalsa20.py -- a pure Python implementation of the Salsa20 cipher, ported to Python 3
8 |
9 | v4.0: Added Python 3 support, dropped support for Python <= 2.5.
10 |
11 | // zhansliu
12 |
13 | Original comments below.
14 |
15 | ====================================================================
16 | There are comments here by two authors about three pieces of software:
17 | comments by Larry Bugbee about
18 | Salsa20, the stream cipher by Daniel J. Bernstein
19 | (including comments about the speed of the C version) and
20 | pySalsa20, Bugbee's own Python wrapper for salsa20.c
21 | (including some references), and
22 | comments by Steve Witham about
23 | pureSalsa20, Witham's pure Python 2.5 implementation of Salsa20,
24 | which follows pySalsa20's API, and is in this file.
25 |
26 | Salsa20: a Fast Streaming Cipher (comments by Larry Bugbee)
27 | -----------------------------------------------------------
28 |
29 | Salsa20 is a fast stream cipher written by Daniel Bernstein
30 | that basically uses a hash function and XOR making for fast
31 | encryption. (Decryption uses the same function.) Salsa20
32 | is simple and quick.
33 |
34 | Some Salsa20 parameter values...
35 | design strength 128 bits
36 | key length 128 or 256 bits, exactly
37 | IV, aka nonce 64 bits, always
38 | chunk size must be in multiples of 64 bytes
39 |
40 | Salsa20 has two reduced versions, 8 and 12 rounds each.
41 |
42 | One benchmark (10 MB):
43 | 1.5GHz PPC G4 102/97/89 MB/sec for 8/12/20 rounds
44 | AMD Athlon 2500+ 77/67/53 MB/sec for 8/12/20 rounds
45 | (no I/O and before Python GC kicks in)
46 |
47 | Salsa20 is a Phase 3 finalist in the EU eSTREAM competition
48 | and appears to be one of the fastest ciphers. It is well
49 | documented so I will not attempt any injustice here. Please
50 | see "References" below.
51 |
52 | ...and Salsa20 is "free for any use".
53 |
54 |
55 | pySalsa20: a Python wrapper for Salsa20 (Comments by Larry Bugbee)
56 | ------------------------------------------------------------------
57 |
58 | pySalsa20.py is a simple ctypes Python wrapper. Salsa20 is
59 | as its name implies, 20 rounds, but there are two reduced
60 | versions, 8 and 12 rounds each. Because the APIs are
61 | identical, pySalsa20 is capable of wrapping all three
62 | versions (number of rounds hardcoded), including a special
63 | version that allows you to set the number of rounds with a
64 | set_rounds() function. Compile the version of your choice
65 | as a shared library (not as a Python extension), name and
66 | install it as libsalsa20.so.
67 |
68 | Sample usage:
69 | from pySalsa20 import Salsa20
70 | s20 = Salsa20(key, IV)
71 | dataout = s20.encryptBytes(datain) # same for decrypt
72 |
73 | This is EXPERIMENTAL software and intended for educational
74 | purposes only. To make experimentation less cumbersome,
75 | pySalsa20 is also free for any use.
76 |
77 | THIS PROGRAM IS PROVIDED WITHOUT WARRANTY OR GUARANTEE OF
78 | ANY KIND. USE AT YOUR OWN RISK.
79 |
80 | Enjoy,
81 |
82 | Larry Bugbee
83 | bugbee@seanet.com
84 | April 2007
85 |
86 |
87 | References:
88 | -----------
89 | http://en.wikipedia.org/wiki/Salsa20
90 | http://en.wikipedia.org/wiki/Daniel_Bernstein
91 | http://cr.yp.to/djb.html
92 | http://www.ecrypt.eu.org/stream/salsa20p3.html
93 | http://www.ecrypt.eu.org/stream/p3ciphers/salsa20/salsa20_p3source.zip
94 |
95 |
96 | Prerequisites for pySalsa20:
97 | ----------------------------
98 | - Python 2.5 (haven't tested in 2.4)
99 |
100 |
101 | pureSalsa20: Salsa20 in pure Python 2.5 (comments by Steve Witham)
102 | ------------------------------------------------------------------
103 |
104 | pureSalsa20 is the stand-alone Python code in this file.
105 | It implements the underlying Salsa20 core algorithm
106 | and emulates pySalsa20's Salsa20 class API (minus a bug(*)).
107 |
108 | pureSalsa20 is MUCH slower than libsalsa20.so wrapped with pySalsa20--
109 | about 1/1000 the speed for Salsa20/20 and 1/500 the speed for Salsa20/8,
110 | when encrypting 64k-byte blocks on my computer.
111 |
112 | pureSalsa20 is for cases where portability is much more important than
113 | speed. I wrote it for use in a "structured" random number generator.
114 |
115 | There are comments about the reasons for this slowness in
116 | http://www.tiac.net/~sw/2010/02/PureSalsa20
117 |
118 | Sample usage:
119 | from pureSalsa20 import Salsa20
120 | s20 = Salsa20(key, IV)
121 | dataout = s20.encryptBytes(datain) # same for decrypt
122 |
123 | I took the test code from pySalsa20, added a bunch of tests including
124 | rough speed tests, and moved them into the file testSalsa20.py.
125 | To test both pySalsa20 and pureSalsa20, type
126 | python testSalsa20.py
127 |
128 | (*)The bug (?) in pySalsa20 is this. The rounds variable is global to the
129 | libsalsa20.so library and not switched when switching between instances
130 | of the Salsa20 class.
131 | s1 = Salsa20( key, IV, 20 )
132 | s2 = Salsa20( key, IV, 8 )
133 | In this example,
134 | with pySalsa20, both s1 and s2 will do 8 rounds of encryption.
135 | with pureSalsa20, s1 will do 20 rounds and s2 will do 8 rounds.
136 | Perhaps giving each instance its own nRounds variable, which
137 | is passed to the salsa20wordtobyte() function, is insecure. I'm not a
138 | cryptographer.
139 |
140 | pureSalsa20.py and testSalsa20.py are EXPERIMENTAL software and
141 | intended for educational purposes only. To make experimentation less
142 | cumbersome, pureSalsa20.py and testSalsa20.py are free for any use.
143 |
144 | Revisions:
145 | ----------
146 | p3.2 Fixed bug that initialized the output buffer with plaintext!
147 | Saner ramping of nreps in speed test.
148 | Minor changes and print statements.
149 | p3.1 Took timing variability out of add32() and rot32().
150 | Made the internals more like pySalsa20/libsalsa .
151 | Put the semicolons back in the main loop!
152 | In encryptBytes(), modify a byte array instead of appending.
153 | Fixed speed calculation bug.
154 | Used subclasses instead of patches in testSalsa20.py .
155 | Added 64k-byte messages to speed test to be fair to pySalsa20.
156 | p3 First version, intended to parallel pySalsa20 version 3.
157 |
158 | More references:
159 | ----------------
160 | http://www.seanet.com/~bugbee/crypto/salsa20/ [pySalsa20]
161 | http://cr.yp.to/snuffle.html [The original name of Salsa20]
162 | http://cr.yp.to/snuffle/salsafamily-20071225.pdf [ Salsa20 design]
163 | http://www.tiac.net/~sw/2010/02/PureSalsa20
164 |
165 | THIS PROGRAM IS PROVIDED WITHOUT WARRANTY OR GUARANTEE OF
166 | ANY KIND. USE AT YOUR OWN RISK.
167 |
168 | Cheers,
169 |
170 | Steve Witham sw at remove-this tiac dot net
171 | February, 2010
172 | """
173 | import sys
174 | assert(sys.version_info >= (2, 6))
175 |
176 | if sys.version_info >= (3,):
177 | integer_types = (int,)
178 | python3 = True
179 | else:
180 | integer_types = (int, long)
181 | python3 = False
182 |
183 | from struct import Struct
184 | little_u64 = Struct( "= 2**64"
238 | ctx = self.ctx
239 | ctx[ 8],ctx[ 9] = little2_i32.unpack( little_u64.pack( counter ) )
240 |
241 | def getCounter( self ):
242 | return little_u64.unpack( little2_i32.pack( *self.ctx[ 8:10 ] ) ) [0]
243 |
244 |
245 | def setRounds(self, rounds, testing=False ):
246 | assert testing or rounds in [8, 12, 20], 'rounds must be 8, 12, 20'
247 | self.rounds = rounds
248 |
249 |
250 | def encryptBytes(self, data):
251 | assert type(data) == bytes, 'data must be byte string'
252 | assert self._lastChunk64, 'previous chunk not multiple of 64 bytes'
253 | lendata = len(data)
254 | munged = bytearray(lendata)
255 | for i in range( 0, lendata, 64 ):
256 | h = salsa20_wordtobyte( self.ctx, self.rounds, checkRounds=False )
257 | self.setCounter( ( self.getCounter() + 1 ) % 2**64 )
258 | # Stopping at 2^70 bytes per nonce is user's responsibility.
259 | for j in range( min( 64, lendata - i ) ):
260 | if python3:
261 | munged[ i+j ] = data[ i+j ] ^ h[j]
262 | else:
263 | munged[ i+j ] = ord(data[ i+j ]) ^ ord(h[j])
264 |
265 | self._lastChunk64 = not lendata % 64
266 | return bytes(munged)
267 |
268 | decryptBytes = encryptBytes # encrypt and decrypt use same function
269 |
270 | #--------------------------------------------------------------------------
271 |
272 | def salsa20_wordtobyte( input, nRounds=20, checkRounds=True ):
273 | """ Do nRounds Salsa20 rounds on a copy of
274 | input: list or tuple of 16 ints treated as little-endian unsigneds.
275 | Returns a 64-byte string.
276 | """
277 |
278 | assert( type(input) in ( list, tuple ) and len(input) == 16 )
279 | assert( not(checkRounds) or ( nRounds in [ 8, 12, 20 ] ) )
280 |
281 | x = list( input )
282 |
283 | def XOR( a, b ): return a ^ b
284 | ROTATE = rot32
285 | PLUS = add32
286 |
287 | for i in range( nRounds // 2 ):
288 | # These ...XOR...ROTATE...PLUS... lines are from ecrypt-linux.c
289 | # unchanged except for indents and the blank line between rounds:
290 | x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 0],x[12]), 7));
291 | x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[ 4],x[ 0]), 9));
292 | x[12] = XOR(x[12],ROTATE(PLUS(x[ 8],x[ 4]),13));
293 | x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[12],x[ 8]),18));
294 | x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 5],x[ 1]), 7));
295 | x[13] = XOR(x[13],ROTATE(PLUS(x[ 9],x[ 5]), 9));
296 | x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[13],x[ 9]),13));
297 | x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 1],x[13]),18));
298 | x[14] = XOR(x[14],ROTATE(PLUS(x[10],x[ 6]), 7));
299 | x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[14],x[10]), 9));
300 | x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 2],x[14]),13));
301 | x[10] = XOR(x[10],ROTATE(PLUS(x[ 6],x[ 2]),18));
302 | x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[15],x[11]), 7));
303 | x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 3],x[15]), 9));
304 | x[11] = XOR(x[11],ROTATE(PLUS(x[ 7],x[ 3]),13));
305 | x[15] = XOR(x[15],ROTATE(PLUS(x[11],x[ 7]),18));
306 |
307 | x[ 1] = XOR(x[ 1],ROTATE(PLUS(x[ 0],x[ 3]), 7));
308 | x[ 2] = XOR(x[ 2],ROTATE(PLUS(x[ 1],x[ 0]), 9));
309 | x[ 3] = XOR(x[ 3],ROTATE(PLUS(x[ 2],x[ 1]),13));
310 | x[ 0] = XOR(x[ 0],ROTATE(PLUS(x[ 3],x[ 2]),18));
311 | x[ 6] = XOR(x[ 6],ROTATE(PLUS(x[ 5],x[ 4]), 7));
312 | x[ 7] = XOR(x[ 7],ROTATE(PLUS(x[ 6],x[ 5]), 9));
313 | x[ 4] = XOR(x[ 4],ROTATE(PLUS(x[ 7],x[ 6]),13));
314 | x[ 5] = XOR(x[ 5],ROTATE(PLUS(x[ 4],x[ 7]),18));
315 | x[11] = XOR(x[11],ROTATE(PLUS(x[10],x[ 9]), 7));
316 | x[ 8] = XOR(x[ 8],ROTATE(PLUS(x[11],x[10]), 9));
317 | x[ 9] = XOR(x[ 9],ROTATE(PLUS(x[ 8],x[11]),13));
318 | x[10] = XOR(x[10],ROTATE(PLUS(x[ 9],x[ 8]),18));
319 | x[12] = XOR(x[12],ROTATE(PLUS(x[15],x[14]), 7));
320 | x[13] = XOR(x[13],ROTATE(PLUS(x[12],x[15]), 9));
321 | x[14] = XOR(x[14],ROTATE(PLUS(x[13],x[12]),13));
322 | x[15] = XOR(x[15],ROTATE(PLUS(x[14],x[13]),18));
323 |
324 | for i in range( len( input ) ):
325 | x[i] = PLUS( x[i], input[i] )
326 | return little16_i32.pack( *x )
327 |
328 | #--------------------------- 32-bit ops -------------------------------
329 |
330 | def trunc32( w ):
331 | """ Return the bottom 32 bits of w as a Python int.
332 | This creates longs temporarily, but returns an int. """
333 | w = int( ( w & 0x7fffFFFF ) | -( w & 0x80000000 ) )
334 | assert type(w) == int
335 | return w
336 |
337 |
338 | def add32( a, b ):
339 | """ Add two 32-bit words discarding carry above 32nd bit,
340 | and without creating a Python long.
341 | Timing shouldn't vary.
342 | """
343 | lo = ( a & 0xFFFF ) + ( b & 0xFFFF )
344 | hi = ( a >> 16 ) + ( b >> 16 ) + ( lo >> 16 )
345 | return ( -(hi & 0x8000) | ( hi & 0x7FFF ) ) << 16 | ( lo & 0xFFFF )
346 |
347 |
348 | def rot32( w, nLeft ):
349 | """ Rotate 32-bit word left by nLeft or right by -nLeft
350 | without creating a Python long.
351 | Timing depends on nLeft but not on w.
352 | """
353 | nLeft &= 31 # which makes nLeft >= 0
354 | if nLeft == 0:
355 | return w
356 |
357 | # Note: now 1 <= nLeft <= 31.
358 | # RRRsLLLLLL There are nLeft RRR's, (31-nLeft) LLLLLL's,
359 | # => sLLLLLLRRR and one s which becomes the sign bit.
360 | RRR = ( ( ( w >> 1 ) & 0x7fffFFFF ) >> ( 31 - nLeft ) )
361 | sLLLLLL = -( (1<<(31-nLeft)) & w ) | (0x7fffFFFF>>nLeft) & w
362 | return RRR | ( sLLLLLL << nLeft )
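   | # Editorial note: both add32() and rot32() keep values in signed 32-bit form to avoid
   | # creating Python longs; for example
   | #   add32(0x7fffFFFF, 1) == -0x80000000   # same low 32 bits as 0x80000000, stored signed
   | #   rot32(1, 1) == 2
   | #   rot32(-0x80000000, 1) == 1            # the old sign bit wraps around to bit 0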
363 |
364 |
365 | # --------------------------------- end -----------------------------------
366 |
--------------------------------------------------------------------------------
/readmdict.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | # readmdict.py
4 | # Octopus MDict Dictionary File (.mdx) and Resource File (.mdd) Analyser
5 | #
6 | # Copyright (C) 2012, 2013, 2015 Xiaoqiang Wang
7 | #
8 | # This program is a free software; you can redistribute it and/or modify
9 | # it under the terms of the GNU General Public License as published by
10 | # the Free Software Foundation, version 3 of the License.
11 | #
12 | # You can get a copy of GNU General Public License along this program
13 | # But you can always get it from http://www.gnu.org/licenses/gpl.txt
14 | #
15 | # This program is distributed in the hope that it will be useful,
16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18 | # GNU General Public License for more details.
19 |
20 | from struct import pack, unpack
21 | from io import BytesIO
22 | import re
23 |
24 | from ripemd128 import ripemd128
25 | from pureSalsa20 import Salsa20
26 |
27 | # zlib compression is used for engine version >=2.0
28 | import zlib
29 | # LZO compression is used for engine version < 2.0
30 | try:
31 | import lzo
32 | except ImportError:
33 | lzo = None
34 | print("LZO compression support is not available")
35 |
36 |
37 | def _unescape_entities(text):
38 | """
39 | unescape offending tags &lt; &gt; &quot; &amp;
40 | """
41 | text = text.replace(b'&lt;', b'<')
42 | text = text.replace(b'&gt;', b'>')
43 | text = text.replace(b'&quot;', b'"')
44 | text = text.replace(b'&amp;', b'&')
45 | return text
46 |
47 |
48 | def _fast_decrypt(data, key):
49 | b = bytearray(data)
50 | key = bytearray(key)
51 | previous = 0x36
52 | for i in range(len(b)):
53 | t = (b[i] >> 4 | b[i] << 4) & 0xff
54 | t = t ^ previous ^ (i & 0xff) ^ key[i % len(key)]
55 | previous = b[i]
56 | b[i] = t
57 | return bytes(b)
58 |
59 |
60 | def _mdx_decrypt(comp_block):
61 | key = ripemd128(comp_block[4:8] + pack(b'
116 | """
117 | taglist = re.findall(b'(\w+)="(.*?)"', header, re.DOTALL)
118 | tagdict = {}
119 | for key, value in taglist:
120 | tagdict[key] = _unescape_entities(value)
121 | return tagdict
122 |
123 | def _decode_key_block_info(self, key_block_info_compressed):
124 | if self._version >= 2:
125 | # zlib compression
126 | assert(key_block_info_compressed[:4] == b'\x02\x00\x00\x00')
127 | # decrypt if needed
128 | if self._encrypt & 0x02:
129 | key_block_info_compressed = _mdx_decrypt(key_block_info_compressed)
130 | # decompress
131 | key_block_info = zlib.decompress(key_block_info_compressed[8:])
132 | # adler checksum
133 | adler32 = unpack('>I', key_block_info_compressed[4:8])[0]
134 | assert(adler32 == zlib.adler32(key_block_info) & 0xffffffff)
135 | else:
136 | # no compression
137 | key_block_info = key_block_info_compressed
138 | # decode
139 | key_block_info_list = []
140 | num_entries = 0
141 | i = 0
142 | if self._version >= 2:
143 | byte_format = '>H'
144 | byte_width = 2
145 | text_term = 1
146 | else:
147 | byte_format = '>B'
148 | byte_width = 1
149 | text_term = 0
150 |
151 | while i < len(key_block_info):
152 | # number of entries in current key block
153 | num_entries += unpack(self._number_format, key_block_info[i:i+self._number_width])[0]
154 | i += self._number_width
155 | # text head size
156 | text_head_size = unpack(byte_format, key_block_info[i:i+byte_width])[0]
157 | i += byte_width
158 | # text head
159 | if self._encoding != 'UTF-16':
160 | i += text_head_size + text_term
161 | else:
162 | i += (text_head_size + text_term) * 2
163 | # text tail size
164 | text_tail_size = unpack(byte_format, key_block_info[i:i+byte_width])[0]
165 | i += byte_width
166 | # text tail
167 | if self._encoding != 'UTF-16':
168 | i += text_tail_size + text_term
169 | else:
170 | i += (text_tail_size + text_term) * 2
171 | # key block compressed size
172 | key_block_compressed_size = unpack(self._number_format, key_block_info[i:i+self._number_width])[0]
173 | i += self._number_width
174 | # key block decompressed size
175 | key_block_decompressed_size = unpack(self._number_format, key_block_info[i:i+self._number_width])[0]
176 | i += self._number_width
177 | key_block_info_list += [(key_block_compressed_size, key_block_decompressed_size)]
178 |
179 | #assert(num_entries == self._num_entries)
180 |
181 | return key_block_info_list
182 |
183 | def _decode_key_block(self, key_block_compressed, key_block_info_list):
184 | key_list = []
185 | i = 0
186 | for compressed_size, decompressed_size in key_block_info_list:
187 | start = i
188 | end = i + compressed_size
189 | # 4 bytes : compression type
190 | key_block_type = key_block_compressed[start:start+4]
191 | # 4 bytes : adler checksum of decompressed key block
192 | adler32 = unpack('>I', key_block_compressed[start+4:start+8])[0]
193 | if key_block_type == b'\x00\x00\x00\x00':
194 | key_block = key_block_compressed[start+8:end]
195 | elif key_block_type == b'\x01\x00\x00\x00':
196 | if lzo is None:
197 | print("LZO compression is not supported")
198 | break
199 | # decompress key block
200 | header = b'\xf0' + pack('>I', decompressed_size)
201 | key_block = lzo.decompress(header + key_block_compressed[start+8:end])
202 | elif key_block_type == b'\x02\x00\x00\x00':
203 | # decompress key block
204 | key_block = zlib.decompress(key_block_compressed[start+8:end])
205 | # extract one single key block into a key list
206 | key_list += self._split_key_block(key_block)
207 | # notice that adler32 returns signed value
208 | assert(adler32 == zlib.adler32(key_block) & 0xffffffff)
209 |
210 | i += compressed_size
211 | return key_list
212 |
213 | def _split_key_block(self, key_block):
214 | key_list = []
215 | key_start_index = 0
216 | while key_start_index < len(key_block):
217 | # the corresponding record's offset in record block
218 | key_id = unpack(self._number_format, key_block[key_start_index:key_start_index+self._number_width])[0]
219 | # key text ends with '\x00'
220 | if self._encoding == 'UTF-16':
221 | delimiter = b'\x00\x00'
222 | width = 2
223 | else:
224 | delimiter = b'\x00'
225 | width = 1
226 | i = key_start_index + self._number_width
227 | while i < len(key_block):
228 | if key_block[i:i+width] == delimiter:
229 | key_end_index = i
230 | break
231 | i += width
232 | key_text = key_block[key_start_index+self._number_width:key_end_index]\
233 | .decode(self._encoding, errors='ignore').encode('utf-8').strip()
234 | key_start_index = key_end_index + width
235 | key_list += [(key_id, key_text)]
236 | return key_list
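   | # Editorial note on the layout decoded above: a decompressed key block is a sequence of
   | #   [key_id : _number_width bytes, big-endian][key_text : bytes ending in '\x00'
   | #   (or '\x00\x00', stepping 2 bytes at a time, when the encoding is UTF-16)]
   | # key_id is the corresponding record's offset inside the record block, which the record
   | # block decoders below use to slice each entry out of its decompressed block.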
237 |
238 | def _read_header(self):
239 | import sys
240 | f = open(self._fname, 'rb')
241 | # number of bytes of header text
242 | header_bytes_size = unpack('>I', f.read(4))[0]
243 | header_bytes = f.read(header_bytes_size)
244 | # 4 bytes: adler32 checksum of header, in little endian
245 | adler32 = unpack('= 0x03000000:
257 | encoding = encoding.decode('utf-8')
258 | # GB18030 > GBK > GB2312
259 | if encoding in ['GBK', 'GB2312']:
260 | encoding = 'GB18030'
261 | self._encoding = encoding
262 | # encryption flag
263 | # 0x00 - no encryption
264 | # 0x01 - encrypt record block
265 | # 0x02 - encrypt key info block
266 | if header_tag[b'Encrypted'] == b'No':
267 | self._encrypt = 0
268 | elif header_tag[b'Encrypted'] == b'Yes':
269 | self._encrypt = 1
270 | else:
271 | self._encrypt = int(header_tag[b'Encrypted'])
272 |
273 | # stylesheet attribute if present takes form of:
274 | # style_number # 1-255
275 | # style_begin # or ''
276 | # style_end # or ''
277 | # store stylesheet in dict in the form of
278 | # {'number' : ('style_begin', 'style_end')}
279 | self._stylesheet = {}
280 | if header_tag.get('StyleSheet'):
281 | lines = header_tag['StyleSheet'].splitlines()
282 | for i in range(0, len(lines), 3):
283 | self._stylesheet[lines[i]] = (lines[i+1], lines[i+2])
284 |
285 | # before version 2.0, number is 4 bytes integer
286 | # version 2.0 and above uses 8 bytes
287 | self._version = float(header_tag[b'GeneratedByEngineVersion'])
288 | if self._version < 2.0:
289 | self._number_width = 4
290 | self._number_format = '>I'
291 | else:
292 | self._number_width = 8
293 | self._number_format = '>Q'
294 |
295 | return header_tag
296 |
297 | def _read_keys(self):
298 | f = open(self._fname, 'rb')
299 | f.seek(self._key_block_offset)
300 |
301 | # the following numbers could be encrypted
302 | if self._version >= 2.0:
303 | num_bytes = 8 * 5
304 | else:
305 | num_bytes = 4 * 4
306 | block = f.read(num_bytes)
307 |
308 | if self._encrypt & 1:
309 | if self._passcode is None:
310 | raise RuntimeError('user identification is needed to read encrypted file')
311 | regcode, userid = self._passcode
312 | if self.header['RegisterBy'] == 'EMail':
313 | encrypted_key = _decrypt_regcode_by_email(regcode.decode('hex'), userid)
314 | else:
315 | encrypted_key = _decrypt_regcode_by_deviceid(regcode.decode('hex'), userid)
316 | block = _salsa_decrypt(block, encrypted_key)
317 |
318 | # decode this block
319 | sf = BytesIO(block)
320 | # number of key blocks
321 | num_key_blocks = self._read_number(sf)
322 | # number of entries
323 | self._num_entries = self._read_number(sf)
324 | # number of bytes of key block info after decompression
325 | if self._version >= 2.0:
326 | key_block_info_decomp_size = self._read_number(sf)
327 | # number of bytes of key block info
328 | key_block_info_size = self._read_number(sf)
329 | # number of bytes of key block
330 | key_block_size = self._read_number(sf)
331 |
332 | # 4 bytes: adler checksum of previous 5 numbers
333 | if self._version >= 2.0:
334 | adler32 = unpack('>I', f.read(4))[0]
335 | assert adler32 == (zlib.adler32(block) & 0xffffffff)
336 |
337 | # read key block info, which indicates key block's compressed and decompressed size
338 | key_block_info = f.read(key_block_info_size)
339 | key_block_info_list = self._decode_key_block_info(key_block_info)
340 | assert(num_key_blocks == len(key_block_info_list))
341 |
342 | # read key block
343 | key_block_compressed = f.read(key_block_size)
344 | # extract key block
345 | key_list = self._decode_key_block(key_block_compressed, key_block_info_list)
346 |
347 | self._record_block_offset = f.tell()
348 | f.close()
349 |
350 | return key_list
351 |
352 | def _read_keys_brutal(self):
353 | f = open(self._fname, 'rb')
354 | f.seek(self._key_block_offset)
355 |
356 | # the following numbers could be encrypted, disregard them!
357 | if self._version >= 2.0:
358 | num_bytes = 8 * 5 + 4
359 | key_block_type = b'\x02\x00\x00\x00'
360 | else:
361 | num_bytes = 4 * 4
362 | key_block_type = b'\x01\x00\x00\x00'
363 | block = f.read(num_bytes)
364 |
365 | # key block info
366 | # 4 bytes '\x02\x00\x00\x00'
367 | # 4 bytes adler32 checksum
368 | # unknown number of bytes follows until '\x02\x00\x00\x00' which marks the beginning of key block
369 | key_block_info = f.read(8)
370 | if self._version >= 2.0:
371 | assert key_block_info[:4] == b'\x02\x00\x00\x00'
372 | while True:
373 | fpos = f.tell()
374 | t = f.read(1024)
375 | index = t.find(key_block_type)
376 | if index != -1:
377 | key_block_info += t[:index]
378 | f.seek(fpos + index)
379 | break
380 | else:
381 | key_block_info += t
382 |
383 | key_block_info_list = self._decode_key_block_info(key_block_info)
384 | key_block_size = sum(list(zip(*key_block_info_list))[0])
385 |
386 | # read key block
387 | key_block_compressed = f.read(key_block_size)
388 | # extract key block
389 | key_list = self._decode_key_block(key_block_compressed, key_block_info_list)
390 |
391 | self._record_block_offset = f.tell()
392 | f.close()
393 |
394 | self._num_entries = len(key_list)
395 | return key_list
396 |
397 |
398 | class MDD(MDict):
399 | """
400 | MDict resource file format (*.MDD) reader.
401 | >>> mdd = MDD('example.mdd')
402 | >>> len(mdd)
403 | 208
404 | >>> for filename,content in mdd.items():
405 | ... print filename, content[:10]
406 | """
407 | def __init__(self, fname, passcode=None):
408 | MDict.__init__(self, fname, encoding='UTF-16', passcode=passcode)
409 |
410 | def items(self):
411 | """Return a generator which in turn produce tuples in the form of (filename, content)
412 | """
413 | return self._decode_record_block()
414 |
415 | def _decode_record_block(self):
416 | f = open(self._fname, 'rb')
417 | f.seek(self._record_block_offset)
418 |
419 | num_record_blocks = self._read_number(f)
420 | num_entries = self._read_number(f)
421 | assert(num_entries == self._num_entries)
422 | record_block_info_size = self._read_number(f)
423 | record_block_size = self._read_number(f)
424 |
425 | # record block info section
426 | record_block_info_list = []
427 | size_counter = 0
428 | for i in range(num_record_blocks):
429 | compressed_size = self._read_number(f)
430 | decompressed_size = self._read_number(f)
431 | record_block_info_list += [(compressed_size, decompressed_size)]
432 | size_counter += self._number_width * 2
433 | assert(size_counter == record_block_info_size)
434 |
435 | # actual record block
436 | offset = 0
437 | i = 0
438 | size_counter = 0
439 | for compressed_size, decompressed_size in record_block_info_list:
440 | record_block_compressed = f.read(compressed_size)
441 | # 4 bytes: compression type
442 | record_block_type = record_block_compressed[:4]
443 | # 4 bytes: adler32 checksum of decompressed record block
444 | adler32 = unpack('>I', record_block_compressed[4:8])[0]
445 | if record_block_type == b'\x00\x00\x00\x00':
446 | record_block = record_block_compressed[8:]
447 | elif record_block_type == b'\x01\x00\x00\x00':
448 | if lzo is None:
449 | print("LZO compression is not supported")
450 | break
451 | # decompress
452 | header = b'\xf0' + pack('>I', decompressed_size)
453 | record_block = lzo.decompress(header + record_block_compressed[8:])
454 | elif record_block_type == b'\x02\x00\x00\x00':
455 | # decompress
456 | record_block = zlib.decompress(record_block_compressed[8:])
457 |
458 | # notice that adler32 return signed value
459 | assert(adler32 == zlib.adler32(record_block) & 0xffffffff)
460 |
461 | assert(len(record_block) == decompressed_size)
462 | # split record block according to the offset info from key block
463 | while i < len(self._key_list):
464 | record_start, key_text = self._key_list[i]
465 | # reach the end of current record block
466 | if record_start - offset >= len(record_block):
467 | break
468 | # record end index
469 | if i < len(self._key_list)-1:
470 | record_end = self._key_list[i+1][0]
471 | else:
472 | record_end = len(record_block) + offset
473 | i += 1
474 | data = record_block[record_start-offset:record_end-offset]
475 | yield key_text, data
476 | offset += len(record_block)
477 | size_counter += compressed_size
478 | assert(size_counter == record_block_size)
479 |
480 | f.close()
481 |
482 |
483 | class MDX(MDict):
484 | """
485 | MDict dictionary file format (*.MDX) reader.
486 | >>> mdx = MDX('example.mdx')
487 | >>> len(mdx)
488 | 42481
489 | >>> for key,value in mdx.items():
490 | ... print key, value[:10]
491 | """
492 | def __init__(self, fname, encoding='', substyle=False, passcode=None):
493 | MDict.__init__(self, fname, encoding, passcode)
494 | self._substyle = substyle
495 |
496 | def items(self):
497 | """Return a generator which in turn produce tuples in the form of (key, value)
498 | """
499 | return self._decode_record_block()
500 |
501 | def _substitute_stylesheet(self, txt):
502 | # substitute stylesheet definition
503 | txt_list = re.split(r'`\d+`', txt)
504 | txt_tag = re.findall(r'`\d+`', txt)
505 | txt_styled = txt_list[0]
506 | for j, p in enumerate(txt_list[1:]):
507 | style = self._stylesheet[txt_tag[j][1:-1]]
508 | if p and p[-1] == '\n':
509 | txt_styled = txt_styled + style[0] + p.rstrip() + style[1] + '\r\n'
510 | else:
511 | txt_styled = txt_styled + style[0] + p + style[1]
512 | return txt_styled
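# Illustrative walk-through of the substitution above (hypothetical values, not
# taken from any real dictionary): with self._stylesheet == {'1': ('<b>', '</b>')}
# and txt == 'pre `1`word\n', re.split/re.findall yield txt_list == ['pre ', 'word\n']
# and txt_tag == ['`1`'], so the loop builds 'pre <b>word</b>\r\n' via the
# trailing-newline branch.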
513 |
514 | def _decode_record_block(self):
515 | f = open(self._fname, 'rb')
516 | f.seek(self._record_block_offset)
517 |
518 | num_record_blocks = self._read_number(f)
519 | num_entries = self._read_number(f)
520 | assert(num_entries == self._num_entries)
521 | record_block_info_size = self._read_number(f)
522 | record_block_size = self._read_number(f)
523 |
524 | # record block info section
525 | record_block_info_list = []
526 | size_counter = 0
527 | for i in range(num_record_blocks):
528 | compressed_size = self._read_number(f)
529 | decompressed_size = self._read_number(f)
530 | record_block_info_list += [(compressed_size, decompressed_size)]
531 | size_counter += self._number_width * 2
532 | assert(size_counter == record_block_info_size)
533 |
534 | # actual record block data
535 | offset = 0
536 | i = 0
537 | size_counter = 0
538 | for compressed_size, decompressed_size in record_block_info_list:
539 | record_block_compressed = f.read(compressed_size)
540 | # 4 bytes indicates block compression type
541 | record_block_type = record_block_compressed[:4]
542 | # 4 bytes adler checksum of uncompressed content
543 | adler32 = unpack('>I', record_block_compressed[4:8])[0]
544 | # no compression
545 | if record_block_type == b'\x00\x00\x00\x00':
546 | record_block = record_block_compressed[8:]
547 | # lzo compression
548 | elif record_block_type == b'\x01\x00\x00\x00':
549 | if lzo is None:
550 | print("LZO compression is not supported")
551 | break
552 | # decompress
553 | header = b'\xf0' + pack('>I', decompressed_size)
554 | record_block = lzo.decompress(header + record_block_compressed[8:])
555 | # zlib compression
556 | elif record_block_type == b'\x02\x00\x00\x00':
557 | # decompress
558 | record_block = zlib.decompress(record_block_compressed[8:])
559 |
560 | # notice that adler32 return signed value
561 | assert(adler32 == zlib.adler32(record_block) & 0xffffffff)
562 |
563 | assert(len(record_block) == decompressed_size)
564 | # split record block according to the offset info from key block
565 | while i < len(self._key_list):
566 | record_start, key_text = self._key_list[i]
567 | # reach the end of current record block
568 | if record_start - offset >= len(record_block):
569 | break
570 | # record end index
571 | if i < len(self._key_list)-1:
572 | record_end = self._key_list[i+1][0]
573 | else:
574 | record_end = len(record_block) + offset
575 | i += 1
576 | record = record_block[record_start-offset:record_end-offset]
577 | # convert to utf-8
578 | record = record.decode(self._encoding, errors='ignore').strip(u'\x00').encode('utf-8')
579 | # substitute styles
580 | if self._substyle and self._stylesheet:
581 | record = self._substitute_stylesheet(record)
582 |
583 | yield key_text, record
584 | offset += len(record_block)
585 | size_counter += compressed_size
586 | assert(size_counter == record_block_size)
587 |
588 | f.close()
589 |
590 |
591 | if __name__ == '__main__':
592 | import sys
593 | import os
594 | import os.path
595 | import argparse
596 |
597 | # Python 2 and 3 compatible
598 | if sys.hexversion >= 0x03000000:
599 | unicode = str
600 |
601 | def passcode(s):
602 | try:
603 | regcode, userid = s.split(',')
604 | except:
605 | raise argparse.ArgumentTypeError("Passcode must be regcode,userid")
606 | try:
607 | regcode.decode('hex')
608 | except:
609 | raise argparse.ArgumentTypeError("regcode must be a 32 bytes hexadecimal string")
610 | return regcode, userid
611 |
612 | parser = argparse.ArgumentParser()
613 | parser.add_argument('-x', '--extract', action="store_true",
614 | help='extract mdx to source format and extract files from mdd')
615 | parser.add_argument('-s', '--substyle', action="store_true",
616 | help='substitute style definition if present')
617 | parser.add_argument('-d', '--datafolder', default="data",
618 | help='folder to extract data files from mdd')
619 | parser.add_argument('-e', '--encoding', default="",
620 | help='override the encoding specified in the file header')
621 | parser.add_argument('-p', '--passcode', default=None, type=passcode,
622 | help='register_code,email_or_deviceid')
623 | parser.add_argument("filename", nargs='?', help="mdx file name")
624 | args = parser.parse_args()
625 |
626 | # use GUI to select file, default to extract
627 | if not args.filename:
628 | import Tkinter
629 | import tkFileDialog
630 | root = Tkinter.Tk()
631 | root.withdraw()
632 | args.filename = tkFileDialog.askopenfilename(parent=root)
633 | args.extract = True
634 |
635 | if not os.path.exists(args.filename):
636 | print("Please specify a valid MDX/MDD file")
637 |
638 | base, ext = os.path.splitext(args.filename)
639 |
640 | # read mdx file
641 | if ext.lower() == os.path.extsep + 'mdx':
642 | mdx = MDX(args.filename, args.encoding, args.substyle, args.passcode)
643 | if type(args.filename) is unicode:
644 | bfname = args.filename.encode('utf-8')
645 | else:
646 | bfname = args.filename
647 | print('======== %s ========' % bfname)
648 | print(' Number of Entries : %d' % len(mdx))
649 | for key, value in mdx.header.items():
650 | print(' %s : %s' % (key, value))
651 | else:
652 | mdx = None
653 |
654 | # find companion mdd file
655 | mdd_filename = ''.join([base, os.path.extsep, 'mdd'])
656 | if os.path.exists(mdd_filename):
657 | mdd = MDD(mdd_filename, args.passcode)
658 | if type(mdd_filename) is unicode:
659 | bfname = mdd_filename.encode('utf-8')
660 | else:
661 | bfname = mdd_filename
662 | print('======== %s ========' % bfname)
663 | print(' Number of Entries : %d' % len(mdd))
664 | for key, value in mdd.header.items():
665 | print(' %s : %s' % (key, value))
666 | else:
667 | mdd = None
668 |
669 | if args.extract:
670 | # write out glos
671 | if mdx:
672 | output_fname = ''.join([base, os.path.extsep, 'txt'])
673 | tf = open(output_fname, 'wb')
674 | for key, value in mdx.items():
675 | tf.write(key)
676 | tf.write(b'\r\n')
677 | tf.write(value)
678 | tf.write(b'\r\n')
679 | tf.write(b'>\r\n')
680 | tf.close()
681 | # write out style
682 | if mdx.header.get('StyleSheet'):
683 | style_fname = ''.join([base, '_style', os.path.extsep, 'txt'])
684 | sf = open(style_fname, 'wb')
685 | sf.write(b'\r\n'.join(mdx.header['StyleSheet'].splitlines()))
686 | sf.close()
687 | # write out optional data files
688 | if mdd:
689 | datafolder = os.path.join(os.path.dirname(args.filename), args.datafolder)
690 | if not os.path.exists(datafolder):
691 | os.makedirs(datafolder)
692 | for key, value in mdd.items():
693 | dfname = ''.join([datafolder, key.replace('\\', os.path.sep).decode('utf-8')])
694 | if not os.path.exists(os.path.dirname(dfname)):
695 | os.makedirs(os.path.dirname(dfname))
696 | df = open(dfname, 'wb')
697 | df.write(value)
698 | df.close()
699 |
--------------------------------------------------------------------------------
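A minimal usage sketch for the MDX/MDD readers defined above (hedged: 'example.mdx' and 'example.mdd' are placeholder file names, the loop bodies are illustrative only, and the snippet targets Python 2 to match the module):

    from readmdict import MDX, MDD

    mdx = MDX('example.mdx', substyle=True)   # substyle expands `n` markers using the StyleSheet header
    print('entries: %d' % len(mdx))
    for key, value in mdx.items():            # (headword bytes, UTF-8 encoded definition)
        pass                                  # e.g. write them out, as the -x extraction path does

    mdd = MDD('example.mdd')                  # resource archive: internal path -> raw bytes
    for filename, content in mdd.items():
        pass                                  # e.g. dump content to disk, mirroring the datafolder logic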
/ripemd128.py:
--------------------------------------------------------------------------------
1 | """
2 | Copyright by https://github.com/zhansliu/writemdict
3 |
4 | ripemd128.py - A simple ripemd128 library in pure Python.
5 |
6 | Supports both Python 2 (versions >= 2.6) and Python 3.
7 |
8 | Usage:
9 | from ripemd128 import ripemd128
10 | digest = ripemd128(b"The quick brown fox jumps over the lazy dog")
11 | assert(digest == b"\x3f\xa9\xb5\x7f\x05\x3c\x05\x3f\xbe\x27\x35\xb2\x38\x0d\xb5\x96")
12 |
13 | """
14 |
15 |
16 |
17 | import struct
18 |
19 |
20 | # follows this description: http://homes.esat.kuleuven.be/~bosselae/ripemd/rmd128.txt
21 |
22 | def f(j, x, y, z):
23 | assert(0 <= j and j < 64)
24 | if j < 16:
25 | return x ^ y ^ z
26 | elif j < 32:
27 | return (x & y) | (z & ~x)
28 | elif j < 48:
29 | return (x | (0xffffffff & ~y)) ^ z
30 | else:
31 | return (x & z) | (y & ~z)
32 |
33 | def K(j):
34 | assert(0 <= j and j < 64)
35 | if j < 16:
36 | return 0x00000000
37 | elif j < 32:
38 | return 0x5a827999
39 | elif j < 48:
40 | return 0x6ed9eba1
41 | else:
42 | return 0x8f1bbcdc
43 |
44 | def Kp(j):
45 | assert(0 <= j and j < 64)
46 | if j < 16:
47 | return 0x50a28be6
48 | elif j < 32:
49 | return 0x5c4dd124
50 | elif j < 48:
51 | return 0x6d703ef3
52 | else:
53 | return 0x00000000
54 |
55 | def padandsplit(message):
56 | """
57 | returns a two-dimensional array X[i][j] of 32-bit integers, where j ranges
58 | from 0 to 15.
59 | First pads the message by adding a byte 0x80, and then padding with 0x00
60 | bytes until the message length in bytes is congruent to 56 (mod 64).
61 | Then appends the little-endian 64-bit representation of the original
62 | length in bits. Finally, splits the result up into 64-byte blocks, which
63 | are further parsed as 32-bit integers.
64 | """
65 | origlen = len(message)
66 | padlength = 64 - ((origlen - 56) % 64) #minimum padding is 1!
67 | message += b"\x80"
68 | message += b"\x00" * (padlength - 1)
69 | message += struct.pack("<Q", origlen*8)
70 |
71 | x = []
72 | for i in range(0, len(message), 64):
73 | x.append(
74 | [struct.unpack("<L", message[i+j:i+j+4])[0]
75 | for j in range(0, 64, 4)]
76 | )
77 | return x
78 |
79 |
80 | def add(*args):
81 | return sum(args) & 0xffffffff
82 |
83 |
84 | def rol(s, x):
85 | return (x << s | x >> (32-s)) & 0xffffffff
86 |
87 | r = [ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14,15,
88 | 7, 4,13, 1,10, 6,15, 3,12, 0, 9, 5, 2,14,11, 8,
89 | 3,10,14, 4, 9,15, 8, 1, 2, 7, 0, 6,13,11, 5,12,
90 | 1, 9,11,10, 0, 8,12, 4,13, 3, 7,15,14, 5, 6, 2]
91 | rp = [ 5,14, 7, 0, 9, 2,11, 4,13, 6,15, 8, 1,10, 3,12,
92 | 6,11, 3, 7, 0,13, 5,10,14,15, 8,12, 4, 9, 1, 2,
93 | 15, 5, 1, 3, 7,14, 6, 9,11, 8,12, 2,10, 0, 4,13,
94 | 8, 6, 4, 1, 3,11,15, 0, 5,12, 2,13, 9, 7,10,14]
95 | s = [11,14,15,12, 5, 8, 7, 9,11,13,14,15, 6, 7, 9, 8,
96 | 7, 6, 8,13,11, 9, 7,15, 7,12,15, 9,11, 7,13,12,
97 | 11,13, 6, 7,14, 9,13,15,14, 8,13, 6, 5,12, 7, 5,
98 | 11,12,14,15,14,15, 9, 8, 9,14, 5, 6, 8, 6, 5,12]
99 | sp = [ 8, 9, 9,11,13,15,15, 5, 7, 7, 8,11,14,14,12, 6,
100 | 9,13,15, 7,12, 8, 9,11, 7, 7,12, 7, 6,15,13,11,
101 | 9, 7,15,11, 8, 6, 6,14,12,13, 5,14,13,13, 7, 5,
102 | 15, 5, 8,11,14,14, 6,14, 6, 9,12, 9,12, 5,15, 8]
103 |
104 |
105 | def ripemd128(message):
106 | h0 = 0x67452301
107 | h1 = 0xefcdab89
108 | h2 = 0x98badcfe
109 | h3 = 0x10325476
110 | X = padandsplit(message)
111 | for i in range(len(X)):
112 | (A,B,C,D) = (h0,h1,h2,h3)
113 | (Ap,Bp,Cp,Dp) = (h0,h1,h2,h3)
114 | for j in range(64):
115 | T = rol(s[j], add(A, f(j,B,C,D), X[i][r[j]], K(j)))
116 | (A,D,C,B) = (D,C,B,T)
117 | T = rol(sp[j], add(Ap, f(63-j,Bp,Cp,Dp), X[i][rp[j]], Kp(j)))
118 | (Ap,Dp,Cp,Bp)=(Dp,Cp,Bp,T)
119 | T = add(h1,C,Dp)
120 | h1 = add(h2,D,Ap)
121 | h2 = add(h3,A,Bp)
122 | h3 = add(h0,B,Cp)
123 | h0 = T
124 |
125 |
126 | return struct.pack("<LLLL", h0, h1, h2, h3)
--------------------------------------------------------------------------------
/sync_to_file_magic_command.py:
--------------------------------------------------------------------------------
269 | if n_match > 1:
270 | self.log_match_result(_log_message_l, search_scope_re_str, 'm')
271 | else:
272 | self.log_match_result(_log_message_l, search_scope_re_str, 'u')
273 | if has_both_arg:
274 | search_start_index_l[target_index] = first_match_obj.end(1)
275 | search_end_index_l[target_index] = first_match_obj.start(3)
276 | elif has_after_arg:
277 | search_start_index_l[target_index] = first_match_obj.end()
278 | else: # has_before_arg
279 | search_end_index_l[target_index] = first_match_obj.start()
280 | if try_before_arg_flag:
281 | # no -a and -b pattern, try the -b pattern
282 | _log_message_l.append('Restricted to the before pattern')
283 | n_match, first_match_obj = search_target_str(args_d['before'], target_str,
284 | scope_re_flag)
285 | if n_match == 0:
286 | self.log_match_result(_log_message_l, args_d['before'], 'n')
287 | try_after_arg_flag = True
288 | else:
289 | if n_match > 1:
290 | self.log_match_result(_log_message_l, args_d['before'], 'm')
291 | else:
292 | self.log_match_result(_log_message_l, args_d['before'], 'u')
293 | search_end_index_l[target_index] = first_match_obj.start()
294 | if try_after_arg_flag:
295 | # no -a and -b pattern, no -b pattern, last try -a pattern
296 | _log_message_l.append('Last try the after pattern')
297 | n_match, first_match_obj = search_target_str(args_d['after'], target_str,
298 | scope_re_flag)
299 | if n_match == 0:
300 | self.log_match_result(_log_message_l, args_d['after'], 'n')
301 | _log_message_l.append('All tries failed! The search scope remains as default')
302 | else:
303 | if n_match > 1:
304 | self.log_match_result(_log_message_l, args_d['after'], 'm')
305 | else:
306 | self.log_match_result(_log_message_l, args_d['after'], 'u')
307 | search_start_index_l[target_index] = first_match_obj.end()
308 | _log_message_l.append('-- Finished. File: ' + file_path_l[target_index])
309 | else:
310 | _log_message_l.append('No argument is provided. The search scope remains as default')
311 | _par_d['search_start_index_l'], _par_d['search_end_index_l'] = \
312 | search_start_index_l, search_end_index_l
313 |
314 | @staticmethod
315 | def modify_target_str(_log_message_l, _par_d):
316 | _log_message_l.append('# Begin to modify the target str')
317 | file_path_l = _par_d['file_path_l']
318 | n_target_str = _par_d['n_target_str']
319 | target_str_l = _par_d['target_str_l']
320 | cell = _par_d['cell']
321 | args_d = _par_d['args_d']
322 | search_start_index_l = _par_d['search_start_index_l']
323 | search_end_index_l = _par_d['search_end_index_l']
324 | indent = args_d['indent']
325 | indented_cell_l = []
326 | # pre process the cell: skip lines that are blank
327 | # before the first non blank line or after the last non blank line
328 | cell = cell.strip()
329 | cell_line_l = [] if cell == '' else cell.split('\n')
330 | n_cell_line = len(cell_line_l)
331 | modified_target_str_l = [target_str_l[i] for i in range(n_target_str)]
332 | update_d = {'cell': cell, 'cell_line_l': cell_line_l,
333 | 'n_cell_line': n_cell_line,
334 | 'modified_target_str_l': modified_target_str_l}
335 | _par_d.update(update_d)
336 | if n_cell_line == 0:
337 | # nothing to write
338 | _log_message_l.append('!! Empty cell. Nothing to write.')
339 | return
340 | # indent cell for writing
341 | for cell_line in cell_line_l:
342 | indented_cell_l.append(' '*indent + cell_line)
343 | indented_cell = '\n'.join(indented_cell_l)
344 | # log writing mode
345 | append_message_d = {'o': '!! Writing mode is overwrite.',
346 | 'i': '!! Writing mode is insert.',
347 | 'a': '!! Writing mode is append.',
348 | 'di': '!! Writing mode is different and insert.',
349 | 'da': '!! Writing mode is different and append.'}
350 | _log_message_l.append(append_message_d[args_d['mode']])
351 | # begin to build modified str
352 | for target_index, target_str in enumerate(target_str_l):
353 | file_path = file_path_l[target_index]
354 | _log_message_l.append('++ Deal with file ' + file_path)
355 | start_index = search_start_index_l[target_index]
356 | end_index = search_end_index_l[target_index]
357 | if target_str == '':
358 | _log_message_l.append('Target file is empty.')
359 | modified_target_str_l[target_index] = indented_cell
360 | else:
361 | left_segment_end = None
362 | right_segment_start = None
363 | if args_d['mode'] == 'o':
364 | left_segment_end = start_index
365 | right_segment_start = end_index
366 | elif args_d['mode'] in ['i', 'di']:
367 | left_segment_end = start_index
368 | right_segment_start = start_index
369 | else:
370 | # args_d['mode'] in ['a', 'da']:
371 | left_segment_end = end_index
372 | right_segment_start = end_index
373 | if args_d['mode'] in ['da', 'di']:
374 | # try to match the cell as whole
375 | _log_message_l.append('Try to match the cell as whole.')
376 | cell_re = construct_indent_line_re(cell)
377 | n_match, _ = search_target_str(cell_re, target_str, re.M,
378 | start_index, end_index)
379 | if n_match > 0:
380 | _log_message_l.append('Whole cell matched. No need to update.')
381 | left_segment_end = None
382 | right_segment_start = None
383 | if not (left_segment_end is None):
384 | modified_str = target_str[:left_segment_end]
385 | if modified_str != '':
386 | if modified_str[-1] != '\n':
387 | modified_str += '\n'
388 | modified_str += indented_cell
389 | if modified_str[-1] != '\n':
390 | modified_str += '\n'
391 | modified_str += target_str[right_segment_start:]
392 | modified_target_str_l[target_index] = modified_str
393 | _log_message_l.append('Target str is modified')
394 | _log_message_l.append('-- Finished. File: ' + file_path)
395 |
396 | @staticmethod
397 | def log_match_result(_log_l, pattern_str, key_word='u'):
398 | _log_l.append(SyncToFile.match_result_prefix_d[key_word])
399 | formatted_pattern_str = '-wrap2 ' + pattern_str
400 | _log_l.append(formatted_pattern_str)
401 |
402 |
403 | # In order to actually use these magics, you must register them with a
404 | # running IPython. This code must be placed in a file that is loaded once
405 | # IPython is up and running:
406 | ip = get_ipython()
407 | # You can register the class itself without instantiating it. IPython will
408 | # call the default constructor on it.
409 | ip.register_magics(SyncToFile)
--------------------------------------------------------------------------------
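The mode handling in modify_target_str above reduces to slicing the target string around the search scope; the sketch below restates that logic outside the class with hypothetical values ('o' overwrites the scope, 'i'/'di' insert at its start, 'a'/'da' append at its end; the real method additionally normalises newlines and skips writing when 'di'/'da' find the whole cell already present):

    # Hedged restatement of the slicing choices; names and values are hypothetical.
    target = 'line1\n# BEGIN\nold body\n# END\nline5\n'
    start = target.index('# BEGIN\n') + len('# BEGIN\n')   # search_start_index
    end = target.index('# END')                            # search_end_index
    indented_cell = '    new body\n'

    def write(mode):
        if mode == 'o':              # overwrite: drop everything inside the scope
            left, right = start, end
        elif mode in ('i', 'di'):    # insert: keep the scope, place the cell at its start
            left, right = start, start
        else:                        # 'a' / 'da': append, place the cell at the scope's end
            left, right = end, end
        return target[:left] + indented_cell + target[right:]

    print(write('o'))    # scope body replaced by the indented cell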
/wagnerfischerpp.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | #
3 | # Copyright (c) 2013-2014 Kyle Gorman
4 | #
5 | # Permission is hereby granted, free of charge, to any person obtaining a
6 | # copy of this software and associated documentation files (the
7 | # "Software"), to deal in the Software without restriction, including
8 | # without limitation the rights to use, copy, modify, merge, publish,
9 | # distribute, sublicense, and/or sell copies of the Software, and to
10 | # permit persons to whom the Software is furnished to do so, subject to
11 | # the following conditions:
12 | #
13 | # The above copyright notice and this permission notice shall be included
14 | # in all copies or substantial portions of the Software.
15 | #
16 | # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
17 | # OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18 | # MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19 | # IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20 | # CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21 | # TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22 | # SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 | #
24 | # wagnerfischerpp.py: efficient computation of Levenshtein distance and
25 | # all optimal alignments with arbitrary edit costs. The algorithm for
26 | # computing the dynamic programming table used has been discovered many
27 | # times, but most notably by Wagner & Fischer:
28 | #
29 | # R.A. Wagner & M.J. Fischer. 1974. The string-to-string correction
30 | # problem. Journal of the ACM, 21(1): 168-173.
31 | #
32 | # Wagner & Fischer also describe an algorithm ("Algorithm Y") to find the
33 | # alignment path (i.e., list of edit operations involved in the optimal
34 | # alignment), but it is specified such that in fact it only generates
35 | # one such path, whereas many such paths may exist, particularly when
36 | # multiple edit operations have the same cost. For example, when all edit
37 | # operations have the same cost, there are two equal-cost alignments of
38 | # "TGAC" and "GCAC":
39 | #
40 | # TGAC TGxAC
41 | # ss== d=i==
42 | # GCAC xGCAC
43 | #
44 | # However, all such paths can be generated efficiently, as follows. First,
45 | # the dynamic programming table "cells" are defined as tuples of (partial
46 | # cost, set of all operations reaching this cell with minimal cost). As a
47 | # result, the completed table can be thought of as an unweighted, directed
48 | # graph (or FSA). The bottom right cell (the one containing the Levenshtein
49 | # distance) is the start state and the origin cell is the end state. The
50 | # arcs are the operations stored in each cell. (Many of the cells of the
51 | # table, those which are not visited by any optimal alignment, are under
52 | # the graph interpretation unconnected vertices, and can be ignored.) Every
53 | # path between the bottom right cell and the origin cell is an optimal
54 | # alignment. These paths can be efficiently enumerated using breadth-first
55 | # traversal. The trick here is that elements in the deque must contain not
56 | # only indices but also partial paths. Averaging over all such paths, we can
57 | # come up with an estimate of the number of insertions, deletions, and
58 | # substitutions involved as well; in the example above, we say S = 1 and
59 | # D, I = 0.5.
60 |
61 | from __future__ import division
62 |
63 | from pprint import PrettyPrinter
64 | from collections import deque, namedtuple, Counter
65 |
66 | # default costs
67 |
68 | INSERTION = 1
69 | DELETION = 1
70 | SUBSTITUTION = 1
71 |
72 | Trace = namedtuple("Trace", ["cost", "ops"])
73 |
74 |
75 | class WagnerFischer(object):
76 |
77 | """
78 | An object representing a (set of) Levenshtein alignments between two
79 | iterable objects (they need not be strings). The cost of the optimal
80 | alignment is stored in `self.cost`, and all Levenshtein alignments can
81 | be generated using `self.alignments()`.
82 |
83 | Basic tests:
84 |
85 | >>> WagnerFischer("god", "gawd").cost
86 | 2
87 | >>> WagnerFischer("sitting", "kitten").cost
88 | 3
89 | >>> WagnerFischer("bana", "banananana").cost
90 | 6
91 | >>> WagnerFischer("bana", "bana").cost
92 | 0
93 | >>> WagnerFischer("banana", "angioplastical").cost
94 | 11
95 | >>> WagnerFischer("angioplastical", "banana").cost
96 | 11
97 | >>> WagnerFischer("Saturday", "Sunday").cost
98 | 3
99 |
100 | IDS tests:
101 |
102 | >>> WagnerFischer("doytauvab", "doyvautab").IDS() == {"S": 2.0}
103 | True
104 | >>> WagnerFischer("kitten", "sitting").IDS() == {"I": 1.0, "S": 2.0}
105 | True
106 | """
107 |
108 | # initialize pretty printer (shared across all class instances)
109 | pprint = PrettyPrinter(width=75)
110 |
111 | def __init__(self, A, B, insertion=INSERTION, deletion=DELETION,
112 | substitution=SUBSTITUTION):
114 | # store operation costs in a dictionary, for programmatic access
114 | self.costs = {"I": insertion, "D": deletion, "S": substitution}
115 | # initialize table
116 | self.asz = len(A)
117 | self.bsz = len(B)
118 | self._table = [[None for _ in xrange(self.bsz + 1)] for
119 | _ in xrange(self.asz + 1)]
120 | # from now on, all indexing done using self.__getitem__
121 | ## fill in edges
122 | self[0][0] = Trace(0, {"O"}) # start cell
123 | for i in xrange(1, self.asz + 1):
124 | self[i][0] = Trace(i * self.costs["D"], {"D"})
125 | for j in xrange(1, self.bsz + 1):
126 | self[0][j] = Trace(j * self.costs["I"], {"I"})
127 | ## fill in rest
128 | for i in xrange(len(A)):
129 | for j in xrange(len(B)):
130 | # clean it up in case there are more than one
131 | # check for match first, always cheapest option
132 | if A[i] == B[j]:
133 | self[i + 1][j + 1] = Trace(self[i][j].cost, {"M"})
134 | # check for other types
135 | else:
136 | costI = self[i + 1][j].cost + self.costs["I"]
137 | costD = self[i][j + 1].cost + self.costs["D"]
138 | costS = self[i][j].cost + self.costs["S"]
139 | # determine min of three
140 | min_val = min(costI, costD, costS)
141 | # write that much in
142 | trace = Trace(min_val, set())
143 | # add _all_ operations matching minimum value
144 | if costI == min_val:
145 | trace.ops.add("I")
146 | if costD == min_val:
147 | trace.ops.add("D")
148 | if costS == min_val:
149 | trace.ops.add("S")
150 | # write to table
151 | self[i + 1][j + 1] = trace
152 | # store optimum cost as a property
153 | self.cost = self[-1][-1].cost
154 |
155 | def __repr__(self):
156 | return self.pprint.pformat(self._table)
157 |
158 | def __iter__(self):
159 | for row in self._table:
160 | yield row
161 |
162 | def __getitem__(self, i):
163 | """
164 | Returns the i-th row of the table, which is a list and so
165 | can be indexed. Therefore, e.g., self[2][3] == self._table[2][3]
166 | """
167 | return self._table[i]
168 |
169 | # stuff for generating alignments
170 |
171 | def _stepback(self, i, j, trace, path_back):
172 | """
173 | Given a cell location (i, j) and a Trace object trace, generate
174 | all traces they point back to in the table
175 | """
176 | for op in trace.ops:
177 | if op == "M":
178 | yield i - 1, j - 1, self[i - 1][j - 1], path_back + ["M"]
179 | elif op == "I":
180 | yield i, j - 1, self[i][j - 1], path_back + ["I"]
181 | elif op == "D":
182 | yield i - 1, j, self[i - 1][j], path_back + ["D"]
183 | elif op == "S":
184 | yield i - 1, j - 1, self[i - 1][j - 1], path_back + ["S"]
185 | elif op == "O":
186 | return # origin cell, we're done iterating
187 | else:
188 | raise ValueError("Unknown op '{}'".format(op))
189 |
190 | def alignments(self, bfirst=False):
191 | """
192 | Generate all alignments with optimal cost by traversing an
193 | implicit graph on the dynamic programming table. By default,
194 | depth-first traversal is used, since users seem to get tired
195 | waiting for their first results.
196 | """
197 | # each cell of the queue is a tuple of (i, j, trace, path_back)
198 | # where (i, j) is the current index, trace is the Trace object at
199 | # this cell, and path_back is the list of ops collected so far
200 | if bfirst:
201 | return self._bfirst_alignments()
202 | else:
203 | return self._dfirst_alignments()
204 |
205 | def _dfirst_alignments(self):
206 | """
207 | Generate alignments via depth-first traversal.
208 | """
209 | stack = list(self._stepback(self.asz, self.bsz, self[-1][-1], []))
210 | while stack:
211 | (i, j, trace, path_back) = stack.pop()
212 | if trace.ops == {"O"}:
213 | path_back.reverse()
214 | yield path_back
215 | continue
216 | stack.extend(self._stepback(i, j, trace, path_back))
217 |
218 | def _bfirst_alignments(self):
219 | """
220 | Generate alignments via breadth-first traversal.
221 | """
222 | queue = deque(self._stepback(self.asz, self.bsz, self[-1][-1], []))
223 | while queue:
224 | (i, j, trace, path_back) = queue.popleft()
225 | if trace.ops == {"O"}:
226 | path_back.reverse()
227 | yield path_back
228 | continue
229 | queue.extend(self._stepback(i, j, trace, path_back))
230 |
231 | def IDS(self):
232 | """
233 | Estimate insertions, deletions, and substitution _count_ (not
234 | costs). Non-integer values arise when there are multiple possible
235 | alignments with the same cost.
236 | """
237 | npaths = 0
238 | opcounts = Counter()
239 | for alignment in self.alignments():
240 | # count edit types for this path, ignoring "M" (which is free)
241 | opcounts += Counter(op for op in alignment if op != "M")
242 | npaths += 1
243 | # average over all paths
244 | return Counter({o: c / npaths for (o, c) in opcounts.iteritems()})
245 |
246 |
247 | if __name__ == "__main__":
248 | import doctest
249 | doctest.testmod()
--------------------------------------------------------------------------------
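A short sketch of the alignment enumeration that the header comment of wagnerfischerpp.py describes, using its own TGAC/GCAC example (Python 2, matching the module; the expected values come from that comment):

    from wagnerfischerpp import WagnerFischer

    wf = WagnerFischer("TGAC", "GCAC")
    print(wf.cost)                  # 2: either two substitutions, or one deletion plus one insertion
    for path in wf.alignments():    # depth-first by default
        print(path)                 # the two optimal paths, e.g. ['S', 'S', 'M', 'M'] and ['D', 'M', 'I', 'M', 'M']
    print(wf.IDS())                 # per the header comment: S = 1.0, D = 0.5, I = 0.5 on average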