├── input
│   └── pron.dsl.dz
├── README.md
├── pretreat_word.py
├── pretreat_mnemo.py
├── convert_mnemo.py
├── convert_word.py
├── generate_import.py
└── my_helpers.py

--------------------------------------------------------------------------------
/input/pron.dsl.dz:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/tansongchen/GRE3000/HEAD/input/pron.dsl.dz
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
This repository holds a combined Anki deck built from the two e-books 《GRE 核心词汇考法精析》 and 《GRE 核心词汇助记与精练》, together with the Python source code that converts the e-books into that deck. It is intended for students who want to get through the GRE quickly. The copyright belongs to 陈琦 and his team, the authors of the 《GRE 核心词汇考法精析》 series; this deck is for study and exchange only and must not be used for any commercial purpose.

To use the deck you have two options:

# The quick-and-dirty way

Download the complete deck from the [Weiyun share link](https://share.weiyun.com/5iSh6jY) and import it into Anki. It contains the full usage notes, the mnemonics, and the word pronunciations.

# The tinkerer's way

If you know a little Python, you can clone this repository and build a deck customized to your own needs. To keep the repository small, however, it does not include the audio files extracted from the pronunciation dictionary. If you want to produce your own Anki deck by running the code here, first download the roughly 500 MB [audio archive](https://share.weiyun.com/5otF24v), extract it, and move all of the audio files into `input/pron`. The archive expands to about 140,000 files, so extraction takes a while (even though only about 2,900 of them are actually used).

The repository is laid out as follows (a minimal driver script is sketched after this list):

- `pretreat_word.py`: preprocesses the 《GRE 核心词汇考法精析》 e-book (mainly fixing a number of errors in the text);
  - input: `input/word_raw.txt`
  - output: `input/word.txt`
- `pretreat_mnemo.py`: preprocesses the 《GRE 核心词汇助记与精练》 e-book (likewise);
  - input: `input/mnemo_raw.txt`
  - output: `input/mnemo.txt`
- `convert_word.py`: parses 《GRE 核心词汇考法精析》 into words and their definitions and saves them as a `json` file;
  - input: `input/word.txt`
  - output: `input/word_json.txt`
- `convert_mnemo.py`: parses 《GRE 核心词汇助记与精练》 into words and their mnemonics and saves them as a `json` file;
  - input: `input/mnemo.txt`
  - output: `input/mnemo_json.txt`
- `generate_import.py`: turns the key-value pairs of the two `json` files into Anki's import text format and attaches audio from `input/pron`.
  - input: `input/word_json.txt`, `input/mnemo_json.txt`, `input/pron.dsl.dz`, `input/pron/*`
  - output: `output/anki_import.txt`, `output/audio/*`
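With the files under `input/` in place, the scripts are meant to be run from the repository root in the order listed above: the two pretreat scripts, then the two convert scripts, then `generate_import.py`. A minimal driver (the file name `run_all.py` is only a suggestion, not part of the repository) could look like this:

```python
# run_all.py: run the whole pipeline in dependency order, from the repo root
import runpy

for script in ('pretreat_word.py', 'pretreat_mnemo.py',
               'convert_word.py', 'convert_mnemo.py',
               'generate_import.py'):
    print('running', script)
    runpy.run_path(script, run_name='__main__')
```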
--------------------------------------------------------------------------------
/pretreat_word.py:
--------------------------------------------------------------------------------
# coding=utf-8
# Fix the known problems in the raw text file

f = open('input/word_raw.txt', encoding = 'utf-8', mode = 'r')
l = f.readlines()[:75263]
f.close()

# insert the ' ‖ ' delimiter at column `char` of line `line`, overwriting `space` characters
def a(line, char, space=0):
    global l
    l[line] = l[line][:char] + ' ‖ ' + l[line][char+space:]

# Fix: missing or wrong delimiters
a(2180,114,1)
a(7619,80)
a(9387,21,1)
a(53326,81)
a(67087,76)
a(68349,69,1)
a(70541,126,1)
a(71683,78)

# Fix: wrong field labels
l[3017] = '**近**\n'
l[37948] = '**近**\n'
l[45042] = '**例**\n'
l[48901] = '**近**\n'
l[56230] = '**近**\n'
l[56234] = '**反**\n'
l[57000] = '**例**\n'
l[68839] = '**派**\n'

# Fix: missing phonetic transcriptions
l[3139] = '\[ˈænɪmət\] ' + l[3139]
l[10379] = '\[ˈkɑːmplɪmənt\]\n'
l[10531] = '\[kəmˈpaʊnd\] ' + l[10531]
l[12041] = '\[kənˈtrækt\] ' + l[12041]

# Other fixes
l[1154] = '*v*.**不正当或不合理使用、过分过量使用:**to put to a **wrong or improper use** or to **use excessively**\n'
l[14070] = 'undaunted *adj*.无畏的,大胆的\n'
l[14872] = l[14872].strip('\n') + ', ' + l[14876]
l[14874] = '\n'
l[14876] = '\n'
l[16593] = l[16593][:57]
l[16613] = 'disarm her anger 平息她的怒气\n'
l[20486] = l[20486][:44] + ',' + l[20486][45:]
l[28648] = 'humbuggery 欺骗\n'
l[68563] = l[68563][:77] + l[68563][78:81] + ' ‖ ' + l[68563][81:]
l[73646] = '[ˈprodʒekt]\n'

f = open('input/word.txt', encoding = 'utf-8', mode = 'w')
f.write(''.join(l))
f.close()

--------------------------------------------------------------------------------
/pretreat_mnemo.py:
--------------------------------------------------------------------------------
# Fix the known problems in the raw text file

f = open('input/mnemo_raw.txt', encoding = 'utf-8', mode = 'r')
l1 = f.readlines()
f.close()

# join lines that were broken apart in the raw text
l1 = l1[:946] + [l1[946].strip('\n') + ' ' + l1[948],] + l1[949:]
l1 = l1[:1016] + [l1[1016].strip('\n') + ' ' + l1[1018],] + l1[1019:]
l1 = l1[:1248] + [l1[1248].strip('\n') + ' ' + l1[1250],] + l1[1251:]


l1[2176] = '**7.vad, vag, ced**\n'
l1[3056] = '**3.verg, volv, shift**\n'
l1[3234] = '【小结】' + l1[3234][2:]
l1 = l1[:3256] + ['**4.scru**\n', '\n'] + l1[3256:]
l1 = l1[:3394] + l1[3412:]
l1[3720] = '【根】头\[cip\]在前面\[pre-\],(1)一个人匆忙地向前跑的状态(2)头在\[身体\]前面,就容易“fall down\[落下\]”→a.匆忙的;非常陡峭的\n'
l1[3724] = '【根】头\[cip\]在前面\[pre-\], 头在\[身体\]前面,就容易“fall down\[落下\]”→n.悬崖峭壁\n'

l1[4260] = '【小结】' + l1[4260][2:]
l1[4940] = '**2.brav**\n'
l1[5248] = '**3.tum**\n'
l1[5278] = '【小结】' + l1[5278][2:]
l1[5448] = '**2.jo**\n'
l1[5454] = '【参】' + l1[5454]
l1[5458] = '【参】' + l1[5458]
l1[5462] = '【参】' + l1[5462]
l1[5466] = '【参】' + l1[5466]
l1 = l1[:5468] + l1[5470:]
l1 = l1[:5478] + [l1[5478].strip('\n') + ' ' + l1[5480],] + l1[5481:]
l1 = l1[:5748] + [l1[5748].strip('\n') + ' ' + l1[5758],] + l1[5749:5758] + l1[5760:]
l1 = l1[:6680] + [l1[6680].strip('\n') + ' ' + l1[6682],] + l1[6683:]

l1 = l1[:9318]

# Split merged roots

# split a block in which two roots share one heading into two separate blocks
def div2(start, end, r1, r2, l1):
    l1[start] = '**' + r1 + '**\n'
    l1[start+2] = l1[start+2][3:]
    l1[start+4] = l1[start+4][3:]
    l1 = l1[:start+4] + l1[start+6:end] + \
        ['**' + r2 + '**\n', '\n'] + l1[start+4:start+6] + l1[end:]
    return l1

l1 = div2(1128, 1144, '2.vow', '8.claim', l1)
l1 = div2(1176, 1196, '3.mand', '7.ord', l1)
l1 = div2(2048, 2066, '3.amb', '8.err', l1)
l1 = div2(4284, 4302, '3.brev', '4.long', l1)

# same as div2, but for a block in which three roots share one heading
def div3(start, mid, end, r1, r2, r3, l1):
    l1[start] = '**' + r1 + '**\n'
    l1[start+2] = l1[start+2][3:]
    l1[start+4] = l1[start+4][3:]
    l1[start+6] = l1[start+6][3:]
    l1 = l1[:start+4] + l1[start+8:mid] + \
        ['**' + r2 + '**\n', '\n'] + l1[start+4:start+6] + l1[mid:end] + \
        ['**' + r3 + '**\n', '\n'] + l1[start+6:start+8] + l1[end:]
    return l1

l1[3362] = '**2.duc \(t)**\n'
l1[3392] = '**3.fac \(t), fect, feit**\n'
l1[3318] = '**1.ag, act**\n'

l1 = l1[:1092] + l1[1094:]
l1 = div3(1084, 1104, 1114, '1.son', '1.ton', '1.phon', l1)
l1 = div3(3064, 3080, 3092, '3.verg', '3.volv', '3.shift', l1)
l1 = div3(2526, 2542, 2552, '1.cis', '1.tom', '1.sect', l1)
l1 = l1[:5766] + ['(1)词根surg表示rise\[升起\], surge\[n.&v.汹涌\]指“升起的波涛”,可以与surf\[冲浪\]一起进行联想(冲浪时伴随着升起的波涛)。 \n',
                  '\n',
                  '(2)词根cit表示“(向上)引, 唤起\[arouse\]”, cite\[引用\]就来自该词根。excite\[v.激起,使兴奋\]指“引\[cit\]出\[ex-\](一个人的兴致)”。\n'
                  ] + l1[5767:]
l1 = div2(5764, 5778, '5.surg', '5.cit', l1)
l1 = div3(2184, 2202, 2210, '7.vad, vas', '7.vag', '7.ced, cess', l1)
l1[5472] = '很多由“jo”开始的词根与“快乐\[joy\](地开玩笑\[joke\])”有关。\n'
l1[3278] = '可以将scru按照读音联想成“四顾”,表示“(顾虑地)看”。\n'
l1 = div2(4236, 4266, '1.lev', '1.loft', l1)
l1 = div2(1918, 1928, '4.post', '4.pond', l1)
l1 = l1[:1614] + [l1[1614].strip('\n') + l1[1616][3:]] + l1[1617:]
l1 = div2(1612, 1664, '2.tend, tens, tent', '2.tenu', l1)
f = open('input/mnemo.txt', encoding = 'utf-8', mode = 'w')
f.write(''.join(l1))
f.close()
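# Illustration of div2 on a tiny made-up block (not data from the book): the
# merged heading becomes the first root's heading, the second root gets its own
# heading, and each root keeps its own explanation line and its own word entries.
_toy = ['**2.vow, claim**\n', '\n',
        '(1)vow explanation\n', '\n',
        '(2)claim explanation\n', '\n',
        'avow entry\n', '\n',
        'proclaim entry\n', '\n']
assert div2(0, 8, '2.vow', '8.claim', _toy) == [
    '**2.vow**\n', '\n', 'vow explanation\n', '\n', 'avow entry\n', '\n',
    '**8.claim**\n', '\n', 'claim explanation\n', '\n', 'proclaim entry\n', '\n']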
--------------------------------------------------------------------------------
/convert_mnemo.py:
--------------------------------------------------------------------------------
import re
import json
import codecs
import functools
import os.path
from random import random
from random import randint
from pprint import pprint
from copy import deepcopy
from my_helpers import *

# 0 Regular expressions
re_escape_char = re.compile(r'\\(?=[\[\]()*+])') # strips the escaping backslashes
re_list_start = re.compile(r'### \*\*List \d+', re.M) # every List starts like: ### **List 1
re_root_start = re.compile(r'^\*\*\d+\.(.*)\*\*$\n|^Unit\d+$', re.M) # every root starts like: **1.xxx**
re_word_start = re.compile(r'^\*\*([a-zéï-]+)(.*?)(\[.*\])\*\*$', re.M|re.I) # every word entry

# 1 Read the file and do initial processing
mnemo_str = codecs_open_r_utf8('input/mnemo.txt')
mnemo_str = re_escape_char.sub('', mnemo_str) # strip the \
mnemo_str = collapse_blank_line(mnemo_str) # collapse unnecessary blank lines
# split the file into Lists
mnemo_lists_l = extract_content_between(mnemo_str, re_list_start)

# 2 Split each List into Roots
def ListStr_to_RootDict(lists_l):
    root_d_l_l = []
    for list_index, list_str in enumerate(lists_l[:39]):
        root_d_l = []
        list_str = list_str.split('小结&复习')[0] # every List ends with a review section, which we do not need
        root_str_l = extract_content_between(list_str, re_root_start)
        for root_index, root_str in enumerate(root_str_l):
            root = re_root_start.search(root_str).group(1)
            # the root is only meaningful in Lists 1 to 34 and when it is not an "其他xx" group
            root = root.strip() if (list_index <= 33 and root[:2] != '其他') else ''
            root_str = root_str.split('【小结】')[0]
            root_str = re_root_start.sub('', root_str)
            root_d = {'pos': (list_index+1, root_index+1),
                      'root': root,
                      'root_str': root_str} # one dict per root
            root_d_l.append(root_d)
        root_d_l_l.append(root_d_l)
    return root_d_l_l

# 3 Split each Root into Words
def RootDict_to_WordStr(root_d_l_l):
    path = [('all','',True),('all','',True),('key','root_str',False)]
    # the path-driven traversal helper is defined in my_helpers
    for list_index, root_index, root_str in iter_through_general(root_d_l_l, path):
        root_d = root_d_l_l[list_index][root_index]
        word_str_l = extract_content_between(root_str, re_word_start, True)
        root_exp = word_str_l.pop(0).strip()
        root_d['root_exp'] = root_exp.split('\n')
        root_d['word_str_l'] = word_str_l
        root_d.pop('root_str')
    return root_d_l_l

# 4 Split each Word into its three parts
def WordStr_to_WordDict_mono(word_str): # first define how to handle a single word_str
    word_lines_l = [x.strip() for x in word_str.split('\n') if x.strip() != '']
    first_line_match = re_word_start.match(word_lines_l.pop(0))
    word = first_line_match.group(1)
    phon = first_line_match.group(3)
    word_d = {'word': word,
              'phon': phon if phon else '',
              'content': word_lines_l}
    return word_d

def WordStr_to_WordDict(word_d_l_l):
    word_d = {}
    path = [('all','',True),('all','',True), ('key','word_str_l',False),('all','',True)]
    for list_index, root_index, word_index, word_str in iter_through_general(word_d_l_l, path):
        one_word_d = WordStr_to_WordDict_mono(word_str)
        word = one_word_d['word']
        for _key in ['pos', 'root', 'root_exp']: # every word dict inherits the root's position, root and explanation
            one_word_d[_key] = word_d_l_l[list_index][root_index][_key]
        one_word_d['cognates'] = ''
        word_d[word] = one_word_d
    return word_d

# 5 Add cognates
def Add_Cognates(word_d, word_d_l_l):
    path = [('all','',False),('all','',False)]
    for root_d, in iter_through_general(word_d_l_l, path):
        root = root_d['root']
        root_exp = root_d['root_exp']
        if root != '' or root_exp != '': # collect all the words under the same root
            cognates_l = [re_word_start.match(x).group(1) for x in root_d['word_str_l']]
            for word in cognates_l:
                word_d[word]['cognates'] = ', '.join(cognates_l)
    return word_d

mnemo_root_l = ListStr_to_RootDict(mnemo_lists_l)
mnemo_word_s = RootDict_to_WordStr(mnemo_root_l)
mnemo_word_d = WordStr_to_WordDict(mnemo_word_s)
mnemo_word_d = Add_Cognates(mnemo_word_d, mnemo_word_s)
with codecs.open('input/mnemo_json.txt', 'w', encoding='utf-8') as f:
    json.dump(mnemo_word_d, f)
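# Illustration of re_word_start on a made-up headword line (not from the book):
# group(1) is the headword and group(3) is the bracketed phonetic transcription.
_m = re_word_start.match('**abandon  [əˈbændən]**')
assert _m.group(1) == 'abandon' and _m.group(3) == '[əˈbændən]'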
--------------------------------------------------------------------------------
/convert_word.py:
--------------------------------------------------------------------------------
# coding=utf-8
import re
import json
import codecs
import functools
import os.path
from random import random
from random import randint
from pprint import pprint
from copy import deepcopy
from my_helpers import *

# 0 Regular expressions
re_list_start = re.compile(r'^# \*\*List \d+', re.M)
re_strip_start = re.compile(r'# \*\*Word List 1(.|\n)*$')
re_unit_start = re.compile(r'^## \*\*Unit \d+', re.M)
re_word_start = re.compile(r'^\*\*(?P<word>[a-z \-éï]+)\*\*\n(?P<phon>\[.+\])?', re.U|re.M)
re_usage_start = re.compile(r'^\*\*【考(?:法|点)\d?】(.*)$', re.M|re.U)
re_phon = re.compile(r'\[.*\]', re.U)
re_pspeech = re.compile(r'\*([a-z\/\.]+)\*.')
re_escape_char = re.compile(r'\\(?=[\!\[\]()*+])')
re_fields_all = re.compile(r'^(\*\*[例近反派]\*\*)', re.M)
re_fields_der = re.compile(r'^(\*\*派\*\*)', re.M)
re_delimit = re.compile(r'\n|‖')
match_adj = re.compile(r'adj')
match_adv = re.compile(r'adv')

# 1 Read the file and do initial processing
word_str = codecs_open_r_utf8('input/word.txt')
word_str = re_escape_char.sub('', word_str) # strip the escaping backslashes first
word_str = word_str.translate(dict.fromkeys((ord(c) for c in u"\xa0"))) # drop non-breaking spaces
word_str = collapse_blank_line(word_str) # collapse unnecessary blank lines
# 2 Split the whole file into Lists
word_lists_l = extract_content_between(word_str, re_list_start)
word_lists_l[30] = re_strip_start.sub('', word_lists_l[30])
# 3 Split every List into Units
word_units_l_l = list(map(functools.partial(extract_content_between, match_re=re_unit_start), word_lists_l))

def UnitStr_to_WordDict_mono(unit_str, list_index, unit_index):
    returned_words_d_d = {}
    word_str_l = extract_content_between(unit_str, re_word_start)
    for word_str in word_str_l:
        first_line_match = re_word_start.match(word_str)
        try:
            word = first_line_match.group('word')
        except AttributeError:
            print(word_str)
        phon = first_line_match.group('phon')
        one_word_d = {'word_str': re_word_start.sub('', word_str),
                      'phon': strF2H(phon) if phon else '',
                      'pos': (list_index, unit_index),
                      'audio': ''} # audio field reserved for later
        returned_words_d_d[word] = one_word_d
    return returned_words_d_d

def UnitStr_to_WordDict(base_unit_data_l_l):
    _new3000_base_d = {}
    for list_index, unit_data_l in enumerate(base_unit_data_l_l):
        for unit_index, unit_data in enumerate(unit_data_l):
            _new3000_base_d.update(UnitStr_to_WordDict_mono(unit_data, list_index+1, unit_index+1))
    return _new3000_base_d

def WordStr_to_UsageList_mono(word_block_str, word, unit_index):

    usages_str_l = extract_content_between(word_block_str, re_usage_start)
    usages_d_l = []
    for one_usage_str in usages_str_l:
        fields_l = extract_content_between(one_usage_str[9:], re_fields_all, True)
        fields_l[-1] = ('\n'.join(fields_l[-1].split('\n')[:2]) + '\n') if unit_index == 10 else fields_l[-1]
        one_usage_str = ''.join(fields_l)
        origin_and_der = extract_content_between(one_usage_str, re_fields_der, True)
        # 1. Origin
        origin_field_list = extract_content_between(origin_and_der[0], re_fields_all, True)
        usage_d = FieldList_to_UsageDict(origin_field_list)
        # 2. Derivatives
        for one_der in origin_and_der[1:]:
            der_field_list = extract_content_between(one_der[6:], re_fields_all, True)
            usage_d['der'].append(FieldList_to_UsageDict(der_field_list))
        usages_d_l.append(usage_d)
    return usages_d_l

def FieldList_to_UsageDict(field_list):
    fields_c2e = {'例': 'examples', '近': 'syns', '反': 'ants'}
    usage_d = {'basic': [],
               'examples': [],
               'syns': [],
               'ants': [],
               'der': []}
    usage_d['basic'] = [BasicStr_to_BasicDict(basic_str.strip()) for basic_str in re_delimit.split(field_list[0].strip())]
    for i in field_list[1:]:
        usage_d[fields_c2e[i[2]]] = [x.strip() for x in re_delimit.split(i[6:].strip())]
    return usage_d

def BasicStr_to_BasicDict(basic_str):
    basic = {'exp': '',
             'pspeech': '',
             'phon': '',
             'audio': '',
             'res': basic_str} # audio field reserved for later
    # 1.1 Phonetics
    phon_result = re_phon.search(basic_str)
    if phon_result:
        basic['phon'] = phon_result.group()
        basic_str = re_phon.sub('', basic_str, 1).strip()
    # 1.2 Part of Speech
    pspeech_result = re_pspeech.search(basic_str)
    if pspeech_result:
        pspeech = pspeech_result.group(1)
        pspeech = match_adj.sub('a', pspeech)
        pspeech = match_adv.sub('ad', pspeech)
        basic['pspeech'] = pspeech
        basic_str = re_pspeech.sub('', basic_str, 1)
    # 1.3 Explanation: whatever is left
    basic['exp'] = basic_str
    return basic

def WordStr_to_UsageList(words_d):
    for word in words_d:
        unit_index = words_d[word]['pos'][1]
        words_d[word]['usages'] = WordStr_to_UsageList_mono(words_d[word]['word_str'], word, unit_index)
        words_d[word].pop('word_str')
    return words_d

# 4 Split every Unit into Words
word_d = UnitStr_to_WordDict(word_units_l_l)
# 5 Split every Word into Usages (考法)
word_d = WordStr_to_UsageList(word_d)
with codecs.open('input/word_json.txt', 'w', encoding='utf-8') as f:
    json.dump(word_d, f)
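# Illustration of BasicStr_to_BasicDict on a made-up entry (not from the book):
# the bracketed phonetics and the *pos* marker are peeled off, the rest is the explanation.
_demo = BasicStr_to_BasicDict('[ˈdemoʊ] *v*.to show how something works')
assert _demo['phon'] == '[ˈdemoʊ]'
assert _demo['pspeech'] == 'v'
assert _demo['exp'] == 'to show how something works'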
--------------------------------------------------------------------------------
/generate_import.py:
--------------------------------------------------------------------------------
# coding=utf-8
import json
import codecs
import os
from my_helpers import *
import gzip
import shutil
from pydub import AudioSegment
import glob


# 0 Regular expressions
re_us_pron = re.compile('\[s\](us.*?)\[/s\]')
re_pspeech = re.compile('^[ \t]*?\[m1\]\[b\].*?\[/b\] \[i\] ([a-z, ]+).*?\[/i\]', re.M)
re_word = re.compile(r'^([a-z-]+)[ \t]*$(.*?)(?=^[^ \t])', re.M|re.S|re.I)

# 1 Read the files and do initial processing
output = 'output'
source = 'input/pron'
word_d = is_file_and_json_load('input/word_json.txt')
mnemo_d = is_file_and_json_load('input/mnemo_json.txt')
multi_pron = [word for word in word_d
              if any(basic['phon'] for usage in word_d[word]['usages']
                     for basic in usage['basic'])]
# the dictionary data file
dsl_pron_d = {}
dsl_str = gzip.open('input/pron.dsl.dz', mode='r').read().decode('utf-16')
for one_match_obj in re_word.finditer(dsl_str):
    word = one_match_obj.group(1)
    word_block = one_match_obj.group(2)
    dsl_pron_d[word] = {'word_block': word_block,
                        'pspeech_l': re_pspeech.findall(word_block),
                        'phon_l': re_us_pron.findall(word_block)}

# 2 Attach audio pointers
def add_audio_pointer(word):
    one_word_d = word_d[word]
    word = word.replace('é', 'e').replace('ï', 'i').split('/')[0]
    source_file_name = None
    if word not in dsl_pron_d:
        return
    dsl_word_d = dsl_pron_d[word]
    if word not in multi_pron:
        source_file_name = source + '/' + dsl_word_d['phon_l'][0]
        output_file_name = output + '/' + word + '.wav'
        one_word_d['audio'] = '[sound:' + word + '.mp3]'
        if not os.path.isfile(output_file_name) and os.path.isfile(source_file_name):
            shutil.copy(source_file_name, output_file_name)
    else:
        for usage_d in one_word_d['usages']:
            for basic_d in usage_d['basic']:
                book_pspeech = basic_d['pspeech']
                if book_pspeech in ['vt', 'vi']:
                    book_pspeech = 'v'
                for index, dsl_pspeech in enumerate(dsl_word_d['pspeech_l']):
                    for dsl_sub_pspeech in dsl_pspeech.split(','):
                        if dsl_sub_pspeech.strip().startswith(book_pspeech):
                            source_file_name = source + '/' + dsl_word_d['phon_l'][index]
                            break
                if source_file_name:
                    output_file_name = output + '/' + word + '_' + book_pspeech + '.wav'
                    basic_d['audio'] = '[sound:' + word + '_' + book_pspeech + '.mp3]'
                    if not os.path.isfile(output_file_name) and os.path.isfile(source_file_name):
                        shutil.copy(source_file_name, output_file_name)

for word in word_d:
    add_audio_pointer(word)

# 3. Convert wav to mp3, because only mp3 files play back on phones
def convert_to_mp3():
    owd = os.getcwd()
    os.chdir(output)
    for audio in glob.glob('*.wav'):
        mp3_filename = os.path.splitext(os.path.basename(audio))[0] + '.mp3'
        if not os.path.isfile(mp3_filename):
            AudioSegment.from_file(audio).export(mp3_filename, format='mp3')
    os.chdir(owd)

convert_to_mp3()

# 4. Turn the word and mnemo dicts into importable notes
def Dict_to_Note():
    output_list = []
    join2 = '<br>  '.join
    join3 = '<br>   '.join
    join4 = '<br>    '.join
    for word in word_d:
        # 1. The word itself
        one_word_d = word_d[word]
        pos_L, pos_U = one_word_d['pos']
        pos = 'L' + ('0' + str(pos_L))[-2:] + ' U' + ('0' + str(pos_U))[-2:]
        pronon = one_word_d['phon'] + one_word_d['audio'] + ('<br>' if one_word_d['phon'] else '')

        # 2. The mnemonic part
        etym = ''
        if word in mnemo_d:
            mnemo = mnemo_d[word]
            compos = '<br>__构词__ %s<br>' % join3(mnemo['content']) if mnemo['content'] else ''
            root = '__词根__ __%s__ %s<br>' % (mnemo['root'], join3(mnemo['root_exp'])) if mnemo['root'] else ''
            cognates = '__同根__ %s<br>' % mnemo_d[word]['cognates'] if mnemo['root'] else ''
            etym = compos + root + cognates

        # 3. The usage part
        exam = ''
        for usage_index, usage in enumerate(one_word_d['usages']):
            origin = '<br>【__考法 %d__】<br>' % (usage_index+1)
            der = ''
            for one_basic in usage['basic']:
                one_pron_str = '%s %s<br>' % (one_basic['phon'], one_basic['audio']) if one_basic['phon'] else ''
                one_exp_str = '%s. %s<br>' % (one_basic['pspeech'], one_basic['exp'])
                origin += (one_pron_str + one_exp_str)
            origin += '__例__ %s<br>' % join2(usage['examples']) if usage['examples'] else ''
            origin += '__近__ %s<br>' % join2(usage['syns']) if usage['syns'] else ''
            origin += '__反__ %s<br>' % join2(usage['ants']) if usage['ants'] else ''
            for der_d in usage['der']:
                der += '__派__ %s. %s<br>' % (der_d['basic'][0]['pspeech'], der_d['basic'][0]['exp']) if der_d['basic'] else ''
                der += '  __例__ %s<br>' % join4(der_d['examples']) if der_d['examples'] else ''
                der += '  __近__ %s<br>' % join4(der_d['syns']) if der_d['syns'] else ''
                der += '  __反__ %s<br>' % join4(der_d['ants']) if der_d['ants'] else ''
            exam += (origin + der)

        one_line = [word, pos, pronon, '', etym, exam]
        one_line = [custom_html_element(x).replace('\n', '~') for x in one_line]
        output_list.append(one_line)
    return output_list

import_data = sorted(Dict_to_Note(), key=lambda x: x[1]+x[0])
with codecs.open('output/anki_import.txt', 'w', encoding='utf-8') as f:
    for one_line in import_data:
        one_string = '\t'.join(one_line) + '\n'
        f.write(one_string)
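# Illustration (made-up values): the zero padding in Dict_to_Note keeps the plain
# lexicographic sort above in true List/Unit order.
assert ('0' + str(3))[-2:] == '03' and ('0' + str(12))[-2:] == '12'
assert sorted(['L10 U02', 'L02 U10', 'L02 U09']) == ['L02 U09', 'L02 U10', 'L10 U02']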
--------------------------------------------------------------------------------
/my_helpers.py:
--------------------------------------------------------------------------------
# coding:utf-8
import re
import json
import codecs
import functools
import os.path
from random import random
from random import randint
from pprint import pprint
from copy import deepcopy
def strF2H(ustring):
    '''
    convert full width characters to half width
    input: a unicode object
    return: a unicode object
    '''
    h_ustring = ""
    assert isinstance(ustring, str)
    for uchar in ustring:
        inside_code = ord(uchar)
        if inside_code == 12288:
            # white space
            inside_code = 32
        elif 65281 <= inside_code <= 65374:
            # other characters
            inside_code -= 65248

        h_ustring += chr(inside_code)
    return h_ustring
def extract_content_between(obj_str, match_re, return_str_before_first_match=False):
    '''
    extract the content between the starts of two successive matches of a pattern in a str,
    and also the content after the last match
    input: obj_str, the string to extract content from, must be a unicode object
           match_re, the pattern to be matched
    return: a list of str
    return_str_before_first_match: whether to also return the str before the first match of the given pattern
    '''
    assert isinstance(obj_str, str)
    retype = type(re.compile(r'a str'))
    assert isinstance(match_re, retype)

    match_results_iter = match_re.finditer(obj_str)
    returned_str_l = []
    start_index = None
    end_index = None
    first_start_index = None
    for match_result in match_results_iter:
        if first_start_index is None:
            first_start_index = match_result.start()
        if not (start_index is None):
            end_index = match_result.start()
            returned_str_l.append(obj_str[start_index:end_index])
        start_index = match_result.start()
    returned_str_l.append(obj_str[start_index:])
    if return_str_before_first_match:
        returned_str_l = [obj_str[:first_start_index]] + returned_str_l
    if len(returned_str_l) == 2 and returned_str_l[0] == returned_str_l[1]:
        returned_str_l = returned_str_l[:1]
    return returned_str_l
def iter_through_general(obj_iter, path, yield_flags=True, final_yield_object=None):
    '''
    iter through an object following the given path
    yield_flags: control whether to yield the flags indicating the path at the global level
    final_yield_object: internal parameter, don't modify
    obj_iter: an iterable variable
    path: a sequence, each element has the following structure
        (how_to_iter, what_to_iter, yield_flag)
        how_to_iter: a str, accepts the following values
            'all' or 'all_values': iter through key-value pairs for a dict, and all elements for other types
                if yield_flag is True, attach the key or index to the final yield object
            'all_keys', only iter through the keys of a dict
                obj_iter must be a dict
            'key', iter through the value of a given key
                what_to_iter must be a str representing a key in obj_iter
                if yield_flag is True, attach the key to the final yield object
                ignored when obj_iter is not a dict
            'keys', iter through the values of a given set of keys
                what_to_iter must be a tuple with elements representing keys in obj_iter
                if yield_flag is True, attach the key to the final yield object
                ignored when obj_iter is not a dict
            'index', iter through a given element
                what_to_iter must be an int within bound
                if yield_flag is True, attach the index to the final yield object
                ignored when obj_iter is a dict
            'indexes', iter through the elements with the given indexes
                what_to_iter must be a list of int within bound
                if yield_flag is True, attach the key to the final yield object
                ignored when obj_iter is a dict
        what_to_iter: content decided by how_to_iter
            ignored for the following values of how_to_iter
                all, all_values, all_keys
        yield_flag: True or False
            True: depending on how_to_iter, attach different flags to the final result
            False: no flag will be yielded
            ignored for the following values of how_to_iter
                all_keys
    '''
    is_dict = isinstance(obj_iter, dict)
    if final_yield_object is None:
        final_yield_object = []
    if len(path) == 0:
        if yield_flags:
            final_yield_object.append(obj_iter)
            yield final_yield_object
        else:
            yield obj_iter
    else:
        how_to_iter, what_to_iter, yield_flag = path.pop(0)
        assert isinstance(how_to_iter, str)
        if how_to_iter in ['all', 'all_values', 'keys', 'indexes']:
            if how_to_iter in ['keys', 'indexes']:
                assert hasattr(what_to_iter, '__iter__')
                for item in what_to_iter:
                    if is_dict:
                        assert how_to_iter == 'keys'
                        assert isinstance(item, str)
                        assert item in obj_iter
                    else:
                        assert how_to_iter == 'indexes'
                        assert isinstance(item, int)
                        assert item < len(obj_iter)
                temp_iterator = ((item, obj_iter[item]) for item in what_to_iter)
            else:
                temp_iterator = iter(obj_iter.items()) if is_dict else enumerate(obj_iter)
            for flag, sub_obj_iter in temp_iterator:
                final_yield_object_copy = deepcopy(final_yield_object)
                if yield_flag:
                    final_yield_object_copy.append(flag)
                for value in iter_through_general(sub_obj_iter, deepcopy(path), yield_flags, final_yield_object_copy):
                    yield value
        elif how_to_iter == 'all_keys':
            assert is_dict
            for key in obj_iter.keys():
                if yield_flags:
                    final_yield_object.append(key)
                    yield final_yield_object
                else:
                    yield key
        elif how_to_iter in ['key', 'index']:
            if is_dict:
                assert how_to_iter == 'key'
                assert isinstance(what_to_iter, str)
                assert what_to_iter in obj_iter
            else:
                assert how_to_iter == 'index'
                assert isinstance(what_to_iter, int)
                assert what_to_iter < len(obj_iter)
            sub_obj_iter = obj_iter[what_to_iter]
            if yield_flag:
                final_yield_object.append(what_to_iter)
            for value in iter_through_general(sub_obj_iter, deepcopy(path), yield_flags, final_yield_object):
                yield value
        else:
            raise ValueError('Invalid path')
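# Example of iter_through_general on toy data (made up, not from the books):
# with path = [('all', '', True), ('key', 'words', False), ('all', '', True)],
#     iter_through_general([{'words': ['a', 'b']}], path)
# yields [0, 0, 'a'] and then [0, 1, 'b']: the index flags collected along the
# 'all' levels, followed by the leaf element itself.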
strip_white_space = lambda _str: _str.replace(' ', '')
new_line_join = lambda str_l: '\n'.join(str_l)
def codecs_open_r_utf8(file_path):
    with codecs.open(file_path, 'r', 'utf-8') as f:
        returned_str = f.read()
    return returned_str
# merge blank lines
def collapse_blank_line(base_str):
    match_double_line_feed_re = re.compile(r'\n\n|\n \n')
    while match_double_line_feed_re.search(base_str):
        base_str = match_double_line_feed_re.sub(r'\n', base_str)
    return base_str
def custom_html_element(_str):
    """
    convert the markdown notations in a string to html tags
    currently, three kinds of markdown notation exist in the strings:
    __, ** and *
    """
    formatted_str = _str
    # format double underscore
    match_double_underscore_re = re.compile('__(.*?)__')
    # replace __...__ with <u>...</u>
    formatted_str = match_double_underscore_re.sub(r'<u>\1</u>', formatted_str)
    # format double asterisk
    match_double_asterisk_re = re.compile('\*\*(.*?)\*\*')
    # replace **...** with <b>...</b>
    formatted_str = match_double_asterisk_re.sub(r'<b>\1</b>', formatted_str)
    # format single asterisk
    # replace *...* with <i>...</i>
    match_single_asterisk_re = re.compile('\*(.*?)\*')
    formatted_str = match_single_asterisk_re.sub(r'<i>\1</i>', formatted_str)
    return formatted_str
def is_file_and_json_load(file_name_str):
    # returns None when the file does not exist
    if os.path.isfile(file_name_str):
        with codecs.open(file_name_str, 'r', encoding='utf-8') as f:
            json_d = json.load(f)
        return json_d
--------------------------------------------------------------------------------