├── 排序规则表.txt ├── doc_Gen.pptx ├── result.docx ├── all_queued.xls ├── screen_queued.xls ├── default_database.xls ├── original ├── 2.文化上海云建设稳步推进.docx ├── 10.上海举办秋季阅读马拉松赛.docx ├── 4.让老百姓享受高质量文化生活.docx ├── 5.文化上海云:百姓眼里的文化淘宝.docx ├── 8.互联网+公共文化服务上海联动.docx ├── 1.大数据让上海公共文化服务更精准更便民.docx ├── 6.上海安亭:吸引社会力量参与文化建设.docx ├── 7.区域联动力推互联网+公共文化服务 .docx ├── 3.2015年上海市民文化节家庭故事大赛落幕.docx └── 9.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx ├── refined ├── 3.上海举办秋季阅读马拉松赛.docx ├── 1.让老百姓享受高质量文化生活.docx ├── 2.互联网公共文化服务上海联动.docx ├── 5.区域联动力推互联网公共文化服务.docx ├── 6.大数据让上海公共文化服务更精准更便民.docx └── 4.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx ├── screened ├── 1.让老百姓享受高质量文化生活.docx ├── 2.互联网公共文化服务上海联动.docx ├── 3.上海举办秋季阅读马拉松赛.docx ├── 5.区域联动力推互联网公共文化服务.docx ├── 6.大数据让上海公共文化服务更精准更便民.docx └── 4.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx └── docGen_bDay.py /排序规则表.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/排序规则表.txt -------------------------------------------------------------------------------- /doc_Gen.pptx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/doc_Gen.pptx -------------------------------------------------------------------------------- /result.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/result.docx -------------------------------------------------------------------------------- /all_queued.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/all_queued.xls -------------------------------------------------------------------------------- /screen_queued.xls: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/screen_queued.xls -------------------------------------------------------------------------------- /default_database.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/default_database.xls -------------------------------------------------------------------------------- /original/2.文化上海云建设稳步推进.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/2.文化上海云建设稳步推进.docx -------------------------------------------------------------------------------- /refined/3.上海举办秋季阅读马拉松赛.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/refined/3.上海举办秋季阅读马拉松赛.docx -------------------------------------------------------------------------------- /original/10.上海举办秋季阅读马拉松赛.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/10.上海举办秋季阅读马拉松赛.docx -------------------------------------------------------------------------------- /original/4.让老百姓享受高质量文化生活.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/4.让老百姓享受高质量文化生活.docx -------------------------------------------------------------------------------- /refined/1.让老百姓享受高质量文化生活.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/refined/1.让老百姓享受高质量文化生活.docx -------------------------------------------------------------------------------- /refined/2.互联网公共文化服务上海联动.docx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/refined/2.互联网公共文化服务上海联动.docx -------------------------------------------------------------------------------- /screened/1.让老百姓享受高质量文化生活.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/screened/1.让老百姓享受高质量文化生活.docx -------------------------------------------------------------------------------- /screened/2.互联网公共文化服务上海联动.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/screened/2.互联网公共文化服务上海联动.docx -------------------------------------------------------------------------------- /screened/3.上海举办秋季阅读马拉松赛.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/screened/3.上海举办秋季阅读马拉松赛.docx -------------------------------------------------------------------------------- /original/5.文化上海云:百姓眼里的文化淘宝.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/5.文化上海云:百姓眼里的文化淘宝.docx -------------------------------------------------------------------------------- /original/8.互联网+公共文化服务上海联动.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/8.互联网+公共文化服务上海联动.docx -------------------------------------------------------------------------------- /refined/5.区域联动力推互联网公共文化服务.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/refined/5.区域联动力推互联网公共文化服务.docx 
-------------------------------------------------------------------------------- /screened/5.区域联动力推互联网公共文化服务.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/screened/5.区域联动力推互联网公共文化服务.docx -------------------------------------------------------------------------------- /original/1.大数据让上海公共文化服务更精准更便民.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/1.大数据让上海公共文化服务更精准更便民.docx -------------------------------------------------------------------------------- /original/6.上海安亭:吸引社会力量参与文化建设.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/6.上海安亭:吸引社会力量参与文化建设.docx -------------------------------------------------------------------------------- /original/7.区域联动力推互联网+公共文化服务 .docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/7.区域联动力推互联网+公共文化服务 .docx -------------------------------------------------------------------------------- /refined/6.大数据让上海公共文化服务更精准更便民.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/refined/6.大数据让上海公共文化服务更精准更便民.docx -------------------------------------------------------------------------------- /screened/6.大数据让上海公共文化服务更精准更便民.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/screened/6.大数据让上海公共文化服务更精准更便民.docx -------------------------------------------------------------------------------- /original/3.2015年上海市民文化节家庭故事大赛落幕.docx: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/3.2015年上海市民文化节家庭故事大赛落幕.docx -------------------------------------------------------------------------------- /refined/4.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/refined/4.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx -------------------------------------------------------------------------------- /original/9.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/9.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx -------------------------------------------------------------------------------- /screened/4.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/screened/4.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx -------------------------------------------------------------------------------- /docGen_bDay.py: -------------------------------------------------------------------------------- 1 | # !usr/bin/env python 2 | 3 | ''' 4 | 2018-6-22 5 | B Day Special revision 6 | show me the MAGIC! 
7 | 增加功能:根据docx文件夹,生成文章汇总表 8 | 9 | 2018-6-19 10 | rev 1.0 11 | 网络媒体编辑、汇总 12 | 筛选XXX(2016)年网络媒体文章,按指定顺序排序,生成一个word 13 | ''' 14 | 15 | import os 16 | import xlrd 17 | import xlwt 18 | import docx 19 | import time 20 | import datetime 21 | import shutil 22 | import string 23 | import zipfile 24 | import re 25 | import warnings 26 | import turtle 27 | 28 | punc = string.punctuation + string.whitespace 29 | digit = string.digits 30 | refine_count = 0 31 | 32 | #根据docx文件夹,生成文章汇总表 33 | #docx文件位于original文件夹中 34 | #当前文件夹中,需有:排序规则表.txt 35 | def genDatabase(src_path = 'original', filename = 'default_database.xls'): 36 | #生成文章list 37 | origin_list = [] 38 | date_pat = re.compile('\d{4}\D\d{1,2}\D\d{1,2}') # YYYYxMMxDD 39 | for doc_name in os.listdir(src_path): 40 | doc = docx.Document(os.path.join(src_path, doc_name)) 41 | for para in doc.paragraphs[:: -1]: #搜索署名行 42 | if para.text.strip(): 43 | sch = re.search(date_pat, para.text) 44 | if sch: #如匹配到日期 45 | #日期 46 | datestr = para.text[sch.start() : sch.end()] 47 | split = [] 48 | for char in datestr: 49 | if char not in digit: 50 | split.append(char) 51 | y, m, d = time.strptime(datestr, 52 | '%Y' + '%s' % split[0] + '%m' + '%s' % split[1] + '%d')[0:3] 53 | date = datetime.datetime(y, m, d) 54 | #媒体 55 | sp = para.text.split(datestr) 56 | media = sp[0].strip() if sp[0].strip() else sp[1].strip() 57 | if not media: #如媒体字段为空 58 | media = input('\n[%s]\nNO media name found. Please input a media name: ' 59 | % doc_name).strip() 60 | while not media: 61 | media = input('Media is missing. Input again: ').strip() 62 | warnings.warn('User generated [media:%s],\ 63 | may NOT match [%s]' % (media, doc_name)) 64 | else: #如未匹配到日期 65 | if input('[%s]\nSigniture NOT found in the ending line. Discard this doc? 
Y/ N' 66 | % doc_name).strip()[0].lower() == 'y': 67 | break 68 | else: 69 | date = datetime.datetime(time.strptime( 70 | input('Please input date (YYYY-MM-DD): ').strip(), '%Y-%m-%d')[0:3]) 71 | media = input('Please input media: ').strip() 72 | warnings.warn('User generated [date:%s] and [media:%s],\ 73 | may NOT match [%s]' % (date, media, doc_name)) 74 | origin_list.append([media, date, doc_name]) 75 | break 76 | #按文章编号排序 77 | for i in range(len(origin_list))[:0:-1]: #倒序搜索的重点为索引1,索引0不可再搜索: 78 | for j in range(len(origin_list))[i - 1::-1]: #因为内循环变成-1起始(相当于从end完整搜索) 79 | num_i = eval(origin_list[i][2].split('.')[0]) #每次内循环,num_i必须重算 80 | num_j = eval(origin_list[j][2].split('.')[0]) 81 | if not isinstance(num_i, int): 82 | origin_list.append(origin_list.pop(i)) 83 | break 84 | elif not isinstance(num_j, int): 85 | origin_list.append(origin_list.pop(j)) 86 | continue 87 | if num_i < num_j: 88 | origin_list[i], origin_list[j] = origin_list[j], origin_list[i] 89 | #去除文章标题的序号和后缀名 90 | for item in origin_list: 91 | item[2] = item[2].split('.')[1] 92 | 93 | #生成排序规则list 94 | seq_list = [] 95 | seq_path = u'排序规则表.txt' 96 | while not os.path.exists(seq_path): 97 | input('Please copy [%s] to the root dir. When finished press Enter...') 98 | with open(u'排序规则表.txt') as f: 99 | for line in f: 100 | seq_list.append(line.strip()) 101 | 102 | #生成文章汇总表 103 | genXls(origin_list, filename, seq_list) 104 | 105 | 106 | #读取文章汇总表,并按媒体、日期排序 107 | #媒体升序排列 108 | #日期默认升序,可选降序 109 | #文章汇总表: 110 | #为.xlsx文件,或默认标题default_database.xls 111 | #sheet 1,第2列媒体,第3列日期,第4列标题,从第2行开始 112 | #sheet 2,第1列媒体排序规则,从第2行开始 113 | def getDatabase(filename = r'网络媒体汇总.xlsx', date_reverse = False): 114 | if not os.path.exists(filename): 115 | for name in os.listdir(): 116 | if name.endswith('.xlsx') or name == 'default_database.xls': 117 | filename = name 118 | break 119 | else: 120 | print('NO database found. 
Now generating...') 121 | filename = 'default_database.xls' 122 | genDatabase('original', filename) 123 | wb = xlrd.open_workbook(filename) 124 | 125 | #读取媒体文章汇总表 126 | w_sheet = wb.sheet_by_index(0) 127 | w_list = [] 128 | for i in range(1, w_sheet.nrows): 129 | row = w_sheet.row_values(i, 1, 4) 130 | for i in range(len(row)): 131 | if isinstance(row[i], str): 132 | row[i] = row[i].strip().strip('《').strip('》') 133 | if isinstance(row[1], str): #处理日期列 134 | row[1] = row[1].split('(')[0].split('(')[0] 135 | split = [] 136 | for char in row[1]: 137 | if char not in digit: 138 | split.append(char) 139 | if len(split) == 2: # YYYYxMMxDD 140 | y, m, d = time.strptime(row[1], 141 | '%Y' + '%s' % split[0] + '%m' + '%s' % split[1] + '%d')[0:3] 142 | elif len(split) == 1: # YYYYxMM 143 | y, m, d = time.strptime(row[1], '%Y' + '%s' % split[0] + '%m')[0:3] 144 | else: 145 | raise Exception('date format error@ ', row) 146 | row[1] = datetime.datetime(y, m, d) 147 | else: 148 | row[1] = xlrd.xldate_as_datetime(row[1],0) 149 | w_list.append(row) 150 | 151 | #读取媒体名称排序规则表 152 | s_sheet = wb.sheet_by_index(1) 153 | s_list = [] 154 | for i in range(1, s_sheet.nrows): 155 | s_list.append(s_sheet.cell_value(i, 0).strip().strip('《').strip('》')) 156 | 157 | #按媒体名称排序 158 | for s_item in s_list: 159 | for i in range(len(w_list))[::-1]: #倒序搜索,避免前序操作改变后续索引位置 160 | if w_list[i][0] == s_item: 161 | w_list.append(w_list.pop(i)) 162 | for i in range(len(w_list))[::-1]: #将不包含在排序规则表中的媒体内容排到最后 163 | if w_list[i][0] not in s_list: 164 | w_list.append(w_list.pop(i)) 165 | 166 | #按日期排序 167 | for i in range(len(w_list))[:0:-1]: 168 | for j in range(len(w_list))[i - 1::-1]: 169 | if w_list[j][0] == w_list[i][0]: 170 | if date_reverse: #降序 171 | if w_list[j][1] < w_list[i][1]: 172 | w_list[i], w_list[j] = w_list[j], w_list[i] 173 | else: #升序 174 | if w_list[j][1] > w_list[i][1]: 175 | w_list[i], w_list[j] = w_list[j], w_list[i] 176 | 177 | print('[%s] loaded' % filename) 178 | return w_list 179 | 180 | 
# "imageN" number inside a docx media member name.
_IMAGE_NO = re.compile(r'image(\d+)')
# Leading sequence number of an article file name.
_FILE_NO = re.compile(r'\s*(\d+)')
# ASCII digits; a comma/period that follows one of them is left untouched.
_ASCII_DIGITS = frozenset(string.digits)


def _comparable(text):
    """Return ``text`` without ASCII punctuation/whitespace, for matching."""
    return ''.join(ch for ch in str(text) if ch not in punc).strip()


def _fileOrder(name):
    """Sort key: numbered names by their number, all other names afterwards."""
    found = _FILE_NO.match(name)
    return (0, int(found.group(1)), name) if found else (1, 0, name)


def _docxNames(folder):
    """Return the .docx file names of ``folder`` in article-number order."""
    names = [name for name in os.listdir(folder)
             if name.lower().endswith('.docx') and not name.startswith('~$')]
    return sorted(names, key=_fileOrder)


def _loadImages(docx_path):
    """Return the bytes of every word/media/image* member of a docx file.

    Members are ordered by their image number.
    NOTE(review): this assumes image numbers follow the order of the drawings
    in the document, as the original code implicitly did - confirm.
    """
    def by_number(member):
        found = _IMAGE_NO.search(member)
        return (int(found.group(1)) if found else 0, member)

    with zipfile.ZipFile(docx_path) as package:
        members = sorted((member for member in package.namelist()
                          if member.startswith('word/media/image')),
                         key=by_number)
        return [package.read(member) for member in members]


# Select the articles of year XX (2016)
def scrDocx(article_list, year = 2016, from_path = 'original', to_path = 'screened'):
    """Copy the source files of the articles dated ``year`` into ``to_path``.

    ``article_list`` holds ``[media, datetime, title]`` rows.  The selection
    is also written to screen_queued.xls.  ``to_path`` is deleted and
    recreated.  A file matches an article when the article title, stripped of
    punctuation, occurs in the stripped file name; copies are renamed to
    "<position>.<title><ext>".  Articles without a matching file are printed.
    """
    screen_list = [item for item in article_list if item[1].year == year]
    genXls(screen_list, dst_path = 'screen_queued.xls')

    if os.path.exists(to_path):
        shutil.rmtree(to_path)
    os.mkdir(to_path)

    # Walk the source tree once instead of once per article.
    candidates = []
    for dirpath, _dirnames, filenames in os.walk(from_path):
        for filename in filenames:
            candidates.append((os.path.join(dirpath, filename),
                               _comparable(os.path.splitext(filename)[0])))

    miss_count = 0
    for position, item in enumerate(screen_list, start=1):
        wanted = _comparable(item[2])
        # An empty title would match every file, so it counts as a miss.
        hits = [path for path, stem in candidates if wanted and wanted in stem]
        for path in hits:
            new_name = str(position) + '.' + wanted + os.path.splitext(path)[1]
            shutil.copy(path, os.path.join(to_path, new_name))
        if not hits:
            miss_count += 1
            print('not found %2d: %d\t%s' % (miss_count, position, item))

    print('\n %d docs copied to folder [%s]' % (len(screen_list) - miss_count, to_path))


# Write an xls file
# my_list goes to Sheet1
# s_list (ordering rules, if any) goes to Sheet2
def genXls(my_list, dst_path = 'result.xls', s_list = None):
    """Write ``my_list`` (rows of media, date, title) to the workbook ``dst_path``.

    Sheet1 gets a header row plus one numbered row per record; an empty
    ``my_list`` yields a header-only sheet instead of raising IndexError.
    When ``s_list`` is given and non-empty it is written to Sheet2.
    """
    wb = xlwt.Workbook()
    ws = wb.add_sheet('Sheet1')
    date_style = xlwt.XFStyle()
    date_style.num_format_str = 'yyyy/mm/dd'

    for col, caption in enumerate(['序号', '媒体', '日期', '标题']):
        ws.write(0, col, caption)

    for row_no, record in enumerate(my_list, start=1):
        ws.write(row_no, 0, row_no)
        for col, value in enumerate(record, start=1):
            if isinstance(value, datetime.date):  # the date format is for dates only
                ws.write(row_no, col, value, date_style)
            else:
                ws.write(row_no, col, value)

    if s_list:
        ws1 = wb.add_sheet('Sheet2')
        ws1.write(0, 0, '排序规则')
        for row_no, rule in enumerate(s_list, start=1):
            ws1.write(row_no, 0, rule)

    wb.save(dst_path)
    print('[%s] generated' % dst_path)


# Create the refined docx files
def refDocx(src_dir = 'screened', refine_folder = 'refined'):
    """Rebuild every docx of ``src_dir`` with uniform formatting.

    ``refine_folder`` is deleted and recreated.  For each document the text
    runs are passed through refText(), every picture becomes a centred
    paragraph of its own (13 cm wide), the first text paragraph is formatted
    as the title, the last text paragraph as the right-aligned signature, and
    two blank paragraphs are appended.
    """
    import io  # add_picture() accepts a file-like object, so no temp files

    if os.path.exists(refine_folder):
        shutil.rmtree(refine_folder)
    os.mkdir(refine_folder)

    for doc_name in _docxNames(src_dir):
        src_file = os.path.join(src_dir, doc_name)
        old_doc = docx.Document(src_file)
        new_doc = docx.Document()
        setdocStyle(new_doc.styles['Normal'])  # document-wide format

        images = None  # loaded lazily, on the first drawing
        fig_n = 0      # index of the next picture to insert
        for para in old_doc.paragraphs:
            has_text = bool(para.text.strip())
            has_figure = any(run.element.drawing_lst for run in para.runs)
            # Picture-only paragraphs must be processed too; skipping them
            # desynchronised fig_n from the pictures that follow.
            if not (has_text or has_figure):
                continue
            new_para = new_doc.add_paragraph() if has_text else None
            for run in para.runs:
                text = run.text.strip()
                if text:  # copy non-empty text, corrected by refText()
                    if new_para is None:
                        new_para = new_doc.add_paragraph()
                    new_para.add_run(refText(text, doc_name))
                # doc.inline_shapes misses non-inline pictures, so drawings
                # are detected on the run element itself.
                for _drawing in run.element.drawing_lst:
                    if images is None:
                        images = _loadImages(src_file)
                    if fig_n >= len(images):  # drawing without an image file
                        break
                    fig_para = new_doc.add_paragraph()  # one picture per paragraph
                    fig_para.paragraph_format.first_line_indent = 0  # no indent
                    fig_para.alignment = 1  # centred
                    fig_para.add_run().add_picture(
                        io.BytesIO(images[fig_n]),
                        width = docx.shared.Cm(13))  # default picture width 13 cm
                    fig_n += 1

        # Format the refined document.
        paragraphs = new_doc.paragraphs
        title_idx = next((idx for idx, para in enumerate(paragraphs)
                          if para.text.strip()), None)
        if title_idx is not None:
            title = paragraphs[title_idx]
            title.paragraph_format.first_line_indent = 0  # title: no indent
            title.alignment = 1                           # title: centred
            title.runs[0].font.bold = True                # title: bold
            # Blank line after the title.
            if title_idx + 1 < len(paragraphs):
                paragraphs[title_idx + 1].insert_paragraph_before()
            else:
                new_doc.add_paragraph()
            # Signature: last text paragraph (trailing pictures are skipped).
            for para in reversed(paragraphs):
                if para.text.strip():
                    if para is not title:
                        para.alignment = 2  # right-aligned
                    break
        new_doc.add_paragraph()  # two blank lines at the end
        new_doc.add_paragraph()
        new_doc.save(os.path.join(refine_folder, doc_name))

    print('\nTotally %d refined.\n%d docs generated in [%s].'
          % (refine_count, len(os.listdir(refine_folder)), refine_folder))


# Create the merged docx file
def genDocx(src_dir = 'refined', dst_path = 'result.docx'):
    """Merge the documents of ``src_dir`` into the single file ``dst_path``.

    Documents are appended in article-number order (1, 2, ..., 10), not in
    directory-listing order.  Pictures keep the size they have in the source
    document.
    """
    import io  # add_picture() accepts a file-like object, so no temp files

    dst_doc = docx.Document()
    setdocStyle(dst_doc.styles['Normal'])  # document-wide format

    for doc_name in _docxNames(src_dir):
        src_file = os.path.join(src_dir, doc_name)
        refined_doc = docx.Document(src_file)
        src_paras = refined_doc.paragraphs
        if not src_paras:  # empty document
            continue
        shapes = refined_doc.inline_shapes
        images = _loadImages(src_file) if len(shapes) else []
        fig_n = 0  # index of the next picture to insert

        # Title paragraph
        title_para = dst_doc.add_paragraph(src_paras[0].text)
        title_para.paragraph_format.first_line_indent = 0  # no indent
        title_para.alignment = 1                           # centred
        if title_para.runs:                                # empty title: no run
            title_para.runs[0].font.bold = True

        # Remaining paragraphs
        body_paras = []
        for para in src_paras[1:]:
            other_para = dst_doc.add_paragraph()
            body_paras.append(other_para)
            for run in para.runs:
                new_run = other_para.add_run(run.text)
                for _drawing in run.element.drawing_lst:
                    if fig_n >= len(images) or fig_n >= len(shapes):
                        break
                    other_para.paragraph_format.first_line_indent = 0  # no indent
                    other_para.alignment = 1  # centred
                    new_run.add_picture(io.BytesIO(images[fig_n]),
                                        shapes[fig_n].width,
                                        shapes[fig_n].height)
                    fig_n += 1

        # Signature: last text paragraph of THIS article
        # (trailing pictures and blank lines are skipped).
        for para in reversed(body_paras):
            if para.text.strip():
                para.alignment = 2  # right-aligned
                break

    dst_doc.save(dst_path)
    print('\nSummarized doc [%s] generated.' % dst_path)


# Text clean-up
#   replace ASCII quotes by opening / closing Chinese quotes
#   replace ASCII commas and periods by their Chinese counterparts
def refText(text, pathname, output = False):
    """Return ``text`` with ASCII quotes, commas and periods made Chinese.

    Quotes alternate between the opening and the closing form.  A comma or
    period directly after an ASCII digit (decimal point, thousands separator,
    list number) or after another '.' is kept.  For a run of dots the user is
    asked on stdin whether it is right and, if not, what to put in its place.
    Every correction increments the global ``refine_count``.

    pathname -- only used in the report printed when ``output`` is true.
    """
    global refine_count
    temp_text = text

    # Quotes: (ASCII, opening, closing)
    quote_revised = False
    quote_forms = (("'", '\u2018', '\u2019'), ('"', '\u201c', '\u201d'))
    for plain, opening, closing in quote_forms:
        expect_opening = True  # single and double quotes alternate separately
        pieces = []
        for ch in temp_text:
            if ch == plain:
                quote_revised = True
                refine_count += 1
                pieces.append(opening if expect_opening else closing)
                expect_opening = not expect_opening
            else:
                pieces.append(ch)
        temp_text = ''.join(pieces)
    if quote_revised and output:
        print('\n[%s]\nOriginal:\t%s' % (pathname, text))
        print('quote revised:\t%s' % temp_text)

    # Commas and periods: (ASCII, full-width comma U+FF0C / ideographic stop)
    comma_period_revised = False
    for plain, wide in ((',', '\uff0c'), ('.', '\u3002')):
        # Dot-run corrections are collected for the whole pass and applied
        # afterwards; the original reset these lists on every character, so
        # the user's correction was never applied.
        pending = []
        for i in range(len(temp_text)):
            if temp_text[i] != plain:
                continue
            # No wrap-around to the last character when i == 0.
            prev_ch = temp_text[i - 1] if i > 0 else ''
            next_ch = temp_text[i + 1] if i + 1 < len(temp_text) else ''
            if prev_ch in _ASCII_DIGITS:
                continue  # decimal point, thousands separator, list number
            if prev_ch == '.':
                continue  # inside a dot run that was already handled
            if next_ch == '.':  # start of a dot run
                stop = i + 1
                while stop < len(temp_text) and temp_text[stop] == '.':
                    stop += 1  # a run may reach the end of the text
                dots = temp_text[i:stop]
                print('\n%s (%s) %s' % (temp_text[:i], dots, temp_text[stop:]))
                if input('is (%s) right? Y/N ' % dots).strip().lower() != 'y':
                    pending.append((i, stop, input(
                        'Please input the right content to replace (%s): ' % dots)))
                    refine_count += 1
                continue
            refine_count += 1
            comma_period_revised = True
            temp_text = temp_text[:i] + wide + temp_text[i + 1:]
        # Apply from the back so the earlier indexes stay valid.
        for start, stop, replacement in reversed(pending):
            temp_text = temp_text[:start] + replacement + temp_text[stop:]
    if comma_period_revised and output:
        if not quote_revised:
            print('\n[%s]\nOriginal:\t%s' % (pathname, text))
        print('comma / period revised:\t%s' % temp_text)

    return temp_text


# Document-wide format
def setdocStyle(doc_style_obj,
                font_name = 'Times New Roman',  # font name
                font_size = None,               # font size, default 10.5 pt
                first_line_indent_n = 2,        # first line indent: 2 characters
                space_before = 0,               # space before paragraph: 0
                space_after = 0,                # space after paragraph: 0
                line_spacing = 1.0):            # single line spacing
    """Apply font and paragraph settings to the style ``doc_style_obj``.

    ``font_name`` and ``font_size`` used to be ignored in favour of the
    hard-coded 'Times New Roman' / 10.5 pt; they are honoured now and default
    to those values, so calls without them behave as before.  The first line
    indent is ``first_line_indent_n`` times the font size.
    """
    if font_size is None:
        font_size = docx.shared.Pt(10.5)
    doc_style_obj.font.name = font_name
    doc_style_obj.font.size = font_size
    doc_style_obj.paragraph_format.first_line_indent = \
        doc_style_obj.font.size * first_line_indent_n
    doc_style_obj.paragraph_format.space_before = space_before
    doc_style_obj.paragraph_format.space_after = space_after
    doc_style_obj.paragraph_format.line_spacing = line_spacing


# Happy birthday
def love():
    """Draw a birthday heart with turtle; clicking the turtle runs main()."""
    def func(x, y):
        main()

    turtle.title('喵喵子程序')
    lv = turtle.Turtle()
    lv.hideturtle()
    lv.getscreen().bgcolor('light blue')
    lv.color('yellow', 'red')
    lv.pensize(1)
    lv.speed(1)
    lv.up()
    lv.goto(0, -150)
    # Draw the heart: left half first, then its mirror image.
    lv.down()
    lv.begin_fill()
    lv.goto(0, -150)
    lv.goto(-175.12, -8.59)
    lv.left(140)
    pos = []
    for _step in range(19):
        lv.right(10)
        lv.forward(20)
        pos.append((-lv.pos()[0], lv.pos()[1]))  # mirrored point
    for item in pos[::-1]:
        lv.goto(item)
    lv.goto(175.12, -8.59)
    lv.goto(0, -150)
    lv.left(50)
    lv.end_fill()
    # Write the greeting.
    lv.up()
    lv.goto(0, 80)
    lv.down()
    lv.write("喵喵", font=(u"方正舒体", 36, "normal"), align="center")
    lv.up()
    lv.goto(0, 0)
    lv.down()
    lv.write("生日快乐!", font=(u"方正舒体", 48, "normal"), align="center")
    lv.up()
    lv.goto(100, -210)
    lv.down()
    lv.write("点我见证奇迹", font=(u"华文琥珀", 26, "bold"), align="right")
    lv.up()
    lv.goto(160, -190)
    lv.resizemode('user')
    lv.shapesize(4, 4, 10)
    lv.color('red', 'red')
    lv.onclick(func)
    lv.showturtle()
    # Without the event loop the window closes as soon as the script ends,
    # before the turtle can be clicked.
    turtle.mainloop()


def main():
    """Run the whole pipeline: load, screen (2016), refine and merge."""
    w_list = getDatabase(filename = r'网络媒体汇总.xlsx', date_reverse = True)
    genXls(w_list, dst_path = 'all_queued.xls')
    scrDocx(w_list, year = 2016, from_path = 'original', to_path = 'screened')
    refDocx(src_dir = 'screened', refine_folder = 'refined')
    genDocx(src_dir = 'refined', dst_path = 'result.docx')


if __name__ == '__main__':
    if datetime.date.today() == datetime.date(2018, 6, 22):
        love()
    else:
        main()