├── 排序规则表.txt
├── doc_Gen.pptx
├── result.docx
├── all_queued.xls
├── screen_queued.xls
├── default_database.xls
├── original
    ├── 2.文化上海云建设稳步推进.docx
    ├── 10.上海举办秋季阅读马拉松赛.docx
    ├── 4.让老百姓享受高质量文化生活.docx
    ├── 5.文化上海云：百姓眼里的文化淘宝.docx
    ├── 8.互联网+公共文化服务上海联动.docx
    ├── 1.大数据让上海公共文化服务更精准更便民.docx
    ├── 6.上海安亭：吸引社会力量参与文化建设.docx
    ├── 7.区域联动力推互联网+公共文化服务 .docx
    ├── 3.2015年上海市民文化节家庭故事大赛落幕.docx
    └── 9.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx
├── refined
    ├── 3.上海举办秋季阅读马拉松赛.docx
    ├── 1.让老百姓享受高质量文化生活.docx
    ├── 2.互联网公共文化服务上海联动.docx
    ├── 5.区域联动力推互联网公共文化服务.docx
    ├── 6.大数据让上海公共文化服务更精准更便民.docx
    └── 4.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx
├── screened
    ├── 1.让老百姓享受高质量文化生活.docx
    ├── 2.互联网公共文化服务上海联动.docx
    ├── 3.上海举办秋季阅读马拉松赛.docx
    ├── 5.区域联动力推互联网公共文化服务.docx
    ├── 6.大数据让上海公共文化服务更精准更便民.docx
    └── 4.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx
└── docGen_bDay.py


/排序规则表.txt:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/排序规则表.txt


--------------------------------------------------------------------------------
/doc_Gen.pptx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/doc_Gen.pptx


--------------------------------------------------------------------------------
/result.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/result.docx


--------------------------------------------------------------------------------
/all_queued.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/all_queued.xls


--------------------------------------------------------------------------------
/screen_queued.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/screen_queued.xls


--------------------------------------------------------------------------------
/default_database.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/default_database.xls


--------------------------------------------------------------------------------
/original/2.文化上海云建设稳步推进.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/2.文化上海云建设稳步推进.docx


--------------------------------------------------------------------------------
/refined/3.上海举办秋季阅读马拉松赛.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/refined/3.上海举办秋季阅读马拉松赛.docx


--------------------------------------------------------------------------------
/original/10.上海举办秋季阅读马拉松赛.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/10.上海举办秋季阅读马拉松赛.docx


--------------------------------------------------------------------------------
/original/4.让老百姓享受高质量文化生活.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/4.让老百姓享受高质量文化生活.docx


--------------------------------------------------------------------------------
/refined/1.让老百姓享受高质量文化生活.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/refined/1.让老百姓享受高质量文化生活.docx


--------------------------------------------------------------------------------
/refined/2.互联网公共文化服务上海联动.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/refined/2.互联网公共文化服务上海联动.docx


--------------------------------------------------------------------------------
/screened/1.让老百姓享受高质量文化生活.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/screened/1.让老百姓享受高质量文化生活.docx


--------------------------------------------------------------------------------
/screened/2.互联网公共文化服务上海联动.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/screened/2.互联网公共文化服务上海联动.docx


--------------------------------------------------------------------------------
/screened/3.上海举办秋季阅读马拉松赛.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/screened/3.上海举办秋季阅读马拉松赛.docx


--------------------------------------------------------------------------------
/original/5.文化上海云：百姓眼里的文化淘宝.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/5.文化上海云：百姓眼里的文化淘宝.docx


--------------------------------------------------------------------------------
/original/8.互联网+公共文化服务上海联动.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/8.互联网+公共文化服务上海联动.docx


--------------------------------------------------------------------------------
/refined/5.区域联动力推互联网公共文化服务.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/refined/5.区域联动力推互联网公共文化服务.docx


--------------------------------------------------------------------------------
/screened/5.区域联动力推互联网公共文化服务.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/screened/5.区域联动力推互联网公共文化服务.docx


--------------------------------------------------------------------------------
/original/1.大数据让上海公共文化服务更精准更便民.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/1.大数据让上海公共文化服务更精准更便民.docx


--------------------------------------------------------------------------------
/original/6.上海安亭：吸引社会力量参与文化建设.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/6.上海安亭：吸引社会力量参与文化建设.docx


--------------------------------------------------------------------------------
/original/7.区域联动力推互联网+公共文化服务 .docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/7.区域联动力推互联网+公共文化服务 .docx


--------------------------------------------------------------------------------
/refined/6.大数据让上海公共文化服务更精准更便民.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/refined/6.大数据让上海公共文化服务更精准更便民.docx


--------------------------------------------------------------------------------
/screened/6.大数据让上海公共文化服务更精准更便民.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/screened/6.大数据让上海公共文化服务更精准更便民.docx


--------------------------------------------------------------------------------
/original/3.2015年上海市民文化节家庭故事大赛落幕.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/3.2015年上海市民文化节家庭故事大赛落幕.docx


--------------------------------------------------------------------------------
/refined/4.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/refined/4.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx


--------------------------------------------------------------------------------
/original/9.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/original/9.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx


--------------------------------------------------------------------------------
/screened/4.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/zz-icoding/doc_batch_process/HEAD/screened/4.车轮上的图书馆——上海市嘉定区安亭镇文化志愿者为百姓配送个性图书.docx


--------------------------------------------------------------------------------
/docGen_bDay.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
  2 | 
  3 | '''
  4 | 2018-6-22
  5 | B Day Special revision
  6 | show me the MAGIC!
  7 | 增加功能：根据docx文件夹，生成文章汇总表
  8 | 
  9 | 2018-6-19
 10 | rev 1.0
 11 | 网络媒体编辑、汇总
 12 | 筛选XXX(2016)年网络媒体文章，按指定顺序排序，生成一个word
 13 | '''
 14 | 
 15 | import os
 16 | import xlrd
 17 | import xlwt
 18 | import docx
 19 | import time
 20 | import datetime
 21 | import shutil
 22 | import string
 23 | import zipfile
 24 | import re
 25 | import warnings
 26 | import turtle
 27 | 
 28 | punc = string.punctuation + string.whitespace
 29 | digit = string.digits
 30 | refine_count = 0
 31 | 
 32 | #根据docx文件夹，生成文章汇总表
 33 | #docx文件位于original文件夹中
 34 | #当前文件夹中，需有：排序规则表.txt
 35 | def genDatabase(src_path = 'original', filename = 'default_database.xls'):
 36 |     #生成文章list
 37 |     origin_list = []
 38 |     date_pat = re.compile('\d{4}\D\d{1,2}\D\d{1,2}') # YYYYxMMxDD
 39 |     for doc_name in os.listdir(src_path):
 40 |         doc = docx.Document(os.path.join(src_path, doc_name))
 41 |         for para in doc.paragraphs[:: -1]: #搜索署名行
 42 |             if para.text.strip():
 43 |                 sch = re.search(date_pat, para.text)
 44 |                 if sch: #如匹配到日期
 45 |                     #日期
 46 |                     datestr = para.text[sch.start() : sch.end()]
 47 |                     split = []
 48 |                     for char in datestr:
 49 |                         if char not in digit:
 50 |                             split.append(char)
 51 |                     y, m, d = time.strptime(datestr,
 52 |                                             '%Y' + '%s' % split[0] + '%m' + '%s' % split[1] + '%d')[0:3]
 53 |                     date = datetime.datetime(y, m, d)
 54 |                     #媒体
 55 |                     sp = para.text.split(datestr)
 56 |                     media = sp[0].strip() if sp[0].strip() else sp[1].strip()
 57 |                     if not media: #如媒体字段为空
 58 |                         media = input('\n[%s]\nNO media name found. Please input a media name: '
 59 |                                       % doc_name).strip()
 60 |                         while not media:
 61 |                            media = input('Media is missing. Input again: ').strip()
 62 |                         warnings.warn('User generated [media:%s],\
 63 |                                       may NOT match [%s]' % (media, doc_name))
 64 |                 else: #如未匹配到日期
 65 |                     if input('[%s]\nSigniture NOT found in the ending line. Discard this doc? Y/ N'
 66 |                              % doc_name).strip()[0].lower() == 'y':
 67 |                         break
 68 |                     else:
 69 |                         date = datetime.datetime(time.strptime(
 70 |                             input('Please input date (YYYY-MM-DD): ').strip(), '%Y-%m-%d')[0:3])
 71 |                         media = input('Please input media: ').strip()
 72 |                         warnings.warn('User generated [date:%s] and [media:%s],\
 73 |                                       may NOT match [%s]' % (date, media, doc_name))
 74 |                 origin_list.append([media, date, doc_name])
 75 |                 break
 76 |     #按文章编号排序
 77 |     for i in range(len(origin_list))[:0:-1]: #倒序搜索的重点为索引1，索引0不可再搜索：
 78 |         for j in range(len(origin_list))[i - 1::-1]: #因为内循环变成-1起始(相当于从end完整搜索)
 79 |             num_i = eval(origin_list[i][2].split('.')[0]) #每次内循环，num_i必须重算
 80 |             num_j = eval(origin_list[j][2].split('.')[0])
 81 |             if not isinstance(num_i, int):
 82 |                 origin_list.append(origin_list.pop(i))
 83 |                 break
 84 |             elif not isinstance(num_j, int):
 85 |                 origin_list.append(origin_list.pop(j))
 86 |                 continue
 87 |             if num_i < num_j:
 88 |                 origin_list[i], origin_list[j] = origin_list[j], origin_list[i]
 89 |     #去除文章标题的序号和后缀名
 90 |     for item in origin_list: 
 91 |         item[2] = item[2].split('.')[1]
 92 |             
 93 |     #生成排序规则list
 94 |     seq_list = []
 95 |     seq_path = u'排序规则表.txt'
 96 |     while not os.path.exists(seq_path):
 97 |         input('Please copy [%s] to the root dir. When finished press Enter...')
 98 |     with open(u'排序规则表.txt') as f:
 99 |         for line in f:
100 |             seq_list.append(line.strip())
101 | 
102 |     #生成文章汇总表
103 |     genXls(origin_list, filename, seq_list)
104 | 
105 | 
106 | #读取文章汇总表，并按媒体、日期排序
107 | #媒体升序排列
108 | #日期默认升序，可选降序
109 | #文章汇总表：
110 | #为.xlsx文件，或默认标题default_database.xls
111 | #sheet 1，第2列媒体，第3列日期，第4列标题，从第2行开始
112 | #sheet 2，第1列媒体排序规则，从第2行开始
113 | def getDatabase(filename = r'网络媒体汇总.xlsx', date_reverse = False):
114 |     if not os.path.exists(filename):
115 |         for name in os.listdir():
116 |             if name.endswith('.xlsx') or name == 'default_database.xls':
117 |                 filename = name
118 |                 break
119 |         else:
120 |             print('NO database found. Now generating...')
121 |             filename = 'default_database.xls'
122 |             genDatabase('original', filename)
123 |     wb = xlrd.open_workbook(filename)
124 | 
125 |     #读取媒体文章汇总表
126 |     w_sheet = wb.sheet_by_index(0)
127 |     w_list = []
128 |     for i in range(1, w_sheet.nrows):
129 |         row = w_sheet.row_values(i, 1, 4)
130 |         for i in range(len(row)):
131 |             if isinstance(row[i], str):
132 |                 row[i] = row[i].strip().strip('《').strip('》')
133 |         if isinstance(row[1], str): #处理日期列
134 |             row[1] = row[1].split('（')[0].split('(')[0]
135 |             split = []
136 |             for char in row[1]:
137 |                 if char not in digit:
138 |                     split.append(char)
139 |             if len(split) == 2: # YYYYxMMxDD
140 |                 y, m, d = time.strptime(row[1],
141 |                                         '%Y' + '%s' % split[0] + '%m' + '%s' % split[1] + '%d')[0:3]
142 |             elif len(split) == 1: # YYYYxMM
143 |                 y, m, d = time.strptime(row[1], '%Y' + '%s' % split[0] + '%m')[0:3]
144 |             else:
145 |                 raise Exception('date format error@ ', row)
146 |             row[1] = datetime.datetime(y, m, d)
147 |         else:
148 |             row[1] = xlrd.xldate_as_datetime(row[1],0)
149 |         w_list.append(row)
150 | 
151 |     #读取媒体名称排序规则表
152 |     s_sheet = wb.sheet_by_index(1)
153 |     s_list = []
154 |     for i in range(1, s_sheet.nrows):
155 |         s_list.append(s_sheet.cell_value(i, 0).strip().strip('《').strip('》'))
156 | 
157 |     #按媒体名称排序
158 |     for s_item in s_list:
159 |         for i in range(len(w_list))[::-1]: #倒序搜索，避免前序操作改变后续索引位置
160 |             if w_list[i][0] == s_item:
161 |                 w_list.append(w_list.pop(i))
162 |     for i in range(len(w_list))[::-1]: #将不包含在排序规则表中的媒体内容排到最后
163 |         if w_list[i][0] not in s_list:
164 |             w_list.append(w_list.pop(i))
165 | 
166 |     #按日期排序
167 |     for i in range(len(w_list))[:0:-1]:
168 |         for j in range(len(w_list))[i - 1::-1]:
169 |             if w_list[j][0] == w_list[i][0]:
170 |                 if date_reverse: #降序
171 |                     if w_list[j][1] < w_list[i][1]:
172 |                         w_list[i], w_list[j] = w_list[j], w_list[i]                    
173 |                 else: #升序
174 |                     if w_list[j][1] > w_list[i][1]:
175 |                         w_list[i], w_list[j] = w_list[j], w_list[i]
176 | 
177 |     print('[%s] loaded' % filename)
178 |     return w_list
179 | 
180 | 
181 | #筛选XX(2016)年文章
182 | def scrDocx(article_list, year = 2016, from_path = 'original', to_path = 'screened'):
183 |     screen_list = []
184 |     for item in article_list:
185 |         if item[1].year == year:
186 |             screen_list.append(item)
187 |     genXls(screen_list, dst_path = 'screen_queued.xls')
188 | 
189 |     if os.path.exists(to_path):
190 |         shutil.rmtree(to_path)
191 |     os.mkdir(to_path)
192 |     miss_count = 0
193 |     #miss_list = []
194 |     for i in range(len(screen_list)):
195 |         miss = True
196 |         mod_screen_name = screen_list[i][2] #去除特殊字符，便于比较
197 |         for j in range(len(mod_screen_name))[::-1]: #倒序搜索
198 |             if mod_screen_name[j] in punc:
199 |                 mod_screen_name = mod_screen_name[:j] + mod_screen_name[j + 1:] + ' '
200 |                 #末尾加空格，以确保索引正确
201 |         mod_screen_name = mod_screen_name.strip()
202 |         for dirpath, dirnames, filenames in os.walk(from_path):
203 |             for filename in filenames:
204 |                 original_pathname = os.path.join(dirpath, filename)
205 |                 mod_filename = filename.rpartition('.')[0] #去除特殊字符，便于比较
206 |                 for j in range(len(mod_filename))[::-1]: #倒序搜索
207 |                     if mod_filename[j] in punc:
208 |                         mod_filename = mod_filename[:j] + mod_filename[j + 1:] + ' '
209 |                 mod_filename = mod_filename.strip()
210 |                 if mod_screen_name in mod_filename:
211 |                     new_name = str(i + 1) + '.' + mod_screen_name + \
212 |                                os.path.splitext(original_pathname)[1]
213 |                     new_pathname = os.path.join(to_path, new_name)
214 |                     silence = shutil.copy(original_pathname, new_pathname)
215 |                     miss = False
216 |         if miss: #通过miss来提取未找到的文件清单
217 |             miss_count += 1
218 |             #miss_list.append(screen_list[i])
219 |             print('not found %2d: %d\t%s' % (miss_count, i + 1, screen_list[i]))
220 | 
221 |     print('\n %d docs copied to folder [%s]' % (len(screen_list) - miss_count, to_path))
222 | 
223 | 
224 | #生成xls文件
225 | #my_list写入Sheet1
226 | #s_list(排序规则，如有)写入Sheet2
227 | def genXls(my_list, dst_path = 'result.xls', s_list = None):
228 |     wb = xlwt.Workbook()
229 |     ws = wb.add_sheet('Sheet1')
230 |     myFormat = xlwt.XFStyle()
231 |     myFormat.num_format_str = 'yyyy/mm/dd'
232 |     
233 |     title = ['序号', '媒体', '日期', '标题']
234 |     for i in range(len(title)):
235 |         ws.write(0, i, title[i])
236 | 
237 |     rows = len(my_list)
238 |     cols = len(my_list[0])
239 |     for r in range(rows):
240 |         ws.write(r + 1, 0, r + 1)
241 |         for j in range(cols):
242 |             ws.write(r + 1, j + 1, my_list[r][j], myFormat)
243 | 
244 |     if s_list:
245 |         ws1 = wb.add_sheet('Sheet2')
246 |         ws1.write(0, 0, '排序规则')
247 |         for i in range(len(s_list)):
248 |             ws1.write(i + 1, 0, s_list[i])
249 |         
250 |     wb.save(dst_path)
251 |     print('[%s] generated' % dst_path)
252 | 
253 |     
254 | #生成refined_doc文件
255 | def refDocx(src_dir = 'screened', refine_folder = 'refined'):
256 |     global refine_count
257 |     if os.path.exists(refine_folder):
258 |         shutil.rmtree(refine_folder)
259 |     os.mkdir(refine_folder)
260 | 
261 |     #生成refined文件夹(单篇文档，格式修正)    
262 |     temp_zip = 'temp.zip'
263 |     temp_dir = 'temp'
264 |     for doc_name in os.listdir(src_dir):
265 |         old_doc = docx.Document(os.path.join(src_dir, doc_name))
266 |         new_doc = docx.Document()
267 |         setdocStyle(new_doc.styles['Normal']) #doc整体格式设置
268 | 
269 |         #写入refined_doc
270 |         #每张图片作为单独段落
271 |         fig_n = 0 #图片排序
272 |         fig_unloaded = True #记录图片是否已经载入
273 |         for para in old_doc.paragraphs:
274 |             if para.text.strip():
275 |                 new_para = new_doc.add_paragraph()
276 |             for run in para.runs:
277 |                 text = run.text.strip()
278 |                 if text: #copy非空文本到新文档
279 |                     refined_text = refText(text, doc_name) #文本内容修订
280 |                     new_run = new_para.add_run(refined_text)
281 |                 elif run.element.drawing_lst: #插入图片
282 |                     #doc.inline_shapes无法识别非嵌入式图片，故直接从段落文字块判定有无图
283 |                     if fig_unloaded: #如本doc还未载入图片，此处载入
284 |                         silence = shutil.copy(os.path.join(src_dir, doc_name), temp_zip)
285 |                         f = zipfile.ZipFile(temp_zip)
286 |                         for file in f.namelist():
287 |                             if file.startswith('word/media/image'):
288 |                                 silence = f.extract(file, temp_dir)
289 |                         f.close()
290 |                         img_dir = os.path.split(silence)[0]
291 |                         img_list = os.listdir(img_dir)
292 |                         fig_unloaded = False
293 |                     for i in range(len(run.element.drawing_lst)):
294 |                         fig_para = new_doc.add_paragraph() # 图片单独作为一段
295 |                         fig_para.paragraph_format.first_line_indent = 0 #图片段落无缩进
296 |                         fig_para.alignment = 1 #图片段落居中
297 |                         fig_run = fig_para.add_run()
298 |                         #if old_doc.inline_shapes: #如为内置图形（自动换行-嵌入式）
299 |                         #    new_inlineshape = fig_run.add_picture(
300 |                         #        os.path.join(img_dir, img_list[fig_n]),
301 |                         #        old_doc.inline_shapes[fig_n].width,
302 |                         #        old_doc.inline_shapes[fig_n].height)
303 |                         #else: #如为其他图形
304 |                         new_inlineshape = fig_run.add_picture(
305 |                             os.path.join(img_dir, img_list[fig_n]),
306 |                             width = docx.shared.Cm(13)) #默认图形宽度13cm
307 |                         fig_n += 1
308 |         if os.path.exists(temp_zip):
309 |             os.remove(temp_zip)
310 |         if os.path.exists(temp_dir):
311 |             shutil.rmtree(temp_dir)
312 | 
313 |         #refined doc格式调整
314 |         new_doc.paragraphs[0].paragraph_format.first_line_indent = 0 #标题行无缩进
315 |         new_doc.paragraphs[0].alignment = 1 #标题行居中
316 |         new_doc.paragraphs[0].runs[0].font.bold = True #标题行加粗
317 |         new_doc.paragraphs[1].insert_paragraph_before() #标题行后空行
318 |         for i in range(len(new_doc.paragraphs))[::-1]:
319 |             if new_doc.paragraphs[i].text.strip():
320 |                 new_doc.paragraphs[i].alignment = 2 #署名行右对齐(跳过末尾图片段落)
321 |                 break
322 |         new_doc.add_paragraph() #文末空2行
323 |         new_doc.add_paragraph() #文末空2行
324 |         new_doc.save(os.path.join(refine_folder, doc_name))
325 |     print('\nTotally %d refined.\n%d docs generated in [%s].' \
326 |           % (refine_count, len(os.listdir(refine_folder)), refine_folder))
327 | 
328 | 
329 | #生成汇总doc文件
330 | def genDocx(src_dir = 'refined', dst_path = 'result.docx'):
331 |     dst_doc = docx.Document()
332 |     setdocStyle(dst_doc.styles['Normal']) #doc整体格式设置
333 |     temp_zip = 'temp.zip'
334 |     temp_dir = 'temp'
335 |     for doc_name in os.listdir(src_dir):
336 |         refined_doc = docx.Document(os.path.join(src_dir, doc_name))
337 |         #doc如有图片，提取图片
338 |         if refined_doc.inline_shapes:
339 |             fig_n = 0 #图片排序
340 |             silence = shutil.copy(os.path.join(src_dir, doc_name), temp_zip)
341 |             f = zipfile.ZipFile(temp_zip)
342 |             for file in f.namelist():
343 |                 if file.startswith('word/media/image'):
344 |                     silence = f.extract(file, temp_dir)
345 |             f.close()
346 |             img_dir = os.path.split(silence)[0]
347 |             img_list = os.listdir(img_dir)
348 |         #标题段落
349 |         title_para = dst_doc.add_paragraph(refined_doc.paragraphs[0].text)
350 |         title_para.paragraph_format.first_line_indent = 0 #标题行无缩进
351 |         title_para.alignment = 1 #标题行居中
352 |         title_para.runs[0].font.bold = True #标题行加粗
353 |         #其他段落
354 |         for para in refined_doc.paragraphs[1 :]:
355 |             other_para = dst_doc.add_paragraph()
356 |             if para.runs:
357 |                 for run in para.runs:
358 |                     new_run = other_para.add_run(run.text)
359 |                     if run.element.drawing_lst: #如是图片，插入图片段
360 |                         for i in range(len(run.element.drawing_lst)):
361 |                             other_para.paragraph_format.first_line_indent = 0 #图片段落无缩进
362 |                             other_para.alignment = 1 #图片段落居中
363 |                             new_inlineshape = new_run.add_picture(
364 |                                 os.path.join(img_dir, img_list[fig_n]),
365 |                                 refined_doc.inline_shapes[fig_n].width,
366 |                                 refined_doc.inline_shapes[fig_n].height)
367 |                             fig_n += 1
368 |         #署名段
369 |         for i in range(len(dst_doc.paragraphs))[::-1]:
370 |             if dst_doc.paragraphs[i].text.strip():
371 |                 dst_doc.paragraphs[i].alignment = 2 #署名行右对齐(跳过末尾图片、空行)
372 |                 break
373 | 
374 |         if os.path.exists(temp_zip):
375 |             os.remove(temp_zip)
376 |         if os.path.exists(temp_dir):
377 |             shutil.rmtree(temp_dir)
378 | 
379 |     dst_doc.save(dst_path)
380 |     print('\nSummarized doc [%s] generated.' % dst_path)
381 | 
382 |    
383 | # 文本内容完善
384 | # 修正英文引号
385 | # 纠正前引号、回引号使用
386 | # 纠正英文逗号，句号
387 | def refText(text, pathname, output = False):
388 |     global refine_count
389 |     temp_text = text
390 |     # 引号修正
391 |     quote_revised = False
392 |     quote_en = ["'", '"']
393 |     quote_cn_s = ['‘', '“'] #前引号
394 |     quote_cn_e = ['’', '”'] #回引号
395 |     for j in range(len(quote_en)):
396 |         quote_s = True #单引号、双引号分别前引号搜索起点
397 |         for i in range(len(temp_text)):
398 |             if temp_text[i] == quote_en[j]:
399 |                 quote_revised = True
400 |                 refine_count += 1
401 |                 if quote_s:
402 |                     temp_text = temp_text[:i] + quote_cn_s[j] + temp_text[i + 1 :]
403 |                     quote_s = False
404 |                 else:
405 |                     temp_text = temp_text[:i] + quote_cn_e[j] + temp_text[i + 1 :]
406 |                     quote_s = True
407 |     if quote_revised and output:
408 |         print('\n[%s]\nOriginal:\t%s' % (pathname, text))
409 |         print('quote revised:\t%s' % temp_text)
410 | 
411 |     #逗号、句号修正
412 |     comma_period_revised = False
413 |     cp_en = [',', '.']
414 |     cp_cn = ['，', '。']
415 |     for j in range(len(cp_en)):
416 |         for i in range(len(temp_text)):
417 |             idx_s = []
418 |             idx_e = []
419 |             new = []
420 |             if temp_text[i] == cp_en[j]:
421 |                 if i < (len(temp_text) - 1) and (temp_text[i - 1] in digit) \
422 |                    and (temp_text[i + 1] in digit):
423 |                     pass # 剔除小数点, 千分位
424 |                 elif temp_text[i - 1] in digit:
425 |                     pass # 剔除序号项
426 |                 elif temp_text[i - 1] == '.': 
427 |                     pass # 配合下节使用，跳过正确的多点线（除第一个点外）
428 |                 elif i < (len(temp_text) - 1) and temp_text[i + 1] == '.': # 检查多点线
429 |                     end = 0
430 |                     for k in range(i + 1, len(temp_text)):
431 |                         if temp_text[k] != '.':
432 |                             end = k - 1
433 |                             break
434 |                     print('\n%s (%s) %s'
435 |                           % (temp_text[:i], temp_text[i : end + 1], temp_text[end + 1:]))
436 |                     if input('is (%s) right? Y/N ' % temp_text[i : end + 1]).lower() == 'y':
437 |                         pass
438 |                     else:
439 |                         new.insert(0, input('Please input the right content to replace (%s): '
440 |                                     % temp_text[i : end + 1])) #倒序，避免索引错误
441 |                         idx_s.insert(0, i) #倒序，避免索引错误
442 |                         idx_e.insert(0, end + 1) #倒序，避免索引错误
443 |                         refine_count += 1
444 |                 else:
445 |                     refine_count += 1
446 |                     comma_period_revised = True
447 |                     temp_text = temp_text[:i] + cp_cn[j] + temp_text[i + 1 :]
448 |     if idx_s:
449 |         for i in range(len(idx_s)):
450 |             temp_text = temp_text[: idx_s[i]] + new[i] + temp_text[idx_e[i]:]
451 |     if comma_period_revised and output:
452 |         if not quote_revised:
453 |             print('\n[%s]\nOriginal:\t%s' % (pathname, text))
454 |         print('comma / period revised:\t%s' % temp_text)
455 |     
456 |     return temp_text
457 | 
458 | 
459 | #doc总体格式设置
460 | def setdocStyle(doc_style_obj,
461 |              font_name = 'Hans', #字体名称
462 |              font_size = docx.shared.Pt(10.5), #字体大小
463 |              first_line_indent_n = 2, #首行缩进2字
464 |              space_before = 0, #段前间距0
465 |              space_after = 0, #段后间距0
466 |              line_spacing = 1.0): #行距单倍         
467 |     doc_style_obj.font.name = 'Times New Roman' 
468 |     doc_style_obj.font.size = docx.shared.Pt(10.5) 
469 |     doc_style_obj.paragraph_format.first_line_indent = \
470 |     doc_style_obj.font.size * first_line_indent_n
471 |     doc_style_obj.paragraph_format.space_before = space_before
472 |     doc_style_obj.paragraph_format.space_after = space_after
473 |     doc_style_obj.paragraph_format.line_spacing = line_spacing
474 | 
475 | #生日快乐
476 | def love():
477 |     def func(x, y):
478 |         main()
479 |     turtle.title('喵喵子程序')
480 |     lv=turtle.Turtle()
481 |     lv.hideturtle()
482 |     lv.getscreen().bgcolor('light blue')
483 |     lv.color('yellow','red')
484 |     lv.pensize(1)
485 |     lv.speed(1)
486 |     lv.up()
487 |     lv.goto(0,-150)
488 |     #开始画爱心
489 |     lv.down()
490 |     lv.begin_fill()
491 |     lv.goto(0, -150)
492 |     lv.goto(-175.12, -8.59)
493 |     lv.left(140)
494 |     pos = []
495 |     for i in range(19):
496 |         lv.right(10)
497 |         lv.forward(20)
498 |         pos.append((-lv.pos()[0], lv.pos()[1]))
499 |     for item in pos[::-1]:
500 |         lv.goto(item)
501 |     lv.goto(175.12, -8.59)
502 |     lv.goto(0, -150)
503 |     lv.left(50)
504 |     lv.end_fill()
505 |     #写字
506 |     lv.up()
507 |     lv.goto(0, 80)
508 |     lv.down()
509 |     lv.write("喵喵",font=(u"方正舒体",36,"normal"),align="center")
510 |     lv.up()
511 |     lv.goto(0, 0)
512 |     lv.down()
513 |     lv.write("生日快乐！",font=(u"方正舒体",48,"normal"),align="center")
514 |     lv.up()
515 |     lv.goto(100, -210)
516 |     lv.down()
517 |     lv.write("点我见证奇迹",font=(u"华文琥珀",26,"bold"),align="right")
518 |     lv.up()
519 |     lv.goto(160, -190)
520 |     lv.resizemode('user')
521 |     lv.shapesize(4, 4, 10)
522 |     lv.color('red', 'red')
523 |     lv.onclick(func)
524 |     lv.showturtle()
525 | 
526 | 
527 | def main():
528 |     w_list = getDatabase(filename = r'网络媒体汇总.xlsx', date_reverse = True)
529 |     genXls(w_list, dst_path = 'all_queued.xls')
530 |     scrDocx(w_list, year = 2016, from_path = 'original', to_path = 'screened')
531 |     refDocx(src_dir = 'screened', refine_folder = 'refined')
532 |     genDocx(src_dir = 'refined', dst_path = 'result.docx')
533 | 
534 | if __name__ == '__main__':
535 |     if datetime.date.today() == datetime.date(2018, 6, 22):
536 |         love()
537 |     else:
538 |         main()
539 | 


--------------------------------------------------------------------------------