├── .github └── workflows │ └── pylint.yml ├── Invoice2Excel.py ├── README.md ├── example ├── 012001900111_51142414.pdf ├── 012001900211-61876678(1).pdf ├── 03702170021127209635.pdf ├── 91420000722084584J_8cd62e109ec84973b3b39d166e52298c.pdf ├── 91420000753416406R_a34c491011c0420dbe1f15165b06d11a.pdf ├── 91420100581839321E_cf873bbec0a24ae59a4d3ebd9e365572.pdf ├── 91420100MA4KPBL615_b076f3dbe31343379d6f9f37395f74fb.pdf ├── 91420112792429656B_9434b07ac46d4bfa9fce7674fdc3a5b0.pdf ├── 91420200562725281Y_2601734d3cd2442d9e8637df0f6734ea.pdf ├── 91421200052621669E_da5e0e2e7a9e469b94864ffc9cc7dc1c.pdf ├── 91421200553905686B_85532d34579b4875a66af568e24aaa7d.pdf ├── 91430600796857412W_dfeafefde38e4cb1b616bb598487e6e8.pdf ├── 91440106556656312W_401affc75c1e4c9dadaca6ad2ed3afac.pdf ├── 91440114755593481G_e2aff9af2e9b4d03a7309bdcb8169d42.pdf ├── 91440116751950621F_88fbe9ccd4ed471aa8bc292bdba328cd.pdf ├── didi.pdf ├── test-1.pdf ├── test-2.pdf ├── test │ ├── 91420000722084584J_8cd62e109ec84973b3b39d166e52298c.pdf │ ├── 91420000753416406R_a34c491011c0420dbe1f15165b06d11a.pdf │ ├── 91420100581839321E_cf873bbec0a24ae59a4d3ebd9e365572.pdf │ └── 91420112792429656B_9434b07ac46d4bfa9fce7674fdc3a5b0.pdf └── test2 │ ├── 91420100MA4KPBL615_b076f3dbe31343379d6f9f37395f74fb.pdf │ ├── 91420200562725281Y_2601734d3cd2442d9e8637df0f6734ea.pdf │ ├── 91421200052621669E_da5e0e2e7a9e469b94864ffc9cc7dc1c.pdf │ ├── 91421200553905686B_85532d34579b4875a66af568e24aaa7d.pdf │ ├── 91430600796857412W_dfeafefde38e4cb1b616bb598487e6e8.pdf │ ├── 91440106556656312W_401affc75c1e4c9dadaca6ad2ed3afac.pdf │ ├── 91440114755593481G_e2aff9af2e9b4d03a7309bdcb8169d42.pdf │ └── 91440116751950621F_88fbe9ccd4ed471aa8bc292bdba328cd.pdf ├── requirements.txt └── result.xlsx /.github/workflows/pylint.yml: -------------------------------------------------------------------------------- 1 | name: Pylint 2 | 3 | on: [push] 4 | 5 | jobs: 6 | build: 7 | runs-on: ubuntu-latest 8 | strategy: 9 | matrix: 10 | python-version: ["3.8", "3.9", "3.10"] 11 | steps: 12 | - uses: actions/checkout@v3 13 | - name: Set up Python ${{ matrix.python-version }} 14 | uses: actions/setup-python@v3 15 | with: 16 | python-version: ${{ matrix.python-version }} 17 | - name: Install dependencies 18 | run: | 19 | python -m pip install --upgrade pip 20 | pip install pylint 21 | - name: Analysing the code with pylint 22 | run: | 23 | pylint $(git ls-files '*.py') 24 | -------------------------------------------------------------------------------- /Invoice2Excel.py: -------------------------------------------------------------------------------- 1 | # -*- coding:utf-8 -*- 2 | 3 | """ 4 | parse PDF invoice and extract data to Excel 5 | """ 6 | 7 | import pdfplumber as pb 8 | import os 9 | import pandas as pd 10 | import re 11 | import sys 12 | import getopt 13 | 14 | __author__ = 'yooongchun' 15 | __email__ = 'yooongchun@foxmail.com' 16 | 17 | 18 | class Extractor(object): 19 | def __init__(self, path): 20 | self.file = path 21 | 22 | @staticmethod 23 | def load_files(directory): 24 | """load files""" 25 | paths = [] 26 | for file in os.walk(directory): 27 | for f in file[2]: 28 | path = os.path.join(file[0], f) 29 | if os.path.isfile(path) and os.path.splitext(path)[1] == '.pdf': 30 | paths.append(path) 31 | return paths 32 | 33 | def _load_data(self): 34 | if self.file and os.path.splitext(self.file)[1] == '.pdf': 35 | pdf = pb.open(self.file) 36 | page = pdf.pages[0] 37 | words = page.extract_words(x_tolerance=5) 38 | lines = page.lines 39 | # convert coordination 40 | for index, word in enumerate(words): 41 | words[index]['y0'] = word['top'] 42 | words[index]['y1'] = word['bottom'] 43 | for index, line in enumerate(lines): 44 | lines[index]['x1'] = line['x0']+line['width'] 45 | lines[index]['y0'] = line['top'] 46 | lines[index]['y1'] = line['bottom'] 47 | return {'words': words, 'lines': lines} 48 | else: 49 | print("file %s can't be opened." % self.file) 50 | return None 51 | 52 | @staticmethod 53 | def _fill_line(lines): 54 | hlines = [line for line in lines if line['width'] > 0] # 筛选横线 55 | hlines = sorted(hlines, key=lambda h: h['width'], reverse=True)[ 56 | :-2] # 剔除较短的两根 57 | vlines = [line for line in lines if line['height'] > 0] # 筛选竖线 58 | vlines = sorted(vlines, key=lambda v: v['y0']) # 按照坐标排列 59 | # 查找边框顶点 60 | hx0 = hlines[0]['x0'] # 左侧 61 | hx1 = hlines[0]['x1'] # 右侧 62 | vy0 = vlines[0]['y0'] # 顶部 63 | vy1 = vlines[-1]['y1'] # 底部 64 | 65 | thline = {'x0': hx0, 'y0': vy0, 'x1': hx1, 'y1': vy0} # 顶部横线 66 | bhline = {'x0': hx0, 'y0': vy1, 'x1': hx1, 'y1': vy1} # 底部横线 67 | lvline = {'x0': hx0, 'y0': vy0, 'x1': hx0, 'y1': vy1} # 左侧竖线 68 | rvline = {'x0': hx1, 'y0': vy0, 'x1': hx1, 'y1': vy1} # 右侧竖线 69 | 70 | hlines.insert(0, thline) 71 | hlines.append(bhline) 72 | vlines.insert(0, lvline) 73 | vlines.append(rvline) 74 | return {'hlines': hlines, 'vlines': vlines} 75 | 76 | @staticmethod 77 | def _is_point_in_rect(point, rect): 78 | """判断点是否在矩形内""" 79 | px, py = point 80 | p1, p2, p3, p4 = rect 81 | if p1[0] <= px <= p2[0] and p1[1] <= py <= p3[1]: 82 | return True 83 | else: 84 | return False 85 | 86 | @staticmethod 87 | def _find_cross_points(hlines, vlines): 88 | points = [] 89 | delta = 1 90 | for vline in vlines: 91 | vx0 = vline['x0'] 92 | vy0 = vline['y0'] 93 | vy1 = vline['y1'] 94 | for hline in hlines: 95 | hx0 = hline['x0'] 96 | hy0 = hline['y0'] 97 | hx1 = hline['x1'] 98 | if (hx0-delta) <= vx0 <= (hx1+delta) and (vy0-delta) <= hy0 <= (vy1+delta): 99 | points.append((int(vx0), int(hy0))) 100 | return points 101 | 102 | @staticmethod 103 | def _find_rects(cross_points): 104 | # 构造矩阵 105 | X = sorted(set([int(p[0]) for p in cross_points])) 106 | Y = sorted(set([int(p[1]) for p in cross_points])) 107 | df = pd.DataFrame(index=Y, columns=X) 108 | for p in cross_points: 109 | x, y = int(p[0]), int(p[1]) 110 | df.loc[y, x] = 1 111 | df = df.fillna(0) 112 | # 寻找矩形 113 | rects = [] 114 | COLS = len(df.columns)-1 115 | ROWS = len(df.index)-1 116 | for row in range(ROWS): 117 | for col in range(COLS): 118 | p0 = df.iat[row, col] # 主点:必能构造一个矩阵 119 | cnt = col+1 120 | while cnt <= COLS: 121 | p1 = df.iat[row, cnt] 122 | p2 = df.iat[row+1, col] 123 | p3 = df.iat[row+1, cnt] 124 | if p0 and p1 and p2 and p3: 125 | rects.append(((df.columns[col], df.index[row]), (df.columns[cnt], df.index[row]), ( 126 | df.columns[col], df.index[row+1]), (df.columns[cnt], df.index[row+1]))) 127 | break 128 | else: 129 | cnt += 1 130 | return rects 131 | 132 | def _put_words_into_rect(self, words, rects): 133 | # 将words按照坐标层级放入矩阵中 134 | groups = {} 135 | delta = 2 136 | for word in words: 137 | p = (int(word['x0']), int((word['y0']+word['y1'])/2)) 138 | flag = False 139 | for r in rects: 140 | if self._is_point_in_rect(p, r): 141 | flag = True 142 | groups[('IN', r[0][1], r)] = groups.get( 143 | ('IN', r[0][1], r), [])+[word] 144 | break 145 | if not flag: 146 | y_range = [ 147 | p[1]+x for x in range(delta)]+[p[1]-x for x in range(delta)] 148 | out_ys = [k[1] for k in list(groups.keys()) if k[0] == 'OUT'] 149 | flag = False 150 | for y in set(y_range): 151 | if y in out_ys: 152 | v = out_ys[out_ys.index(y)] 153 | groups[('OUT', v)].append(word) 154 | flag = True 155 | break 156 | if not flag: 157 | groups[('OUT', p[1])] = [word] 158 | return groups 159 | 160 | @staticmethod 161 | def _find_text_by_same_line(group, delta=1): 162 | words = {} 163 | group = sorted(group, key=lambda x: x['x0']) 164 | for w in group: 165 | bottom = int(w['bottom']) 166 | text = w['text'] 167 | k1 = [bottom-i for i in range(delta)] 168 | k2 = [bottom+i for i in range(delta)] 169 | k = set(k1+k2) 170 | flag = False 171 | for kk in k: 172 | if kk in words: 173 | words[kk] = words.get(kk, '')+text 174 | flag = True 175 | break 176 | if not flag: 177 | words[bottom] = words.get(bottom, '')+text 178 | return words 179 | 180 | def _split_words_into_diff_line(self, groups): 181 | groups2 = {} 182 | for k, g in groups.items(): 183 | words = self._find_text_by_same_line(g, 3) 184 | groups2[k] = words 185 | return groups2 186 | 187 | @staticmethod 188 | def _index_of_y(x, rects): 189 | for index, r in enumerate(rects): 190 | if x == r[2][0][0]: 191 | return index+1 if index+1 < len(rects) else None 192 | return None 193 | 194 | @staticmethod 195 | def _find_outer(words): 196 | df = pd.DataFrame() 197 | for pos, text in words.items(): 198 | if re.search(r'发票$', text): # 发票名称 199 | df.loc[0, '发票名称'] = text 200 | elif re.search(r'发票代码', text): # 发票代码 201 | num = ''.join(re.findall(r'[0-9]+', text)) 202 | df.loc[0, '发票代码'] = num 203 | elif re.search(r'发票号码', text): # 发票号码 204 | num = ''.join(re.findall(r'[0-9]+', text)) 205 | df.loc[0, '发票号码'] = num 206 | elif re.search(r'开票日期', text): # 开票日期 207 | date = ''.join(re.findall( 208 | r'[0-9]{4}年[0-9]{1,2}月[0-9]{1,2}日', text)) 209 | df.loc[0, '开票日期'] = date 210 | elif '机器编号' in text and '校验码' in text: # 校验码 211 | text1 = re.search(r'校验码:\d+', text)[0] 212 | num = ''.join(re.findall(r'[0-9]+', text1)) 213 | df.loc[0, '校验码'] = num 214 | text2 = re.search(r'机器编号:\d+', text)[0] 215 | num = ''.join(re.findall(r'[0-9]+', text2)) 216 | df.loc[0, '机器编号'] = num 217 | elif '机器编号' in text: 218 | num = ''.join(re.findall(r'[0-9]+', text)) 219 | df.loc[0, '机器编号'] = num 220 | elif '校验码' in text: 221 | num = ''.join(re.findall(r'[0-9]+', text)) 222 | df.loc[0, '校验码'] = num 223 | elif re.search(r'收款人', text): 224 | items = re.split(r'收款人:|复核:|开票人:|销售方:', text) 225 | items = [item for item in items if re.sub( 226 | r'\s+', '', item) != ''] 227 | df.loc[0, '收款人'] = items[0] if items and len(items) > 0 else '' 228 | df.loc[0, '复核'] = items[1] if items and len(items) > 1 else '' 229 | df.loc[0, '开票人'] = items[2] if items and len(items) > 2 else '' 230 | df.loc[0, '销售方'] = items[3] if items and len(items) > 3 else '' 231 | return df 232 | 233 | @staticmethod 234 | def _find_and_sort_rect_in_same_line(y, groups): 235 | same_rects_k = [k for k, v in groups.items() if k[1] == y] 236 | return sorted(same_rects_k, key=lambda x: x[2][0][0]) 237 | 238 | def _find_inner(self, k, words, groups, groups2, free_zone_flag=False): 239 | df = pd.DataFrame() 240 | sort_words = sorted(words.items(), key=lambda x: x[0]) 241 | text = [word for k, word in sort_words] 242 | context = ''.join(text) 243 | if '购买方' in context or '销售方' in context: 244 | y = k[1] 245 | x = k[2][0][0] 246 | same_rects_k = self._find_and_sort_rect_in_same_line(y, groups) 247 | target_index = self._index_of_y(x, same_rects_k) 248 | target_k = same_rects_k[target_index] 249 | group_context = groups2[target_k] 250 | prefix = '购买方' if '购买方' in context else '销售方' 251 | for pos, text in group_context.items(): 252 | if '名称' in text: 253 | name = re.sub(r'名称:', '', text) 254 | df.loc[0, prefix+'名称'] = name 255 | elif '纳税人识别号' in text: 256 | tax_man_id = re.sub(r'纳税人识别号:', '', text) 257 | df.loc[0, prefix+'纳税人识别号'] = tax_man_id 258 | elif '地址、电话' in text: 259 | addr = re.sub(r'地址、电话:', '', text) 260 | df.loc[0, prefix+'地址电话'] = addr 261 | elif '开户行及账号' in text: 262 | account = re.sub(r'开户行及账号:', '', text) 263 | df.loc[0, prefix+'开户行及账号'] = account 264 | elif '密码区' in context: 265 | y = k[1] 266 | x = k[2][0][0] 267 | same_rects_k = self._find_and_sort_rect_in_same_line(y, groups) 268 | target_index = self._index_of_y(x, same_rects_k) 269 | target_k = same_rects_k[target_index] 270 | words = groups2[target_k] 271 | context = [v for k, v in words.items()] 272 | context = ''.join(context) 273 | df.loc[0, '密码区'] = context 274 | elif '价税合计' in context: 275 | y = k[1] 276 | x = k[2][0][0] 277 | same_rects_k = self._find_and_sort_rect_in_same_line(y, groups) 278 | target_index = self._index_of_y(x, same_rects_k) 279 | target_k = same_rects_k[target_index] 280 | group_words = groups2[target_k] 281 | group_context = ''.join([w for k, w in group_words.items()]) 282 | items = re.split(r'[((]小写[))]', group_context) 283 | b = items[0] if items and len(items) > 0 else '' 284 | s = items[1] if items and len(items) > 1 else '' 285 | df.loc[0, '价税合计(大写)'] = b 286 | df.loc[0, '价税合计(小写)'] = s 287 | elif '备注' in context: 288 | y = k[1] 289 | x = k[2][0][0] 290 | same_rects_k = self._find_and_sort_rect_in_same_line(y, groups) 291 | target_index = self._index_of_y(x, same_rects_k) 292 | if target_index: 293 | target_k = same_rects_k[target_index] 294 | group_words = groups2[target_k] 295 | group_context = ''.join([w for k, w in group_words.items()]) 296 | df.loc[0, '备注'] = group_context 297 | else: 298 | df.loc[0, '备注'] = '' 299 | else: 300 | if free_zone_flag: 301 | return df, free_zone_flag 302 | y = k[1] 303 | x = k[2][0][0] 304 | same_rects_k = self._find_and_sort_rect_in_same_line(y, groups) 305 | if len(same_rects_k) == 8: 306 | free_zone_flag = True 307 | for kk in same_rects_k: 308 | words = groups2[kk] 309 | words = sorted(words.items(), key=lambda x: x[0]) if words and len( 310 | words) > 0 else None 311 | key = words[0][1] if words and len(words) > 0 else None 312 | val = [word[1] for word in words[1:] 313 | ] if key and words and len(words) > 1 else '' 314 | val = '\n'.join(val) if val else '' 315 | if key: 316 | df.loc[0, key] = val 317 | return df, free_zone_flag 318 | 319 | def extract(self): 320 | data = self._load_data() 321 | words = data['words'] 322 | lines = data['lines'] 323 | 324 | lines = self._fill_line(lines) 325 | hlines = lines['hlines'] 326 | vlines = lines['vlines'] 327 | 328 | cross_points = self._find_cross_points(hlines, vlines) 329 | rects = self._find_rects(cross_points) 330 | 331 | word_groups = self._put_words_into_rect(words, rects) 332 | word_groups2 = self._split_words_into_diff_line(word_groups) 333 | 334 | df = pd.DataFrame() 335 | free_zone_flag = False 336 | for k, words in word_groups2.items(): 337 | if k[0] == 'OUT': 338 | df_item = self._find_outer(words) 339 | else: 340 | df_item, free_zone_flag = self._find_inner( 341 | k, words, word_groups, word_groups2, free_zone_flag) 342 | df = pd.concat([df, df_item], axis=1) 343 | return df 344 | 345 | 346 | if __name__ == '__main__': 347 | IN_PATH = 'example' 348 | OUT_PATH = 'result.xlsx' 349 | # parse params 350 | opts, args = getopt.getopt(sys.argv[1:], 'p:ts:', ['test', 'path=', 'save=']) 351 | for opt, arg in opts: 352 | if opt in ['-p', '--path']: 353 | IN_PATH = arg 354 | elif opt in ['--test', '-t']: 355 | IN_PATH = 'example' 356 | elif opt in ['--save', '-s']: 357 | OUT_PATH = arg 358 | # run programme 359 | print(f'run {"test" if IN_PATH == "example" else "extracting"} mode, load data from directory {IN_PATH}.\n{"*"*50}') 360 | files_path = Extractor('').load_files(IN_PATH) 361 | num = len(files_path) 362 | print(f'total {num} file(s) to parse.\n{"*"*50}') 363 | data = pd.DataFrame() 364 | for index, file_path in enumerate(files_path): 365 | print(f'{index+1}/{num}({round((index+1)/num*100, 2)}%)\t{file_path}') 366 | extractor = Extractor(file_path) 367 | try: 368 | d = extractor.extract() 369 | data = pd.concat([data, d], axis=0, sort=False, ignore_index=True) 370 | except Exception as e: 371 | print('file error:', file_path, '\n', e) 372 | print(f'{"*"*50}\nfinish parsing, save data to {OUT_PATH}') 373 | data.to_excel('result.xlsx', sheet_name='data') 374 | print(f'{"*" * 50}\nALL DONE. THANK YOU FOR USING MY PROGRAMME. GOODBYE!\n{"*"*50}') 375 | 376 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Invoice2Excel 2 | 3 | 摘要:这篇文章介绍如何把发票内容提取出来保存到Excel中。 4 | 5 | ------ 6 | 7 | ### 程序功能 8 | 9 | 程序会把发票中的内容提取出来然后写入Excel中,一个示例的发票内容如下: 10 | 11 | ![发票示例](https://yooongchun-blog-v2.oss-cn-beijing.aliyuncs.com/202004/demo.PNG) 12 | 13 | 提取结果如下: 14 | 15 | ![提取结果](https://yooongchun-blog-v2.oss-cn-beijing.aliyuncs.com/202004/result.png) 16 | 17 | ### 程序使用 18 | 在线使用:[App|永春小站](https://www.yooongchun.com/app) 19 | 20 | ### 获取帮助 21 | 22 | 联系作者获取帮助: 23 | 24 | - 微信:yongchunzha 25 | - Email:yooongchun@foxmail.com 26 | - 博客:[永春小站](https://www.yooongchun.com) 27 | -------------------------------------------------------------------------------- /example/012001900111_51142414.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/012001900111_51142414.pdf -------------------------------------------------------------------------------- /example/012001900211-61876678(1).pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/012001900211-61876678(1).pdf -------------------------------------------------------------------------------- /example/03702170021127209635.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/03702170021127209635.pdf -------------------------------------------------------------------------------- /example/91420000722084584J_8cd62e109ec84973b3b39d166e52298c.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/91420000722084584J_8cd62e109ec84973b3b39d166e52298c.pdf -------------------------------------------------------------------------------- /example/91420000753416406R_a34c491011c0420dbe1f15165b06d11a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/91420000753416406R_a34c491011c0420dbe1f15165b06d11a.pdf -------------------------------------------------------------------------------- /example/91420100581839321E_cf873bbec0a24ae59a4d3ebd9e365572.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/91420100581839321E_cf873bbec0a24ae59a4d3ebd9e365572.pdf -------------------------------------------------------------------------------- /example/91420100MA4KPBL615_b076f3dbe31343379d6f9f37395f74fb.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/91420100MA4KPBL615_b076f3dbe31343379d6f9f37395f74fb.pdf -------------------------------------------------------------------------------- /example/91420112792429656B_9434b07ac46d4bfa9fce7674fdc3a5b0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/91420112792429656B_9434b07ac46d4bfa9fce7674fdc3a5b0.pdf -------------------------------------------------------------------------------- /example/91420200562725281Y_2601734d3cd2442d9e8637df0f6734ea.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/91420200562725281Y_2601734d3cd2442d9e8637df0f6734ea.pdf -------------------------------------------------------------------------------- /example/91421200052621669E_da5e0e2e7a9e469b94864ffc9cc7dc1c.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/91421200052621669E_da5e0e2e7a9e469b94864ffc9cc7dc1c.pdf -------------------------------------------------------------------------------- /example/91421200553905686B_85532d34579b4875a66af568e24aaa7d.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/91421200553905686B_85532d34579b4875a66af568e24aaa7d.pdf -------------------------------------------------------------------------------- /example/91430600796857412W_dfeafefde38e4cb1b616bb598487e6e8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/91430600796857412W_dfeafefde38e4cb1b616bb598487e6e8.pdf -------------------------------------------------------------------------------- /example/91440106556656312W_401affc75c1e4c9dadaca6ad2ed3afac.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/91440106556656312W_401affc75c1e4c9dadaca6ad2ed3afac.pdf -------------------------------------------------------------------------------- /example/91440114755593481G_e2aff9af2e9b4d03a7309bdcb8169d42.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/91440114755593481G_e2aff9af2e9b4d03a7309bdcb8169d42.pdf -------------------------------------------------------------------------------- /example/91440116751950621F_88fbe9ccd4ed471aa8bc292bdba328cd.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/91440116751950621F_88fbe9ccd4ed471aa8bc292bdba328cd.pdf -------------------------------------------------------------------------------- /example/didi.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/didi.pdf -------------------------------------------------------------------------------- /example/test-1.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/test-1.pdf -------------------------------------------------------------------------------- /example/test-2.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/test-2.pdf -------------------------------------------------------------------------------- /example/test/91420000722084584J_8cd62e109ec84973b3b39d166e52298c.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/test/91420000722084584J_8cd62e109ec84973b3b39d166e52298c.pdf -------------------------------------------------------------------------------- /example/test/91420000753416406R_a34c491011c0420dbe1f15165b06d11a.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/test/91420000753416406R_a34c491011c0420dbe1f15165b06d11a.pdf -------------------------------------------------------------------------------- /example/test/91420100581839321E_cf873bbec0a24ae59a4d3ebd9e365572.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/test/91420100581839321E_cf873bbec0a24ae59a4d3ebd9e365572.pdf -------------------------------------------------------------------------------- /example/test/91420112792429656B_9434b07ac46d4bfa9fce7674fdc3a5b0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/test/91420112792429656B_9434b07ac46d4bfa9fce7674fdc3a5b0.pdf -------------------------------------------------------------------------------- /example/test2/91420100MA4KPBL615_b076f3dbe31343379d6f9f37395f74fb.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/test2/91420100MA4KPBL615_b076f3dbe31343379d6f9f37395f74fb.pdf -------------------------------------------------------------------------------- /example/test2/91420200562725281Y_2601734d3cd2442d9e8637df0f6734ea.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/test2/91420200562725281Y_2601734d3cd2442d9e8637df0f6734ea.pdf -------------------------------------------------------------------------------- /example/test2/91421200052621669E_da5e0e2e7a9e469b94864ffc9cc7dc1c.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/test2/91421200052621669E_da5e0e2e7a9e469b94864ffc9cc7dc1c.pdf -------------------------------------------------------------------------------- /example/test2/91421200553905686B_85532d34579b4875a66af568e24aaa7d.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/test2/91421200553905686B_85532d34579b4875a66af568e24aaa7d.pdf -------------------------------------------------------------------------------- /example/test2/91430600796857412W_dfeafefde38e4cb1b616bb598487e6e8.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/test2/91430600796857412W_dfeafefde38e4cb1b616bb598487e6e8.pdf -------------------------------------------------------------------------------- /example/test2/91440106556656312W_401affc75c1e4c9dadaca6ad2ed3afac.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/test2/91440106556656312W_401affc75c1e4c9dadaca6ad2ed3afac.pdf -------------------------------------------------------------------------------- /example/test2/91440114755593481G_e2aff9af2e9b4d03a7309bdcb8169d42.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/test2/91440114755593481G_e2aff9af2e9b4d03a7309bdcb8169d42.pdf -------------------------------------------------------------------------------- /example/test2/91440116751950621F_88fbe9ccd4ed471aa8bc292bdba328cd.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/example/test2/91440116751950621F_88fbe9ccd4ed471aa8bc292bdba328cd.pdf -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | pandas==1.0.3 2 | pdfplumber==0.5.18 -------------------------------------------------------------------------------- /result.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/yooongchun/Invoice2Excel/d9dac5916fe6c52580240a7208527ebf7181e077/result.xlsx --------------------------------------------------------------------------------