├── LICENSE ├── README.md ├── parse_payslips.py └── requirements.txt /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2021 Shine Jayakumar 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Extract Data From PDF In Python 2 | 3 | ![MIT License](https://img.shields.io/github/license/shine-jayakumar/Covid19-Exploratory-Analysis-With-SQL) 4 | 5 | In this project, we are going to batch-convert pdf files to text and extract data without using PyPDF2/4. 
6 | 7 | We're going to achieve that by: 8 | - Using the pdftotext converter from Xpdf to convert pdf files to text files 9 | - Using regular expressions to extract data 10 | - Performing data cleaning using pandas 11 | - Exporting to an Excel file 12 | 13 | ## Why Not Use PyPDF2/4 14 | **Short Answer:** I got this error: 15 | ``` 16 | TypeError: object of type 'IndirectObject' has no len() 17 | ``` 18 | 19 | **Long Answer:** If PyPDF4 had worked, I would never have had a chance to explore other ways. 20 | I looked on [StackOverflow](https://stackoverflow.com/users/6711954/shine-j) but couldn't find a solution for this error. 21 | Obviously, there had to be someone with the [same problem](https://stackoverflow.com/questions/66587056/typeerror-object-of-type-indirectobject-has-no-len), but no solution has been posted. 22 | 23 | I was not willing to manually copy and paste the information from 52 of my payslips. 24 | Isn't that what programs are used for? 25 | 26 | 27 | **Table of Contents** 28 | 29 | - [Packages](#Packages "Packages") 30 | - [Converting PDF To Text](#Converting-PDF-To-Text "Converting PDF To Text") 31 | - [Script Link](#Script-Link "Script Link") 32 | 33 | 34 | ## Packages 35 | 36 | - Pandas 37 | 38 | Check out the [requirements.txt](https://github.com/shine-jayakumar/Extract-Data-From-PDF-In-Python/blob/main/requirements.txt "requirements.txt") 39 | 40 | ## Converting PDF To Text 41 | Converting PDF to text using [Xpdf's pdftotext](http://www.xpdfreader.com/download.html "Xpdf's pdftotext") is really simple. 42 | 43 | Using this command-line tool we can batch-convert PDFs to text files. 
r"""Batch-convert payslip PDFs to text and export the extracted figures to Excel.

Pipeline:

1. Every ``Payslip_*.pdf`` below the current directory is converted to a text
   file in ``converted_pdfs`` with Xpdf's ``pdftotext`` command-line tool
   (``pdftotext source.pdf dest.txt``), expected at ``bin64\pdftotext.exe``.
2. The payslip fields are pulled out of each text file with regular
   expressions (see :class:`Payslip`).
3. The numeric columns are cleaned with pandas and the table is written to
   ``payslips.xlsx``.

Usage::

    python parse_payslips.py [-v | --verbose]
"""
import os
import subprocess
import sys
import re
import pandas as pd
from datetime import datetime
import logging

# Module-level logger; handlers are attached by configure_logging() so that
# merely importing this module does not create a log file.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)

CONVERTED_DIR = "converted_pdfs"
PDFTOTEXT_EXE = os.path.join("bin64", "pdftotext.exe")
PDF_LIST_FILE = "allpdf.txt"
TEXT_LIST_FILE = "alltexts.txt"
OUTPUT_FILE = "payslips.xlsx"

# Column order of the exported sheet (before 'year'/'months' are prepended).
COLUMNS = (
    "pay_period",
    "pay_date",
    "basic_salary",
    "gross_salary",
    "net_salary",
    "gross_salary_ytd",
    "pf_amount",
    "pf_ytd",
    "income_tax",
    "income_tax_ytd",
    "epf_no",
    "uan_no",
)

# Columns holding amounts such as "2,345.00" that are converted to float.
NUMERIC_COLUMNS = (
    "basic_salary",
    "net_salary",
    "gross_salary",
    "gross_salary_ytd",
    "pf_amount",
    "pf_ytd",
    "income_tax",
    "income_tax_ytd",
)

MONTH_NAMES = ('Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec')

# 'Payslip_<anything>.pdf' at the end of a bare file name; only the extension
# is matched case-insensitively.
_PAYSLIP_PDF_RE = re.compile(r'(Payslip_.+)\.(?i:pdf)$')

# dd.mm.yyyy with literal dots; groups are (month, year).
_PAY_DATE_RE = re.compile(r'\d+\.(\d+)\.(\d{4})')


def configure_logging(verbose=False):
    '''
    configure_logging(verbose=False)

    Attaches a timestamped log-file handler to the module logger and, when
    verbose is true, a second handler that mirrors the log to stdout.
    '''
    formatter = logging.Formatter("%(asctime)s:%(name)s:%(lineno)d:%(levelname)s:%(message)s")
    file_handler = logging.FileHandler(
        f'parse_payslip_info_log_{datetime.now().strftime("%d_%m_%Y__%H_%M_%S")}.log')
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    if verbose:
        stdout_handler = logging.StreamHandler(sys.stdout)
        stdout_handler.setFormatter(formatter)
        logger.addHandler(stdout_handler)


def _write_lines(path, lines):
    # Writes one entry per line; undecodable characters are replaced so that
    # writing this listing artifact can never abort the run.
    with open(path, 'w', errors="replace") as fh:
        fh.writelines(f"{line}\n" for line in lines)


def convert_pdf_to_text():
    '''
    convert_pdf_to_text()

    Converts every payslip pdf found in the current directory and its
    sub-directories to a text file in the converted_pdfs directory.

    Side effects:
      - creates converted_pdfs (an existing directory is reused)
      - writes allpdf.txt   (full path of every pdf found)
      - writes alltexts.txt (bare names of the text files in converted_pdfs)

    Pdf files whose name does not look like Payslip_*.pdf are skipped.

    Returns the number of files pdftotext failed to convert.
    Exits with status 1 if the output directory cannot be created or the
    pdftotext executable cannot be started.
    '''
    logger.info("Creating converted_pdfs directory")
    try:
        # exist_ok: a second run must not fail just because the directory
        # is left over from the first one.
        os.makedirs(CONVERTED_DIR, exist_ok=True)
    except OSError:
        logger.error("Failed to create converted_pdfs directory")
        sys.exit(1)

    logger.info("Gathering list of full path to pdf files in the current directory/sub-directory")
    list_fnames = sorted(
        os.path.join(root, name)
        for root, _dirs, files in os.walk(os.getcwd())
        for name in files
        if name.lower().endswith(".pdf")
    )

    logger.info("Saving pdf file names to a list")
    try:
        _write_lines(PDF_LIST_FILE, list_fnames)
    except OSError:
        # The listing file is only an artifact; conversion can still proceed.
        logger.error("Unable to open file: allpdf.txt")

    err_count = 0

    logger.info("Generating text files from pdf")
    for fname in list_fnames:
        stem = get_fname_without_ext(fname)
        if not stem:
            logger.warning("Skipping pdf that is not a payslip: %s", fname)
            continue

        target_text_path = os.path.join(os.curdir, CONVERTED_DIR, f"{stem}.txt")
        try:
            ret = subprocess.run([PDFTOTEXT_EXE, fname, target_text_path], capture_output=True)
        except OSError as exc:
            # The converter itself is missing/not executable: nothing can be
            # converted, so stop instead of failing once per file.
            logger.error("Unable to run %s: %s", PDFTOTEXT_EXE, exc)
            sys.exit(1)

        if ret.returncode != 0:
            logger.error(f"Error converting: {target_text_path}")
            err_count += 1

    logger.info("Gathering list of text file names")
    text_names = sorted(
        name for name in os.listdir(CONVERTED_DIR) if name.lower().endswith(".txt")
    )
    _write_lines(TEXT_LIST_FILE, text_names)

    return err_count


def get_fname_without_ext(fname):
    r'''
    get_fname_without_ext(fname)

    Returns the payslip file name (without directory and extension) from a
    file path, or "" when the file is not named Payslip_*.pdf

    Ex: Return 'Payslip_Jan' from d:\pdffiles\Payslip_Jan.pdf
        Return ''            from d:\pdffiles\ebook.pdf

    '''
    # Split on both separators so Windows paths are handled on any platform
    # and a directory called 'Payslip_...' cannot leak into the result.
    basename = re.split(r'[\\/]', fname.strip())[-1]
    match = _PAYSLIP_PDF_RE.search(basename)
    return match.group(1) if match else ""


def get_list_of_converted_files():
    '''
    get_list_of_converted_files()

    Returns list of paths (relative to the current directory) of the
    converted text files named in alltexts.txt. Blank lines are ignored.

    Exits with status 1 if alltexts.txt does not exist.
    '''
    logger.info("Reading alltexts.txt, appending converted_pdfs directory name")
    try:
        with open(TEXT_LIST_FILE, 'r', errors="replace") as fh:
            names = [line.strip() for line in fh]
    except FileNotFoundError:
        logger.error("alltexts.txt not found")
        sys.exit(1)

    return [os.path.join(os.curdir, CONVERTED_DIR, name) for name in names if name]


def format_number_str(s):
    '''
    format_number_str(s)

    Converts number string to float; thousands separators and spaces are
    removed first. Empty/blank input yields 0.

    Ex: 2,345.00 to 2345.00

    Raises ValueError if the remaining text is not a valid number.
    '''
    if not s:
        return 0.0
    cleaned = s.replace(",", "").replace(" ", "")
    if not cleaned:
        return 0.0
    return float(cleaned)


def month_no_to_name(mnum):
    '''
    month_no_to_name(mnum)

    Returns 3 letter month name from month number

    Ex: 1 -> Jan, 2 -> Feb, 12 -> Dec

    Raises ValueError if mnum is not in 1..12 (0 must not wrap around to Dec).
    '''
    if not 1 <= mnum <= 12:
        raise ValueError(f"month number out of range: {mnum}")
    return MONTH_NAMES[mnum - 1]


def split_pay_date(pay_date):
    '''
    split_pay_date(pay_date)

    Returns (year, month name) for a dd.mm.yyyy pay date.

    Ex: 31.01.2021 -> ('2021', 'Jan')

    A missing/unparseable date yields ('', ''); an impossible month number
    yields (year, '').
    '''
    match = _PAY_DATE_RE.search(pay_date or "")
    if not match:
        return "", ""
    month_no, year = int(match.group(1)), match.group(2)
    if not 1 <= month_no <= 12:
        return year, ""
    return year, month_no_to_name(month_no)


class Payslip:
    '''
    Payslip class

    Contains methods to extract payslip details from a converted text file.

    Every field is kept as the raw string found in the text ("" when the
    field is absent); numeric conversion happens later in build_dataframe().
    '''

    # One regular expression per field; group 1 is the captured value.
    _PATTERNS = {
        "pay_period": re.compile(r'Pay\sPeriod\s:\s?([\d.]+[\s\-]+[\d.]+)'),
        "pay_date": re.compile(r'Pay\sDate\n\n:\s?([\d.]+)'),
        "epf_no": re.compile(r'Emp\sPF\sNumber:\s?([\w\/]+)'),
        "uan_no": re.compile(r'UAN[\n]+:\s?(\d+)'),
        "basic_salary": re.compile(r'Basic\sSalary\n+([\d,.]+)'),
        "gross_salary": re.compile(r'Total\sGross\n+([\d,.]+)'),
        "net_salary": re.compile(r'NET\sPAY\n+([\d,.]+)'),
        "gross_salary_ytd": re.compile(r'YTD\sGROSS\n+([\d,.]+)'),
        "pf_amount": re.compile(r'Provident\sFund\n+([\d.,]+)'),
        "pf_ytd": re.compile(r'YTD\sEmployee\sPF\n+([\d.,]+)'),
        "income_tax": re.compile(r'Income\sTax\n+([\d,.]+)'),
        "income_tax_ytd": re.compile(r'YTD\sTAX\n+([\d,.]+)'),
    }

    def __init__(self):
        self.pay_period = ""
        self.pay_date = ""
        self.epf_no = ""
        self.uan_no = ""
        self.basic_salary = ""
        self.gross_salary = ""
        self.net_salary = ""
        self.gross_salary_ytd = ""
        self.pf_amount = ""
        self.pf_ytd = ""
        self.income_tax = ""
        self.income_tax_ytd = ""
        self.raw_payslip_text = ""

    def _reset(self):
        # Clear every field so a failed read never leaves the values of a
        # previously parsed payslip behind.
        self.raw_payslip_text = ""
        for field in self._PATTERNS:
            setattr(self, field, "")

    def _extract(self, field):
        # Returns group 1 of the field's pattern, or "" when it is not found.
        match = self._PATTERNS[field].search(self.raw_payslip_text)
        return match.group(1) if match else ""

    def read_text(self, fname):
        '''
        Reads the converted text file fname and fills in every field.

        Returns True on success. If the file cannot be read, the error is
        logged, all fields are left empty and False is returned.
        '''
        self._reset()
        try:
            with open(fname, 'r', errors="replace") as fh:
                self.raw_payslip_text = fh.read()
        except FileNotFoundError:
            logger.error(f"File not found: {fname}")
            return False
        except OSError as exc:
            logger.error("Unable to read %s: %s", fname, exc)
            return False

        for field in self._PATTERNS:
            setattr(self, field, self._extract(field))
        return True

    def get_pay_period(self):
        return self._extract("pay_period")

    def get_pay_date(self):
        return self._extract("pay_date")

    def get_epf_number(self):
        return self._extract("epf_no")

    def get_uan_number(self):
        return self._extract("uan_no")

    def get_basic_salary(self):
        return self._extract("basic_salary")

    def get_gross_sal(self):
        return self._extract("gross_salary")

    def get_net_sal(self):
        return self._extract("net_salary")

    def get_gross_sal_ytd(self):
        return self._extract("gross_salary_ytd")

    def get_pf(self):
        return self._extract("pf_amount")

    def get_pf_ytd(self):
        return self._extract("pf_ytd")

    def get_income_tax(self):
        return self._extract("income_tax")

    def get_income_tax_ytd(self):
        return self._extract("income_tax_ytd")


def build_dataframe(payslips):
    '''
    build_dataframe(payslips)

    Builds the export table from an iterable of Payslip objects:
    one row per payslip, amounts converted to float, and 'year' / 'months'
    columns (derived from pay_date) inserted at the start.
    '''
    payslips = list(payslips)

    logger.info("Creating dataframe from dictionary")
    payslip_details = {
        column: [getattr(pay, column) for pay in payslips] for column in COLUMNS
    }
    pay_df = pd.DataFrame.from_dict(payslip_details)

    logger.info("Formatting columns containing numeric data")
    for column in NUMERIC_COLUMNS:
        pay_df[column] = pay_df[column].apply(format_number_str)

    logger.info("Creating Series to hold year and month")
    year_month = [split_pay_date(pay_date) for pay_date in pay_df['pay_date']]

    logger.info("Appending year and month column to the start")
    pay_df.insert(0, 'year', [year for year, _ in year_month])
    pay_df.insert(1, 'months', [month for _, month in year_month])

    return pay_df


def main(argv=None):
    '''
    main(argv=None)

    Runs the whole pipeline: convert pdfs, parse the text files, export to
    payslips.xlsx. argv defaults to the command-line arguments.
    '''
    argv = sys.argv[1:] if argv is None else argv
    configure_logging(verbose="--verbose" in argv or "-v" in argv)

    logger.info("Process started")
    logger.info("Converting pdf to text")
    err_count = convert_pdf_to_text()
    if err_count:
        logger.warning("%d pdf file(s) could not be converted", err_count)

    logger.info("Saving payslip details from each text file to dictionary")
    payslips = []
    for fname in get_list_of_converted_files():
        # A fresh object per file: unreadable files are skipped instead of
        # repeating the previous payslip's values.
        pay = Payslip()
        if pay.read_text(fname):
            payslips.append(pay)

    pay_df = build_dataframe(payslips)

    logger.info("Exporting to Excel")
    pay_df.to_excel(OUTPUT_FILE, index=False)

    logger.info("Process completed successfully")


if __name__ == "__main__":
    main()