├── .gitignore ├── Cargo.toml ├── LICENCE ├── README.md └── src └── main.rs /.gitignore: -------------------------------------------------------------------------------- 1 | target/ 2 | **/*.rs.bk 3 | Cargo.lock 4 | **/*.orig 5 | **/perf.data 6 | **/perf.data.old 7 | 8 | !.travis.yml 9 | 10 | /fonts 11 | output.csv 12 | .idea/ -------------------------------------------------------------------------------- /Cargo.toml: -------------------------------------------------------------------------------- 1 | [package] 2 | name = "hdfc-cc-parser-rs" 3 | version = "0.0.6" 4 | authors = ["Joe Paul "] 5 | repository = "https://github.com/joeirimpan/hdfc-cc-parser-rs" 6 | keywords = ["HDFC", "credit card", "bill"] 7 | license = "MIT" 8 | edition = "2021" 9 | 10 | # See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html 11 | 12 | [dependencies] 13 | anyhow = "1.0.68" 14 | chrono = "0.4.23" 15 | csv = "1.1.6" 16 | pdf = { git = "https://github.com/pdf-rs/pdf", features = [ "euclid" ], rev = "5cf56b7" } 17 | regex = "1.7.1" 18 | pdf_encoding = "0.4.0" 19 | euclid = "0.22.6" 20 | log = "*" 21 | clap = "4.1.8" 22 | 23 | [profile.release] 24 | strip = true 25 | opt-level = "z" 26 | lto = true 27 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2023 Joe Paul 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be 
included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # HDFC CC bill parser 2 | 3 | This tool parses HDFC Bank credit card statements (PDF) and extracts their information into .csv format. The extracted information can be used for personal finance management or analytics purposes. 4 | 5 | ## Features 6 | 7 | * Extracts transaction details such as date, description, points, amount 8 | * Multiple PDFs can be parsed and collated into 1 CSV. 9 | 10 | ## Requirements 11 | 12 | * Rust 13 | * HDFC credit card statements 14 | 15 | ## Usage 16 | * Clone this repository: `git clone https://github.com/joeirimpan/hdfc-cc-parser-rs.git` 17 | * Navigate to the repository directory: `cd hdfc-cc-parser-rs` 18 | * Build the project: `cargo build --release` 19 | * Run the binary: `./target/release/hdfc-cc-parser-rs --name="NAME_AS_PER_STATEMENT" --dir --file --password --sortformat="optional format eg., %d-%m-%Y"` 20 | 21 | ## Why? 22 | 23 | A similar Python implementation which uses tabula-py took 70s+ to generate a csv with 8 pdfs. With this implementation, it took only 0.02s to generate the same. 
24 | 25 | ## Analytics 26 | 27 | Assuming `clickhouse-local` is installed 28 | 29 | * Get the points accumulated 30 | ```bash 31 | cat output.csv | clickhouse-local --structure "tx_date Datetime, tx String, points Int32, amount Float32" --query "SELECT SUM(points) FROM table" --input-format CSV 32 | ``` 33 | 34 | * Get the debits 35 | ```bash 36 | cat output.csv | clickhouse-local --structure "tx_date Datetime, tx String, points Int32, amount Float32" --query "SELECT SUM(amount) FROM table WHERE amount < 0" --input-format CSV 37 | ``` -------------------------------------------------------------------------------- /src/main.rs: -------------------------------------------------------------------------------- 1 | use anyhow::{Context, Error}; 2 | use chrono::{NaiveDate, NaiveDateTime, NaiveTime}; 3 | use clap::{arg, Command}; 4 | use pdf::content::*; 5 | use pdf::file::File as pdfFile; 6 | use regex::Regex; 7 | use std::io; 8 | use std::process::exit; 9 | use std::str::FromStr; 10 | use std::sync::mpsc::{self, Sender}; 11 | use std::thread; 12 | use std::{fs, vec}; 13 | 14 | // Transaction row representation: one statement row with its date, description (tx), reward points and amount. 15 | #[derive(Debug, Clone)] 16 | pub struct Transaction { 17 | pub date: NaiveDateTime, 18 | pub tx: String, 19 | pub points: i32, 20 | pub amount: f32, 21 | } 22 | 23 | // Default values for a new Transaction: 1970-01-01 00:00:00 date, empty description, zero points and zero amount. 24 | impl Default for Transaction { 25 | fn default() -> Self { 26 | Transaction { 27 | date: NaiveDateTime::new( 28 | NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(), 29 | NaiveTime::from_hms_opt(0, 0, 0).unwrap(), 30 | ), 31 | tx: "".to_owned(), 32 | points: 0, 33 | amount: 0.0, 34 | } 35 | } 36 | } 37 | 38 | // Parse the pdf and send every transaction row found in it through the given channel. 
39 | pub fn parse( 40 | path: String, 41 | name: String, 42 | _password: String, 43 | sender: &Sender>, 44 | ) -> Result<(), Error> { 45 | let file = pdfFile::>::open_password(path.clone(), _password.as_bytes()) 46 | .context(format!("failed to open file {}", path))?; 47 | 48 | // Iterate through pages 49 | for page in file.pages() { 50 | if let Ok(page) = page { 51 | if let Some(content) = &page.contents { 52 | if let Ok(ops) = content.operations(&file) { 53 | let mut transaction = Transaction::default(); 54 | 55 | let mut found_row = false; 56 | let mut column_ct = 0; 57 | let mut header_assigned = false; 58 | let mut header_column_ct = 0; 59 | let mut prev_value = ""; 60 | 61 | for op in ops.iter().skip_while(|op| match op { 62 | Op::TextDraw { ref text } => { 63 | let data = text.as_bytes(); 64 | if let Ok(s) = std::str::from_utf8(data) { 65 | return s.trim() != "Domestic Transactions" 66 | && s.trim() != "International Transactions"; 67 | } 68 | return true; 69 | } 70 | _ => return true, 71 | }) { 72 | match op { 73 | Op::TextDraw { ref text } => { 74 | let data = text.as_bytes(); 75 | if let Ok(s) = std::str::from_utf8(data) { 76 | // figure out the header column count from the table header. 77 | // This makes it easier to figure out the end of transaction lines. 78 | let d = s.trim(); 79 | 80 | if !header_assigned { 81 | // save this value to check in next iteration of Op::BeginText to count header columns. 82 | prev_value = d; 83 | 84 | // read till name. 
(that is the header columns) 85 | match d { 86 | x if x == name => { 87 | header_assigned = true; 88 | // +1 considering 'Cr' (credit/debit) 89 | header_column_ct += 1; 90 | continue; 91 | } 92 | "" | _ => continue, 93 | } 94 | } 95 | 96 | column_ct += 1; 97 | if d == "" { 98 | if !found_row { 99 | column_ct -= 1; 100 | } 101 | 102 | continue; 103 | } 104 | 105 | if column_ct == 1 { 106 | if let Ok(tx_date) = 107 | NaiveDateTime::parse_from_str(d, "%d/%m/%Y %H:%M:%S") 108 | { 109 | found_row = true; 110 | transaction.date = tx_date; 111 | continue; 112 | } 113 | if let Ok(tx_date) = 114 | NaiveDate::parse_from_str(d, "%d/%m/%Y") 115 | { 116 | found_row = true; 117 | transaction.date = NaiveDateTime::new( 118 | tx_date, 119 | NaiveTime::from_hms_opt(0, 0, 0).unwrap(), 120 | ); 121 | continue; 122 | } 123 | } 124 | 125 | if column_ct > 2 && d.contains(".") { 126 | if let Ok(amt) = d.replace(",", "").parse::() { 127 | transaction.amount = amt * -1.0; 128 | continue; 129 | } 130 | } 131 | 132 | // Must be description or debit/credit representation or reward points 133 | if let Ok(tx) = String::from_str(d) { 134 | // skip reward points 135 | if let Ok(p) = tx.replace("- ", "-").parse::() { 136 | transaction.points = p; 137 | continue; 138 | } 139 | 140 | // mark it as credit 141 | if column_ct > 3 && tx == "Cr" { 142 | transaction.amount *= -1.0; 143 | continue; 144 | } 145 | 146 | // assume transaction description to be next to date 147 | if column_ct == 2 { 148 | transaction.tx = tx; 149 | } 150 | } 151 | } 152 | } 153 | 154 | Op::BeginText => { 155 | if !header_assigned { 156 | match prev_value { 157 | "" => continue, 158 | "Domestic Transactions" | "International Transactions" => { 159 | continue 160 | } 161 | _ => header_column_ct += 1, 162 | } 163 | } 164 | } 165 | 166 | Op::EndText => { 167 | match column_ct { 168 | // ignore 0 column_ct 169 | 0 => continue, 170 | 171 | x if x == header_column_ct && found_row => { 172 | // write to stdout 173 | sender 174 | 
.send(vec![ 175 | transaction.date.to_string(), 176 | transaction.tx.clone(), 177 | transaction.points.to_string(), 178 | transaction.amount.to_string(), 179 | ]) 180 | .context("Failed to write row")?; 181 | 182 | // reset found flag 183 | found_row = false; 184 | transaction = Transaction::default(); 185 | column_ct = 0; 186 | } 187 | 188 | _ => continue, 189 | } 190 | } 191 | _ => {} 192 | } 193 | } 194 | } 195 | } 196 | } 197 | } 198 | 199 | Ok(()) 200 | } 201 | 202 | fn date_format_to_regex(date_format: &str) -> Regex { 203 | let regex_str = date_format 204 | .replace("%Y", r"\d{4}") 205 | .replace("%m", r"\d{2}") 206 | .replace("%d", r"\d{2}") 207 | .replace("%H", r"\d{2}") 208 | .replace("%M", r"\d{2}") 209 | .replace("%S", r"\d{2}") 210 | .replace("%z", r"[\+\-]\d{4}") 211 | .replace("%Z", r"[A-Z]{3}"); 212 | 213 | Regex::new(®ex_str).unwrap() 214 | } 215 | 216 | fn main() -> Result<(), Error> { 217 | let matches = Command::new("HDFC credit card statement parser") 218 | .arg( 219 | arg!(--dir ) 220 | .required_unless_present("file") 221 | .conflicts_with("file"), 222 | ) 223 | .arg( 224 | arg!(--file ) 225 | .required_unless_present("dir") 226 | .conflicts_with("dir"), 227 | ) 228 | .arg(arg!(--name ).required(true)) 229 | .arg(arg!(--password ).required(false)) 230 | .arg(arg!(--sortformat ).required(false)) 231 | .arg(arg!(--addheaders).required(false)) 232 | .get_matches(); 233 | 234 | let dir_path = matches.get_one::("dir"); 235 | let file_path = matches.get_one::("file"); 236 | let name = matches.get_one::("name"); 237 | let _password = matches.get_one::("password"); 238 | let add_headers = matches.get_flag("addheaders"); 239 | 240 | let mut pdf_files = Vec::new(); 241 | 242 | // path is directory? 
243 | if let Some(dir_path) = dir_path { 244 | let entries = match fs::read_dir(dir_path) { 245 | Ok(file) => file, 246 | Err(err) => { 247 | eprintln!("Error opening statements directory: {}", err); 248 | exit(1); 249 | } 250 | }; 251 | 252 | // Filter pdf files, sort the statement files based on dates in the file names. 253 | pdf_files = entries 254 | .filter_map(Result::ok) 255 | .map(|entry| entry.path()) 256 | .filter(|path| { 257 | path.extension() 258 | .map_or(false, |ext| ext == "pdf" || ext == "PDF") 259 | }) 260 | .map(|path| path.to_string_lossy().to_string()) 261 | .collect(); 262 | 263 | // Sort only if there is a date format specified 264 | if let Some(sort_format) = matches.get_one::("sortformat") { 265 | pdf_files.sort_by(|a, b| { 266 | let re = date_format_to_regex(sort_format); 267 | let a_date = match re.find(a) { 268 | Some(date_str) => { 269 | NaiveDate::parse_from_str(date_str.as_str(), sort_format).unwrap() 270 | } 271 | None => NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(), 272 | }; 273 | let b_date = match re.find(b) { 274 | Some(date_str) => { 275 | NaiveDate::parse_from_str(date_str.as_str(), sort_format).unwrap() 276 | } 277 | None => NaiveDate::from_ymd_opt(1970, 1, 1).unwrap(), 278 | }; 279 | a_date.cmp(&b_date) 280 | }) 281 | } 282 | } 283 | 284 | // path is file? 
285 | if let Some(file_path) = file_path { 286 | match fs::metadata(file_path) { 287 | Ok(_) => pdf_files.push(file_path.to_string()), 288 | Err(err) => { 289 | eprintln!("Error opening statement file: {}", err); 290 | exit(1); 291 | } 292 | }; 293 | } 294 | 295 | let (tx, rx) = mpsc::channel(); 296 | 297 | let writer_thread = thread::spawn(move || -> Result<(), Error> { 298 | let mut wtr = csv::Writer::from_writer(io::stdout()); 299 | 300 | if add_headers { 301 | // writes the header rows to CSV if user passes --addheaders param 302 | wtr.write_record(&["Date", "Description", "Points", "Amount"]) 303 | .context("Failed to write headers")?; 304 | } 305 | 306 | for record in rx { 307 | wtr.write_record(&record).context("Failed to write row")?; 308 | } 309 | 310 | wtr.flush().context("Error flushing to stdout")?; 311 | Ok(()) 312 | }); 313 | 314 | let pass: String = match _password { 315 | Some(s) => s.clone(), 316 | None => "".to_string(), 317 | }; 318 | 319 | let n: String = match name { 320 | Some(s) => s.clone(), 321 | None => "".to_string(), 322 | }; 323 | 324 | for file in pdf_files { 325 | parse(file, n.clone(), pass.clone(), &tx).context("Failed to parse statement")?; 326 | } 327 | 328 | drop(tx); 329 | 330 | match writer_thread.join() { 331 | Ok(Ok(_)) => (), 332 | Ok(Err(e)) => return Err(e.into()), 333 | Err(e) => return Err(anyhow::anyhow!("Thread panicked: {:?}", e)), 334 | } 335 | 336 | Ok(()) 337 | } 338 | --------------------------------------------------------------------------------