#!/usr/bin/env python3 """ HSBC Bank Statement PDF Extractor Extracts transaction data from HSBC Kinetic Current Account statements Author: Claude (Anthropic) Date: December 2024 """ import subprocess import re import csv from datetime import datetime def extract_text_from_pdf(pdf_path): """Extract text from PDF using pdftotext command""" try: result = subprocess.run( ['pdftotext', pdf_path, '-'], capture_output=True, text=True, check=True ) return result.stdout except subprocess.CalledProcessError as e: print(f"Error extracting PDF: {e}") return None except FileNotFoundError: print("Error: pdftotext not found. Please install poppler-utils.") print(" Windows: Download from https://github.com/oschwartz10612/poppler-windows/releases/") print(" Mac: brew install poppler") print(" Linux: apt-get install poppler-utils") return None def parse_hsbc_statements(pdf_text): """ Parse HSBC bank statement text and extract transactions Returns: List of transaction dictionaries with keys: - date: Transaction date (DD-MMM-YY format) - payment_type: DD, CR, VIS, BP, DR, ))) - details: Transaction description - paid_out: Amount debited - paid_in: Amount credited - balance: Running balance """ transactions = [] # Split into lines lines = pdf_text.split('\n') # Pattern to match transaction lines # Example: "01 Oct 24 DD DVLA-FP20FYB 52.50 392.24" tx_pattern = re.compile( r'(\d{2}\s+\w{3}\s+\d{2})\s+' # Date r'(\w+|\)\)\))\s+' # Payment type r'(.+?)\s+' # Details r'([\d,]+\.?\d*)\s*' # Paid out OR Paid in r'([\d,]+\.?\d*)?\s*' # Balance (optional) r'([D])?' # Debit indicator (optional) ) current_balance = None for i, line in enumerate(lines): line = line.strip() # Skip empty lines if not line: continue # Look for date pattern at start of line if re.match(r'\d{2}\s+\w{3}\s+\d{2}', line): # Try to parse as transaction match = tx_pattern.search(line) if match: date_str = match.group(1).replace(' ', '-') payment_type = match.group(2) details = match.group(3).strip() amount1 = match.group(4).replace(',', '') amount2 = match.group(5).replace(',', '') if match.group(5) else '' is_debit = match.group(6) == 'D' # Determine which amount is paid in/out based on pattern # HSBC format: Date, Type, Details, Amount, Balance # Amount could be paid in or paid out # Balance is the running balance after transaction paid_out = '' paid_in = '' balance = '' # If we have both amounts, second is usually the balance if amount2: # First amount is the transaction amount # Need to determine if it's paid in or paid out # CR (Credit) = Paid In # DD, VIS, DR, ))) = Usually Paid Out if payment_type == 'CR' or payment_type == 'BP': paid_in = amount1 else: paid_out = amount1 balance = amount2 if is_debit: balance = f"-{balance}" else: # Only one amount - could be transaction or balance # Need context from previous/next lines if payment_type == 'CR': paid_in = amount1 else: paid_out = amount1 transaction = { 'date': date_str, 'payment_type': payment_type, 'details': details, 'paid_out': paid_out, 'paid_in': paid_in, 'balance': balance } transactions.append(transaction) if balance: current_balance = balance return transactions def verify_balances(transactions): """Verify running balances are mathematically correct""" running_balance = 0 errors = [] for i, tx in enumerate(transactions): if tx['balance']: try: stated_balance = float(tx['balance'].replace(',', '')) # Calculate what balance should be if i == 0: running_balance = stated_balance else: paid_in = float(tx['paid_in']) if tx['paid_in'] else 0 paid_out = float(tx['paid_out']) if tx['paid_out'] else 0 running_balance = running_balance + paid_in - paid_out # Check if it matches (allow 0.01 difference for rounding) if abs(running_balance - stated_balance) > 0.01: errors.append({ 'row': i + 1, 'date': tx['date'], 'calculated': running_balance, 'stated': stated_balance, 'difference': running_balance - stated_balance }) # Sync to stated balance running_balance = stated_balance except ValueError: pass return errors def save_to_csv(transactions, output_file): """Save transactions to CSV file""" with open(output_file, 'w', newline='', encoding='utf-8') as f: writer = csv.DictWriter(f, fieldnames=[ 'date', 'payment_type', 'details', 'paid_out', 'paid_in', 'balance' ]) writer.writeheader() writer.writerows(transactions) print(f"✅ Saved {len(transactions)} transactions to {output_file}") def main(): """Main extraction workflow""" # Configuration pdf_file = "bank_statement.pdf" # Change this to your PDF file path output_file = "transactions.csv" print("=" * 80) print("HSBC Bank Statement Extractor") print("=" * 80) print() # Step 1: Extract text from PDF print(f"📄 Reading PDF: {pdf_file}") pdf_text = extract_text_from_pdf(pdf_file) if not pdf_text: print("❌ Failed to extract text from PDF") return print(f"✅ Extracted {len(pdf_text)} characters from PDF") print() # Step 2: Parse transactions print("🔍 Parsing transactions...") transactions = parse_hsbc_statements(pdf_text) print(f"✅ Found {len(transactions)} transactions") print() # Step 3: Verify balances print("🔢 Verifying balances...") errors = verify_balances(transactions) if errors: print(f"⚠️ Found {len(errors)} balance discrepancies:") for err in errors[:5]: # Show first 5 errors print(f" Row {err['row']} ({err['date']}): " f"Calculated £{err['calculated']:.2f} vs " f"Stated £{err['stated']:.2f} " f"(Diff: £{err['difference']:.2f})") if len(errors) > 5: print(f" ... and {len(errors) - 5} more") else: print("✅ All balances verified correctly") print() # Step 4: Save to CSV print(f"💾 Saving to CSV: {output_file}") save_to_csv(transactions, output_file) print() # Summary statistics print("=" * 80) print("SUMMARY") print("=" * 80) if transactions: total_in = sum(float(tx['paid_in']) if tx['paid_in'] else 0 for tx in transactions) total_out = sum(float(tx['paid_out']) if tx['paid_out'] else 0 for tx in transactions) print(f"Total Transactions: {len(transactions)}") print(f"Total Paid In: £{total_in:,.2f}") print(f"Total Paid Out: £{total_out:,.2f}") print(f"Net Movement: £{total_in - total_out:,.2f}") if transactions[0]['balance']: print(f"Opening Balance: £{transactions[0]['balance']}") if transactions[-1]['balance']: print(f"Closing Balance: £{transactions[-1]['balance']}") print("=" * 80) print("✅ Extraction complete!") if __name__ == "__main__": main()