#!/usr/bin/env python3
"""
HSBC Bank Statement PDF Extractor
Extracts transaction data from HSBC Kinetic Current Account statements
Author: Claude (Anthropic)
Date: December 2024
"""
import subprocess
import re
import csv
from datetime import datetime
def extract_text_from_pdf(pdf_path):
"""Extract text from PDF using pdftotext command"""
try:
result = subprocess.run(
['pdftotext', pdf_path, '-'],
capture_output=True,
text=True,
check=True
)
return result.stdout
except subprocess.CalledProcessError as e:
print(f"Error extracting PDF: {e}")
return None
except FileNotFoundError:
print("Error: pdftotext not found. Please install poppler-utils.")
print(" Windows: Download from https://github.com/oschwartz10612/poppler-windows/releases/")
print(" Mac: brew install poppler")
print(" Linux: apt-get install poppler-utils")
return None
def parse_hsbc_statements(pdf_text):
"""
Parse HSBC bank statement text and extract transactions
Returns: List of transaction dictionaries with keys:
- date: Transaction date (DD-MMM-YY format)
- payment_type: DD, CR, VIS, BP, DR, )))
- details: Transaction description
- paid_out: Amount debited
- paid_in: Amount credited
- balance: Running balance
"""
transactions = []
# Split into lines
lines = pdf_text.split('\n')
# Pattern to match transaction lines
# Example: "01 Oct 24 DD DVLA-FP20FYB 52.50 392.24"
tx_pattern = re.compile(
r'(\d{2}\s+\w{3}\s+\d{2})\s+' # Date
r'(\w+|\)\)\))\s+' # Payment type
r'(.+?)\s+' # Details
r'([\d,]+\.?\d*)\s*' # Paid out OR Paid in
r'([\d,]+\.?\d*)?\s*' # Balance (optional)
r'([D])?' # Debit indicator (optional)
)
current_balance = None
for i, line in enumerate(lines):
line = line.strip()
# Skip empty lines
if not line:
continue
# Look for date pattern at start of line
if re.match(r'\d{2}\s+\w{3}\s+\d{2}', line):
# Try to parse as transaction
match = tx_pattern.search(line)
if match:
date_str = match.group(1).replace(' ', '-')
payment_type = match.group(2)
details = match.group(3).strip()
amount1 = match.group(4).replace(',', '')
amount2 = match.group(5).replace(',', '') if match.group(5) else ''
is_debit = match.group(6) == 'D'
# Determine which amount is paid in/out based on pattern
# HSBC format: Date, Type, Details, Amount, Balance
# Amount could be paid in or paid out
# Balance is the running balance after transaction
paid_out = ''
paid_in = ''
balance = ''
# If we have both amounts, second is usually the balance
if amount2:
# First amount is the transaction amount
# Need to determine if it's paid in or paid out
# CR (Credit) = Paid In
# DD, VIS, DR, ))) = Usually Paid Out
if payment_type == 'CR' or payment_type == 'BP':
paid_in = amount1
else:
paid_out = amount1
balance = amount2
if is_debit:
balance = f"-{balance}"
else:
# Only one amount - could be transaction or balance
# Need context from previous/next lines
if payment_type == 'CR':
paid_in = amount1
else:
paid_out = amount1
transaction = {
'date': date_str,
'payment_type': payment_type,
'details': details,
'paid_out': paid_out,
'paid_in': paid_in,
'balance': balance
}
transactions.append(transaction)
if balance:
current_balance = balance
return transactions
def verify_balances(transactions):
"""Verify running balances are mathematically correct"""
running_balance = 0
errors = []
for i, tx in enumerate(transactions):
if tx['balance']:
try:
stated_balance = float(tx['balance'].replace(',', ''))
# Calculate what balance should be
if i == 0:
running_balance = stated_balance
else:
paid_in = float(tx['paid_in']) if tx['paid_in'] else 0
paid_out = float(tx['paid_out']) if tx['paid_out'] else 0
running_balance = running_balance + paid_in - paid_out
# Check if it matches (allow 0.01 difference for rounding)
if abs(running_balance - stated_balance) > 0.01:
errors.append({
'row': i + 1,
'date': tx['date'],
'calculated': running_balance,
'stated': stated_balance,
'difference': running_balance - stated_balance
})
# Sync to stated balance
running_balance = stated_balance
except ValueError:
pass
return errors
def save_to_csv(transactions, output_file):
"""Save transactions to CSV file"""
with open(output_file, 'w', newline='', encoding='utf-8') as f:
writer = csv.DictWriter(f, fieldnames=[
'date', 'payment_type', 'details', 'paid_out', 'paid_in', 'balance'
])
writer.writeheader()
writer.writerows(transactions)
print(f"✅ Saved {len(transactions)} transactions to {output_file}")
def main():
"""Main extraction workflow"""
# Configuration
pdf_file = "bank_statement.pdf" # Change this to your PDF file path
output_file = "transactions.csv"
print("=" * 80)
print("HSBC Bank Statement Extractor")
print("=" * 80)
print()
# Step 1: Extract text from PDF
print(f"📄 Reading PDF: {pdf_file}")
pdf_text = extract_text_from_pdf(pdf_file)
if not pdf_text:
print("❌ Failed to extract text from PDF")
return
print(f"✅ Extracted {len(pdf_text)} characters from PDF")
print()
# Step 2: Parse transactions
print("🔍 Parsing transactions...")
transactions = parse_hsbc_statements(pdf_text)
print(f"✅ Found {len(transactions)} transactions")
print()
# Step 3: Verify balances
print("🔢 Verifying balances...")
errors = verify_balances(transactions)
if errors:
print(f"⚠️ Found {len(errors)} balance discrepancies:")
for err in errors[:5]: # Show first 5 errors
print(f" Row {err['row']} ({err['date']}): "
f"Calculated £{err['calculated']:.2f} vs "
f"Stated £{err['stated']:.2f} "
f"(Diff: £{err['difference']:.2f})")
if len(errors) > 5:
print(f" ... and {len(errors) - 5} more")
else:
print("✅ All balances verified correctly")
print()
# Step 4: Save to CSV
print(f"💾 Saving to CSV: {output_file}")
save_to_csv(transactions, output_file)
print()
# Summary statistics
print("=" * 80)
print("SUMMARY")
print("=" * 80)
if transactions:
total_in = sum(float(tx['paid_in']) if tx['paid_in'] else 0
for tx in transactions)
total_out = sum(float(tx['paid_out']) if tx['paid_out'] else 0
for tx in transactions)
print(f"Total Transactions: {len(transactions)}")
print(f"Total Paid In: £{total_in:,.2f}")
print(f"Total Paid Out: £{total_out:,.2f}")
print(f"Net Movement: £{total_in - total_out:,.2f}")
if transactions[0]['balance']:
print(f"Opening Balance: £{transactions[0]['balance']}")
if transactions[-1]['balance']:
print(f"Closing Balance: £{transactions[-1]['balance']}")
print("=" * 80)
print("✅ Extraction complete!")
if __name__ == "__main__":
main()