Page 1 of 1

PDF document to Text File

Posted: Thu May 30, 2024 3:58 am
by admin
PDF document to Text File

Code: Select all

import re
import os
import pdfplumber

def extract_data_from_pdf(pdf_path):
    """Return the text of every page in the PDF at *pdf_path* as one string.

    Pages are joined with a newline so that the last line of one page and
    the first line of the next stay separate lines for any later
    line-by-line processing. Pages that yield no text are skipped, so the
    result is an empty string when nothing could be extracted.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        The concatenated page text, or ``''`` if no page produced any text.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # `or ''` guards against a page whose extract_text() result is
        # None/empty (e.g. an image-only page), which would otherwise
        # break string concatenation.
        page_texts = [page.extract_text() or '' for page in pdf.pages]
    # Drop empty pages so a PDF without any text still returns a falsy ''.
    return '\n'.join(page_text for page_text in page_texts if page_text)

# Decimal amount with exactly two decimals. Comma-separated groups may be of
# any length, so plain "1234.56", western "1,234.56" and Indian-style
# "1,23,456.78" are each matched as one whole amount.
_AMOUNT_PATTERN = re.compile(r'\b\d+(?:,\d+)*\.\d{2}\b')


def modify_lines_with_amounts(text):
    """Return the lines of *text* that contain a decimal amount, marked up.

    The text is split on ``'\\n'``. For every line containing an amount
    (digits, optional comma grouping, a dot and exactly two decimals) the
    first such amount is wrapped as ``| amount |``; the rest of the line is
    left untouched. Lines without an amount are omitted from the result.

    Args:
        text: Multi-line string to scan.

    Returns:
        A list of the modified lines, in their original order. Empty when no
        line contains an amount.
    """
    modified_lines = []
    for line in text.split('\n'):
        match = _AMOUNT_PATTERN.search(line)
        if match is None:
            continue
        start, end = match.span()
        # Only the first amount on the line is wrapped.
        modified_lines.append(f"{line[:start]}| {match.group()} |{line[end:]}")
    return modified_lines

# Provide the path to your PDF file
pdf_file_path = "C:\\icai\\Statement new.pdf"

# Extract the PDF text, keep only the lines that contain a decimal amount,
# and write them to "modified_lines.txt" in the same folder as the PDF.
extracted_text = extract_data_from_pdf(pdf_file_path)
if extracted_text:
    modified_lines = modify_lines_with_amounts(extracted_text)
    if modified_lines:
        output_file_path = os.path.join(os.path.dirname(pdf_file_path), "modified_lines.txt")
        # Explicit UTF-8: text extracted from a PDF may contain characters
        # (currency symbols, typographic dashes, ...) that the platform
        # default encoding cannot represent, which would raise
        # UnicodeEncodeError while writing.
        with open(output_file_path, "w", encoding="utf-8") as output_file:
            output_file.writelines(line + '\n' for line in modified_lines)
        print(f"Modified lines exported to {output_file_path}")
    else:
        print("No lines found with amounts in decimals.")
else:
    print("No text extracted from the PDF.")