PDF document to Text File
Posted: Thu May 30, 2024 3:58 am
PDF document to Text File
Code: Select all
import re
import os
import pdfplumber
def extract_data_from_pdf(pdf_path):
    """Return the text of every page of a PDF joined into one string.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        The extracted text of all pages that produced any text, with a
        newline between consecutive pages. An empty string is returned
        when no page yields text.
    """
    page_texts = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            page_text = page.extract_text()
            # NOTE(review): depending on the pdfplumber version a page without
            # extractable text may come back as None or '' -- skip both so the
            # join below never sees a non-string.
            if page_text:
                page_texts.append(page_text)
    # Separate pages with a newline; otherwise the last line of one page is
    # glued to the first line of the next and line-based parsing breaks.
    return '\n'.join(page_texts)
def modify_lines_with_amounts(text, pattern=r'\b\d{1,3}(?:,\d{3})*\.\d{2}\b'):
    """Return the lines of ``text`` that contain an amount, with it delimited.

    The text is split on ``'\\n'``. For each line, the first match of
    ``pattern`` is wrapped as ``| <amount> |``; lines without a match are
    dropped from the result.

    Args:
        text: Multi-line string to scan.
        pattern: Regular expression describing an amount. The default matches
            one to three digits, optional comma-separated groups of exactly
            three digits, and exactly two decimals (e.g. ``45.50``,
            ``1,200.00``). It does not match ungrouped amounts with more than
            three integer digits such as ``1234.56``; pass a different
            pattern if the document uses another number format.

    Returns:
        A list of the modified lines, in their original order. Empty when no
        line contains a match.
    """
    # Compile once instead of handing the raw pattern to re.search per line.
    amount_re = re.compile(pattern)
    modified_lines = []
    for line in text.split('\n'):
        match = amount_re.search(line)
        if match is None:
            continue
        # Only the first amount on the line is delimited; any later amounts
        # on the same line are left as they are.
        start, end = match.span()
        modified_lines.append(
            line[:start] + f"| {line[start:end]} |" + line[end:]
        )
    return modified_lines
# Provide the path to your PDF file
pdf_file_path = "C:\\icai\\Statement new.pdf"

# Script flow: extract the PDF text, keep only the lines that contain a
# decimal amount (amount delimited with '|'), and write them to
# "modified_lines.txt" in the same directory as the PDF.
extracted_text = extract_data_from_pdf(pdf_file_path)
if extracted_text:
    modified_lines = modify_lines_with_amounts(extracted_text)
    if modified_lines:
        output_file_path = os.path.join(os.path.dirname(pdf_file_path), "modified_lines.txt")
        # Explicit UTF-8: without it the platform's locale encoding is used,
        # which can fail on characters it cannot represent (for example a
        # currency symbol in the extracted text).
        with open(output_file_path, "w", encoding="utf-8") as output_file:
            output_file.writelines(line + '\n' for line in modified_lines)
        print(f"Modified lines exported to {output_file_path}")
    else:
        print("No lines found with amounts in decimals.")
else:
    print("No text extracted from the PDF.")