Update pdf to csv

ppatel089 · web-flow · commit b038b6c889ec · 2024-10-31T13:02:52.000-04:00
diff --git a/pdf to csv b/pdf to csv
@@ -1,68 +1 @@
-import os
-import re
-import pandas as pd
 
-# Define the directory where text files are stored
-directory = '/mnt/data'  # Adjust this path if your text files are located in a different directory
-
-# Define regex patterns to parse each section
-transaction_start_pattern = re.compile(r'^52')
-routing_number_pattern = re.compile(r'^62')
-memo_pattern = re.compile(r'^705')
-transaction_end_pattern = re.compile(r'^82')
-
-# Initialize empty list to store parsed transactions
-transactions = []
-
-# Variables to hold the current Customer Name, Tax ID, Sec Code, and Description
-current_customer_name = ""
-current_tax_id = ""
-current_sec_code = ""
-current_description = ""
-
-# Function to parse a single transaction line starting with 62
-def parse_62_line(line):
-    transaction = {
-        "Customer Name": current_customer_name,
-        "Tax ID": current_tax_id,
-        "Sec Code": current_sec_code,
-        "Description": current_description,
-        "Routing Number": line[3:12].strip(),
-        "Account Number": line[12:16].strip(),
-        "Amount": line[16:26].strip(),
-        "Payee": line[26:].strip(),
-        "Memo": ""  # Memo will be added if a line starting with 705 is found
-    }
-    transactions.append(transaction)
-
-# Process all text files in the directory
-for filename in os.listdir(directory):
-    if filename.endswith('.txt'):
-        with open(os.path.join(directory, filename), 'r') as file:
-            lines = file.readlines()
-            for line in lines:
-                line = line.strip()
-
-                # Parse Customer Name, Tax ID, Sec Code, and Description for lines starting with 52
-                if transaction_start_pattern.match(line):
-                    current_customer_name = line[4:34].strip()  # Extract Customer Name from positions 4 to 34
-                    current_tax_id = line[34:45].strip()  # Extract Tax ID from positions 34 to 45
-                    current_sec_code = line[45:48].strip()  # Extract Sec Code from positions 45 to 48
-                    current_description = line[48:58].strip()  # Extract Description from positions 48 to 58
-
-                # Parse each transaction line starting with 62
-                elif routing_number_pattern.match(line):
-                    parse_62_line(line)
-
-                # Add Memo to the last transaction if line starts with 705
-                elif memo_pattern.match(line) and transactions:
-                    transactions[-1]["Memo"] = line[3:].strip()
-
-                # Identify end of transaction group
-                elif transaction_end_pattern.match(line):
-                    continue  # Move to the next group of transactions
-
-# Convert transactions list to DataFrame and save as CSV
-df = pd.DataFrame(transactions)
-df.to_csv('/mnt/data/parsed_transactions.csv', index=False)
-print("Parsed data saved to parsed_transactions.csv")