Skip to content

Commit b038b6c

Browse files
authored
Update pdf to csv
1 parent f4d900d commit b038b6c

File tree

1 file changed

+0
-67
lines changed

1 file changed

+0
-67
lines changed

pdf to csv

-67
Original file line numberDiff line numberDiff line change
@@ -1,68 +1 @@
1-
import os
2-
import re
3-
import pandas as pd
41

5-
# Directory that is scanned for the input text files.
directory = '/mnt/data'  # Adjust this path if your text files are located in a different directory

# Line-prefix patterns that classify each input line. They are applied with
# .match(), so every pattern is anchored at the first character of the line.
transaction_start_pattern = re.compile(r'^52')  # line carrying the customer-level fields
routing_number_pattern = re.compile(r'^62')  # one transaction
memo_pattern = re.compile(r'^705')  # memo text for the preceding transaction
transaction_end_pattern = re.compile(r'^82')  # end of a group of transactions

# One dict per parsed transaction, in the order the lines were read.
transactions = []

# Customer Name, Tax ID, Sec Code and Description taken from the most recent
# line starting with 52; they are copied into every transaction that follows.
current_customer_name = ""
current_tax_id = ""
current_sec_code = ""
current_description = ""
22-
23-
# Function to parse a single transaction line starting with 62
24-
def parse_62_line(line):
    """Parse one line starting with 62 and append it to ``transactions``.

    The line is cut at fixed character offsets. The customer-level fields
    (Customer Name, Tax ID, Sec Code, Description) are not read from the
    line itself; they are copied from the module-level ``current_*``
    variables as they stand at call time.

    Args:
        line: The text of a single line beginning with 62. Slices that run
            past the end of a short line simply yield empty strings.

    Returns:
        None. The new record is appended to the module-level
        ``transactions`` list.
    """
    # NOTE(review): the Account Number slice is only 4 characters wide and
    # Payee takes everything after offset 26 -- confirm against a sample of
    # the real input layout.
    transaction = {
        "Customer Name": current_customer_name,
        "Tax ID": current_tax_id,
        "Sec Code": current_sec_code,
        "Description": current_description,
        "Routing Number": line[3:12].strip(),
        "Account Number": line[12:16].strip(),
        "Amount": line[16:26].strip(),
        "Payee": line[26:].strip(),
        "Memo": "",  # Memo will be added if a line starting with 705 is found
    }
    transactions.append(transaction)
37-
38-
# Process all text files in the directory
39-
# os.listdir() returns entries in arbitrary order; sorting makes the row order
# of the output, and the header fields carried over from one file to the next,
# reproducible between runs and machines.
for filename in sorted(os.listdir(directory)):
    if not filename.endswith('.txt'):
        continue

    with open(os.path.join(directory, filename), 'r') as text_file:
        # Stream the file line by line instead of loading it whole.
        for line in text_file:
            line = line.strip()

            # Parse Customer Name, Tax ID, Sec Code, and Description for lines starting with 52
            if transaction_start_pattern.match(line):
                current_customer_name = line[4:34].strip()  # characters 4 up to (not including) 34
                current_tax_id = line[34:45].strip()  # characters 34 up to 45
                current_sec_code = line[45:48].strip()  # characters 45 up to 48
                current_description = line[48:58].strip()  # characters 48 up to 58

            # Parse each transaction line starting with 62
            elif routing_number_pattern.match(line):
                parse_62_line(line)

            # Add Memo to the last transaction if line starts with 705.
            # A later 705 line for the same transaction overwrites the earlier one.
            elif memo_pattern.match(line) and transactions:
                transactions[-1]["Memo"] = line[3:].strip()

            # End of a transaction group: nothing to record, move on.
            elif transaction_end_pattern.match(line):
                continue
64-
65-
# Convert transactions list to DataFrame and save as CSV
66-
# Name the columns explicitly so the CSV still gets its header row when no
# transactions were found (an empty list alone yields a frame with no columns).
df = pd.DataFrame(
    transactions,
    columns=[
        "Customer Name",
        "Tax ID",
        "Sec Code",
        "Description",
        "Routing Number",
        "Account Number",
        "Amount",
        "Payee",
        "Memo",
    ],
)
# Write next to the input files so that adjusting `directory` also moves the
# output; with the default directory this is /mnt/data/parsed_transactions.csv.
df.to_csv(os.path.join(directory, 'parsed_transactions.csv'), index=False)
print("Parsed data saved to parsed_transactions.csv")

0 commit comments

Comments
 (0)