Skip to content

Commit 5b2ea12

Browse files
committed
v0.7615 - parsing improvements
1 parent e09250b commit 5b2ea12

File tree

4 files changed

+110
-41
lines changed

4 files changed

+110
-41
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -240,6 +240,8 @@ If you run into any issues, consult the logs or reach out on the repository's [I
240240
---
241241

242242
# Changelog
243+
- v0.7615 - Parsing improvements
244+
- Improved text formatting & escaping in complex markdown vs. html cases
243245
- v0.7614 - Better stock market data fetching from Yahoo Finance
244246
- Changes made to `src/api_get_stock_prices_yfinance.py`
245247
- => More accurate ticker symbol searches, fallbacks, multi-day data etc.

config/config.ini

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -125,7 +125,8 @@ Enabled = True
125125

126126
# The preferred, more capable model to use by default (e.g., gpt-4o, gpt-4.5-preview).
127127
# This model will be used until its daily token limit (PremiumTokenLimit) is reached.
128-
PremiumModel = gpt-4o
128+
# PremiumModel = gpt-4o
129+
PremiumModel = gpt-4.1
129130

130131
# The cheaper model to switch to when the PremiumTokenLimit is reached (e.g., gpt-4o-mini).
131132
# This model has its own daily token limit (MiniTokenLimit).

src/main.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,7 +6,7 @@
66
# https://github.com/FlyingFathead/TelegramBot-OpenAI-API
77
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
88
# version of this program
9-
version_number = "0.7614"
9+
version_number = "0.7615"
1010

1111
# Add the project root directory to Python's path
1212
import sys

src/modules.py

Lines changed: 105 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -106,47 +106,113 @@ def preserve_html_and_escape_text(text):
106106
escaped_text += html.escape(text[last_end:])
107107
return escaped_text
108108

109-
# markdown to html parsing (v0.737.2)
109+
# markdown to html parsing (v0.7615)
110110
def markdown_to_html(text):
111-
try:
112-
# Handle the code blocks with optional language specification first
113-
def replace_codeblock(match):
114-
codeblock = match.group(2) # Get the actual code inside the block
115-
language = match.group(1) # Get the language identifier
116-
escaped_code = html.escape(codeblock.strip())
117-
if language:
118-
return f'<pre><code class="language-{language}">{escaped_code}</code></pre>'
119-
else:
120-
return f'<pre><code>{escaped_code}</code></pre>'
121-
122-
# Replace code blocks with <pre><code> tags
123-
text = re.sub(r'```(\w+)?\n([\s\S]*?)```', replace_codeblock, text)
124-
125-
# Now handle Markdown links and convert them to HTML
126-
def replace_markdown_link(match):
127-
link_text = match.group(1) # The text to display
128-
url = match.group(2) # The URL
129-
return f'<a href="{html.escape(url)}">{html.escape(link_text)}</a>'
130-
131-
# Replace Markdown links [text](url) with HTML <a> tags
132-
text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', replace_markdown_link, text)
133-
134-
# Handle inline code and other markdown elements
135-
text = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text)
136-
text = re.sub(r'\*(.*?)\*', r'<i>\1</i>', text)
137-
text = re.sub(r'_(.*?)_', r'<i>\1</i>', text)
138-
text = re.sub(r'`([^`]*)`', r'<code>\1</code>', text)
139-
text = re.sub(r'######\s*(.*)', r'➤ <b>\1</b>', text)
140-
text = re.sub(r'#####\s*(.*)', r'➤ <b>\1</b>', text)
141-
text = re.sub(r'####\s*(.*)', r'➤ <b>\1</b>', text)
142-
text = re.sub(r'###\s*(.*)', r'➤ <b>\1</b>', text)
143-
text = re.sub(r'##\s*(.*)', r'➤ <b>\1</b>', text)
144-
text = re.sub(r'#\s*(.*)', r'➤ <b>\1</b>', text)
145-
146-
return text
111+
"""
112+
Convert a simple subset of Markdown to HTML,
113+
ensuring that code blocks are extracted first so they
114+
don't get accidentally transformed by heading/bold/italic rules.
115+
"""
116+
# 1) Extract code blocks into placeholders
117+
code_blocks = []
118+
119+
def extract_codeblock(match):
120+
language = match.group(1) or "" # i.e. "python"
121+
code_body = match.group(2) # the code text
122+
code_blocks.append((language, code_body))
123+
placeholder_index = len(code_blocks) - 1
124+
# Return a placeholder token like [CODEBLOCK_0]
125+
return f"[CODEBLOCK_{placeholder_index}]"
126+
127+
# Regex: triple backticks with optional language
128+
# [\s\S] matches any character including newlines (same effect as '.' under re.DOTALL), so the code body can span multiple lines
129+
text = re.sub(
130+
r'```(\w+)?\n([\s\S]*?)```',
131+
extract_codeblock,
132+
text
133+
)
134+
135+
# 2) Now do the normal Markdown parsing on whatever’s left (outside code blocks)
136+
137+
# Headings: only match at the start of lines (via ^) and multiline
138+
text = re.sub(r'^(######)\s+(.*)', r'➤ <b>\2</b>', text, flags=re.MULTILINE)
139+
text = re.sub(r'^(#####)\s+(.*)', r'➤ <b>\2</b>', text, flags=re.MULTILINE)
140+
text = re.sub(r'^(####)\s+(.*)', r'➤ <b>\2</b>', text, flags=re.MULTILINE)
141+
text = re.sub(r'^(###)\s+(.*)', r'➤ <b>\2</b>', text, flags=re.MULTILINE)
142+
text = re.sub(r'^(##)\s+(.*)', r'➤ <b>\2</b>', text, flags=re.MULTILINE)
143+
text = re.sub(r'^#\s+(.*)', r'➤ <b>\1</b>', text, flags=re.MULTILINE)
144+
145+
# Links of the form [text](url)
146+
def replace_markdown_link(m):
147+
link_text = m.group(1)
148+
url = m.group(2)
149+
# Escape any HTML entities in the URL or text
150+
return f'<a href="{html.escape(url)}">{html.escape(link_text)}</a>'
151+
text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', replace_markdown_link, text)
152+
153+
# Bold
154+
text = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text)
155+
156+
# Italics: handle both `*text*` and `_text_`
157+
text = re.sub(r'\*(.*?)\*', r'<i>\1</i>', text)
158+
text = re.sub(r'_(.*?)_', r'<i>\1</i>', text)
159+
160+
# Inline code with single backticks
161+
text = re.sub(r'`([^`]*)`', r'<code>\1</code>', text)
162+
163+
# 3) Re‐insert the code blocks
164+
for i, (language, code_body) in enumerate(code_blocks):
165+
escaped_code = html.escape(code_body.strip())
166+
if language:
167+
block_html = f'<pre><code class="language-{language}">{escaped_code}</code></pre>'
168+
else:
169+
block_html = f'<pre><code>{escaped_code}</code></pre>'
170+
# Replace [CODEBLOCK_i] with the final <pre><code> block
171+
text = text.replace(f"[CODEBLOCK_{i}]", block_html, 1)
147172

148-
except Exception as e:
149-
return str(e)
173+
return text
174+
175+
# # markdown to html parsing (v0.737.2)
176+
# def markdown_to_html(text):
177+
# try:
178+
# # Handle the code blocks with optional language specification first
179+
# def replace_codeblock(match):
180+
# codeblock = match.group(2) # Get the actual code inside the block
181+
# language = match.group(1) # Get the language identifier
182+
# escaped_code = html.escape(codeblock.strip())
183+
# if language:
184+
# return f'<pre><code class="language-{language}">{escaped_code}</code></pre>'
185+
# else:
186+
# return f'<pre><code>{escaped_code}</code></pre>'
187+
188+
# # Replace code blocks with <pre><code> tags
189+
# text = re.sub(r'```(\w+)?\n([\s\S]*?)```', replace_codeblock, text)
190+
191+
# # Now handle Markdown links and convert them to HTML
192+
# def replace_markdown_link(match):
193+
# link_text = match.group(1) # The text to display
194+
# url = match.group(2) # The URL
195+
# return f'<a href="{html.escape(url)}">{html.escape(link_text)}</a>'
196+
197+
# # Replace Markdown links [text](url) with HTML <a> tags
198+
# text = re.sub(r'\[([^\]]+)\]\(([^)]+)\)', replace_markdown_link, text)
199+
200+
# # Handle inline code and other markdown elements
201+
# text = re.sub(r'\*\*(.*?)\*\*', r'<b>\1</b>', text)
202+
# text = re.sub(r'\*(.*?)\*', r'<i>\1</i>', text)
203+
# text = re.sub(r'_(.*?)_', r'<i>\1</i>', text)
204+
# text = re.sub(r'`([^`]*)`', r'<code>\1</code>', text)
205+
# text = re.sub(r'######\s*(.*)', r'➤ <b>\1</b>', text)
206+
# text = re.sub(r'#####\s*(.*)', r'➤ <b>\1</b>', text)
207+
# text = re.sub(r'####\s*(.*)', r'➤ <b>\1</b>', text)
208+
# text = re.sub(r'###\s*(.*)', r'➤ <b>\1</b>', text)
209+
# text = re.sub(r'##\s*(.*)', r'➤ <b>\1</b>', text)
210+
# text = re.sub(r'#\s*(.*)', r'➤ <b>\1</b>', text)
211+
212+
# return text
213+
214+
# except Exception as e:
215+
# return str(e)
150216

151217
# Check and update the global rate limit.
152218
def check_global_rate_limit(max_requests_per_minute, global_request_count, rate_limit_reset_time):

0 commit comments

Comments
 (0)