Skip to content

Commit 20464e1

Browse files
committed
Allow extracting deeply nested calls in Python
Currently the Python extractor does not support deeply nested gettext calls (deeper than as a direct argument to the top-level gettext call), e.g.:

```py
_("Hello %s", _("Person"))
_("Hello %s", random_function(", ".join([_("Person 1"), _("Person 2")])))
```

The extraction code was refactored quite a bit to simplify the flow and support this use-case.

Fixes #1125 (meanwhile also fixes #1123)
1 parent 98b9562 commit 20464e1

File tree

2 files changed

+145
-93
lines changed

2 files changed

+145
-93
lines changed

babel/messages/extract.py

Lines changed: 119 additions & 77 deletions
Original file line numberDiff line numberDiff line change
@@ -30,10 +30,11 @@
3030
Mapping,
3131
MutableSequence,
3232
)
33+
from dataclasses import dataclass
3334
from functools import lru_cache
3435
from os.path import relpath
3536
from textwrap import dedent
36-
from tokenize import COMMENT, NAME, NL, OP, STRING, generate_tokens
37+
from tokenize import COMMENT, NAME, OP, STRING, generate_tokens
3738
from typing import TYPE_CHECKING, Any
3839

3940
from babel.messages._compat import find_entrypoints
@@ -99,6 +100,15 @@ def tell(self) -> int: ...
99100
FSTRING_END = getattr(tokenize, "FSTRING_END", None)
100101

101102

103+
@dataclass
104+
class FunctionStackItem:
105+
function_lineno: int
106+
function_name: str
107+
message_lineno: int | None
108+
messages: list[str | None]
109+
translator_comments: list[tuple[int, str]]
110+
111+
102112
def _strip_comment_tags(comments: MutableSequence[str], tags: Iterable[str]):
103113
"""Helper function for `extract` that strips comment tags from strings
104114
in a list of comment lines. This function operates in-place.
@@ -507,14 +517,6 @@ def extract_python(
507517
:param options: a dictionary of additional options (optional)
508518
:rtype: ``iterator``
509519
"""
510-
funcname = lineno = message_lineno = None
511-
call_stack = -1
512-
buf = []
513-
messages = []
514-
translator_comments = []
515-
in_def = in_translator_comments = False
516-
comment_tag = None
517-
518520
encoding = parse_encoding(fileobj) or options.get('encoding', 'UTF-8')
519521
future_flags = parse_future_flags(fileobj, encoding)
520522
next_line = lambda: fileobj.readline().decode(encoding)
@@ -525,108 +527,148 @@ def extract_python(
525527
# currently parsing one.
526528
current_fstring_start = None
527529

530+
# Keep the stack of all function calls and their related contextual variables,
531+
# so we can handle nested gettext calls.
532+
function_stack: list[FunctionStackItem] = []
533+
# Keep the last encountered function/variable name for when we encounter
534+
# an opening parenthesis
535+
last_name = None
536+
# Keep track of whether we're in a class or function definition
537+
in_def = False
538+
# Keep track of whether we're in a block of translator comments
539+
in_translator_comments = False
540+
# Keep track of the last encountered translator comments
541+
translator_comments = []
542+
# Keep track of the (split) strings encountered
543+
message_buffer = []
544+
528545
for tok, value, (lineno, _), _, _ in tokens:
529-
if call_stack == -1 and tok == NAME and value in ('def', 'class'):
546+
if tok == NAME and value in ('def', 'class'):
547+
# We're entering a class or function definition
530548
in_def = True
531-
elif tok == OP and value == '(':
532-
if in_def:
533-
# Avoid false positives for declarations such as:
534-
# def gettext(arg='message'):
535-
in_def = False
536-
continue
537-
if funcname:
538-
call_stack += 1
539-
elif in_def and tok == OP and value == ':':
540-
# End of a class definition without parens
549+
continue
550+
551+
elif in_def and tok == OP and value in ('(', ':'):
552+
# We're in a class or function definition and should not do anything
541553
in_def = False
542554
continue
543-
elif call_stack == -1 and tok == COMMENT:
555+
556+
elif tok == OP and value == '(' and last_name:
557+
# We're entering a function call
558+
cur_translator_comments = translator_comments
559+
if function_stack and function_stack[-1].function_lineno == lineno:
560+
# If our current function call is on the same line as the previous one,
561+
# copy their translator comments, since they also apply to us.
562+
cur_translator_comments = function_stack[-1].translator_comments
563+
564+
# We add all information needed later for the current function call
565+
function_stack.append(FunctionStackItem(
566+
function_lineno=lineno,
567+
function_name=last_name,
568+
message_lineno=None,
569+
messages=[],
570+
translator_comments=cur_translator_comments,
571+
))
572+
translator_comments = []
573+
message_buffer.clear()
574+
575+
elif tok == COMMENT:
544576
# Strip the comment token from the line
545577
value = value[1:].strip()
546-
if in_translator_comments and \
547-
translator_comments[-1][0] == lineno - 1:
578+
if in_translator_comments and translator_comments[-1][0] == lineno - 1:
548579
# We're already inside a translator comment, continue appending
549580
translator_comments.append((lineno, value))
550581
continue
551-
# If execution reaches this point, let's see if comment line
552-
# starts with one of the comment tags
582+
553583
for comment_tag in comment_tags:
554584
if value.startswith(comment_tag):
585+
# Comment starts with one of the comment tags,
586+
# so let's start capturing it
555587
in_translator_comments = True
556588
translator_comments.append((lineno, value))
557589
break
558-
elif funcname and call_stack == 0:
559-
nested = (tok == NAME and value in keywords)
560-
if (tok == OP and value == ')') or nested:
561-
if buf:
562-
messages.append(''.join(buf))
563-
del buf[:]
590+
591+
elif function_stack and function_stack[-1].function_name in keywords:
592+
# We're inside a translation function call
593+
if tok == OP and value == ')':
594+
# The call has ended, so we yield the translatable term(s)
595+
messages = function_stack[-1].messages
596+
lineno = (
597+
function_stack[-1].message_lineno
598+
or function_stack[-1].function_lineno
599+
)
600+
cur_translator_comments = function_stack[-1].translator_comments
601+
602+
if message_buffer:
603+
messages.append(''.join(message_buffer))
604+
message_buffer.clear()
564605
else:
565606
messages.append(None)
566607

567608
messages = tuple(messages) if len(messages) > 1 else messages[0]
568-
# Comments don't apply unless they immediately
569-
# precede the message
570-
if translator_comments and \
571-
translator_comments[-1][0] < message_lineno - 1:
572-
translator_comments = []
609+
if (
610+
cur_translator_comments
611+
and cur_translator_comments[-1][0] < lineno - 1
612+
):
613+
# The translator comments are not immediately preceding the current
614+
# term, so we skip them.
615+
cur_translator_comments = []
616+
617+
yield (
618+
lineno,
619+
function_stack[-1].function_name,
620+
messages,
621+
[comment[1] for comment in cur_translator_comments],
622+
)
623+
624+
function_stack.pop()
573625

574-
yield (message_lineno, funcname, messages,
575-
[comment[1] for comment in translator_comments])
576-
577-
funcname = lineno = message_lineno = None
578-
call_stack = -1
579-
messages = []
580-
translator_comments = []
581-
in_translator_comments = False
582-
if nested:
583-
funcname = value
584626
elif tok == STRING:
585-
val = _parse_python_string(value, encoding, future_flags)
586-
if val is not None:
587-
if not message_lineno:
588-
message_lineno = lineno
589-
buf.append(val)
627+
# We've encountered a string inside a translation function call
628+
string_value = _parse_python_string(value, encoding, future_flags)
629+
if not function_stack[-1].message_lineno:
630+
function_stack[-1].message_lineno = lineno
631+
if string_value is not None:
632+
message_buffer.append(string_value)
590633

591634
# Python 3.12+, see https://peps.python.org/pep-0701/#new-tokens
592635
elif tok == FSTRING_START:
593636
current_fstring_start = value
594-
if not message_lineno:
595-
message_lineno = lineno
596637
elif tok == FSTRING_MIDDLE:
597638
if current_fstring_start is not None:
598639
current_fstring_start += value
599640
elif tok == FSTRING_END:
600641
if current_fstring_start is not None:
601642
fstring = current_fstring_start + value
602-
val = _parse_python_string(fstring, encoding, future_flags)
603-
if val is not None:
604-
buf.append(val)
643+
string_value = _parse_python_string(fstring, encoding, future_flags)
644+
if string_value is not None:
645+
message_buffer.append(string_value)
605646

606647
elif tok == OP and value == ',':
607-
if buf:
608-
messages.append(''.join(buf))
609-
del buf[:]
648+
# End of a function call argument
649+
if message_buffer:
650+
function_stack[-1].messages.append(''.join(message_buffer))
651+
message_buffer.clear()
610652
else:
611-
messages.append(None)
612-
if translator_comments:
613-
# We have translator comments, and since we're on a
614-
# comma(,) user is allowed to break into a new line
615-
# Let's increase the last comment's lineno in order
616-
# for the comment to still be a valid one
617-
old_lineno, old_comment = translator_comments.pop()
618-
translator_comments.append((old_lineno + 1, old_comment))
619-
620-
elif tok != NL and not message_lineno:
621-
message_lineno = lineno
622-
elif call_stack > 0 and tok == OP and value == ')':
623-
call_stack -= 1
624-
elif funcname and call_stack == -1:
625-
funcname = None
626-
elif tok == NAME and value in keywords:
627-
funcname = value
653+
function_stack[-1].messages.append(None)
628654

629-
if current_fstring_start is not None and tok not in {FSTRING_START, FSTRING_MIDDLE}:
655+
elif function_stack and tok == OP and value == ')':
656+
function_stack.pop()
657+
658+
if in_translator_comments and translator_comments[-1][0] < lineno:
659+
# We have a newline in between the comments, so they don't belong
660+
# together anymore
661+
in_translator_comments = False
662+
663+
if tok == NAME:
664+
last_name = value
665+
if function_stack and not function_stack[-1].message_lineno:
666+
function_stack[-1].message_lineno = lineno
667+
668+
if (
669+
current_fstring_start is not None
670+
and tok not in {FSTRING_START, FSTRING_MIDDLE}
671+
):
630672
# In Python 3.12, tokens other than FSTRING_* mean the
631673
# f-string is dynamic, so we don't want to extract it.
632674
# And if it's FSTRING_END, we've already handled it above.

tests/messages/test_extract.py

Lines changed: 26 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -428,24 +428,34 @@ def test_nested_messages(self):
428428
# NOTE: Third
429429
_(u'Hello, {0} and {1}!', _(u'Heungsub'),
430430
_(u'Armin'))
431+
432+
# NOTE: Fourth
433+
_("Hello %(person)s and %(other_person)s", person=random_fn(_("Person 1")), other_person=random_obj["random_fn"](_("Person 2")))
434+
435+
# NOTE: Fifth
436+
_("Hello %(people)s",
437+
people=random_obj.random_fn(
438+
", ".join([_("Person 1"), _("Person 2")]) + ", and everyone else"
439+
)
440+
)
431441
""")
432442
messages = list(extract.extract_python(buf, ('_',), ['NOTE:'], {}))
433-
assert messages[0][2] == ('Hello, {name}!', None)
434-
assert messages[0][3] == ['NOTE: First']
435-
assert messages[1][2] == 'Foo Bar'
436-
assert messages[1][3] == []
437-
assert messages[2][2] == ('Hello, {name1} and {name2}!', None)
438-
assert messages[2][3] == ['NOTE: Second']
439-
assert messages[3][2] == 'Heungsub'
440-
assert messages[3][3] == []
441-
assert messages[4][2] == 'Armin'
442-
assert messages[4][3] == []
443-
assert messages[5][2] == ('Hello, {0} and {1}!', None)
444-
assert messages[5][3] == ['NOTE: Third']
445-
assert messages[6][2] == 'Heungsub'
446-
assert messages[6][3] == []
447-
assert messages[7][2] == 'Armin'
448-
assert messages[7][3] == []
443+
assert [(m[2], m[3]) for m in messages] == [
444+
('Foo Bar', ['NOTE: First']),
445+
(('Hello, {name}!', None), ['NOTE: First']),
446+
('Heungsub', ['NOTE: Second']),
447+
('Armin', []),
448+
(('Hello, {name1} and {name2}!', None, None), ['NOTE: Second']),
449+
('Heungsub', ['NOTE: Third']),
450+
('Armin', []),
451+
(('Hello, {0} and {1}!', None, None), ['NOTE: Third']),
452+
('Person 1', ['NOTE: Fourth']),
453+
('Person 2', ['NOTE: Fourth']),
454+
(('Hello %(person)s and %(other_person)s', None, None), ['NOTE: Fourth']),
455+
('Person 1', []),
456+
('Person 2', []),
457+
(('Hello %(people)s', None), ['NOTE: Fifth']),
458+
]
449459

450460

451461
class ExtractTestCase(unittest.TestCase):

0 commit comments

Comments
 (0)