Allow extracting nested gettext calls in JavaScript

dylankiss · dylankiss · commit 9131a83b05f5 · 2024-10-10T11:30:29.000+02:00
Currently, the JavaScript extractor does not support nested gettext calls
at all.

The extraction code was refactored to resemble the Python extraction code
as closely as possible and to support this use case.
diff --git a/babel/messages/extract.py b/babel/messages/extract.py
@@ -704,54 +704,109 @@ def extract_javascript(
     :param lineno: line number offset (for parsing embedded fragments)
     """
     from babel.messages.jslexer import Token, tokenize, unquote_string
-    funcname = message_lineno = None
-    messages = []
-    last_argument = None
-    translator_comments = []
-    concatenate_next = False
+
     encoding = options.get('encoding', 'utf-8')
-    last_token = None
-    call_stack = -1
     dotted = any('.' in kw for kw in keywords)
+    last_token = None
+    # Keep the stack of all function calls and its related contextual variables,
+    # so we can handle nested gettext calls.
+    function_stack = []
+    # Keep track of whether we're in a class or function definition
+    in_def = False
+    # Keep track of whether we're in a block of translator comments
+    in_translator_comments = False
+    # Keep track of the last encountered translator comments
+    translator_comments = []
+    # Keep track of the (split) strings encountered
+    message_buffer = []
+
     for token in tokenize(
         fileobj.read().decode(encoding),
-        jsx=options.get("jsx", True),
-        template_string=options.get("template_string", True),
+        jsx=options.get('jsx', True),
+        template_string=options.get('template_string', True),
         dotted=dotted,
         lineno=lineno,
     ):
-        if (  # Turn keyword`foo` expressions into keyword("foo") calls:
-            funcname and  # have a keyword...
-            (last_token and last_token.type == 'name') and  # we've seen nothing after the keyword...
-            token.type == 'template_string'  # this is a template string
+        if token.type == 'name' and token.value in ('class', 'function'):
+            # We're entering a class or function definition
+            in_def = True
+
+        elif in_def and token.type == 'operator' and token.value in ('(', '{'):
+            # We're in a class or function definition and should not do anything
+            in_def = False
+            continue
+
+        elif (
+            last_token
+            and last_token.type == 'name'
+            and last_token.value in keywords
+            and token.type == 'template_string'
         ):
-            message_lineno = token.lineno
-            messages = [unquote_string(token.value)]
-            call_stack = 0
+            # Turn keyword`foo` expressions into keyword("foo") function calls
+            string_value = unquote_string(token.value)
+            cur_translator_comments = translator_comments
+            if function_stack and function_stack[-1]['function_line_no'] == last_token.lineno:
+                # If our current function call is on the same line as the previous one,
+                # copy their translator comments, since they also apply to us.
+                cur_translator_comments = function_stack[-1]['translator_comments']
+
+            # We add all information needed later for the current function call
+            function_stack.append({
+                'function_line_no': last_token.lineno,
+                'function_name': last_token.value,
+                'message_line_no': token.lineno,
+                'messages': [string_value],
+                'translator_comments': cur_translator_comments,
+            })
+            translator_comments = []
+
+            # We act as if we are closing the function call now
             token = Token('operator', ')', token.lineno)
 
-        if options.get('parse_template_string') and not funcname and token.type == 'template_string':
+        if (
+            options.get('parse_template_string')
+            and (not last_token or last_token.type != 'name' or last_token.value not in keywords)
+            and token.type == 'template_string'
+        ):
             yield from parse_template_string(token.value, keywords, comment_tags, options, token.lineno)
 
         elif token.type == 'operator' and token.value == '(':
-            if funcname:
-                message_lineno = token.lineno
-                call_stack += 1
+            if last_token.type == 'name':
+                # We're entering a function call
+                cur_translator_comments = translator_comments
+                if function_stack and function_stack[-1]['function_line_no'] == token.lineno:
+                    # If our current function call is on the same line as the previous one,
+                    # copy their translator comments, since they also apply to us.
+                    cur_translator_comments = function_stack[-1]['translator_comments']
+
+                # We add all information needed later for the current function call
+                function_stack.append({
+                    'function_line_no': token.lineno,
+                    'function_name': last_token.value,
+                    'message_line_no': None,
+                    'messages': [],
+                    'translator_comments': cur_translator_comments,
+                })
+                translator_comments = []
 
-        elif call_stack == -1 and token.type == 'linecomment':
+        elif token.type == 'linecomment':
+            # Strip the comment token from the line
             value = token.value[2:].strip()
-            if translator_comments and \
-               translator_comments[-1][0] == token.lineno - 1:
+            if in_translator_comments and translator_comments[-1][0] == token.lineno - 1:
+                # We're already inside a translator comment, continue appending
                 translator_comments.append((token.lineno, value))
                 continue
 
             for comment_tag in comment_tags:
                 if value.startswith(comment_tag):
-                    translator_comments.append((token.lineno, value.strip()))
+                    # Comment starts with one of the comment tags,
+                    # so let's start capturing it
+                    in_translator_comments = True
+                    translator_comments.append((token.lineno, value))
                     break
 
         elif token.type == 'multilinecomment':
-            # only one multi-line comment may precede a translation
+            # Only one multi-line comment may precede a translation
             translator_comments = []
             value = token.value[2:-2].strip()
             for comment_tag in comment_tags:
@@ -761,68 +816,67 @@ def extract_javascript(
                         lines[0] = lines[0].strip()
                         lines[1:] = dedent('\n'.join(lines[1:])).splitlines()
                         for offset, line in enumerate(lines):
-                            translator_comments.append((token.lineno + offset,
-                                                        line))
+                            translator_comments.append((token.lineno + offset, line))
                     break
 
-        elif funcname and call_stack == 0:
+        elif function_stack and function_stack[-1]['function_name'] in keywords:
+            # We're inside a translation function call
             if token.type == 'operator' and token.value == ')':
-                if last_argument is not None:
-                    messages.append(last_argument)
-                if len(messages) > 1:
-                    messages = tuple(messages)
-                elif messages:
-                    messages = messages[0]
+                # The call has ended, so we yield the translatable term(s)
+                messages = function_stack[-1]['messages']
+                line_no = (
+                    function_stack[-1]['message_line_no']
+                    or function_stack[-1]['function_line_no']
+                )
+                cur_translator_comments = function_stack[-1]['translator_comments']
+
+                if message_buffer:
+                    messages.append(''.join(message_buffer))
+                    message_buffer.clear()
                 else:
-                    messages = None
+                    messages.append(None)
 
-                # Comments don't apply unless they immediately precede the
-                # message
-                if translator_comments and \
-                   translator_comments[-1][0] < message_lineno - 1:
-                    translator_comments = []
+                messages = tuple(messages) if len(messages) > 1 else messages[0]
+                if (
+                    cur_translator_comments
+                    and cur_translator_comments[-1][0] < line_no - 1
+                ):
+                    # The translator comments are not immediately preceding the current
+                    # term, so we skip them.
+                    cur_translator_comments = []
 
-                if messages is not None:
-                    yield (message_lineno, funcname, messages,
-                           [comment[1] for comment in translator_comments])
+                yield (
+                    line_no,
+                    function_stack[-1]['function_name'],
+                    messages,
+                    [comment[1] for comment in cur_translator_comments],
+                )
 
-                funcname = message_lineno = last_argument = None
-                concatenate_next = False
-                translator_comments = []
-                messages = []
-                call_stack = -1
+                function_stack.pop()
 
             elif token.type in ('string', 'template_string'):
-                new_value = unquote_string(token.value)
-                if concatenate_next:
-                    last_argument = (last_argument or '') + new_value
-                    concatenate_next = False
+                # We've encountered a string inside a translation function call
+                string_value = unquote_string(token.value)
+                if not function_stack[-1]['message_line_no']:
+                    function_stack[-1]['message_line_no'] = token.lineno
+                if string_value is not None:
+                    message_buffer.append(string_value)
+
+            elif token.type == 'operator' and token.value == ',':
+                # End of a function call argument
+                if message_buffer:
+                    function_stack[-1]['messages'].append(''.join(message_buffer))
+                    message_buffer.clear()
                 else:
-                    last_argument = new_value
-
-            elif token.type == 'operator':
-                if token.value == ',':
-                    if last_argument is not None:
-                        messages.append(last_argument)
-                        last_argument = None
-                    else:
-                        messages.append(None)
-                    concatenate_next = False
-                elif token.value == '+':
-                    concatenate_next = True
-
-        elif call_stack > 0 and token.type == 'operator' \
-                and token.value == ')':
-            call_stack -= 1
-
-        elif funcname and call_stack == -1:
-            funcname = None
-
-        elif call_stack == -1 and token.type == 'name' and \
-            token.value in keywords and \
-            (last_token is None or last_token.type != 'name' or
-             last_token.value != 'function'):
-            funcname = token.value
+                    function_stack[-1]['messages'].append(None)
+
+        elif function_stack and token.type == 'operator' and token.value == ')':
+            function_stack.pop()
+
+        if in_translator_comments and translator_comments[-1][0] < token.lineno:
+            # We have a newline in between the comments, so they don't belong
+            # together anymore
+            in_translator_comments = False
 
         last_token = token
 
diff --git a/tests/messages/test_js_extract.py b/tests/messages/test_js_extract.py
@@ -191,3 +191,28 @@ def test_inside_nested_template_string():
     )
 
     assert messages == [(1, 'Greetings!', [], None), (1, 'This is a lovely evening.', [], None), (1, 'The day is really nice!', [], None)]
+
+def test_nested_gettext_calls():
+    buf = BytesIO(b"""\
+gettext("Hello %s", gettext("User"));
+gettext("Hello %(user)s", { user: gettext("User") });
+gettext("Hello %s", dummy.dummyFunction(gettext("User")));
+gettext(
+    "Hello %(user)s",
+    { user: dummy.dummyFunction(gettext("User")) },
+);
+""")
+    messages = list(
+        extract.extract('javascript', buf, {"gettext": None}, [], {}),
+    )
+
+    assert messages == [
+        (1, 'User', [], None),
+        (1, 'Hello %s', [], None),
+        (2, 'User', [], None),
+        (2, 'Hello %(user)s', [], None),
+        (3, 'User', [], None),
+        (3, 'Hello %s', [], None),
+        (6, 'User', [], None),
+        (5, 'Hello %(user)s', [], None),
+    ]