Skip to content

Commit fdcac82

Browse files
committed
Improved JavaScript regex recognition for extracting JS messages
1 parent 4f8c7f6 commit fdcac82

File tree

3 files changed

+89
-1
lines changed

3 files changed

+89
-1
lines changed

CHANGES.rst

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,15 @@
11
Babel Changelog
22
===============
33

4+
Next version
5+
--------------
6+
7+
Bugfixes
8+
~~~~~~~~
9+
10+
* Regex for parsing JavaScript regexes improved. Before this, the lexer couldn't recognize certain regexes,
11+
breaking the parsing of JS files.
12+
413
Version 2.9.1
514
-------------
615

babel/messages/jslexer.py

Lines changed: 51 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,57 @@
2424
name_re = re.compile(r'[\w$_][\w\d$_]*', re.UNICODE)
2525
dotted_name_re = re.compile(r'[\w$_][\w\d$_.]*[\w\d$_.]', re.UNICODE)
2626
division_re = re.compile(r'/=?')
27-
regex_re = re.compile(r'/(?:[^/\\]*(?:\\.[^/\\]*)*)/[a-zA-Z]*', re.DOTALL)
27+
28+
regex_re = re.compile(
29+
r'''
30+
31+
# Opening slash of the regex
32+
/
33+
34+
(?:
35+
36+
# 1) Backslash-escaped character
37+
#
38+
# Match a backslash `\` and then its following character, allowing
39+
# to backslash-escape the `/` for example.
40+
(?:\\.)?
41+
42+
|
43+
44+
# 2) Regex character class `[a-z]`
45+
#
46+
# Match regex character class, like `[a-z]`. Inside a character
47+
# class, a `/` character may appear, which does not close the
48+
# regex. Therefore we allow it here inside a character class.
49+
\[
50+
(?:
51+
[^\]]*
52+
|
53+
\\\]
54+
)*
55+
\]
56+
57+
|
58+
59+
# 3) Other characters
60+
#
61+
# Match anything except a closing slash `/`, a backslash `\`, or an
62+
# opening bracket `[`. Those last two will be handled by the other
63+
# matchers.
64+
[^/\\\[]*
65+
66+
)*
67+
68+
# Closing slash of the regex
69+
/
70+
71+
# regex flags
72+
[a-zA-Z]*
73+
74+
''',
75+
re.DOTALL + re.VERBOSE
76+
)
77+
2878
line_re = re.compile(r'(\r\n|\n|\r)')
2979
line_join_re = re.compile(r'\\' + line_re.pattern)
3080
uni_escape_re = re.compile(r'[a-fA-F0-9]{1,4}')

tests/messages/test_js_extract.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -151,3 +151,32 @@ def test_template_string_tag_usage():
151151
)
152152

153153
assert messages == [(1, 'Tag template, wow', [], None)]
154+
155+
156+
def test_regex_with_non_escaped_slash():
157+
"""
158+
Test if regexes with non-escaped slashes are parsed correctly.
159+
160+
A Javascript regex that is opened and closed with slashes, allows a
161+
non-escaped slash inside a character class, like: [/]. In the past, the
162+
babel JS lexer thought this closed the regex.
163+
164+
If a " followed the falsely closing /, then Babel thought a JavaScript
165+
string was started, and would stretch it to the next quote. This caused the
166+
bug.
167+
168+
The regex in babel/messages/jslexer.py now covers this scenario, and this
169+
unit test makes sure it works.
170+
"""
171+
buf = BytesIO(b"""\
172+
msg1 = _('message 1')
173+
regex1 = /[/]"/
174+
msg2 = _('message 2')
175+
fake_closing_quote = '"'
176+
""")
177+
messages = \
178+
list(extract.extract('javascript', buf, extract.DEFAULT_KEYWORDS,
179+
[], {}))
180+
181+
assert messages == [(1, 'message 1', [], None),
182+
(3, 'message 2', [], None)]

0 commit comments

Comments
 (0)