Skip to content

Commit 87239c3

Browse files
authored
Fix parsing of documents that may contain XML before Doctype (#149)
* Fix parsing of documents that may contain XML before Doctype. This is a fix for malformed documents that may start with an XML tag, or even a comment, before the declaration of the doctype. * Change assertion to check comments length
1 parent a092e27 commit 87239c3

File tree

2 files changed

+51
-1
lines changed

2 files changed

+51
-1
lines changed

native/html5ever_nif/src/flat_dom.rs

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -526,6 +526,8 @@ pub fn flat_sink_to_rec_term<'a>(
526526
child_base: 0,
527527
child_n: 0,
528528
}];
529+
let mut comments_bf_doctype = 0usize;
530+
let mut read_doctype = false;
529531

530532
loop {
531533
let mut top = stack.pop().unwrap();
@@ -567,7 +569,9 @@ pub fn flat_sink_to_rec_term<'a>(
567569
system_id,
568570
} => {
569571
assert!(!stack.is_empty());
570-
assert!(child_stack.is_empty());
572+
assert!(child_stack.is_empty() || comments_bf_doctype == child_stack.len());
573+
574+
read_doctype = true;
571575

572576
term = (
573577
atoms::doctype(),
@@ -596,6 +600,10 @@ pub fn flat_sink_to_rec_term<'a>(
596600
term = StrTendrilWrapper(contents).encode(env);
597601
}
598602
NodeData::Comment { contents } => {
603+
if !read_doctype {
604+
comments_bf_doctype += 1
605+
};
606+
599607
term = (atoms::comment(), StrTendrilWrapper(contents)).encode(env);
600608
}
601609
_ => unimplemented!(""),

test/html5ever_test.exs

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -308,4 +308,46 @@ defmodule Html5everTest do
308308
]}
309309
]}
310310
end
311+
312+
test "parse html starting with a XML tag" do
313+
html = """
314+
<?xml version="1.0" encoding="UTF-8"?>
315+
<!-- also a comment is allowed -->
316+
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
317+
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
318+
<head><title>Hello</title></head>
319+
<body>
320+
<a id="anchor" href="https://example.com">link</a>
321+
</body>
322+
</html>
323+
"""
324+
325+
assert Html5ever.parse(html) ==
326+
{:ok,
327+
[
328+
{:comment, "?xml version=\"1.0\" encoding=\"UTF-8\"?"},
329+
{:comment, " also a comment is allowed "},
330+
{:doctype, "html", "-//W3C//DTD XHTML 1.0 Strict//EN",
331+
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd"},
332+
{
333+
"html",
334+
[{"xmlns", "http://www.w3.org/1999/xhtml"}, {"xml:lang", "en"}, {"lang", "en"}],
335+
[
336+
{"head", [], [{"title", [], ["Hello"]}]},
337+
"\n",
338+
" ",
339+
{"body", [],
340+
[
341+
"\n",
342+
" ",
343+
{"a", [{"id", "anchor"}, {"href", "https://example.com"}], ["link"]},
344+
"\n",
345+
" ",
346+
"\n",
347+
"\n"
348+
]}
349+
]
350+
}
351+
]}
352+
end
311353
end

0 commit comments

Comments
 (0)