From cf0392e87b11dfd26f6aefc56ee1a9dfec43d809 Mon Sep 17 00:00:00 2001 From: Mike Dalessio Date: Sat, 14 Aug 2021 11:44:13 -0400 Subject: [PATCH] fix: skip relinking namespaces in HTML4/5 documents Also fix both implementations of relink_namespaces to not skip attributes when the node itself doesn't have a namespace. Finally, the private method XML::Node#add_child_node_and_reparent_attrs is no longer needed after fixing relink_namespaces, so this commit essentially reverts 9fd03d8. --- CHANGELOG.md | 5 +++ ext/java/nokogiri/XmlNode.java | 42 ++++++++++--------- ext/nokogiri/xml_node.c | 29 +++++++------ lib/nokogiri/html5/node.rb | 22 +--------- lib/nokogiri/xml/node.rb | 16 ++----- .../test_namespaces_in_created_doc.rb | 22 +++++----- 6 files changed, 59 insertions(+), 77 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index fdd157d862..1399f3f09a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,11 @@ Nokogiri follows [Semantic Versioning](https://semver.org/), please see the [REA ## next / unreleased +### Fixed + +* [CRuby] When reparenting HTML nodes, do not attempt to relink namespaces. Previously, an HTML attribute with a colon might be interpreted as a prefixed/namespaced atttribute (for example, "xml:lang"). [[#1790](https://github.com/sparklemotion/nokogiri/issues/1790)] + + ### Improved * [CRuby] `Node#line` is no longer capped at 65535. libxml v2.9.0 and later support a new parse option, exposed as `Nokogiri::XML::ParseOptions::PARSE_BIG_LINES` and set in `ParseOptions::DEFAULT_XML`, `::DEFAULT_XSLT`, `::DEFAULT_HTML`, and `::DEFAULT_SCHEMA`. (Note that JRuby never had this problem.) [[#1764](https://github.com/sparklemotion/nokogiri/issues/1764), [#1493](https://github.com/sparklemotion/nokogiri/issues/1493), [#1617](https://github.com/sparklemotion/nokogiri/issues/1617), [#1505](https://github.com/sparklemotion/nokogiri/issues/1505), [#1003](https://github.com/sparklemotion/nokogiri/issues/1003), [#533](https://github.com/sparklemotion/nokogiri/issues/533)] diff --git a/ext/java/nokogiri/XmlNode.java b/ext/java/nokogiri/XmlNode.java index d1ced80f74..80be49f605 100644 --- a/ext/java/nokogiri/XmlNode.java +++ b/ext/java/nokogiri/XmlNode.java @@ -421,24 +421,24 @@ public class XmlNode extends RubyObject String nsURI = e.lookupNamespaceURI(prefix); this.node = NokogiriHelpers.renameNode(e, nsURI, e.getNodeName()); - if (nsURI == null || nsURI.isEmpty()) { return; } - - String currentPrefix = e.getParentNode().lookupPrefix(nsURI); - String currentURI = e.getParentNode().lookupNamespaceURI(prefix); - boolean isDefault = e.getParentNode().isDefaultNamespace(nsURI); - - // add xmlns attribute if this is a new root node or if the node's - // namespace isn't a default namespace in the new document - if (e.getParentNode().getNodeType() == Node.DOCUMENT_NODE) { - // this is the root node, so we must set the namespaces attributes anyway - e.setAttribute(prefix == null ? "xmlns" : "xmlns:" + prefix, nsURI); - } else if (prefix == null) { - // this is a default namespace but isn't the default where this node is being added - if (!isDefault) { e.setAttribute("xmlns", nsURI); } - } else if (!prefix.equals(currentPrefix) || nsURI.equals(currentURI)) { - // this is a prefixed namespace - // but doesn't have the same prefix or the prefix is set to a different URI - e.setAttribute("xmlns:" + prefix, nsURI); + if (nsURI != null && !nsURI.isEmpty()) { + String currentPrefix = e.getParentNode().lookupPrefix(nsURI); + String currentURI = e.getParentNode().lookupNamespaceURI(prefix); + boolean isDefault = e.getParentNode().isDefaultNamespace(nsURI); + + // add xmlns attribute if this is a new root node or if the node's + // namespace isn't a default namespace in the new document + if (e.getParentNode().getNodeType() == Node.DOCUMENT_NODE) { + // this is the root node, so we must set the namespaces attributes anyway + e.setAttribute(prefix == null ? "xmlns" : "xmlns:" + prefix, nsURI); + } else if (prefix == null) { + // this is a default namespace but isn't the default where this node is being added + if (!isDefault) { e.setAttribute("xmlns", nsURI); } + } else if (!prefix.equals(currentPrefix) || nsURI.equals(currentURI)) { + // this is a prefixed namespace + // but doesn't have the same prefix or the prefix is set to a different URI + e.setAttribute("xmlns:" + prefix, nsURI); + } } if (e.hasAttributes()) { @@ -473,8 +473,10 @@ public class XmlNode extends RubyObject } } - if (this.node.hasChildNodes()) { - relink_namespace(context, getChildren()); + if (nsURI != null && !nsURI.isEmpty()) { + if (this.node.hasChildNodes()) { + relink_namespace(context, getChildren()); + } } } diff --git a/ext/nokogiri/xml_node.c b/ext/nokogiri/xml_node.c index 545a39b106..4f65ac414e 100644 --- a/ext/nokogiri/xml_node.c +++ b/ext/nokogiri/xml_node.c @@ -115,18 +115,6 @@ relink_namespace(xmlNodePtr reparented) } } - /* Only walk all children if there actually is a namespace we need to */ - /* reparent. */ - if (NULL == reparented->ns) { return; } - - /* When a node gets reparented, walk it's children to make sure that */ - /* their namespaces are reparented as well. */ - child = reparented->children; - while (NULL != child) { - relink_namespace(child); - child = child->next; - } - if (reparented->type == XML_ELEMENT_NODE) { attr = reparented->properties; while (NULL != attr) { @@ -134,6 +122,18 @@ relink_namespace(xmlNodePtr reparented) attr = attr->next; } } + + /* Only walk all children if there actually is a namespace we need to */ + /* reparent. */ + if (reparented->ns) { + /* When a node gets reparented, walk it's children to make sure that */ + /* their namespaces are reparented as well. */ + child = reparented->children; + while (NULL != child) { + relink_namespace(child); + child = child->next; + } + } } /* :nodoc: */ @@ -351,7 +351,10 @@ reparent_node_with(VALUE pivot_obj, VALUE reparentee_obj, pivot_reparentee_func */ DATA_PTR(reparentee_obj) = reparented ; - relink_namespace(reparented); + /* HTML doesn't have namespaces */ + if (!rb_obj_is_kind_of(DOC_RUBY_OBJECT(reparented->doc), cNokogiriHtml4Document)) { + relink_namespace(reparented); + } reparented_obj = noko_xml_node_wrap(Qnil, reparented); diff --git a/lib/nokogiri/html5/node.rb b/lib/nokogiri/html5/node.rb index 92cf1b9470..5275c415cb 100644 --- a/lib/nokogiri/html5/node.rb +++ b/lib/nokogiri/html5/node.rb @@ -63,28 +63,8 @@ def fragment(tags) return super(tags) unless document.is_a?(HTML5::Document) DocumentFragment.new(document, tags, self) end - - private - - # HTML elements can have attributes that contain colons. - # Nokogiri::XML::Node#[]= treats names with colons as a prefixed QName - # and tries to create an attribute in a namespace. This is especially - # annoying with attribute names like xml:lang since libxml2 will - # actually create the xml namespace if it doesn't exist already. - def add_child_node_and_reparent_attrs(node) - return super(node) unless document.is_a?(HTML5::Document) - # I'm not sure what this method is supposed to do. Reparenting - # namespaces is handled by libxml2, including child namespaces which - # this method wouldn't handle. - # https://github.com/sparklemotion/nokogiri/issues/1790 - add_child_node(node) - # node.attribute_nodes.find_all { |a| a.namespace }.each do |attr| - # attr.remove - # ns = attr.namespace - # a["#{ns.prefix}:#{attr.name}"] = attr.value - # end - end end + # Monkey patch XML::Node.prepend(HTML5::Node) end diff --git a/lib/nokogiri/xml/node.rb b/lib/nokogiri/xml/node.rb index a06ba5fa56..709422cafd 100644 --- a/lib/nokogiri/xml/node.rb +++ b/lib/nokogiri/xml/node.rb @@ -128,9 +128,9 @@ def >(selector) def add_child(node_or_tags) node_or_tags = coerce(node_or_tags) if node_or_tags.is_a?(XML::NodeSet) - node_or_tags.each { |n| add_child_node_and_reparent_attrs n } + node_or_tags.each { |n| add_child_node n } else - add_child_node_and_reparent_attrs node_or_tags + add_child_node node_or_tags end node_or_tags end @@ -248,9 +248,9 @@ def children=(node_or_tags) node_or_tags = coerce(node_or_tags) children.unlink if node_or_tags.is_a?(XML::NodeSet) - node_or_tags.each { |n| add_child_node_and_reparent_attrs n } + node_or_tags.each { |n| add_child_node n } else - add_child_node_and_reparent_attrs node_or_tags + add_child_node node_or_tags end node_or_tags end @@ -1223,14 +1223,6 @@ def inspect_attributes # @private IMPLIED_XPATH_CONTEXTS = [".//".freeze].freeze - - def add_child_node_and_reparent_attrs(node) - add_child_node node - node.attribute_nodes.find_all { |a| a.name =~ /:/ }.each do |attr_node| - attr_node.remove - node[attr_node.name] = attr_node.value - end - end end end end diff --git a/test/namespaces/test_namespaces_in_created_doc.rb b/test/namespaces/test_namespaces_in_created_doc.rb index 133c2e2463..79d2e6370a 100644 --- a/test/namespaces/test_namespaces_in_created_doc.rb +++ b/test/namespaces/test_namespaces_in_created_doc.rb @@ -88,19 +88,19 @@ def test_created_namespaced_attribute_on_unparented_node_is_renamespaced_in_xml_ assert(child.attribute_nodes.first.namespace) end - # def test_created_namespaced_attribute_on_unparented_node_is_not_renamespaced_in_html4_doc - # doc1 = Nokogiri::HTML4("") - # doc2 = Nokogiri::HTML4("") + def test_created_namespaced_attribute_on_unparented_node_is_not_renamespaced_in_html4_doc + doc1 = Nokogiri::HTML4("") + doc2 = Nokogiri::HTML4("") - # child = doc1.create_element("div") - # child["tempname"] = "en" - # attr = child.attribute("tempname") - # attr.name = "xml:lang" - # assert_nil(child.attribute_nodes.first.namespace) + child = doc1.create_element("div") + child["tempname"] = "en" + attr = child.attribute("tempname") + attr.name = "xml:lang" + assert_nil(child.attribute_nodes.first.namespace) - # doc2.at_css("body").add_child(child) - # assert_nil(child.attribute_nodes.first.namespace) - # end + doc2.at_css("body").add_child(child) + assert_nil(child.attribute_nodes.first.namespace) + end def test_created_namespaced_attribute_on_unparented_node_is_not_renamespaced_in_html5_doc skip("HTML5 not supported on this platform yet") unless defined?(Nokogiri::HTML5)