changeset 1818:7e6342de71fb

core (xml_tools): unescape HTML sequences to avoid parsing error in ElementParser
author Goffi <goffi@goffi.org>
date Fri, 22 Jan 2016 20:24:17 +0100 (2016-01-22)
parents 7ef0f5f90862
children c271087d2020
files src/tools/xml_tools.py
diffstat 1 files changed, 21 insertions(+), 5 deletions(-) [+]
line wrap: on
line diff
--- a/src/tools/xml_tools.py	Fri Jan 22 20:24:17 2016 +0100
+++ b/src/tools/xml_tools.py	Fri Jan 22 20:24:17 2016 +0100
@@ -29,11 +29,15 @@
 from twisted.internet import defer
 from sat.core import exceptions
 from collections import OrderedDict
+import htmlentitydefs
+import re
 
 """This library help manage XML used in SàT (parameters, registration, etc)"""
 
 SAT_FORM_PREFIX = "SAT_FORM_"
 SAT_PARAM_SEPARATOR = "_XMLUI_PARAM_"  # used to have unique elements names
+html_entity_re = re.compile(r'&([a-zA-Z]+?);')
+XML_ENTITIES = ('quot', 'amp', 'apos', 'lt', 'gt')
 
 
 # Helper functions
@@ -1320,11 +1324,20 @@
 # Misc other functions
 
 class ElementParser(object):
-    """callable class to parse XML string into Element
+    """callable class to parse XML string into Element"""
+    # XXX: Found at http://stackoverflow.com/questions/2093400/how-to-create-twisted-words-xish-domish-element-entirely-from-raw-xml/2095942#2095942
 
-    Found at http://stackoverflow.com/questions/2093400/how-to-create-twisted-words-xish-domish-element-entirely-from-raw-xml/2095942#2095942
-    (c) Karl Anderson
-    """
+    def _escapeHTML(self, matchobj):
+        entity = matchobj.group(1)
+        if entity in XML_ENTITIES:
+            # we don't escape XML entities
+            return matchobj.group(0)
+        else:
+            try:
+                return unichr(htmlentitydefs.name2codepoint[entity])
+            except KeyError:
+                log.warning(u"removing unknown entity {}".format(entity))
+                return u''
 
     def __call__(self, raw_xml, force_spaces=False, namespace=None):
         """
@@ -1339,6 +1352,9 @@
         else:
             raw_xml = u"<div>{}</div>".format(raw_xml)
 
+        # avoid ParserError on HTML escaped chars
+        raw_xml = html_entity_re.sub(self._escapeHTML, raw_xml)
+
         self.result = None
 
         def onStart(elem):
@@ -1362,7 +1378,7 @@
         top_elt = self.result.firstChildElement()
         # we now can check if there was a unique element on the top
         # and remove our wrapping <div/> if this was the case
-        if len(top_elt.children) == 1:
+        if len(top_elt.children) == 1 and domish.IElement.providedBy(top_elt.children[0]):
             top_elt = top_elt.firstChildElement()
         return top_elt