# HG changeset patch # User Goffi # Date 1453490657 -3600 # Node ID 7e6342de71fb96fb64b70217730b4183c4703320 # Parent 7ef0f5f908624fb2e9cd27e704afd13c03f60461 core (xml_tools): unescape HTML sequences to avoid parsing error in ElementParser diff -r 7ef0f5f90862 -r 7e6342de71fb src/tools/xml_tools.py --- a/src/tools/xml_tools.py Fri Jan 22 20:24:17 2016 +0100 +++ b/src/tools/xml_tools.py Fri Jan 22 20:24:17 2016 +0100 @@ -29,11 +29,15 @@ from twisted.internet import defer from sat.core import exceptions from collections import OrderedDict +import htmlentitydefs +import re """This library help manage XML used in SàT (parameters, registration, etc)""" SAT_FORM_PREFIX = "SAT_FORM_" SAT_PARAM_SEPARATOR = "_XMLUI_PARAM_" # used to have unique elements names +html_entity_re = re.compile(r'&([a-zA-Z]+?);') +XML_ENTITIES = ('quot', 'amp', 'apos', 'lt', 'gt') # Helper functions @@ -1320,11 +1324,20 @@ # Misc other funtions class ElementParser(object): - """callable class to parse XML string into Element + """callable class to parse XML string into Element""" + # XXX: Found at http://stackoverflow.com/questions/2093400/how-to-create-twisted-words-xish-domish-element-entirely-from-raw-xml/2095942#2095942 - Found at http://stackoverflow.com/questions/2093400/how-to-create-twisted-words-xish-domish-element-entirely-from-raw-xml/2095942#2095942 - (c) Karl Anderson - """ + def _escapeHTML(self, matchobj): + entity = matchobj.group(1) + if entity in XML_ENTITIES: + # we don't escape XML entities + return matchobj.group(0) + else: + try: + return unichr(htmlentitydefs.name2codepoint[entity]) + except KeyError: + log.warning(u"removing unknown entity {}".format(entity)) + return u'' def __call__(self, raw_xml, force_spaces=False, namespace=None): """ @@ -1339,6 +1352,9 @@ else: raw_xml = u"
{}
".format(raw_xml) + # avoid ParserError on HTML escaped chars + raw_xml = html_entity_re.sub(self._escapeHTML, raw_xml) + self.result = None def onStart(elem): @@ -1362,7 +1378,7 @@ top_elt = self.result.firstChildElement() # we now can check if there was a unique element on the top # and remove our wrapping
is this was the case - if len(top_elt.children) == 1: + if len(top_elt.children) == 1 and domish.IElement.providedBy(top_elt.children[0]): top_elt = top_elt.firstChildElement() return top_elt