Mercurial > libervia-backend
comparison src/tools/xml_tools.py @ 1818:7e6342de71fb
core (xml_tools): unescape HTML sequences to avoid parsing error in ElementParser
author | Goffi <goffi@goffi.org> |
---|---|
date | Fri, 22 Jan 2016 20:24:17 +0100 |
parents | 7ef0f5f90862 |
children | c271087d2020 |
comparison
equal
deleted
inserted
replaced
1817:7ef0f5f90862 | 1818:7e6342de71fb |
---|---|
27 from twisted.words.xish import domish | 27 from twisted.words.xish import domish |
28 from twisted.words.protocols.jabber import jid | 28 from twisted.words.protocols.jabber import jid |
29 from twisted.internet import defer | 29 from twisted.internet import defer |
30 from sat.core import exceptions | 30 from sat.core import exceptions |
31 from collections import OrderedDict | 31 from collections import OrderedDict |
32 import htmlentitydefs | |
33 import re | |
32 | 34 |
33 """This library helps manage XML used in SàT (parameters, registration, etc)""" | 35 """This library helps manage XML used in SàT (parameters, registration, etc)""" |
34 | 36 |
35 SAT_FORM_PREFIX = "SAT_FORM_" | 37 SAT_FORM_PREFIX = "SAT_FORM_" |
36 SAT_PARAM_SEPARATOR = "_XMLUI_PARAM_" # used to have unique elements names | 38 SAT_PARAM_SEPARATOR = "_XMLUI_PARAM_" # used to have unique elements names |
39 html_entity_re = re.compile(r'&([a-zA-Z]+?);') | |
40 XML_ENTITIES = ('quot', 'amp', 'apos', 'lt', 'gt') | |
37 | 41 |
38 | 42 |
39 # Helper functions | 43 # Helper functions |
40 | 44 |
41 def _dataFormField2XMLUIData(field, read_only=False): | 45 def _dataFormField2XMLUIData(field, read_only=False): |
1318 return d | 1322 return d |
1319 | 1323 |
1320 # Misc other functions | 1324 # Misc other functions |
1321 | 1325 |
1322 class ElementParser(object): | 1326 class ElementParser(object): |
1323 """callable class to parse XML string into Element | 1327 """callable class to parse XML string into Element""" |
1324 | 1328 # XXX: Found at http://stackoverflow.com/questions/2093400/how-to-create-twisted-words-xish-domish-element-entirely-from-raw-xml/2095942#2095942 |
1325 Found at http://stackoverflow.com/questions/2093400/how-to-create-twisted-words-xish-domish-element-entirely-from-raw-xml/2095942#2095942 | 1329 |
1326 (c) Karl Anderson | 1330 def _escapeHTML(self, matchobj): |
1327 """ | 1331 entity = matchobj.group(1) |
1332 if entity in XML_ENTITIES: | |
1333 # we don't escape XML entities | |
1334 return matchobj.group(0) | |
1335 else: | |
1336 try: | |
1337 return unichr(htmlentitydefs.name2codepoint[entity]) | |
1338 except KeyError: | |
1339 log.warning(u"removing unknown entity {}".format(entity)) | |
1340 return u'' | |
1328 | 1341 |
1329 def __call__(self, raw_xml, force_spaces=False, namespace=None): | 1342 def __call__(self, raw_xml, force_spaces=False, namespace=None): |
1330 """ | 1343 """ |
1331 @param raw_xml(unicode): the raw XML | 1344 @param raw_xml(unicode): the raw XML |
1332 @param force_spaces (bool): if True, replace occurrences of '\n' and '\t' with ' '. | 1345 @param force_spaces (bool): if True, replace occurrences of '\n' and '\t' with ' '. |
1336 # there is not a unique one on the top | 1349 # there is not a unique one on the top |
1337 if namespace is not None: | 1350 if namespace is not None: |
1338 raw_xml = u"<div xmlns='{}'>{}</div>".format(namespace, raw_xml) | 1351 raw_xml = u"<div xmlns='{}'>{}</div>".format(namespace, raw_xml) |
1339 else: | 1352 else: |
1340 raw_xml = u"<div>{}</div>".format(raw_xml) | 1353 raw_xml = u"<div>{}</div>".format(raw_xml) |
1354 | |
1355 # avoid ParserError on HTML escaped chars | |
1356 raw_xml = html_entity_re.sub(self._escapeHTML, raw_xml) | |
1341 | 1357 |
1342 self.result = None | 1358 self.result = None |
1343 | 1359 |
1344 def onStart(elem): | 1360 def onStart(elem): |
1345 self.result = elem | 1361 self.result = elem |
1360 tmp.addRawXml(raw_xml) | 1376 tmp.addRawXml(raw_xml) |
1361 parser.parse(tmp.toXml().encode('utf-8')) | 1377 parser.parse(tmp.toXml().encode('utf-8')) |
1362 top_elt = self.result.firstChildElement() | 1378 top_elt = self.result.firstChildElement() |
1363 # we now can check if there was a unique element on the top | 1379 # we now can check if there was a unique element on the top |
1364 # and remove our wrapping <div/> if this was the case | 1380 # and remove our wrapping <div/> if this was the case |
1365 if len(top_elt.children) == 1: | 1381 if len(top_elt.children) == 1 and domish.IElement.providedBy(top_elt.children[0]): |
1366 top_elt = top_elt.firstChildElement() | 1382 top_elt = top_elt.firstChildElement() |
1367 return top_elt | 1383 return top_elt |
1368 | 1384 |
1369 | 1385 |
1370 # FIXME: this method is duplicated from frontends.tools.xmlui.getText | 1386 # FIXME: this method is duplicated from frontends.tools.xmlui.getText |