comparison src/tools/xml_tools.py @ 1818:7e6342de71fb

core (xml_tools): unescape HTML sequences to avoid parsing error in ElementParser
author Goffi <goffi@goffi.org>
date Fri, 22 Jan 2016 20:24:17 +0100
parents 7ef0f5f90862
children c271087d2020
comparison
equal deleted inserted replaced
1817:7ef0f5f90862 1818:7e6342de71fb
27 from twisted.words.xish import domish 27 from twisted.words.xish import domish
28 from twisted.words.protocols.jabber import jid 28 from twisted.words.protocols.jabber import jid
29 from twisted.internet import defer 29 from twisted.internet import defer
30 from sat.core import exceptions 30 from sat.core import exceptions
31 from collections import OrderedDict 31 from collections import OrderedDict
32 import htmlentitydefs
33 import re
32 34
33 """This library help manage XML used in SàT (parameters, registration, etc)""" 35 """This library help manage XML used in SàT (parameters, registration, etc)"""
34 36
35 SAT_FORM_PREFIX = "SAT_FORM_" 37 SAT_FORM_PREFIX = "SAT_FORM_"
36 SAT_PARAM_SEPARATOR = "_XMLUI_PARAM_" # used to have unique elements names 38 SAT_PARAM_SEPARATOR = "_XMLUI_PARAM_" # used to have unique elements names
39 html_entity_re = re.compile(r'&([a-zA-Z]+?);')
40 XML_ENTITIES = ('quot', 'amp', 'apos', 'lt', 'gt')
37 41
38 42
39 # Helper functions 43 # Helper functions
40 44
41 def _dataFormField2XMLUIData(field, read_only=False): 45 def _dataFormField2XMLUIData(field, read_only=False):
1318 return d 1322 return d
1319 1323
1320 # Misc other functions 1324 # Misc other functions
1321 1325
1322 class ElementParser(object): 1326 class ElementParser(object):
1323 """callable class to parse XML string into Element 1327 """callable class to parse XML string into Element"""
1324 1328 # XXX: Found at http://stackoverflow.com/questions/2093400/how-to-create-twisted-words-xish-domish-element-entirely-from-raw-xml/2095942#2095942
1325 Found at http://stackoverflow.com/questions/2093400/how-to-create-twisted-words-xish-domish-element-entirely-from-raw-xml/2095942#2095942 1329
1326 (c) Karl Anderson 1330 def _escapeHTML(self, matchobj):
1327 """ 1331 entity = matchobj.group(1)
1332 if entity in XML_ENTITIES:
1333 # we don't escape XML entities
1334 return matchobj.group(0)
1335 else:
1336 try:
1337 return unichr(htmlentitydefs.name2codepoint[entity])
1338 except KeyError:
1339 log.warning(u"removing unknown entity {}".format(entity))
1340 return u''
1328 1341
1329 def __call__(self, raw_xml, force_spaces=False, namespace=None): 1342 def __call__(self, raw_xml, force_spaces=False, namespace=None):
1330 """ 1343 """
1331 @param raw_xml(unicode): the raw XML 1344 @param raw_xml(unicode): the raw XML
1332 @param force_spaces (bool): if True, replace occurrences of '\n' and '\t' with ' '. 1345 @param force_spaces (bool): if True, replace occurrences of '\n' and '\t' with ' '.
1336 # there is not a unique one on the top 1349 # there is not a unique one on the top
1337 if namespace is not None: 1350 if namespace is not None:
1338 raw_xml = u"<div xmlns='{}'>{}</div>".format(namespace, raw_xml) 1351 raw_xml = u"<div xmlns='{}'>{}</div>".format(namespace, raw_xml)
1339 else: 1352 else:
1340 raw_xml = u"<div>{}</div>".format(raw_xml) 1353 raw_xml = u"<div>{}</div>".format(raw_xml)
1354
1355 # avoid ParserError on HTML escaped chars
1356 raw_xml = html_entity_re.sub(self._escapeHTML, raw_xml)
1341 1357
1342 self.result = None 1358 self.result = None
1343 1359
1344 def onStart(elem): 1360 def onStart(elem):
1345 self.result = elem 1361 self.result = elem
1360 tmp.addRawXml(raw_xml) 1376 tmp.addRawXml(raw_xml)
1361 parser.parse(tmp.toXml().encode('utf-8')) 1377 parser.parse(tmp.toXml().encode('utf-8'))
1362 top_elt = self.result.firstChildElement() 1378 top_elt = self.result.firstChildElement()
1363 # we now can check if there was a unique element on the top 1379 # we now can check if there was a unique element on the top
1364 # and remove our wrapping <div/> if this was the case 1380 # and remove our wrapping <div/> if this was the case
1365 if len(top_elt.children) == 1: 1381 if len(top_elt.children) == 1 and domish.IElement.providedBy(top_elt.children[0]):
1366 top_elt = top_elt.firstChildElement() 1382 top_elt = top_elt.firstChildElement()
1367 return top_elt 1383 return top_elt
1368 1384
1369 1385
1370 # FIXME: this method is duplicated from frontends.tools.xmlui.getText 1386 # FIXME: this method is duplicated from frontends.tools.xmlui.getText