libervia-backend: src/plugins/plugin_xep

comparison src/plugins/plugin_xep_0277.py @ 859:64ec04991d9d

plugin XEP-0277: fix pubsub entry parsing using lxml instead of feed.atom

author	souliane <souliane@mailoo.org>
date	Sun, 23 Feb 2014 13:54:41 +0100
parents	660b3f5b6c78
children	1a759096ccbd

comparison

equal deleted inserted replaced

-:660b3f5b6c78
+:64ec04991d9d
 from twisted.internet import defer
 from sat.core import exceptions
 from sat.tools.xml_tools import ElementParser
 from wokkel import pubsub
-from feed import atom
+from feed import atom, date
+from lxml import etree
 import uuid
 from time import time
 import urlparse
 from cgi import escape
 NS_MICROBLOG = 'urn:xmpp:microblog:0'
 NS_XHTML = 'http://www.w3.org/1999/xhtml'
+NS_PUBSUB = 'http://jabber.org/protocol/pubsub'
 PLUGIN_INFO = {
 "name": "Microblogging over XMPP Plugin",
 "import_name": "XEP-0277",
 "type": "XEP",
 service = jid.JID(parsed_url.path)
 queries = parsed_url.query.split(';')
 parsed_queries = dict()
 for query in queries:
 parsed_queries.update(urlparse.parse_qs(query))
-node = parsed_queries.get('node',[''])[0]
+node = parsed_queries.get('node', [''])[0]
 if not node:
 raise exceptions.DataError('Invalid comments link')
 return (service, node)
 @defer.inlineCallbacks
 def item2mbdata(self, item):
 """Convert an XML Item to microblog data used in bridge API
 @param item: domish.Element of microblog item
 @return: microblog data (dictionary)"""
+def xpath(elt, path):
+"""Return the XPATH result of an entry element or its descendance, works with both:
+- no namespace, that means it is inherited from the parent item node --> NS_PUBSUB
+- empty namespace
+XXX: check why the received entries have no namespace when they are retrieved
+from self.host.plugins["XEP-0060"].getItems and they have an empty namespace
+when they are received with an event.
+"""
+result = elt.xpath(path)
+if len(result) > 0:
+return result
+return elt.xpath('/'.join(['ns:%s' % tag for tag in path.split('/')]), namespaces={'ns': NS_PUBSUB})
+# convert a date string to float without dealing with the date format
+date2float = lambda elt, path: unicode(date.rfc3339.tf_from_timestamp(xpath(elt, path)[0].text))
+item_elt = etree.fromstring(item.toXml().encode('utf-8'))
 try:
-entry_elt = [child for child in item.elements() if child.name == "entry"][0]
+entry_elt = xpath(item_elt, 'entry')[0]
 except IndexError:
-warning(_('No entry element in microblog item'))
+raise exceptions.DataError(_('No entry found in the pubsub item %s') % item_elt.get('id', ''))
-raise exceptions.DataError('no entry found')
-_entry = atom.Entry().import_xml(entry_elt.toXml().encode('utf-8'))
 microblog_data = {}
-for key in ['title', 'content']:
+for key in ['title', 'content']:  # process the textual elements
-for type_ in ['', 'xhtml']:
+for attr_elt in xpath(entry_elt, key):
-try:
+attr_content = self.__getLXMLInnerContent(attr_elt)
-attr = getattr(_entry, "%s_%s" % (key, type_) if type_ else key)
+if not attr_content.strip():
-except AttributeError:
+continue  # element with empty value
-continue
+content_type = attr_elt.get('type', 'text').lower()
-if not attr.text:
-continue
-try:
-content_type = attr.attrs['type'].lower()
-except KeyError:
-content_type = 'text'
 if content_type == 'xhtml':
-text = self.__decapsulateExtraNS(attr.text)
+text = self.__decapsulateExtraNS(attr_content)
 microblog_data['%s_xhtml' % key] = yield self.host.plugins["TEXT-SYNTAXES"].clean_xhtml(text)
 else:
-microblog_data[key] = attr.text
+microblog_data[key] = attr_content
 if key not in microblog_data and ('%s_xhtml' % key) in microblog_data:
 microblog_data[key] = yield self.__removeXHTMLMarkups(microblog_data['%s_xhtml' % key])
-if 'title' not in microblog_data:
-raise exceptions.DataError(_("Atom entry misses a title element"))
+try:  # check for mandatory elements
-if 'content' not in microblog_data:
+microblog_data['id'] = xpath(entry_elt, 'id')[0].text
+microblog_data['updated'] = date2float(entry_elt, 'updated')
+assert('title' in microblog_data)  # has been processed already
+except IndexError:
+error(_("Atom entry %s misses a required element") % item_elt.get('id', ''))
+raise exceptions.DataError
+if 'content' not in microblog_data:  # use the atom title data as the microblog body content
 microblog_data['content'] = microblog_data['title']
 del microblog_data['title']
 if 'title_xhtml' in microblog_data:
 microblog_data['content_xhtml'] = microblog_data['title_xhtml']
 del microblog_data['title_xhtml']
+# recommended and optional elements with a fallback value
 try:
-if len(_entry.authors):
+microblog_data['published'] = date2float(entry_elt, 'published')
-microblog_data['author'] = _entry.authors[0].name.text
+except IndexError:
-microblog_data['updated'] = str(int(_entry.updated.tf))
+microblog_data['published'] = microblog_data['updated']
+# other recommended and optional elements
+try:
+link_elt = xpath(entry_elt, "link")[0]
 try:
-microblog_data['published'] = str(int(_entry.published.tf))
+assert(link_elt.attrib['title'] == "comments")
-except (KeyError, TypeError):
+microblog_data['comments'] = link_elt.attrib['href']
-microblog_data['published'] = microblog_data['updated']
+service, node = self.parseCommentUrl(microblog_data["comments"])
-microblog_data['id'] = item['id']
+microblog_data['comments_service'] = service.full()
-for link in _entry.links:
+microblog_data['comments_node'] = node
-try:
+except (exceptions.DataError, RuntimeError, KeyError):
-if link.attrs["title"] == "comments":
+warning(_("Can't parse the link element of pubsub entry %s") % microblog_data['id'])
-microblog_data['comments'] = link.attrs["href"]
+except:
-service, node = self.parseCommentUrl(microblog_data["comments"])
+pass
-microblog_data['comments_service'] = service.full()
+try:
-microblog_data['comments_node'] = node
+microblog_data['author'] = xpath(entry_elt, 'author/name')[0].text
-break
+except IndexError:
-except (KeyError, exceptions.DataError, RuntimeError):
+try:  # XXX: workaround for Jappix behaviour
-warning(_("Can't parse the link element of pubsub entry %s") % item['id'])
+microblog_data['author'] = xpath(entry_elt, 'author/nick')[0].text
-continue
+except IndexError:
-except (AttributeError, KeyError):
+warning(_("Can't find author element in pubsub entry %s") % microblog_data['id'])
-error(_('Error while parsing atom entry for microblogging event'))
-raise exceptions.DataError
+defer.returnValue(microblog_data)
-##XXX: workaround for Jappix behaviour
+def __getLXMLInnerContent(self, elt):
-if not 'author' in microblog_data:
+"""Return the inner content of a lxml.etree.Element. It is not
-from xe import NestElement
+trivial because the lxml tostring method would return the full
+content including elt's tag and attributes, and elt.getchildren()
+would skip a text value which is not within an element..."""
+return self.__getDomishInnerContent(ElementParser()(etree.tostring(elt)))
+def __getDomishInnerContent(self, elt):
+"""Return the inner content of a domish.Element."""
+result = ''
+for child in elt.children:
 try:
-author = NestElement('author')
+result += child.toXml()  # child is a domish.Element
-author.import_xml(str(_entry))
+except AttributeError:
-microblog_data['author'] = author.nick.text
+result += child  # child is unicode
-except:
+return result
-error(_('Cannot find author'))
-##end workaround Jappix
-defer.returnValue(microblog_data)
 def __decapsulateExtraNS(self, text):
 """Check for XHTML namespace and decapsulate the content so the user
 who wants to modify an entry will see the text that he entered. Also
 this avoids successive encapsulation with a new <div>...</div> at
 each modification (encapsulation is done in self.data2entry)"""
 elt = ElementParser()(text)
 if elt.uri != NS_XHTML:
 raise exceptions.DataError(_('Content of type XHTML must declare its namespace!'))
-return elt.firstChildElement().toXml()
+return self.__getDomishInnerContent(elt)
 def microblogCB(self, itemsEvent, profile):
 d = defer.Deferred()
 def manageItem(microblog_data):

Mercurial > libervia-backend

comparison src/plugins/plugin_xep_0277.py @ 859:64ec04991d9d