diff src/plugins/plugin_xep_0277.py @ 859:64ec04991d9d

plugin XEP-0277: fix pubsub entry parsing using lxml instead of feed.atom
author souliane <souliane@mailoo.org>
date Sun, 23 Feb 2014 13:54:41 +0100
parents 660b3f5b6c78
children 1a759096ccbd
line wrap: on
line diff
--- a/src/plugins/plugin_xep_0277.py	Fri Feb 14 21:24:31 2014 +0100
+++ b/src/plugins/plugin_xep_0277.py	Sun Feb 23 13:54:41 2014 +0100
@@ -25,7 +25,8 @@
 from sat.tools.xml_tools import ElementParser
 
 from wokkel import pubsub
-from feed import atom
+from feed import atom, date
+from lxml import etree
 import uuid
 from time import time
 import urlparse
@@ -33,6 +34,7 @@
 
 NS_MICROBLOG = 'urn:xmpp:microblog:0'
 NS_XHTML = 'http://www.w3.org/1999/xhtml'
+NS_PUBSUB = 'http://jabber.org/protocol/pubsub'
 
 PLUGIN_INFO = {
     "name": "Microblogging over XMPP Plugin",
@@ -83,7 +85,7 @@
         parsed_queries = dict()
         for query in queries:
             parsed_queries.update(urlparse.parse_qs(query))
-        node = parsed_queries.get('node',[''])[0]
+        node = parsed_queries.get('node', [''])[0]
 
         if not node:
             raise exceptions.DataError('Invalid comments link')
@@ -106,79 +108,106 @@
         """Convert an XML Item to microblog data used in bridge API
         @param item: domish.Element of microblog item
         @return: microblog data (dictionary)"""
+
+        def xpath(elt, path):
+            """Return the XPath result for an entry element or its descendants; works with both:
+            - no namespace, that means it is inherited from the parent item node --> NS_PUBSUB
+            - empty namespace
+            XXX: check why the received entries have no namespace when they are retrieved
+            from self.host.plugins["XEP-0060"].getItems and they have an empty namespace
+            when they are received with an event.
+            """
+            result = elt.xpath(path)
+            if len(result) > 0:
+                return result
+            return elt.xpath('/'.join(['ns:%s' % tag for tag in path.split('/')]), namespaces={'ns': NS_PUBSUB})
+
+        # convert a date string to float without dealing with the date format
+        date2float = lambda elt, path: unicode(date.rfc3339.tf_from_timestamp(xpath(elt, path)[0].text))
+
+        item_elt = etree.fromstring(item.toXml().encode('utf-8'))
         try:
-            entry_elt = [child for child in item.elements() if child.name == "entry"][0]
+            entry_elt = xpath(item_elt, 'entry')[0]
         except IndexError:
-            warning(_('No entry element in microblog item'))
-            raise exceptions.DataError('no entry found')
-        _entry = atom.Entry().import_xml(entry_elt.toXml().encode('utf-8'))
+            raise exceptions.DataError(_('No entry found in the pubsub item %s') % item_elt.get('id', ''))
+
         microblog_data = {}
 
-        for key in ['title', 'content']:
-            for type_ in ['', 'xhtml']:
-                try:
-                    attr = getattr(_entry, "%s_%s" % (key, type_) if type_ else key)
-                except AttributeError:
-                    continue
-                if not attr.text:
-                    continue
-                try:
-                    content_type = attr.attrs['type'].lower()
-                except KeyError:
-                    content_type = 'text'
+        for key in ['title', 'content']:  # process the textual elements
+            for attr_elt in xpath(entry_elt, key):
+                attr_content = self.__getLXMLInnerContent(attr_elt)
+                if not attr_content.strip():
+                    continue  # element with empty value
+                content_type = attr_elt.get('type', 'text').lower()
                 if content_type == 'xhtml':
-                    text = self.__decapsulateExtraNS(attr.text)
+                    text = self.__decapsulateExtraNS(attr_content)
                     microblog_data['%s_xhtml' % key] = yield self.host.plugins["TEXT-SYNTAXES"].clean_xhtml(text)
                 else:
-                    microblog_data[key] = attr.text
+                    microblog_data[key] = attr_content
             if key not in microblog_data and ('%s_xhtml' % key) in microblog_data:
                 microblog_data[key] = yield self.__removeXHTMLMarkups(microblog_data['%s_xhtml' % key])
-        if 'title' not in microblog_data:
-            raise exceptions.DataError(_("Atom entry misses a title element"))
-        if 'content' not in microblog_data:
+
+        try:  # check for mandatory elements
+            microblog_data['id'] = xpath(entry_elt, 'id')[0].text
+            microblog_data['updated'] = date2float(entry_elt, 'updated')
+            assert('title' in microblog_data)  # has been processed already
+        except IndexError:
+            error(_("Atom entry %s misses a required element") % item_elt.get('id', ''))
+            raise exceptions.DataError
+
+        if 'content' not in microblog_data:  # use the atom title data as the microblog body content
             microblog_data['content'] = microblog_data['title']
             del microblog_data['title']
             if 'title_xhtml' in microblog_data:
                 microblog_data['content_xhtml'] = microblog_data['title_xhtml']
                 del microblog_data['title_xhtml']
 
+        # recommended and optional elements with a fallback value
         try:
-            if len(_entry.authors):
-                microblog_data['author'] = _entry.authors[0].name.text
-            microblog_data['updated'] = str(int(_entry.updated.tf))
+            microblog_data['published'] = date2float(entry_elt, 'published')
+        except IndexError:
+            microblog_data['published'] = microblog_data['updated']
+
+        # other recommended and optional elements
+        try:
+            link_elt = xpath(entry_elt, "link")[0]
             try:
-                microblog_data['published'] = str(int(_entry.published.tf))
-            except (KeyError, TypeError):
-                microblog_data['published'] = microblog_data['updated']
-            microblog_data['id'] = item['id']
-            for link in _entry.links:
-                try:
-                    if link.attrs["title"] == "comments":
-                        microblog_data['comments'] = link.attrs["href"]
-                        service, node = self.parseCommentUrl(microblog_data["comments"])
-                        microblog_data['comments_service'] = service.full()
-                        microblog_data['comments_node'] = node
-                        break
-                except (KeyError, exceptions.DataError, RuntimeError):
-                    warning(_("Can't parse the link element of pubsub entry %s") % item['id'])
-                    continue
-        except (AttributeError, KeyError):
-            error(_('Error while parsing atom entry for microblogging event'))
-            raise exceptions.DataError
-
-        ##XXX: workaround for Jappix behaviour
-        if not 'author' in microblog_data:
-            from xe import NestElement
-            try:
-                author = NestElement('author')
-                author.import_xml(str(_entry))
-                microblog_data['author'] = author.nick.text
-            except:
-                error(_('Cannot find author'))
-        ##end workaround Jappix
+                assert(link_elt.attrib['title'] == "comments")
+                microblog_data['comments'] = link_elt.attrib['href']
+                service, node = self.parseCommentUrl(microblog_data["comments"])
+                microblog_data['comments_service'] = service.full()
+                microblog_data['comments_node'] = node
+            except (exceptions.DataError, RuntimeError, KeyError):
+                warning(_("Can't parse the link element of pubsub entry %s") % microblog_data['id'])
+        except:
+            pass
+        try:
+            microblog_data['author'] = xpath(entry_elt, 'author/name')[0].text
+        except IndexError:
+            try:  # XXX: workaround for Jappix behaviour
+                microblog_data['author'] = xpath(entry_elt, 'author/nick')[0].text
+            except IndexError:
+                warning(_("Can't find author element in pubsub entry %s") % microblog_data['id'])
 
         defer.returnValue(microblog_data)
 
+    def __getLXMLInnerContent(self, elt):
+        """Return the inner content of an lxml.etree.Element. It is not
+        trivial because the lxml tostring method would return the full
+        content including elt's tag and attributes, and elt.getchildren()
+        would skip a text value which is not within an element..."""
+        return self.__getDomishInnerContent(ElementParser()(etree.tostring(elt)))
+
+    def __getDomishInnerContent(self, elt):
+        """Return the inner content of a domish.Element."""
+        result = ''
+        for child in elt.children:
+            try:
+                result += child.toXml()  # child is a domish.Element
+            except AttributeError:
+                result += child  # child is unicode
+        return result
+
     def __decapsulateExtraNS(self, text):
         """Check for XHTML namespace and decapsulate the content so the user
         who wants to modify an entry will see the text that he entered. Also
@@ -187,7 +216,7 @@
         elt = ElementParser()(text)
         if elt.uri != NS_XHTML:
             raise exceptions.DataError(_('Content of type XHTML must declare its namespace!'))
-        return elt.firstChildElement().toXml()
+        return self.__getDomishInnerContent(elt)
 
     def microblogCB(self, itemsEvent, profile):
         d = defer.Deferred()