# HG changeset patch # User souliane # Date 1393160081 -3600 # Node ID 64ec04991d9da36c7aabf3018ecf00f4f2ddd9d5 # Parent 660b3f5b6c788b5052e91457302762e12df9a15e plugin XEP-0277: fix pubsub entry parsing using lxml instead of feed.atom diff -r 660b3f5b6c78 -r 64ec04991d9d src/plugins/plugin_xep_0277.py --- a/src/plugins/plugin_xep_0277.py Fri Feb 14 21:24:31 2014 +0100 +++ b/src/plugins/plugin_xep_0277.py Sun Feb 23 13:54:41 2014 +0100 @@ -25,7 +25,8 @@ from sat.tools.xml_tools import ElementParser from wokkel import pubsub -from feed import atom +from feed import atom, date +from lxml import etree import uuid from time import time import urlparse @@ -33,6 +34,7 @@ NS_MICROBLOG = 'urn:xmpp:microblog:0' NS_XHTML = 'http://www.w3.org/1999/xhtml' +NS_PUBSUB = 'http://jabber.org/protocol/pubsub' PLUGIN_INFO = { "name": "Microblogging over XMPP Plugin", @@ -83,7 +85,7 @@ parsed_queries = dict() for query in queries: parsed_queries.update(urlparse.parse_qs(query)) - node = parsed_queries.get('node',[''])[0] + node = parsed_queries.get('node', [''])[0] if not node: raise exceptions.DataError('Invalid comments link') @@ -106,79 +108,106 @@ """Convert an XML Item to microblog data used in bridge API @param item: domish.Element of microblog item @return: microblog data (dictionary)""" + + def xpath(elt, path): + """Return the XPATH result of an entry element or its descendance, works with both: + - no namespace, that means it is inherited from the parent item node --> NS_PUBSUB + - empty namespace + XXX: check why the received entries have no namespace when they are retrieved + from self.host.plugins["XEP-0060"].getItems and they have an empty namespace + when they are received with an event. + """ + result = elt.xpath(path) + if len(result) > 0: + return result + return elt.xpath('/'.join(['ns:%s' % tag for tag in path.split('/')]), namespaces={'ns': NS_PUBSUB}) + + # convert a date string to float without dealing with the date format + date2float = lambda elt, path: unicode(date.rfc3339.tf_from_timestamp(xpath(elt, path)[0].text)) + + item_elt = etree.fromstring(item.toXml().encode('utf-8')) try: - entry_elt = [child for child in item.elements() if child.name == "entry"][0] + entry_elt = xpath(item_elt, 'entry')[0] except IndexError: - warning(_('No entry element in microblog item')) - raise exceptions.DataError('no entry found') - _entry = atom.Entry().import_xml(entry_elt.toXml().encode('utf-8')) + raise exceptions.DataError(_('No entry found in the pubsub item %s') % item_elt.get('id', '')) + microblog_data = {} - for key in ['title', 'content']: - for type_ in ['', 'xhtml']: - try: - attr = getattr(_entry, "%s_%s" % (key, type_) if type_ else key) - except AttributeError: - continue - if not attr.text: - continue - try: - content_type = attr.attrs['type'].lower() - except KeyError: - content_type = 'text' + for key in ['title', 'content']: # process the textual elements + for attr_elt in xpath(entry_elt, key): + attr_content = self.__getLXMLInnerContent(attr_elt) + if not attr_content.strip(): + continue # element with empty value + content_type = attr_elt.get('type', 'text').lower() if content_type == 'xhtml': - text = self.__decapsulateExtraNS(attr.text) + text = self.__decapsulateExtraNS(attr_content) microblog_data['%s_xhtml' % key] = yield self.host.plugins["TEXT-SYNTAXES"].clean_xhtml(text) else: - microblog_data[key] = attr.text + microblog_data[key] = attr_content if key not in microblog_data and ('%s_xhtml' % key) in microblog_data: microblog_data[key] = yield self.__removeXHTMLMarkups(microblog_data['%s_xhtml' % key]) - if 'title' not in microblog_data: - raise exceptions.DataError(_("Atom entry misses a title element")) - if 'content' not in microblog_data: + + try: # check for mandatory elements + microblog_data['id'] = xpath(entry_elt, 'id')[0].text + microblog_data['updated'] = date2float(entry_elt, 'updated') + assert('title' in microblog_data) # has been processed already + except IndexError: + error(_("Atom entry %s misses a required element") % item_elt.get('id', '')) + raise exceptions.DataError + + if 'content' not in microblog_data: # use the atom title data as the microblog body content microblog_data['content'] = microblog_data['title'] del microblog_data['title'] if 'title_xhtml' in microblog_data: microblog_data['content_xhtml'] = microblog_data['title_xhtml'] del microblog_data['title_xhtml'] + # recommended and optional elements with a fallback value try: - if len(_entry.authors): - microblog_data['author'] = _entry.authors[0].name.text - microblog_data['updated'] = str(int(_entry.updated.tf)) + microblog_data['published'] = date2float(entry_elt, 'published') + except IndexError: + microblog_data['published'] = microblog_data['updated'] + + # other recommended and optional elements + try: + link_elt = xpath(entry_elt, "link")[0] try: - microblog_data['published'] = str(int(_entry.published.tf)) - except (KeyError, TypeError): - microblog_data['published'] = microblog_data['updated'] - microblog_data['id'] = item['id'] - for link in _entry.links: - try: - if link.attrs["title"] == "comments": - microblog_data['comments'] = link.attrs["href"] - service, node = self.parseCommentUrl(microblog_data["comments"]) - microblog_data['comments_service'] = service.full() - microblog_data['comments_node'] = node - break - except (KeyError, exceptions.DataError, RuntimeError): - warning(_("Can't parse the link element of pubsub entry %s") % item['id']) - continue - except (AttributeError, KeyError): - error(_('Error while parsing atom entry for microblogging event')) - raise exceptions.DataError - - ##XXX: workaround for Jappix behaviour - if not 'author' in microblog_data: - from xe import NestElement - try: - author = NestElement('author') - author.import_xml(str(_entry)) - microblog_data['author'] = author.nick.text - except: - error(_('Cannot find author')) - ##end workaround Jappix + assert(link_elt.attrib['title'] == "comments") + microblog_data['comments'] = link_elt.attrib['href'] + service, node = self.parseCommentUrl(microblog_data["comments"]) + microblog_data['comments_service'] = service.full() + microblog_data['comments_node'] = node + except (exceptions.DataError, RuntimeError, KeyError): + warning(_("Can't parse the link element of pubsub entry %s") % microblog_data['id']) + except: + pass + try: + microblog_data['author'] = xpath(entry_elt, 'author/name')[0].text + except IndexError: + try: # XXX: workaround for Jappix behaviour + microblog_data['author'] = xpath(entry_elt, 'author/nick')[0].text + except IndexError: + warning(_("Can't find author element in pubsub entry %s") % microblog_data['id']) defer.returnValue(microblog_data) + def __getLXMLInnerContent(self, elt): + """Return the inner content of a lxml.etree.Element. It is not + trivial because the lxml tostring method would return the full + content including elt's tag and attributes, and elt.getchildren() + would skip a text value which is not within an element...""" + return self.__getDomishInnerContent(ElementParser()(etree.tostring(elt))) + + def __getDomishInnerContent(self, elt): + """Return the inner content of a domish.Element.""" + result = '' + for child in elt.children: + try: + result += child.toXml() # child id a domish.Element + except AttributeError: + result += child # child is unicode + return result + def __decapsulateExtraNS(self, text): """Check for XHTML namespace and decapsulate the content so the user who wants to modify an entry will see the text that he entered. Also @@ -187,7 +216,7 @@ elt = ElementParser()(text) if elt.uri != NS_XHTML: raise exceptions.DataError(_('Content of type XHTML must declare its namespace!')) - return elt.firstChildElement().toXml() + return self.__getDomishInnerContent(elt) def microblogCB(self, itemsEvent, profile): d = defer.Deferred() diff -r 660b3f5b6c78 -r 64ec04991d9d src/test/helpers.py --- a/src/test/helpers.py Fri Feb 14 21:24:31 2014 +0100 +++ b/src/test/helpers.py Sun Feb 23 13:54:41 2014 +0100 @@ -179,7 +179,7 @@ setattr(self, name, checkCall) - def addMethod(self, name, int_suffix, in_sign, out_sign, method, async=False): + def addMethod(self, name, int_suffix, in_sign, out_sign, method, async=False, doc=None): pass def addSignal(self, name, int_suffix, signature): diff -r 660b3f5b6c78 -r 64ec04991d9d src/test/test_plugin_xep_0277.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/test/test_plugin_xep_0277.py Sun Feb 23 13:54:41 2014 +0100 @@ -0,0 +1,102 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# SAT: a jabber client +# Copyright (C) 2009, 2010, 2011, 2012, 2013, 2014 Jérôme Poisson (goffi@goffi.org) + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see . + +""" Plugin XEP-0277 tests """ + +from sat.test import helpers +from sat.plugins import plugin_xep_0277 +from sat.plugins import plugin_misc_text_syntaxes +from sat.tools.xml_tools import ElementParser +import re + + +class XEP_0277Test(helpers.SatTestCase): + + PUBSUB_ENTRY_1 = """ + + + <span>titre</span> + c745a688-9b02-11e3-a1a3-c0143dd4fe51 + 2014-02-21T16:16:39+02:00 + 2014-02-21T16:16:38+02:00 + <p>contenu</p>texte sans balise<p>autre contenu</p> +

contenu

texte sans balise

autre contenu

+ + test1@souliane.org + +
+
+ """ % plugin_xep_0277.NS_PUBSUB + + PUBSUB_ENTRY_2 = """ + + + <div>titre</div> + <div xmlns="http://www.w3.org/1999/xhtml"><div style="background-image: url('xxx');">titre</div></div> + c745a688-9b02-11e3-a1a3-c0143dd4fe51 + 2014-02-21T16:16:39+02:00 + 2014-02-21T16:16:38+02:00 + <div><p>contenu</p>texte dans balise<p>autre contenu</p></div> +

contenu

texte dans balise

autre contenu

+ + test1@souliane.org + +
+
+ """ % plugin_xep_0277.NS_PUBSUB + + def setUp(self): + self.host = helpers.FakeSAT() + + class XEP_0163(object): + def __init__(self, host): + pass + + def addPEPEvent(self, *args): + pass + self.host.plugins["XEP-0163"] = XEP_0163(self.host) + self.host.plugins["TEXT-SYNTAXES"] = plugin_misc_text_syntaxes.TextSyntaxes(self.host) + self.plugin = plugin_xep_0277.XEP_0277(self.host) + + def test_item2mbdata_1(self): + expected = {'id': 'c745a688-9b02-11e3-a1a3-c0143dd4fe51', + 'title': 'titre', + 'updated': '1392992199.0', + 'published': '1392992198.0', + 'content': '

contenu

texte sans balise

autre contenu

', + 'content_xhtml': '

contenu

texte sans balise

autre contenu

', + 'author': 'test1@souliane.org' + } + d = self.plugin.item2mbdata(ElementParser()(self.PUBSUB_ENTRY_1)) + d.addCallback(self.assertEqual, expected) + return d + + def test_item2mbdata_2(self): + expected = {'id': 'c745a688-9b02-11e3-a1a3-c0143dd4fe51', + 'title': '
titre
', + 'title_xhtml': '
titre
', + 'updated': '1392992199.0', + 'published': '1392992198.0', + 'content': '

contenu

texte dans balise

autre contenu

', + 'content_xhtml': '

contenu

texte dans balise

autre contenu

', + 'author': 'test1@souliane.org' + } + d = self.plugin.item2mbdata(ElementParser()(self.PUBSUB_ENTRY_2)) + d.addCallback(self.assertEqual, expected) + return d