comparison src/plugins/plugin_xep_0277.py @ 859:64ec04991d9d

plugin XEP-0277: fix pubsub entry parsing using lxml instead of feed.atom
author souliane <souliane@mailoo.org>
date Sun, 23 Feb 2014 13:54:41 +0100
parents 660b3f5b6c78
children 1a759096ccbd
comparison
equal deleted inserted replaced
858:660b3f5b6c78 859:64ec04991d9d
23 from twisted.internet import defer 23 from twisted.internet import defer
24 from sat.core import exceptions 24 from sat.core import exceptions
25 from sat.tools.xml_tools import ElementParser 25 from sat.tools.xml_tools import ElementParser
26 26
27 from wokkel import pubsub 27 from wokkel import pubsub
28 from feed import atom 28 from feed import atom, date
29 from lxml import etree
29 import uuid 30 import uuid
30 from time import time 31 from time import time
31 import urlparse 32 import urlparse
32 from cgi import escape 33 from cgi import escape
33 34
34 NS_MICROBLOG = 'urn:xmpp:microblog:0' 35 NS_MICROBLOG = 'urn:xmpp:microblog:0'
35 NS_XHTML = 'http://www.w3.org/1999/xhtml' 36 NS_XHTML = 'http://www.w3.org/1999/xhtml'
37 NS_PUBSUB = 'http://jabber.org/protocol/pubsub'
36 38
37 PLUGIN_INFO = { 39 PLUGIN_INFO = {
38 "name": "Microblogging over XMPP Plugin", 40 "name": "Microblogging over XMPP Plugin",
39 "import_name": "XEP-0277", 41 "import_name": "XEP-0277",
40 "type": "XEP", 42 "type": "XEP",
81 service = jid.JID(parsed_url.path) 83 service = jid.JID(parsed_url.path)
82 queries = parsed_url.query.split(';') 84 queries = parsed_url.query.split(';')
83 parsed_queries = dict() 85 parsed_queries = dict()
84 for query in queries: 86 for query in queries:
85 parsed_queries.update(urlparse.parse_qs(query)) 87 parsed_queries.update(urlparse.parse_qs(query))
86 node = parsed_queries.get('node',[''])[0] 88 node = parsed_queries.get('node', [''])[0]
87 89
88 if not node: 90 if not node:
89 raise exceptions.DataError('Invalid comments link') 91 raise exceptions.DataError('Invalid comments link')
90 92
91 return (service, node) 93 return (service, node)
104 @defer.inlineCallbacks 106 @defer.inlineCallbacks
105 def item2mbdata(self, item): 107 def item2mbdata(self, item):
106 """Convert an XML Item to microblog data used in bridge API 108 """Convert an XML Item to microblog data used in bridge API
107 @param item: domish.Element of microblog item 109 @param item: domish.Element of microblog item
108 @return: microblog data (dictionary)""" 110 @return: microblog data (dictionary)"""
111
112 def xpath(elt, path):
113 """Return the XPATH result of an entry element or its descendance, works with both:
114 - no namespace, that means it is inherited from the parent item node --> NS_PUBSUB
115 - empty namespace
116 XXX: check why the received entries have no namespace when they are retrieved
117 from self.host.plugins["XEP-0060"].getItems and they have an empty namespace
118 when they are received with an event.
119 """
120 result = elt.xpath(path)
121 if len(result) > 0:
122 return result
123 return elt.xpath('/'.join(['ns:%s' % tag for tag in path.split('/')]), namespaces={'ns': NS_PUBSUB})
124
125 # convert a date string to float without dealing with the date format
126 date2float = lambda elt, path: unicode(date.rfc3339.tf_from_timestamp(xpath(elt, path)[0].text))
127
128 item_elt = etree.fromstring(item.toXml().encode('utf-8'))
109 try: 129 try:
110 entry_elt = [child for child in item.elements() if child.name == "entry"][0] 130 entry_elt = xpath(item_elt, 'entry')[0]
111 except IndexError: 131 except IndexError:
112 warning(_('No entry element in microblog item')) 132 raise exceptions.DataError(_('No entry found in the pubsub item %s') % item_elt.get('id', ''))
113 raise exceptions.DataError('no entry found') 133
114 _entry = atom.Entry().import_xml(entry_elt.toXml().encode('utf-8'))
115 microblog_data = {} 134 microblog_data = {}
116 135
117 for key in ['title', 'content']: 136 for key in ['title', 'content']: # process the textual elements
118 for type_ in ['', 'xhtml']: 137 for attr_elt in xpath(entry_elt, key):
119 try: 138 attr_content = self.__getLXMLInnerContent(attr_elt)
120 attr = getattr(_entry, "%s_%s" % (key, type_) if type_ else key) 139 if not attr_content.strip():
121 except AttributeError: 140 continue # element with empty value
122 continue 141 content_type = attr_elt.get('type', 'text').lower()
123 if not attr.text:
124 continue
125 try:
126 content_type = attr.attrs['type'].lower()
127 except KeyError:
128 content_type = 'text'
129 if content_type == 'xhtml': 142 if content_type == 'xhtml':
130 text = self.__decapsulateExtraNS(attr.text) 143 text = self.__decapsulateExtraNS(attr_content)
131 microblog_data['%s_xhtml' % key] = yield self.host.plugins["TEXT-SYNTAXES"].clean_xhtml(text) 144 microblog_data['%s_xhtml' % key] = yield self.host.plugins["TEXT-SYNTAXES"].clean_xhtml(text)
132 else: 145 else:
133 microblog_data[key] = attr.text 146 microblog_data[key] = attr_content
134 if key not in microblog_data and ('%s_xhtml' % key) in microblog_data: 147 if key not in microblog_data and ('%s_xhtml' % key) in microblog_data:
135 microblog_data[key] = yield self.__removeXHTMLMarkups(microblog_data['%s_xhtml' % key]) 148 microblog_data[key] = yield self.__removeXHTMLMarkups(microblog_data['%s_xhtml' % key])
136 if 'title' not in microblog_data: 149
137 raise exceptions.DataError(_("Atom entry misses a title element")) 150 try: # check for mandatory elements
138 if 'content' not in microblog_data: 151 microblog_data['id'] = xpath(entry_elt, 'id')[0].text
152 microblog_data['updated'] = date2float(entry_elt, 'updated')
153 assert('title' in microblog_data) # has been processed already
154 except IndexError:
155 error(_("Atom entry %s misses a required element") % item_elt.get('id', ''))
156 raise exceptions.DataError
157
158 if 'content' not in microblog_data: # use the atom title data as the microblog body content
139 microblog_data['content'] = microblog_data['title'] 159 microblog_data['content'] = microblog_data['title']
140 del microblog_data['title'] 160 del microblog_data['title']
141 if 'title_xhtml' in microblog_data: 161 if 'title_xhtml' in microblog_data:
142 microblog_data['content_xhtml'] = microblog_data['title_xhtml'] 162 microblog_data['content_xhtml'] = microblog_data['title_xhtml']
143 del microblog_data['title_xhtml'] 163 del microblog_data['title_xhtml']
144 164
165 # recommended and optional elements with a fallback value
145 try: 166 try:
146 if len(_entry.authors): 167 microblog_data['published'] = date2float(entry_elt, 'published')
147 microblog_data['author'] = _entry.authors[0].name.text 168 except IndexError:
148 microblog_data['updated'] = str(int(_entry.updated.tf)) 169 microblog_data['published'] = microblog_data['updated']
170
171 # other recommended and optional elements
172 try:
173 link_elt = xpath(entry_elt, "link")[0]
149 try: 174 try:
150 microblog_data['published'] = str(int(_entry.published.tf)) 175 assert(link_elt.attrib['title'] == "comments")
151 except (KeyError, TypeError): 176 microblog_data['comments'] = link_elt.attrib['href']
152 microblog_data['published'] = microblog_data['updated'] 177 service, node = self.parseCommentUrl(microblog_data["comments"])
153 microblog_data['id'] = item['id'] 178 microblog_data['comments_service'] = service.full()
154 for link in _entry.links: 179 microblog_data['comments_node'] = node
155 try: 180 except (exceptions.DataError, RuntimeError, KeyError):
156 if link.attrs["title"] == "comments": 181 warning(_("Can't parse the link element of pubsub entry %s") % microblog_data['id'])
157 microblog_data['comments'] = link.attrs["href"] 182 except:
158 service, node = self.parseCommentUrl(microblog_data["comments"]) 183 pass
159 microblog_data['comments_service'] = service.full() 184 try:
160 microblog_data['comments_node'] = node 185 microblog_data['author'] = xpath(entry_elt, 'author/name')[0].text
161 break 186 except IndexError:
162 except (KeyError, exceptions.DataError, RuntimeError): 187 try: # XXX: workaround for Jappix behaviour
163 warning(_("Can't parse the link element of pubsub entry %s") % item['id']) 188 microblog_data['author'] = xpath(entry_elt, 'author/nick')[0].text
164 continue 189 except IndexError:
165 except (AttributeError, KeyError): 190 warning(_("Can't find author element in pubsub entry %s") % microblog_data['id'])
166 error(_('Error while parsing atom entry for microblogging event')) 191
167 raise exceptions.DataError 192 defer.returnValue(microblog_data)
168 193
169 ##XXX: workaround for Jappix behaviour 194 def __getLXMLInnerContent(self, elt):
170 if not 'author' in microblog_data: 195 """Return the inner content of a lxml.etree.Element. It is not
171 from xe import NestElement 196 trivial because the lxml tostring method would return the full
197 content including elt's tag and attributes, and elt.getchildren()
198 would skip a text value which is not within an element..."""
199 return self.__getDomishInnerContent(ElementParser()(etree.tostring(elt)))
200
201 def __getDomishInnerContent(self, elt):
202 """Return the inner content of a domish.Element."""
203 result = ''
204 for child in elt.children:
172 try: 205 try:
173 author = NestElement('author') 206 result += child.toXml() # child id a domish.Element
174 author.import_xml(str(_entry)) 207 except AttributeError:
175 microblog_data['author'] = author.nick.text 208 result += child # child is unicode
176 except: 209 return result
177 error(_('Cannot find author'))
178 ##end workaround Jappix
179
180 defer.returnValue(microblog_data)
181 210
182 def __decapsulateExtraNS(self, text): 211 def __decapsulateExtraNS(self, text):
183 """Check for XHTML namespace and decapsulate the content so the user 212 """Check for XHTML namespace and decapsulate the content so the user
184 who wants to modify an entry will see the text that he entered. Also 213 who wants to modify an entry will see the text that he entered. Also
185 this avoids successive encapsulation with a new <div>...</div> at 214 this avoids successive encapsulation with a new <div>...</div> at
186 each modification (encapsulation is done in self.data2entry)""" 215 each modification (encapsulation is done in self.data2entry)"""
187 elt = ElementParser()(text) 216 elt = ElementParser()(text)
188 if elt.uri != NS_XHTML: 217 if elt.uri != NS_XHTML:
189 raise exceptions.DataError(_('Content of type XHTML must declare its namespace!')) 218 raise exceptions.DataError(_('Content of type XHTML must declare its namespace!'))
190 return elt.firstChildElement().toXml() 219 return self.__getDomishInnerContent(elt)
191 220
192 def microblogCB(self, itemsEvent, profile): 221 def microblogCB(self, itemsEvent, profile):
193 d = defer.Deferred() 222 d = defer.Deferred()
194 223
195 def manageItem(microblog_data): 224 def manageItem(microblog_data):