Mercurial > libervia-backend
comparison src/plugins/plugin_xep_0277.py @ 859:64ec04991d9d
plugin XEP-0277: fix pubsub entry parsing using lxml instead of feed.atom
author | souliane <souliane@mailoo.org> |
---|---|
date | Sun, 23 Feb 2014 13:54:41 +0100 |
parents | 660b3f5b6c78 |
children | 1a759096ccbd |
comparison
equal
deleted
inserted
replaced
858:660b3f5b6c78 | 859:64ec04991d9d |
---|---|
23 from twisted.internet import defer | 23 from twisted.internet import defer |
24 from sat.core import exceptions | 24 from sat.core import exceptions |
25 from sat.tools.xml_tools import ElementParser | 25 from sat.tools.xml_tools import ElementParser |
26 | 26 |
27 from wokkel import pubsub | 27 from wokkel import pubsub |
28 from feed import atom | 28 from feed import atom, date |
29 from lxml import etree | |
29 import uuid | 30 import uuid |
30 from time import time | 31 from time import time |
31 import urlparse | 32 import urlparse |
32 from cgi import escape | 33 from cgi import escape |
33 | 34 |
34 NS_MICROBLOG = 'urn:xmpp:microblog:0' | 35 NS_MICROBLOG = 'urn:xmpp:microblog:0' |
35 NS_XHTML = 'http://www.w3.org/1999/xhtml' | 36 NS_XHTML = 'http://www.w3.org/1999/xhtml' |
37 NS_PUBSUB = 'http://jabber.org/protocol/pubsub' | |
36 | 38 |
37 PLUGIN_INFO = { | 39 PLUGIN_INFO = { |
38 "name": "Microblogging over XMPP Plugin", | 40 "name": "Microblogging over XMPP Plugin", |
39 "import_name": "XEP-0277", | 41 "import_name": "XEP-0277", |
40 "type": "XEP", | 42 "type": "XEP", |
81 service = jid.JID(parsed_url.path) | 83 service = jid.JID(parsed_url.path) |
82 queries = parsed_url.query.split(';') | 84 queries = parsed_url.query.split(';') |
83 parsed_queries = dict() | 85 parsed_queries = dict() |
84 for query in queries: | 86 for query in queries: |
85 parsed_queries.update(urlparse.parse_qs(query)) | 87 parsed_queries.update(urlparse.parse_qs(query)) |
86 node = parsed_queries.get('node',[''])[0] | 88 node = parsed_queries.get('node', [''])[0] |
87 | 89 |
88 if not node: | 90 if not node: |
89 raise exceptions.DataError('Invalid comments link') | 91 raise exceptions.DataError('Invalid comments link') |
90 | 92 |
91 return (service, node) | 93 return (service, node) |
104 @defer.inlineCallbacks | 106 @defer.inlineCallbacks |
105 def item2mbdata(self, item): | 107 def item2mbdata(self, item): |
106 """Convert an XML Item to microblog data used in bridge API | 108 """Convert an XML Item to microblog data used in bridge API |
107 @param item: domish.Element of microblog item | 109 @param item: domish.Element of microblog item |
108 @return: microblog data (dictionary)""" | 110 @return: microblog data (dictionary)""" |
111 | |
112 def xpath(elt, path): | |
113 """Return the XPATH result of an entry element or its descendants, works with both: | |
114 - no namespace, that means it is inherited from the parent item node --> NS_PUBSUB | |
115 - empty namespace | |
116 XXX: check why the received entries have no namespace when they are retrieved | |
117 from self.host.plugins["XEP-0060"].getItems and they have an empty namespace | |
118 when they are received with an event. | |
119 """ | |
120 result = elt.xpath(path) | |
121 if len(result) > 0: | |
122 return result | |
123 return elt.xpath('/'.join(['ns:%s' % tag for tag in path.split('/')]), namespaces={'ns': NS_PUBSUB}) | |
124 | |
125 # convert a date string to float without dealing with the date format | |
126 date2float = lambda elt, path: unicode(date.rfc3339.tf_from_timestamp(xpath(elt, path)[0].text)) | |
127 | |
128 item_elt = etree.fromstring(item.toXml().encode('utf-8')) | |
109 try: | 129 try: |
110 entry_elt = [child for child in item.elements() if child.name == "entry"][0] | 130 entry_elt = xpath(item_elt, 'entry')[0] |
111 except IndexError: | 131 except IndexError: |
112 warning(_('No entry element in microblog item')) | 132 raise exceptions.DataError(_('No entry found in the pubsub item %s') % item_elt.get('id', '')) |
113 raise exceptions.DataError('no entry found') | 133 |
114 _entry = atom.Entry().import_xml(entry_elt.toXml().encode('utf-8')) | |
115 microblog_data = {} | 134 microblog_data = {} |
116 | 135 |
117 for key in ['title', 'content']: | 136 for key in ['title', 'content']: # process the textual elements |
118 for type_ in ['', 'xhtml']: | 137 for attr_elt in xpath(entry_elt, key): |
119 try: | 138 attr_content = self.__getLXMLInnerContent(attr_elt) |
120 attr = getattr(_entry, "%s_%s" % (key, type_) if type_ else key) | 139 if not attr_content.strip(): |
121 except AttributeError: | 140 continue # element with empty value |
122 continue | 141 content_type = attr_elt.get('type', 'text').lower() |
123 if not attr.text: | |
124 continue | |
125 try: | |
126 content_type = attr.attrs['type'].lower() | |
127 except KeyError: | |
128 content_type = 'text' | |
129 if content_type == 'xhtml': | 142 if content_type == 'xhtml': |
130 text = self.__decapsulateExtraNS(attr.text) | 143 text = self.__decapsulateExtraNS(attr_content) |
131 microblog_data['%s_xhtml' % key] = yield self.host.plugins["TEXT-SYNTAXES"].clean_xhtml(text) | 144 microblog_data['%s_xhtml' % key] = yield self.host.plugins["TEXT-SYNTAXES"].clean_xhtml(text) |
132 else: | 145 else: |
133 microblog_data[key] = attr.text | 146 microblog_data[key] = attr_content |
134 if key not in microblog_data and ('%s_xhtml' % key) in microblog_data: | 147 if key not in microblog_data and ('%s_xhtml' % key) in microblog_data: |
135 microblog_data[key] = yield self.__removeXHTMLMarkups(microblog_data['%s_xhtml' % key]) | 148 microblog_data[key] = yield self.__removeXHTMLMarkups(microblog_data['%s_xhtml' % key]) |
136 if 'title' not in microblog_data: | 149 |
137 raise exceptions.DataError(_("Atom entry misses a title element")) | 150 try: # check for mandatory elements |
138 if 'content' not in microblog_data: | 151 microblog_data['id'] = xpath(entry_elt, 'id')[0].text |
152 microblog_data['updated'] = date2float(entry_elt, 'updated') | |
153 assert('title' in microblog_data) # has been processed already | |
154 except IndexError: | |
155 error(_("Atom entry %s misses a required element") % item_elt.get('id', '')) | |
156 raise exceptions.DataError | |
157 | |
158 if 'content' not in microblog_data: # use the atom title data as the microblog body content | |
139 microblog_data['content'] = microblog_data['title'] | 159 microblog_data['content'] = microblog_data['title'] |
140 del microblog_data['title'] | 160 del microblog_data['title'] |
141 if 'title_xhtml' in microblog_data: | 161 if 'title_xhtml' in microblog_data: |
142 microblog_data['content_xhtml'] = microblog_data['title_xhtml'] | 162 microblog_data['content_xhtml'] = microblog_data['title_xhtml'] |
143 del microblog_data['title_xhtml'] | 163 del microblog_data['title_xhtml'] |
144 | 164 |
165 # recommended and optional elements with a fallback value | |
145 try: | 166 try: |
146 if len(_entry.authors): | 167 microblog_data['published'] = date2float(entry_elt, 'published') |
147 microblog_data['author'] = _entry.authors[0].name.text | 168 except IndexError: |
148 microblog_data['updated'] = str(int(_entry.updated.tf)) | 169 microblog_data['published'] = microblog_data['updated'] |
170 | |
171 # other recommended and optional elements | |
172 try: | |
173 link_elt = xpath(entry_elt, "link")[0] | |
149 try: | 174 try: |
150 microblog_data['published'] = str(int(_entry.published.tf)) | 175 assert(link_elt.attrib['title'] == "comments") |
151 except (KeyError, TypeError): | 176 microblog_data['comments'] = link_elt.attrib['href'] |
152 microblog_data['published'] = microblog_data['updated'] | 177 service, node = self.parseCommentUrl(microblog_data["comments"]) |
153 microblog_data['id'] = item['id'] | 178 microblog_data['comments_service'] = service.full() |
154 for link in _entry.links: | 179 microblog_data['comments_node'] = node |
155 try: | 180 except (exceptions.DataError, RuntimeError, KeyError): |
156 if link.attrs["title"] == "comments": | 181 warning(_("Can't parse the link element of pubsub entry %s") % microblog_data['id']) |
157 microblog_data['comments'] = link.attrs["href"] | 182 except: |
158 service, node = self.parseCommentUrl(microblog_data["comments"]) | 183 pass |
159 microblog_data['comments_service'] = service.full() | 184 try: |
160 microblog_data['comments_node'] = node | 185 microblog_data['author'] = xpath(entry_elt, 'author/name')[0].text |
161 break | 186 except IndexError: |
162 except (KeyError, exceptions.DataError, RuntimeError): | 187 try: # XXX: workaround for Jappix behaviour |
163 warning(_("Can't parse the link element of pubsub entry %s") % item['id']) | 188 microblog_data['author'] = xpath(entry_elt, 'author/nick')[0].text |
164 continue | 189 except IndexError: |
165 except (AttributeError, KeyError): | 190 warning(_("Can't find author element in pubsub entry %s") % microblog_data['id']) |
166 error(_('Error while parsing atom entry for microblogging event')) | 191 |
167 raise exceptions.DataError | 192 defer.returnValue(microblog_data) |
168 | 193 |
169 ##XXX: workaround for Jappix behaviour | 194 def __getLXMLInnerContent(self, elt): |
170 if not 'author' in microblog_data: | 195 """Return the inner content of a lxml.etree.Element. It is not |
171 from xe import NestElement | 196 trivial because the lxml tostring method would return the full |
197 content including elt's tag and attributes, and elt.getchildren() | |
198 would skip a text value which is not within an element...""" | |
199 return self.__getDomishInnerContent(ElementParser()(etree.tostring(elt))) | |
200 | |
201 def __getDomishInnerContent(self, elt): | |
202 """Return the inner content of a domish.Element.""" | |
203 result = '' | |
204 for child in elt.children: | |
172 try: | 205 try: |
173 author = NestElement('author') | 206 result += child.toXml() # child is a domish.Element |
174 author.import_xml(str(_entry)) | 207 except AttributeError: |
175 microblog_data['author'] = author.nick.text | 208 result += child # child is unicode |
176 except: | 209 return result |
177 error(_('Cannot find author')) | |
178 ##end workaround Jappix | |
179 | |
180 defer.returnValue(microblog_data) | |
181 | 210 |
182 def __decapsulateExtraNS(self, text): | 211 def __decapsulateExtraNS(self, text): |
183 """Check for XHTML namespace and decapsulate the content so the user | 212 """Check for XHTML namespace and decapsulate the content so the user |
184 who wants to modify an entry will see the text that he entered. Also | 213 who wants to modify an entry will see the text that he entered. Also |
185 this avoids successive encapsulation with a new <div>...</div> at | 214 this avoids successive encapsulation with a new <div>...</div> at |
186 each modification (encapsulation is done in self.data2entry)""" | 215 each modification (encapsulation is done in self.data2entry)""" |
187 elt = ElementParser()(text) | 216 elt = ElementParser()(text) |
188 if elt.uri != NS_XHTML: | 217 if elt.uri != NS_XHTML: |
189 raise exceptions.DataError(_('Content of type XHTML must declare its namespace!')) | 218 raise exceptions.DataError(_('Content of type XHTML must declare its namespace!')) |
190 return elt.firstChildElement().toXml() | 219 return self.__getDomishInnerContent(elt) |
191 | 220 |
192 def microblogCB(self, itemsEvent, profile): | 221 def microblogCB(self, itemsEvent, profile): |
193 d = defer.Deferred() | 222 d = defer.Deferred() |
194 | 223 |
195 def manageItem(microblog_data): | 224 def manageItem(microblog_data): |