comparison src/plugins/plugin_blog_import.py @ 1825:4e51f21c687f

plugin blog import: this plugin is the base handling blog importers: - blog importers are plugins which register to this one with the "register" method - blog import manages uploading the given blog data to a PubSub server - filters can be used. - current filters fix the <img> src without host, and upload images to the XMPP server
author Goffi <goffi@goffi.org>
date Fri, 22 Jan 2016 20:24:17 +0100
parents
children 68c0dc13d821
comparison
equal deleted inserted replaced
1824:a19161bb3ff7 1825:4e51f21c687f
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
3
4 # SàT plugin for import external blogs
5 # Copyright (C) 2009-2016 Jérôme Poisson (goffi@goffi.org)
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
16
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
import collections
import os
import os.path
import tempfile
import urlparse
import uuid

from twisted.internet import defer
from twisted.web import client as web_client
from twisted.words.protocols.jabber import jid
from twisted.words.xish import domish

from sat.core import exceptions
from sat.core.constants import Const as C
from sat.core.i18n import _
from sat.core.log import getLogger
log = getLogger(__name__)
from sat.tools import xml_tools
35
36
37 PLUGIN_INFO = {
38 "name": "blog import",
39 "import_name": "BLOG_IMPORT",
40 "type": C.PLUG_TYPE_BLOG,
41 "dependencies": ["XEP-0277", "TEXT-SYNTAXES", "UPLOAD"],
42 "main": "BlogImportPlugin",
43 "handler": "no",
44 "description": _(u"""Blog import management:
45 This plugin manage the different blog importers which can register to it, and handler generic importing tasks.""")
46 }
47
48 OPT_HOST = 'host'
49 OPT_UPLOAD_IMAGES = 'upload_images'
50 OPT_UPLOAD_IGNORE_HOST = 'upload_ignore_host'
51 OPT_IGNORE_TLS = 'ignore_tls_errors'
52 BOOL_OPTIONS = (OPT_UPLOAD_IMAGES, OPT_IGNORE_TLS)
53
54
55 BlogImporter = collections.namedtuple('BlogImporter', ('callback', 'short_desc', 'long_desc'))
56
57
58 class BlogImportPlugin(object):
59
60 def __init__(self, host):
61 log.info(_("plugin Blog Import initialization"))
62 self.host = host
63 self._importers = {}
64 self._u = host.plugins['UPLOAD']
65 self._m = host.plugins['XEP-0277']
66 self._s = self.host.plugins['TEXT-SYNTAXES']
67 host.bridge.addMethod("blogImport", ".plugin", in_sign='ssa{ss}ss', out_sign='', method=self._blogImport, async=True)
68 host.bridge.addMethod("blogImportList", ".plugin", in_sign='', out_sign='a(ss)', method=self.listImporters)
69 host.bridge.addMethod("blogImportDesc", ".plugin", in_sign='s', out_sign='(ss)', method=self.getDescription)
70
71 def listImporters(self):
72 importers = self._importers.keys()
73 importers.sort()
74 return [(name, self._importers[name].short_desc) for name in self._importers]
75
76 def getDescription(self, name):
77 """Return import short and long descriptions
78
79 @param name(unicode): blog importer name
80 @return (tuple[unicode,unicode]): short and long description
81 """
82 try:
83 importer = self._importers[name]
84 except KeyError:
85 raise exceptions.NotFound(u"Blog importer not found [{}]".format(name))
86 else:
87 return importer.short_desc, importer.long_desc
88
89 def _blogImport(self, name, location, options, pubsub_service='', profile=C.PROF_KEY_DEFAULT):
90 client = self.host.getClient(profile)
91 for option in BOOL_OPTIONS:
92 try:
93 options[option] = C.bool(options[option])
94 except KeyError:
95 pass
96 return self.blogImport(client, name, location, options)
97
98 @defer.inlineCallbacks
99 def blogImport(self, client, name, location, options=None, pubsub_service=None):
100 """Import a blog
101
102 @param name(unicode): name of the blog importer
103 @param location(unicode): location of the blog data to import
104 can be an url, a file path, or anything which make sense
105 check importer description for more details
106 @param options(dict, None): extra options. Below are the generic options,
107 blog importer can have specific ones. All options have unicode values
108 generic options:
109 - OPT_HOST (unicode): original host
110 - OPT_UPLOAD_IMAGES (bool): upload images to XMPP server if True
111 see OPT_UPLOAD_IGNORE_HOST.
112 Default: True
113 - OPT_UPLOAD_IGNORE_HOST (unicode): don't upload images from this host
114 - OPT_IGNORE_TLS (bool): ignore TLS error for image upload.
115 Default: False
116 @param pubsub_service(jid.JID, None): jid of the PubSub service where blog must be imported
117 None to use profile's server
118 """
119 if options is None:
120 options = {}
121 else:
122 for opt_name, opt_default in ((OPT_UPLOAD_IMAGES, True),
123 (OPT_IGNORE_TLS, False)):
124 # we want an filled options dict, with all empty or False values removed
125 try:
126 value =options[opt_name]
127 except KeyError:
128 if opt_default:
129 options[opt_name] = opt_default
130 else:
131 if not value:
132 del options[opt_name]
133 try:
134 importer = self._importers[name]
135 except KeyError:
136 raise exceptions.NotFound(u"Importer [{}] not found".format(name))
137 posts_data = yield importer.callback(client, location, options)
138 url_links = {}
139 yield self._recursiveImport(client, posts_data, options, url_links)
140
141 @defer.inlineCallbacks
142 def _recursiveImport(self, client, posts_data, options, url_links, service=None, node=None, depth=0):
143 """Do the upload recursively
144
145 @param posts_data(list): list of data as specified in [register]
146 @param options(dict): import options
147 @param url_links(dict): link between former posts and new items
148 @param service(jid.JID, None): PubSub service to use
149 @param node(unicode, None): PubSub node to use
150 @param depth(int): level of recursion
151 """
152 for data in posts_data:
153 # data checks/filters
154 mb_data = data['blog']
155 try:
156 item_id = mb_data['id']
157 except KeyError:
158 item_id = mb_data['id'] = unicode(uuid.uuid4())
159
160 try:
161 # we keep the link between old url and new blog item
162 # so the user can redirect its former blog urls
163 old_url = data['url']
164 except KeyError:
165 pass
166 else:
167 url_links[old_url] = (service, node, item_id)
168 log.info(u"url link from {old} to {service}/{node}/{id}".format(
169 old = old_url, service=service or u'server', node=node or u'', id=item_id))
170
171 depth or log.debug(u"Filtering data")
172 yield self.blogFilters(client, mb_data, options)
173
174 # comments data
175 if len(data['comments']) != 1:
176 raise NotImplementedError(u"can't manage multiple comment links")
177 allow_comments = C.bool(mb_data.get('allow_comments', C.BOOL_FALSE))
178 if allow_comments:
179 comments_service, comments_node = self._m.getCommentService(client), self._m.getCommentNode(item_id)
180 mb_data['comments_service'] = comments_service
181 mb_data['comments_node'] = comments_node
182 else:
183 if data['comments'][0]:
184 raise exceptions.DataError(u"allow_comments set to False, but comments are there")
185
186 # post upload
187 depth or log.debug(u"uploading item [{id}]: {title}".format(id=mb_data['id'], title=mb_data.get('title','')))
188 yield self._m.send(mb_data, service, node, profile=client.profile)
189
190 # comments upload
191 depth or log.debug(u"uploading comments")
192 if allow_comments:
193 yield self._recursiveImport(client, data['comments'][0], options, url_links, service=comments_service, node=comments_node, depth=depth+1)
194
195 @defer.inlineCallbacks
196 def blogFilters(self, client, mb_data, options):
197 """Apply filters according to options
198
199 modify mb_data in place
200 @param posts_data(list[dict]): data as returned by importer callback
201 @param options(dict): dict as given in [blogImport]
202 """
203 # FIXME: blog filters don't work on text content
204 # TODO: text => XHTML conversion should handler links with <a/>
205 # filters can then be used by converting text to XHTML
206 if not options:
207 return
208
209 # we want only XHTML content
210 for prefix in ('content',): # a tuple is use, if title need to be added in the future
211 try:
212 rich = mb_data['{}_rich'.format(prefix)]
213 except KeyError:
214 pass
215 else:
216 if '{}_xhtml'.format(prefix) in mb_data:
217 raise exceptions.DataError(u"importer gave {prefix}_rich and {prefix}_xhtml at the same time, this is not allowed".format(prefix=prefix))
218 # we convert rich syntax to XHTML here, so we can handle filters easily
219 converted = yield self._s.convert(rich, self._s.getCurrentSyntax(client.profile), safe=False)
220 mb_data['{}_xhtml'.format(prefix)] = converted
221 del mb_data['{}_rich'.format(prefix)]
222
223 try:
224 mb_data['txt']
225 except KeyError:
226 pass
227 else:
228 if '{}_xhtml'.format(prefix) in mb_data:
229 log.warning(u"{prefix}_text will be replaced by converted {prefix}_xhtml, so filters can be handled".format(prefix=prefix))
230 del mb_data['{}_text'.format(prefix)]
231 else:
232 log.warning(u"importer gave a text {prefix}, blog filters don't work on text {prefix}".format(prefix=prefix))
233 return
234
235 # at this point, we have only XHTML version of content
236 try:
237 top_elt = xml_tools.ElementParser()(mb_data['content_xhtml'], namespace=C.NS_XHTML)
238 except domish.ParserError:
239 # we clean the xml and try again our luck
240 cleaned = yield self._s.cleanXHTML(mb_data['content_xhtml'])
241 top_elt = xml_tools.ElementParser()(cleaned, namespace=C.NS_XHTML)
242 opt_host = options.get(OPT_HOST)
243 if opt_host:
244 # we normalise the domain
245 parsed_host = urlparse.urlsplit(opt_host)
246 opt_host = urlparse.urlunsplit((parsed_host.scheme or 'http', parsed_host.netloc or parsed_host.path, '', '', ''))
247
248 tmp_dir = tempfile.mkdtemp()
249 for img_elt in xml_tools.findAll(top_elt, ['img']):
250 yield self.imgFilters(client, img_elt, options, opt_host, tmp_dir)
251 os.rmdir(tmp_dir) # XXX: tmp_dir should be empty, or something went wrong
252
253 # we now replace the content with filtered one
254 mb_data['content_xhtml'] = top_elt.toXml()
255
256 @defer.inlineCallbacks
257 def imgFilters(self, client, img_elt, options, opt_host, tmp_dir):
258 """Filters handling images
259
260 url without host are fixed (if possible)
261 according to options, images are uploaded to XMPP server
262 @param img_elt(domish.Element): <img/> element to handle
263 @param options(dict): filters options
264 @param opt_host(unicode): normalised host given in options
265 @param tmp_dir(str): path to temp directory
266 """
267 try:
268 url = img_elt['src']
269 if url[0] == u'/':
270 if not opt_host:
271 log.warning(u"host was not specified, we can't deal with src without host ({url}) and have to ignore the following <img/>:\n{xml}"
272 .format(url=url, xml=img_elt.toXml()))
273 return
274 else:
275 url = urlparse.urljoin(opt_host, url)
276 filename = url.rsplit('/',1)[-1].strip()
277 if not filename:
278 raise KeyError
279 except (KeyError, IndexError):
280 log.warning(u"ignoring invalid img element: {}".format(img_elt.toXml()))
281 return
282
283 # we change the url for the normalized one
284 img_elt['src'] = url
285
286 if options.get(OPT_UPLOAD_IMAGES, False):
287 # upload is requested
288 try:
289 ignore_host = options[OPT_UPLOAD_IGNORE_HOST]
290 except KeyError:
291 pass
292 else:
293 # host is the ignored one, we skip
294 parsed_url = urlparse.urlsplit(url)
295 if ignore_host in parsed_url.hostname:
296 log.info(u"Don't upload image at {url} because of {opt} option".format(
297 url=url, opt=OPT_UPLOAD_IGNORE_HOST))
298 return
299
300 # we download images and re-upload them via XMPP
301 tmp_file = os.path.join(tmp_dir, filename).encode('utf-8')
302 upload_options = {'ignore_tls_errors': options.get(OPT_IGNORE_TLS, False)}
303
304 try:
305 yield web_client.downloadPage(url.encode('utf-8'), tmp_file)
306 filename = filename.replace(u'%', u'_') # FIXME: tmp workaround for a bug in prosody http upload
307 dummy, download_d = yield self._u.upload(client, tmp_file, filename, options=upload_options)
308 download_url = yield download_d
309 except Exception as e:
310 log.warning(u"can't download image at {url}: {reason}".format(url=url, reason=e))
311 else:
312 img_elt['src'] = download_url
313
314 try:
315 os.unlink(tmp_file)
316 except OSError:
317 pass
318
319 def register(self, name, callback, short_desc='', long_desc=''):
320 """Register a blogImport method
321
322 @param name(unicode): unique importer name, should indicate the blogging software it handler and always lowercase
323 @param callback(callable): method to call:
324 the signature must be (client, location, options) (cf. [blogImport])
325 the importer must return an iterable of dict which must have the following keys:
326 'blog' (dict): microblog data of the blog post (cf. http://wiki.goffi.org/wiki/Bridge_API_-_Microblogging/en)
327 the importer MUST NOT create node or call XEP-0277 plugin itself
328 'comments*' key MUST NOT be used in this microblog_data, see bellow for comments
329 It is recommanded to use a unique id in the "id" key which is constant per blog item,
330 so if the import fail, a new import will overwrite the failed items and avoid duplicates.
331
332 'comments' (list[list[dict]],None): Dictionaries must have the same keys as main item (i.e. 'blog' and 'comments')
333 a list of list is used because XEP-0277 can handler several comments nodes,
334 but in most cases, there will we only one item it the first list (something like [[{comment1_data},{comment2_data}, ...]])
335 blog['allow_comments'] must be True if there is any comment, and False (or not present) if comments are not allowed.
336 If allow_comments is False and some comments are present, a exceptions.DataError will be raised
337 the import MAY optionally have the following keys:
338 'url' (unicode): former url of the post (only the path, without host part)
339 if present the association to the new path will be displayed to user, so it can make redirections if necessary
340
341 Optionally, the importer plugin can return a tuple with the just described iterator and a post_total
342 where "post_total" (int) indicate the total number of posts (without comments)
343 useful to display a progress indicator when the iterator is a generator
344 @param short_desc(unicode): one line description of the importer
345 @param long_desc(unicode): long description of the importer, its options, etc.
346 """
347 name = name.lower()
348 if name in self._importers:
349 raise exceptions.ConflictError(u"A blog importer with the name {} already exsit".format(name))
350 self._importers[name] = BlogImporter(callback, short_desc, long_desc)
351
352 def unregister(self, name):
353 del self._importers[name]