comparison src/plugins/plugin_blog_import.py @ 1831:68c0dc13d821

plugin blog import, XEP-0277: progress + redirect: - progression is now handled - url redirections are handled with PubSub URIs, and returned as metadata with progressFinished - tmp_dir is cleaned in a finally clause
author Goffi <goffi@goffi.org>
date Sat, 23 Jan 2016 20:01:28 +0100
parents 4e51f21c687f
children cdecf553e051
comparison
equal deleted inserted replaced
1830:74676624ad5d 1831:68c0dc13d821
36 36
37 PLUGIN_INFO = { 37 PLUGIN_INFO = {
38 "name": "blog import", 38 "name": "blog import",
39 "import_name": "BLOG_IMPORT", 39 "import_name": "BLOG_IMPORT",
40 "type": C.PLUG_TYPE_BLOG, 40 "type": C.PLUG_TYPE_BLOG,
41 "dependencies": ["XEP-0277", "TEXT-SYNTAXES", "UPLOAD"], 41 "dependencies": ["XEP-0060", "XEP-0277", "TEXT-SYNTAXES", "UPLOAD"],
42 "main": "BlogImportPlugin", 42 "main": "BlogImportPlugin",
43 "handler": "no", 43 "handler": "no",
44 "description": _(u"""Blog import management: 44 "description": _(u"""Blog import management:
45 This plugin manage the different blog importers which can register to it, and handler generic importing tasks.""") 45 This plugin manage the different blog importers which can register to it, and handler generic importing tasks.""")
46 } 46 }
47 47
48 OPT_HOST = 'host' 48 OPT_HOST = 'host'
49 OPT_UPLOAD_IMAGES = 'upload_images' 49 OPT_UPLOAD_IMAGES = 'upload_images'
50 OPT_UPLOAD_IGNORE_HOST = 'upload_ignore_host' 50 OPT_UPLOAD_IGNORE_HOST = 'upload_ignore_host'
51 OPT_IGNORE_TLS = 'ignore_tls_errors' 51 OPT_IGNORE_TLS = 'ignore_tls_errors'
52 URL_REDIRECT_PREFIX = 'url_redirect_'
52 BOOL_OPTIONS = (OPT_UPLOAD_IMAGES, OPT_IGNORE_TLS) 53 BOOL_OPTIONS = (OPT_UPLOAD_IMAGES, OPT_IGNORE_TLS)
53 54
54 55
55 BlogImporter = collections.namedtuple('BlogImporter', ('callback', 'short_desc', 'long_desc')) 56 BlogImporter = collections.namedtuple('BlogImporter', ('callback', 'short_desc', 'long_desc'))
56 57
60 def __init__(self, host): 61 def __init__(self, host):
61 log.info(_("plugin Blog Import initialization")) 62 log.info(_("plugin Blog Import initialization"))
62 self.host = host 63 self.host = host
63 self._importers = {} 64 self._importers = {}
64 self._u = host.plugins['UPLOAD'] 65 self._u = host.plugins['UPLOAD']
66 self._p = host.plugins['XEP-0060']
65 self._m = host.plugins['XEP-0277'] 67 self._m = host.plugins['XEP-0277']
66 self._s = self.host.plugins['TEXT-SYNTAXES'] 68 self._s = self.host.plugins['TEXT-SYNTAXES']
67 host.bridge.addMethod("blogImport", ".plugin", in_sign='ssa{ss}ss', out_sign='', method=self._blogImport, async=True) 69 host.bridge.addMethod("blogImport", ".plugin", in_sign='ssa{ss}ss', out_sign='s', method=self._blogImport, async=True)
68 host.bridge.addMethod("blogImportList", ".plugin", in_sign='', out_sign='a(ss)', method=self.listImporters) 70 host.bridge.addMethod("blogImportList", ".plugin", in_sign='', out_sign='a(ss)', method=self.listImporters)
69 host.bridge.addMethod("blogImportDesc", ".plugin", in_sign='s', out_sign='(ss)', method=self.getDescription) 71 host.bridge.addMethod("blogImportDesc", ".plugin", in_sign='s', out_sign='(ss)', method=self.getDescription)
72
73 def getProgress(self, progress_id, profile):
74 client = self.host.getClient(profile)
75 return client._blogImport_progress[progress_id]
70 76
71 def listImporters(self): 77 def listImporters(self):
72 importers = self._importers.keys() 78 importers = self._importers.keys()
73 importers.sort() 79 importers.sort()
74 return [(name, self._importers[name].short_desc) for name in self._importers] 80 return [(name, self._importers[name].short_desc) for name in self._importers]
113 - OPT_UPLOAD_IGNORE_HOST (unicode): don't upload images from this host 119 - OPT_UPLOAD_IGNORE_HOST (unicode): don't upload images from this host
114 - OPT_IGNORE_TLS (bool): ignore TLS error for image upload. 120 - OPT_IGNORE_TLS (bool): ignore TLS error for image upload.
115 Default: False 121 Default: False
116 @param pubsub_service(jid.JID, None): jid of the PubSub service where blog must be imported 122 @param pubsub_service(jid.JID, None): jid of the PubSub service where blog must be imported
117 None to use profile's server 123 None to use profile's server
124 @return (unicode): progress id
118 """ 125 """
119 if options is None: 126 if options is None:
120 options = {} 127 options = {}
121 else: 128 else:
122 for opt_name, opt_default in ((OPT_UPLOAD_IMAGES, True), 129 for opt_name, opt_default in ((OPT_UPLOAD_IMAGES, True),
132 del options[opt_name] 139 del options[opt_name]
133 try: 140 try:
134 importer = self._importers[name] 141 importer = self._importers[name]
135 except KeyError: 142 except KeyError:
136 raise exceptions.NotFound(u"Importer [{}] not found".format(name)) 143 raise exceptions.NotFound(u"Importer [{}] not found".format(name))
137 posts_data = yield importer.callback(client, location, options) 144 posts_data, posts_count = yield importer.callback(client, location, options)
138 url_links = {} 145 url_redirect = {}
139 yield self._recursiveImport(client, posts_data, options, url_links) 146 progress_id = unicode(uuid.uuid4())
147 try:
148 progress_data = client._blogImport_progress
149 except AttributeError:
150 progress_data = client._blogImport_progress = {}
151 progress_data[progress_id] = {u'position': '0'}
152 if posts_count is not None:
153 progress_data[progress_id]['size'] = unicode(posts_count)
154 metadata = {'name': u'{}: {}'.format(name, location),
155 'direction': 'out',
156 'type': 'BLOG_IMPORT'
157 }
158 self.host.registerProgressCb(progress_id, self.getProgress, metadata, profile=client.profile)
159 self.host.bridge.progressStarted(progress_id, metadata, client.profile)
160 self._recursiveImport(client, posts_data, progress_id, options, url_redirect)
161 defer.returnValue(progress_id)
140 162
141 @defer.inlineCallbacks 163 @defer.inlineCallbacks
142 def _recursiveImport(self, client, posts_data, options, url_links, service=None, node=None, depth=0): 164 def _recursiveImport(self, client, posts_data, progress_id, options, url_redirect, service=None, node=None, depth=0):
143 """Do the upload recursively 165 """Do the upload recursively
144 166
145 @param posts_data(list): list of data as specified in [register] 167 @param posts_data(list): list of data as specified in [register]
146 @param options(dict): import options 168 @param options(dict): import options
147 @param url_links(dict): link between former posts and new items 169 @param url_redirect(dict): link between former posts and new items
148 @param service(jid.JID, None): PubSub service to use 170 @param service(jid.JID, None): PubSub service to use
149 @param node(unicode, None): PubSub node to use 171 @param node(unicode, None): PubSub node to use
150 @param depth(int): level of recursion 172 @param depth(int): level of recursion
151 """ 173 """
152 for data in posts_data: 174 for idx, data in enumerate(posts_data):
153 # data checks/filters 175 # data checks/filters
154 mb_data = data['blog'] 176 mb_data = data['blog']
155 try: 177 try:
156 item_id = mb_data['id'] 178 item_id = mb_data['id']
157 except KeyError: 179 except KeyError:
158 item_id = mb_data['id'] = unicode(uuid.uuid4()) 180 item_id = mb_data['id'] = unicode(uuid.uuid4())
159 181
160 try: 182 try:
161 # we keep the link between old url and new blog item 183 # we keep the link between old url and new blog item
162 # so the user can redirect its former blog urls 184 # so the user can redirect its former blog urls
163 old_url = data['url'] 185 old_uri = data['url']
164 except KeyError: 186 except KeyError:
165 pass 187 pass
166 else: 188 else:
167 url_links[old_url] = (service, node, item_id) 189 new_uri = url_redirect[old_uri] = self._p.getNodeURI(
168 log.info(u"url link from {old} to {service}/{node}/{id}".format( 190 service if service is not None else client.jid.userhostJID(),
169 old = old_url, service=service or u'server', node=node or u'', id=item_id)) 191 node or self._m.namespace,
170 192 item_id)
171 depth or log.debug(u"Filtering data") 193 log.info(u"url link from {old} to {new}".format(
194 old=old_uri, new=new_uri))
195
172 yield self.blogFilters(client, mb_data, options) 196 yield self.blogFilters(client, mb_data, options)
173 197
174 # comments data 198 # comments data
175 if len(data['comments']) != 1: 199 if len(data['comments']) != 1:
176 raise NotImplementedError(u"can't manage multiple comment links") 200 raise NotImplementedError(u"can't manage multiple comment links")
177 allow_comments = C.bool(mb_data.get('allow_comments', C.BOOL_FALSE)) 201 allow_comments = C.bool(mb_data.get('allow_comments', C.BOOL_FALSE))
178 if allow_comments: 202 if allow_comments:
179 comments_service, comments_node = self._m.getCommentService(client), self._m.getCommentNode(item_id) 203 comments_service, comments_node = self._m.getCommentsService(client), self._m.getCommentsNode(item_id)
180 mb_data['comments_service'] = comments_service 204 mb_data['comments_service'] = comments_service
181 mb_data['comments_node'] = comments_node 205 mb_data['comments_node'] = comments_node
182 else: 206 else:
183 if data['comments'][0]: 207 if data['comments'][0]:
184 raise exceptions.DataError(u"allow_comments set to False, but comments are there") 208 raise exceptions.DataError(u"allow_comments set to False, but comments are there")
188 yield self._m.send(mb_data, service, node, profile=client.profile) 212 yield self._m.send(mb_data, service, node, profile=client.profile)
189 213
190 # comments upload 214 # comments upload
191 depth or log.debug(u"uploading comments") 215 depth or log.debug(u"uploading comments")
192 if allow_comments: 216 if allow_comments:
193 yield self._recursiveImport(client, data['comments'][0], options, url_links, service=comments_service, node=comments_node, depth=depth+1) 217 yield self._recursiveImport(client, data['comments'][0], progress_id, options, url_redirect, service=comments_service, node=comments_node, depth=depth+1)
218 if depth == 0:
219 client._blogImport_progress[progress_id]['position'] = unicode(idx+1)
220
221 if depth == 0:
222 self.host.bridge.progressFinished(progress_id,
223 {u'{}{}'.format(URL_REDIRECT_PREFIX, old): new for old, new in url_redirect.iteritems()},
224 client.profile)
225 self.host.removeProgressCb(progress_id, client.profile)
226 del client._blogImport_progress[progress_id]
194 227
195 @defer.inlineCallbacks 228 @defer.inlineCallbacks
196 def blogFilters(self, client, mb_data, options): 229 def blogFilters(self, client, mb_data, options):
197 """Apply filters according to options 230 """Apply filters according to options
198 231
244 # we normalise the domain 277 # we normalise the domain
245 parsed_host = urlparse.urlsplit(opt_host) 278 parsed_host = urlparse.urlsplit(opt_host)
246 opt_host = urlparse.urlunsplit((parsed_host.scheme or 'http', parsed_host.netloc or parsed_host.path, '', '', '')) 279 opt_host = urlparse.urlunsplit((parsed_host.scheme or 'http', parsed_host.netloc or parsed_host.path, '', '', ''))
247 280
248 tmp_dir = tempfile.mkdtemp() 281 tmp_dir = tempfile.mkdtemp()
249 for img_elt in xml_tools.findAll(top_elt, ['img']): 282 try:
250 yield self.imgFilters(client, img_elt, options, opt_host, tmp_dir) 283 for img_elt in xml_tools.findAll(top_elt, ['img']):
251 os.rmdir(tmp_dir) # XXX: tmp_dir should be empty, or something went wrong 284 yield self.imgFilters(client, img_elt, options, opt_host, tmp_dir)
285 finally:
286 os.rmdir(tmp_dir) # XXX: tmp_dir should be empty, or something went wrong
252 287
253 # we now replace the content with filtered one 288 # we now replace the content with filtered one
254 mb_data['content_xhtml'] = top_elt.toXml() 289 mb_data['content_xhtml'] = top_elt.toXml()
255 290
256 @defer.inlineCallbacks 291 @defer.inlineCallbacks
320 """Register a blogImport method 355 """Register a blogImport method
321 356
322 @param name(unicode): unique importer name, should indicate the blogging software it handles and always be lowercase 357 @param name(unicode): unique importer name, should indicate the blogging software it handles and always be lowercase
323 @param callback(callable): method to call: 358 @param callback(callable): method to call:
324 the signature must be (client, location, options) (cf. [blogImport]) 359 the signature must be (client, location, options) (cf. [blogImport])
325 the importer must return an iterable of dict which must have the following keys: 360 the importer must return a tuple with (posts_data, posts_count)
361
362 posts_data is an iterable of dict which must have the following keys:
326 'blog' (dict): microblog data of the blog post (cf. http://wiki.goffi.org/wiki/Bridge_API_-_Microblogging/en) 363 'blog' (dict): microblog data of the blog post (cf. http://wiki.goffi.org/wiki/Bridge_API_-_Microblogging/en)
327 the importer MUST NOT create node or call XEP-0277 plugin itself 364 the importer MUST NOT create node or call XEP-0277 plugin itself
328 'comments*' key MUST NOT be used in this microblog_data, see below for comments 365 'comments*' key MUST NOT be used in this microblog_data, see below for comments
329 It is recommended to use a unique id in the "id" key which is constant per blog item, 366 It is recommended to use a unique id in the "id" key which is constant per blog item,
330 so if the import fails, a new import will overwrite the failed items and avoid duplicates. 367 so if the import fails, a new import will overwrite the failed items and avoid duplicates.
336 If allow_comments is False and some comments are present, an exceptions.DataError will be raised 373 If allow_comments is False and some comments are present, an exceptions.DataError will be raised
337 the import MAY optionally have the following keys: 374 the import MAY optionally have the following keys:
338 'url' (unicode): former url of the post (only the path, without host part) 375 'url' (unicode): former url of the post (only the path, without host part)
339 if present the association to the new path will be displayed to user, so it can make redirections if necessary 376 if present the association to the new path will be displayed to user, so it can make redirections if necessary
340 377
341 Optionally, the importer plugin can return a tuple with the just described iterator and a post_total 378 posts_count (int, None) indicate the total number of posts (without comments)
342 where "post_total" (int) indicate the total number of posts (without comments)
343 useful to display a progress indicator when the iterator is a generator 379 useful to display a progress indicator when the iterator is a generator
380 use None if you can't guess the total number of blog posts
344 @param short_desc(unicode): one line description of the importer 381 @param short_desc(unicode): one line description of the importer
345 @param long_desc(unicode): long description of the importer, its options, etc. 382 @param long_desc(unicode): long description of the importer, its options, etc.
346 """ 383 """
347 name = name.lower() 384 name = name.lower()
348 if name in self._importers: 385 if name in self._importers: