comparison src/plugins/plugin_blog_import.py @ 2369:cdaa58e14553

plugin import: generic data import plugin: this plugin handles common tasks for importers. Specialized importers (e.g. blog import) use it as a base, and specific importers (e.g. Dotclear) register to the specialized one. The generic blog importer methods have been moved to it.
author Goffi <goffi@goffi.org>
date Sun, 01 Oct 2017 12:21:23 +0200
parents 33c8c4973743
children 2c2b826b0bb3
comparison
equal deleted inserted replaced
2368:3865a772c360 2369:cdaa58e14553
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details. 15 # GNU Affero General Public License for more details.
16 16
17 # You should have received a copy of the GNU Affero General Public License 17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>. 18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
19 20
20 from sat.core.i18n import _ 21 from sat.core.i18n import _
21 from sat.core.constants import Const as C 22 from sat.core.constants import Const as C
22 from sat.core.log import getLogger 23 from sat.core.log import getLogger
23 log = getLogger(__name__) 24 log = getLogger(__name__)
29 import collections 30 import collections
30 import os 31 import os
31 import os.path 32 import os.path
32 import tempfile 33 import tempfile
33 import urlparse 34 import urlparse
34 import uuid 35 import shortuuid
35 36
36 37
37 PLUGIN_INFO = { 38 PLUGIN_INFO = {
38 C.PI_NAME: "blog import", 39 C.PI_NAME: "blog import",
39 C.PI_IMPORT_NAME: "BLOG_IMPORT", 40 C.PI_IMPORT_NAME: "BLOG_IMPORT",
40 C.PI_TYPE: C.PLUG_TYPE_BLOG, 41 C.PI_TYPE: (C.PLUG_TYPE_BLOG, C.PLUG_TYPE_IMPORT),
41 C.PI_DEPENDENCIES: ["XEP-0060", "XEP-0277", "TEXT-SYNTAXES", "UPLOAD"], 42 C.PI_DEPENDENCIES: ["IMPORT", "XEP-0060", "XEP-0277", "TEXT-SYNTAXES", "UPLOAD"],
42 C.PI_MAIN: "BlogImportPlugin", 43 C.PI_MAIN: "BlogImportPlugin",
43 C.PI_HANDLER: "no", 44 C.PI_HANDLER: "no",
44 C.PI_DESCRIPTION: _(u"""Blog import management: 45 C.PI_DESCRIPTION: _(u"""Blog import management:
45 This plugin manage the different blog importers which can register to it, and handler generic importing tasks.""") 46 This plugin manage the different blog importers which can register to it, and handle generic importing tasks.""")
46 } 47 }
47 48
48 OPT_HOST = 'host' 49 OPT_HOST = 'host'
49 OPT_UPLOAD_IMAGES = 'upload_images' 50 OPT_UPLOAD_IMAGES = 'upload_images'
50 OPT_UPLOAD_IGNORE_HOST = 'upload_ignore_host' 51 OPT_UPLOAD_IGNORE_HOST = 'upload_ignore_host'
51 OPT_IGNORE_TLS = 'ignore_tls_errors' 52 OPT_IGNORE_TLS = 'ignore_tls_errors'
52 URL_REDIRECT_PREFIX = 'url_redirect_' 53 URL_REDIRECT_PREFIX = 'url_redirect_'
53 BOOL_OPTIONS = (OPT_UPLOAD_IMAGES, OPT_IGNORE_TLS)
54 54
55 55
56 BlogImporter = collections.namedtuple('BlogImporter', ('callback', 'short_desc', 'long_desc')) 56 BlogImporter = collections.namedtuple('BlogImporter', ('callback', 'short_desc', 'long_desc'))
57 57
58 58
59 class BlogImportPlugin(object): 59 class BlogImportPlugin(object):
60 BOOL_OPTIONS = (OPT_UPLOAD_IMAGES, OPT_IGNORE_TLS)
61 OPT_DEFAULTS = {OPT_UPLOAD_IMAGES: True,
62 OPT_IGNORE_TLS: False}
60 63
61 def __init__(self, host): 64 def __init__(self, host):
62 log.info(_("plugin Blog Import initialization")) 65 log.info(_("plugin Blog Import initialization"))
63 self.host = host 66 self.host = host
64 self._importers = {}
65 self._u = host.plugins['UPLOAD'] 67 self._u = host.plugins['UPLOAD']
66 self._p = host.plugins['XEP-0060'] 68 self._p = host.plugins['XEP-0060']
67 self._m = host.plugins['XEP-0277'] 69 self._m = host.plugins['XEP-0277']
68 self._s = self.host.plugins['TEXT-SYNTAXES'] 70 self._s = self.host.plugins['TEXT-SYNTAXES']
69 host.bridge.addMethod("blogImport", ".plugin", in_sign='ssa{ss}ss', out_sign='s', method=self._blogImport, async=True) 71 host.plugins['IMPORT'].initialize(self, u'blog')
70 host.bridge.addMethod("blogImportList", ".plugin", in_sign='', out_sign='a(ss)', method=self.listImporters) 72
71 host.bridge.addMethod("blogImportDesc", ".plugin", in_sign='s', out_sign='(ss)', method=self.getDescription) 73 def importItem(self, client, item_import_data, options, return_data, service, node):
72 74 """importItem specialized for blog import
73 def getProgress(self, progress_id, profile): 75
74 client = self.host.getClient(profile) 76 @param items_import_data(iterable[dict]):
75 return client._blogImport_progress[progress_id] 77 * mandatory keys:
76 78 'blog' (dict): microblog data of the blog post (cf. http://wiki.goffi.org/wiki/Bridge_API_-_Microblogging/en)
77 def listImporters(self): 79 the importer MUST NOT create node or call XEP-0277 plugin itself
78 importers = self._importers.keys() 80 'comments*' key MUST NOT be used in this microblog_data, see bellow for comments
79 importers.sort() 81 It is recommanded to use a unique id in the "id" key which is constant per blog item,
80 return [(name, self._importers[name].short_desc) for name in self._importers] 82 so if the import fail, a new import will overwrite the failed items and avoid duplicates.
81 83
82 def getDescription(self, name): 84 'comments' (list[list[dict]],None): Dictionaries must have the same keys as main item (i.e. 'blog' and 'comments')
83 """Return import short and long descriptions 85 a list of list is used because XEP-0277 can handler several comments nodes,
84 86 but in most cases, there will we only one item it the first list (something like [[{comment1_data},{comment2_data}, ...]])
85 @param name(unicode): blog importer name 87 blog['allow_comments'] must be True if there is any comment, and False (or not present) if comments are not allowed.
86 @return (tuple[unicode,unicode]): short and long description 88 If allow_comments is False and some comments are present, an exceptions.DataError will be raised
87 """ 89 * optional keys:
88 try: 90 'url' (unicode): former url of the post (only the path, without host part)
89 importer = self._importers[name] 91 if present the association to the new path will be displayed to user, so it can make redirections if necessary
90 except KeyError: 92 @param options(dict, None): Below are the generic options,
91 raise exceptions.NotFound(u"Blog importer not found [{}]".format(name))
92 else:
93 return importer.short_desc, importer.long_desc
94
95 def _blogImport(self, name, location, options, pubsub_service='', profile=C.PROF_KEY_DEFAULT):
96 client = self.host.getClient(profile)
97 options = {key: unicode(value) for key, value in options.iteritems()}
98 for option in BOOL_OPTIONS:
99 try:
100 options[option] = C.bool(options[option])
101 except KeyError:
102 pass
103 return self.blogImport(client, unicode(name), unicode(location), options)
104
105 @defer.inlineCallbacks
106 def blogImport(self, client, name, location, options=None, pubsub_service=None):
107 """Import a blog
108
109 @param name(unicode): name of the blog importer
110 @param location(unicode): location of the blog data to import
111 can be an url, a file path, or anything which make sense
112 check importer description for more details
113 @param options(dict, None): extra options. Below are the generic options,
114 blog importer can have specific ones. All options have unicode values 93 blog importer can have specific ones. All options have unicode values
115 generic options: 94 generic options:
116 - OPT_HOST (unicode): original host 95 - OPT_HOST (unicode): original host
117 - OPT_UPLOAD_IMAGES (bool): upload images to XMPP server if True 96 - OPT_UPLOAD_IMAGES (bool): upload images to XMPP server if True
118 see OPT_UPLOAD_IGNORE_HOST. 97 see OPT_UPLOAD_IGNORE_HOST.
119 Default: True 98 Default: True
120 - OPT_UPLOAD_IGNORE_HOST (unicode): don't upload images from this host 99 - OPT_UPLOAD_IGNORE_HOST (unicode): don't upload images from this host
121 - OPT_IGNORE_TLS (bool): ignore TLS error for image upload. 100 - OPT_IGNORE_TLS (bool): ignore TLS error for image upload.
122 Default: False 101 Default: False
123 @param pubsub_service(jid.JID, None): jid of the PubSub service where blog must be imported 102 @param return_data(dict): will contain link between former posts and new items
124 None to use profile's server 103
125 @return (unicode): progress id
126 """ 104 """
127 if options is None: 105 mb_data = item_import_data['blog']
128 options = {} 106 try:
107 item_id = mb_data['id']
108 except KeyError:
109 item_id = mb_data['id'] = unicode(shortuuid.uuid())
110
111 try:
112 # we keep the link between old url and new blog item
113 # so the user can redirect its former blog urls
114 old_uri = item_import_data['url']
115 except KeyError:
116 pass
129 else: 117 else:
130 for opt_name, opt_default in ((OPT_UPLOAD_IMAGES, True), 118 new_uri = return_data[URL_REDIRECT_PREFIX + old_uri] = self._p.getNodeURI(
131 (OPT_IGNORE_TLS, False)): 119 service if service is not None else client.jid.userhostJID(),
132 # we want an filled options dict, with all empty or False values removed 120 node or self._m.namespace,
133 try: 121 item_id)
134 value =options[opt_name] 122 log.info(u"url link from {old} to {new}".format(
135 except KeyError: 123 old=old_uri, new=new_uri))
136 if opt_default: 124
137 options[opt_name] = opt_default 125 return mb_data
138 else: 126
139 if not value: 127 def importSubItems(self, client, item_import_data, mb_data, options):
140 del options[opt_name] 128 # comments data
141 try: 129 if len(item_import_data['comments']) != 1:
142 importer = self._importers[name] 130 raise NotImplementedError(u"can't manage multiple comment links")
143 except KeyError: 131 allow_comments = C.bool(mb_data.get('allow_comments', C.BOOL_FALSE))
144 raise exceptions.NotFound(u"Importer [{}] not found".format(name)) 132 if allow_comments:
145 posts_data, posts_count = yield importer.callback(client, location, options) 133 comments_service, comments_node = self._m.getCommentsService(client), self._m.getCommentsNode(mb_data['id'])
146 url_redirect = {} 134 mb_data['comments_service'] = comments_service.full()
147 progress_id = unicode(uuid.uuid4()) 135 mb_data['comments_node'] = comments_node
148 try: 136 recurse_kwargs = {
149 progress_data = client._blogImport_progress 137 'items_import_data':item_import_data['comments'][0],
150 except AttributeError: 138 'service':comments_service,
151 progress_data = client._blogImport_progress = {} 139 'node':comments_node}
152 progress_data[progress_id] = {u'position': '0'} 140 return recurse_kwargs
153 if posts_count is not None: 141 else:
154 progress_data[progress_id]['size'] = unicode(posts_count) 142 if item_import_data['comments'][0]:
155 metadata = {'name': u'{}: {}'.format(name, location), 143 raise exceptions.DataError(u"allow_comments set to False, but comments are there")
156 'direction': 'out', 144 return None
157 'type': 'BLOG_IMPORT' 145
158 } 146 def publishItem(self, client, mb_data, service, node):
159 self.host.registerProgressCb(progress_id, self.getProgress, metadata, profile=client.profile) 147 log.debug(u"uploading item [{id}]: {title}".format(id=mb_data['id'], title=mb_data.get('title','')))
160 self.host.bridge.progressStarted(progress_id, metadata, client.profile) 148 return self._m.send(client, mb_data, service, node)
161 self._recursiveImport(client, posts_data, progress_id, options, url_redirect)
162 defer.returnValue(progress_id)
163 149
164 @defer.inlineCallbacks 150 @defer.inlineCallbacks
165 def _recursiveImport(self, client, posts_data, progress_id, options, url_redirect, service=None, node=None, depth=0): 151 def itemFilters(self, client, mb_data, options):
166 """Do the upload recursively
167
168 @param posts_data(list): list of data as specified in [register]
169 @param options(dict): import options
170 @param url_redirect(dict): link between former posts and new items
171 @param service(jid.JID, None): PubSub service to use
172 @param node(unicode, None): PubSub node to use
173 @param depth(int): level of recursion
174 """
175 for idx, data in enumerate(posts_data):
176 # data checks/filters
177 mb_data = data['blog']
178 try:
179 item_id = mb_data['id']
180 except KeyError:
181 item_id = mb_data['id'] = unicode(uuid.uuid4())
182
183 try:
184 # we keep the link between old url and new blog item
185 # so the user can redirect its former blog urls
186 old_uri = data['url']
187 except KeyError:
188 pass
189 else:
190 new_uri = url_redirect[old_uri] = self._p.getNodeURI(
191 service if service is not None else client.jid.userhostJID(),
192 node or self._m.namespace,
193 item_id)
194 log.info(u"url link from {old} to {new}".format(
195 old=old_uri, new=new_uri))
196
197 yield self.blogFilters(client, mb_data, options)
198
199 # comments data
200 if len(data['comments']) != 1:
201 raise NotImplementedError(u"can't manage multiple comment links")
202 allow_comments = C.bool(mb_data.get('allow_comments', C.BOOL_FALSE))
203 if allow_comments:
204 comments_service, comments_node = self._m.getCommentsService(client), self._m.getCommentsNode(item_id)
205 mb_data['comments_service'] = comments_service.full()
206 mb_data['comments_node'] = comments_node
207 else:
208 if data['comments'][0]:
209 raise exceptions.DataError(u"allow_comments set to False, but comments are there")
210
211 # post upload
212 depth or log.debug(u"uploading item [{id}]: {title}".format(id=mb_data['id'], title=mb_data.get('title','')))
213 yield self._m.send(mb_data, service, node, profile=client.profile)
214
215 # comments upload
216 depth or log.debug(u"uploading comments")
217 if allow_comments:
218 yield self._recursiveImport(client, data['comments'][0], progress_id, options, url_redirect, service=comments_service, node=comments_node, depth=depth+1)
219 if depth == 0:
220 client._blogImport_progress[progress_id]['position'] = unicode(idx+1)
221
222 if depth == 0:
223 self.host.bridge.progressFinished(progress_id,
224 {u'{}{}'.format(URL_REDIRECT_PREFIX, old): new for old, new in url_redirect.iteritems()},
225 client.profile)
226 self.host.removeProgressCb(progress_id, client.profile)
227 del client._blogImport_progress[progress_id]
228
229 @defer.inlineCallbacks
230 def blogFilters(self, client, mb_data, options):
231 """Apply filters according to options 152 """Apply filters according to options
232 153
233 modify mb_data in place 154 modify mb_data in place
234 @param posts_data(list[dict]): data as returned by importer callback 155 @param posts_data(list[dict]): data as returned by importer callback
235 @param options(dict): dict as given in [blogImport] 156 @param options(dict): dict as given in [blogImport]
350 271
351 try: 272 try:
352 os.unlink(tmp_file) 273 os.unlink(tmp_file)
353 except OSError: 274 except OSError:
354 pass 275 pass
355
356 def register(self, name, callback, short_desc='', long_desc=''):
357 """Register a blogImport method
358
359 @param name(unicode): unique importer name, should indicate the blogging software it handler and always lowercase
360 @param callback(callable): method to call:
361 the signature must be (client, location, options) (cf. [blogImport])
362 the importer must return a tuple with (posts_data, posts_count)
363
364 posts_data is an iterable of dict which must have the following keys:
365 'blog' (dict): microblog data of the blog post (cf. http://wiki.goffi.org/wiki/Bridge_API_-_Microblogging/en)
366 the importer MUST NOT create node or call XEP-0277 plugin itself
367 'comments*' key MUST NOT be used in this microblog_data, see bellow for comments
368 It is recommanded to use a unique id in the "id" key which is constant per blog item,
369 so if the import fail, a new import will overwrite the failed items and avoid duplicates.
370
371 'comments' (list[list[dict]],None): Dictionaries must have the same keys as main item (i.e. 'blog' and 'comments')
372 a list of list is used because XEP-0277 can handler several comments nodes,
373 but in most cases, there will we only one item it the first list (something like [[{comment1_data},{comment2_data}, ...]])
374 blog['allow_comments'] must be True if there is any comment, and False (or not present) if comments are not allowed.
375 If allow_comments is False and some comments are present, a exceptions.DataError will be raised
376 the import MAY optionally have the following keys:
377 'url' (unicode): former url of the post (only the path, without host part)
378 if present the association to the new path will be displayed to user, so it can make redirections if necessary
379
380 posts_count (int, None) indicate the total number of posts (without comments)
381 useful to display a progress indicator when the iterator is a generator
382 use None if you can't guess the total number of blog posts
383 @param short_desc(unicode): one line description of the importer
384 @param long_desc(unicode): long description of the importer, its options, etc.
385 """
386 name = name.lower()
387 if name in self._importers:
388 raise exceptions.ConflictError(u"A blog importer with the name {} already exsit".format(name))
389 self._importers[name] = BlogImporter(callback, short_desc, long_desc)
390
391 def unregister(self, name):
392 del self._importers[name]