Mercurial > libervia-backend
changeset 1825:4e51f21c687f
plugin blog import: this plugin is the base handling blog importers:
- blog importers are plugin which register to this one with the "register" method
- blog import manage uploading the given blog data to a PubSub server
- filters can be used.
- current filters fixe the <img> src without host, and upload images to XMPP server
author | Goffi <goffi@goffi.org> |
---|---|
date | Fri, 22 Jan 2016 20:24:17 +0100 (2016-01-22) |
parents | a19161bb3ff7 |
children | d80ccf4bf201 |
files | src/core/constants.py src/plugins/plugin_blog_import.py |
diffstat | 2 files changed, 354 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
--- a/src/core/constants.py Fri Jan 22 20:24:17 2016 +0100 +++ b/src/core/constants.py Fri Jan 22 20:24:17 2016 +0100 @@ -145,6 +145,7 @@ PLUG_TYPE_EXP = "EXP" PLUG_TYPE_SEC = "SEC" PLUG_TYPE_SYNTAXE = "SYNTAXE" + PLUG_TYPE_BLOG = "BLOG" # names of widely used plugins
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/src/plugins/plugin_blog_import.py Fri Jan 22 20:24:17 2016 +0100 @@ -0,0 +1,353 @@ +#!/usr/bin/python +# -*- coding: utf-8 -*- + +# SàT plugin for import external blogs +# Copyright (C) 2009-2016 Jérôme Poisson (goffi@goffi.org) + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +from sat.core.i18n import _ +from sat.core.constants import Const as C +from sat.core.log import getLogger +log = getLogger(__name__) +from twisted.internet import defer +from twisted.web import client as web_client +from twisted.words.xish import domish +from sat.core import exceptions +from sat.tools import xml_tools +import collections +import os +import os.path +import tempfile +import urlparse +import uuid + + +PLUGIN_INFO = { + "name": "blog import", + "import_name": "BLOG_IMPORT", + "type": C.PLUG_TYPE_BLOG, + "dependencies": ["XEP-0277", "TEXT-SYNTAXES", "UPLOAD"], + "main": "BlogImportPlugin", + "handler": "no", + "description": _(u"""Blog import management: +This plugin manage the different blog importers which can register to it, and handler generic importing tasks.""") +} + +OPT_HOST = 'host' +OPT_UPLOAD_IMAGES = 'upload_images' +OPT_UPLOAD_IGNORE_HOST = 'upload_ignore_host' +OPT_IGNORE_TLS = 'ignore_tls_errors' +BOOL_OPTIONS = (OPT_UPLOAD_IMAGES, OPT_IGNORE_TLS) + + +BlogImporter = collections.namedtuple('BlogImporter', ('callback', 'short_desc', 'long_desc')) + + +class BlogImportPlugin(object): + + def __init__(self, host): + log.info(_("plugin Blog Import initialization")) + self.host = host + self._importers = {} + self._u = host.plugins['UPLOAD'] + self._m = host.plugins['XEP-0277'] + self._s = self.host.plugins['TEXT-SYNTAXES'] + host.bridge.addMethod("blogImport", ".plugin", in_sign='ssa{ss}ss', out_sign='', method=self._blogImport, async=True) + host.bridge.addMethod("blogImportList", ".plugin", in_sign='', out_sign='a(ss)', method=self.listImporters) + host.bridge.addMethod("blogImportDesc", ".plugin", in_sign='s', out_sign='(ss)', method=self.getDescription) + + def listImporters(self): + importers = self._importers.keys() + importers.sort() + return [(name, self._importers[name].short_desc) for name in self._importers] + + def getDescription(self, name): + """Return import short and long descriptions + + @param name(unicode): blog importer name + @return (tuple[unicode,unicode]): short and long description + """ + try: + importer = self._importers[name] + except KeyError: + raise exceptions.NotFound(u"Blog importer not found [{}]".format(name)) + else: + return importer.short_desc, importer.long_desc + + def _blogImport(self, name, location, options, pubsub_service='', profile=C.PROF_KEY_DEFAULT): + client = self.host.getClient(profile) + for option in BOOL_OPTIONS: + try: + options[option] = C.bool(options[option]) + except KeyError: + pass + return self.blogImport(client, name, location, options) + + @defer.inlineCallbacks + def blogImport(self, client, name, location, options=None, pubsub_service=None): + """Import a blog + + @param name(unicode): name of the blog importer + @param location(unicode): location of the blog data to import + can be an url, a file path, or anything which make sense + check importer description for more details + @param options(dict, None): extra options. Below are the generic options, + blog importer can have specific ones. All options have unicode values + generic options: + - OPT_HOST (unicode): original host + - OPT_UPLOAD_IMAGES (bool): upload images to XMPP server if True + see OPT_UPLOAD_IGNORE_HOST. + Default: True + - OPT_UPLOAD_IGNORE_HOST (unicode): don't upload images from this host + - OPT_IGNORE_TLS (bool): ignore TLS error for image upload. + Default: False + @param pubsub_service(jid.JID, None): jid of the PubSub service where blog must be imported + None to use profile's server + """ + if options is None: + options = {} + else: + for opt_name, opt_default in ((OPT_UPLOAD_IMAGES, True), + (OPT_IGNORE_TLS, False)): + # we want an filled options dict, with all empty or False values removed + try: + value =options[opt_name] + except KeyError: + if opt_default: + options[opt_name] = opt_default + else: + if not value: + del options[opt_name] + try: + importer = self._importers[name] + except KeyError: + raise exceptions.NotFound(u"Importer [{}] not found".format(name)) + posts_data = yield importer.callback(client, location, options) + url_links = {} + yield self._recursiveImport(client, posts_data, options, url_links) + + @defer.inlineCallbacks + def _recursiveImport(self, client, posts_data, options, url_links, service=None, node=None, depth=0): + """Do the upload recursively + + @param posts_data(list): list of data as specified in [register] + @param options(dict): import options + @param url_links(dict): link between former posts and new items + @param service(jid.JID, None): PubSub service to use + @param node(unicode, None): PubSub node to use + @param depth(int): level of recursion + """ + for data in posts_data: + # data checks/filters + mb_data = data['blog'] + try: + item_id = mb_data['id'] + except KeyError: + item_id = mb_data['id'] = unicode(uuid.uuid4()) + + try: + # we keep the link between old url and new blog item + # so the user can redirect its former blog urls + old_url = data['url'] + except KeyError: + pass + else: + url_links[old_url] = (service, node, item_id) + log.info(u"url link from {old} to {service}/{node}/{id}".format( + old = old_url, service=service or u'server', node=node or u'', id=item_id)) + + depth or log.debug(u"Filtering data") + yield self.blogFilters(client, mb_data, options) + + # comments data + if len(data['comments']) != 1: + raise NotImplementedError(u"can't manage multiple comment links") + allow_comments = C.bool(mb_data.get('allow_comments', C.BOOL_FALSE)) + if allow_comments: + comments_service, comments_node = self._m.getCommentService(client), self._m.getCommentNode(item_id) + mb_data['comments_service'] = comments_service + mb_data['comments_node'] = comments_node + else: + if data['comments'][0]: + raise exceptions.DataError(u"allow_comments set to False, but comments are there") + + # post upload + depth or log.debug(u"uploading item [{id}]: {title}".format(id=mb_data['id'], title=mb_data.get('title',''))) + yield self._m.send(mb_data, service, node, profile=client.profile) + + # comments upload + depth or log.debug(u"uploading comments") + if allow_comments: + yield self._recursiveImport(client, data['comments'][0], options, url_links, service=comments_service, node=comments_node, depth=depth+1) + + @defer.inlineCallbacks + def blogFilters(self, client, mb_data, options): + """Apply filters according to options + + modify mb_data in place + @param posts_data(list[dict]): data as returned by importer callback + @param options(dict): dict as given in [blogImport] + """ + # FIXME: blog filters don't work on text content + # TODO: text => XHTML conversion should handler links with <a/> + # filters can then be used by converting text to XHTML + if not options: + return + + # we want only XHTML content + for prefix in ('content',): # a tuple is use, if title need to be added in the future + try: + rich = mb_data['{}_rich'.format(prefix)] + except KeyError: + pass + else: + if '{}_xhtml'.format(prefix) in mb_data: + raise exceptions.DataError(u"importer gave {prefix}_rich and {prefix}_xhtml at the same time, this is not allowed".format(prefix=prefix)) + # we convert rich syntax to XHTML here, so we can handle filters easily + converted = yield self._s.convert(rich, self._s.getCurrentSyntax(client.profile), safe=False) + mb_data['{}_xhtml'.format(prefix)] = converted + del mb_data['{}_rich'.format(prefix)] + + try: + mb_data['txt'] + except KeyError: + pass + else: + if '{}_xhtml'.format(prefix) in mb_data: + log.warning(u"{prefix}_text will be replaced by converted {prefix}_xhtml, so filters can be handled".format(prefix=prefix)) + del mb_data['{}_text'.format(prefix)] + else: + log.warning(u"importer gave a text {prefix}, blog filters don't work on text {prefix}".format(prefix=prefix)) + return + + # at this point, we have only XHTML version of content + try: + top_elt = xml_tools.ElementParser()(mb_data['content_xhtml'], namespace=C.NS_XHTML) + except domish.ParserError: + # we clean the xml and try again our luck + cleaned = yield self._s.cleanXHTML(mb_data['content_xhtml']) + top_elt = xml_tools.ElementParser()(cleaned, namespace=C.NS_XHTML) + opt_host = options.get(OPT_HOST) + if opt_host: + # we normalise the domain + parsed_host = urlparse.urlsplit(opt_host) + opt_host = urlparse.urlunsplit((parsed_host.scheme or 'http', parsed_host.netloc or parsed_host.path, '', '', '')) + + tmp_dir = tempfile.mkdtemp() + for img_elt in xml_tools.findAll(top_elt, ['img']): + yield self.imgFilters(client, img_elt, options, opt_host, tmp_dir) + os.rmdir(tmp_dir) # XXX: tmp_dir should be empty, or something went wrong + + # we now replace the content with filtered one + mb_data['content_xhtml'] = top_elt.toXml() + + @defer.inlineCallbacks + def imgFilters(self, client, img_elt, options, opt_host, tmp_dir): + """Filters handling images + + url without host are fixed (if possible) + according to options, images are uploaded to XMPP server + @param img_elt(domish.Element): <img/> element to handle + @param options(dict): filters options + @param opt_host(unicode): normalised host given in options + @param tmp_dir(str): path to temp directory + """ + try: + url = img_elt['src'] + if url[0] == u'/': + if not opt_host: + log.warning(u"host was not specified, we can't deal with src without host ({url}) and have to ignore the following <img/>:\n{xml}" + .format(url=url, xml=img_elt.toXml())) + return + else: + url = urlparse.urljoin(opt_host, url) + filename = url.rsplit('/',1)[-1].strip() + if not filename: + raise KeyError + except (KeyError, IndexError): + log.warning(u"ignoring invalid img element: {}".format(img_elt.toXml())) + return + + # we change the url for the normalized one + img_elt['src'] = url + + if options.get(OPT_UPLOAD_IMAGES, False): + # upload is requested + try: + ignore_host = options[OPT_UPLOAD_IGNORE_HOST] + except KeyError: + pass + else: + # host is the ignored one, we skip + parsed_url = urlparse.urlsplit(url) + if ignore_host in parsed_url.hostname: + log.info(u"Don't upload image at {url} because of {opt} option".format( + url=url, opt=OPT_UPLOAD_IGNORE_HOST)) + return + + # we download images and re-upload them via XMPP + tmp_file = os.path.join(tmp_dir, filename).encode('utf-8') + upload_options = {'ignore_tls_errors': options.get(OPT_IGNORE_TLS, False)} + + try: + yield web_client.downloadPage(url.encode('utf-8'), tmp_file) + filename = filename.replace(u'%', u'_') # FIXME: tmp workaround for a bug in prosody http upload + dummy, download_d = yield self._u.upload(client, tmp_file, filename, options=upload_options) + download_url = yield download_d + except Exception as e: + log.warning(u"can't download image at {url}: {reason}".format(url=url, reason=e)) + else: + img_elt['src'] = download_url + + try: + os.unlink(tmp_file) + except OSError: + pass + + def register(self, name, callback, short_desc='', long_desc=''): + """Register a blogImport method + + @param name(unicode): unique importer name, should indicate the blogging software it handler and always lowercase + @param callback(callable): method to call: + the signature must be (client, location, options) (cf. [blogImport]) + the importer must return an iterable of dict which must have the following keys: + 'blog' (dict): microblog data of the blog post (cf. http://wiki.goffi.org/wiki/Bridge_API_-_Microblogging/en) + the importer MUST NOT create node or call XEP-0277 plugin itself + 'comments*' key MUST NOT be used in this microblog_data, see bellow for comments + It is recommanded to use a unique id in the "id" key which is constant per blog item, + so if the import fail, a new import will overwrite the failed items and avoid duplicates. + + 'comments' (list[list[dict]],None): Dictionaries must have the same keys as main item (i.e. 'blog' and 'comments') + a list of list is used because XEP-0277 can handler several comments nodes, + but in most cases, there will we only one item it the first list (something like [[{comment1_data},{comment2_data}, ...]]) + blog['allow_comments'] must be True if there is any comment, and False (or not present) if comments are not allowed. + If allow_comments is False and some comments are present, a exceptions.DataError will be raised + the import MAY optionally have the following keys: + 'url' (unicode): former url of the post (only the path, without host part) + if present the association to the new path will be displayed to user, so it can make redirections if necessary + + Optionally, the importer plugin can return a tuple with the just described iterator and a post_total + where "post_total" (int) indicate the total number of posts (without comments) + useful to display a progress indicator when the iterator is a generator + @param short_desc(unicode): one line description of the importer + @param long_desc(unicode): long description of the importer, its options, etc. + """ + name = name.lower() + if name in self._importers: + raise exceptions.ConflictError(u"A blog importer with the name {} already exsit".format(name)) + self._importers[name] = BlogImporter(callback, short_desc, long_desc) + + def unregister(self, name): + del self._importers[name]