changeset 1825:4e51f21c687f

plugin blog import: this plugin is the base for handling blog importers: - blog importers are plugins which register to this one with the "register" method - blog import manages uploading the given blog data to a PubSub server - filters can be used - current filters fix the <img> src without host, and upload images to the XMPP server
author Goffi <goffi@goffi.org>
date Fri, 22 Jan 2016 20:24:17 +0100 (2016-01-22)
parents a19161bb3ff7
children d80ccf4bf201
files src/core/constants.py src/plugins/plugin_blog_import.py
diffstat 2 files changed, 354 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- a/src/core/constants.py	Fri Jan 22 20:24:17 2016 +0100
+++ b/src/core/constants.py	Fri Jan 22 20:24:17 2016 +0100
@@ -145,6 +145,7 @@
     PLUG_TYPE_EXP = "EXP"
     PLUG_TYPE_SEC = "SEC"
     PLUG_TYPE_SYNTAXE = "SYNTAXE"
+    PLUG_TYPE_BLOG = "BLOG"
 
 
     # names of widely used plugins
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/plugin_blog_import.py	Fri Jan 22 20:24:17 2016 +0100
@@ -0,0 +1,353 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# SàT plugin for importing external blogs
+# Copyright (C) 2009-2016 Jérôme Poisson (goffi@goffi.org)
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from sat.core.i18n import _
+from sat.core.constants import Const as C
+from sat.core.log import getLogger
+log = getLogger(__name__)
+from twisted.internet import defer
+from twisted.web import client as web_client
+from twisted.words.protocols.jabber import jid
+from twisted.words.xish import domish
+from sat.core import exceptions
+from sat.tools import xml_tools
+import collections
+import os
+import os.path
+import tempfile
+import urlparse
+import uuid
+
+
+PLUGIN_INFO = {
+    "name": "blog import",
+    "import_name": "BLOG_IMPORT",
+    "type": C.PLUG_TYPE_BLOG,
+    "dependencies": ["XEP-0277", "TEXT-SYNTAXES", "UPLOAD"],
+    "main": "BlogImportPlugin",
+    "handler": "no",
+    "description": _(u"""Blog import management:
+This plugin manage the different blog importers which can register to it, and handler generic importing tasks.""")
+}
+
+OPT_HOST = 'host'
+OPT_UPLOAD_IMAGES = 'upload_images'
+OPT_UPLOAD_IGNORE_HOST = 'upload_ignore_host'
+OPT_IGNORE_TLS = 'ignore_tls_errors'
+BOOL_OPTIONS = (OPT_UPLOAD_IMAGES, OPT_IGNORE_TLS)
+
+
+BlogImporter = collections.namedtuple('BlogImporter', ('callback', 'short_desc', 'long_desc'))
+
+
+class BlogImportPlugin(object):
+
+    def __init__(self, host):
+        log.info(_("plugin Blog Import initialization"))
+        self.host = host
+        self._importers = {}
+        self._u = host.plugins['UPLOAD']
+        self._m = host.plugins['XEP-0277']
+        self._s = self.host.plugins['TEXT-SYNTAXES']
+        host.bridge.addMethod("blogImport", ".plugin", in_sign='ssa{ss}ss', out_sign='', method=self._blogImport, async=True)
+        host.bridge.addMethod("blogImportList", ".plugin", in_sign='', out_sign='a(ss)', method=self.listImporters)
+        host.bridge.addMethod("blogImportDesc", ".plugin", in_sign='s', out_sign='(ss)', method=self.getDescription)
+
+    def listImporters(self):
+        importers = self._importers.keys()
+        importers.sort()
+        return [(name, self._importers[name].short_desc) for name in importers]
+
+    def getDescription(self, name):
+        """Return import short and long descriptions
+
+        @param name(unicode): blog importer name
+        @return (tuple[unicode,unicode]): short and long description
+        """
+        try:
+            importer = self._importers[name]
+        except KeyError:
+            raise exceptions.NotFound(u"Blog importer not found [{}]".format(name))
+        else:
+            return importer.short_desc, importer.long_desc
+
+    def _blogImport(self, name, location, options, pubsub_service='', profile=C.PROF_KEY_DEFAULT):
+        client = self.host.getClient(profile)
+        for option in BOOL_OPTIONS:
+            try:
+                options[option] = C.bool(options[option])
+            except KeyError:
+                pass
+        pubsub_service = jid.JID(pubsub_service) if pubsub_service else None
+        return self.blogImport(client, name, location, options, pubsub_service)
+
+    @defer.inlineCallbacks
+    def blogImport(self, client, name, location, options=None, pubsub_service=None):
+        """Import a blog
+
+        @param name(unicode): name of the blog importer
+        @param location(unicode): location of the blog data to import
+            can be a URL, a file path, or anything which makes sense
+            check the importer description for more details
+        @param options(dict, None): extra options. Below are the generic options;
+            blog importers can have specific ones. All options have unicode values
+            generic options:
+                - OPT_HOST (unicode): original host
+                - OPT_UPLOAD_IMAGES (bool): upload images to XMPP server if True
+                    see OPT_UPLOAD_IGNORE_HOST.
+                    Default: True
+                - OPT_UPLOAD_IGNORE_HOST (unicode): don't upload images from this host
+                - OPT_IGNORE_TLS (bool): ignore TLS error for image upload.
+                    Default: False
+        @param pubsub_service(jid.JID, None): jid of the PubSub service where blog must be imported
+            None to use profile's server
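+
+        Example (hypothetical importer name and values, for illustration only):
+            blogImport(client, u'someblog', u'/path/to/blog_data',
+                       options={'host': u'http://blog.example.net',
+                                'upload_ignore_host': u'example.net'})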
+        """
+        if options is None:
+            options = {}
+        else:
+            for opt_name, opt_default in ((OPT_UPLOAD_IMAGES, True),
+                                          (OPT_IGNORE_TLS, False)):
+                # we want a filled options dict, with all empty or False values removed
+                try:
+                    value = options[opt_name]
+                except KeyError:
+                    if opt_default:
+                        options[opt_name] = opt_default
+                else:
+                    if not value:
+                        del options[opt_name]
+        try:
+            importer = self._importers[name]
+        except KeyError:
+            raise exceptions.NotFound(u"Importer [{}] not found".format(name))
+        posts_data = yield importer.callback(client, location, options)
+        url_links = {}
+        yield self._recursiveImport(client, posts_data, options, url_links, service=pubsub_service)
+
+    @defer.inlineCallbacks
+    def _recursiveImport(self, client, posts_data, options, url_links, service=None, node=None, depth=0):
+        """Do the upload recursively
+
+        @param posts_data(list): list of data as specified in [register]
+        @param options(dict): import options
+        @param url_links(dict): link between former posts and new items
+        @param service(jid.JID, None): PubSub service to use
+        @param node(unicode, None): PubSub node to use
+        @param depth(int): level of recursion
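+
+        url_links is filled in place; after the import it may look like
+        (illustrative values):
+            {u'/2015/10/old-post.html': (None, None, u'item-uuid')}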
+        """
+        for data in posts_data:
+            # data checks/filters
+            mb_data = data['blog']
+            try:
+                item_id = mb_data['id']
+            except KeyError:
+                item_id = mb_data['id'] = unicode(uuid.uuid4())
+
+            try:
+                # we keep the link between old url and new blog item
+                # so the user can redirect their former blog URLs
+                old_url = data['url']
+            except KeyError:
+                pass
+            else:
+                url_links[old_url] = (service, node, item_id)
+                log.info(u"url link from {old} to {service}/{node}/{id}".format(
+                    old = old_url, service=service or u'server', node=node or u'', id=item_id))
+
+            depth or log.debug(u"Filtering data")
+            yield self.blogFilters(client, mb_data, options)
+
+            # comments data
+            if len(data['comments']) != 1:
+                raise NotImplementedError(u"can't manage multiple comment links")
+            allow_comments = C.bool(mb_data.get('allow_comments', C.BOOL_FALSE))
+            if allow_comments:
+                comments_service, comments_node = self._m.getCommentService(client), self._m.getCommentNode(item_id)
+                mb_data['comments_service'] = comments_service
+                mb_data['comments_node'] = comments_node
+            else:
+                if data['comments'][0]:
+                    raise exceptions.DataError(u"allow_comments set to False, but comments are there")
+
+            # post upload
+            depth or log.debug(u"uploading item [{id}]: {title}".format(id=mb_data['id'], title=mb_data.get('title','')))
+            yield self._m.send(mb_data, service, node, profile=client.profile)
+
+            # comments upload
+            depth or log.debug(u"uploading comments")
+            if allow_comments:
+                yield self._recursiveImport(client, data['comments'][0], options, url_links, service=comments_service, node=comments_node, depth=depth+1)
+
+    @defer.inlineCallbacks
+    def blogFilters(self, client, mb_data, options):
+        """Apply filters according to options
+
+        modify mb_data in place
+        @param mb_data(dict): microblog data as returned by the importer callback
+        @param options(dict): dict as given in [blogImport]
+        """
+        # FIXME: blog filters don't work on text content
+        # TODO: text => XHTML conversion should handle links with <a/>
+        #       filters can then be used by converting text to XHTML
+        if not options:
+            return
+
+        # we want only XHTML content
+        for prefix in ('content',): # a tuple is used, so that 'title' can be added in the future
+            try:
+                rich = mb_data['{}_rich'.format(prefix)]
+            except KeyError:
+                pass
+            else:
+                if '{}_xhtml'.format(prefix) in mb_data:
+                    raise exceptions.DataError(u"importer gave {prefix}_rich and {prefix}_xhtml at the same time, this is not allowed".format(prefix=prefix))
+                # we convert rich syntax to XHTML here, so we can handle filters easily
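+                # e.g. (illustrative, assuming the current syntax is markdown) u'**hello**' would convert to u'<p><strong>hello</strong></p>'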
+                converted = yield self._s.convert(rich, self._s.getCurrentSyntax(client.profile), safe=False)
+                mb_data['{}_xhtml'.format(prefix)] = converted
+                del mb_data['{}_rich'.format(prefix)]
+
+            try:
+                mb_data['{}_text'.format(prefix)]
+            except KeyError:
+                pass
+            else:
+                if '{}_xhtml'.format(prefix) in mb_data:
+                    log.warning(u"{prefix}_text will be replaced by converted {prefix}_xhtml, so filters can be handled".format(prefix=prefix))
+                    del mb_data['{}_text'.format(prefix)]
+                else:
+                    log.warning(u"importer gave a text {prefix}, blog filters don't work on text {prefix}".format(prefix=prefix))
+                    return
+
+        # at this point, we have only XHTML version of content
+        try:
+            top_elt = xml_tools.ElementParser()(mb_data['content_xhtml'], namespace=C.NS_XHTML)
+        except domish.ParserError:
+            # we clean the XML and try our luck again
+            cleaned = yield self._s.cleanXHTML(mb_data['content_xhtml'])
+            top_elt = xml_tools.ElementParser()(cleaned, namespace=C.NS_XHTML)
+        opt_host = options.get(OPT_HOST)
+        if opt_host:
+            # we normalise the domain
+            parsed_host = urlparse.urlsplit(opt_host)
+            opt_host = urlparse.urlunsplit((parsed_host.scheme or 'http', parsed_host.netloc or parsed_host.path, '', '', ''))
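+            # e.g. (illustrative value) u'blog.example.net' becomes u'http://blog.example.net'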
+
+        tmp_dir = tempfile.mkdtemp()
+        for img_elt in xml_tools.findAll(top_elt, ['img']):
+            yield self.imgFilters(client, img_elt, options, opt_host, tmp_dir)
+        os.rmdir(tmp_dir) # XXX: tmp_dir should be empty, or something went wrong
+
+        # we now replace the content with filtered one
+        mb_data['content_xhtml'] = top_elt.toXml()
+
+    @defer.inlineCallbacks
+    def imgFilters(self, client, img_elt, options, opt_host, tmp_dir):
+        """Filters handling images
+
+        URLs without host are fixed (if possible)
+        according to options, images are uploaded to the XMPP server
+        @param img_elt(domish.Element): <img/> element to handle
+        @param options(dict): filters options
+        @param opt_host(unicode): normalised host given in options
+        @param tmp_dir(str): path to temp directory
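+
+        e.g. (illustrative values) a src of "/images/pic.png" with an opt_host of
+        u'http://blog.example.net' is rewritten to u'http://blog.example.net/images/pic.png'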
+        """
+        try:
+            url = img_elt['src']
+            if url[0] == u'/':
+                if not opt_host:
+                    log.warning(u"host was not specified, we can't deal with src without host ({url}) and have to ignore the following <img/>:\n{xml}"
+                        .format(url=url, xml=img_elt.toXml()))
+                    return
+                else:
+                    url = urlparse.urljoin(opt_host, url)
+            filename = url.rsplit('/',1)[-1].strip()
+            if not filename:
+                raise KeyError
+        except (KeyError, IndexError):
+            log.warning(u"ignoring invalid img element: {}".format(img_elt.toXml()))
+            return
+
+        # we replace the url with the normalized one
+        img_elt['src'] = url
+
+        if options.get(OPT_UPLOAD_IMAGES, False):
+            # upload is requested
+            try:
+                ignore_host = options[OPT_UPLOAD_IGNORE_HOST]
+            except KeyError:
+                pass
+            else:
+                # host is the ignored one, we skip
+                parsed_url = urlparse.urlsplit(url)
+                if parsed_url.hostname and ignore_host in parsed_url.hostname:
+                    log.info(u"Don't upload image at {url} because of {opt} option".format(
+                        url=url, opt=OPT_UPLOAD_IGNORE_HOST))
+                    return
+
+            # we download images and re-upload them via XMPP
+            tmp_file = os.path.join(tmp_dir, filename).encode('utf-8')
+            upload_options = {'ignore_tls_errors': options.get(OPT_IGNORE_TLS, False)}
+
+            try:
+                yield web_client.downloadPage(url.encode('utf-8'), tmp_file)
+                filename = filename.replace(u'%', u'_') # FIXME: tmp workaround for a bug in prosody http upload
+                dummy, download_d = yield self._u.upload(client, tmp_file, filename, options=upload_options)
+                download_url = yield download_d
+            except Exception as e:
+                log.warning(u"can't download image at {url}: {reason}".format(url=url, reason=e))
+            else:
+                img_elt['src'] = download_url
+
+            try:
+                os.unlink(tmp_file)
+            except OSError:
+                pass
+
+    def register(self, name, callback, short_desc='', long_desc=''):
+        """Register a blogImport method
+
+        @param name(unicode): unique importer name, should indicate the blogging software it handler and always lowercase
+        @param callback(callable): method to call:
+            the signature must be (client, location, options) (cf. [blogImport])
+            the importer must return an iterable of dicts which must have the following keys:
+                'blog' (dict): microblog data of the blog post (cf. http://wiki.goffi.org/wiki/Bridge_API_-_Microblogging/en)
+                    the importer MUST NOT create the node or call the XEP-0277 plugin itself
+                    'comments*' keys MUST NOT be used in this microblog data, see below for comments
+                    It is recommended to use a unique id in the "id" key which is constant per blog item,
+                    so if the import fails, a new import will overwrite the failed items and avoid duplicates.
+
+                'comments' (list[list[dict]], None): dictionaries must have the same keys as the main item (i.e. 'blog' and 'comments')
+                    a list of lists is used because XEP-0277 can handle several comment nodes,
+                    but in most cases there will be only one inner list (something like [[{comment1_data}, {comment2_data}, ...]])
+                    blog['allow_comments'] must be True if there is any comment, and False (or not present) if comments are not allowed.
+                    If allow_comments is False and some comments are present, an exceptions.DataError will be raised
+            the dicts MAY optionally have the following keys:
+                'url' (unicode): former URL of the post (only the path, without the host part)
+                    if present, the association with the new item will be displayed to the user, so they can set up redirections if necessary
+
+            Optionally, the importer callback can return a tuple with the iterable just described and a "post_total"
+                where "post_total" (int) indicates the total number of posts (without comments)
+                useful to display a progress indicator when the iterable is a generator
+        @param short_desc(unicode): one line description of the importer
+        @param long_desc(unicode): long description of the importer, its options, etc.
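+
+        Illustrative example (hypothetical importer and values, not from any real importer):
+            host.plugins['BLOG_IMPORT'].register(u'someblog', self.importSomeBlog,
+                                                 _(u'import SomeBlog blogs'),
+                                                 _(u'long description of the importer'))
+            where self.importSomeBlog would yield items such as:
+                {'blog': {'id': u'first-post',
+                          'title': u'First post',
+                          'content_xhtml': u'<div>Hello world</div>',
+                          'allow_comments': C.BOOL_FALSE},
+                 'comments': [[]],
+                 'url': u'/post/1/first-post'}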
+        """
+        name = name.lower()
+        if name in self._importers:
+            raise exceptions.ConflictError(u"A blog importer with the name {} already exists".format(name))
+        self._importers[name] = BlogImporter(callback, short_desc, long_desc)
+
+    def unregister(self, name):
+        del self._importers[name]