comparison src/plugins/plugin_blog_import.py @ 1825:4e51f21c687f
plugin blog import: this plugin is the base for handling blog importers:
- blog importers are plugins which register to this one with the "register" method
- blog import manages uploading the given blog data to a PubSub server
- filters can be used
- current filters fix the <img> src without host, and upload images to the XMPP server
author | Goffi <goffi@goffi.org>
---|---
date | Fri, 22 Jan 2016 20:24:17 +0100
parents |
children | 68c0dc13d821
#!/usr/bin/python
# -*- coding: utf-8 -*-

# SàT plugin for importing external blogs
# Copyright (C) 2009-2016 Jérôme Poisson (goffi@goffi.org)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from sat.core.i18n import _
from sat.core.constants import Const as C
from sat.core.log import getLogger
log = getLogger(__name__)
from twisted.internet import defer
from twisted.web import client as web_client
from twisted.words.protocols.jabber import jid
from twisted.words.xish import domish
from sat.core import exceptions
from sat.tools import xml_tools
import collections
import os
import os.path
import tempfile
import urlparse
import uuid


PLUGIN_INFO = {
    "name": "blog import",
    "import_name": "BLOG_IMPORT",
    "type": C.PLUG_TYPE_BLOG,
    "dependencies": ["XEP-0277", "TEXT-SYNTAXES", "UPLOAD"],
    "main": "BlogImportPlugin",
    "handler": "no",
    "description": _(u"""Blog import management:
This plugin manages the different blog importers which can register to it, and handles generic import tasks.""")
}

OPT_HOST = 'host'
OPT_UPLOAD_IMAGES = 'upload_images'
OPT_UPLOAD_IGNORE_HOST = 'upload_ignore_host'
OPT_IGNORE_TLS = 'ignore_tls_errors'
BOOL_OPTIONS = (OPT_UPLOAD_IMAGES, OPT_IGNORE_TLS)


BlogImporter = collections.namedtuple('BlogImporter', ('callback', 'short_desc', 'long_desc'))


class BlogImportPlugin(object):

    def __init__(self, host):
        log.info(_("plugin Blog Import initialization"))
        self.host = host
        self._importers = {}
        self._u = host.plugins['UPLOAD']
        self._m = host.plugins['XEP-0277']
        self._s = self.host.plugins['TEXT-SYNTAXES']
        host.bridge.addMethod("blogImport", ".plugin", in_sign='ssa{ss}ss', out_sign='', method=self._blogImport, async=True)
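        # in_sign 'ssa{ss}ss' above maps to _blogImport's parameters:
        # name, location, options (dict of unicode to unicode), pubsub_service, profile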
        host.bridge.addMethod("blogImportList", ".plugin", in_sign='', out_sign='a(ss)', method=self.listImporters)
        host.bridge.addMethod("blogImportDesc", ".plugin", in_sign='s', out_sign='(ss)', method=self.getDescription)

    def listImporters(self):
        importers = self._importers.keys()
        importers.sort()
        return [(name, self._importers[name].short_desc) for name in importers]

    def getDescription(self, name):
        """Return import short and long descriptions

        @param name(unicode): blog importer name
        @return (tuple[unicode,unicode]): short and long description
        """
        try:
            importer = self._importers[name]
        except KeyError:
            raise exceptions.NotFound(u"Blog importer not found [{}]".format(name))
        else:
            return importer.short_desc, importer.long_desc

    def _blogImport(self, name, location, options, pubsub_service='', profile=C.PROF_KEY_DEFAULT):
        client = self.host.getClient(profile)
        for option in BOOL_OPTIONS:
            try:
                options[option] = C.bool(options[option])
            except KeyError:
                pass
        return self.blogImport(client, name, location, options, jid.JID(pubsub_service) if pubsub_service else None)

    @defer.inlineCallbacks
    def blogImport(self, client, name, location, options=None, pubsub_service=None):
        """Import a blog

        @param name(unicode): name of the blog importer
        @param location(unicode): location of the blog data to import
            can be an url, a file path, or anything which makes sense
            check the importer description for more details
        @param options(dict, None): extra options. Below are the generic options,
            blog importers can have specific ones. All options have unicode values
            generic options:
                - OPT_HOST (unicode): original host
                - OPT_UPLOAD_IMAGES (bool): upload images to the XMPP server if True
                    see OPT_UPLOAD_IGNORE_HOST.
                    Default: True
                - OPT_UPLOAD_IGNORE_HOST (unicode): don't upload images from this host
                - OPT_IGNORE_TLS (bool): ignore TLS errors for image upload.
                    Default: False
        @param pubsub_service(jid.JID, None): jid of the PubSub service where the blog must be imported
            None to use the profile's server
        """
        if options is None:
            options = {}
        else:
            for opt_name, opt_default in ((OPT_UPLOAD_IMAGES, True),
                                          (OPT_IGNORE_TLS, False)):
                # we want a filled options dict, with all empty or False values removed
                try:
                    value = options[opt_name]
                except KeyError:
                    if opt_default:
                        options[opt_name] = opt_default
                else:
                    if not value:
                        del options[opt_name]
        try:
            importer = self._importers[name]
        except KeyError:
            raise exceptions.NotFound(u"Importer [{}] not found".format(name))
        posts_data = yield importer.callback(client, location, options)
        url_links = {}
        yield self._recursiveImport(client, posts_data, options, url_links, service=pubsub_service)

    @defer.inlineCallbacks
    def _recursiveImport(self, client, posts_data, options, url_links, service=None, node=None, depth=0):
        """Do the upload recursively

        @param posts_data(list): list of data as specified in [register]
        @param options(dict): import options
        @param url_links(dict): link between former posts and new items
        @param service(jid.JID, None): PubSub service to use
        @param node(unicode, None): PubSub node to use
        @param depth(int): level of recursion
        """
        for data in posts_data:
            # data checks/filters
            mb_data = data['blog']
            try:
                item_id = mb_data['id']
            except KeyError:
                item_id = mb_data['id'] = unicode(uuid.uuid4())

            try:
                # we keep the link between old url and new blog item
                # so the user can redirect their former blog urls
                old_url = data['url']
            except KeyError:
                pass
            else:
                url_links[old_url] = (service, node, item_id)
                log.info(u"url link from {old} to {service}/{node}/{id}".format(
                    old=old_url, service=service or u'server', node=node or u'', id=item_id))

            depth or log.debug(u"Filtering data")
            yield self.blogFilters(client, mb_data, options)

            # comments data
            if len(data['comments']) != 1:
                raise NotImplementedError(u"can't manage multiple comment links")
            allow_comments = C.bool(mb_data.get('allow_comments', C.BOOL_FALSE))
            if allow_comments:
                comments_service, comments_node = self._m.getCommentService(client), self._m.getCommentNode(item_id)
                mb_data['comments_service'] = comments_service
                mb_data['comments_node'] = comments_node
            else:
                if data['comments'][0]:
                    raise exceptions.DataError(u"allow_comments set to False, but comments are there")

            # post upload
            depth or log.debug(u"uploading item [{id}]: {title}".format(id=mb_data['id'], title=mb_data.get('title', '')))
            yield self._m.send(mb_data, service, node, profile=client.profile)

            # comments upload
            depth or log.debug(u"uploading comments")
            if allow_comments:
                yield self._recursiveImport(client, data['comments'][0], options, url_links, service=comments_service, node=comments_node, depth=depth+1)

    @defer.inlineCallbacks
    def blogFilters(self, client, mb_data, options):
        """Apply filters according to options

        modify mb_data in place
        @param mb_data(dict): microblog data of a single post, as returned by the importer callback
        @param options(dict): dict as given in [blogImport]
        """
        # FIXME: blog filters don't work on text content
        # TODO: text => XHTML conversion should handle links with <a/>
        #       filters can then be used by converting text to XHTML
        if not options:
            return

        # we want only XHTML content
        for prefix in ('content',):  # a tuple is used, in case title needs to be added in the future
            try:
                rich = mb_data['{}_rich'.format(prefix)]
            except KeyError:
                pass
            else:
                if '{}_xhtml'.format(prefix) in mb_data:
                    raise exceptions.DataError(u"importer gave {prefix}_rich and {prefix}_xhtml at the same time, this is not allowed".format(prefix=prefix))
                # we convert rich syntax to XHTML here, so we can handle filters easily
                converted = yield self._s.convert(rich, self._s.getCurrentSyntax(client.profile), safe=False)
                mb_data['{}_xhtml'.format(prefix)] = converted
                del mb_data['{}_rich'.format(prefix)]

            try:
                mb_data['{}_text'.format(prefix)]
            except KeyError:
                pass
            else:
                if '{}_xhtml'.format(prefix) in mb_data:
                    log.warning(u"{prefix}_text will be replaced by converted {prefix}_xhtml, so filters can be handled".format(prefix=prefix))
                    del mb_data['{}_text'.format(prefix)]
                else:
                    log.warning(u"importer gave a text {prefix}, blog filters don't work on text {prefix}".format(prefix=prefix))
                    return

        # at this point, we have only the XHTML version of the content
        try:
            top_elt = xml_tools.ElementParser()(mb_data['content_xhtml'], namespace=C.NS_XHTML)
        except domish.ParserError:
            # we clean the xml and try our luck again
            cleaned = yield self._s.cleanXHTML(mb_data['content_xhtml'])
            top_elt = xml_tools.ElementParser()(cleaned, namespace=C.NS_XHTML)
        opt_host = options.get(OPT_HOST)
        if opt_host:
            # we normalise the domain
            parsed_host = urlparse.urlsplit(opt_host)
            opt_host = urlparse.urlunsplit((parsed_host.scheme or 'http', parsed_host.netloc or parsed_host.path, '', '', ''))
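            # e.g. (illustrative) u'www.example.net' and u'http://www.example.net/blog'
            # both become u'http://www.example.net'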

        tmp_dir = tempfile.mkdtemp()
        for img_elt in xml_tools.findAll(top_elt, ['img']):
            yield self.imgFilters(client, img_elt, options, opt_host, tmp_dir)
        os.rmdir(tmp_dir)  # XXX: tmp_dir should be empty, or something went wrong

        # we now replace the content with the filtered one
        mb_data['content_xhtml'] = top_elt.toXml()

    @defer.inlineCallbacks
    def imgFilters(self, client, img_elt, options, opt_host, tmp_dir):
        """Filters handling images

        urls without host are fixed (if possible)
        according to options, images are uploaded to the XMPP server
        @param img_elt(domish.Element): <img/> element to handle
        @param options(dict): filters options
        @param opt_host(unicode): normalised host given in options
        @param tmp_dir(str): path to temp directory
        """
        try:
            url = img_elt['src']
            if url[0] == u'/':
                if not opt_host:
                    log.warning(u"host was not specified, we can't deal with src without host ({url}) and have to ignore the following <img/>:\n{xml}"
                                .format(url=url, xml=img_elt.toXml()))
                    return
                else:
                    url = urlparse.urljoin(opt_host, url)
            filename = url.rsplit('/', 1)[-1].strip()
            if not filename:
                raise KeyError
        except (KeyError, IndexError):
            log.warning(u"ignoring invalid img element: {}".format(img_elt.toXml()))
            return

        # we replace the url with the normalized one
        img_elt['src'] = url

        if options.get(OPT_UPLOAD_IMAGES, False):
            # upload is requested
            try:
                ignore_host = options[OPT_UPLOAD_IGNORE_HOST]
            except KeyError:
                pass
            else:
                # if the host is the ignored one, we skip the upload
                parsed_url = urlparse.urlsplit(url)
                if ignore_host in parsed_url.hostname:
                    log.info(u"Don't upload image at {url} because of {opt} option".format(
                        url=url, opt=OPT_UPLOAD_IGNORE_HOST))
                    return

            # we download images and re-upload them via XMPP
            tmp_file = os.path.join(tmp_dir, filename).encode('utf-8')
            upload_options = {'ignore_tls_errors': options.get(OPT_IGNORE_TLS, False)}

            try:
                yield web_client.downloadPage(url.encode('utf-8'), tmp_file)
                filename = filename.replace(u'%', u'_')  # FIXME: tmp workaround for a bug in prosody http upload
                dummy, download_d = yield self._u.upload(client, tmp_file, filename, options=upload_options)
                download_url = yield download_d
            except Exception as e:
                log.warning(u"can't download image at {url}: {reason}".format(url=url, reason=e))
            else:
                img_elt['src'] = download_url

            try:
                os.unlink(tmp_file)
            except OSError:
                pass

    def register(self, name, callback, short_desc='', long_desc=''):
        """Register a blogImport method

        @param name(unicode): unique importer name; it should indicate the blogging software it handles and always be lowercase
        @param callback(callable): method to call:
            the signature must be (client, location, options) (cf. [blogImport])
            the importer must return an iterable of dicts which must have the following keys:
                'blog' (dict): microblog data of the blog post (cf. http://wiki.goffi.org/wiki/Bridge_API_-_Microblogging/en)
                    the importer MUST NOT create the node or call the XEP-0277 plugin itself
                    the 'comments*' keys MUST NOT be used in this microblog data, see below for comments
                    It is recommended to use a unique id in the "id" key which is constant per blog item,
                    so if the import fails, a new import will overwrite the failed items and avoid duplicates.

                'comments' (list[list[dict]], None): the dicts must have the same keys as the main item (i.e. 'blog' and 'comments')
                    a list of lists is used because XEP-0277 can handle several comments nodes,
                    but in most cases, there will be only one item in the first list (something like [[{comment1_data},{comment2_data}, ...]])
                    blog['allow_comments'] must be True if there is any comment, and False (or not present) if comments are not allowed.
                    If allow_comments is False and some comments are present, an exceptions.DataError will be raised
            each dict MAY optionally have the following keys:
                'url' (unicode): former url of the post (only the path, without the host part)
                    if present, the association with the new path will be displayed to the user, so they can set up redirections if necessary

            Optionally, the importer plugin can return a tuple with the just described iterable and a post_total,
            where "post_total" (int) indicates the total number of posts (without comments);
            useful to display a progress indicator when the iterable is a generator
        @param short_desc(unicode): one line description of the importer
        @param long_desc(unicode): long description of the importer, its options, etc.
        """
        name = name.lower()
        if name in self._importers:
            raise exceptions.ConflictError(u"A blog importer with the name {} already exists".format(name))
        self._importers[name] = BlogImporter(callback, short_desc, long_desc)

    def unregister(self, name):
        del self._importers[name]
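

# ---------------------------------------------------------------------------
# Illustrative sketch (not part of this changeset): how a hypothetical
# importer plugin could register itself with BLOG_IMPORT and what its
# callback is expected to return. All names and data below (DummyImportPlugin,
# the "dummy" importer, the post content) are invented for illustration; a
# real importer would live in its own plugin module, declare its own
# PLUGIN_INFO with "BLOG_IMPORT" in its dependencies, and parse the blog data
# found at `location` instead of returning hard-coded values.

class DummyImportPlugin(object):

    def __init__(self, host):
        self.host = host
        # register the callback; the name must be unique and lowercase
        host.plugins['BLOG_IMPORT'].register('dummy', self.dummyImport,
                                             short_desc=u'dummy importer',
                                             long_desc=u'imports a single hard-coded post')

    def dummyImport(self, client, location, options=None):
        # return an iterable of dicts, each with 'blog' and 'comments' keys
        # (and optionally 'url'); no node creation or XEP-0277 call here
        return [{
            'blog': {'id': u'first-post',
                     'title': u'First post',
                     'content_xhtml': u'<div><p>hello world</p></div>',
                     'allow_comments': C.BOOL_FALSE},
            'comments': [[]],  # exactly one comments node, here with no comments
            'url': u'/2016/01/first-post',
        }]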