diff libervia/backend/plugins/plugin_blog_import.py @ 4071:4b842c1fb686

refactoring: renamed `sat` package to `libervia.backend`
author Goffi <goffi@goffi.org>
date Fri, 02 Jun 2023 11:49:51 +0200
parents sat/plugins/plugin_blog_import.py@524856bd7b19
children
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libervia/backend/plugins/plugin_blog_import.py	Fri Jun 02 11:49:51 2023 +0200
@@ -0,0 +1,323 @@
+#!/usr/bin/env python3
+
+
+# SàT plugin for importing external blogs
+# Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+
+from libervia.backend.core.i18n import _
+from libervia.backend.core.constants import Const as C
+from libervia.backend.core.log import getLogger
+
+log = getLogger(__name__)
+from twisted.internet import defer
+from twisted.web import client as web_client
+from twisted.words.xish import domish
+from libervia.backend.core import exceptions
+from libervia.backend.tools import xml_tools
+import os
+import os.path
+import tempfile
+import urllib.parse
+import shortuuid
+
+
+PLUGIN_INFO = {
+    C.PI_NAME: "blog import",
+    C.PI_IMPORT_NAME: "BLOG_IMPORT",
+    C.PI_TYPE: (C.PLUG_TYPE_BLOG, C.PLUG_TYPE_IMPORT),
+    C.PI_DEPENDENCIES: ["IMPORT", "XEP-0060", "XEP-0277", "TEXT_SYNTAXES", "UPLOAD"],
+    C.PI_MAIN: "BlogImportPlugin",
+    C.PI_HANDLER: "no",
+    C.PI_DESCRIPTION: _(
+        """Blog import management:
+This plugin manages the different blog importers which can register to it, and handles generic import tasks."""
+    ),
+}
+
+OPT_HOST = "host"
+OPT_UPLOAD_IMAGES = "upload_images"
+OPT_UPLOAD_IGNORE_HOST = "upload_ignore_host"
+OPT_IGNORE_TLS = "ignore_tls_errors"
+URL_REDIRECT_PREFIX = "url_redirect_"
+
+
+class BlogImportPlugin(object):
+    BOOL_OPTIONS = (OPT_UPLOAD_IMAGES, OPT_IGNORE_TLS)
+    JSON_OPTIONS = ()
+    OPT_DEFAULTS = {OPT_UPLOAD_IMAGES: True, OPT_IGNORE_TLS: False}
+
+    def __init__(self, host):
+        log.info(_("plugin Blog import initialization"))
+        self.host = host
+        self._u = host.plugins["UPLOAD"]
+        self._p = host.plugins["XEP-0060"]
+        self._m = host.plugins["XEP-0277"]
+        self._s = self.host.plugins["TEXT_SYNTAXES"]
+        host.plugins["IMPORT"].initialize(self, "blog")
+
+    def import_item(
+        self, client, item_import_data, session, options, return_data, service, node
+    ):
+        """import_item specialized for blog import
+
+        @param item_import_data(dict):
+            * mandatory keys:
+                'blog' (dict): microblog data of the blog post (cf. http://wiki.goffi.org/wiki/Bridge_API_-_Microblogging/en)
+                    the importer MUST NOT create the node or call the XEP-0277 plugin itself
+                    'comments*' keys MUST NOT be used in this microblog_data, see below for comments
+                    It is recommended to use a unique id in the "id" key which is constant per blog item,
+                    so if the import fails, a new import will overwrite the failed items and avoid duplicates.
+
+                'comments' (list[list[dict]], None): dictionaries must have the same keys as the main item (i.e. 'blog' and 'comments')
+                    a list of lists is used because XEP-0277 can handle several comments nodes,
+                    but in most cases there will be only one inner list (something like [[{comment1_data}, {comment2_data}, ...]])
+                    blog['allow_comments'] must be True if there is any comment, and False (or not present) if comments are not allowed.
+                    If allow_comments is False and some comments are present, an exceptions.DataError will be raised
+            * optional keys:
+                'url' (unicode): former URL of the post (only the path, without the host part)
+                    if present, the association to the new path will be displayed to the user, so they can set up redirections if necessary
+        @param options(dict, None): import options. Below are the generic ones,
+            blog importers can have specific ones. All option values are unicode.
+            generic options:
+                - OPT_HOST (unicode): original host
+                - OPT_UPLOAD_IMAGES (bool): upload images to XMPP server if True
+                    see OPT_UPLOAD_IGNORE_HOST.
+                    Default: True
+                - OPT_UPLOAD_IGNORE_HOST (unicode): don't upload images from this host
+                - OPT_IGNORE_TLS (bool): ignore TLS error for image upload.
+                    Default: False
+        @param return_data(dict): will contain link between former posts and new items
+
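+        Example of a minimal item_import_data (hypothetical values, for
+        illustration only):
+            {
+                "blog": {
+                    "id": "myblog-2015-05-01-hello-world",
+                    "title": "Hello world",
+                    "content_xhtml": "<div><p>first post!</p></div>",
+                    "allow_comments": "true",
+                },
+                "comments": [[]],
+                "url": "/2015/05/hello-world",
+            }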
+        """
+        mb_data = item_import_data["blog"]
+        try:
+            item_id = mb_data["id"]
+        except KeyError:
+            item_id = mb_data["id"] = str(shortuuid.uuid())
+
+        try:
+            # we keep the link between the old URL and the new blog item,
+            # so the user can redirect their former blog URLs
+            old_uri = item_import_data["url"]
+        except KeyError:
+            pass
+        else:
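+            # the association is exposed via return_data, e.g. (hypothetical values):
+            #     return_data["url_redirect_/2015/05/hello-world"] = new_uri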
+            new_uri = return_data[URL_REDIRECT_PREFIX + old_uri] = self._p.get_node_uri(
+                service if service is not None else client.jid.userhostJID(),
+                node or self._m.namespace,
+                item_id,
+            )
+            log.info("url link from {old} to {new}".format(old=old_uri, new=new_uri))
+
+        return mb_data
+
+    @defer.inlineCallbacks
+    def import_sub_items(self, client, item_import_data, mb_data, session, options):
+        # comments data
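+        # item_import_data["comments"] is a list of lists (see import_item);
+        # only a single comments node is handled here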
+        if len(item_import_data["comments"]) != 1:
+            raise NotImplementedError("can't manage multiple comment links")
+        allow_comments = C.bool(mb_data.get("allow_comments", C.BOOL_FALSE))
+        if allow_comments:
+            comments_service = yield self._m.get_comments_service(client)
+            comments_node = self._m.get_comments_node(mb_data["id"])
+            mb_data["comments_service"] = comments_service.full()
+            mb_data["comments_node"] = comments_node
+            recurse_kwargs = {
+                "items_import_data": item_import_data["comments"][0],
+                "service": comments_service,
+                "node": comments_node,
+            }
+            defer.returnValue(recurse_kwargs)
+        else:
+            if item_import_data["comments"][0]:
+                raise exceptions.DataError(
+                    "allow_comments set to False, but comments are there"
+                )
+            defer.returnValue(None)
+
+    def publish_item(self, client, mb_data, service, node, session):
+        log.debug(
+            "uploading item [{id}]: {title}".format(
+                id=mb_data["id"], title=mb_data.get("title", "")
+            )
+        )
+        return self._m.send(client, mb_data, service, node)
+
+    @defer.inlineCallbacks
+    def item_filters(self, client, mb_data, session, options):
+        """Apply filters according to options
+
+        Modifies mb_data in place.
+        @param mb_data(dict): microblog data as returned by the importer callback
+        @param options(dict): dict as given in [blogImport]
+        """
+        # FIXME: blog filters don't work on text content
+        # TODO: text => XHTML conversion should handle links with <a/>
+        #       filters can then be used by converting text to XHTML
+        if not options:
+            return
+
+        # we want only XHTML content
+        for prefix in (
+            "content",
+        ):  # a tuple is used, in case title needs to be added in the future
+            try:
+                rich = mb_data["{}_rich".format(prefix)]
+            except KeyError:
+                pass
+            else:
+                if "{}_xhtml".format(prefix) in mb_data:
+                    raise exceptions.DataError(
+                        "importer gave {prefix}_rich and {prefix}_xhtml at the same time, this is not allowed".format(
+                            prefix=prefix
+                        )
+                    )
+                # we convert rich syntax to XHTML here, so we can handle filters easily
+                converted = yield self._s.convert(
+                    rich, self._s.get_current_syntax(client.profile), safe=False
+                )
+                mb_data["{}_xhtml".format(prefix)] = converted
+                del mb_data["{}_rich".format(prefix)]
+
+            try:
+                mb_data["txt"]
+            except KeyError:
+                pass
+            else:
+                if "{}_xhtml".format(prefix) in mb_data:
+                    log.warning(
+                        "{prefix}_text will be replaced by converted {prefix}_xhtml, so filters can be handled".format(
+                            prefix=prefix
+                        )
+                    )
+                    del mb_data["{}_text".format(prefix)]
+                else:
+                    log.warning(
+                        "importer gave a text {prefix}, blog filters don't work on text {prefix}".format(
+                            prefix=prefix
+                        )
+                    )
+                    return
+
+        # at this point, we have only XHTML version of content
+        try:
+            top_elt = xml_tools.ElementParser()(
+                mb_data["content_xhtml"], namespace=C.NS_XHTML
+            )
+        except domish.ParserError:
+            # we clean the XML and try our luck again
+            cleaned = yield self._s.clean_xhtml(mb_data["content_xhtml"])
+            top_elt = xml_tools.ElementParser()(cleaned, namespace=C.NS_XHTML)
+        opt_host = options.get(OPT_HOST)
+        if opt_host:
+            # we normalise the domain
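+            # e.g. (hypothetical values) "blog.example.net" becomes
+            # "http://blog.example.net", and "https://blog.example.net/path"
+            # becomes "https://blog.example.net"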
+            parsed_host = urllib.parse.urlsplit(opt_host)
+            opt_host = urllib.parse.urlunsplit(
+                (
+                    parsed_host.scheme or "http",
+                    parsed_host.netloc or parsed_host.path,
+                    "",
+                    "",
+                    "",
+                )
+            )
+
+        tmp_dir = tempfile.mkdtemp()
+        try:
+            # TODO: would be nice to also update the hyperlinks to these images, e.g. when you have <a href="{url}"><img src="{url}"></a>
+            for img_elt in xml_tools.find_all(top_elt, names=["img"]):
+                yield self.img_filters(client, img_elt, options, opt_host, tmp_dir)
+        finally:
+            os.rmdir(tmp_dir)  # XXX: tmp_dir should be empty, or something went wrong
+
+        # we now replace the content with filtered one
+        mb_data["content_xhtml"] = top_elt.toXml()
+
+    @defer.inlineCallbacks
+    def img_filters(self, client, img_elt, options, opt_host, tmp_dir):
+        """Filters handling images
+
+        URLs without a host are fixed (if possible).
+        According to options, images are uploaded to the XMPP server.
+        @param img_elt(domish.Element): <img/> element to handle
+        @param options(dict): filters options
+        @param opt_host(unicode): normalised host given in options
+        @param tmp_dir(str): path to temp directory
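+
+        For instance (hypothetical values): with opt_host set to
+        "http://blog.example.net", <img src="/pic.png"/> is rewritten to
+        <img src="http://blog.example.net/pic.png"/>; if upload is requested,
+        "src" is then replaced by the URL returned by the upload plugin.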
+        """
+        try:
+            url = img_elt["src"]
+            if url[0] == "/":
+                if not opt_host:
+                    log.warning(
+                        "host was not specified, we can't deal with src without host ({url}) and have to ignore the following <img/>:\n{xml}".format(
+                            url=url, xml=img_elt.toXml()
+                        )
+                    )
+                    return
+                else:
+                    url = urllib.parse.urljoin(opt_host, url)
+            filename = url.rsplit("/", 1)[-1].strip()
+            if not filename:
+                raise KeyError
+        except (KeyError, IndexError):
+            log.warning("ignoring invalid img element: {}".format(img_elt.toXml()))
+            return
+
+        # we replace the URL with the normalized one
+        img_elt["src"] = url
+
+        if options.get(OPT_UPLOAD_IMAGES, False):
+            # upload is requested
+            try:
+                ignore_host = options[OPT_UPLOAD_IGNORE_HOST]
+            except KeyError:
+                pass
+            else:
+                # if the host is the ignored one, we skip the upload
+                parsed_url = urllib.parse.urlsplit(url)
+                if ignore_host in parsed_url.hostname:
+                    log.info(
+                        "Don't upload image at {url} because of {opt} option".format(
+                            url=url, opt=OPT_UPLOAD_IGNORE_HOST
+                        )
+                    )
+                    return
+
+            # we download images and re-upload them via XMPP
+            tmp_file = os.path.join(tmp_dir, filename).encode("utf-8")
+            upload_options = {"ignore_tls_errors": options.get(OPT_IGNORE_TLS, False)}
+
+            try:
+                yield web_client.downloadPage(url.encode("utf-8"), tmp_file)
+                filename = filename.replace(
+                    "%", "_"
+                )  # FIXME: tmp workaround for a bug in prosody http upload
+                __, download_d = yield self._u.upload(
+                    client, tmp_file, filename, extra=upload_options
+                )
+                download_url = yield download_d
+            except Exception as e:
+                log.warning(
+                    "can't download image at {url}: {reason}".format(url=url, reason=e)
+                )
+            else:
+                img_elt["src"] = download_url
+
+            try:
+                os.unlink(tmp_file)
+            except OSError:
+                pass