view sat/plugins/plugin_blog_import_dokuwiki.py @ 3728:b15644cae50d

component AP gateway: JID/node ⟺ AP outbox conversion: - convert a combination of JID and optional pubsub node to AP actor handle (see `getJIDAndNode` for details) and vice versa - the gateway now provides a Pubsub service - retrieve pubsub node and convert it to AP collection, AP pagination is converted to RSM - do the opposite: convert AP collection to pubsub and handle RSM request. Due to ActivityStream collection pagination limitations, some RSM request produce inefficient requests, but caching should be used most of the time in the future and avoid the problem. - set specific name to HTTP Server - new `local_only` setting (`True` by default) to indicate if the gateway can request or not XMPP Pubsub nodes from other servers - disco info now specifies important features such as Pubsub RSM, and nodes metadata ticket 363
author Goffi <goffi@goffi.org>
date Tue, 25 Jan 2022 17:54:06 +0100
parents be6d91572633
children 524856bd7b19
line wrap: on
line source

#!/usr/bin/env python3


# SàT plugin to import dokuwiki blogs
# Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)
# Copyright (C) 2013-2016 Adrien Cossa (souliane@mailoo.org)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

from sat.core.i18n import _, D_
from sat.core.constants import Const as C
from sat.core.log import getLogger

log = getLogger(__name__)
from sat.core import exceptions
from sat.tools import xml_tools
from twisted.internet import threads
from collections import OrderedDict
import calendar
import urllib.request, urllib.parse, urllib.error
import urllib.parse
import tempfile
import re
import time
import os.path

try:
    from dokuwiki import DokuWiki, DokuWikiError  # this is a new dependency
except ImportError:
    raise exceptions.MissingModule(
        'Missing module dokuwiki, please install it with "pip install dokuwiki"'
    )
try:
    from PIL import Image  # this is already needed by plugin XEP-0054
except:
    raise exceptions.MissingModule(
        "Missing module pillow, please download/install it from https://python-pillow.github.io"
    )

PLUGIN_INFO = {
    C.PI_NAME: "Dokuwiki import",
    C.PI_IMPORT_NAME: "IMPORT_DOKUWIKI",
    C.PI_TYPE: C.PLUG_TYPE_BLOG,
    C.PI_DEPENDENCIES: ["BLOG_IMPORT"],
    C.PI_MAIN: "DokuwikiImport",
    C.PI_HANDLER: "no",
    C.PI_DESCRIPTION: _("""Blog importer for Dokuwiki blog engine."""),
}

SHORT_DESC = D_("import posts from Dokuwiki blog engine")

LONG_DESC = D_(
    """This importer handle Dokuwiki blog engine.

To use it, you need an admin access to a running Dokuwiki website
(local or on the Internet). The importer retrieves the data using
the XMLRPC Dokuwiki API.

You can specify a namespace (that could be a namespace directory
or a single post) or leave it empty to use the root namespace "/"
and import all the posts.

You can specify a new media repository to modify the internal
media links and make them point to the URL of your choice, but
note that the upload is not done automatically: a temporary
directory will be created on your local drive and you will
need to upload it yourself to your repository via SSH or FTP.

Following options are recognized:

location: DokuWiki site URL
user: DokuWiki admin user
passwd: DokuWiki admin password
namespace: DokuWiki namespace to import (default: root namespace "/")
media_repo: URL to the new remote media repository (default: none)
limit: maximal number of posts to import (default: 100)

Example of usage (with jp frontend):

jp import dokuwiki -p dave --pwd xxxxxx --connect
    http://127.0.1.1 -o user souliane -o passwd qwertz
    -o namespace public:2015:10
    -o media_repo http://media.diekulturvermittlung.at

This retrieves the 100 last blog posts from http://127.0.1.1 that
are inside the namespace "public:2015:10" using the Dokuwiki user
"souliane", and it imports them to sat profile dave's microblog node.
Internal Dokuwiki media that were hosted on http://127.0.1.1 are now
pointing to http://media.diekulturvermittlung.at.
"""
)
DEFAULT_MEDIA_REPO = ""
DEFAULT_NAMESPACE = "/"
DEFAULT_LIMIT = 100  # you might get a DBUS timeout (no reply) if it lasts too long


class Importer(DokuWiki):
    def __init__(
        self, url, user, passwd, media_repo=DEFAULT_MEDIA_REPO, limit=DEFAULT_LIMIT
    ):
        """

        @param url (unicode): DokuWiki site URL
        @param user (unicode): DokuWiki admin user
        @param passwd (unicode): DokuWiki admin password
        @param media_repo (unicode): New remote media repository
        """
        DokuWiki.__init__(self, url, user, passwd)
        self.url = url
        self.media_repo = media_repo
        self.temp_dir = tempfile.mkdtemp() if self.media_repo else None
        self.limit = limit
        self.posts_data = OrderedDict()

    def getPostId(self, post):
        """Return a unique and constant post id

        @param post(dict): parsed post data
        @return (unicode): post unique item id
        """
        return str(post["id"])

    def getPostUpdated(self, post):
        """Return the update date.

        @param post(dict): parsed post data
        @return (unicode): update date
        """
        return str(post["mtime"])

    def getPostPublished(self, post):
        """Try to parse the date from the message ID, else use "mtime".

        The date can be extracted if the message ID looks like one of:
            - namespace:YYMMDD_short_title
            - namespace:YYYYMMDD_short_title
        @param post (dict):  parsed post data
        @return (unicode): publication date
        """
        id_, default = str(post["id"]), str(post["mtime"])
        try:
            date = id_.split(":")[-1].split("_")[0]
        except KeyError:
            return default
        try:
            time_struct = time.strptime(date, "%y%m%d")
        except ValueError:
            try:
                time_struct = time.strptime(date, "%Y%m%d")
            except ValueError:
                return default
        return str(calendar.timegm(time_struct))

    def processPost(self, post, profile_jid):
        """Process a single page.

        @param post (dict): parsed post data
        @param profile_jid
        """
        # get main information
        id_ = self.getPostId(post)
        updated = self.getPostUpdated(post)
        published = self.getPostPublished(post)

        # manage links
        backlinks = self.pages.backlinks(id_)
        for link in self.pages.links(id_):
            if link["type"] != "extern":
                assert link["type"] == "local"
                page = link["page"]
                backlinks.append(page[1:] if page.startswith(":") else page)

        self.pages.get(id_)
        content_xhtml = self.processContent(self.pages.html(id_), backlinks, profile_jid)

        # XXX: title is already in content_xhtml and difficult to remove, so leave it
        # title = content.split("\n")[0].strip(u"\ufeff= ")

        # build the extra data dictionary
        mb_data = {
            "id": id_,
            "published": published,
            "updated": updated,
            "author": profile_jid.user,
            # "content": content,  # when passed, it is displayed in Libervia instead of content_xhtml
            "content_xhtml": content_xhtml,
            # "title": title,
            "allow_comments": "true",
        }

        # find out if the message access is public or restricted
        namespace = id_.split(":")[0]
        if namespace and namespace.lower() not in ("public", "/"):
            mb_data["group"] = namespace  # roster group must exist

        self.posts_data[id_] = {"blog": mb_data, "comments": [[]]}

    def process(self, client, namespace=DEFAULT_NAMESPACE):
        """Process a namespace or a single page.

        @param namespace (unicode): DokuWiki namespace (or page) to import
        """
        profile_jid = client.jid
        log.info("Importing data from DokuWiki %s" % self.version)
        try:
            pages_list = self.pages.list(namespace)
        except DokuWikiError:
            log.warning(
                'Could not list Dokuwiki pages: please turn the "display_errors" setting to "Off" in the php.ini of the webserver hosting DokuWiki.'
            )
            return

        if not pages_list:  # namespace is actually a page?
            names = namespace.split(":")
            real_namespace = ":".join(names[0:-1])
            pages_list = self.pages.list(real_namespace)
            pages_list = [page for page in pages_list if page["id"] == namespace]
            namespace = real_namespace

        count = 0
        for page in pages_list:
            self.processPost(page, profile_jid)
            count += 1
            if count >= self.limit:
                break

        return (iter(self.posts_data.values()), len(self.posts_data))

    def processContent(self, text, backlinks, profile_jid):
        """Do text substitutions and file copy.

        @param text (unicode): message content
        @param backlinks (list[unicode]): list of backlinks
        """
        text = text.strip("\ufeff")  # this is at the beginning of the file (BOM)

        for backlink in backlinks:
            src = '/doku.php?id=%s"' % backlink
            tgt = '/blog/%s/%s" target="#"' % (profile_jid.user, backlink)
            text = text.replace(src, tgt)

        subs = {}

        link_pattern = r"""<(img|a)[^>]* (src|href)="([^"]+)"[^>]*>"""
        for tag in re.finditer(link_pattern, text):
            type_, attr, link = tag.group(1), tag.group(2), tag.group(3)
            assert (type_ == "img" and attr == "src") or (type_ == "a" and attr == "href")
            if re.match(r"^\w*://", link):  # absolute URL to link directly
                continue
            if self.media_repo:
                self.moveMedia(link, subs)
            elif link not in subs:
                subs[link] = urllib.parse.urljoin(self.url, link)

        for url, new_url in subs.items():
            text = text.replace(url, new_url)
        return text

    def moveMedia(self, link, subs):
        """Move a media from the DokuWiki host to the new repository.

        This also updates the hyperlinks to internal media files.
        @param link (unicode): media link
        @param subs (dict): substitutions data
        """
        url = urllib.parse.urljoin(self.url, link)
        user_media = re.match(r"(/lib/exe/\w+.php\?)(.*)", link)
        thumb_width = None

        if user_media:  # media that has been added by the user
            params = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
            try:
                media = params["media"][0]
            except KeyError:
                log.warning("No media found in fetch URL: %s" % user_media.group(2))
                return
            if re.match(r"^\w*://", media):  # external URL to link directly
                subs[link] = media
                return
            try:  # create thumbnail
                thumb_width = params["w"][0]
            except KeyError:
                pass

            filename = media.replace(":", "/")
            # XXX: avoid "precondition failed" error (only keep the media parameter)
            url = urllib.parse.urljoin(self.url, "/lib/exe/fetch.php?media=%s" % media)

        elif link.startswith("/lib/plugins/"):
            # other link added by a plugin or something else
            filename = link[13:]
        else:  # fake alert... there's no media (or we don't handle it yet)
            return

        filepath = os.path.join(self.temp_dir, filename)
        self.downloadMedia(url, filepath)

        if thumb_width:
            filename = os.path.join("thumbs", thumb_width, filename)
            thumbnail = os.path.join(self.temp_dir, filename)
            self.createThumbnail(filepath, thumbnail, thumb_width)

        new_url = os.path.join(self.media_repo, filename)
        subs[link] = new_url

    def downloadMedia(self, source, dest):
        """Copy media to localhost.

        @param source (unicode): source url
        @param dest (unicode): target path
        """
        dirname = os.path.dirname(dest)
        if not os.path.exists(dest):
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            urllib.request.urlretrieve(source, dest)
            log.debug("DokuWiki media file copied to %s" % dest)

    def createThumbnail(self, source, dest, width):
        """Create a thumbnail.

        @param source (unicode): source file path
        @param dest (unicode): destination file path
        @param width (unicode): thumbnail's width
        """
        thumb_dir = os.path.dirname(dest)
        if not os.path.exists(thumb_dir):
            os.makedirs(thumb_dir)
        try:
            im = Image.open(source)
            im.thumbnail((width, int(width) * im.size[0] / im.size[1]))
            im.save(dest)
            log.debug("DokuWiki media thumbnail created: %s" % dest)
        except IOError:
            log.error("Cannot create DokuWiki media thumbnail %s" % dest)


class DokuwikiImport(object):
    def __init__(self, host):
        log.info(_("plugin Dokuwiki Import initialization"))
        self.host = host
        self._blog_import = host.plugins["BLOG_IMPORT"]
        self._blog_import.register("dokuwiki", self.DkImport, SHORT_DESC, LONG_DESC)

    def DkImport(self, client, location, options=None):
        """Import from DokuWiki to PubSub

        @param location (unicode): DokuWiki site URL
        @param options (dict, None): DokuWiki import parameters
            - user (unicode): DokuWiki admin user
            - passwd (unicode): DokuWiki admin password
            - namespace (unicode): DokuWiki namespace to import
            - media_repo (unicode): New remote media repository
        """
        options[self._blog_import.OPT_HOST] = location
        try:
            user = options["user"]
        except KeyError:
            raise exceptions.DataError('parameter "user" is required')
        try:
            passwd = options["passwd"]
        except KeyError:
            raise exceptions.DataError('parameter "passwd" is required')

        opt_upload_images = options.get(self._blog_import.OPT_UPLOAD_IMAGES, None)
        try:
            media_repo = options["media_repo"]
            if opt_upload_images:
                options[
                    self._blog_import.OPT_UPLOAD_IMAGES
                ] = False  # force using --no-images-upload
            info_msg = _(
                "DokuWiki media files will be *downloaded* to {temp_dir} - to finish the import you have to upload them *manually* to {media_repo}"
            )
        except KeyError:
            media_repo = DEFAULT_MEDIA_REPO
            if opt_upload_images:
                info_msg = _(
                    "DokuWiki media files will be *uploaded* to the XMPP server. Hyperlinks to these media may not been updated though."
                )
            else:
                info_msg = _(
                    "DokuWiki media files will *stay* on {location} - some of them may be protected by DokuWiki ACL and will not be accessible."
                )

        try:
            namespace = options["namespace"]
        except KeyError:
            namespace = DEFAULT_NAMESPACE
        try:
            limit = options["limit"]
        except KeyError:
            limit = DEFAULT_LIMIT

        dk_importer = Importer(location, user, passwd, media_repo, limit)
        info_msg = info_msg.format(
            temp_dir=dk_importer.temp_dir, media_repo=media_repo, location=location
        )
        self.host.actionNew(
            {"xmlui": xml_tools.note(info_msg).toXml()}, profile=client.profile
        )
        d = threads.deferToThread(dk_importer.process, client, namespace)
        return d