diff src/plugins/plugin_blog_import_dokuwiki.py @ 1841:7717975b3ec3

plugin blog_import_dokuwiki: first draft
author souliane <souliane@mailoo.org>
date Thu, 04 Feb 2016 12:29:43 +0100
parents
children 9fd517248dc8
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/src/plugins/plugin_blog_import_dokuwiki.py	Thu Feb 04 12:29:43 2016 +0100
@@ -0,0 +1,369 @@
+#!/usr/bin/python
+# -*- coding: utf-8 -*-
+
+# SàT plugin to import external blogs
+# Copyright (C) 2009-2016 Jérôme Poisson (goffi@goffi.org)
+# Copyright (C) 2013-2016 Adrien Cossa (souliane@mailoo.org)
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from sat.core.i18n import _, D_
+from sat.core.constants import Const as C
+from sat.core.log import getLogger
+log = getLogger(__name__)
+from sat.core import exceptions
+from sat.tools import xml_tools
+from twisted.internet import threads
+from collections import OrderedDict
+import calendar
+import urllib
+import urlparse
+import tempfile
+import re
+import time
+import os.path
+# Third-party and sibling-plugin imports: a missing module is reported with an
+# explicit installation hint. Only ImportError is intercepted, so that any other
+# failure (e.g. an error raised while the imported module is being executed)
+# is not mistaken for a missing dependency.
+try:
+    from dokuwiki import DokuWiki, DokuWikiError  # this is a new dependency
+except ImportError:
+    raise exceptions.MissingModule(u'Missing module dokuwiki, please install it with "pip install dokuwiki"')
+try:
+    from PIL import Image  # this is already needed by plugin XEP-0054
+except ImportError:
+    raise exceptions.MissingModule(u"Missing module pillow, please download/install it from https://python-pillow.github.io")
+try:
+    from plugin_blog_import import OPT_HOST, OPT_UPLOAD_IMAGES
+except ImportError:
+    raise exceptions.MissingModule(u'Missing module plugin_blog_import, please install the Salut à Toi "BLOG_IMPORT" plugin')
+
+# Plugin metadata: "main" names the entry point class defined in this module and
+# "dependencies" lists the plugin whose register() method is called by that class.
+PLUGIN_INFO = {
+    "name": "Dokuwiki import",
+    "import_name": "IMPORT_DOKUWIKI",
+    "type": C.PLUG_TYPE_BLOG,
+    "dependencies": ["BLOG_IMPORT"],
+    "main": "DokuwikiImport",
+    "handler": "no",
+    "description": _("""Blog importer for Dokuwiki blog engine.""")
+}
+
+# Short and long descriptions handed to the BLOG_IMPORT plugin when the importer
+# is registered (see DokuwikiImport.__init__).
+SHORT_DESC = D_(u"import posts from Dokuwiki blog engine")
+
+LONG_DESC = D_(u"""This importer handle Dokuwiki blog engine.
+
+To use it, you need an admin access to a running Dokuwiki website
+(local or on the Internet). The importer retrieves the data using
+the XMLRPC Dokuwiki API.
+
+You can specify a namespace (that could be a namespace directory
+or a single post) or leave it empty to use the root namespace "/"
+and import all the posts.
+
+You can specify a new media repository to modify the internal
+media links and make them point to the URL of your choice, but
+note that the upload is not done automatically: a temporary
+directory will be created on your local drive and you will
+need to upload it yourself to your repository via SSH or FTP.
+
+Following options are recognized:
+
+location: DokuWiki site URL
+user: DokuWiki admin user
+passwd: DokuWiki admin password
+namespace: DokuWiki namespace to import (default: root namespace "/")
+media_repo: URL to the new remote media repository (default: none)
+limit: maximal number of posts to import (default: 100)
+
+Example of usage:
+
+jp import dokuwiki -p dave --pwd xxxxxx --connect
+    http://127.0.1.1 -o user souliane -o passwd qwertz
+    -o namespace public:2015:10
+    -o media_repo http://media.diekulturvermittlung.at
+
+This retrieves the 100 last blog posts from http://127.0.1.1 that
+are inside the namespace "public:2015:10" using the Dokuwiki user
+"souliane", and it imports them to sat profile dave's microblog node.
+Internal Dokuwiki media that were hosted on http://127.0.1.1 are now
+pointing to http://media.diekulturvermittlung.at.
+""")
+# Fallback values used when the matching import option is not given.
+# An empty media repository means that media links keep pointing to the DokuWiki host.
+DEFAULT_MEDIA_REPO = ""
+DEFAULT_NAMESPACE = "/"
+DEFAULT_LIMIT = 100  # you might get a DBUS timeout (no reply) if it lasts too long
+
+
+class Importer(DokuWiki):
+    """Retrieve DokuWiki pages and convert them to microblog data.
+
+    The converted posts are accumulated in self.posts_data (keyed by page id).
+    When a media repository is set, the media referenced by the pages are
+    downloaded to a local temporary directory and the links are rewritten to
+    point to that repository; else relative links are made absolute so that
+    they keep pointing to the DokuWiki host.
+    """
+
+    def __init__(self, url, user, passwd, media_repo=DEFAULT_MEDIA_REPO, limit=DEFAULT_LIMIT):
+        """
+
+        @param url (unicode): DokuWiki site URL
+        @param user (unicode): DokuWiki admin user
+        @param passwd (unicode): DokuWiki admin password
+        @param media_repo (unicode): New remote media repository
+        @param limit (int, unicode): maximal number of posts to import
+        @raise ValueError: limit can not be converted to an integer
+        """
+        DokuWiki.__init__(self, url, user, passwd)
+        self.url = url
+        self.media_repo = media_repo
+        self.temp_dir = None
+        # the messages are formatted *after* the call to _(), else the translation
+        # lookup would be done on the already formatted string and never match
+        if self.media_repo:
+            self.temp_dir = tempfile.mkdtemp()
+            self.info_msg = _("DokuWiki media files will be downloaded to %s - to finish the import you will need to upload them to %s") % (self.temp_dir, self.media_repo)
+        else:
+            self.info_msg = _("DokuWiki media files will stay on %s - some of them may be protected by DokuWiki ACL and will not be accessible from XMPP.") % url
+        # limit may be given as a string: in Python 2 comparing an int with a
+        # string does not fail but is never true, so the limit would be ignored
+        self.limit = int(limit)
+        self.posts_data = OrderedDict()
+
+    def process(self, client, namespace=DEFAULT_NAMESPACE):
+        """Process a namespace or a single page.
+
+        @param client: client session, its jid is used to set the posts author
+            and to build the internal links
+        @param namespace (unicode): DokuWiki namespace (or page) to import
+        @return (tuple, None): couple (iterator on the posts data, number of posts),
+            or None if the pages could not be listed
+        """
+        profile_jid = client.jid
+        log.info("Importing data from DokuWiki %s" % self.version)
+        try:
+            pages_list = self.pages.list(namespace)
+        except DokuWikiError:
+            log.warning('Could not list Dokuwiki pages: please turn the "display_errors" setting to "Off" in the php.ini of the webserver hosting DokuWiki.')
+            # NOTE(review): None is returned here while a couple is returned on
+            # success - confirm that the caller copes with it
+            return
+
+        if not pages_list:  # namespace is actually a page?
+            real_namespace = ":".join(namespace.split(":")[:-1])
+            pages_list = [page for page in self.pages.list(real_namespace) if page["id"] == namespace]
+
+        for count, page in enumerate(pages_list, 1):
+
+            # get main information
+            id_, updated, published = unicode(page['id']), unicode(page['mtime']), self.getOriginalDate(page)
+
+            # manage links: local links are processed like backlinks, other
+            # kinds of links (e.g. "extern") don't need to be rewritten
+            backlinks = self.pages.backlinks(id_)
+            for link in self.pages.links(id_):
+                if link["type"] != "local":
+                    continue
+                target = link["page"]
+                backlinks.append(target[1:] if target.startswith(":") else target)
+
+            content_xhtml = self.processContent(self.pages.html(id_), backlinks, profile_jid)
+
+            # XXX: the title is already in content_xhtml and difficult to remove,
+            #      so neither "title" nor the raw "content" are set (when "content"
+            #      is passed, it is displayed in Libervia instead of content_xhtml)
+
+            # build the extra data dictionary
+            mb_data = {"id": id_,
+                       "published": published,
+                       "updated": updated,
+                       "author": profile_jid.user,
+                       "content_xhtml": content_xhtml,
+                       "allow_comments": "true",
+                       }
+
+            # find out if the message access is public or restricted
+            group = id_.split(":")[0]
+            if group and group.lower() not in ("public", "/"):
+                mb_data["group"] = group  # roster group must exist
+
+            self.posts_data[id_] = {'blog': mb_data, 'comments': [[]]}
+
+            if count >= self.limit:
+                break
+
+        return (self.posts_data.itervalues(), len(self.posts_data))
+
+    def getOriginalDate(self, page):
+        """Try to parse the date from the message ID, else use "mtime".
+
+        The date can be extracted if the message ID looks like one of:
+            - namespace:YYMMDD_short_title
+            - namespace:YYYYMMDD_short_title
+        @param page (dict): message page
+        @return unicode
+        """
+        id_, default = unicode(page["id"]), unicode(page["mtime"])
+        date = id_.split(":")[-1].split("_")[0]
+        for date_format in ("%y%m%d", "%Y%m%d"):
+            try:
+                time_struct = time.strptime(date, date_format)
+            except ValueError:
+                continue
+            return unicode(calendar.timegm(time_struct))
+        return default
+
+    def processContent(self, text, backlinks, profile_jid):
+        """Do text substitutions and file copy.
+
+        @param text (unicode): message content
+        @param backlinks (list[unicode]): list of backlinks
+        @param profile_jid: jid of the profile, its user part is used to build
+            the new URL of the internal links
+        @return unicode: message content with the links rewritten
+        """
+        text = text.strip(u"\ufeff")  # this is at the beginning of the file (BOM)
+
+        for backlink in backlinks:
+            src = '/doku.php?id=%s"' % backlink
+            tgt = '/blog/%s/%s" target="#"' % (profile_jid.user, backlink)
+            text = text.replace(src, tgt)
+
+        subs = []
+
+        link_pattern = r"""<(img|a)[^>]* (src|href)="([^"]+)"[^>]*>"""
+        for tag in re.finditer(link_pattern, text):
+            type_, attr, link = tag.group(1, 2, 3)
+            if (type_, attr) not in (("img", "src"), ("a", "href")):
+                # the pattern also catches other tags starting with "a" or "img"
+                # (e.g. <audio src="...">): skip them instead of aborting the import
+                continue
+            if re.match(r"^\w*://", link):  # absolute URL to link directly
+                continue
+            if self.media_repo:
+                self.moveMedia(link, subs)
+            else:
+                subs.append((link, urlparse.urljoin(self.url, link)))
+
+        for url, new_url in subs:
+            text = text.replace(url, new_url)
+        return text
+
+    def moveMedia(self, link, subs):
+        """Move a media from the DokuWiki host to the new repository.
+
+        The media is downloaded to self.temp_dir and, if it could be handled,
+        a couple (link, new URL) is appended to subs.
+
+        @param link (unicode): media link
+        @param subs (list[tuple]): substitutions list, modified in place
+        """
+        url = urlparse.urljoin(self.url, link)
+        user_media = re.match(r"(/lib/exe/\w+\.php\?)(.*)", link)
+        thumb_width = None
+
+        if user_media:  # media that has been added by the user
+            params = urlparse.parse_qs(urlparse.urlparse(url).query)
+            try:
+                media = params["media"][0]
+            except KeyError:
+                log.warning("No media found in fetch URL: %s" % user_media.group(2))
+                return
+            if re.match(r"^\w*://", media):  # external URL to link directly
+                subs.append((link, media))
+                return
+            try:  # create thumbnail
+                thumb_width = params["w"][0]
+            except KeyError:
+                pass
+
+            filename = media.replace(":", "/")
+            # XXX: avoid "precondition failed" error (only keep the media parameter)
+            url = urlparse.urljoin(self.url, "/lib/exe/fetch.php?media=%s" % media)
+
+        elif link.startswith("/lib/plugins/"):
+            # other link added by a plugin or something else
+            filename = link[13:]
+        else:  # fake alert... there's no media (or we don't handle it yet)
+            return
+
+        # os.path.join discards the previous components when one is absolute:
+        # a leading separator (e.g. from a media ID starting with ":") or a ".."
+        # would make the file land outside of the temporary directory
+        filename = filename.lstrip("/")
+        if not filename or ".." in filename.split("/"):
+            log.warning("Ignoring suspicious media path: %s" % filename)
+            return
+
+        filepath = os.path.join(self.temp_dir, filename)
+        self.downloadMedia(url, filepath)
+
+        if thumb_width:
+            filename = "/".join(("thumbs", thumb_width, filename))
+            thumbnail = os.path.join(self.temp_dir, "thumbs", thumb_width, filepath[len(self.temp_dir):].lstrip(os.sep))
+            self.createThumbnail(filepath, thumbnail, thumb_width)
+
+        # this is a URL, not a local path: always join with "/"
+        new_url = "%s/%s" % (self.media_repo.rstrip("/"), filename)
+        subs.append((link, new_url))
+
+    def downloadMedia(self, source, dest):
+        """Copy media to localhost.
+
+        Nothing is done if the target file already exists.
+
+        @param source (unicode): source url
+        @param dest (unicode): target path
+        """
+        if os.path.exists(dest):
+            return
+        dirname = os.path.dirname(dest)
+        if not os.path.exists(dirname):
+            os.makedirs(dirname)
+        urllib.urlretrieve(source, dest)
+        log.debug("DokuWiki media file copied to %s" % dest)
+
+    def createThumbnail(self, source, dest, width):
+        """Create a thumbnail.
+
+        A failure is logged and doesn't interrupt the import.
+
+        @param source (unicode): source file path
+        @param dest (unicode): destination file path
+        @param width (unicode): thumbnail's width
+        """
+        thumb_dir = os.path.dirname(dest)
+        if not os.path.exists(thumb_dir):
+            os.makedirs(thumb_dir)
+        try:
+            # width comes from a URL parameter: PIL needs a number
+            width = int(width)
+            im = Image.open(source)
+            # keep the aspect ratio: height = width * (original height / original width)
+            height = max(1, width * im.size[1] // im.size[0])
+            im.thumbnail((width, height))
+            im.save(dest)
+            log.debug("DokuWiki media thumbnail created: %s" % dest)
+        except (IOError, ValueError):
+            log.error("Cannot create DokuWiki media thumbnail %s" % dest)
+
+
+
+class DokuwikiImport(object):
+    """Plugin entry point: register the DokuWiki importer to the BLOG_IMPORT plugin."""
+
+    def __init__(self, host):
+        """
+
+        @param host: SàT instance, used to reach the BLOG_IMPORT plugin and to
+            send notes to the frontends
+        """
+        log.info(_("plugin Dokuwiki Import initialization"))
+        self.host = host
+        host.plugins['BLOG_IMPORT'].register('dokuwiki', self.DkImport, SHORT_DESC, LONG_DESC)
+
+    def DkImport(self, client, location, options=None):
+        """Import from DokuWiki to PubSub
+
+        @param client: client session of the profile doing the import
+        @param location (unicode): DokuWiki site URL
+        @param options (dict, None): DokuWiki import parameters
+            - user (unicode): DokuWiki admin user
+            - passwd (unicode): DokuWiki admin password
+            - namespace (unicode): DokuWiki namespace to import
+            - media_repo (unicode): New remote media repository
+            - limit (unicode, int): maximal number of posts to import
+            The dictionary is modified in place (host and images upload options).
+        @return (Deferred): fired with the result of Importer.process
+        @raise exceptions.DataError: a required parameter is missing,
+            or "limit" is not an integer
+        """
+        if options is None:
+            # None is the documented default: the required parameters check
+            # below then reports the problem instead of a TypeError
+            options = {}
+        options[OPT_HOST] = location
+        for required in ("user", "passwd"):
+            if required not in options:
+                raise exceptions.DataError('parameter "%s" is required' % required)
+        user, passwd = options["user"], options["passwd"]
+        media_repo = options.get("media_repo", DEFAULT_MEDIA_REPO)
+        if options.get(OPT_UPLOAD_IMAGES, None):
+            # TODO: when media_repo is not defined or empty, we should not force this option
+            options[OPT_UPLOAD_IMAGES] = False
+            msg = _(u"Option --no-images-upload will be used by force.")
+            self.host.actionNew({'xmlui': xml_tools.note(msg).toXml()}, profile=client.profile)
+        namespace = options.get("namespace", DEFAULT_NAMESPACE)
+        # the option may be given as a string: convert it, else the comparison
+        # with the posts counter would never be true in Python 2
+        try:
+            limit = int(options.get("limit", DEFAULT_LIMIT))
+        except (TypeError, ValueError):
+            raise exceptions.DataError('parameter "limit" must be an integer')
+        dk_importer = Importer(location, user, passwd, media_repo, limit)
+        self.host.actionNew({'xmlui': xml_tools.note(dk_importer.info_msg).toXml()}, profile=client.profile)
+        # the import does blocking network calls: run it in a thread
+        return threads.deferToThread(dk_importer.process, client, namespace)