Mercurial > libervia-backend
diff libervia/backend/plugins/plugin_blog_import_dokuwiki.py @ 4071:4b842c1fb686
refactoring: renamed `sat` package to `libervia.backend`
author | Goffi <goffi@goffi.org> |
---|---|
date | Fri, 02 Jun 2023 11:49:51 +0200 |
parents | sat/plugins/plugin_blog_import_dokuwiki.py@524856bd7b19 |
children | 47401850dec6 |
line wrap: on
line diff
--- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libervia/backend/plugins/plugin_blog_import_dokuwiki.py Fri Jun 02 11:49:51 2023 +0200 @@ -0,0 +1,414 @@ +#!/usr/bin/env python3 + + +# SàT plugin to import dokuwiki blogs +# Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org) +# Copyright (C) 2013-2016 Adrien Cossa (souliane@mailoo.org) + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <http://www.gnu.org/licenses/>. + +from libervia.backend.core.i18n import _, D_ +from libervia.backend.core.constants import Const as C +from libervia.backend.core.log import getLogger + +log = getLogger(__name__) +from libervia.backend.core import exceptions +from libervia.backend.tools import xml_tools +from twisted.internet import threads +from collections import OrderedDict +import calendar +import urllib.request, urllib.parse, urllib.error +import urllib.parse +import tempfile +import re +import time +import os.path + +try: + from dokuwiki import DokuWiki, DokuWikiError # this is a new dependency +except ImportError: + raise exceptions.MissingModule( + 'Missing module dokuwiki, please install it with "pip install dokuwiki"' + ) +try: + from PIL import Image # this is already needed by plugin XEP-0054 +except: + raise exceptions.MissingModule( + "Missing module pillow, please download/install it from https://python-pillow.github.io" + ) + +PLUGIN_INFO = { + C.PI_NAME: "Dokuwiki import", + C.PI_IMPORT_NAME: "IMPORT_DOKUWIKI", + C.PI_TYPE: C.PLUG_TYPE_BLOG, + C.PI_DEPENDENCIES: ["BLOG_IMPORT"], + C.PI_MAIN: "DokuwikiImport", + C.PI_HANDLER: "no", + C.PI_DESCRIPTION: _("""Blog importer for Dokuwiki blog engine."""), +} + +SHORT_DESC = D_("import posts from Dokuwiki blog engine") + +LONG_DESC = D_( + """This importer handle Dokuwiki blog engine. + +To use it, you need an admin access to a running Dokuwiki website +(local or on the Internet). The importer retrieves the data using +the XMLRPC Dokuwiki API. + +You can specify a namespace (that could be a namespace directory +or a single post) or leave it empty to use the root namespace "/" +and import all the posts. + +You can specify a new media repository to modify the internal +media links and make them point to the URL of your choice, but +note that the upload is not done automatically: a temporary +directory will be created on your local drive and you will +need to upload it yourself to your repository via SSH or FTP. + +Following options are recognized: + +location: DokuWiki site URL +user: DokuWiki admin user +passwd: DokuWiki admin password +namespace: DokuWiki namespace to import (default: root namespace "/") +media_repo: URL to the new remote media repository (default: none) +limit: maximal number of posts to import (default: 100) + +Example of usage (with jp frontend): + +jp import dokuwiki -p dave --pwd xxxxxx --connect + http://127.0.1.1 -o user souliane -o passwd qwertz + -o namespace public:2015:10 + -o media_repo http://media.diekulturvermittlung.at + +This retrieves the 100 last blog posts from http://127.0.1.1 that +are inside the namespace "public:2015:10" using the Dokuwiki user +"souliane", and it imports them to sat profile dave's microblog node. +Internal Dokuwiki media that were hosted on http://127.0.1.1 are now +pointing to http://media.diekulturvermittlung.at. +""" +) +DEFAULT_MEDIA_REPO = "" +DEFAULT_NAMESPACE = "/" +DEFAULT_LIMIT = 100 # you might get a DBUS timeout (no reply) if it lasts too long + + +class Importer(DokuWiki): + def __init__( + self, url, user, passwd, media_repo=DEFAULT_MEDIA_REPO, limit=DEFAULT_LIMIT + ): + """ + + @param url (unicode): DokuWiki site URL + @param user (unicode): DokuWiki admin user + @param passwd (unicode): DokuWiki admin password + @param media_repo (unicode): New remote media repository + """ + DokuWiki.__init__(self, url, user, passwd) + self.url = url + self.media_repo = media_repo + self.temp_dir = tempfile.mkdtemp() if self.media_repo else None + self.limit = limit + self.posts_data = OrderedDict() + + def get_post_id(self, post): + """Return a unique and constant post id + + @param post(dict): parsed post data + @return (unicode): post unique item id + """ + return str(post["id"]) + + def get_post_updated(self, post): + """Return the update date. + + @param post(dict): parsed post data + @return (unicode): update date + """ + return str(post["mtime"]) + + def get_post_published(self, post): + """Try to parse the date from the message ID, else use "mtime". + + The date can be extracted if the message ID looks like one of: + - namespace:YYMMDD_short_title + - namespace:YYYYMMDD_short_title + @param post (dict): parsed post data + @return (unicode): publication date + """ + id_, default = str(post["id"]), str(post["mtime"]) + try: + date = id_.split(":")[-1].split("_")[0] + except KeyError: + return default + try: + time_struct = time.strptime(date, "%y%m%d") + except ValueError: + try: + time_struct = time.strptime(date, "%Y%m%d") + except ValueError: + return default + return str(calendar.timegm(time_struct)) + + def process_post(self, post, profile_jid): + """Process a single page. + + @param post (dict): parsed post data + @param profile_jid + """ + # get main information + id_ = self.get_post_id(post) + updated = self.get_post_updated(post) + published = self.get_post_published(post) + + # manage links + backlinks = self.pages.backlinks(id_) + for link in self.pages.links(id_): + if link["type"] != "extern": + assert link["type"] == "local" + page = link["page"] + backlinks.append(page[1:] if page.startswith(":") else page) + + self.pages.get(id_) + content_xhtml = self.process_content(self.pages.html(id_), backlinks, profile_jid) + + # XXX: title is already in content_xhtml and difficult to remove, so leave it + # title = content.split("\n")[0].strip(u"\ufeff= ") + + # build the extra data dictionary + mb_data = { + "id": id_, + "published": published, + "updated": updated, + "author": profile_jid.user, + # "content": content, # when passed, it is displayed in Libervia instead of content_xhtml + "content_xhtml": content_xhtml, + # "title": title, + "allow_comments": "true", + } + + # find out if the message access is public or restricted + namespace = id_.split(":")[0] + if namespace and namespace.lower() not in ("public", "/"): + mb_data["group"] = namespace # roster group must exist + + self.posts_data[id_] = {"blog": mb_data, "comments": [[]]} + + def process(self, client, namespace=DEFAULT_NAMESPACE): + """Process a namespace or a single page. + + @param namespace (unicode): DokuWiki namespace (or page) to import + """ + profile_jid = client.jid + log.info("Importing data from DokuWiki %s" % self.version) + try: + pages_list = self.pages.list(namespace) + except DokuWikiError: + log.warning( + 'Could not list Dokuwiki pages: please turn the "display_errors" setting to "Off" in the php.ini of the webserver hosting DokuWiki.' + ) + return + + if not pages_list: # namespace is actually a page? + names = namespace.split(":") + real_namespace = ":".join(names[0:-1]) + pages_list = self.pages.list(real_namespace) + pages_list = [page for page in pages_list if page["id"] == namespace] + namespace = real_namespace + + count = 0 + for page in pages_list: + self.process_post(page, profile_jid) + count += 1 + if count >= self.limit: + break + + return (iter(self.posts_data.values()), len(self.posts_data)) + + def process_content(self, text, backlinks, profile_jid): + """Do text substitutions and file copy. + + @param text (unicode): message content + @param backlinks (list[unicode]): list of backlinks + """ + text = text.strip("\ufeff") # this is at the beginning of the file (BOM) + + for backlink in backlinks: + src = '/doku.php?id=%s"' % backlink + tgt = '/blog/%s/%s" target="#"' % (profile_jid.user, backlink) + text = text.replace(src, tgt) + + subs = {} + + link_pattern = r"""<(img|a)[^>]* (src|href)="([^"]+)"[^>]*>""" + for tag in re.finditer(link_pattern, text): + type_, attr, link = tag.group(1), tag.group(2), tag.group(3) + assert (type_ == "img" and attr == "src") or (type_ == "a" and attr == "href") + if re.match(r"^\w*://", link): # absolute URL to link directly + continue + if self.media_repo: + self.move_media(link, subs) + elif link not in subs: + subs[link] = urllib.parse.urljoin(self.url, link) + + for url, new_url in subs.items(): + text = text.replace(url, new_url) + return text + + def move_media(self, link, subs): + """Move a media from the DokuWiki host to the new repository. + + This also updates the hyperlinks to internal media files. + @param link (unicode): media link + @param subs (dict): substitutions data + """ + url = urllib.parse.urljoin(self.url, link) + user_media = re.match(r"(/lib/exe/\w+.php\?)(.*)", link) + thumb_width = None + + if user_media: # media that has been added by the user + params = urllib.parse.parse_qs(urllib.parse.urlparse(url).query) + try: + media = params["media"][0] + except KeyError: + log.warning("No media found in fetch URL: %s" % user_media.group(2)) + return + if re.match(r"^\w*://", media): # external URL to link directly + subs[link] = media + return + try: # create thumbnail + thumb_width = params["w"][0] + except KeyError: + pass + + filename = media.replace(":", "/") + # XXX: avoid "precondition failed" error (only keep the media parameter) + url = urllib.parse.urljoin(self.url, "/lib/exe/fetch.php?media=%s" % media) + + elif link.startswith("/lib/plugins/"): + # other link added by a plugin or something else + filename = link[13:] + else: # fake alert... there's no media (or we don't handle it yet) + return + + filepath = os.path.join(self.temp_dir, filename) + self.download_media(url, filepath) + + if thumb_width: + filename = os.path.join("thumbs", thumb_width, filename) + thumbnail = os.path.join(self.temp_dir, filename) + self.create_thumbnail(filepath, thumbnail, thumb_width) + + new_url = os.path.join(self.media_repo, filename) + subs[link] = new_url + + def download_media(self, source, dest): + """Copy media to localhost. + + @param source (unicode): source url + @param dest (unicode): target path + """ + dirname = os.path.dirname(dest) + if not os.path.exists(dest): + if not os.path.exists(dirname): + os.makedirs(dirname) + urllib.request.urlretrieve(source, dest) + log.debug("DokuWiki media file copied to %s" % dest) + + def create_thumbnail(self, source, dest, width): + """Create a thumbnail. + + @param source (unicode): source file path + @param dest (unicode): destination file path + @param width (unicode): thumbnail's width + """ + thumb_dir = os.path.dirname(dest) + if not os.path.exists(thumb_dir): + os.makedirs(thumb_dir) + try: + im = Image.open(source) + im.thumbnail((width, int(width) * im.size[0] / im.size[1])) + im.save(dest) + log.debug("DokuWiki media thumbnail created: %s" % dest) + except IOError: + log.error("Cannot create DokuWiki media thumbnail %s" % dest) + + +class DokuwikiImport(object): + def __init__(self, host): + log.info(_("plugin Dokuwiki import initialization")) + self.host = host + self._blog_import = host.plugins["BLOG_IMPORT"] + self._blog_import.register("dokuwiki", self.dk_import, SHORT_DESC, LONG_DESC) + + def dk_import(self, client, location, options=None): + """import from DokuWiki to PubSub + + @param location (unicode): DokuWiki site URL + @param options (dict, None): DokuWiki import parameters + - user (unicode): DokuWiki admin user + - passwd (unicode): DokuWiki admin password + - namespace (unicode): DokuWiki namespace to import + - media_repo (unicode): New remote media repository + """ + options[self._blog_import.OPT_HOST] = location + try: + user = options["user"] + except KeyError: + raise exceptions.DataError('parameter "user" is required') + try: + passwd = options["passwd"] + except KeyError: + raise exceptions.DataError('parameter "passwd" is required') + + opt_upload_images = options.get(self._blog_import.OPT_UPLOAD_IMAGES, None) + try: + media_repo = options["media_repo"] + if opt_upload_images: + options[ + self._blog_import.OPT_UPLOAD_IMAGES + ] = False # force using --no-images-upload + info_msg = _( + "DokuWiki media files will be *downloaded* to {temp_dir} - to finish the import you have to upload them *manually* to {media_repo}" + ) + except KeyError: + media_repo = DEFAULT_MEDIA_REPO + if opt_upload_images: + info_msg = _( + "DokuWiki media files will be *uploaded* to the XMPP server. Hyperlinks to these media may not been updated though." + ) + else: + info_msg = _( + "DokuWiki media files will *stay* on {location} - some of them may be protected by DokuWiki ACL and will not be accessible." + ) + + try: + namespace = options["namespace"] + except KeyError: + namespace = DEFAULT_NAMESPACE + try: + limit = options["limit"] + except KeyError: + limit = DEFAULT_LIMIT + + dk_importer = Importer(location, user, passwd, media_repo, limit) + info_msg = info_msg.format( + temp_dir=dk_importer.temp_dir, media_repo=media_repo, location=location + ) + self.host.action_new( + {"xmlui": xml_tools.note(info_msg).toXml()}, profile=client.profile + ) + d = threads.deferToThread(dk_importer.process, client, namespace) + return d