comparison libervia/backend/plugins/plugin_blog_import_dokuwiki.py @ 4071:4b842c1fb686

refactoring: renamed `sat` package to `libervia.backend`
author Goffi <goffi@goffi.org>
date Fri, 02 Jun 2023 11:49:51 +0200
parents sat/plugins/plugin_blog_import_dokuwiki.py@524856bd7b19
children 47401850dec6
comparison
equal deleted inserted replaced
4070:d10748475025 4071:4b842c1fb686
1 #!/usr/bin/env python3
2
3
4 # SàT plugin to import dokuwiki blogs
5 # Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)
6 # Copyright (C) 2013-2016 Adrien Cossa (souliane@mailoo.org)
7
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU Affero General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU Affero General Public License for more details.
17
18 # You should have received a copy of the GNU Affero General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
21 from libervia.backend.core.i18n import _, D_
22 from libervia.backend.core.constants import Const as C
23 from libervia.backend.core.log import getLogger
24
25 log = getLogger(__name__)
26 from libervia.backend.core import exceptions
27 from libervia.backend.tools import xml_tools
28 from twisted.internet import threads
29 from collections import OrderedDict
30 import calendar
31 import urllib.request, urllib.parse, urllib.error
32 import urllib.parse
33 import tempfile
34 import re
35 import time
36 import os.path
37
# Third-party dependencies: fail early with an explicit MissingModule error
# telling the user what to install.
try:
    from dokuwiki import DokuWiki, DokuWikiError  # this is a new dependency
except ImportError:
    raise exceptions.MissingModule(
        'Missing module dokuwiki, please install it with "pip install dokuwiki"'
    )
try:
    from PIL import Image  # this is already needed by plugin XEP-0054
except ImportError:
    # only a failed import means the module is missing: a bare ``except`` would
    # also hide KeyboardInterrupt/SystemExit and unrelated errors
    raise exceptions.MissingModule(
        "Missing module pillow, please download/install it from https://python-pillow.github.io"
    )
50
# Plugin metadata, keyed by the C.PI_* constants of the backend ``Const`` class.
PLUGIN_INFO = {
    C.PI_NAME: "Dokuwiki import",
    C.PI_IMPORT_NAME: "IMPORT_DOKUWIKI",
    C.PI_TYPE: C.PLUG_TYPE_BLOG,
    C.PI_DEPENDENCIES: ["BLOG_IMPORT"],
    C.PI_MAIN: "DokuwikiImport",
    C.PI_HANDLER: "no",
    C.PI_DESCRIPTION: _("""Blog importer for Dokuwiki blog engine."""),
}

# Short description handed to the BLOG_IMPORT plugin when the importer is registered.
SHORT_DESC = D_("import posts from Dokuwiki blog engine")

# Long description (usage help) handed to the BLOG_IMPORT plugin when the importer
# is registered. This is a runtime string: do not reflow or edit it casually.
LONG_DESC = D_(
    """This importer handle Dokuwiki blog engine.

To use it, you need an admin access to a running Dokuwiki website
(local or on the Internet). The importer retrieves the data using
the XMLRPC Dokuwiki API.

You can specify a namespace (that could be a namespace directory
or a single post) or leave it empty to use the root namespace "/"
and import all the posts.

You can specify a new media repository to modify the internal
media links and make them point to the URL of your choice, but
note that the upload is not done automatically: a temporary
directory will be created on your local drive and you will
need to upload it yourself to your repository via SSH or FTP.

Following options are recognized:

location: DokuWiki site URL
user: DokuWiki admin user
passwd: DokuWiki admin password
namespace: DokuWiki namespace to import (default: root namespace "/")
media_repo: URL to the new remote media repository (default: none)
limit: maximal number of posts to import (default: 100)

Example of usage (with jp frontend):

jp import dokuwiki -p dave --pwd xxxxxx --connect
http://127.0.1.1 -o user souliane -o passwd qwertz
-o namespace public:2015:10
-o media_repo http://media.diekulturvermittlung.at

This retrieves the 100 last blog posts from http://127.0.1.1 that
are inside the namespace "public:2015:10" using the Dokuwiki user
"souliane", and it imports them to sat profile dave's microblog node.
Internal Dokuwiki media that were hosted on http://127.0.1.1 are now
pointing to http://media.diekulturvermittlung.at.
"""
)
# Fallback values used when the matching key is absent from the import options.
# An empty media repository means that media files are not downloaded locally.
DEFAULT_MEDIA_REPO = ""
DEFAULT_NAMESPACE = "/"
DEFAULT_LIMIT = 100  # you might get a DBUS timeout (no reply) if it lasts too long
106
107
class Importer(DokuWiki):
    """DokuWiki client turning wiki pages into blog import data.

    Pages are retrieved through the DokuWiki API of the parent class, their
    rendered XHTML has its relative links rewritten, and the result is stored in
    ``posts_data`` (page id => {"blog": microblog data, "comments": [[]]}).
    When a media repository is given, internal media are downloaded to a
    temporary directory (``temp_dir``) and links are pointed to the repository.
    """

    # <img ... src="..."> and <a ... href="..."> tags; group 3 is the link itself
    _LINK_RE = re.compile(r"""<(img|a)[^>]* (src|href)="([^"]+)"[^>]*>""")
    # anything starting with a scheme ("http://", "xmpp://", …)
    _ABSOLUTE_URL_RE = re.compile(r"^\w*://")

    def __init__(
        self, url, user, passwd, media_repo=DEFAULT_MEDIA_REPO, limit=DEFAULT_LIMIT
    ):
        """

        @param url (unicode): DokuWiki site URL
        @param user (unicode): DokuWiki admin user
        @param passwd (unicode): DokuWiki admin password
        @param media_repo (unicode): New remote media repository
        @param limit (int, unicode): maximal number of posts to import
        @raise ValueError: limit can't be converted to an integer
        """
        DokuWiki.__init__(self, url, user, passwd)
        self.url = url
        self.media_repo = media_repo
        # media are only downloaded when a new repository is requested
        self.temp_dir = tempfile.mkdtemp() if self.media_repo else None
        # options coming from the frontend are strings: normalise once here so
        # that the limit can safely be compared with a counter
        self.limit = int(limit)
        self.posts_data = OrderedDict()

    def get_post_id(self, post):
        """Return a unique and constant post id

        @param post(dict): parsed post data
        @return (unicode): post unique item id
        """
        return str(post["id"])

    def get_post_updated(self, post):
        """Return the update date.

        @param post(dict): parsed post data
        @return (unicode): update date
        """
        return str(post["mtime"])

    def get_post_published(self, post):
        """Try to parse the date from the message ID, else use "mtime".

        The date can be extracted if the message ID looks like one of:
            - namespace:YYMMDD_short_title
            - namespace:YYYYMMDD_short_title
        @param post (dict): parsed post data
        @return (unicode): publication date
        """
        id_, default = str(post["id"]), str(post["mtime"])
        # last path element, up to the first underscore
        date = id_.split(":")[-1].split("_")[0]
        for date_format in ("%y%m%d", "%Y%m%d"):
            try:
                time_struct = time.strptime(date, date_format)
            except ValueError:
                continue
            return str(calendar.timegm(time_struct))
        return default

    def process_post(self, post, profile_jid):
        """Process a single page.

        The resulting data are stored in self.posts_data, using the page id as key.
        @param post (dict): parsed post data
        @param profile_jid: jid of the importing profile, its "user" part is used
            as author and to build the new blog links
        """
        # get main information
        id_ = self.get_post_id(post)
        updated = self.get_post_updated(post)
        published = self.get_post_published(post)

        # manage links
        backlinks = self.pages.backlinks(id_)
        for link in self.pages.links(id_):
            link_type = link["type"]
            if link_type == "extern":
                continue
            if link_type != "local":
                # data come from a remote service: don't abort the whole import
                # on an unexpected value, just skip it
                log.warning("Ignoring DokuWiki link of unknown type: %s" % link_type)
                continue
            page = link["page"]
            backlinks.append(page[1:] if page.startswith(":") else page)

        content_xhtml = self.process_content(self.pages.html(id_), backlinks, profile_jid)

        # XXX: title is already in content_xhtml and difficult to remove, so leave it

        # build the extra data dictionary
        mb_data = {
            "id": id_,
            "published": published,
            "updated": updated,
            "author": profile_jid.user,
            "content_xhtml": content_xhtml,
            "allow_comments": "true",
        }

        # find out if the message access is public or restricted
        namespace = id_.split(":")[0]
        if namespace and namespace.lower() not in ("public", "/"):
            mb_data["group"] = namespace  # roster group must exist

        self.posts_data[id_] = {"blog": mb_data, "comments": [[]]}

    def process(self, client, namespace=DEFAULT_NAMESPACE):
        """Process a namespace or a single page.

        @param client: client of the importing profile (its jid is used)
        @param namespace (unicode): DokuWiki namespace (or page) to import
        @return (tuple, None): (iterator on posts data, number of posts), or None
            if the pages could not be listed
        """
        profile_jid = client.jid
        log.info("Importing data from DokuWiki %s" % self.version)
        try:
            pages_list = self.pages.list(namespace)
        except DokuWikiError:
            log.warning(
                'Could not list Dokuwiki pages: please turn the "display_errors" setting to "Off" in the php.ini of the webserver hosting DokuWiki.'
            )
            return

        if not pages_list:  # namespace is actually a page?
            real_namespace = ":".join(namespace.split(":")[:-1])
            pages_list = [
                page
                for page in self.pages.list(real_namespace)
                if page["id"] == namespace
            ]

        # the limit is applied before processing, so that a limit of 0 imports nothing
        for page in pages_list[: max(self.limit, 0)]:
            self.process_post(page, profile_jid)

        return (iter(self.posts_data.values()), len(self.posts_data))

    def process_content(self, text, backlinks, profile_jid):
        """Do text substitutions and file copy.

        @param text (unicode): message content
        @param backlinks (list[unicode]): list of backlinks
        @param profile_jid: jid of the importing profile, its "user" part is used
            to build the new blog links
        @return (unicode): content with internal links rewritten
        """
        text = text.strip("\ufeff")  # this is at the beginning of the file (BOM)

        for backlink in backlinks:
            src = '/doku.php?id=%s"' % backlink
            tgt = '/blog/%s/%s" target="#"' % (profile_jid.user, backlink)
            text = text.replace(src, tgt)

        # first pass: find out the new location of each relative link
        subs = {}
        for tag in self._LINK_RE.finditer(text):
            type_, attr, link = tag.group(1, 2, 3)
            if (type_, attr) not in (("img", "src"), ("a", "href")):
                continue  # unexpected tag/attribute combination, leave it alone
            if link in subs:
                continue
            if self._ABSOLUTE_URL_RE.match(link):  # absolute URL to link directly
                continue
            if self.media_repo:
                self.move_media(link, subs)
            else:
                subs[link] = urllib.parse.urljoin(self.url, link)

        if not subs:
            return text

        # second pass: rewrite the attribute values only, in a single pass.
        # Chained str.replace() on the whole document would also touch unrelated
        # text (a link to "/" would alter every slash) and would rewrite twice
        # the links which are a prefix of another one.
        def rewrite(match):
            new_link = subs.get(match.group(3))
            if new_link is None:
                return match.group(0)
            tag, offset = match.group(0), match.start(0)
            return (
                tag[: match.start(3) - offset]
                + new_link
                + tag[match.end(3) - offset:]
            )

        return self._LINK_RE.sub(rewrite, text)

    def move_media(self, link, subs):
        """Move a media from the DokuWiki host to the new repository.

        This also updates the hyperlinks to internal media files.
        Nothing is done if the link is not a handled media link.
        @param link (unicode): media link, as found in the XHTML content
        @param subs (dict): substitutions data, updated in place (link => new URL)
        """
        # local import: the "html" module is not imported at module level
        from html import unescape

        # the link comes from XHTML, where "&" is written "&amp;": it must be
        # unescaped or the query parameters would not be parsed correctly
        raw_link = unescape(link)
        url = urllib.parse.urljoin(self.url, raw_link)
        user_media = re.match(r"(/lib/exe/\w+\.php\?)(.*)", raw_link)
        thumb_width = None

        if user_media:  # media that has been added by the user
            params = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
            try:
                media = params["media"][0]
            except KeyError:
                log.warning("No media found in fetch URL: %s" % user_media.group(2))
                return
            if self._ABSOLUTE_URL_RE.match(media):  # external URL to link directly
                subs[link] = media
                return
            thumb_width = params.get("w", [None])[0]
            if thumb_width and not thumb_width.isdigit():
                # the width is also used as a directory name: only accept a number
                log.warning("Ignoring invalid thumbnail width: %s" % thumb_width)
                thumb_width = None

            filename = media.replace(":", "/").lstrip("/")
            # XXX: avoid "precondition failed" error (only keep the media parameter)
            # parse_qs has decoded the value, it must be quoted again
            url = urllib.parse.urljoin(
                self.url,
                "/lib/exe/fetch.php?media=%s" % urllib.parse.quote(media, safe=":/"),
            )

        elif raw_link.startswith("/lib/plugins/"):
            # other link added by a plugin or something else
            filename = raw_link[len("/lib/plugins/"):]
        else:  # fake alert... there's no media (or we don't handle it yet)
            return

        # the file name comes from the page content: refuse to write anything
        # outside of the temporary directory
        temp_root = os.path.join(os.path.normpath(self.temp_dir), "")
        filepath = os.path.normpath(os.path.join(self.temp_dir, *filename.split("/")))
        if not filepath.startswith(temp_root):
            log.warning("Ignoring media with an unsafe path: %s" % link)
            return
        self.download_media(url, filepath)

        if thumb_width:
            # "/" is used for the URL part, os.path only for the local path
            filename = "/".join(("thumbs", thumb_width, filename))
            thumbnail = os.path.join(self.temp_dir, *filename.split("/"))
            self.create_thumbnail(filepath, thumbnail, thumb_width)

        subs[link] = "%s/%s" % (self.media_repo.rstrip("/"), filename)

    def download_media(self, source, dest):
        """Copy media to localhost.

        Nothing is done if the destination file already exists.
        Download errors are not caught here.
        @param source (unicode): source url
        @param dest (unicode): target path
        """
        if os.path.exists(dest):
            return
        dirname = os.path.dirname(dest)
        if dirname:
            os.makedirs(dirname, exist_ok=True)
        urllib.request.urlretrieve(source, dest)
        log.debug("DokuWiki media file copied to %s" % dest)

    def create_thumbnail(self, source, dest, width):
        """Create a thumbnail.

        Failures are logged and don't interrupt the import.
        @param source (unicode): source file path
        @param dest (unicode): destination file path
        @param width (unicode, int): thumbnail's width
        """
        try:
            max_width = int(width)
        except (TypeError, ValueError):
            max_width = 0
        if max_width <= 0:
            log.error("Cannot create DokuWiki media thumbnail %s" % dest)
            return

        thumb_dir = os.path.dirname(dest)
        if thumb_dir:
            os.makedirs(thumb_dir, exist_ok=True)
        try:
            with Image.open(source) as im:
                src_width, src_height = im.size
                # Image.thumbnail keeps the aspect ratio and uses the size as a
                # bounding box: the height bound must follow the source ratio
                # (height / width), otherwise portrait images end up narrower
                # than the requested width
                max_height = max(1, max_width * src_height // src_width)
                im.thumbnail((max_width, max_height))
                im.save(dest)
            log.debug("DokuWiki media thumbnail created: %s" % dest)
        except IOError:
            log.error("Cannot create DokuWiki media thumbnail %s" % dest)
347
348
class DokuwikiImport(object):
    """Plugin entry point: registers the DokuWiki importer with BLOG_IMPORT."""

    def __init__(self, host):
        """
        @param host: backend host, giving access to the loaded plugins
        """
        log.info(_("plugin Dokuwiki import initialization"))
        self.host = host
        self._blog_import = host.plugins["BLOG_IMPORT"]
        self._blog_import.register("dokuwiki", self.dk_import, SHORT_DESC, LONG_DESC)

    def dk_import(self, client, location, options=None):
        """import from DokuWiki to PubSub

        @param client: client of the importing profile
        @param location (unicode): DokuWiki site URL
        @param options (dict, None): DokuWiki import parameters
            - user (unicode): DokuWiki admin user
            - passwd (unicode): DokuWiki admin password
            - namespace (unicode): DokuWiki namespace to import
            - media_repo (unicode): New remote media repository
            - limit (unicode, int): maximal number of posts to import
            The dict is updated in place (host, and image upload when a media
            repository is used).
        @return (Deferred): fires with the result of Importer.process, which is
            run in a separate thread
        @raise exceptions.DataError: a required parameter is missing, or "limit"
            is not an integer
        """
        if options is None:
            # None is the declared default: "user" and "passwd" being required,
            # this ends with a DataError below instead of a TypeError here
            options = {}
        options[self._blog_import.OPT_HOST] = location
        try:
            user = options["user"]
        except KeyError:
            raise exceptions.DataError('parameter "user" is required')
        try:
            passwd = options["passwd"]
        except KeyError:
            raise exceptions.DataError('parameter "passwd" is required')

        opt_upload_images = options.get(self._blog_import.OPT_UPLOAD_IMAGES, None)
        if "media_repo" in options:
            media_repo = options["media_repo"]
            if opt_upload_images:
                # force using --no-images-upload
                options[self._blog_import.OPT_UPLOAD_IMAGES] = False
            info_msg = _(
                "DokuWiki media files will be *downloaded* to {temp_dir} - to finish the import you have to upload them *manually* to {media_repo}"
            )
        else:
            media_repo = DEFAULT_MEDIA_REPO
            if opt_upload_images:
                info_msg = _(
                    "DokuWiki media files will be *uploaded* to the XMPP server. Hyperlinks to these media may not been updated though."
                )
            else:
                info_msg = _(
                    "DokuWiki media files will *stay* on {location} - some of them may be protected by DokuWiki ACL and will not be accessible."
                )

        namespace = options.get("namespace", DEFAULT_NAMESPACE)
        # options are sent as strings by the frontends, while the limit is
        # compared with a counter: convert it once, and report bad values clearly
        try:
            limit = int(options.get("limit", DEFAULT_LIMIT))
        except (TypeError, ValueError):
            raise exceptions.DataError('parameter "limit" must be an integer')

        dk_importer = Importer(location, user, passwd, media_repo, limit)
        info_msg = info_msg.format(
            temp_dir=dk_importer.temp_dir, media_repo=media_repo, location=location
        )
        self.host.action_new(
            {"xmlui": xml_tools.note(info_msg).toXml()}, profile=client.profile
        )
        return threads.deferToThread(dk_importer.process, client, namespace)