Mercurial > libervia-backend
comparison libervia/backend/plugins/plugin_blog_import_dokuwiki.py @ 4071:4b842c1fb686
refactoring: renamed `sat` package to `libervia.backend`
author | Goffi <goffi@goffi.org> |
---|---|
date | Fri, 02 Jun 2023 11:49:51 +0200 (19 months ago) |
parents | sat/plugins/plugin_blog_import_dokuwiki.py@524856bd7b19 |
children | 47401850dec6 |
comparison
equal
deleted
inserted
replaced
4070:d10748475025 | 4071:4b842c1fb686 |
---|---|
1 #!/usr/bin/env python3 | |
2 | |
3 | |
4 # SàT plugin to import dokuwiki blogs | |
5 # Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org) | |
6 # Copyright (C) 2013-2016 Adrien Cossa (souliane@mailoo.org) | |
7 | |
8 # This program is free software: you can redistribute it and/or modify | |
9 # it under the terms of the GNU Affero General Public License as published by | |
10 # the Free Software Foundation, either version 3 of the License, or | |
11 # (at your option) any later version. | |
12 | |
13 # This program is distributed in the hope that it will be useful, | |
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 # GNU Affero General Public License for more details. | |
17 | |
18 # You should have received a copy of the GNU Affero General Public License | |
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | |
21 from libervia.backend.core.i18n import _, D_ | |
22 from libervia.backend.core.constants import Const as C | |
23 from libervia.backend.core.log import getLogger | |
24 | |
25 log = getLogger(__name__) | |
26 from libervia.backend.core import exceptions | |
27 from libervia.backend.tools import xml_tools | |
28 from twisted.internet import threads | |
29 from collections import OrderedDict | |
30 import calendar | |
31 import urllib.request, urllib.parse, urllib.error | |
32 import urllib.parse | |
33 import tempfile | |
34 import re | |
35 import time | |
36 import os.path | |
37 | |
# Third-party dependencies: fail early with an explicit MissingModule error
# (instead of a raw ImportError) when one of them is not installed.
try:
    from dokuwiki import DokuWiki, DokuWikiError  # this is a new dependency
except ImportError:
    raise exceptions.MissingModule(
        'Missing module dokuwiki, please install it with "pip install dokuwiki"'
    )
try:
    from PIL import Image  # this is already needed by plugin XEP-0054
except ImportError:
    # only a failed import means the module is missing; a bare "except" would
    # also swallow KeyboardInterrupt/SystemExit and unrelated errors
    raise exceptions.MissingModule(
        "Missing module pillow, please download/install it from https://python-pillow.github.io"
    )
50 | |
# Plugin metadata. NOTE(review): presumably consumed by the backend plugin
# loader; the meaning of each C.PI_* key is defined outside this file.
PLUGIN_INFO = {
    C.PI_NAME: "Dokuwiki import",
    C.PI_IMPORT_NAME: "IMPORT_DOKUWIKI",
    C.PI_TYPE: C.PLUG_TYPE_BLOG,
    # the generic blog import plugin must be available: the importer
    # registers itself on it at initialization
    C.PI_DEPENDENCIES: ["BLOG_IMPORT"],
    # name of the class instantiated for this plugin
    C.PI_MAIN: "DokuwikiImport",
    C.PI_HANDLER: "no",
    C.PI_DESCRIPTION: _("""Blog importer for Dokuwiki blog engine."""),
}
60 | |
# One-line description of this importer; it is handed to the blog import
# plugin when the importer is registered (see DokuwikiImport.__init__).
# NOTE(review): D_ presumably marks the string for deferred translation - confirm.
61 SHORT_DESC = D_("import posts from Dokuwiki blog engine") | |
62 | |
# Full user-facing help text for the importer: requirements, the recognized
# options (location, user, passwd, namespace, media_repo, limit) and a usage
# example. It is registered together with SHORT_DESC.
63 LONG_DESC = D_( | |
64 """This importer handle Dokuwiki blog engine. | |
65 | |
66 To use it, you need an admin access to a running Dokuwiki website | |
67 (local or on the Internet). The importer retrieves the data using | |
68 the XMLRPC Dokuwiki API. | |
69 | |
70 You can specify a namespace (that could be a namespace directory | |
71 or a single post) or leave it empty to use the root namespace "/" | |
72 and import all the posts. | |
73 | |
74 You can specify a new media repository to modify the internal | |
75 media links and make them point to the URL of your choice, but | |
76 note that the upload is not done automatically: a temporary | |
77 directory will be created on your local drive and you will | |
78 need to upload it yourself to your repository via SSH or FTP. | |
79 | |
80 Following options are recognized: | |
81 | |
82 location: DokuWiki site URL | |
83 user: DokuWiki admin user | |
84 passwd: DokuWiki admin password | |
85 namespace: DokuWiki namespace to import (default: root namespace "/") | |
86 media_repo: URL to the new remote media repository (default: none) | |
87 limit: maximal number of posts to import (default: 100) | |
88 | |
89 Example of usage (with jp frontend): | |
90 | |
91 jp import dokuwiki -p dave --pwd xxxxxx --connect | |
92 http://127.0.1.1 -o user souliane -o passwd qwertz | |
93 -o namespace public:2015:10 | |
94 -o media_repo http://media.diekulturvermittlung.at | |
95 | |
96 This retrieves the 100 last blog posts from http://127.0.1.1 that | |
97 are inside the namespace "public:2015:10" using the Dokuwiki user | |
98 "souliane", and it imports them to sat profile dave's microblog node. | |
99 Internal Dokuwiki media that were hosted on http://127.0.1.1 are now | |
100 pointing to http://media.diekulturvermittlung.at. | |
101 """ | |
102 ) | |
# Fallback values for the optional import parameters.
# Empty media repository: no media is downloaded, relative links are simply
# made absolute against the wiki URL.
DEFAULT_MEDIA_REPO = ""
# Root namespace: every page of the wiki is a candidate for import.
DEFAULT_NAMESPACE = "/"
DEFAULT_LIMIT = 100  # you might get a DBUS timeout (no reply) if it lasts too long
106 | |
107 | |
class Importer(DokuWiki):
    """DokuWiki client turning wiki pages into blog import data.

    Pages are retrieved through the DokuWiki API inherited from ``DokuWiki``,
    their rendered XHTML is rewritten (backlinks and media links), and the
    result is accumulated in ``self.posts_data``.
    """

    # <img src="..."> or <a href="..."> tag; group 3 is the link target
    _LINK_RE = re.compile(r"""<(img|a)[^>]* (src|href)="([^"]+)"[^>]*>""")
    # a link starting with a scheme ("http://", "ftp://"...) is already absolute
    _ABSOLUTE_URL_RE = re.compile(r"^\w*://")
    # media served through a PHP script located in /lib/exe/
    _USER_MEDIA_RE = re.compile(r"(/lib/exe/\w+\.php\?)(.*)")
    _PLUGINS_PREFIX = "/lib/plugins/"
    # date formats tried, in this order, on the last component of a page id
    _DATE_FORMATS = ("%y%m%d", "%Y%m%d")

    def __init__(
        self, url, user, passwd, media_repo=DEFAULT_MEDIA_REPO, limit=DEFAULT_LIMIT
    ):
        """

        @param url (unicode): DokuWiki site URL
        @param user (unicode): DokuWiki admin user
        @param passwd (unicode): DokuWiki admin password
        @param media_repo (unicode): New remote media repository
        @param limit (int, unicode): maximal number of posts to import
        @raise ValueError: limit can't be converted to an integer
        """
        DokuWiki.__init__(self, url, user, passwd)
        self.url = url
        self.media_repo = media_repo
        # media are only downloaded when a new repository has been requested
        self.temp_dir = tempfile.mkdtemp() if self.media_repo else None
        # the limit may be given as a string (e.g. when it comes from an options
        # dict); it is compared with an integer counter in process()
        self.limit = int(limit)
        self.posts_data = OrderedDict()

    def get_post_id(self, post):
        """Return a unique and constant post id

        @param post(dict): parsed post data
        @return (unicode): post unique item id
        """
        return str(post["id"])

    def get_post_updated(self, post):
        """Return the update date.

        @param post(dict): parsed post data
        @return (unicode): update date
        """
        return str(post["mtime"])

    def get_post_published(self, post):
        """Try to parse the date from the message ID, else use "mtime".

        The date can be extracted if the message ID looks like one of:
            - namespace:YYMMDD_short_title
            - namespace:YYYYMMDD_short_title
        @param post (dict): parsed post data
        @return (unicode): publication date
        """
        default = str(post["mtime"])
        # str.split always returns at least one element, so this can't fail
        date = str(post["id"]).split(":")[-1].split("_")[0]
        for date_format in self._DATE_FORMATS:
            try:
                time_struct = time.strptime(date, date_format)
            except ValueError:
                continue
            return str(calendar.timegm(time_struct))
        return default

    def process_post(self, post, profile_jid):
        """Process a single page.

        The resulting data is stored in self.posts_data, using the post id as key.
        @param post (dict): parsed post data
        @param profile_jid: jid of the importing profile, its "user" part is used
            as the post author
        """
        # get main information
        id_ = self.get_post_id(post)
        updated = self.get_post_updated(post)
        published = self.get_post_published(post)

        # manage links: local links of the page are handled like backlinks
        backlinks = self.pages.backlinks(id_)
        for link in self.pages.links(id_):
            if link["type"] != "local":
                # external links are kept untouched; unknown link types are
                # skipped too instead of aborting the whole import
                continue
            page = link["page"]
            backlinks.append(page[1:] if page.startswith(":") else page)

        # only the rendered page is needed: the raw wiki text used to be fetched
        # here but its result was discarded
        content_xhtml = self.process_content(self.pages.html(id_), backlinks, profile_jid)

        # XXX: title is already in content_xhtml and difficult to remove, so leave it
        # title = content.split("\n")[0].strip(u"\ufeff= ")

        # build the extra data dictionary
        mb_data = {
            "id": id_,
            "published": published,
            "updated": updated,
            "author": profile_jid.user,
            # "content": content,  # when passed, it is displayed in Libervia instead of content_xhtml
            "content_xhtml": content_xhtml,
            # "title": title,
            "allow_comments": "true",
        }

        # find out if the message access is public or restricted
        namespace = id_.split(":")[0]
        if namespace and namespace.lower() not in ("public", "/"):
            mb_data["group"] = namespace  # roster group must exist

        self.posts_data[id_] = {"blog": mb_data, "comments": [[]]}

    def process(self, client, namespace=DEFAULT_NAMESPACE):
        """Process a namespace or a single page.

        @param client: client of the importing profile (its jid is used)
        @param namespace (unicode): DokuWiki namespace (or page) to import
        @return (tuple): (iterator on the posts data, number of posts);
            an empty iterator and 0 are returned if the pages can't be listed
        """
        profile_jid = client.jid
        log.info("Importing data from DokuWiki %s" % self.version)
        try:
            pages_list = self.pages.list(namespace)
        except DokuWikiError:
            log.warning(
                'Could not list Dokuwiki pages: please turn the "display_errors" setting to "Off" in the php.ini of the webserver hosting DokuWiki.'
            )
            # keep the same return shape as a successful import, so that the
            # caller can unpack the result whatever happened
            return (iter(()), 0)

        if not pages_list:  # namespace is actually a page?
            real_namespace = ":".join(namespace.split(":")[:-1])
            pages_list = [
                page
                for page in self.pages.list(real_namespace)
                if page["id"] == namespace
            ]
            namespace = real_namespace

        for count, page in enumerate(pages_list, 1):
            self.process_post(page, profile_jid)
            if count >= self.limit:
                break

        return (iter(self.posts_data.values()), len(self.posts_data))

    def process_content(self, text, backlinks, profile_jid):
        """Do text substitutions and file copy.

        @param text (unicode): message content
        @param backlinks (list[unicode]): list of backlinks
        @param profile_jid: jid of the importing profile, its "user" part is used
            to build the new blog links
        @return (unicode): content with updated links
        """
        text = text.strip("\ufeff")  # this is at the beginning of the file (BOM)

        for backlink in backlinks:
            src = '/doku.php?id=%s"' % backlink
            tgt = '/blog/%s/%s" target="#"' % (profile_jid.user, backlink)
            text = text.replace(src, tgt)

        subs = {}

        for tag in self._LINK_RE.finditer(text):
            type_, attr, link = tag.groups()
            if (type_, attr) not in (("img", "src"), ("a", "href")):
                # the pattern can also match other tags starting with "a"
                # (e.g. <audio src="...">): ignore them rather than crash
                continue
            if self._ABSOLUTE_URL_RE.match(link):  # absolute URL to link directly
                continue
            if self.media_repo:
                self.move_media(link, subs)
            elif link not in subs:
                subs[link] = urllib.parse.urljoin(self.url, link)

        for url, new_url in subs.items():
            # links have been captured between double quotes: only replace them
            # there. A raw replacement would corrupt the document for short
            # links (href="/" would rewrite every slash) or when a link is the
            # prefix of another one.
            text = text.replace('"%s"' % url, '"%s"' % new_url)
        return text

    @staticmethod
    def _safe_relative_path(name):
        """Normalise a media name to a safe "/" separated relative path.

        @param name (unicode): path built from data found in the wiki content
        @return (unicode, None): cleaned relative path, or None if the path is
            empty or tries to go up in the directory tree
        """
        parts = [
            part for part in name.replace("\\", "/").split("/") if part not in ("", ".")
        ]
        if not parts or ".." in parts:
            return None
        return "/".join(parts)

    def move_media(self, link, subs):
        """Move a media from the DokuWiki host to the new repository.

        This also updates the hyperlinks to internal media files.
        @param link (unicode): media link, as found in the XHTML content
        @param subs (dict): substitutions data, updated in place
        """
        # local import: keeps this class independent from the module import list
        from html import unescape

        # the link comes from XHTML markup, where "&" is written "&amp;": it must
        # be unescaped before parsing the query, otherwise parameters are seen
        # as "amp;media", "amp;w", etc.
        raw_link = unescape(link)
        url = urllib.parse.urljoin(self.url, raw_link)
        user_media = self._USER_MEDIA_RE.match(raw_link)
        thumb_width = None

        if user_media:  # media that has been added by the user
            params = urllib.parse.parse_qs(urllib.parse.urlparse(url).query)
            try:
                media = params["media"][0]
            except KeyError:
                log.warning("No media found in fetch URL: %s" % user_media.group(2))
                return
            if self._ABSOLUTE_URL_RE.match(media):  # external URL to link directly
                subs[link] = media
                return
            # a thumbnail is created when a width is requested; the value is also
            # used as a directory name, so only plain positive integers are kept
            width = params.get("w", [""])[0]
            if width.isdigit() and int(width) > 0:
                thumb_width = width

            filename = media.replace(":", "/")
            # XXX: avoid "precondition failed" error (only keep the media parameter)
            url = urllib.parse.urljoin(
                self.url,
                "/lib/exe/fetch.php?media=%s" % urllib.parse.quote(media, safe=":/"),
            )

        elif raw_link.startswith(self._PLUGINS_PREFIX):
            # other link added by a plugin or something else
            filename = raw_link[len(self._PLUGINS_PREFIX):]
        else:  # fake alert... there's no media (or we don't handle it yet)
            return

        # the name comes from the wiki content: never let it escape temp_dir
        # (leading "/" or ".." components)
        filename = self._safe_relative_path(filename)
        if filename is None:
            log.warning("Ignoring DokuWiki media with an unsafe path: %s" % link)
            return

        filepath = os.path.join(self.temp_dir, *filename.split("/"))
        self.download_media(url, filepath)

        if thumb_width:
            filename = "/".join(("thumbs", thumb_width, filename))
            thumbnail = os.path.join(self.temp_dir, *filename.split("/"))
            self.create_thumbnail(filepath, thumbnail, thumb_width)

        # this is an URL, not a local path: always join with "/"
        subs[link] = "%s/%s" % (self.media_repo.rstrip("/"), filename)

    def download_media(self, source, dest):
        """Copy media to localhost.

        Nothing is downloaded if the destination file already exists.
        @param source (unicode): source url
        @param dest (unicode): target path
        """
        if not os.path.exists(dest):
            os.makedirs(os.path.dirname(dest), exist_ok=True)
            urllib.request.urlretrieve(source, dest)
        log.debug("DokuWiki media file copied to %s" % dest)

    def create_thumbnail(self, source, dest, width):
        """Create a thumbnail.

        Failures are logged, no exception is raised if the thumbnail can't be
        created.
        @param source (unicode): source file path
        @param dest (unicode): destination file path
        @param width (unicode, int): thumbnail's width
        """
        try:
            # the width usually comes from an URL parameter, i.e. it's a string
            width = int(width)
        except (TypeError, ValueError):
            width = 0
        if width <= 0:
            log.error("Cannot create DokuWiki media thumbnail %s" % dest)
            return

        os.makedirs(os.path.dirname(dest), exist_ok=True)
        try:
            with Image.open(source) as im:
                # im.size is (width, height): scale the height with the same
                # ratio as the width to keep the image proportions
                height = max(1, width * im.size[1] // im.size[0])
                im.thumbnail((width, height))
                im.save(dest)
            log.debug("DokuWiki media thumbnail created: %s" % dest)
        except IOError:
            log.error("Cannot create DokuWiki media thumbnail %s" % dest)
347 | |
348 | |
class DokuwikiImport(object):
    """Plugin entry point: registers the DokuWiki importer on the blog import plugin."""

    def __init__(self, host):
        """

        @param host: backend host, giving access to the other plugins
        """
        log.info(_("plugin Dokuwiki import initialization"))
        self.host = host
        self._blog_import = host.plugins["BLOG_IMPORT"]
        self._blog_import.register("dokuwiki", self.dk_import, SHORT_DESC, LONG_DESC)

    def dk_import(self, client, location, options=None):
        """import from DokuWiki to PubSub

        @param client: client of the profile doing the import
        @param location (unicode): DokuWiki site URL
        @param options (dict, None): DokuWiki import parameters
            - user (unicode): DokuWiki admin user
            - passwd (unicode): DokuWiki admin password
            - namespace (unicode): DokuWiki namespace to import
            - media_repo (unicode): New remote media repository
            - limit (int, unicode): maximal number of posts to import
            The dict is updated in place (host, and images upload option when a
            media repository is used).
        @return (Deferred): fired with the result of Importer.process
        @raise exceptions.DataError: "user" or "passwd" is missing, or "limit"
            is not an integer
        """
        if options is None:
            # None is the documented default: without this, the assignment
            # below fails with a TypeError instead of the explicit DataError
            options = {}
        options[self._blog_import.OPT_HOST] = location
        try:
            user = options["user"]
        except KeyError:
            raise exceptions.DataError('parameter "user" is required')
        try:
            passwd = options["passwd"]
        except KeyError:
            raise exceptions.DataError('parameter "passwd" is required')

        upload_key = self._blog_import.OPT_UPLOAD_IMAGES
        opt_upload_images = options.get(upload_key, None)
        # an empty media repository is the same as no repository at all: in this
        # case the Importer creates no temporary directory
        media_repo = options.get("media_repo") or DEFAULT_MEDIA_REPO
        if media_repo:
            if opt_upload_images:
                options[upload_key] = False  # force using --no-images-upload
            info_msg = _(
                "DokuWiki media files will be *downloaded* to {temp_dir} - to finish the import you have to upload them *manually* to {media_repo}"
            )
        elif opt_upload_images:
            info_msg = _(
                "DokuWiki media files will be *uploaded* to the XMPP server. Hyperlinks to these media may not been updated though."
            )
        else:
            info_msg = _(
                "DokuWiki media files will *stay* on {location} - some of them may be protected by DokuWiki ACL and will not be accessible."
            )

        namespace = options.get("namespace", DEFAULT_NAMESPACE)
        try:
            # option values may be strings, while the importer compares the
            # limit with an integer counter
            limit = int(options.get("limit", DEFAULT_LIMIT))
        except (TypeError, ValueError):
            raise exceptions.DataError('parameter "limit" must be an integer')

        dk_importer = Importer(location, user, passwd, media_repo, limit)
        info_msg = info_msg.format(
            temp_dir=dk_importer.temp_dir, media_repo=media_repo, location=location
        )
        self.host.action_new(
            {"xmlui": xml_tools.note(info_msg).toXml()}, profile=client.profile
        )
        # the import does blocking network requests: run it in a thread
        return threads.deferToThread(dk_importer.process, client, namespace)