comparison sat/plugins/plugin_blog_import_dokuwiki.py @ 2562:26edcf3a30eb

core, setup: huge cleaning: - moved directories from src and frontends/src to sat and sat_frontends, which is the recommended naming convention - move twisted directory to root - removed all hacks from setup.py, and added missing dependencies, it is now clean - use https URL for website in setup.py - removed "Environment :: X11 Applications :: GTK", as wix is deprecated and removed - renamed sat.sh to sat and fixed its installation - added python_requires to specify Python version needed - replaced glib2reactor which use deprecated code by gtk3reactor sat can now be installed directly from virtualenv without using --system-site-packages anymore \o/
author Goffi <goffi@goffi.org>
date Mon, 02 Apr 2018 19:44:50 +0200
parents src/plugins/plugin_blog_import_dokuwiki.py@0046283a285d
children 56f94936df1e
comparison
equal deleted inserted replaced
2561:bd30dc3ffe5a 2562:26edcf3a30eb
1 #!/usr/bin/env python2
2 # -*- coding: utf-8 -*-
3
4 # SàT plugin to import dokuwiki blogs
5 # Copyright (C) 2009-2018 Jérôme Poisson (goffi@goffi.org)
6 # Copyright (C) 2013-2016 Adrien Cossa (souliane@mailoo.org)
7
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU Affero General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU Affero General Public License for more details.
17
18 # You should have received a copy of the GNU Affero General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
21 from sat.core.i18n import _, D_
22 from sat.core.constants import Const as C
23 from sat.core.log import getLogger
24 log = getLogger(__name__)
25 from sat.core import exceptions
26 from sat.tools import xml_tools
27 from twisted.internet import threads
28 from collections import OrderedDict
29 import calendar
30 import urllib
31 import urlparse
32 import tempfile
33 import re
34 import time
35 import os.path
try:
    from dokuwiki import DokuWiki, DokuWikiError # this is a new dependency
except ImportError:
    raise exceptions.MissingModule(u'Missing module dokuwiki, please install it with "pip install dokuwiki"')
try:
    from PIL import Image # this is already needed by plugin XEP-0054
except ImportError:
    # fix: was a bare "except:", which also swallowed KeyboardInterrupt/SystemExit
    # and misreported any unrelated import-time failure as a missing module
    raise exceptions.MissingModule(u"Missing module pillow, please download/install it from https://python-pillow.github.io")
44
# plugin metadata consumed by the SàT plugin loader
PLUGIN_INFO = {
    C.PI_NAME: "Dokuwiki import",
    C.PI_IMPORT_NAME: "IMPORT_DOKUWIKI",
    C.PI_TYPE: C.PLUG_TYPE_BLOG,
    C.PI_DEPENDENCIES: ["BLOG_IMPORT"],  # registers itself on the generic blog importer
    C.PI_MAIN: "DokuwikiImport",
    C.PI_HANDLER: "no",
    C.PI_DESCRIPTION: _("""Blog importer for Dokuwiki blog engine.""")
}

# one-line description shown in the importers list
SHORT_DESC = D_(u"import posts from Dokuwiki blog engine")

# detailed help text (options, example invocation) shown to the user
LONG_DESC = D_(u"""This importer handle Dokuwiki blog engine.

To use it, you need an admin access to a running Dokuwiki website
(local or on the Internet). The importer retrieves the data using
the XMLRPC Dokuwiki API.

You can specify a namespace (that could be a namespace directory
or a single post) or leave it empty to use the root namespace "/"
and import all the posts.

You can specify a new media repository to modify the internal
media links and make them point to the URL of your choice, but
note that the upload is not done automatically: a temporary
directory will be created on your local drive and you will
need to upload it yourself to your repository via SSH or FTP.

Following options are recognized:

location: DokuWiki site URL
user: DokuWiki admin user
passwd: DokuWiki admin password
namespace: DokuWiki namespace to import (default: root namespace "/")
media_repo: URL to the new remote media repository (default: none)
limit: maximal number of posts to import (default: 100)

Example of usage (with jp frontend):

jp import dokuwiki -p dave --pwd xxxxxx --connect
    http://127.0.1.1 -o user souliane -o passwd qwertz
    -o namespace public:2015:10
    -o media_repo http://media.diekulturvermittlung.at

This retrieves the 100 last blog posts from http://127.0.1.1 that
are inside the namespace "public:2015:10" using the Dokuwiki user
"souliane", and it imports them to sat profile dave's microblog node.
Internal Dokuwiki media that were hosted on http://127.0.1.1 are now
pointing to http://media.diekulturvermittlung.at.
""")
# empty media_repo means: leave media links pointing at the DokuWiki host
DEFAULT_MEDIA_REPO = ""
# root namespace: import every post of the site
DEFAULT_NAMESPACE = "/"
DEFAULT_LIMIT = 100 # you might get a DBUS timeout (no reply) if it lasts too long
98
99
class Importer(DokuWiki):
    """Retrieve blog posts and media from a DokuWiki site via its XML-RPC API."""

    def __init__(self, url, user, passwd, media_repo=DEFAULT_MEDIA_REPO, limit=DEFAULT_LIMIT):
        """
        @param url (unicode): DokuWiki site URL
        @param user (unicode): DokuWiki admin user
        @param passwd (unicode): DokuWiki admin password
        @param media_repo (unicode): New remote media repository
        @param limit (int): maximal number of posts to import
        """
        DokuWiki.__init__(self, url, user, passwd)
        self.url = url
        self.media_repo = media_repo
        # media files are only copied locally when a new repository is requested
        self.temp_dir = tempfile.mkdtemp() if self.media_repo else None
        self.limit = limit
        self.posts_data = OrderedDict()

    def getPostId(self, post):
        """Return a unique and constant post id

        @param post(dict): parsed post data
        @return (unicode): post unique item id
        """
        return unicode(post['id'])

    def getPostUpdated(self, post):
        """Return the update date.

        @param post(dict): parsed post data
        @return (unicode): update date
        """
        return unicode(post['mtime'])

    def getPostPublished(self, post):
        """Try to parse the date from the message ID, else use "mtime".

        The date can be extracted if the message ID looks like one of:
            - namespace:YYMMDD_short_title
            - namespace:YYYYMMDD_short_title
        @param post (dict): parsed post data
        @return (unicode): publication date (Unix timestamp)
        """
        id_, default = unicode(post["id"]), unicode(post["mtime"])
        # fix: the original caught KeyError, which str.split and list indexing
        # never raise; IndexError is the only plausible failure here (defensive,
        # since split() always returns at least one element)
        try:
            date = id_.split(":")[-1].split("_")[0]
        except IndexError:
            return default
        try:
            time_struct = time.strptime(date, "%y%m%d")
        except ValueError:
            try:
                time_struct = time.strptime(date, "%Y%m%d")
            except ValueError:
                return default
        return unicode(calendar.timegm(time_struct))

    def processPost(self, post, profile_jid):
        """Process a single page.

        @param post (dict): parsed post data
        @param profile_jid (jid.JID): JID of the importing profile
        """
        # get main information
        id_ = self.getPostId(post)
        updated = self.getPostUpdated(post)
        published = self.getPostPublished(post)

        # manage links: internal links become backlinks so that processContent
        # can rewrite them to the new blog URLs
        backlinks = self.pages.backlinks(id_)
        for link in self.pages.links(id_):
            if link["type"] != "extern":
                assert link["type"] == "local"
                page = link["page"]
                backlinks.append(page[1:] if page.startswith(":") else page)

        # NOTE(review): return value unused — presumably needed to make the
        # following html() call work on the DokuWiki side; confirm before removing
        self.pages.get(id_)
        content_xhtml = self.processContent(self.pages.html(id_), backlinks, profile_jid)

        # XXX: title is already in content_xhtml and difficult to remove, so leave it
        # title = content.split("\n")[0].strip(u"\ufeff= ")

        # build the extra data dictionary
        mb_data = {"id": id_,
                   "published": published,
                   "updated": updated,
                   "author": profile_jid.user,
                   # "content": content,  # when passed, it is displayed in Libervia instead of content_xhtml
                   "content_xhtml": content_xhtml,
                   # "title": title,
                   "allow_comments": "true",
                   }

        # find out if the message access is public or restricted
        namespace = id_.split(":")[0]
        if namespace and namespace.lower() not in ("public", "/"):
            mb_data["group"] = namespace  # roster group must exist

        self.posts_data[id_] = {'blog': mb_data, 'comments': [[]]}

    def process(self, client, namespace=DEFAULT_NAMESPACE):
        """Process a namespace or a single page.

        @param client: SàT client owning the destination profile
        @param namespace (unicode): DokuWiki namespace (or page) to import
        @return (tuple(iterator, int)): posts data and their count, or None
            when the pages list could not be retrieved
        """
        profile_jid = client.jid
        log.info("Importing data from DokuWiki %s" % self.version)
        try:
            pages_list = self.pages.list(namespace)
        except DokuWikiError:
            log.warning('Could not list Dokuwiki pages: please turn the "display_errors" setting to "Off" in the php.ini of the webserver hosting DokuWiki.')
            return

        if not pages_list:  # namespace is actually a page?
            names = namespace.split(":")
            real_namespace = ":".join(names[0:-1])
            pages_list = self.pages.list(real_namespace)
            pages_list = [page for page in pages_list if page["id"] == namespace]
            namespace = real_namespace

        count = 0
        for page in pages_list:
            self.processPost(page, profile_jid)
            count += 1
            if count >= self.limit:
                break

        return (self.posts_data.itervalues(), len(self.posts_data))

    def processContent(self, text, backlinks, profile_jid):
        """Do text substitutions and file copy.

        @param text (unicode): message content
        @param backlinks (list[unicode]): list of backlinks
        @param profile_jid (jid.JID): JID of the importing profile
        @return (unicode): processed message content
        """
        text = text.strip(u"\ufeff")  # this is at the beginning of the file (BOM)

        # rewrite internal wiki links to the profile's blog URLs
        for backlink in backlinks:
            src = '/doku.php?id=%s"' % backlink
            tgt = '/blog/%s/%s" target="#"' % (profile_jid.user, backlink)
            text = text.replace(src, tgt)

        subs = {}

        # collect relative media/href links; absolute URLs are kept untouched
        link_pattern = r"""<(img|a)[^>]* (src|href)="([^"]+)"[^>]*>"""
        for tag in re.finditer(link_pattern, text):
            type_, attr, link = tag.group(1), tag.group(2), tag.group(3)
            assert (type_ == "img" and attr == "src") or (type_ == "a" and attr == "href")
            if re.match(r"^\w*://", link):  # absolute URL to link directly
                continue
            if self.media_repo:
                self.moveMedia(link, subs)
            elif link not in subs:
                subs[link] = urlparse.urljoin(self.url, link)

        for url, new_url in subs.iteritems():
            text = text.replace(url, new_url)
        return text

    def moveMedia(self, link, subs):
        """Move a media from the DokuWiki host to the new repository.

        This also updates the hyperlinks to internal media files.
        @param link (unicode): media link
        @param subs (dict): substitutions data, updated in place
        """
        url = urlparse.urljoin(self.url, link)
        user_media = re.match(r"(/lib/exe/\w+.php\?)(.*)", link)
        thumb_width = None

        if user_media:  # media that has been added by the user
            params = urlparse.parse_qs(urlparse.urlparse(url).query)
            try:
                media = params["media"][0]
            except KeyError:
                log.warning("No media found in fetch URL: %s" % user_media.group(2))
                return
            if re.match(r"^\w*://", media):  # external URL to link directly
                subs[link] = media
                return
            try:  # create thumbnail
                thumb_width = params["w"][0]
            except KeyError:
                pass

            filename = media.replace(":", "/")
            # XXX: avoid "precondition failed" error (only keep the media parameter)
            url = urlparse.urljoin(self.url, "/lib/exe/fetch.php?media=%s" % media)

        elif link.startswith("/lib/plugins/"):
            # other link added by a plugin or something else
            filename = link[13:]
        else:  # fake alert... there's no media (or we don't handle it yet)
            return

        filepath = os.path.join(self.temp_dir, filename)
        self.downloadMedia(url, filepath)

        if thumb_width:
            filename = os.path.join("thumbs", thumb_width, filename)
            thumbnail = os.path.join(self.temp_dir, filename)
            self.createThumbnail(filepath, thumbnail, thumb_width)

        # NOTE(review): os.path.join on a URL only works with "/" separators;
        # fine on POSIX hosts, would break on Windows
        new_url = os.path.join(self.media_repo, filename)
        subs[link] = new_url

    def downloadMedia(self, source, dest):
        """Copy media to localhost.

        @param source (unicode): source url
        @param dest (unicode): target path
        """
        dirname = os.path.dirname(dest)
        if not os.path.exists(dest):
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            urllib.urlretrieve(source, dest)
            log.debug("DokuWiki media file copied to %s" % dest)

    def createThumbnail(self, source, dest, width):
        """Create a thumbnail.

        @param source (unicode): source file path
        @param dest (unicode): destination file path
        @param width (unicode): thumbnail's width
        """
        thumb_dir = os.path.dirname(dest)
        if not os.path.exists(thumb_dir):
            os.makedirs(thumb_dir)
        try:
            im = Image.open(source)
            # fix: the proportional height is width * orig_height / orig_width
            # (im.size is (width, height)); the original inverted the ratio,
            # which made portrait thumbnails narrower than requested. Also pass
            # the width as an int — it arrives as a unicode query parameter.
            width = int(width)
            im.thumbnail((width, width * im.size[1] / im.size[0]))
            im.save(dest)
            log.debug("DokuWiki media thumbnail created: %s" % dest)
        except IOError:
            log.error("Cannot create DokuWiki media thumbnail %s" % dest)
335
336
337
class DokuwikiImport(object):
    """Plugin entry point: registers the Dokuwiki importer with BLOG_IMPORT."""

    def __init__(self, host):
        log.info(_("plugin Dokuwiki Import initialization"))
        self.host = host
        self._blog_import = host.plugins['BLOG_IMPORT']
        self._blog_import.register('dokuwiki', self.DkImport, SHORT_DESC, LONG_DESC)

    def DkImport(self, client, location, options=None):
        """Import from DokuWiki to PubSub

        @param client: SàT client whose profile receives the imported posts
        @param location (unicode): DokuWiki site URL
        @param options (dict, None): DokuWiki import parameters
            - user (unicode): DokuWiki admin user
            - passwd (unicode): DokuWiki admin password
            - namespace (unicode): DokuWiki namespace to import
            - media_repo (unicode): New remote media repository
            - limit (int): maximal number of posts to import
        @return (Deferred): fires with the importer's process() result
        @raise exceptions.DataError: "user" or "passwd" option is missing
        """
        if options is None:
            # fix: the advertised default (options=None) crashed with a
            # TypeError on the next line before this guard was added
            options = {}
        options[self._blog_import.OPT_HOST] = location
        try:
            user = options["user"]
        except KeyError:
            raise exceptions.DataError('parameter "user" is required')
        try:
            passwd = options["passwd"]
        except KeyError:
            raise exceptions.DataError('parameter "passwd" is required')

        opt_upload_images = options.get(self._blog_import.OPT_UPLOAD_IMAGES, None)
        try:
            media_repo = options["media_repo"]
            if opt_upload_images:
                options[self._blog_import.OPT_UPLOAD_IMAGES] = False  # force using --no-images-upload
            info_msg = _("DokuWiki media files will be *downloaded* to {temp_dir} - to finish the import you have to upload them *manually* to {media_repo}")
        except KeyError:
            media_repo = DEFAULT_MEDIA_REPO
            if opt_upload_images:
                info_msg = _("DokuWiki media files will be *uploaded* to the XMPP server. Hyperlinks to these media may not been updated though.")
            else:
                info_msg = _("DokuWiki media files will *stay* on {location} - some of them may be protected by DokuWiki ACL and will not be accessible.")

        try:
            namespace = options["namespace"]
        except KeyError:
            namespace = DEFAULT_NAMESPACE
        try:
            limit = options["limit"]
        except KeyError:
            limit = DEFAULT_LIMIT

        dk_importer = Importer(location, user, passwd, media_repo, limit)
        info_msg = info_msg.format(temp_dir=dk_importer.temp_dir, media_repo=media_repo, location=location)
        self.host.actionNew({'xmlui': xml_tools.note(info_msg).toXml()}, profile=client.profile)
        # run the blocking XML-RPC import in a thread to keep the reactor responsive
        d = threads.deferToThread(dk_importer.process, client, namespace)
        return d