comparison src/plugins/plugin_blog_import_dokuwiki.py @ 1841:7717975b3ec3

plugin blog_import_dokuwiki: first draft
author souliane <souliane@mailoo.org>
date Thu, 04 Feb 2016 12:29:43 +0100
parents
children 9fd517248dc8
comparison
equal deleted inserted replaced
1840:9eabf7fadfdd 1841:7717975b3ec3
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
3
4 # SàT plugin to import external blogs
5 # Copyright (C) 2009-2016 Jérôme Poisson (goffi@goffi.org)
6 # Copyright (C) 2013-2016 Adrien Cossa (souliane@mailoo.org)
7
8 # This program is free software: you can redistribute it and/or modify
9 # it under the terms of the GNU Affero General Public License as published by
10 # the Free Software Foundation, either version 3 of the License, or
11 # (at your option) any later version.
12
13 # This program is distributed in the hope that it will be useful,
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 # GNU Affero General Public License for more details.
17
18 # You should have received a copy of the GNU Affero General Public License
19 # along with this program. If not, see <http://www.gnu.org/licenses/>.
20
21 from sat.core.i18n import _, D_
22 from sat.core.constants import Const as C
23 from sat.core.log import getLogger
24 log = getLogger(__name__)
25 from sat.core import exceptions
26 from sat.tools import xml_tools
27 from twisted.internet import threads
28 from collections import OrderedDict
29 import calendar
30 import urllib
31 import urlparse
32 import tempfile
33 import re
34 import time
35 import os.path
36 try:
37 from dokuwiki import DokuWiki, DokuWikiError # this is a new dependency
38 except ImportError:
39 raise exceptions.MissingModule(u'Missing module dokuwiki, please install it with "pip install dokuwiki"')
40 try:
41 from PIL import Image # this is already needed by plugin XEP-0054
42 except:
43 raise exceptions.MissingModule(u"Missing module pillow, please download/install it from https://python-pillow.github.io")
44 try:
45 from plugin_blog_import import OPT_HOST, OPT_UPLOAD_IMAGES
46 except:
47 raise exceptions.MissingModule(u'Missing module plugin_blog_import, please install the Salut à Toi "BLOG_IMPORT" plugin')
48
# Plugin metadata.
# - "main" names the plugin class defined in this module (DokuwikiImport).
# - "dependencies" lists BLOG_IMPORT, which DokuwikiImport.__init__ accesses
#   through host.plugins['BLOG_IMPORT'] to register the importer.
# NOTE(review): presumably read by the SaT plugin loader - confirm the exact
# meaning of the other keys against the loader.
PLUGIN_INFO = {
    "name": "Dokuwiki import",
    "import_name": "IMPORT_DOKUWIKI",
    "type": C.PLUG_TYPE_BLOG,
    "dependencies": ["BLOG_IMPORT"],
    "main": "DokuwikiImport",
    "handler": "no",
    "description": _("""Blog importer for Dokuwiki blog engine.""")
}
58
# Short and long descriptions of the importer; both are passed to the
# BLOG_IMPORT plugin when the importer is registered (see DokuwikiImport).
# They are marked for translation with D_, so the text is part of the
# translation catalog and must not be reworded casually.
SHORT_DESC = D_(u"import posts from Dokuwiki blog engine")

LONG_DESC = D_(u"""This importer handle Dokuwiki blog engine.

To use it, you need an admin access to a running Dokuwiki website
(local or on the Internet). The importer retrieves the data using
the XMLRPC Dokuwiki API.

You can specify a namespace (that could be a namespace directory
or a single post) or leave it empty to use the root namespace "/"
and import all the posts.

You can specify a new media repository to modify the internal
media links and make them point to the URL of your choice, but
note that the upload is not done automatically: a temporary
directory will be created on your local drive and you will
need to upload it yourself to your repository via SSH or FTP.

Following options are recognized:

location: DokuWiki site URL
user: DokuWiki admin user
passwd: DokuWiki admin password
namespace: DokuWiki namespace to import (default: root namespace "/")
media_repo: URL to the new remote media repository (default: none)
limit: maximal number of posts to import (default: 100)

Example of usage:

jp import dokuwiki -p dave --pwd xxxxxx --connect
http://127.0.1.1 -o user souliane -o passwd qwertz
-o namespace public:2015:10
-o media_repo http://media.diekulturvermittlung.at

This retrieves the 100 last blog posts from http://127.0.1.1 that
are inside the namespace "public:2015:10" using the Dokuwiki user
"souliane", and it imports them to sat profile dave's microblog node.
Internal Dokuwiki media that were hosted on http://127.0.1.1 are now
pointing to http://media.diekulturvermittlung.at.
""")
# Default values for the optional import parameters.
# An empty media repository means that media links keep pointing to the
# DokuWiki host (no local download is done).
DEFAULT_MEDIA_REPO = ""
# Root namespace: all the pages are listed.
DEFAULT_NAMESPACE = "/"
# Maximal number of posts imported in one run.
DEFAULT_LIMIT = 100 # you might get a DBUS timeout (no reply) if it lasts too long
102
103
class Importer(DokuWiki):
    """DokuWiki client converting wiki pages to microblog data.

    Pages are retrieved with the API inherited from DokuWiki. Internal links
    are rewritten and, when a media repository is set, the media files are
    downloaded to a local temporary directory (the upload to the repository
    is left to the user).
    """

    def __init__(self, url, user, passwd, media_repo=DEFAULT_MEDIA_REPO, limit=DEFAULT_LIMIT):
        """

        @param url (unicode): DokuWiki site URL
        @param user (unicode): DokuWiki admin user
        @param passwd (unicode): DokuWiki admin password
        @param media_repo (unicode): New remote media repository
        @param limit (int, unicode): maximal number of posts to import
        @raise ValueError: limit can not be converted to an integer
        """
        DokuWiki.__init__(self, url, user, passwd)
        self.url = url
        self.media_repo = media_repo
        self.temp_dir = None
        # the interpolation is done *after* the translation, else the
        # already formatted message would never be found in the catalog
        if self.media_repo:
            self.temp_dir = tempfile.mkdtemp()
            self.info_msg = _("DokuWiki media files will be downloaded to %s - to finish the import you will need to upload them to %s") % (self.temp_dir, self.media_repo)
        else:
            self.info_msg = _("DokuWiki media files will stay on %s - some of them may be protected by DokuWiki ACL and will not be accessible from XMPP.") % url
        # options are received as text: comparing an int to a string never
        # stops the import with Python 2, so the limit is converted here
        self.limit = int(limit)
        self.posts_data = OrderedDict()

    def process(self, client, namespace=DEFAULT_NAMESPACE):
        """Process a namespace or a single page.

        @param client: client whose jid is used as author and in rewritten links
        @param namespace (unicode): DokuWiki namespace (or page) to import
        @return (tuple, None): (iterator on the posts data, number of posts),
            or None if the pages could not be listed
        """
        profile_jid = client.jid
        log.info("Importing data from DokuWiki %s" % self.version)
        try:
            pages_list = self.pages.list(namespace)
        except DokuWikiError:
            log.warning('Could not list Dokuwiki pages: please turn the "display_errors" setting to "Off" in the php.ini of the webserver hosting DokuWiki.')
            # NOTE(review): callers get None here instead of a tuple - confirm
            # that the blog import plugin handles it
            return

        if not pages_list:  # namespace is actually a page?
            real_namespace = ":".join(namespace.split(":")[:-1])
            pages_list = [page for page in self.pages.list(real_namespace)
                          if page["id"] == namespace]

        count = 0
        for page in pages_list:

            # get main information
            id_ = unicode(page['id'])
            updated = unicode(page['mtime'])
            published = self.getOriginalDate(page)

            # manage links: pages linking to this one, plus the local pages
            # that this one links to
            backlinks = self.pages.backlinks(id_)
            for link in self.pages.links(id_):
                if link["type"] == "extern":
                    continue
                if link["type"] != "local":
                    log.warning("Ignoring DokuWiki link of unknown type: %s" % link["type"])
                    continue
                target = link["page"]
                backlinks.append(target[1:] if target.startswith(":") else target)

            content_xhtml = self.processContent(self.pages.html(id_), backlinks, profile_jid)

            # XXX: title is already in content_xhtml and difficult to remove,
            #      so neither "title" nor the raw "content" are set (when
            #      "content" is passed, it is displayed in Libervia instead
            #      of content_xhtml)

            # build the extra data dictionary
            mb_data = {"id": id_,
                       "published": published,
                       "updated": updated,
                       "author": profile_jid.user,
                       "content_xhtml": content_xhtml,
                       "allow_comments": "true",
                       }

            # find out if the message access is public or restricted
            group = id_.split(":")[0]
            if group and group.lower() not in ("public", "/"):
                mb_data["group"] = group  # roster group must exist

            self.posts_data[id_] = {'blog': mb_data, 'comments': [[]]}

            count += 1
            if count >= self.limit:
                break

        return (self.posts_data.itervalues(), len(self.posts_data))

    def getOriginalDate(self, page):
        """Try to parse the date from the message ID, else use "mtime".

        The date can be extracted if the message ID looks like one of:
            - namespace:YYMMDD_short_title
            - namespace:YYYYMMDD_short_title
        @param page (dict): message page
        @return unicode
        """
        id_, default = unicode(page["id"]), unicode(page["mtime"])
        date = id_.split(":")[-1].split("_")[0]
        # strptime is lenient with short strings ("2015" is accepted by
        # "%y%m%d"), so the expected number of digits is checked first
        formats = {6: "%y%m%d", 8: "%Y%m%d"}
        if not date.isdigit() or len(date) not in formats:
            return default
        try:
            time_struct = time.strptime(date, formats[len(date)])
        except ValueError:
            return default
        return unicode(calendar.timegm(time_struct))

    def processContent(self, text, backlinks, profile_jid):
        """Do text substitutions and file copy.

        @param text (unicode): message content
        @param backlinks (list[unicode]): list of backlinks
        @param profile_jid: jid whose user part is used in the new blog links
        @return unicode: content with updated links
        """
        text = text.strip(u"\ufeff")  # this is at the beginning of the file (BOM)

        for backlink in backlinks:
            src = '/doku.php?id=%s"' % backlink
            tgt = '/blog/%s/%s" target="#"' % (profile_jid.user, backlink)
            text = text.replace(src, tgt)

        subs = []
        seen = set()  # a link used several times must be handled only once

        link_pattern = r"""<(img|a)[^>]* (src|href)="([^"]+)"[^>]*>"""
        for tag in re.finditer(link_pattern, text):
            type_, attr, link = tag.group(1), tag.group(2), tag.group(3)
            if (type_, attr) not in (("img", "src"), ("a", "href")):
                continue
            if re.match(r"^\w*://", link):  # absolute URL to link directly
                continue
            if link in seen:
                continue
            seen.add(link)
            if self.media_repo:
                self.moveMedia(link, subs)
            else:
                subs.append((link, urlparse.urljoin(self.url, link)))

        # the quotes of the attribute are part of the replaced text: the new
        # URL may contain the old link, which must not be substituted again
        for url, new_url in subs:
            text = text.replace('"%s"' % url, '"%s"' % new_url)
        return text

    def moveMedia(self, link, subs):
        """Move a media from the DokuWiki host to the new repository.

        The media is downloaded to the temporary directory, and the
        substitution to do in the content is appended to subs.

        @param link (unicode): media link
        @param subs (list[tuple]): substitutions list, modified in place
        """
        url = urlparse.urljoin(self.url, link)
        user_media = re.match(r"(/lib/exe/\w+\.php\?)(.*)", link)
        thumb_width = None

        if user_media:  # media that has been added by the user
            params = urlparse.parse_qs(urlparse.urlparse(url).query)
            try:
                media = params["media"][0]
            except KeyError:
                log.warning("No media found in fetch URL: %s" % user_media.group(2))
                return
            if re.match(r"^\w*://", media):  # external URL to link directly
                subs.append((link, media))
                return
            try:  # create thumbnail
                thumb_width = params["w"][0]
            except KeyError:
                pass
            else:
                if not thumb_width.isdigit():
                    thumb_width = None

            filename = media.replace(":", "/")
            # XXX: avoid "precondition failed" error (only keep the media parameter)
            url = urlparse.urljoin(self.url, "/lib/exe/fetch.php?media=%s" % media)

        elif link.startswith("/lib/plugins/"):
            # other link added by a plugin or something else
            filename = link[13:]
        else:  # fake alert... there's no media (or we don't handle it yet)
            return

        # a leading "/" (media ID starting with ":") or a ".." component
        # would make the file land outside of the temporary directory
        filename = filename.lstrip("/")
        if not filename or ".." in filename.split("/"):
            log.warning("Ignoring DokuWiki media with unsafe path: %s" % link)
            return

        filepath = os.path.join(self.temp_dir, filename)
        self.downloadMedia(url, filepath)

        if thumb_width:
            filename = "/".join(("thumbs", thumb_width, filename))
            thumbnail = os.path.join(self.temp_dir, filename)
            self.createThumbnail(filepath, thumbnail, thumb_width)

        # this is an URL, not a local path: always join with "/"
        new_url = "%s/%s" % (self.media_repo.rstrip("/"), filename)
        subs.append((link, new_url))

    def downloadMedia(self, source, dest):
        """Copy media to localhost.

        Nothing is done if the destination file already exists.

        @param source (unicode): source url
        @param dest (unicode): target path
        """
        dirname = os.path.dirname(dest)
        if not os.path.exists(dest):
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            urllib.urlretrieve(source, dest)
            log.debug("DokuWiki media file copied to %s" % dest)

    def createThumbnail(self, source, dest, width):
        """Create a thumbnail.

        @param source (unicode): source file path
        @param dest (unicode): destination file path
        @param width (unicode): thumbnail's width
        """
        try:
            width = int(width)  # the width comes from the URL, as text
            if width <= 0:
                raise ValueError
        except (TypeError, ValueError):
            log.error("Cannot create DokuWiki media thumbnail %s" % dest)
            return
        thumb_dir = os.path.dirname(dest)
        if not os.path.exists(thumb_dir):
            os.makedirs(thumb_dir)
        try:
            im = Image.open(source)
            # im.size is (width, height): keep the aspect ratio
            height = max(width * im.size[1] // im.size[0], 1)
            im.thumbnail((width, height))
            im.save(dest)
            log.debug("DokuWiki media thumbnail created: %s" % dest)
        except IOError:
            log.error("Cannot create DokuWiki media thumbnail %s" % dest)
320
321
322
class DokuwikiImport(object):
    """Plugin registering the DokuWiki importer to the BLOG_IMPORT plugin."""

    def __init__(self, host):
        """
        @param host: SaT host; its BLOG_IMPORT plugin must be loaded
        """
        log.info(_("plugin Dokuwiki Import initialization"))
        self.host = host
        host.plugins['BLOG_IMPORT'].register('dokuwiki', self.DkImport, SHORT_DESC, LONG_DESC)

    def DkImport(self, client, location, options=None):
        """Import from DokuWiki to PubSub

        @param client: client of the profile doing the import
        @param location (unicode): DokuWiki site URL
        @param options (dict, None): DokuWiki import parameters
            - user (unicode): DokuWiki admin user
            - passwd (unicode): DokuWiki admin password
            - namespace (unicode): DokuWiki namespace to import
            - media_repo (unicode): New remote media repository
            - limit (unicode): maximal number of posts to import
        @return (Deferred): fired with the result of Importer.process
        @raise exceptions.DataError: a required parameter is missing,
            or limit is not an integer
        """
        if options is None:
            # "user" and "passwd" are required, so this will fail below with
            # a DataError rather than with a TypeError
            options = {}
        options[OPT_HOST] = location
        try:
            user = options["user"]
        except KeyError:
            raise exceptions.DataError('parameter "user" is required')
        try:
            passwd = options["passwd"]
        except KeyError:
            raise exceptions.DataError('parameter "passwd" is required')
        media_repo = options.get("media_repo", DEFAULT_MEDIA_REPO)
        if options.get(OPT_UPLOAD_IMAGES, None):
            # TODO: when media_repo is not defined or empty, we should not force this option
            options[OPT_UPLOAD_IMAGES] = False
            msg = _(u"Option --no-images-upload will be used by force.")
            self.host.actionNew({'xmlui': xml_tools.note(msg).toXml()}, profile=client.profile)
        namespace = options.get("namespace", DEFAULT_NAMESPACE)
        # options are received as text: the limit must be converted, else
        # it would never be reached when compared to the posts counter
        try:
            limit = int(options.get("limit", DEFAULT_LIMIT))
        except (TypeError, ValueError):
            raise exceptions.DataError('parameter "limit" must be an integer')
        dk_importer = Importer(location, user, passwd, media_repo, limit)
        self.host.actionNew({'xmlui': xml_tools.note(dk_importer.info_msg).toXml()}, profile=client.profile)
        # the import blocks on network requests, so it runs in a thread
        return threads.deferToThread(dk_importer.process, client, namespace)