Mercurial > libervia-backend
comparison src/plugins/plugin_blog_import_dokuwiki.py @ 1841:7717975b3ec3
plugin blog_import_dokuwiki: first draft
author | souliane <souliane@mailoo.org> |
---|---|
date | Thu, 04 Feb 2016 12:29:43 +0100 |
parents | |
children | 9fd517248dc8 |
comparison
equal
deleted
inserted
replaced
1840:9eabf7fadfdd | 1841:7717975b3ec3 |
---|---|
1 #!/usr/bin/python | |
2 # -*- coding: utf-8 -*- | |
3 | |
4 # SàT plugin to import external blogs | |
5 # Copyright (C) 2009-2016 Jérôme Poisson (goffi@goffi.org) | |
6 # Copyright (C) 2013-2016 Adrien Cossa (souliane@mailoo.org) | |
7 | |
8 # This program is free software: you can redistribute it and/or modify | |
9 # it under the terms of the GNU Affero General Public License as published by | |
10 # the Free Software Foundation, either version 3 of the License, or | |
11 # (at your option) any later version. | |
12 | |
13 # This program is distributed in the hope that it will be useful, | |
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 # GNU Affero General Public License for more details. | |
17 | |
18 # You should have received a copy of the GNU Affero General Public License | |
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | |
21 from sat.core.i18n import _, D_ | |
22 from sat.core.constants import Const as C | |
23 from sat.core.log import getLogger | |
24 log = getLogger(__name__) | |
25 from sat.core import exceptions | |
26 from sat.tools import xml_tools | |
27 from twisted.internet import threads | |
28 from collections import OrderedDict | |
29 import calendar | |
30 import urllib | |
31 import urlparse | |
32 import tempfile | |
33 import re | |
34 import time | |
35 import os.path | |
# Third-party and sibling-plugin dependencies: fail early, with an actionable
# message, when one of them can not be imported. Only ImportError is handled so
# that unrelated failures (or a KeyboardInterrupt) are not reported as a
# missing module.
try:
    from dokuwiki import DokuWiki, DokuWikiError  # this is a new dependency
except ImportError:
    raise exceptions.MissingModule(u'Missing module dokuwiki, please install it with "pip install dokuwiki"')
try:
    from PIL import Image  # this is already needed by plugin XEP-0054
except ImportError:
    raise exceptions.MissingModule(u"Missing module pillow, please download/install it from https://python-pillow.github.io")
try:
    from plugin_blog_import import OPT_HOST, OPT_UPLOAD_IMAGES
except ImportError:
    raise exceptions.MissingModule(u'Missing module plugin_blog_import, please install the Salut à Toi "BLOG_IMPORT" plugin')
48 | |
# Plugin metadata: display name, import name, type, required plugins and the
# name of the main class defined in this module.
PLUGIN_INFO = {
    "name": "Dokuwiki import",
    "import_name": "IMPORT_DOKUWIKI",
    "type": C.PLUG_TYPE_BLOG,
    "dependencies": ["BLOG_IMPORT"],
    "main": "DokuwikiImport",
    "handler": "no",
    "description": _("Blog importer for Dokuwiki blog engine."),
}

# One-line description given to the BLOG_IMPORT plugin when registering.
SHORT_DESC = D_(u"import posts from Dokuwiki blog engine")
60 | |
61 LONG_DESC = D_(u"""This importer handle Dokuwiki blog engine. | |
62 | |
63 To use it, you need an admin access to a running Dokuwiki website | |
64 (local or on the Internet). The importer retrieves the data using | |
65 the XMLRPC Dokuwiki API. | |
66 | |
67 You can specify a namespace (that could be a namespace directory | |
68 or a single post) or leave it empty to use the root namespace "/" | |
69 and import all the posts. | |
70 | |
71 You can specify a new media repository to modify the internal | |
72 media links and make them point to the URL of your choice, but | |
73 note that the upload is not done automatically: a temporary | |
74 directory will be created on your local drive and you will | |
75 need to upload it yourself to your repository via SSH or FTP. | |
76 | |
77 Following options are recognized: | |
78 | |
79 location: DokuWiki site URL | |
80 user: DokuWiki admin user | |
81 passwd: DokuWiki admin password | |
82 namespace: DokuWiki namespace to import (default: root namespace "/") | |
83 media_repo: URL to the new remote media repository (default: none) | |
84 limit: maximal number of posts to import (default: 100) | |
85 | |
86 Example of usage: | |
87 | |
88 jp import dokuwiki -p dave --pwd xxxxxx --connect | |
89 http://127.0.1.1 -o user souliane -o passwd qwertz | |
90 -o namespace public:2015:10 | |
91 -o media_repo http://media.diekulturvermittlung.at | |
92 | |
93 This retrieves the 100 last blog posts from http://127.0.1.1 that | |
94 are inside the namespace "public:2015:10" using the Dokuwiki user | |
95 "souliane", and it imports them to sat profile dave's microblog node. | |
96 Internal Dokuwiki media that were hosted on http://127.0.1.1 are now | |
97 pointing to http://media.diekulturvermittlung.at. | |
98 """) | |
# Defaults used when the matching import option is not given.
DEFAULT_MEDIA_REPO = ""  # empty: media links keep pointing to the DokuWiki host
DEFAULT_NAMESPACE = "/"  # root namespace
# you might get a DBUS timeout (no reply) if it lasts too long
DEFAULT_LIMIT = 100
102 | |
103 | |
104 class Importer(DokuWiki): | |
105 | |
def __init__(self, url, user, passwd, media_repo=DEFAULT_MEDIA_REPO, limit=DEFAULT_LIMIT):
    """Connect to the DokuWiki site and prepare the import state.

    When a media repository is given, a temporary directory is created to
    receive the downloaded media files.

    @param url (unicode): DokuWiki site URL
    @param user (unicode): DokuWiki admin user
    @param passwd (unicode): DokuWiki admin password
    @param media_repo (unicode): New remote media repository
    @param limit (int, unicode): maximal number of posts to import
    @raise ValueError: limit can not be converted to an integer
    """
    # the limit may arrive as text when it comes from the import options; it
    # must be a number to be comparable with the posts counter
    limit = int(limit)
    DokuWiki.__init__(self, url, user, passwd)
    self.url = url
    self.media_repo = media_repo
    self.temp_dir = None
    # the message template is translated first and filled in afterwards: the
    # translation catalog can only match the unformatted message
    if self.media_repo:
        self.temp_dir = tempfile.mkdtemp()
        self.info_msg = _("DokuWiki media files will be downloaded to %s - to finish the import you will need to upload them to %s") % (self.temp_dir, self.media_repo)
    else:
        self.info_msg = _("DokuWiki media files will stay on %s - some of them may be protected by DokuWiki ACL and will not be accessible from XMPP.") % url
    self.limit = limit
    self.posts_data = OrderedDict()
125 | |
def process(self, client, namespace=DEFAULT_NAMESPACE):
    """Process a namespace or a single page.

    Each imported page is stored in self.posts_data, indexed by its ID.

    @param client: client of the importing profile, its jid is used as author
    @param namespace (unicode): DokuWiki namespace (or page) to import
    @return (tuple, None): (iterator on the posts data, number of posts),
        or None if the pages could not be listed
    """
    profile_jid = client.jid
    log.info("Importing data from DokuWiki %s" % self.version)
    try:
        pages_list = self.pages.list(namespace)
    except DokuWikiError:
        log.warning('Could not list Dokuwiki pages: please turn the "display_errors" setting to "Off" in the php.ini of the webserver hosting DokuWiki.')
        return

    if not pages_list:  # namespace is actually a page?
        # list the parent namespace and only keep the requested page
        real_namespace = namespace.rpartition(":")[0]
        pages_list = [page for page in self.pages.list(real_namespace)
                      if page["id"] == namespace]

    # the limit must be a number to be comparable with the counter
    limit = int(self.limit)

    for count, page in enumerate(pages_list, 1):

        # get main information
        id_ = unicode(page['id'])
        updated = unicode(page['mtime'])
        published = self.getOriginalDate(page)

        # manage links: gather the backlinks and the local pages linked from
        # this one, their URL are rewritten in the content
        backlinks = self.pages.backlinks(id_)
        for link in self.pages.links(id_):
            link_type = link["type"]
            if link_type == "extern":
                continue
            if link_type != "local":
                # remote data is not trusted enough for an assert
                log.warning('Ignoring DokuWiki link of unknown type "%s" in page %s' % (link_type, id_))
                continue
            target = link["page"]
            backlinks.append(target[1:] if target.startswith(":") else target)

        content_xhtml = self.processContent(self.pages.html(id_), backlinks, profile_jid)

        # XXX: title is already in content_xhtml and difficult to remove, so leave it.
        #      The raw wiki text is not fetched either: when "content" is passed,
        #      it is displayed in Libervia instead of content_xhtml.

        # build the extra data dictionary
        mb_data = {"id": id_,
                   "published": published,
                   "updated": updated,
                   "author": profile_jid.user,
                   "content_xhtml": content_xhtml,
                   "allow_comments": "true",
                   }

        # find out if the message access is public or restricted
        group = id_.split(":")[0]
        if group and group.lower() not in ("public", "/"):
            mb_data["group"] = group  # roster group must exist

        self.posts_data[id_] = {'blog': mb_data, 'comments': [[]]}

        if count >= limit:
            break

    return (self.posts_data.itervalues(), len(self.posts_data))
189 | |
def getOriginalDate(self, page):
    """Try to parse the date from the message ID, else use "mtime".

    The date can be extracted if the message ID looks like one of:
        - namespace:YYMMDD_short_title
        - namespace:YYYYMMDD_short_title
    @param page (dict): message page, its "id" and "mtime" values are used
    @return unicode: timestamp of the parsed date (midnight UTC), or the
        "mtime" value if no valid date is found in the ID
    """
    id_, default = unicode(page["id"]), unicode(page["mtime"])
    # the candidate stands between the last ":" and the first "_"
    date = id_.split(":")[-1].split("_")[0]
    # strptime is lenient on the number of digits (e.g. "1234" would be read
    # as 2012-03-04), so the expected length is checked explicitly
    if not date.isdigit():
        return default
    if len(date) == 6:
        date_format = "%y%m%d"
    elif len(date) == 8:
        date_format = "%Y%m%d"
    else:
        return default
    try:
        time_struct = time.strptime(date, date_format)
    except ValueError:  # right length but not a valid calendar date
        return default
    return unicode(calendar.timegm(time_struct))
212 | |
213 | |
def processContent(self, text, backlinks, profile_jid):
    """Do text substitutions and file copy.

    Links to the given pages are rewritten to "/blog/<user>/<page>", then
    every relative "src"/"href" of img/a tags is either moved to the media
    repository (when one is set) or made absolute on the DokuWiki URL.

    @param text (unicode): message content
    @param backlinks (list[unicode]): list of backlinks
    @param profile_jid: jid of the importing profile, its "user" attribute
        is used in the rewritten page links
    @return unicode: the content with the links rewritten
    """
    text = text.strip(u"\ufeff")  # this is at the beginning of the file (BOM)

    for backlink in backlinks:
        src = '/doku.php?id=%s"' % backlink
        tgt = '/blog/%s/%s" target="#"' % (profile_jid.user, backlink)
        text = text.replace(src, tgt)

    subs = []
    seen = set()  # a link used several times must only be handled once

    link_pattern = r"""<(img|a)[^>]* (src|href)="([^"]+)"[^>]*>"""
    for tag in re.finditer(link_pattern, text):
        type_, attr, link = tag.groups()
        if (type_, attr) not in (("img", "src"), ("a", "href")):
            # the pattern also catches other tags starting with "a" or
            # unexpected attributes: leave them untouched
            continue
        if re.match(r"^\w*://", link):  # absolute URL to link directly
            continue
        if link in seen:
            continue
        seen.add(link)
        if self.media_repo:
            self.moveMedia(link, subs)
        else:
            subs.append((link, urlparse.urljoin(self.url, link)))

    for url, new_url in subs:
        # the surrounding quotes restrict the substitution to the complete
        # attribute value: a link that is contained in another one (or in
        # an already substituted URL) is then not modified twice
        text = text.replace('"%s"' % url, '"%s"' % new_url)
    return text
243 | |
def moveMedia(self, link, subs):
    """Move a media from the DokuWiki host to the new repository.

    The file is downloaded below self.temp_dir (a thumbnail is also created
    when the link asks for a width) and the couple (link, new URL) is
    appended to subs. Nothing is appended when the link is not a media.

    @param link (unicode): media link
    @param subs (list[tuple]): substitutions list, updated in place
    """
    plugins_prefix = "/lib/plugins/"
    url = urlparse.urljoin(self.url, link)
    user_media = re.match(r"(/lib/exe/\w+\.php\?)(.*)", link)
    thumb_width = None

    if user_media:  # media that has been added by the user
        params = urlparse.parse_qs(urlparse.urlparse(url).query)
        try:
            media = params["media"][0]
        except KeyError:
            log.warning("No media found in fetch URL: %s" % user_media.group(2))
            return
        if re.match(r"^\w*://", media):  # external URL to link directly
            subs.append((link, media))
            return
        # the width is used as directory name and as thumbnail size: only
        # accept a positive number
        width = params.get("w", [None])[0]
        if width is not None and width.isdigit() and int(width) > 0:
            thumb_width = width

        filename = media.replace(":", "/")
        # XXX: avoid "precondition failed" error (only keep the media parameter)
        url = urlparse.urljoin(self.url, "/lib/exe/fetch.php?media=%s" % media)

    elif link.startswith(plugins_prefix):
        # other link added by a plugin or something else
        filename = link[len(plugins_prefix):]
    else:  # fake alert... there's no media (or we don't handle it yet)
        return

    # a leading separator (media ID starting with ":") would make
    # os.path.join ignore the temporary directory, and ".." would leave it
    parts = [part for part in filename.split("/") if part]
    if not parts or ".." in parts:
        log.warning("Ignoring media with an unexpected path: %s" % filename)
        return

    filepath = os.path.join(self.temp_dir, *parts)
    self.downloadMedia(url, filepath)

    if thumb_width:
        parts = ["thumbs", thumb_width] + parts
        thumbnail = os.path.join(self.temp_dir, *parts)
        self.createThumbnail(filepath, thumbnail, thumb_width)

    # URLs always use "/", whatever the local path separator is
    new_url = "%s/%s" % (self.media_repo.rstrip("/"), "/".join(parts))
    subs.append((link, new_url))
289 | |
def downloadMedia(self, source, dest):
    """Copy media to localhost.

    Nothing is done when the target file already exists.

    @param source (unicode): source url
    @param dest (unicode): target path
    """
    if os.path.exists(dest):
        return
    parent = os.path.dirname(dest)
    if not os.path.exists(parent):
        os.makedirs(parent)
    urllib.urlretrieve(source, dest)
    log.debug("DokuWiki media file copied to %s" % dest)
302 | |
def createThumbnail(self, source, dest, width):
    """Create a thumbnail.

    The thumbnail keeps the proportions of the source image. Failures are
    logged and do not raise.

    @param source (unicode): source file path
    @param dest (unicode): destination file path
    @param width (unicode, int): thumbnail's width, in pixels
    """
    try:
        # the width comes from an URL parameter: PIL needs a number
        width = int(width)
    except (TypeError, ValueError):
        width = 0
    if width <= 0:
        log.error("Cannot create DokuWiki media thumbnail %s" % dest)
        return
    thumb_dir = os.path.dirname(dest)
    if thumb_dir and not os.path.exists(thumb_dir):
        os.makedirs(thumb_dir)
    try:
        im = Image.open(source)
        # Image.size is (width, height): the height is scaled with the same
        # ratio as the width, and is at least one pixel
        height = max(1, width * im.size[1] // im.size[0])
        im.thumbnail((width, height))
        im.save(dest)
        log.debug("DokuWiki media thumbnail created: %s" % dest)
    except IOError:
        log.error("Cannot create DokuWiki media thumbnail %s" % dest)
320 | |
321 | |
322 | |
323 class DokuwikiImport(object): | |
324 | |
def __init__(self, host):
    """Keep a reference to the host and register the DokuWiki importer.

    @param host: object giving access to the loaded plugins, the importer
        is registered in its "BLOG_IMPORT" plugin under the name "dokuwiki"
    """
    log.info(_("plugin Dokuwiki Import initialization"))
    self.host = host
    blog_import = host.plugins['BLOG_IMPORT']
    blog_import.register('dokuwiki', self.DkImport, SHORT_DESC, LONG_DESC)
329 | |
def DkImport(self, client, location, options=None):
    """Import from DokuWiki to PubSub

    @param client: client of the importing profile
    @param location (unicode): DokuWiki site URL
    @param options (dict, None): DokuWiki import parameters
        - user (unicode): DokuWiki admin user
        - passwd (unicode): DokuWiki admin password
        - namespace (unicode): DokuWiki namespace to import
        - media_repo (unicode): New remote media repository
        - limit (unicode, int): maximal number of posts to import
        The dictionary is updated in place (host and images upload options).
    @return (Deferred): fires with the result of Importer.process
    @raise exceptions.DataError: "user" or "passwd" is missing,
        or "limit" is not an integer
    """
    if options is None:
        # None is the documented default: the required parameters are then
        # reported as missing instead of failing on the item assignment
        options = {}
    options[OPT_HOST] = location
    try:
        user = options["user"]
    except KeyError:
        raise exceptions.DataError('parameter "user" is required')
    try:
        passwd = options["passwd"]
    except KeyError:
        raise exceptions.DataError('parameter "passwd" is required')
    media_repo = options.get("media_repo", DEFAULT_MEDIA_REPO)
    # NOTE(review): the option is forced whatever media_repo is, which is what
    # the TODO below implies - confirm against the original indentation
    if options.get(OPT_UPLOAD_IMAGES, None):
        # TODO: when media_repo is not defined or empty, we should not force this option
        options[OPT_UPLOAD_IMAGES] = False
        msg = _(u"Option --no-images-upload will be used by force.")
        self.host.actionNew({'xmlui': xml_tools.note(msg).toXml()}, profile=client.profile)
    namespace = options.get("namespace", DEFAULT_NAMESPACE)
    try:
        # option values may be text: the importer needs a number to count posts
        limit = int(options.get("limit", DEFAULT_LIMIT))
    except (TypeError, ValueError):
        raise exceptions.DataError('parameter "limit" must be an integer')
    dk_importer = Importer(location, user, passwd, media_repo, limit)
    self.host.actionNew({'xmlui': xml_tools.note(dk_importer.info_msg).toXml()}, profile=client.profile)
    # the import is blocking (network requests): run it in a thread
    return threads.deferToThread(dk_importer.process, client, namespace)