Mercurial > libervia-backend
comparison sat/plugins/plugin_blog_import_dokuwiki.py @ 2562:26edcf3a30eb
core, setup: huge cleaning:
- moved directories from src and frontends/src to sat and sat_frontends, which is the recommended naming convention
- move twisted directory to root
- removed all hacks from setup.py, and added missing dependencies, it is now clean
- use https URL for website in setup.py
- removed "Environment :: X11 Applications :: GTK", as wix is deprecated and removed
- renamed sat.sh to sat and fixed its installation
- added python_requires to specify Python version needed
- replaced glib2reactor which use deprecated code by gtk3reactor
sat can now be installed directly from virtualenv without using --system-site-packages anymore \o/
author | Goffi <goffi@goffi.org> |
---|---|
date | Mon, 02 Apr 2018 19:44:50 +0200 |
parents | src/plugins/plugin_blog_import_dokuwiki.py@0046283a285d |
children | 56f94936df1e |
comparison
equal
deleted
inserted
replaced
2561:bd30dc3ffe5a | 2562:26edcf3a30eb |
---|---|
1 #!/usr/bin/env python2 | |
2 # -*- coding: utf-8 -*- | |
3 | |
4 # SàT plugin to import dokuwiki blogs | |
5 # Copyright (C) 2009-2018 Jérôme Poisson (goffi@goffi.org) | |
6 # Copyright (C) 2013-2016 Adrien Cossa (souliane@mailoo.org) | |
7 | |
8 # This program is free software: you can redistribute it and/or modify | |
9 # it under the terms of the GNU Affero General Public License as published by | |
10 # the Free Software Foundation, either version 3 of the License, or | |
11 # (at your option) any later version. | |
12 | |
13 # This program is distributed in the hope that it will be useful, | |
14 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
15 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
16 # GNU Affero General Public License for more details. | |
17 | |
18 # You should have received a copy of the GNU Affero General Public License | |
19 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
20 | |
21 from sat.core.i18n import _, D_ | |
22 from sat.core.constants import Const as C | |
23 from sat.core.log import getLogger | |
24 log = getLogger(__name__) | |
25 from sat.core import exceptions | |
26 from sat.tools import xml_tools | |
27 from twisted.internet import threads | |
28 from collections import OrderedDict | |
29 import calendar | |
30 import urllib | |
31 import urlparse | |
32 import tempfile | |
33 import re | |
34 import time | |
35 import os.path | |
36 try: | |
37 from dokuwiki import DokuWiki, DokuWikiError # this is a new dependency | |
38 except ImportError: | |
39 raise exceptions.MissingModule(u'Missing module dokuwiki, please install it with "pip install dokuwiki"') | |
40 try: | |
41 from PIL import Image # this is already needed by plugin XEP-0054 | |
42 except: | |
43 raise exceptions.MissingModule(u"Missing module pillow, please download/install it from https://python-pillow.github.io") | |
44 | |
# plugin registration data, read by the SàT core plugin loader
PLUGIN_INFO = {
    C.PI_NAME: "Dokuwiki import",
    C.PI_IMPORT_NAME: "IMPORT_DOKUWIKI",
    C.PI_TYPE: C.PLUG_TYPE_BLOG,
    C.PI_DEPENDENCIES: ["BLOG_IMPORT"],
    C.PI_MAIN: "DokuwikiImport",
    C.PI_HANDLER: "no",
    C.PI_DESCRIPTION: _("""Blog importer for Dokuwiki blog engine.""")
}

# one-line description passed to BLOG_IMPORT.register (shown when listing importers)
SHORT_DESC = D_(u"import posts from Dokuwiki blog engine")

# detailed help (recognized options, usage example) passed to BLOG_IMPORT.register
LONG_DESC = D_(u"""This importer handle Dokuwiki blog engine.

To use it, you need an admin access to a running Dokuwiki website
(local or on the Internet). The importer retrieves the data using
the XMLRPC Dokuwiki API.

You can specify a namespace (that could be a namespace directory
or a single post) or leave it empty to use the root namespace "/"
and import all the posts.

You can specify a new media repository to modify the internal
media links and make them point to the URL of your choice, but
note that the upload is not done automatically: a temporary
directory will be created on your local drive and you will
need to upload it yourself to your repository via SSH or FTP.

Following options are recognized:

location: DokuWiki site URL
user: DokuWiki admin user
passwd: DokuWiki admin password
namespace: DokuWiki namespace to import (default: root namespace "/")
media_repo: URL to the new remote media repository (default: none)
limit: maximal number of posts to import (default: 100)

Example of usage (with jp frontend):

jp import dokuwiki -p dave --pwd xxxxxx --connect
http://127.0.1.1 -o user souliane -o passwd qwertz
-o namespace public:2015:10
-o media_repo http://media.diekulturvermittlung.at

This retrieves the 100 last blog posts from http://127.0.1.1 that
are inside the namespace "public:2015:10" using the Dokuwiki user
"souliane", and it imports them to sat profile dave's microblog node.
Internal Dokuwiki media that were hosted on http://127.0.1.1 are now
pointing to http://media.diekulturvermittlung.at.
""")
# empty media_repo means media links are left pointing to the DokuWiki host
DEFAULT_MEDIA_REPO = ""
# root namespace: import every post of the wiki
DEFAULT_NAMESPACE = "/"
DEFAULT_LIMIT = 100 # you might get a DBUS timeout (no reply) if it lasts too long
98 | |
99 | |
class Importer(DokuWiki):
    """DokuWiki XMLRPC client retrieving blog posts and media files for import."""

    def __init__(self, url, user, passwd, media_repo=DEFAULT_MEDIA_REPO, limit=DEFAULT_LIMIT):
        """

        @param url (unicode): DokuWiki site URL
        @param user (unicode): DokuWiki admin user
        @param passwd (unicode): DokuWiki admin password
        @param media_repo (unicode): New remote media repository
        @param limit (int): maximal number of posts to import
        """
        DokuWiki.__init__(self, url, user, passwd)
        self.url = url
        self.media_repo = media_repo
        # media files are downloaded locally only when a new repository is requested
        self.temp_dir = tempfile.mkdtemp() if self.media_repo else None
        self.limit = limit
        self.posts_data = OrderedDict()

    def getPostId(self, post):
        """Return a unique and constant post id

        @param post(dict): parsed post data
        @return (unicode): post unique item id
        """
        return unicode(post['id'])

    def getPostUpdated(self, post):
        """Return the update date.

        @param post(dict): parsed post data
        @return (unicode): update date
        """
        return unicode(post['mtime'])

    def getPostPublished(self, post):
        """Try to parse the date from the message ID, else use "mtime".

        The date can be extracted if the message ID looks like one of:
            - namespace:YYMMDD_short_title
            - namespace:YYYYMMDD_short_title
        @param post (dict): parsed post data
        @return (unicode): publication date (Unix timestamp as text)
        """
        id_, default = unicode(post["id"]), unicode(post["mtime"])
        # NOTE: str.split always returns at least one element, so this cannot
        # raise; the previous "except KeyError" guard here was dead code.
        date = id_.split(":")[-1].split("_")[0]
        # try short year format first, then full year, else fall back to mtime
        for date_format in ("%y%m%d", "%Y%m%d"):
            try:
                time_struct = time.strptime(date, date_format)
            except ValueError:
                continue
            return unicode(calendar.timegm(time_struct))
        return default

    def processPost(self, post, profile_jid):
        """Process a single page.

        @param post (dict): parsed post data
        @param profile_jid (jid.JID): profile of the importing user
        """
        # get main information
        id_ = self.getPostId(post)
        updated = self.getPostUpdated(post)
        published = self.getPostPublished(post)

        # manage links: gather local pages linking to (or linked from) this one,
        # so that processContent can rewrite them to blog URLs
        backlinks = self.pages.backlinks(id_)
        for link in self.pages.links(id_):
            if link["type"] != "extern":
                assert link["type"] == "local"
                page = link["page"]
                backlinks.append(page[1:] if page.startswith(":") else page)

        self.pages.get(id_)
        content_xhtml = self.processContent(self.pages.html(id_), backlinks, profile_jid)

        # XXX: title is already in content_xhtml and difficult to remove, so leave it
        # title = content.split("\n")[0].strip(u"\ufeff= ")

        # build the microblog data dictionary
        mb_data = {"id": id_,
                   "published": published,
                   "updated": updated,
                   "author": profile_jid.user,
                   # "content": content,  # when passed, it is displayed in Libervia instead of content_xhtml
                   "content_xhtml": content_xhtml,
                   # "title": title,
                   "allow_comments": "true",
                   }

        # find out if the message access is public or restricted
        namespace = id_.split(":")[0]
        if namespace and namespace.lower() not in ("public", "/"):
            mb_data["group"] = namespace  # roster group must exist

        self.posts_data[id_] = {'blog': mb_data, 'comments': [[]]}

    def process(self, client, namespace=DEFAULT_NAMESPACE):
        """Process a namespace or a single page.

        @param client: SàT client of the importing profile
        @param namespace (unicode): DokuWiki namespace (or page) to import
        @return (tuple(iterator, int)): iterator on posts data and posts count,
            or None if the pages could not be listed
        """
        profile_jid = client.jid
        log.info("Importing data from DokuWiki %s" % self.version)
        try:
            pages_list = self.pages.list(namespace)
        except DokuWikiError:
            log.warning('Could not list Dokuwiki pages: please turn the "display_errors" setting to "Off" in the php.ini of the webserver hosting DokuWiki.')
            return

        if not pages_list:  # namespace is actually a page?
            # list the parent namespace and keep only the requested page
            names = namespace.split(":")
            real_namespace = ":".join(names[0:-1])
            pages_list = self.pages.list(real_namespace)
            pages_list = [page for page in pages_list if page["id"] == namespace]
            namespace = real_namespace

        count = 0
        for page in pages_list:
            self.processPost(page, profile_jid)
            count += 1
            if count >= self.limit:
                break

        return (self.posts_data.itervalues(), len(self.posts_data))

    def processContent(self, text, backlinks, profile_jid):
        """Do text substitutions and file copy.

        @param text (unicode): message content
        @param backlinks (list[unicode]): list of backlinks
        @param profile_jid (jid.JID): profile of the importing user
        @return (unicode): processed content
        """
        text = text.strip(u"\ufeff")  # this is at the beginning of the file (BOM)

        # rewrite internal wiki links to blog links
        for backlink in backlinks:
            src = '/doku.php?id=%s"' % backlink
            tgt = '/blog/%s/%s" target="#"' % (profile_jid.user, backlink)
            text = text.replace(src, tgt)

        subs = {}

        link_pattern = r"""<(img|a)[^>]* (src|href)="([^"]+)"[^>]*>"""
        for tag in re.finditer(link_pattern, text):
            type_, attr, link = tag.group(1), tag.group(2), tag.group(3)
            assert (type_ == "img" and attr == "src") or (type_ == "a" and attr == "href")
            if re.match(r"^\w*://", link):  # absolute URL to link directly
                continue
            if self.media_repo:
                self.moveMedia(link, subs)
            elif link not in subs:
                # no media repository: just make the relative link absolute
                subs[link] = urlparse.urljoin(self.url, link)

        for url, new_url in subs.iteritems():
            text = text.replace(url, new_url)
        return text

    def moveMedia(self, link, subs):
        """Move a media from the DokuWiki host to the new repository.

        This also updates the hyperlinks to internal media files.
        @param link (unicode): media link
        @param subs (dict): substitutions data, updated in place
        """
        url = urlparse.urljoin(self.url, link)
        # FIX: the dot before "php" was previously unescaped in the pattern
        user_media = re.match(r"(/lib/exe/\w+\.php\?)(.*)", link)
        thumb_width = None

        if user_media:  # media that has been added by the user
            params = urlparse.parse_qs(urlparse.urlparse(url).query)
            try:
                media = params["media"][0]
            except KeyError:
                log.warning("No media found in fetch URL: %s" % user_media.group(2))
                return
            if re.match(r"^\w*://", media):  # external URL to link directly
                subs[link] = media
                return
            try:  # create thumbnail
                thumb_width = params["w"][0]
            except KeyError:
                pass

            filename = media.replace(":", "/")
            # XXX: avoid "precondition failed" error (only keep the media parameter)
            url = urlparse.urljoin(self.url, "/lib/exe/fetch.php?media=%s" % media)

        elif link.startswith("/lib/plugins/"):
            # other link added by a plugin or something else
            filename = link[13:]
        else:  # fake alert... there's no media (or we don't handle it yet)
            return

        filepath = os.path.join(self.temp_dir, filename)
        self.downloadMedia(url, filepath)

        if thumb_width:
            filename = os.path.join("thumbs", thumb_width, filename)
            thumbnail = os.path.join(self.temp_dir, filename)
            self.createThumbnail(filepath, thumbnail, thumb_width)

        new_url = os.path.join(self.media_repo, filename)
        subs[link] = new_url

    def downloadMedia(self, source, dest):
        """Copy media to localhost.

        @param source (unicode): source url
        @param dest (unicode): target path
        """
        dirname = os.path.dirname(dest)
        if not os.path.exists(dest):
            if not os.path.exists(dirname):
                os.makedirs(dirname)
            urllib.urlretrieve(source, dest)
            log.debug("DokuWiki media file copied to %s" % dest)

    def createThumbnail(self, source, dest, width):
        """Create a thumbnail.

        @param source (unicode): source file path
        @param dest (unicode): destination file path
        @param width (unicode): thumbnail's width
        """
        thumb_dir = os.path.dirname(dest)
        if not os.path.exists(thumb_dir):
            os.makedirs(thumb_dir)
        try:
            im = Image.open(source)
            # FIX: Image.thumbnail expects an integer size tuple; width comes
            # from an URL query parameter as text, so cast it once here.
            # NOTE(review): the height formula uses size[0]/size[1] which looks
            # like an inverted aspect ratio, but thumbnail() preserves the
            # aspect ratio anyway by fitting inside the bounding box.
            width = int(width)
            im.thumbnail((width, width * im.size[0] // im.size[1]))
            im.save(dest)
            log.debug("DokuWiki media thumbnail created: %s" % dest)
        except IOError:
            log.error("Cannot create DokuWiki media thumbnail %s" % dest)
335 | |
336 | |
337 | |
class DokuwikiImport(object):
    """Plugin entry point: registers the DokuWiki importer to BLOG_IMPORT."""

    def __init__(self, host):
        """
        @param host: SàT core instance
        """
        log.info(_("plugin Dokuwiki Import initialization"))
        self.host = host
        self._blog_import = host.plugins['BLOG_IMPORT']
        self._blog_import.register('dokuwiki', self.DkImport, SHORT_DESC, LONG_DESC)

    def DkImport(self, client, location, options=None):
        """Import from DokuWiki to PubSub

        @param client: SàT client of the importing profile
        @param location (unicode): DokuWiki site URL
        @param options (dict, None): DokuWiki import parameters
            - user (unicode): DokuWiki admin user
            - passwd (unicode): DokuWiki admin password
            - namespace (unicode): DokuWiki namespace to import
            - media_repo (unicode): New remote media repository
        @return (Deferred): fires with the importer result
        @raise exceptions.DataError: if "user" or "passwd" is missing
        """
        # FIX: options has a None default but was written to unconditionally,
        # raising TypeError when no options dict was passed
        if options is None:
            options = {}
        options[self._blog_import.OPT_HOST] = location
        try:
            user = options["user"]
        except KeyError:
            raise exceptions.DataError('parameter "user" is required')
        try:
            passwd = options["passwd"]
        except KeyError:
            raise exceptions.DataError('parameter "passwd" is required')

        opt_upload_images = options.get(self._blog_import.OPT_UPLOAD_IMAGES, None)
        try:
            media_repo = options["media_repo"]
            if opt_upload_images:
                options[self._blog_import.OPT_UPLOAD_IMAGES] = False  # force using --no-images-upload
            info_msg = _("DokuWiki media files will be *downloaded* to {temp_dir} - to finish the import you have to upload them *manually* to {media_repo}")
        except KeyError:
            media_repo = DEFAULT_MEDIA_REPO
            if opt_upload_images:
                info_msg = _("DokuWiki media files will be *uploaded* to the XMPP server. Hyperlinks to these media may not been updated though.")
            else:
                info_msg = _("DokuWiki media files will *stay* on {location} - some of them may be protected by DokuWiki ACL and will not be accessible.")

        try:
            namespace = options["namespace"]
        except KeyError:
            namespace = DEFAULT_NAMESPACE
        try:
            limit = options["limit"]
        except KeyError:
            limit = DEFAULT_LIMIT

        dk_importer = Importer(location, user, passwd, media_repo, limit)
        info_msg = info_msg.format(temp_dir=dk_importer.temp_dir, media_repo=media_repo, location=location)
        # notify the user about what will happen with the media files
        self.host.actionNew({'xmlui': xml_tools.note(info_msg).toXml()}, profile=client.profile)
        # the XMLRPC calls are blocking, so run the import in a thread
        d = threads.deferToThread(dk_importer.process, client, namespace)
        return d