Mercurial > libervia-backend
comparison sat/plugins/plugin_blog_import_dotclear.py @ 2562:26edcf3a30eb
core, setup: huge cleaning:
- moved directories from src and frontends/src to sat and sat_frontends, which is the recommended naming convention
- move twisted directory to root
- removed all hacks from setup.py, and added missing dependencies, it is now clean
- use https URL for website in setup.py
- removed "Environment :: X11 Applications :: GTK", as wix is deprecated and removed
- renamed sat.sh to sat and fixed its installation
- added python_requires to specify Python version needed
- replaced glib2reactor, which uses deprecated code, with gtk3reactor
sat can now be installed directly from virtualenv without using --system-site-packages anymore \o/
author | Goffi <goffi@goffi.org> |
---|---|
date | Mon, 02 Apr 2018 19:44:50 +0200 |
parents | src/plugins/plugin_blog_import_dotclear.py@0046283a285d |
children | 56f94936df1e |
comparison
equal
deleted
inserted
replaced
2561:bd30dc3ffe5a | 2562:26edcf3a30eb |
---|---|
1 #!/usr/bin/env python2 | |
2 # -*- coding: utf-8 -*- | |
3 | |
4 # SàT plugin for import external blogs | |
5 # Copyright (C) 2009-2018 Jérôme Poisson (goffi@goffi.org) | |
6 | |
7 # This program is free software: you can redistribute it and/or modify | |
8 # it under the terms of the GNU Affero General Public License as published by | |
9 # the Free Software Foundation, either version 3 of the License, or | |
10 # (at your option) any later version. | |
11 | |
12 # This program is distributed in the hope that it will be useful, | |
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of | |
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
15 # GNU Affero General Public License for more details. | |
16 | |
17 # You should have received a copy of the GNU Affero General Public License | |
18 # along with this program. If not, see <http://www.gnu.org/licenses/>. | |
19 | |
20 from sat.core.i18n import _, D_ | |
21 from sat.core.constants import Const as C | |
22 from sat.core.log import getLogger | |
23 log = getLogger(__name__) | |
24 from sat.core import exceptions | |
25 from sat.tools.common import data_format | |
26 from twisted.internet import threads | |
27 from collections import OrderedDict | |
28 import itertools | |
29 import time | |
30 import cgi | |
31 import os.path | |
32 | |
33 | |
# Plugin metadata consumed by the SàT plugin loader (keys from Const).
PLUGIN_INFO = {
    C.PI_NAME: "Dotclear import",
    C.PI_IMPORT_NAME: "IMPORT_DOTCLEAR",
    C.PI_TYPE: C.PLUG_TYPE_BLOG,
    C.PI_DEPENDENCIES: ["BLOG_IMPORT"],  # registers itself with the generic blog importer
    C.PI_MAIN: "DotclearImport",
    C.PI_HANDLER: "no",
    C.PI_DESCRIPTION: _("""Blog importer for Dotclear blog engine.""")
}

# Short / long user-facing descriptions shown by the BLOG_IMPORT plugin
# (D_-marked for deferred translation; text must stay as-is for .po catalogs).
SHORT_DESC = D_(u"import posts from Dotclear blog engine")

LONG_DESC = D_(u"""This importer handle Dotclear blog engine.

To use it, you'll need to export your blog to a flat file.
You must go in your admin interface and select Plugins/Maintenance then Backup.
Export only one blog if you have many, i.e. select "Download database of current blog"
Depending on your configuration, your may need to use Import/Export plugin and export as a flat file.

location: you must use the absolute path to your backup for the location parameter
""")
# Prefix used to build stable, globally unique post ids (see DotclearParser.getPostId).
POST_ID_PREFIX = u"sat_dc_"
# Section names expected in a Dotclear flat backup; anything else triggers a warning.
KNOWN_DATA_TYPES = ('link', 'setting', 'post', 'meta', 'media', 'post_media', 'comment', 'captcha')
# Backslash-escape sequences used inside quoted fields of the flat backup.
ESCAPE_MAP = {
    'r': u'\r',
    'n': u'\n',
    '"': u'"',
    '\\': u'\\',
}
63 | |
64 | |
class DotclearParser(object):
    """Parser for Dotclear flat backup files.

    Builds an in-memory representation of all posts (with their comments and
    tags) from a Dotclear "flat file" export, suitable for the BLOG_IMPORT
    plugin.
    """
    # XXX: we have to parse all file to build data
    # this can be ressource intensive on huge blogs

    def __init__(self):
        # post_id (unicode) -> {'blog': mb_data, 'comments': [[...]], 'url': ...}
        # ordered so posts are imported in file order
        self.posts_data = OrderedDict()
        # post_id -> set of tag names, merged into posts_data in metaFinishedHandler
        self.tags = {}

    def getPostId(self, post):
        """Return a unique and constant post id

        @param post(dict): parsed post data
        @return (unicode): post unique item id
        """
        # combines blog, user, numeric id and URL so the id survives re-imports
        return u"{}_{}_{}_{}:{}".format(POST_ID_PREFIX, post['blog_id'], post['user_id'], post['post_id'], post['post_url'])

    def getCommentId(self, comment):
        """Return a unique and constant comment id

        @param comment(dict): parsed comment
        @return (unicode): comment unique comment id
        """
        # relies on the parent post having been parsed already
        # ([post] sections come before [comment] sections in the backup)
        post_id = comment['post_id']
        parent_item_id = self.posts_data[post_id]['blog']['id']
        return u"{}_comment_{}".format(parent_item_id, comment['comment_id'])

    def getTime(self, data, key):
        """Parse time as given by dotclear, with timezone handling

        @param data(dict): dotclear data (post or comment)
        @param key(unicode): key to get (e.g. "post_creadt")
        @return (float): Unix time
        """
        # NOTE(review): despite the docstring, mktime interprets the timestamp
        # in the *local* timezone — no explicit timezone handling is done here.
        return time.mktime(time.strptime(data[key], "%Y-%m-%d %H:%M:%S"))

    def readFields(self, fields_data):
        """Generator yielding the unquoted, unescaped fields of one data line.

        Fields are double-quoted, comma separated, with backslash escapes
        (see ESCAPE_MAP).

        @param fields_data(unicode): one raw data line from the backup
        @raise exceptions.ParsingError: malformed line (bad quote, missing
            separator, or truncated escape sequence)
        """
        buf = []  # characters of the field currently being read
        idx = 0   # cursor into fields_data, advanced manually
        while True:
            # each field must start with an opening double quote
            if fields_data[idx] != '"':
                raise exceptions.ParsingError
            while True:
                idx += 1
                try:
                    char = fields_data[idx]
                except IndexError:
                    # line ended inside a quoted field
                    raise exceptions.ParsingError("Data was expected")
                if char == '"':
                    # we have reached the end of this field,
                    # we try to parse a new one
                    yield u''.join(buf)
                    buf = []
                    idx += 1
                    try:
                        separator = fields_data[idx]
                    except IndexError:
                        # end of line: no more fields
                        return
                    if separator != u',':
                        # NOTE(review): typo in message ("expeceted") — kept
                        # byte-identical here as it is a runtime string
                        raise exceptions.ParsingError("Field separator was expeceted")
                    idx += 1
                    break  # we have a new field
                elif char == u'\\':
                    # escape sequence: translate via ESCAPE_MAP, fall back to
                    # the literal char (with a warning) for unknown escapes
                    idx += 1
                    try:
                        char = ESCAPE_MAP[fields_data[idx]]
                    except IndexError:
                        raise exceptions.ParsingError("Escaped char was expected")
                    except KeyError:
                        char = fields_data[idx]
                        log.warning(u"Unknown key to escape: {}".format(char))
                buf.append(char)

    def parseFields(self, headers, data):
        """Map one data line onto its section headers.

        @param headers(list[unicode]): column names from the section header
        @param data(unicode): one raw data line
        @return (dict): header -> field value
        """
        # NOTE(review): itertools.izip is Python 2 only (zip in Python 3)
        return dict(itertools.izip(headers, self.readFields(data)))

    def postHandler(self, headers, data, index):
        """Handle one line of the [post ...] section: register a new post."""
        post = self.parseFields(headers, data)
        log.debug(u'({}) post found: {}'.format(index, post['post_title']))
        mb_data = {'id': self.getPostId(post),
                   'published': self.getTime(post, 'post_creadt'),
                   'updated': self.getTime(post, 'post_upddt'),
                   'author': post['user_id'],  # their full user info is not in the archive
                   # TODO: option to specify user info
                   # content is the concatenation of the body and the excerpt
                   'content_xhtml': u"{}{}".format(post['post_content_xhtml'], post['post_excerpt_xhtml']),
                   'title': post['post_title'],
                   'allow_comments': C.boolConst(bool(int(post['post_open_comment']))),
                   }
        # 'comments' is a list of one list, matching the BLOG_IMPORT structure
        self.posts_data[post['post_id']] = {'blog': mb_data, 'comments':[[]], 'url': u'/post/{}'.format(post['post_url'])}

    def metaHandler(self, headers, data, index):
        """Handle one line of the [meta ...] section: collect post tags."""
        meta = self.parseFields(headers, data)
        if meta['meta_type'] == 'tag':
            tags = self.tags.setdefault(meta['post_id'], set())
            tags.add(meta['meta_id'])

    def metaFinishedHandler(self):
        """Flush collected tags into their posts once the meta section ends."""
        # NOTE(review): dict.iteritems is Python 2 only
        for post_id, tags in self.tags.iteritems():
            data_format.iter2dict('tag', tags, self.posts_data[post_id]['blog'])
        del self.tags

    def commentHandler(self, headers, data, index):
        """Handle one line of the [comment ...] section: attach a comment to its post."""
        comment = self.parseFields(headers, data)
        if comment['comment_site']:
            # we don't use atom:uri because it's used for jid in XMPP
            content = u'{}\n<hr>\n<a href="{}">author website</a>'.format(
                comment['comment_content'],
                cgi.escape(comment['comment_site']).replace('"', u'%22'))
        else:
            content = comment['comment_content']
        mb_data = {'id': self.getCommentId(comment),
                   'published': self.getTime(comment, 'comment_dt'),
                   'updated': self.getTime(comment, 'comment_upddt'),
                   'author': comment['comment_author'],
                   # we don't keep email addresses to avoid the author to be spammed
                   # (they would be available publicly else)
                   # 'author_email': comment['comment_email'],
                   'content_xhtml': content,
                   }
        self.posts_data[comment['post_id']]['comments'][0].append(
            {'blog': mb_data, 'comments': [[]]})

    def parse(self, db_path):
        """Parse a whole Dotclear flat backup file.

        Sections look like "[type header1,header2,...]" followed by one data
        line per record; a blank line ends the section. For each known type,
        lines are dispatched to <type>Handler and, if present,
        <type>FinishedHandler is called at the end of the section.

        @param db_path(str): path to the flat backup file
        @return (tuple): (iterator over posts data, number of posts)
        """
        with open(db_path) as f:
            # first line is a signature like "///DOTCLEAR|2.x|..."
            signature = f.readline().decode('utf-8')
            try:
                version = signature.split('|')[1]
            except IndexError:
                version = None
            log.debug(u"Dotclear version: {}".format(version))
            data_type = None     # current section type, None between sections
            data_headers = None  # column names of the current section
            index = None         # record counter within the current section
            while True:
                buf = f.readline().decode('utf-8')
                if not buf:
                    # EOF (a blank line in the file is "\n", still truthy)
                    break
                if buf.startswith('['):
                    # section header: "[type header1,header2,...]"
                    header = buf.split(' ', 1)
                    data_type = header[0][1:]
                    if data_type not in KNOWN_DATA_TYPES:
                        # NOTE(review): typo "unkown" kept (runtime string)
                        log.warning(u"unkown data type: {}".format(data_type))
                    index = 0
                    try:
                        data_headers = header[1].split(',')
                        # we need to remove the ']' from the last header
                        last_header = data_headers[-1]
                        data_headers[-1] = last_header[:last_header.rfind(']')]
                    except IndexError:
                        # NOTE(review): stray ')' in message kept (runtime string)
                        log.warning(u"Can't read data)")
                else:
                    if data_type is None:
                        # data line outside any section: ignore
                        continue
                    buf = buf.strip()
                    if not buf and data_type in KNOWN_DATA_TYPES:
                        # blank line ends the section; give the optional
                        # <type>FinishedHandler a chance to post-process
                        try:
                            finished_handler = getattr(self, '{}FinishedHandler'.format(data_type))
                        except AttributeError:
                            pass
                        else:
                            finished_handler()
                        log.debug(u"{} data finished".format(data_type))
                        data_type = None
                        continue
                    assert data_type
                    # dispatch the record; sections without a handler
                    # (e.g. 'link', 'setting') are silently skipped
                    try:
                        fields_handler = getattr(self, '{}Handler'.format(data_type))
                    except AttributeError:
                        pass
                    else:
                        fields_handler(data_headers, buf, index)
                    index += 1
        # NOTE(review): dict.itervalues is Python 2 only
        return (self.posts_data.itervalues(), len(self.posts_data))
237 | |
238 | |
class DotclearImport(object):
    """Entry point of the plugin: registers the Dotclear importer.

    Registration is done with the generic BLOG_IMPORT plugin, which will call
    DcImport when the user selects the "dotclear" importer.
    """

    def __init__(self, host):
        log.info(_("plugin Dotclear Import initialization"))
        self.host = host
        blog_import = host.plugins['BLOG_IMPORT']
        blog_import.register('dotclear', self.DcImport, SHORT_DESC, LONG_DESC)

    def DcImport(self, client, location, options=None):
        """Import a Dotclear flat backup.

        @param client: SàT client instance
        @param location(unicode): absolute path to the flat backup file
        @param options(dict, None): import options (unused here)
        @return (Deferred): fires with (posts iterator, posts count)
        @raise exceptions.DataError: location is not an absolute path
        """
        if os.path.isabs(location):
            # parsing may be slow on big blogs, so run it in a thread
            parser = DotclearParser()
            return threads.deferToThread(parser.parse, location)
        raise exceptions.DataError(u"An absolute path to backup data need to be given as location")