comparison sat/plugins/plugin_blog_import_dotclear.py @ 2562:26edcf3a30eb

core, setup: huge cleaning: - moved directories from src and frontends/src to sat and sat_frontends, which is the recommanded naming convention - move twisted directory to root - removed all hacks from setup.py, and added missing dependencies, it is now clean - use https URL for website in setup.py - removed "Environment :: X11 Applications :: GTK", as wix is deprecated and removed - renamed sat.sh to sat and fixed its installation - added python_requires to specify Python version needed - replaced glib2reactor which use deprecated code by gtk3reactor sat can now be installed directly from virtualenv without using --system-site-packages anymore \o/
author Goffi <goffi@goffi.org>
date Mon, 02 Apr 2018 19:44:50 +0200
parents src/plugins/plugin_blog_import_dotclear.py@0046283a285d
children 56f94936df1e
comparison
equal deleted inserted replaced
2561:bd30dc3ffe5a 2562:26edcf3a30eb
1 #!/usr/bin/env python2
2 # -*- coding: utf-8 -*-
3
4 # SàT plugin for import external blogs
5 # Copyright (C) 2009-2018 Jérôme Poisson (goffi@goffi.org)
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
16
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
20 from sat.core.i18n import _, D_
21 from sat.core.constants import Const as C
22 from sat.core.log import getLogger
23 log = getLogger(__name__)
24 from sat.core import exceptions
25 from sat.tools.common import data_format
26 from twisted.internet import threads
27 from collections import OrderedDict
28 import itertools
29 import time
30 import cgi
31 import os.path
32
33
PLUGIN_INFO = {
    C.PI_NAME: "Dotclear import",
    C.PI_IMPORT_NAME: "IMPORT_DOTCLEAR",
    C.PI_TYPE: C.PLUG_TYPE_BLOG,
    # BLOG_IMPORT provides the generic import machinery; this plugin only
    # registers a Dotclear-specific parser with it (cf. DotclearImport.__init__)
    C.PI_DEPENDENCIES: ["BLOG_IMPORT"],
    C.PI_MAIN: "DotclearImport",
    C.PI_HANDLER: "no",
    C.PI_DESCRIPTION: _("""Blog importer for Dotclear blog engine.""")
}

# user-visible importer descriptions (shown when listing available importers)
SHORT_DESC = D_(u"import posts from Dotclear blog engine")

# NOTE(review): minor grammar slips below ("handle", "your may") are left as-is:
# the string is translation-marked (D_) and changing it would orphan existing
# translations
LONG_DESC = D_(u"""This importer handle Dotclear blog engine.

To use it, you'll need to export your blog to a flat file.
You must go in your admin interface and select Plugins/Maintenance then Backup.
Export only one blog if you have many, i.e. select "Download database of current blog"
Depending on your configuration, your may need to use Import/Export plugin and export as a flat file.

location: you must use the absolute path to your backup for the location parameter
""")
# prefix used to build unique and constant item ids for imported posts
POST_ID_PREFIX = u"sat_dc_"
# data section names recognized in a Dotclear flat export file
KNOWN_DATA_TYPES = ('link', 'setting', 'post', 'meta', 'media', 'post_media', 'comment', 'captcha')
# backslash escape sequences valid inside quoted field values
# (cf. DotclearParser.readFields)
ESCAPE_MAP = {
    'r': u'\r',
    'n': u'\n',
    '"': u'"',
    '\\': u'\\',
}
63
64
class DotclearParser(object):
    """Parser for Dotclear flat export (backup) files

    The file is read sequentially; posts, tags and comments are accumulated
    in memory and linked together as their sections finish.
    """
    # XXX: we have to parse all file to build data
    #      this can be resource intensive on huge blogs

    def __init__(self):
        # post_id => {'blog': <mb_data>, 'comments': <comments list>, 'url': <post URL>}
        self.posts_data = OrderedDict()
        # post_id => set of tags, merged into posts_data in metaFinishedHandler
        self.tags = {}

    def getPostId(self, post):
        """Return a unique and constant post id

        @param post(dict): parsed post data
        @return (unicode): post unique item id
        """
        return u"{}_{}_{}_{}:{}".format(POST_ID_PREFIX, post['blog_id'], post['user_id'], post['post_id'], post['post_url'])

    def getCommentId(self, comment):
        """Return a unique and constant comment id

        @param comment(dict): parsed comment
        @return (unicode): comment unique comment id
        """
        post_id = comment['post_id']
        parent_item_id = self.posts_data[post_id]['blog']['id']
        return u"{}_comment_{}".format(parent_item_id, comment['comment_id'])

    def getTime(self, data, key):
        """Parse time as given by dotclear, with timezone handling

        @param data(dict): dotclear data (post or comment)
        @param key(unicode): key to get (e.g. "post_creadt")
        @return (float): Unix time
        """
        return time.mktime(time.strptime(data[key], "%Y-%m-%d %H:%M:%S"))

    def readFields(self, fields_data):
        """Parse one data line of a flat export

        Fields are a comma-separated list of double-quoted values, with
        backslash escaping inside the values (cf. ESCAPE_MAP).

        @param fields_data(unicode): raw data line
        @return (generator): generator yielding one unicode value per field
        @raise exceptions.ParsingError: the line is malformed
        """
        buf = []
        idx = 0
        while True:
            if fields_data[idx] != '"':
                raise exceptions.ParsingError
            while True:
                idx += 1
                try:
                    char = fields_data[idx]
                except IndexError:
                    raise exceptions.ParsingError("Data was expected")
                if char == '"':
                    # we have reached the end of this field,
                    # we try to parse a new one
                    yield u''.join(buf)
                    buf = []
                    idx += 1
                    try:
                        separator = fields_data[idx]
                    except IndexError:
                        return
                    if separator != u',':
                        # FIX: typo in original message ("expeceted")
                        raise exceptions.ParsingError("Field separator was expected")
                    idx += 1
                    break  # we have a new field
                elif char == u'\\':
                    idx += 1
                    try:
                        char = ESCAPE_MAP[fields_data[idx]]
                    except IndexError:
                        raise exceptions.ParsingError("Escaped char was expected")
                    except KeyError:
                        # unknown escape: keep the escaped char as-is
                        char = fields_data[idx]
                        log.warning(u"Unknown key to escape: {}".format(char))
                buf.append(char)

    def parseFields(self, headers, data):
        """Map the values of one data line to their section headers

        @param headers(list): headers of the current data section
        @param data(unicode): raw data line
        @return (dict): field name => field value
        """
        return dict(itertools.izip(headers, self.readFields(data)))

    def postHandler(self, headers, data, index):
        """Parse one [post] line and register the post in posts_data"""
        post = self.parseFields(headers, data)
        log.debug(u'({}) post found: {}'.format(index, post['post_title']))
        mb_data = {'id': self.getPostId(post),
                   'published': self.getTime(post, 'post_creadt'),
                   'updated': self.getTime(post, 'post_upddt'),
                   'author': post['user_id'],  # their user info is not in the archive
                   # TODO: option to specify user info
                   'content_xhtml': u"{}{}".format(post['post_content_xhtml'], post['post_excerpt_xhtml']),
                   'title': post['post_title'],
                   'allow_comments': C.boolConst(bool(int(post['post_open_comment']))),
                   }
        self.posts_data[post['post_id']] = {'blog': mb_data, 'comments': [[]], 'url': u'/post/{}'.format(post['post_url'])}

    def metaHandler(self, headers, data, index):
        """Parse one [meta] line, keeping only tags"""
        meta = self.parseFields(headers, data)
        if meta['meta_type'] == 'tag':
            tags = self.tags.setdefault(meta['post_id'], set())
            tags.add(meta['meta_id'])

    def metaFinishedHandler(self):
        """Attach the collected tags to their posts once the [meta] section ends"""
        for post_id, tags in self.tags.iteritems():
            data_format.iter2dict('tag', tags, self.posts_data[post_id]['blog'])
        del self.tags

    def commentHandler(self, headers, data, index):
        """Parse one [comment] line and attach the comment to its post"""
        comment = self.parseFields(headers, data)
        if comment['comment_site']:
            # we don't use atom:uri because it's used for jid in XMPP
            # cgi.escape doesn't escape double quotes, so they are neutralized
            # separately to keep the href attribute safe
            content = u'{}\n<hr>\n<a href="{}">author website</a>'.format(
                comment['comment_content'],
                cgi.escape(comment['comment_site']).replace('"', u'%22'))
        else:
            content = comment['comment_content']
        mb_data = {'id': self.getCommentId(comment),
                   'published': self.getTime(comment, 'comment_dt'),
                   'updated': self.getTime(comment, 'comment_upddt'),
                   'author': comment['comment_author'],
                   # we don't keep email addresses to avoid the author to be spammed
                   # (they would be available publicly else)
                   # 'author_email': comment['comment_email'],
                   'content_xhtml': content,
                   }
        self.posts_data[comment['post_id']]['comments'][0].append(
            {'blog': mb_data, 'comments': [[]]})

    def parse(self, db_path):
        """Parse a whole Dotclear flat export file

        @param db_path(str): path to the flat backup file
        @return (tuple): iterator on posts data, and total number of posts
        """
        with open(db_path) as f:
            signature = f.readline().decode('utf-8')
            try:
                version = signature.split('|')[1]
            except IndexError:
                version = None
            log.debug(u"Dotclear version: {}".format(version))
            data_type = None
            data_headers = None
            index = None
            while True:
                buf = f.readline().decode('utf-8')
                if not buf:
                    break
                if buf.startswith('['):
                    # new data section, e.g. "[post post_id,blog_id,...]"
                    header = buf.split(' ', 1)
                    data_type = header[0][1:]
                    if data_type not in KNOWN_DATA_TYPES:
                        # FIX: typo in original message ("unkown")
                        log.warning(u"unknown data type: {}".format(data_type))
                    index = 0
                    try:
                        data_headers = header[1].split(',')
                        # we need to remove the ']' from the last header
                        last_header = data_headers[-1]
                        data_headers[-1] = last_header[:last_header.rfind(']')]
                    except IndexError:
                        # FIX: original message had a stray ')' ("Can't read data)")
                        log.warning(u"Can't read data headers")
                else:
                    if data_type is None:
                        continue
                    buf = buf.strip()
                    if not buf and data_type in KNOWN_DATA_TYPES:
                        # empty line: current section is finished; call the
                        # optional <type>FinishedHandler if it exists
                        try:
                            finished_handler = getattr(self, '{}FinishedHandler'.format(data_type))
                        except AttributeError:
                            pass
                        else:
                            finished_handler()
                        log.debug(u"{} data finished".format(data_type))
                        data_type = None
                        continue
                    assert data_type
                    # data line: call the optional <type>Handler if it exists
                    try:
                        fields_handler = getattr(self, '{}Handler'.format(data_type))
                    except AttributeError:
                        pass
                    else:
                        fields_handler(data_headers, buf, index)
                    index += 1
        return (self.posts_data.itervalues(), len(self.posts_data))
237
238
class DotclearImport(object):
    """Plugin entry point: registers the Dotclear importer with BLOG_IMPORT"""

    def __init__(self, host):
        log.info(_("plugin Dotclear Import initialization"))
        self.host = host
        # make this importer available under the "dotclear" name
        blog_import = host.plugins['BLOG_IMPORT']
        blog_import.register('dotclear', self.DcImport, SHORT_DESC, LONG_DESC)

    def DcImport(self, client, location, options=None):
        """Import a Dotclear flat backup file

        @param client: client session (unused here)
        @param location(unicode): absolute path to the flat backup file
        @param options(dict, None): import options (unused here)
        @return (Deferred): fires with the result of DotclearParser.parse
        @raise exceptions.DataError: location is not an absolute path
        """
        if os.path.isabs(location):
            # parsing may be long on big blogs, so run it in a thread
            parser = DotclearParser()
            return threads.deferToThread(parser.parse, location)
        raise exceptions.DataError(u"An absolute path to backup data need to be given as location")