comparison src/plugins/plugin_blog_import_dotclear.py @ 1826:d80ccf4bf201

plugin blog import dotclear: this plugin import Dotclear 2 backups
author Goffi <goffi@goffi.org>
date Fri, 22 Jan 2016 20:24:17 +0100
parents
children d3354c80bd1f
comparison
equal deleted inserted replaced
1825:4e51f21c687f 1826:d80ccf4bf201
1 #!/usr/bin/python
2 # -*- coding: utf-8 -*-
3
4 # SàT plugin for import external blogs
5 # Copyright (C) 2009-2016 Jérôme Poisson (goffi@goffi.org)
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
16
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
20 from sat.core.i18n import _, D_
21 from sat.core.constants import Const as C
22 from sat.core.log import getLogger
23 log = getLogger(__name__)
24 from sat.core import exceptions
25 from sat.tools import common
26 from twisted.internet import threads
27 from collections import OrderedDict
28 import itertools
29 import time
30 import cgi
31 import os.path
32
33
# Plugin metadata: "main" names the entry point class defined in this module,
# and "dependencies" lists the BLOG_IMPORT plugin this importer registers to.
PLUGIN_INFO = {
    "name": "Dotclear import",
    "import_name": "IMPORT_DOTCLEAR",
    "type": C.PLUG_TYPE_BLOG,
    "dependencies": ["BLOG_IMPORT"],
    "main": "DotclearImport",
    "handler": "no",
    "description": _("""Blog importer for Dotclear blog engine."""),
}

# One-line description given to the BLOG_IMPORT plugin when registering
SHORT_DESC = D_(u"import posts from Dotclear blog engine")

# Detailed usage instructions given to the BLOG_IMPORT plugin when registering
LONG_DESC = D_(u"""This importer handle Dotclear blog engine.

To use it, you'll need to export your blog to a flat file.
You must go in your admin interface and select Plugins/Maintenance then Backup.
Export only one blog if you have many, i.e. select "Download database of current blog"
Depending on your configuration, your may need to use Import/Export plugin and export as a flat file.

location: you must use the absolute path to your backup for the location parameter
""")
# Prefix shared by the ids of all the imported posts (and of their comments)
POST_ID_PREFIX = u"SàT_DOTCLEAR_IMPORT_BLOG"

# Section names expected in a backup, a warning is logged for any other one
KNOWN_DATA_TYPES = (
    'link',
    'setting',
    'post',
    'meta',
    'media',
    'post_media',
    'comment',
    'captcha',
)

# Character found after a backslash in a field => character it stands for
ESCAPE_MAP = {
    '\\': u'\\',
    '"': u'"',
    'n': u'\n',
    'r': u'\r',
}
63
64
class DotclearParser(object):
    """Parser for Dotclear flat file backups

    The whole backup is read by [parse], which gives each data row to the
    "<section>Handler" method of its section (if any), then calls the
    "<section>FinishedHandler" method (if any) once the section is over.
    Parsed posts are stored in [posts_data], in the order they are found.
    """
    # XXX: we have to parse the whole file to build data
    #      this can be resource intensive on huge blogs

    def __init__(self):
        # post_id => {'blog': post data, 'comments': [[comments]], 'url': post path}
        self.posts_data = OrderedDict()
        # post_id => set of tags, filled while the "meta" section is read
        self.tags = {}

    def getPostId(self, post):
        """Return a unique and constant post id

        @param post(dict): parsed post data
        @return (unicode): post unique item id
        """
        return u"{}_{}_{}_{}:{}".format(POST_ID_PREFIX, post['blog_id'], post['user_id'], post['post_id'], post['post_url'])

    def getCommentId(self, comment):
        """Return a unique and constant comment id

        The id is built from the id of the commented post, which must
        already be in [posts_data].

        @param comment(dict): parsed comment
        @return (unicode): comment unique comment id
        """
        post_id = comment['post_id']
        parent_item_id = self.posts_data[post_id]['blog']['id']
        return u"{}_comment_{}".format(parent_item_id, comment['comment_id'])

    def getTime(self, data, key):
        """Parse time as given by dotclear

        No timezone conversion is done: the date is interpreted as local time
        of the machine running the import (this is what time.mktime does).

        @param data(dict): dotclear data (post or comment)
        @param key(unicode): key to get (e.g. "post_creadt")
        @return (float): Unix time
        """
        return time.mktime(time.strptime(data[key], "%Y-%m-%d %H:%M:%S"))

    def readFields(self, fields_data):
        """Generate the unescaped values of a data row

        A row is a list of double-quoted fields separated by commas,
        backslash is used as escape character inside a field.

        @param fields_data(unicode): row to parse
        @return (generator): values of the fields (unicode), in row order
        @raise exceptions.ParsingError: the row is malformed
        """
        buf = []
        idx = 0
        while True:
            # a field must start with a double quote; empty data or data ending
            # with a separator is reported as a parsing error, like other
            # malformed rows, instead of leaking an IndexError
            try:
                opening = fields_data[idx]
            except IndexError:
                raise exceptions.ParsingError("Data was expected")
            if opening != '"':
                raise exceptions.ParsingError("Field opening quote was expected")
            while True:
                idx += 1
                try:
                    char = fields_data[idx]
                except IndexError:
                    raise exceptions.ParsingError("Data was expected")
                if char == '"':
                    # we have reached the end of this field,
                    # we try to parse a new one
                    yield u''.join(buf)
                    buf = []
                    idx += 1
                    try:
                        separator = fields_data[idx]
                    except IndexError:
                        # nothing after the closing quote: the row is finished
                        return
                    if separator != u',':
                        raise exceptions.ParsingError("Field separator was expected")
                    idx += 1
                    break  # we have a new field
                elif char == u'\\':
                    idx += 1
                    try:
                        char = fields_data[idx]
                    except IndexError:
                        raise exceptions.ParsingError("Escaped char was expected")
                    try:
                        char = ESCAPE_MAP[char]
                    except KeyError:
                        # unknown escape sequence: the character is kept as is
                        log.warning(u"Unknown key to escape: {}".format(char))
                buf.append(char)

    def parseFields(self, headers, data):
        """Associate the values of a data row with the headers of its section

        @param headers(list[unicode]): names of the fields of the section
        @param data(unicode): row to parse
        @return (dict): header => value, values without header are ignored
        """
        return dict(zip(headers, self.readFields(data)))

    def postHandler(self, headers, data, index):
        """Parse a row of the "post" section and store it in [posts_data]

        @param headers(list[unicode]): names of the fields of the section
        @param data(unicode): row to parse
        @param index(int): position of the row in its section (only logged)
        """
        post = self.parseFields(headers, data)
        log.debug(u'({}) post found: {}'.format(index, post['post_title']))
        mb_data = {'id': self.getPostId(post),
                   'published': self.getTime(post, 'post_creadt'),
                   'updated': self.getTime(post, 'post_upddt'),
                   'author': post['user_id'],  # the user info is not in the archive
                                               # TODO: option to specify user info
                   'content_xhtml': u"{}{}".format(post['post_content_xhtml'], post['post_excerpt_xhtml']),
                   'title': post['post_title'],
                   'allow_comments': C.boolConst(bool(int(post['post_open_comment']))),
                   }
        self.posts_data[post['post_id']] = {'blog': mb_data, 'comments': [[]], 'url': u'/post/{}'.format(post['post_url'])}

    def metaHandler(self, headers, data, index):
        """Parse a row of the "meta" section, only tags are kept

        Tags are stored in [tags] until the end of the section.

        @param headers(list[unicode]): names of the fields of the section
        @param data(unicode): row to parse
        @param index(int): position of the row in its section (unused)
        """
        meta = self.parseFields(headers, data)
        if meta['meta_type'] == 'tag':
            tags = self.tags.setdefault(meta['post_id'], set())
            tags.add(meta['meta_id'])

    def metaFinishedHandler(self):
        """Attach the tags found in the "meta" section to their posts

        Tags of posts which have not been parsed are ignored with a warning.
        """
        for post_id, tags in self.tags.items():
            try:
                post_data = self.posts_data[post_id]
            except KeyError:
                log.warning(u"Tags found for an unknown post (id: {}), they are ignored".format(post_id))
                continue
            common.iter2dict('tag', tags, post_data['blog'])
        # tags are reset (and not deleted) so the parser keeps working if
        # other "meta" rows are found later
        self.tags = {}

    def commentHandler(self, headers, data, index):
        """Parse a row of the "comment" section and attach it to its post

        Comments of posts which have not been parsed are ignored with a warning.

        @param headers(list[unicode]): names of the fields of the section
        @param data(unicode): row to parse
        @param index(int): position of the row in its section (unused)
        """
        comment = self.parseFields(headers, data)
        post_id = comment['post_id']
        if post_id not in self.posts_data:
            log.warning(u"Comment found for an unknown post (id: {}), it is ignored".format(post_id))
            return
        if comment['comment_site']:
            # we don't use atom:uri because it's used for jid in XMPP
            # NOTE(review): HTML special chars and double quotes are neutralised,
            #               but the scheme of the URL is not checked
            content = u'{}\n<hr>\n<a href="{}">author website</a>'.format(
                comment['comment_content'],
                cgi.escape(comment['comment_site']).replace('"', u'%22'))
        else:
            content = comment['comment_content']
        mb_data = {'id': self.getCommentId(comment),
                   'published': self.getTime(comment, 'comment_dt'),
                   'updated': self.getTime(comment, 'comment_upddt'),
                   'author': comment['comment_author'],
                   # we don't keep email addresses to avoid the author to be spammed
                   # (they would be available publicly else)
                   # 'author_email': comment['comment_email'],
                   'content_xhtml': content,
                   }
        self.posts_data[post_id]['comments'][0].append(
            {'blog': mb_data, 'comments': [[]]})

    def _sectionFinished(self, data_type):
        """Call the finished handler (if any) of a known section

        @param data_type(unicode, None): section which is over,
            nothing is done if it is None or not in KNOWN_DATA_TYPES
        """
        if data_type not in KNOWN_DATA_TYPES:
            return
        finished_handler = getattr(self, '{}FinishedHandler'.format(data_type), None)
        if finished_handler is not None:
            finished_handler()
        log.debug(u"{} data finished".format(data_type))

    def parse(self, db_path):
        """Parse a Dotclear flat file backup

        @param db_path(str): path to the backup file
        @return (tuple): (iterator on parsed posts data, number of posts)
        @raise exceptions.ParsingError: a data row is malformed
        """
        # the file is read as bytes, each line is explicitly decoded from UTF-8
        with open(db_path, 'rb') as f:
            signature = f.readline().decode('utf-8')
            try:
                version = signature.split('|')[1]
            except IndexError:
                version = None
            log.debug(u"Dotclear version: {}".format(version))
            data_type = None
            data_headers = None
            index = None
            while True:
                buf = f.readline().decode('utf-8')
                if not buf:
                    # end of file
                    break
                if buf.startswith('['):
                    # new section: "[<data type> <header>,<header>,...]"
                    # the previous section may not have been closed by a blank line
                    self._sectionFinished(data_type)
                    header = buf.split(' ', 1)
                    data_type = header[0][1:]
                    if data_type not in KNOWN_DATA_TYPES:
                        log.warning(u"unknown data type: {}".format(data_type))
                    index = 0
                    if len(header) < 2:
                        # without headers the rows can't be interpreted, the
                        # section is skipped instead of reusing previous headers
                        log.warning(u"Can't read data headers")
                        data_type = None
                        continue
                    data_headers = header[1].split(',')
                    # we need to remove the ']' from the last header
                    last_header = data_headers[-1]
                    closing_idx = last_header.rfind(']')
                    if closing_idx == -1:
                        data_headers[-1] = last_header.strip()
                    else:
                        data_headers[-1] = last_header[:closing_idx]
                    continue
                if data_type is None:
                    # we are not in a section
                    continue
                buf = buf.strip()
                if not buf and data_type in KNOWN_DATA_TYPES:
                    # a blank line ends the section
                    self._sectionFinished(data_type)
                    data_type = None
                    continue
                fields_handler = getattr(self, '{}Handler'.format(data_type), None)
                if fields_handler is not None:
                    fields_handler(data_headers, buf, index)
                index += 1
            # the last section is not necessarily followed by a blank line
            self._sectionFinished(data_type)
        return (iter(self.posts_data.values()), len(self.posts_data))
237
238
class DotclearImport(object):
    """Plugin entry point: register the Dotclear importer to BLOG_IMPORT plugin"""

    def __init__(self, host):
        log.info(_("plugin Dotclear Import initialization"))
        self.host = host
        blog_import = host.plugins['BLOG_IMPORT']
        blog_import.register('dotclear', self.DcImport, SHORT_DESC, LONG_DESC)

    def DcImport(self, client, location, options=None):
        """Parse a Dotclear backup in a separate thread

        @param client: unused here
        @param location(unicode): absolute path to the backup file
        @param options: unused here
        @return (Deferred): deferred returned by threads.deferToThread
            for DotclearParser.parse
        @raise exceptions.DataError: location is not an absolute path
        """
        if os.path.isabs(location):
            # parsing is done in a thread, the result comes through the deferred
            return threads.deferToThread(DotclearParser().parse, location)
        raise exceptions.DataError(u"An absolute path to backup data need to be given as location")