comparison sat/plugins/plugin_blog_import_dotclear.py @ 3028:ab2696e34d29

Python 3 port: /!\ this is a huge commit /!\ starting from this commit, SàT needs Python 3.6+ /!\ SàT may be unstable or some features may not work anymore, this will improve with time This patch ports backend, bridge and frontends to Python 3. Roughly this has been done this way: - 2to3 tool has been applied (with python 3.7) - all references to python2 have been replaced with python3 (notably shebangs) - fixed files not handled by 2to3 (notably the shell script) - several manual fixes - fixed issues reported by Python 3 that were not handled in Python 2 - replaced "async" with "async_" when needed (it's a reserved word from Python 3.7) - replaced zope's "implements" with @implementer decorator - temporary hack to handle data pickled in database, as str or bytes may be returned, to be checked later - fixed hash comparison for password - removed some code which is not needed anymore with Python 3 - deactivated some code which needs to be checked (notably certificate validation) - tested with jp, fixed reported issues until some basic commands worked - ported Primitivus (after porting dependencies like urwid satext) - more manual fixes
author Goffi <goffi@goffi.org>
date Tue, 13 Aug 2019 19:08:41 +0200
parents 003b8b4b56a7
children 9d0df638c8b4
comparison
equal deleted inserted replaced
3027:ff5bcb12ae60 3028:ab2696e34d29
1 #!/usr/bin/env python2 1 #!/usr/bin/env python3
2 # -*- coding: utf-8 -*- 2 # -*- coding: utf-8 -*-
3 3
4 # SàT plugin for import external blogs 4 # SàT plugin for import external blogs
5 # Copyright (C) 2009-2019 Jérôme Poisson (goffi@goffi.org) 5 # Copyright (C) 2009-2019 Jérôme Poisson (goffi@goffi.org)
6 6
40 C.PI_MAIN: "DotclearImport", 40 C.PI_MAIN: "DotclearImport",
41 C.PI_HANDLER: "no", 41 C.PI_HANDLER: "no",
42 C.PI_DESCRIPTION: _("""Blog importer for Dotclear blog engine."""), 42 C.PI_DESCRIPTION: _("""Blog importer for Dotclear blog engine."""),
43 } 43 }
44 44
45 SHORT_DESC = D_(u"import posts from Dotclear blog engine") 45 SHORT_DESC = D_("import posts from Dotclear blog engine")
46 46
47 LONG_DESC = D_( 47 LONG_DESC = D_(
48 u"""This importer handle Dotclear blog engine. 48 """This importer handle Dotclear blog engine.
49 49
50 To use it, you'll need to export your blog to a flat file. 50 To use it, you'll need to export your blog to a flat file.
51 You must go in your admin interface and select Plugins/Maintenance then Backup. 51 You must go in your admin interface and select Plugins/Maintenance then Backup.
52 Export only one blog if you have many, i.e. select "Download database of current blog" 52 Export only one blog if you have many, i.e. select "Download database of current blog"
53 Depending on your configuration, your may need to use Import/Export plugin and export as a flat file. 53 Depending on your configuration, your may need to use Import/Export plugin and export as a flat file.
54 54
55 location: you must use the absolute path to your backup for the location parameter 55 location: you must use the absolute path to your backup for the location parameter
56 """ 56 """
57 ) 57 )
58 POST_ID_PREFIX = u"sat_dc_" 58 POST_ID_PREFIX = "sat_dc_"
59 KNOWN_DATA_TYPES = ( 59 KNOWN_DATA_TYPES = (
60 "link", 60 "link",
61 "setting", 61 "setting",
62 "post", 62 "post",
63 "meta", 63 "meta",
64 "media", 64 "media",
65 "post_media", 65 "post_media",
66 "comment", 66 "comment",
67 "captcha", 67 "captcha",
68 ) 68 )
69 ESCAPE_MAP = {"r": u"\r", "n": u"\n", '"': u'"', "\\": u"\\"} 69 ESCAPE_MAP = {"r": "\r", "n": "\n", '"': '"', "\\": "\\"}
70 70
71 71
72 class DotclearParser(object): 72 class DotclearParser(object):
73 # XXX: we have to parse all file to build data 73 # XXX: we have to parse all file to build data
74 # this can be ressource intensive on huge blogs 74 # this can be ressource intensive on huge blogs
81 """Return a unique and constant post id 81 """Return a unique and constant post id
82 82
83 @param post(dict): parsed post data 83 @param post(dict): parsed post data
84 @return (unicode): post unique item id 84 @return (unicode): post unique item id
85 """ 85 """
86 return u"{}_{}_{}_{}:{}".format( 86 return "{}_{}_{}_{}:{}".format(
87 POST_ID_PREFIX, 87 POST_ID_PREFIX,
88 post["blog_id"], 88 post["blog_id"],
89 post["user_id"], 89 post["user_id"],
90 post["post_id"], 90 post["post_id"],
91 post["post_url"], 91 post["post_url"],
97 @param comment(dict): parsed comment 97 @param comment(dict): parsed comment
98 @return (unicode): comment unique comment id 98 @return (unicode): comment unique comment id
99 """ 99 """
100 post_id = comment["post_id"] 100 post_id = comment["post_id"]
101 parent_item_id = self.posts_data[post_id]["blog"]["id"] 101 parent_item_id = self.posts_data[post_id]["blog"]["id"]
102 return u"{}_comment_{}".format(parent_item_id, comment["comment_id"]) 102 return "{}_comment_{}".format(parent_item_id, comment["comment_id"])
103 103
104 def getTime(self, data, key): 104 def getTime(self, data, key):
105 """Parse time as given by dotclear, with timezone handling 105 """Parse time as given by dotclear, with timezone handling
106 106
107 @param data(dict): dotclear data (post or comment) 107 @param data(dict): dotclear data (post or comment)
123 except IndexError: 123 except IndexError:
124 raise exceptions.ParsingError("Data was expected") 124 raise exceptions.ParsingError("Data was expected")
125 if char == '"': 125 if char == '"':
126 # we have reached the end of this field, 126 # we have reached the end of this field,
127 # we try to parse a new one 127 # we try to parse a new one
128 yield u"".join(buf) 128 yield "".join(buf)
129 buf = [] 129 buf = []
130 idx += 1 130 idx += 1
131 try: 131 try:
132 separator = fields_data[idx] 132 separator = fields_data[idx]
133 except IndexError: 133 except IndexError:
134 return 134 return
135 if separator != u",": 135 if separator != ",":
136 raise exceptions.ParsingError("Field separator was expeceted") 136 raise exceptions.ParsingError("Field separator was expeceted")
137 idx += 1 137 idx += 1
138 break # we have a new field 138 break # we have a new field
139 elif char == u"\\": 139 elif char == "\\":
140 idx += 1 140 idx += 1
141 try: 141 try:
142 char = ESCAPE_MAP[fields_data[idx]] 142 char = ESCAPE_MAP[fields_data[idx]]
143 except IndexError: 143 except IndexError:
144 raise exceptions.ParsingError("Escaped char was expected") 144 raise exceptions.ParsingError("Escaped char was expected")
145 except KeyError: 145 except KeyError:
146 char = fields_data[idx] 146 char = fields_data[idx]
147 log.warning(u"Unknown key to escape: {}".format(char)) 147 log.warning("Unknown key to escape: {}".format(char))
148 buf.append(char) 148 buf.append(char)
149 149
150 def parseFields(self, headers, data): 150 def parseFields(self, headers, data):
151 return dict(itertools.izip(headers, self.readFields(data))) 151 return dict(zip(headers, self.readFields(data)))
152 152
153 def postHandler(self, headers, data, index): 153 def postHandler(self, headers, data, index):
154 post = self.parseFields(headers, data) 154 post = self.parseFields(headers, data)
155 log.debug(u"({}) post found: {}".format(index, post["post_title"])) 155 log.debug("({}) post found: {}".format(index, post["post_title"]))
156 mb_data = { 156 mb_data = {
157 "id": self.getPostId(post), 157 "id": self.getPostId(post),
158 "published": self.getTime(post, "post_creadt"), 158 "published": self.getTime(post, "post_creadt"),
159 "updated": self.getTime(post, "post_upddt"), 159 "updated": self.getTime(post, "post_upddt"),
160 "author": post["user_id"], # there use info are not in the archive 160 "author": post["user_id"], # there use info are not in the archive
161 # TODO: option to specify user info 161 # TODO: option to specify user info
162 "content_xhtml": u"{}{}".format( 162 "content_xhtml": "{}{}".format(
163 post["post_content_xhtml"], post["post_excerpt_xhtml"] 163 post["post_content_xhtml"], post["post_excerpt_xhtml"]
164 ), 164 ),
165 "title": post["post_title"], 165 "title": post["post_title"],
166 "allow_comments": C.boolConst(bool(int(post["post_open_comment"]))), 166 "allow_comments": C.boolConst(bool(int(post["post_open_comment"]))),
167 } 167 }
168 self.posts_data[post["post_id"]] = { 168 self.posts_data[post["post_id"]] = {
169 "blog": mb_data, 169 "blog": mb_data,
170 "comments": [[]], 170 "comments": [[]],
171 "url": u"/post/{}".format(post["post_url"]), 171 "url": "/post/{}".format(post["post_url"]),
172 } 172 }
173 173
174 def metaHandler(self, headers, data, index): 174 def metaHandler(self, headers, data, index):
175 meta = self.parseFields(headers, data) 175 meta = self.parseFields(headers, data)
176 if meta["meta_type"] == "tag": 176 if meta["meta_type"] == "tag":
177 tags = self.tags.setdefault(meta["post_id"], set()) 177 tags = self.tags.setdefault(meta["post_id"], set())
178 tags.add(meta["meta_id"]) 178 tags.add(meta["meta_id"])
179 179
180 def metaFinishedHandler(self): 180 def metaFinishedHandler(self):
181 for post_id, tags in self.tags.iteritems(): 181 for post_id, tags in self.tags.items():
182 data_format.iter2dict("tag", tags, self.posts_data[post_id]["blog"]) 182 data_format.iter2dict("tag", tags, self.posts_data[post_id]["blog"])
183 del self.tags 183 del self.tags
184 184
185 def commentHandler(self, headers, data, index): 185 def commentHandler(self, headers, data, index):
186 comment = self.parseFields(headers, data) 186 comment = self.parseFields(headers, data)
187 if comment["comment_site"]: 187 if comment["comment_site"]:
188 # we don't use atom:uri because it's used for jid in XMPP 188 # we don't use atom:uri because it's used for jid in XMPP
189 content = u'{}\n<hr>\n<a href="{}">author website</a>'.format( 189 content = '{}\n<hr>\n<a href="{}">author website</a>'.format(
190 comment["comment_content"], 190 comment["comment_content"],
191 cgi.escape(comment["comment_site"]).replace('"', u"%22"), 191 cgi.escape(comment["comment_site"]).replace('"', "%22"),
192 ) 192 )
193 else: 193 else:
194 content = comment["comment_content"] 194 content = comment["comment_content"]
195 mb_data = { 195 mb_data = {
196 "id": self.getCommentId(comment), 196 "id": self.getCommentId(comment),
206 {"blog": mb_data, "comments": [[]]} 206 {"blog": mb_data, "comments": [[]]}
207 ) 207 )
208 208
209 def parse(self, db_path): 209 def parse(self, db_path):
210 with open(db_path) as f: 210 with open(db_path) as f:
211 signature = f.readline().decode("utf-8") 211 signature = f.readline()
212 try: 212 try:
213 version = signature.split("|")[1] 213 version = signature.split("|")[1]
214 except IndexError: 214 except IndexError:
215 version = None 215 version = None
216 log.debug(u"Dotclear version: {}".format(version)) 216 log.debug("Dotclear version: {}".format(version))
217 data_type = None 217 data_type = None
218 data_headers = None 218 data_headers = None
219 index = None 219 index = None
220 while True: 220 while True:
221 buf = f.readline().decode("utf-8") 221 buf = f.readline()
222 if not buf: 222 if not buf:
223 break 223 break
224 if buf.startswith("["): 224 if buf.startswith("["):
225 header = buf.split(" ", 1) 225 header = buf.split(" ", 1)
226 data_type = header[0][1:] 226 data_type = header[0][1:]
227 if data_type not in KNOWN_DATA_TYPES: 227 if data_type not in KNOWN_DATA_TYPES:
228 log.warning(u"unkown data type: {}".format(data_type)) 228 log.warning("unkown data type: {}".format(data_type))
229 index = 0 229 index = 0
230 try: 230 try:
231 data_headers = header[1].split(",") 231 data_headers = header[1].split(",")
232 # we need to remove the ']' from the last header 232 # we need to remove the ']' from the last header
233 last_header = data_headers[-1] 233 last_header = data_headers[-1]
234 data_headers[-1] = last_header[: last_header.rfind("]")] 234 data_headers[-1] = last_header[: last_header.rfind("]")]
235 except IndexError: 235 except IndexError:
236 log.warning(u"Can't read data)") 236 log.warning("Can't read data)")
237 else: 237 else:
238 if data_type is None: 238 if data_type is None:
239 continue 239 continue
240 buf = buf.strip() 240 buf = buf.strip()
241 if not buf and data_type in KNOWN_DATA_TYPES: 241 if not buf and data_type in KNOWN_DATA_TYPES:
245 ) 245 )
246 except AttributeError: 246 except AttributeError:
247 pass 247 pass
248 else: 248 else:
249 finished_handler() 249 finished_handler()
250 log.debug(u"{} data finished".format(data_type)) 250 log.debug("{} data finished".format(data_type))
251 data_type = None 251 data_type = None
252 continue 252 continue
253 assert data_type 253 assert data_type
254 try: 254 try:
255 fields_handler = getattr(self, "{}Handler".format(data_type)) 255 fields_handler = getattr(self, "{}Handler".format(data_type))
256 except AttributeError: 256 except AttributeError:
257 pass 257 pass
258 else: 258 else:
259 fields_handler(data_headers, buf, index) 259 fields_handler(data_headers, buf, index)
260 index += 1 260 index += 1
261 return (self.posts_data.itervalues(), len(self.posts_data)) 261 return (iter(self.posts_data.values()), len(self.posts_data))
262 262
263 263
264 class DotclearImport(object): 264 class DotclearImport(object):
265 def __init__(self, host): 265 def __init__(self, host):
266 log.info(_("plugin Dotclear Import initialization")) 266 log.info(_("plugin Dotclear Import initialization"))
270 ) 270 )
271 271
272 def DcImport(self, client, location, options=None): 272 def DcImport(self, client, location, options=None):
273 if not os.path.isabs(location): 273 if not os.path.isabs(location):
274 raise exceptions.DataError( 274 raise exceptions.DataError(
275 u"An absolute path to backup data need to be given as location" 275 "An absolute path to backup data need to be given as location"
276 ) 276 )
277 dc_parser = DotclearParser() 277 dc_parser = DotclearParser()
278 d = threads.deferToThread(dc_parser.parse, location) 278 d = threads.deferToThread(dc_parser.parse, location)
279 return d 279 return d