comparison libervia/backend/plugins/plugin_blog_import.py @ 4071:4b842c1fb686
refactoring: renamed `sat` package to `libervia.backend`
author: Goffi <goffi@goffi.org>
date: Fri, 02 Jun 2023 11:49:51 +0200
parents: sat/plugins/plugin_blog_import.py@524856bd7b19
children: (none)
comparison: 4070:d10748475025 vs 4071:4b842c1fb686
#!/usr/bin/env python3


# SàT plugin for importing external blogs
# Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


from libervia.backend.core.i18n import _
from libervia.backend.core.constants import Const as C
from libervia.backend.core.log import getLogger

log = getLogger(__name__)
from twisted.internet import defer
from twisted.web import client as web_client
from twisted.words.xish import domish
from libervia.backend.core import exceptions
from libervia.backend.tools import xml_tools
import os
import os.path
import tempfile
import urllib.parse
import shortuuid


PLUGIN_INFO = {
    C.PI_NAME: "blog import",
    C.PI_IMPORT_NAME: "BLOG_IMPORT",
    C.PI_TYPE: (C.PLUG_TYPE_BLOG, C.PLUG_TYPE_IMPORT),
    C.PI_DEPENDENCIES: ["IMPORT", "XEP-0060", "XEP-0277", "TEXT_SYNTAXES", "UPLOAD"],
    C.PI_MAIN: "BlogImportPlugin",
    C.PI_HANDLER: "no",
    C.PI_DESCRIPTION: _(
        """Blog import management:
This plugin manages the different blog importers which can register to it, and handles generic importing tasks."""
    ),
}

OPT_HOST = "host"
OPT_UPLOAD_IMAGES = "upload_images"
OPT_UPLOAD_IGNORE_HOST = "upload_ignore_host"
OPT_IGNORE_TLS = "ignore_tls_errors"
URL_REDIRECT_PREFIX = "url_redirect_"


class BlogImportPlugin(object):
    BOOL_OPTIONS = (OPT_UPLOAD_IMAGES, OPT_IGNORE_TLS)
    JSON_OPTIONS = ()
    OPT_DEFAULTS = {OPT_UPLOAD_IMAGES: True, OPT_IGNORE_TLS: False}

    def __init__(self, host):
        log.info(_("plugin Blog import initialization"))
        self.host = host
        self._u = host.plugins["UPLOAD"]
        self._p = host.plugins["XEP-0060"]
        self._m = host.plugins["XEP-0277"]
        self._s = self.host.plugins["TEXT_SYNTAXES"]
        host.plugins["IMPORT"].initialize(self, "blog")

    def import_item(
        self, client, item_import_data, session, options, return_data, service, node
    ):
        """import_item specialized for blog import

        @param item_import_data(dict):
            * mandatory keys:
                'blog' (dict): microblog data of the blog post (cf. http://wiki.goffi.org/wiki/Bridge_API_-_Microblogging/en)
                    the importer MUST NOT create the node or call the XEP-0277 plugin itself
                    'comments*' keys MUST NOT be used in this microblog_data, see below for comments
                    It is recommended to use a unique id in the "id" key which is constant per blog item,
                    so if the import fails, a new import will overwrite the failed items and avoid duplicates.

                'comments' (list[list[dict]], None): dictionaries must have the same keys as the main item (i.e. 'blog' and 'comments')
                    a list of lists is used because XEP-0277 can handle several comments nodes,
                    but in most cases there will be only one item in the first list (something like [[{comment1_data},{comment2_data}, ...]])
                    blog['allow_comments'] must be True if there is any comment, and False (or not present) if comments are not allowed.
                    If allow_comments is False and some comments are present, an exceptions.DataError will be raised
            * optional keys:
                'url' (unicode): former url of the post (only the path, without the host part)
                    if present, the association with the new path will be displayed to the user, so they can set up redirections if necessary
        @param options(dict, None): generic options are listed below,
            blog importers can have specific ones. All options have unicode values
            generic options:
                - OPT_HOST (unicode): original host
                - OPT_UPLOAD_IMAGES (bool): upload images to the XMPP server if True
                    see OPT_UPLOAD_IGNORE_HOST.
                    Default: True
                - OPT_UPLOAD_IGNORE_HOST (unicode): don't upload images from this host
                - OPT_IGNORE_TLS (bool): ignore TLS errors for image upload.
                    Default: False
        @param return_data(dict): will contain the link between former posts and new items

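        Illustrative example (hypothetical values, real data comes from the
        specific importer):
            item_import_data = {
                "blog": {
                    "id": "first-post",
                    "title": "My first post",
                    "content_xhtml": "<div><p>Hello world</p></div>",
                },
                "comments": [[]],  # no comment on this item
                "url": "/2021/01/first-post",
            }
            options = {OPT_HOST: "https://blog.example.net"}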
105 """ | |
106 mb_data = item_import_data["blog"] | |
107 try: | |
108 item_id = mb_data["id"] | |
109 except KeyError: | |
110 item_id = mb_data["id"] = str(shortuuid.uuid()) | |
111 | |
112 try: | |
113 # we keep the link between old url and new blog item | |
114 # so the user can redirect its former blog urls | |
115 old_uri = item_import_data["url"] | |
116 except KeyError: | |
117 pass | |
118 else: | |
119 new_uri = return_data[URL_REDIRECT_PREFIX + old_uri] = self._p.get_node_uri( | |
120 service if service is not None else client.jid.userhostJID(), | |
121 node or self._m.namespace, | |
122 item_id, | |
123 ) | |
124 log.info("url link from {old} to {new}".format(old=old_uri, new=new_uri)) | |
125 | |
126 return mb_data | |
127 | |
    @defer.inlineCallbacks
    def import_sub_items(self, client, item_import_data, mb_data, session, options):
        # comments data
        if len(item_import_data["comments"]) != 1:
            raise NotImplementedError("can't manage multiple comment links")
        allow_comments = C.bool(mb_data.get("allow_comments", C.BOOL_FALSE))
        if allow_comments:
            comments_service = yield self._m.get_comments_service(client)
            comments_node = self._m.get_comments_node(mb_data["id"])
            mb_data["comments_service"] = comments_service.full()
            mb_data["comments_node"] = comments_node
            recurse_kwargs = {
                "items_import_data": item_import_data["comments"][0],
                "service": comments_service,
                "node": comments_node,
            }
            defer.returnValue(recurse_kwargs)
        else:
            if item_import_data["comments"][0]:
                raise exceptions.DataError(
                    "allow_comments set to False, but comments are there"
                )
            defer.returnValue(None)

    def publish_item(self, client, mb_data, service, node, session):
        log.debug(
            "uploading item [{id}]: {title}".format(
                id=mb_data["id"], title=mb_data.get("title", "")
            )
        )
        return self._m.send(client, mb_data, service, node)

    @defer.inlineCallbacks
    def item_filters(self, client, mb_data, session, options):
        """Apply filters according to options

        modify mb_data in place
        @param mb_data(dict): microblog data of the item, as returned by the importer callback
        @param options(dict): dict as given in [blogImport]
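
        A minimal illustrative example (hypothetical values): if mb_data contains
        "content_rich": "**hello**" and the profile's current syntax is markdown,
        the value is converted to XHTML, stored in "content_xhtml" (and
        "content_rich" is removed), so the image filters below can work on the
        XHTML tree.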
167 """ | |
168 # FIXME: blog filters don't work on text content | |
169 # TODO: text => XHTML conversion should handler links with <a/> | |
170 # filters can then be used by converting text to XHTML | |
171 if not options: | |
172 return | |
173 | |
174 # we want only XHTML content | |
175 for prefix in ( | |
176 "content", | |
177 ): # a tuple is use, if title need to be added in the future | |
178 try: | |
179 rich = mb_data["{}_rich".format(prefix)] | |
180 except KeyError: | |
181 pass | |
182 else: | |
183 if "{}_xhtml".format(prefix) in mb_data: | |
184 raise exceptions.DataError( | |
185 "importer gave {prefix}_rich and {prefix}_xhtml at the same time, this is not allowed".format( | |
186 prefix=prefix | |
187 ) | |
188 ) | |
189 # we convert rich syntax to XHTML here, so we can handle filters easily | |
190 converted = yield self._s.convert( | |
191 rich, self._s.get_current_syntax(client.profile), safe=False | |
192 ) | |
193 mb_data["{}_xhtml".format(prefix)] = converted | |
194 del mb_data["{}_rich".format(prefix)] | |
195 | |
            try:
                mb_data["{}_text".format(prefix)]
            except KeyError:
                pass
            else:
                if "{}_xhtml".format(prefix) in mb_data:
                    log.warning(
                        "{prefix}_text will be replaced by converted {prefix}_xhtml, so filters can be handled".format(
                            prefix=prefix
                        )
                    )
                    del mb_data["{}_text".format(prefix)]
                else:
                    log.warning(
                        "importer gave a text {prefix}, blog filters don't work on text {prefix}".format(
                            prefix=prefix
                        )
                    )
                    return

        # at this point, we have only the XHTML version of content
        try:
            top_elt = xml_tools.ElementParser()(
                mb_data["content_xhtml"], namespace=C.NS_XHTML
            )
        except domish.ParserError:
            # we clean the XML and try our luck again
            cleaned = yield self._s.clean_xhtml(mb_data["content_xhtml"])
            top_elt = xml_tools.ElementParser()(cleaned, namespace=C.NS_XHTML)
        opt_host = options.get(OPT_HOST)
        if opt_host:
            # we normalise the domain
            parsed_host = urllib.parse.urlsplit(opt_host)
            opt_host = urllib.parse.urlunsplit(
                (
                    parsed_host.scheme or "http",
                    parsed_host.netloc or parsed_host.path,
                    "",
                    "",
                    "",
                )
            )

        tmp_dir = tempfile.mkdtemp()
        try:
            # TODO: would be nice to also update the hyperlinks to these images, e.g. when you have <a href="{url}"><img src="{url}"></a>
            for img_elt in xml_tools.find_all(top_elt, names=["img"]):
                yield self.img_filters(client, img_elt, options, opt_host, tmp_dir)
        finally:
            os.rmdir(tmp_dir)  # XXX: tmp_dir should be empty, or something went wrong

        # we now replace the content with the filtered one
        mb_data["content_xhtml"] = top_elt.toXml()

    @defer.inlineCallbacks
    def img_filters(self, client, img_elt, options, opt_host, tmp_dir):
        """Filters handling images

        urls without a host are fixed (if possible)
        according to options, images are uploaded to the XMPP server
        @param img_elt(domish.Element): <img/> element to handle
        @param options(dict): filters options
        @param opt_host(unicode): normalised host given in options
        @param tmp_dir(str): path to temp directory
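
        Illustrative example (hypothetical values): with opt_host set to
        "https://blog.example.net", <img src="/media/cat.png"/> becomes
        <img src="https://blog.example.net/media/cat.png"/>; if OPT_UPLOAD_IMAGES
        is set, the image is then downloaded, re-uploaded with the UPLOAD plugin,
        and "src" is replaced by the upload URL.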
260 """ | |
261 try: | |
262 url = img_elt["src"] | |
263 if url[0] == "/": | |
264 if not opt_host: | |
265 log.warning( | |
266 "host was not specified, we can't deal with src without host ({url}) and have to ignore the following <img/>:\n{xml}".format( | |
267 url=url, xml=img_elt.toXml() | |
268 ) | |
269 ) | |
270 return | |
271 else: | |
272 url = urllib.parse.urljoin(opt_host, url) | |
273 filename = url.rsplit("/", 1)[-1].strip() | |
274 if not filename: | |
275 raise KeyError | |
276 except (KeyError, IndexError): | |
277 log.warning("ignoring invalid img element: {}".format(img_elt.toXml())) | |
278 return | |
279 | |
280 # we change the url for the normalized one | |
281 img_elt["src"] = url | |

        if options.get(OPT_UPLOAD_IMAGES, False):
            # upload is requested
            try:
                ignore_host = options[OPT_UPLOAD_IGNORE_HOST]
            except KeyError:
                pass
            else:
                # if the host is the ignored one, we skip the upload
                parsed_url = urllib.parse.urlsplit(url)
                if ignore_host in parsed_url.hostname:
                    log.info(
                        "Don't upload image at {url} because of {opt} option".format(
                            url=url, opt=OPT_UPLOAD_IGNORE_HOST
                        )
                    )
                    return

            # we download images and re-upload them via XMPP
            tmp_file = os.path.join(tmp_dir, filename).encode("utf-8")
            upload_options = {"ignore_tls_errors": options.get(OPT_IGNORE_TLS, False)}

            try:
                yield web_client.downloadPage(url.encode("utf-8"), tmp_file)
                filename = filename.replace(
                    "%", "_"
                )  # FIXME: tmp workaround for a bug in prosody http upload
                __, download_d = yield self._u.upload(
                    client, tmp_file, filename, extra=upload_options
                )
                download_url = yield download_d
            except Exception as e:
                log.warning(
                    "can't download image at {url}: {reason}".format(url=url, reason=e)
                )
            else:
                img_elt["src"] = download_url

            try:
                os.unlink(tmp_file)
            except OSError:
                pass