comparison libervia/backend/plugins/plugin_blog_import.py @ 4071:4b842c1fb686

refactoring: renamed `sat` package to `libervia.backend`
author Goffi <goffi@goffi.org>
date Fri, 02 Jun 2023 11:49:51 +0200
parents sat/plugins/plugin_blog_import.py@524856bd7b19
children e9971a4b0627
comparison
equal deleted inserted replaced
4070:d10748475025 4071:4b842c1fb686
1 #!/usr/bin/env python3
2
3
4 # SàT plugin for import external blogs
5 # Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
16
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
20
21 from libervia.backend.core.i18n import _
22 from libervia.backend.core.constants import Const as C
23 from libervia.backend.core.log import getLogger
24
25 log = getLogger(__name__)
26 from twisted.internet import defer
27 from twisted.web import client as web_client
28 from twisted.words.xish import domish
29 from libervia.backend.core import exceptions
30 from libervia.backend.tools import xml_tools
31 import os
32 import os.path
33 import tempfile
34 import urllib.parse
35 import shortuuid
36
37
# Plugin metadata, keyed by the C.PI_* constants: the plugin declares its name,
# import name, types, the plugins it depends on (all of them are looked up in
# BlogImportPlugin.__init__) and the name of its main class.
PLUGIN_INFO = {
    C.PI_NAME: "blog import",
    C.PI_IMPORT_NAME: "BLOG_IMPORT",
    C.PI_TYPE: (C.PLUG_TYPE_BLOG, C.PLUG_TYPE_IMPORT),
    C.PI_DEPENDENCIES: ["IMPORT", "XEP-0060", "XEP-0277", "TEXT_SYNTAXES", "UPLOAD"],
    C.PI_MAIN: "BlogImportPlugin",
    C.PI_HANDLER: "no",
    C.PI_DESCRIPTION: _(
        """Blog import management:
This plugin manage the different blog importers which can register to it, and handle generic importing tasks."""
    ),
}

# Names of the generic import options (their meaning is documented in
# BlogImportPlugin.import_item).
OPT_HOST = "host"
OPT_UPLOAD_IMAGES = "upload_images"
OPT_UPLOAD_IGNORE_HOST = "upload_ignore_host"
OPT_IGNORE_TLS = "ignore_tls_errors"
# Prefix of the ``return_data`` keys which associate the former URL of a post
# with the URI of the newly created item (see BlogImportPlugin.import_item).
URL_REDIRECT_PREFIX = "url_redirect_"
56
57
class BlogImportPlugin(object):
    """Generic part of blog importing.

    On instantiation the plugin hands itself to the "IMPORT" plugin
    (``initialize(self, "blog")``). The methods below are the blog specific
    steps of an import: building the microblog data of an item, preparing the
    import of its comments, filtering its XHTML content and publishing it.
    NOTE(review): the exact order in which these hooks are called is decided by
    the "IMPORT" plugin, which is not visible from this file.
    """

    # NOTE(review): these three attributes are not read in this class, they are
    # presumably used by the "IMPORT" plugin to parse the options -- confirm.
    BOOL_OPTIONS = (OPT_UPLOAD_IMAGES, OPT_IGNORE_TLS)
    JSON_OPTIONS = ()
    OPT_DEFAULTS = {OPT_UPLOAD_IMAGES: True, OPT_IGNORE_TLS: False}

    def __init__(self, host):
        """Keep references to the needed plugins and register to "IMPORT"

        @param host: backend instance, must expose the loaded plugins in
            ``host.plugins``
        """
        log.info(_("plugin Blog import initialization"))
        self.host = host
        self._u = host.plugins["UPLOAD"]
        self._p = host.plugins["XEP-0060"]
        self._m = host.plugins["XEP-0277"]
        self._s = host.plugins["TEXT_SYNTAXES"]
        host.plugins["IMPORT"].initialize(self, "blog")

    def import_item(
        self, client, item_import_data, session, options, return_data, service, node
    ):
        """import_item specialized for blog import

        @param item_import_data(dict):
            * mandatory keys:
                'blog' (dict): microblog data of the blog post
                    (cf. http://wiki.goffi.org/wiki/Bridge_API_-_Microblogging/en)
                    the importer MUST NOT create node or call XEP-0277 plugin itself
                    'comments*' key MUST NOT be used in this microblog_data,
                    see below for comments
                    It is recommended to use a unique id in the "id" key which is
                    constant per blog item, so if the import fails, a new import
                    will overwrite the failed items and avoid duplicates.
                    If "id" is missing, a random one is generated and stored.

                'comments' (list[list[dict]],None): Dictionaries must have the same
                    keys as main item (i.e. 'blog' and 'comments')
                    a list of lists is used because XEP-0277 can handle several
                    comments nodes, but in most cases, there will be only one item
                    in the first list
                    (something like [[{comment1_data},{comment2_data}, ...]])
                    blog['allow_comments'] must be True if there is any comment,
                    and False (or not present) if comments are not allowed.
                    If allow_comments is False and some comments are present,
                    an exceptions.DataError will be raised
            * optional keys:
                'url' (unicode): former url of the post (only the path, without
                    host part)
                    if present the association to the new path will be displayed
                    to user, so it can make redirections if necessary
        @param options(dict, None): Below are the generic options,
            blog importer can have specific ones. All options have unicode values
            generic options:
                - OPT_HOST (unicode): original host
                - OPT_UPLOAD_IMAGES (bool): upload images to XMPP server if True
                    see OPT_UPLOAD_IGNORE_HOST.
                    Default: True
                - OPT_UPLOAD_IGNORE_HOST (unicode): don't upload images from this host
                - OPT_IGNORE_TLS (bool): ignore TLS error for image upload.
                    Default: False
        @param return_data(dict): will contain link between former posts and new items
            (keys are URL_REDIRECT_PREFIX + former url, values are the new URIs)
        @param service: pubsub service where the item will be published,
            None to use the bare jid of the client
        @param node: pubsub node, a false value to use the XEP-0277 namespace
        @return (dict): microblog data of the item (item_import_data["blog"])
        """
        mb_data = item_import_data["blog"]
        try:
            item_id = mb_data["id"]
        except KeyError:
            item_id = mb_data["id"] = str(shortuuid.uuid())

        try:
            # we keep the link between old url and new blog item
            # so the user can redirect its former blog urls
            old_uri = item_import_data["url"]
        except KeyError:
            pass
        else:
            new_uri = return_data[URL_REDIRECT_PREFIX + old_uri] = self._p.get_node_uri(
                service if service is not None else client.jid.userhostJID(),
                node or self._m.namespace,
                item_id,
            )
            log.info("url link from {old} to {new}".format(old=old_uri, new=new_uri))

        return mb_data

    @defer.inlineCallbacks
    def import_sub_items(self, client, item_import_data, mb_data, session, options):
        """Prepare the import of the comments of an item

        When comments are allowed, "comments_service" and "comments_node" are set
        in mb_data.
        @param item_import_data(dict): import data of the item, its "comments" key
            is used (see [import_item])
        @param mb_data(dict): microblog data of the item, modified in place
        @return (dict, None): keyword arguments ("items_import_data", "service" and
            "node") describing the comments to import,
            or None if comments are not allowed for this item
        @raise NotImplementedError: there is more than one list of comments
        @raise exceptions.DataError: comments are present but not allowed
        """
        # comments data
        # None is explicitly allowed for "comments" (see [import_item]), and an
        # empty list means the same thing: there is no comment at all
        comments = item_import_data["comments"] or [[]]
        if len(comments) != 1:
            raise NotImplementedError("can't manage multiple comment links")
        allow_comments = C.bool(mb_data.get("allow_comments", C.BOOL_FALSE))
        if allow_comments:
            comments_service = yield self._m.get_comments_service(client)
            comments_node = self._m.get_comments_node(mb_data["id"])
            mb_data["comments_service"] = comments_service.full()
            mb_data["comments_node"] = comments_node
            recurse_kwargs = {
                "items_import_data": comments[0],
                "service": comments_service,
                "node": comments_node,
            }
            defer.returnValue(recurse_kwargs)
        else:
            if comments[0]:
                raise exceptions.DataError(
                    "allow_comments set to False, but comments are there"
                )
            defer.returnValue(None)

    def publish_item(self, client, mb_data, service, node, session):
        """Publish a blog item with the XEP-0277 plugin

        @param mb_data(dict): microblog data of the item, "id" key must be set
        @param service: pubsub service where the item must be published
        @param node: pubsub node where the item must be published
        @return: result of the ``send`` method of the XEP-0277 plugin
        """
        log.debug(
            "uploading item [{id}]: {title}".format(
                id=mb_data["id"], title=mb_data.get("title", "")
            )
        )
        return self._m.send(client, mb_data, service, node)

    @defer.inlineCallbacks
    def item_filters(self, client, mb_data, session, options):
        """Apply filters according to options

        modify mb_data in place: rich content is converted to XHTML, then the
        <img/> elements of the XHTML content are handled by [img_filters].
        Nothing is done if there is no option or no XHTML content.
        @param mb_data(dict): microblog data of the item to filter
        @param options(dict): dict as given in [blogImport]
        @raise exceptions.DataError: the importer gave both rich and XHTML content
        """
        # FIXME: blog filters don't work on text content
        # TODO: text => XHTML conversion should handle links with <a/>
        #       filters can then be used by converting text to XHTML
        if not options:
            return

        # we want only XHTML content
        for prefix in (
            "content",
        ):  # a tuple is used, in case title needs to be added in the future
            rich_key = "{}_rich".format(prefix)
            xhtml_key = "{}_xhtml".format(prefix)
            try:
                rich = mb_data[rich_key]
            except KeyError:
                pass
            else:
                if xhtml_key in mb_data:
                    raise exceptions.DataError(
                        "importer gave {prefix}_rich and {prefix}_xhtml at the same time, this is not allowed".format(
                            prefix=prefix
                        )
                    )
                # we convert rich syntax to XHTML here, so we can handle filters easily
                converted = yield self._s.convert(
                    rich, self._s.get_current_syntax(client.profile), safe=False
                )
                mb_data[xhtml_key] = converted
                del mb_data[rich_key]

            # NOTE(review): the presence of text content is detected with the "txt"
            #   key while the key removed below is "<prefix>_text"; this looks
            #   inconsistent, confirm which key importers actually use.
            if "txt" in mb_data:
                if xhtml_key in mb_data:
                    log.warning(
                        "{prefix}_text will be replaced by converted {prefix}_xhtml, so filters can be handled".format(
                            prefix=prefix
                        )
                    )
                    # the key checked above is not the one removed here, so it may
                    # be missing: pop() avoids a KeyError in this case
                    mb_data.pop("{}_text".format(prefix), None)
                else:
                    log.warning(
                        "importer gave a text {prefix}, blog filters don't work on text {prefix}".format(
                            prefix=prefix
                        )
                    )
                    return

        if "content_xhtml" not in mb_data:
            # the importer gave neither rich nor XHTML content: there is nothing
            # to parse, and filters don't work on other kinds of content
            log.warning("no XHTML content found, blog filters are not applied")
            return

        # at this point, we have only XHTML version of content
        try:
            top_elt = xml_tools.ElementParser()(
                mb_data["content_xhtml"], namespace=C.NS_XHTML
            )
        except domish.ParserError:
            # we clean the xml and try again our luck
            cleaned = yield self._s.clean_xhtml(mb_data["content_xhtml"])
            top_elt = xml_tools.ElementParser()(cleaned, namespace=C.NS_XHTML)
        opt_host = options.get(OPT_HOST)
        if opt_host:
            # we normalise the domain: only scheme ("http" if none is given) and
            # host are kept; a host given without scheme is parsed as a path
            parsed_host = urllib.parse.urlsplit(opt_host)
            opt_host = urllib.parse.urlunsplit(
                (
                    parsed_host.scheme or "http",
                    parsed_host.netloc or parsed_host.path,
                    "",
                    "",
                    "",
                )
            )

        tmp_dir = tempfile.mkdtemp()
        try:
            # TODO: would be nice to also update the hyperlinks to these images,
            #   e.g. when you have <a href="{url}"><img src="{url}"></a>
            for img_elt in xml_tools.find_all(top_elt, names=["img"]):
                yield self.img_filters(client, img_elt, options, opt_host, tmp_dir)
        finally:
            os.rmdir(tmp_dir)  # XXX: tmp_dir should be empty, or something went wrong

        # we now replace the content with filtered one
        mb_data["content_xhtml"] = top_elt.toXml()

    @defer.inlineCallbacks
    def img_filters(self, client, img_elt, options, opt_host, tmp_dir):
        """Filters handling images

        url without host are fixed (if possible)
        according to options, images are uploaded to XMPP server
        The "src" attribute of img_elt is updated in place. Elements without
        usable "src" are ignored (a warning is logged), and a failing download
        or upload only logs a warning.
        @param img_elt(domish.Element): <img/> element to handle
        @param options(dict): filters options
        @param opt_host(unicode): normalised host given in options
        @param tmp_dir(str): path to temp directory
        """
        try:
            url = img_elt["src"]
            if url[0] == "/":
                if not opt_host:
                    log.warning(
                        "host was not specified, we can't deal with src without host ({url}) and have to ignore the following <img/>:\n{xml}".format(
                            url=url, xml=img_elt.toXml()
                        )
                    )
                    return
                else:
                    url = urllib.parse.urljoin(opt_host, url)
            filename = url.rsplit("/", 1)[-1].strip()
            if not filename:
                raise KeyError
        except (KeyError, IndexError):
            # missing or empty "src", or url without any file name
            log.warning("ignoring invalid img element: {}".format(img_elt.toXml()))
            return

        # we change the url for the normalized one
        img_elt["src"] = url

        if options.get(OPT_UPLOAD_IMAGES, False):
            # upload is requested
            try:
                ignore_host = options[OPT_UPLOAD_IGNORE_HOST]
            except KeyError:
                pass
            else:
                # hostname is None when the url has no network location (e.g.
                # relative path), and an empty/None option must not match anything
                hostname = urllib.parse.urlsplit(url).hostname or ""
                if ignore_host and ignore_host in hostname:
                    # host is the ignored one, we skip
                    log.info(
                        "Don't upload image at {url} because of {opt} option".format(
                            url=url, opt=OPT_UPLOAD_IGNORE_HOST
                        )
                    )
                    return

            # we download images and re-upload them via XMPP
            tmp_file = os.path.join(tmp_dir, filename).encode("utf-8")
            upload_options = {"ignore_tls_errors": options.get(OPT_IGNORE_TLS, False)}

            try:
                # NOTE(review): downloadPage is deprecated in Twisted, confirm it
                #   is still available in the Twisted version which is targeted
                yield web_client.downloadPage(url.encode("utf-8"), tmp_file)
                filename = filename.replace(
                    "%", "_"
                )  # FIXME: tmp workaround for a bug in prosody http upload
                __, download_d = yield self._u.upload(
                    client, tmp_file, filename, extra=upload_options
                )
                download_url = yield download_d
            except Exception as e:
                # best effort: the image keeps its normalised original url
                log.warning(
                    "can't download image at {url}: {reason}".format(url=url, reason=e)
                )
            else:
                img_elt["src"] = download_url
            finally:
                # the temporary file is removed in every case, otherwise the
                # removal of tmp_dir by the caller would fail
                try:
                    os.unlink(tmp_file)
                except OSError:
                    pass