Mercurial repository libervia-backend: comparison of sat/plugins/plugin_blog_import.py @ 2562:26edcf3a30eb
core, setup: huge cleanup:
- moved directories from src and frontends/src to sat and sat_frontends, which is the recommended naming convention
- move twisted directory to root
- removed all hacks from setup.py and added missing dependencies; it is now clean
- use https URL for website in setup.py
- removed "Environment :: X11 Applications :: GTK", as wix is deprecated and removed
- renamed sat.sh to sat and fixed its installation
- added python_requires to specify Python version needed
- replaced glib2reactor, which uses deprecated code, with gtk3reactor
sat can now be installed directly in a virtualenv, without using --system-site-packages anymore \o/
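To illustrate the setup.py changes listed above, here is a minimal sketch of what the cleaned-up setup() call might look like; the version number, package list, script path and URL are illustrative assumptions, not the actual contents of this changeset:

# hypothetical sketch only: names, versions and paths below are assumptions,
# not the setup.py shipped in changeset 2562:26edcf3a30eb
from setuptools import setup, find_packages

setup(
    name="sat",
    version="0.7.0",                   # illustrative version number
    url="https://salut-a-toi.org",     # https URL for the website
    packages=find_packages(),          # picks up the sat/ and sat_frontends/ packages
    scripts=["bin/sat"],               # "sat.sh" renamed to "sat" (path assumed)
    python_requires=">=2.7, <3",       # declares the Python version needed
    install_requires=[
        # explicit dependencies instead of relying on --system-site-packages
        "twisted",
        "wokkel",
        "shortuuid",
    ],
)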
author: Goffi <goffi@goffi.org>
date: Mon, 02 Apr 2018 19:44:50 +0200
parents: src/plugins/plugin_blog_import.py@0046283a285d
children: 56f94936df1e
comparison of 2561:bd30dc3ffe5a with 2562:26edcf3a30eb
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# SàT plugin for importing external blogs
# Copyright (C) 2009-2018 Jérôme Poisson (goffi@goffi.org)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.


from sat.core.i18n import _
from sat.core.constants import Const as C
from sat.core.log import getLogger
log = getLogger(__name__)
from twisted.internet import defer
from twisted.web import client as web_client
from twisted.words.xish import domish
from sat.core import exceptions
from sat.tools import xml_tools
import os
import os.path
import tempfile
import urlparse
import shortuuid


PLUGIN_INFO = {
    C.PI_NAME: "blog import",
    C.PI_IMPORT_NAME: "BLOG_IMPORT",
    C.PI_TYPE: (C.PLUG_TYPE_BLOG, C.PLUG_TYPE_IMPORT),
    C.PI_DEPENDENCIES: ["IMPORT", "XEP-0060", "XEP-0277", "TEXT-SYNTAXES", "UPLOAD"],
    C.PI_MAIN: "BlogImportPlugin",
    C.PI_HANDLER: "no",
    C.PI_DESCRIPTION: _(u"""Blog import management:
This plugin manages the different blog importers which can register to it, and handles generic import tasks.""")
}

OPT_HOST = 'host'
OPT_UPLOAD_IMAGES = 'upload_images'
OPT_UPLOAD_IGNORE_HOST = 'upload_ignore_host'
OPT_IGNORE_TLS = 'ignore_tls_errors'
URL_REDIRECT_PREFIX = 'url_redirect_'


class BlogImportPlugin(object):
    BOOL_OPTIONS = (OPT_UPLOAD_IMAGES, OPT_IGNORE_TLS)
    JSON_OPTIONS = ()
    OPT_DEFAULTS = {OPT_UPLOAD_IMAGES: True,
                    OPT_IGNORE_TLS: False}

    def __init__(self, host):
        log.info(_("plugin Blog Import initialization"))
        self.host = host
        self._u = host.plugins['UPLOAD']
        self._p = host.plugins['XEP-0060']
        self._m = host.plugins['XEP-0277']
        self._s = self.host.plugins['TEXT-SYNTAXES']
        host.plugins['IMPORT'].initialize(self, u'blog')

    def importItem(self, client, item_import_data, session, options, return_data, service, node):
        """importItem specialized for blog import

        @param item_import_data(dict):
            * mandatory keys:
                'blog' (dict): microblog data of the blog post (cf. http://wiki.goffi.org/wiki/Bridge_API_-_Microblogging/en)
                    the importer MUST NOT create the node or call the XEP-0277 plugin itself
                    'comments*' keys MUST NOT be used in this microblog_data, see below for comments
                    It is recommended to use a unique id in the "id" key which is constant per blog item,
                    so if the import fails, a new import will overwrite the failed items and avoid duplicates.

                'comments' (list[list[dict]],None): dictionaries must have the same keys as the main item (i.e. 'blog' and 'comments')
                    a list of lists is used because XEP-0277 can handle several comments nodes,
                    but in most cases there will be only one item in the first list (something like [[{comment1_data},{comment2_data}, ...]])
                    blog['allow_comments'] must be True if there is any comment, and False (or not present) if comments are not allowed.
                    If allow_comments is False and some comments are present, an exceptions.DataError will be raised
            * optional keys:
                'url' (unicode): former url of the post (only the path, without the host part)
                    if present, the association with the new path will be displayed to the user, so they can set up redirections if necessary
        @param options(dict, None): below are the generic options,
            blog importers can have specific ones. All options have unicode values
            generic options:
                - OPT_HOST (unicode): original host
                - OPT_UPLOAD_IMAGES (bool): upload images to the XMPP server if True
                    see OPT_UPLOAD_IGNORE_HOST.
                    Default: True
                - OPT_UPLOAD_IGNORE_HOST (unicode): don't upload images from this host
                - OPT_IGNORE_TLS (bool): ignore TLS errors for image upload.
                    Default: False
        @param return_data(dict): will contain the links between former posts and new items

        """
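        # Illustrative example (not part of the original code) of an item_import_data
        # dict following the structure documented above; all keys and values here are
        # assumptions for illustration only:
        #
        # item_import_data = {
        #     'blog': {
        #         'id': u'2010-01-my-first-post',
        #         'title': u'My first post',
        #         'content_xhtml': u'<div><p>hello world</p></div>',
        #         'allow_comments': C.BOOL_TRUE,
        #     },
        #     'comments': [[{'blog': {u'content': u'nice post!'}, 'comments': [[]]}]],
        #     'url': u'/2010/01/my-first-post',
        # }
        #
        # together with, e.g., options = {OPT_HOST: u'https://myoldblog.example'}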
        mb_data = item_import_data['blog']
        try:
            item_id = mb_data['id']
        except KeyError:
            item_id = mb_data['id'] = unicode(shortuuid.uuid())

        try:
            # we keep the link between the old url and the new blog item
            # so the user can redirect their former blog urls
            old_uri = item_import_data['url']
        except KeyError:
            pass
        else:
            new_uri = return_data[URL_REDIRECT_PREFIX + old_uri] = self._p.getNodeURI(
                service if service is not None else client.jid.userhostJID(),
                node or self._m.namespace,
                item_id)
            log.info(u"url link from {old} to {new}".format(
                old=old_uri, new=new_uri))

        return mb_data

    @defer.inlineCallbacks
    def importSubItems(self, client, item_import_data, mb_data, session, options):
        # comments data
        if len(item_import_data['comments']) != 1:
            raise NotImplementedError(u"can't manage multiple comment links")
        allow_comments = C.bool(mb_data.get('allow_comments', C.BOOL_FALSE))
        if allow_comments:
            comments_service = yield self._m.getCommentsService(client)
            comments_node = self._m.getCommentsNode(mb_data['id'])
            mb_data['comments_service'] = comments_service.full()
            mb_data['comments_node'] = comments_node
            recurse_kwargs = {
                'items_import_data': item_import_data['comments'][0],
                'service': comments_service,
                'node': comments_node}
            defer.returnValue(recurse_kwargs)
        else:
            if item_import_data['comments'][0]:
                raise exceptions.DataError(u"allow_comments is set to False, but comments are there")
            defer.returnValue(None)

    def publishItem(self, client, mb_data, service, node, session):
        log.debug(u"uploading item [{id}]: {title}".format(id=mb_data['id'], title=mb_data.get('title', '')))
        return self._m.send(client, mb_data, service, node)

    @defer.inlineCallbacks
    def itemFilters(self, client, mb_data, session, options):
        """Apply filters according to options

        modify mb_data in place
        @param mb_data(dict): microblog data as returned by the importer callback
        @param options(dict): dict as given in [blogImport]
        """
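        # Illustrative note (not part of the original code): with, for instance,
        # mb_data = {'id': u'some-id', 'content_rich': u'**hello**'} and a
        # markdown-like current syntax, only 'content_xhtml' remains in mb_data
        # after the conversions below, and the <img/> elements it contains may be
        # rewritten and uploaded according to the options.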
        # FIXME: blog filters don't work on text content
        # TODO: text => XHTML conversion should handle links with <a/>
        #       filters can then be used by converting text to XHTML
        if not options:
            return

        # we want only XHTML content
        for prefix in ('content',):  # a tuple is used, in case title needs to be added in the future
            try:
                rich = mb_data['{}_rich'.format(prefix)]
            except KeyError:
                pass
            else:
                if '{}_xhtml'.format(prefix) in mb_data:
                    raise exceptions.DataError(u"importer gave {prefix}_rich and {prefix}_xhtml at the same time, this is not allowed".format(prefix=prefix))
                # we convert the rich syntax to XHTML here, so we can handle filters easily
                converted = yield self._s.convert(rich, self._s.getCurrentSyntax(client.profile), safe=False)
                mb_data['{}_xhtml'.format(prefix)] = converted
                del mb_data['{}_rich'.format(prefix)]

            try:
                mb_data['{}_text'.format(prefix)]
            except KeyError:
                pass
            else:
                if '{}_xhtml'.format(prefix) in mb_data:
                    log.warning(u"{prefix}_text will be replaced by converted {prefix}_xhtml, so filters can be handled".format(prefix=prefix))
                    del mb_data['{}_text'.format(prefix)]
                else:
                    log.warning(u"importer gave a text {prefix}, blog filters don't work on text {prefix}".format(prefix=prefix))
                    return

        # at this point, we only have the XHTML version of the content
        try:
            top_elt = xml_tools.ElementParser()(mb_data['content_xhtml'], namespace=C.NS_XHTML)
        except domish.ParserError:
            # we clean the XML and try our luck again
            cleaned = yield self._s.cleanXHTML(mb_data['content_xhtml'])
            top_elt = xml_tools.ElementParser()(cleaned, namespace=C.NS_XHTML)
        opt_host = options.get(OPT_HOST)
        if opt_host:
            # we normalise the domain
            parsed_host = urlparse.urlsplit(opt_host)
            opt_host = urlparse.urlunsplit((parsed_host.scheme or 'http', parsed_host.netloc or parsed_host.path, '', '', ''))
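            # e.g. (illustrative values) u'myoldblog.example' becomes
            # u'http://myoldblog.example', while u'https://myoldblog.example'
            # is kept unchanged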

        tmp_dir = tempfile.mkdtemp()
        try:
            # TODO: it would be nice to also update the hyperlinks to these images, e.g. when you have <a href="{url}"><img src="{url}"></a>
            for img_elt in xml_tools.findAll(top_elt, names=[u'img']):
                yield self.imgFilters(client, img_elt, options, opt_host, tmp_dir)
        finally:
            os.rmdir(tmp_dir)  # XXX: tmp_dir should be empty, or something went wrong

        # we now replace the content with the filtered one
        mb_data['content_xhtml'] = top_elt.toXml()

    @defer.inlineCallbacks
    def imgFilters(self, client, img_elt, options, opt_host, tmp_dir):
        """Filters handling images

        urls without a host are fixed (if possible)
        according to options, images are uploaded to the XMPP server
        @param img_elt(domish.Element): <img/> element to handle
        @param options(dict): filters options
        @param opt_host(unicode): normalised host given in options
        @param tmp_dir(str): path to the temporary directory
        """
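        # Illustrative example (not part of the original code): with
        # opt_host == u'http://myoldblog.example', a relative src such as
        # u'/images/pic.png' is rewritten below to
        # u'http://myoldblog.example/images/pic.png', and filename is u'pic.png'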
        try:
            url = img_elt['src']
            if url[0] == u'/':
                if not opt_host:
                    log.warning(u"host was not specified, we can't deal with a src without host ({url}) and have to ignore the following <img/>:\n{xml}"
                        .format(url=url, xml=img_elt.toXml()))
                    return
                else:
                    url = urlparse.urljoin(opt_host, url)
            filename = url.rsplit('/', 1)[-1].strip()
            if not filename:
                raise KeyError
        except (KeyError, IndexError):
            log.warning(u"ignoring invalid img element: {}".format(img_elt.toXml()))
            return

        # we replace the url with the normalized one
        img_elt['src'] = url

        if options.get(OPT_UPLOAD_IMAGES, False):
            # upload is requested
            try:
                ignore_host = options[OPT_UPLOAD_IGNORE_HOST]
            except KeyError:
                pass
            else:
                # if the host is the ignored one, we skip the upload
                parsed_url = urlparse.urlsplit(url)
                if ignore_host in parsed_url.hostname:
                    log.info(u"Don't upload image at {url} because of {opt} option".format(
                        url=url, opt=OPT_UPLOAD_IGNORE_HOST))
                    return

            # we download images and re-upload them via XMPP
            tmp_file = os.path.join(tmp_dir, filename).encode('utf-8')
            upload_options = {'ignore_tls_errors': options.get(OPT_IGNORE_TLS, False)}

            try:
                yield web_client.downloadPage(url.encode('utf-8'), tmp_file)
                filename = filename.replace(u'%', u'_')  # FIXME: tmp workaround for a bug in prosody http upload
                dummy, download_d = yield self._u.upload(client, tmp_file, filename, options=upload_options)
                download_url = yield download_d
            except Exception as e:
                log.warning(u"can't download image at {url}: {reason}".format(url=url, reason=e))
            else:
                img_elt['src'] = download_url

            try:
                os.unlink(tmp_file)
            except OSError:
                pass