comparison sat/plugins/plugin_blog_import.py @ 2562:26edcf3a30eb

core, setup: huge cleaning:
- moved directories from src and frontends/src to sat and sat_frontends, which is the recommended naming convention
- moved the twisted directory to root
- removed all hacks from setup.py and added missing dependencies, it is now clean
- use an https URL for the website in setup.py
- removed "Environment :: X11 Applications :: GTK", as wix is deprecated and removed
- renamed sat.sh to sat and fixed its installation
- added python_requires to specify the Python version needed
- replaced glib2reactor, which uses deprecated code, by gtk3reactor
sat can now be installed directly from a virtualenv without using --system-site-packages anymore \o/
author Goffi <goffi@goffi.org>
date Mon, 02 Apr 2018 19:44:50 +0200
parents src/plugins/plugin_blog_import.py@0046283a285d
children 56f94936df1e
#!/usr/bin/env python2
# -*- coding: utf-8 -*-

# SàT plugin for importing external blogs
# Copyright (C) 2009-2018 Jérôme Poisson (goffi@goffi.org)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.

from sat.core.i18n import _
from sat.core.constants import Const as C
from sat.core.log import getLogger
log = getLogger(__name__)
from twisted.internet import defer
from twisted.web import client as web_client
from twisted.words.xish import domish
from sat.core import exceptions
from sat.tools import xml_tools
import os
import os.path
import tempfile
import urlparse
import shortuuid

PLUGIN_INFO = {
    C.PI_NAME: "blog import",
    C.PI_IMPORT_NAME: "BLOG_IMPORT",
    C.PI_TYPE: (C.PLUG_TYPE_BLOG, C.PLUG_TYPE_IMPORT),
    C.PI_DEPENDENCIES: ["IMPORT", "XEP-0060", "XEP-0277", "TEXT-SYNTAXES", "UPLOAD"],
    C.PI_MAIN: "BlogImportPlugin",
    C.PI_HANDLER: "no",
    C.PI_DESCRIPTION: _(u"""Blog import management:
This plugin manages the different blog importers which can register to it, and handles generic import tasks.""")
}

OPT_HOST = 'host'
OPT_UPLOAD_IMAGES = 'upload_images'
OPT_UPLOAD_IGNORE_HOST = 'upload_ignore_host'
OPT_IGNORE_TLS = 'ignore_tls_errors'
URL_REDIRECT_PREFIX = 'url_redirect_'

class BlogImportPlugin(object):
    BOOL_OPTIONS = (OPT_UPLOAD_IMAGES, OPT_IGNORE_TLS)
    JSON_OPTIONS = ()
    OPT_DEFAULTS = {OPT_UPLOAD_IMAGES: True,
                    OPT_IGNORE_TLS: False}

    def __init__(self, host):
        log.info(_("plugin Blog Import initialization"))
        self.host = host
        self._u = host.plugins['UPLOAD']
        self._p = host.plugins['XEP-0060']
        self._m = host.plugins['XEP-0277']
        self._s = self.host.plugins['TEXT-SYNTAXES']
        host.plugins['IMPORT'].initialize(self, u'blog')

    def importItem(self, client, item_import_data, session, options, return_data, service, node):
        """importItem specialized for blog import

        @param item_import_data(dict):
            * mandatory keys:
                'blog' (dict): microblog data of the blog post (cf. http://wiki.goffi.org/wiki/Bridge_API_-_Microblogging/en)
                    the importer MUST NOT create the node or call the XEP-0277 plugin itself
                    'comments*' keys MUST NOT be used in this microblog_data, see below for comments
                    It is recommended to use a unique id in the "id" key which is constant per blog item,
                    so if the import fails, a new import will overwrite the failed items and avoid duplicates.

                'comments' (list[list[dict]], None): dictionaries must have the same keys as the main item (i.e. 'blog' and 'comments')
                    a list of lists is used because XEP-0277 can handle several comments nodes,
                    but in most cases there will be only one item in the first list (something like [[{comment1_data},{comment2_data}, ...]])
                    blog['allow_comments'] must be True if there is any comment, and False (or not present) if comments are not allowed.
                    If allow_comments is False and some comments are present, an exceptions.DataError will be raised
            * optional keys:
                'url' (unicode): former url of the post (only the path, without the host part)
                    if present, the association to the new path will be displayed to the user, so they can set up redirections if necessary
        @param options(dict, None): below are the generic options,
            blog importers can have specific ones. All options have unicode values
            generic options:
                - OPT_HOST (unicode): original host
                - OPT_UPLOAD_IMAGES (bool): upload images to the XMPP server if True
                    see OPT_UPLOAD_IGNORE_HOST.
                    Default: True
                - OPT_UPLOAD_IGNORE_HOST (unicode): don't upload images from this host
                - OPT_IGNORE_TLS (bool): ignore TLS errors for image upload.
                    Default: False
        @param return_data(dict): will contain the links between former posts and new items

        """
        mb_data = item_import_data['blog']
        try:
            item_id = mb_data['id']
        except KeyError:
            item_id = mb_data['id'] = unicode(shortuuid.uuid())

        try:
            # we keep the link between the old url and the new blog item
            # so the user can redirect their former blog urls
            old_uri = item_import_data['url']
        except KeyError:
            pass
        else:
            new_uri = return_data[URL_REDIRECT_PREFIX + old_uri] = self._p.getNodeURI(
                service if service is not None else client.jid.userhostJID(),
                node or self._m.namespace,
                item_id)
            log.info(u"url link from {old} to {new}".format(
                old=old_uri, new=new_uri))

        return mb_data

    @defer.inlineCallbacks
    def importSubItems(self, client, item_import_data, mb_data, session, options):
        # comments data
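        # item_import_data['comments'] is a list of lists (e.g. [[{comment1_data}, {comment2_data}]]):
        # XEP-0277 allows several comments nodes, but only a single node is handled here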
        if len(item_import_data['comments']) != 1:
            raise NotImplementedError(u"can't manage multiple comment links")
        allow_comments = C.bool(mb_data.get('allow_comments', C.BOOL_FALSE))
        if allow_comments:
            comments_service = yield self._m.getCommentsService(client)
            comments_node = self._m.getCommentsNode(mb_data['id'])
            mb_data['comments_service'] = comments_service.full()
            mb_data['comments_node'] = comments_node
            recurse_kwargs = {
                'items_import_data': item_import_data['comments'][0],
                'service': comments_service,
                'node': comments_node}
            defer.returnValue(recurse_kwargs)
        else:
            if item_import_data['comments'][0]:
                raise exceptions.DataError(u"allow_comments set to False, but comments are there")
            defer.returnValue(None)

    def publishItem(self, client, mb_data, service, node, session):
        log.debug(u"uploading item [{id}]: {title}".format(id=mb_data['id'], title=mb_data.get('title', '')))
        return self._m.send(client, mb_data, service, node)

    @defer.inlineCallbacks
    def itemFilters(self, client, mb_data, session, options):
        """Apply filters according to options

        modify mb_data in place
        @param mb_data(dict): microblog data as returned by the importer callback
        @param options(dict): dict as given in [blogImport]
        """
        # FIXME: blog filters don't work on text content
        # TODO: text => XHTML conversion should handle links with <a/>
        #       filters can then be used by converting text to XHTML
        if not options:
            return

        # we want only XHTML content
        for prefix in ('content',):  # a tuple is used, in case title needs to be added in the future
            try:
                rich = mb_data['{}_rich'.format(prefix)]
            except KeyError:
                pass
            else:
                if '{}_xhtml'.format(prefix) in mb_data:
                    raise exceptions.DataError(u"importer gave {prefix}_rich and {prefix}_xhtml at the same time, this is not allowed".format(prefix=prefix))
                # we convert rich syntax to XHTML here, so we can handle filters easily
                converted = yield self._s.convert(rich, self._s.getCurrentSyntax(client.profile), safe=False)
                mb_data['{}_xhtml'.format(prefix)] = converted
                del mb_data['{}_rich'.format(prefix)]

            try:
                mb_data['{}_text'.format(prefix)]
            except KeyError:
                pass
            else:
                if '{}_xhtml'.format(prefix) in mb_data:
                    log.warning(u"{prefix}_text will be replaced by the converted {prefix}_xhtml, so filters can be handled".format(prefix=prefix))
                    del mb_data['{}_text'.format(prefix)]
                else:
                    log.warning(u"importer gave a text {prefix}, blog filters don't work on text {prefix}".format(prefix=prefix))
                    return

        # at this point, we have only the XHTML version of the content
        try:
            top_elt = xml_tools.ElementParser()(mb_data['content_xhtml'], namespace=C.NS_XHTML)
        except domish.ParserError:
            # we clean the XML and try our luck again
            cleaned = yield self._s.cleanXHTML(mb_data['content_xhtml'])
            top_elt = xml_tools.ElementParser()(cleaned, namespace=C.NS_XHTML)
        opt_host = options.get(OPT_HOST)
        if opt_host:
            # we normalise the domain
            parsed_host = urlparse.urlsplit(opt_host)
            opt_host = urlparse.urlunsplit((parsed_host.scheme or 'http', parsed_host.netloc or parsed_host.path, '', '', ''))
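            # e.g. u'example.net' (no scheme, so urlsplit puts it in .path) and
            # u'http://example.net/' both end up as u'http://example.net'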

        tmp_dir = tempfile.mkdtemp()
        try:
            # TODO: would be nice to also update the hyperlinks to these images, e.g. when you have <a href="{url}"><img src="{url}"></a>
            for img_elt in xml_tools.findAll(top_elt, names=[u'img']):
                yield self.imgFilters(client, img_elt, options, opt_host, tmp_dir)
        finally:
            os.rmdir(tmp_dir)  # XXX: tmp_dir should be empty, or something went wrong

        # we now replace the content with the filtered one
        mb_data['content_xhtml'] = top_elt.toXml()

    @defer.inlineCallbacks
    def imgFilters(self, client, img_elt, options, opt_host, tmp_dir):
        """Filters handling images

        urls without a host are fixed (if possible)
        according to options, images are uploaded to the XMPP server
        @param img_elt(domish.Element): <img/> element to handle
        @param options(dict): filters options
        @param opt_host(unicode): normalised host given in options
        @param tmp_dir(str): path to temp directory
        """
        try:
            url = img_elt['src']
            if url[0] == u'/':
                if not opt_host:
                    log.warning(u"host was not specified, we can't deal with src without host ({url}) and have to ignore the following <img/>:\n{xml}"
                        .format(url=url, xml=img_elt.toXml()))
                    return
                else:
                    url = urlparse.urljoin(opt_host, url)
            filename = url.rsplit('/', 1)[-1].strip()
            if not filename:
                raise KeyError
        except (KeyError, IndexError):
            log.warning(u"ignoring invalid img element: {}".format(img_elt.toXml()))
            return

        # we replace the url with the normalized one
        img_elt['src'] = url

        if options.get(OPT_UPLOAD_IMAGES, False):
            # upload is requested
            try:
                ignore_host = options[OPT_UPLOAD_IGNORE_HOST]
            except KeyError:
                pass
            else:
                # if the host is the ignored one, we skip the upload
                parsed_url = urlparse.urlsplit(url)
                if ignore_host in parsed_url.hostname:
                    log.info(u"Don't upload image at {url} because of {opt} option".format(
                        url=url, opt=OPT_UPLOAD_IGNORE_HOST))
                    return

            # we download images and re-upload them via XMPP
            tmp_file = os.path.join(tmp_dir, filename).encode('utf-8')
            upload_options = {'ignore_tls_errors': options.get(OPT_IGNORE_TLS, False)}

            try:
                yield web_client.downloadPage(url.encode('utf-8'), tmp_file)
                filename = filename.replace(u'%', u'_')  # FIXME: tmp workaround for a bug in prosody http upload
                dummy, download_d = yield self._u.upload(client, tmp_file, filename, options=upload_options)
                download_url = yield download_d
            except Exception as e:
                log.warning(u"can't download image at {url}: {reason}".format(url=url, reason=e))
            else:
                img_elt['src'] = download_url

            try:
                os.unlink(tmp_file)
            except OSError:
                pass