comparison sat/plugins/plugin_misc_text_syntaxes.py @ 2624:56f94936df1e

code style reformatting using black
author Goffi <goffi@goffi.org>
date Wed, 27 Jun 2018 20:14:46 +0200
parents 26edcf3a30eb
children 003b8b4b56a7
comparison
equal deleted inserted replaced
2623:49533de4540b 2624:56f94936df1e
18 # along with this program. If not, see <http://www.gnu.org/licenses/>. 18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 19
20 from sat.core.i18n import _, D_ 20 from sat.core.i18n import _, D_
21 from sat.core.constants import Const as C 21 from sat.core.constants import Const as C
22 from sat.core.log import getLogger 22 from sat.core.log import getLogger
23
23 log = getLogger(__name__) 24 log = getLogger(__name__)
24 25
25 from twisted.internet import defer 26 from twisted.internet import defer
26 from twisted.internet.threads import deferToThread 27 from twisted.internet.threads import deferToThread
27 from sat.core import exceptions 28 from sat.core import exceptions
29
28 try: 30 try:
29 from lxml import html 31 from lxml import html
30 from lxml.html import clean 32 from lxml.html import clean
31 except ImportError: 33 except ImportError:
32 raise exceptions.MissingModule(u"Missing module lxml, please download/install it from http://lxml.de/") 34 raise exceptions.MissingModule(
35 u"Missing module lxml, please download/install it from http://lxml.de/"
36 )
33 from cgi import escape 37 from cgi import escape
34 import re 38 import re
35 39
36 40
37 CATEGORY = D_("Composition") 41 CATEGORY = D_("Composition")
39 _SYNTAX_XHTML = "XHTML" 43 _SYNTAX_XHTML = "XHTML"
40 _SYNTAX_CURRENT = "@CURRENT@" 44 _SYNTAX_CURRENT = "@CURRENT@"
41 45
42 # TODO: check/adapt following list 46 # TODO: check/adapt following list
43 # list initially based on feedparser list (http://pythonhosted.org/feedparser/html-sanitization.html) 47 # list initially based on feedparser list (http://pythonhosted.org/feedparser/html-sanitization.html)
44 STYLES_WHITELIST = ("azimuth", "background-color", "border-bottom-color", "border-collapse", "border-color", "border-left-color", "border-right-color", "border-top-color", "clear", "color", "cursor", "direction", "display", "elevation", "float", "font", "font-family", "font-size", "font-style", "font-variant", "font-weight", "height", "letter-spacing", "line-height", "overflow", "pause", "pause-after", "pause-before", "pitch", "pitch-range", "richness", "speak", "speak-header", "speak-numeral", "speak-punctuation", "speech-rate", "stress", "text-align", "text-decoration", "text-indent", "unicode-bidi", "vertical-align", "voice-family", "volume", "white-space", "width") 48 STYLES_WHITELIST = (
45 49 "azimuth",
46 SAFE_ATTRS = html.defs.safe_attrs.union(('style', 'poster', 'controls')) 50 "background-color",
47 STYLES_VALUES_REGEX = r'^(' + '|'.join(['([a-z-]+)', # alphabetical names 51 "border-bottom-color",
48 '(#[0-9a-f]+)', # hex value 52 "border-collapse",
49 '(\d+(.\d+)? *(|%|em|ex|px|in|cm|mm|pt|pc))', # values with units (or not) 53 "border-color",
50 'rgb\( *((\d+(.\d+)?), *){2}(\d+(.\d+)?) *\)', # rgb function 54 "border-left-color",
51 'rgba\( *((\d+(.\d+)?), *){3}(\d+(.\d+)?) *\)', # rgba function 55 "border-right-color",
52 ]) + ') *(!important)?$' # we accept "!important" at the end 56 "border-top-color",
57 "clear",
58 "color",
59 "cursor",
60 "direction",
61 "display",
62 "elevation",
63 "float",
64 "font",
65 "font-family",
66 "font-size",
67 "font-style",
68 "font-variant",
69 "font-weight",
70 "height",
71 "letter-spacing",
72 "line-height",
73 "overflow",
74 "pause",
75 "pause-after",
76 "pause-before",
77 "pitch",
78 "pitch-range",
79 "richness",
80 "speak",
81 "speak-header",
82 "speak-numeral",
83 "speak-punctuation",
84 "speech-rate",
85 "stress",
86 "text-align",
87 "text-decoration",
88 "text-indent",
89 "unicode-bidi",
90 "vertical-align",
91 "voice-family",
92 "volume",
93 "white-space",
94 "width",
95 )
96
97 SAFE_ATTRS = html.defs.safe_attrs.union(("style", "poster", "controls"))
98 STYLES_VALUES_REGEX = (
99 r"^("
100 + "|".join(
101 [
102 "([a-z-]+)", # alphabetical names
103 "(#[0-9a-f]+)", # hex value
104 "(\d+(.\d+)? *(|%|em|ex|px|in|cm|mm|pt|pc))", # values with units (or not)
105 "rgb\( *((\d+(.\d+)?), *){2}(\d+(.\d+)?) *\)", # rgb function
106 "rgba\( *((\d+(.\d+)?), *){3}(\d+(.\d+)?) *\)", # rgba function
107 ]
108 )
109 + ") *(!important)?$"
110 ) # we accept "!important" at the end
53 STYLES_ACCEPTED_VALUE = re.compile(STYLES_VALUES_REGEX) 111 STYLES_ACCEPTED_VALUE = re.compile(STYLES_VALUES_REGEX)
54 112
55 PLUGIN_INFO = { 113 PLUGIN_INFO = {
56 C.PI_NAME: "Text syntaxes", 114 C.PI_NAME: "Text syntaxes",
57 C.PI_IMPORT_NAME: "TEXT-SYNTAXES", 115 C.PI_IMPORT_NAME: "TEXT-SYNTAXES",
58 C.PI_TYPE: "MISC", 116 C.PI_TYPE: "MISC",
59 C.PI_PROTOCOLS: [], 117 C.PI_PROTOCOLS: [],
60 C.PI_DEPENDENCIES: [], 118 C.PI_DEPENDENCIES: [],
61 C.PI_MAIN: "TextSyntaxes", 119 C.PI_MAIN: "TextSyntaxes",
62 C.PI_HANDLER: "no", 120 C.PI_HANDLER: "no",
63 C.PI_DESCRIPTION: _("""Management of various text syntaxes (XHTML-IM, Markdown, etc)""") 121 C.PI_DESCRIPTION: _(
122 """Management of various text syntaxes (XHTML-IM, Markdown, etc)"""
123 ),
64 } 124 }
65 125
66 126
67 class TextSyntaxes(object): 127 class TextSyntaxes(object):
68 """ Text conversion class 128 """ Text conversion class
89 </individual> 149 </individual>
90 </params> 150 </params>
91 """ 151 """
92 152
93 params_data = { 153 params_data = {
94 'category_name': CATEGORY, 154 "category_name": CATEGORY,
95 'category_label': _(CATEGORY), 155 "category_label": _(CATEGORY),
96 'name': NAME, 156 "name": NAME,
97 'label': _(NAME), 157 "label": _(NAME),
98 'syntaxes': syntaxes, 158 "syntaxes": syntaxes,
99 } 159 }
100 160
101 def __init__(self, host): 161 def __init__(self, host):
102 log.info(_("Text syntaxes plugin initialization")) 162 log.info(_("Text syntaxes plugin initialization"))
103 self.host = host 163 self.host = host
104 self.addSyntax(self.SYNTAX_XHTML, lambda xhtml: defer.succeed(xhtml), lambda xhtml: defer.succeed(xhtml), 164 self.addSyntax(
105 TextSyntaxes.OPT_NO_THREAD) 165 self.SYNTAX_XHTML,
166 lambda xhtml: defer.succeed(xhtml),
167 lambda xhtml: defer.succeed(xhtml),
168 TextSyntaxes.OPT_NO_THREAD,
169 )
106 # TODO: text => XHTML should add <a/> to url like in frontends 170 # TODO: text => XHTML should add <a/> to url like in frontends
107 # it's probably best to move sat_frontends.tools.strings to sat.tools.common or similar 171 # it's probably best to move sat_frontends.tools.strings to sat.tools.common or similar
108 self.addSyntax(self.SYNTAX_TEXT, lambda text: escape(text), lambda xhtml: self._removeMarkups(xhtml), [TextSyntaxes.OPT_HIDDEN]) 172 self.addSyntax(
173 self.SYNTAX_TEXT,
174 lambda text: escape(text),
175 lambda xhtml: self._removeMarkups(xhtml),
176 [TextSyntaxes.OPT_HIDDEN],
177 )
109 try: 178 try:
110 import markdown, html2text 179 import markdown, html2text
111 180
112 def _html2text(html, baseurl=''): 181 def _html2text(html, baseurl=""):
113 h = html2text.HTML2Text(baseurl=baseurl) 182 h = html2text.HTML2Text(baseurl=baseurl)
114 h.body_width = 0 # do not truncate the lines, it breaks the long URLs 183 h.body_width = 0 # do not truncate the lines, it breaks the long URLs
115 return h.handle(html) 184 return h.handle(html)
116 self.addSyntax(self.SYNTAX_MARKDOWN, markdown.markdown, _html2text, [TextSyntaxes.OPT_DEFAULT]) 185
186 self.addSyntax(
187 self.SYNTAX_MARKDOWN,
188 markdown.markdown,
189 _html2text,
190 [TextSyntaxes.OPT_DEFAULT],
191 )
117 except ImportError: 192 except ImportError:
118 log.warning(u"markdown or html2text not found, can't use Markdown syntax") 193 log.warning(u"markdown or html2text not found, can't use Markdown syntax")
119 log.info(u"You can download/install them from https://pythonhosted.org/Markdown/ and https://github.com/Alir3z4/html2text/") 194 log.info(
120 host.bridge.addMethod("syntaxConvert", ".plugin", in_sign='sssbs', out_sign='s', 195 u"You can download/install them from https://pythonhosted.org/Markdown/ and https://github.com/Alir3z4/html2text/"
121 async=True, method=self.convert) 196 )
122 host.bridge.addMethod("syntaxGet", ".plugin", in_sign='s', out_sign='s', 197 host.bridge.addMethod(
123 method=self.getSyntax) 198 "syntaxConvert",
199 ".plugin",
200 in_sign="sssbs",
201 out_sign="s",
202 async=True,
203 method=self.convert,
204 )
205 host.bridge.addMethod(
206 "syntaxGet", ".plugin", in_sign="s", out_sign="s", method=self.getSyntax
207 )
124 208
125 def _updateParamOptions(self): 209 def _updateParamOptions(self):
126 data_synt = TextSyntaxes.syntaxes 210 data_synt = TextSyntaxes.syntaxes
127 default_synt = TextSyntaxes.default_syntax 211 default_synt = TextSyntaxes.default_syntax
128 syntaxes = [] 212 syntaxes = []
134 218
135 syntaxes.sort(key=lambda synt: synt.lower()) 219 syntaxes.sort(key=lambda synt: synt.lower())
136 options = [] 220 options = []
137 221
138 for syntax in syntaxes: 222 for syntax in syntaxes:
139 selected = 'selected="true"' if syntax == default_synt else '' 223 selected = 'selected="true"' if syntax == default_synt else ""
140 options.append(u'<option value="%s" %s/>' % (syntax, selected)) 224 options.append(u'<option value="%s" %s/>' % (syntax, selected))
141 225
142 TextSyntaxes.params_data["options"] = u'\n'.join(options) 226 TextSyntaxes.params_data["options"] = u"\n".join(options)
143 self.host.memory.updateParams(TextSyntaxes.params % TextSyntaxes.params_data) 227 self.host.memory.updateParams(TextSyntaxes.params % TextSyntaxes.params_data)
144 228
145 def getCurrentSyntax(self, profile): 229 def getCurrentSyntax(self, profile):
146 """ Return the selected syntax for the given profile 230 """ Return the selected syntax for the given profile
147 231
148 @param profile: %(doc_profile)s 232 @param profile: %(doc_profile)s
149 @return: profile selected syntax 233 @return: profile selected syntax
150 """ 234 """
151 return self.host.memory.getParamA(NAME, CATEGORY , profile_key=profile) 235 return self.host.memory.getParamA(NAME, CATEGORY, profile_key=profile)
152 236
153 def _logError(self, failure, action=u"converting syntax"): 237 def _logError(self, failure, action=u"converting syntax"):
154 log.error(u"Error while {action}: {failure}".format(action=action, failure=failure)) 238 log.error(
239 u"Error while {action}: {failure}".format(action=action, failure=failure)
240 )
155 return failure 241 return failure
156 242
157 def cleanXHTML(self, xhtml): 243 def cleanXHTML(self, xhtml):
158 """ Clean XHTML text by removing potentially dangerous/malicious parts 244 """ Clean XHTML text by removing potentially dangerous/malicious parts
159 @param xhtml: raw xhtml text to clean (or lxml's HtmlElement) 245 @param xhtml: raw xhtml text to clean (or lxml's HtmlElement)
160 """ 246 """
247
161 def blocking_cleaning(xhtml): 248 def blocking_cleaning(xhtml):
162 """ Clean XHTML and style attributes """ 249 """ Clean XHTML and style attributes """
163 250
164 def clean_style(styles_raw): 251 def clean_style(styles_raw):
165 """" Remove styles not in the whitelist, 252 """" Remove styles not in the whitelist,
166 or where the value doesn't match the regex """ 253 or where the value doesn't match the regex """
167 styles = styles_raw.split(";") 254 styles = styles_raw.split(";")
168 cleaned_styles = [] 255 cleaned_styles = []
169 for style in styles: 256 for style in styles:
170 try: 257 try:
171 key, value = style.split(':') 258 key, value = style.split(":")
172 except ValueError: 259 except ValueError:
173 continue 260 continue
174 key = key.lower().strip() 261 key = key.lower().strip()
175 if key not in STYLES_WHITELIST: 262 if key not in STYLES_WHITELIST:
176 continue 263 continue
178 if not STYLES_ACCEPTED_VALUE.match(value): 265 if not STYLES_ACCEPTED_VALUE.match(value):
179 continue 266 continue
180 if value == "none": 267 if value == "none":
181 continue 268 continue
182 cleaned_styles.append((key, value)) 269 cleaned_styles.append((key, value))
183 return "; ".join(["%s: %s" % (key_, value_) for key_, value_ in cleaned_styles]) 270 return "; ".join(
271 ["%s: %s" % (key_, value_) for key_, value_ in cleaned_styles]
272 )
184 273
185 if isinstance(xhtml, basestring): 274 if isinstance(xhtml, basestring):
186 xhtml_elt = html.fromstring(xhtml) 275 xhtml_elt = html.fromstring(xhtml)
187 elif isinstance(xhtml, html.HtmlElement): 276 elif isinstance(xhtml, html.HtmlElement):
188 xhtml_elt = xhtml 277 xhtml_elt = xhtml
189 else: 278 else:
190 log.error("Only strings and HtmlElements can be cleaned") 279 log.error("Only strings and HtmlElements can be cleaned")
191 raise exceptions.DataError 280 raise exceptions.DataError
192 cleaner = clean.Cleaner(style=False, 281 cleaner = clean.Cleaner(
193 add_nofollow=False, 282 style=False, add_nofollow=False, safe_attrs=SAFE_ATTRS
194 safe_attrs=SAFE_ATTRS) 283 )
195 xhtml_elt = cleaner.clean_html(xhtml_elt) 284 xhtml_elt = cleaner.clean_html(xhtml_elt)
196 for elt in xhtml_elt.xpath("//*[@style]"): 285 for elt in xhtml_elt.xpath("//*[@style]"):
197 elt.set("style", clean_style(elt.get('style'))) 286 elt.set("style", clean_style(elt.get("style")))
198 return html.tostring(xhtml_elt, encoding=unicode, method='xml') 287 return html.tostring(xhtml_elt, encoding=unicode, method="xml")
199 288
200 d = deferToThread(blocking_cleaning, xhtml) 289 d = deferToThread(blocking_cleaning, xhtml)
201 d.addErrback(self._logError, action=u"cleaning syntax") 290 d.addErrback(self._logError, action=u"cleaning syntax")
202 return d 291 return d
203 292
204 def convert(self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True, profile=None): 293 def convert(
294 self, text, syntax_from, syntax_to=_SYNTAX_XHTML, safe=True, profile=None
295 ):
205 """Convert a text between two syntaxes 296 """Convert a text between two syntaxes
206 297
207 @param text: text to convert 298 @param text: text to convert
208 @param syntax_from: source syntax (e.g. "markdown") 299 @param syntax_from: source syntax (e.g. "markdown")
209 @param syntax_to: dest syntax (e.g.: "XHTML") 300 @param syntax_to: dest syntax (e.g.: "XHTML")
233 if TextSyntaxes.OPT_NO_THREAD in syntaxes[syntax_from]["flags"]: 324 if TextSyntaxes.OPT_NO_THREAD in syntaxes[syntax_from]["flags"]:
234 d = defer.maybeDeferred(syntaxes[syntax_from]["to"], text) 325 d = defer.maybeDeferred(syntaxes[syntax_from]["to"], text)
235 else: 326 else:
236 d = deferToThread(syntaxes[syntax_from]["to"], text) 327 d = deferToThread(syntaxes[syntax_from]["to"], text)
237 328
238 #TODO: keep only body element and change it to a div here ? 329 # TODO: keep only body element and change it to a div here ?
239 330
240 if safe: 331 if safe:
241 d.addCallback(self.cleanXHTML) 332 d.addCallback(self.cleanXHTML)
242 333
243 if TextSyntaxes.OPT_NO_THREAD in syntaxes[syntax_to]["flags"]: 334 if TextSyntaxes.OPT_NO_THREAD in syntaxes[syntax_to]["flags"]:
247 338
248 # converters can add new lines that disturb the microblog change detection 339 # converters can add new lines that disturb the microblog change detection
249 d.addCallback(lambda text: text.rstrip()) 340 d.addCallback(lambda text: text.rstrip())
250 return d 341 return d
251 342
252 def addSyntax(self, name, to_xhtml_cb, from_xhtml_cb, flags = None): 343 def addSyntax(self, name, to_xhtml_cb, from_xhtml_cb, flags=None):
253 """Add a new syntax to the manager 344 """Add a new syntax to the manager
254 345
255 @param name: unique name of the syntax 346 @param name: unique name of the syntax
256 @param to_xhtml_cb: callback to convert from syntax to XHTML 347 @param to_xhtml_cb: callback to convert from syntax to XHTML
257 @param from_xhtml_cb: callback to convert from XHTML to syntax 348 @param from_xhtml_cb: callback to convert from XHTML to syntax
260 TextSyntaxes.OPT_HIDDEN: do not show in parameters 351 TextSyntaxes.OPT_HIDDEN: do not show in parameters
261 TextSyntaxes.OPT_NO_THREAD: do not defer to thread when converting (the callback may then return a deferred) 352 TextSyntaxes.OPT_NO_THREAD: do not defer to thread when converting (the callback may then return a deferred)
262 """ 353 """
263 flags = flags if flags is not None else [] 354 flags = flags if flags is not None else []
264 if TextSyntaxes.OPT_HIDDEN in flags and TextSyntaxes.OPT_DEFAULT in flags: 355 if TextSyntaxes.OPT_HIDDEN in flags and TextSyntaxes.OPT_DEFAULT in flags:
265 raise ValueError(u"{} and {} are mutually exclusive".format(TextSyntaxes.OPT_HIDDEN, TextSyntaxes.OPT_DEFAULT)) 356 raise ValueError(
357 u"{} and {} are mutually exclusive".format(
358 TextSyntaxes.OPT_HIDDEN, TextSyntaxes.OPT_DEFAULT
359 )
360 )
266 361
267 syntaxes = TextSyntaxes.syntaxes 362 syntaxes = TextSyntaxes.syntaxes
268 key = name.lower().strip() 363 key = name.lower().strip()
269 if key in syntaxes: 364 if key in syntaxes:
270 raise exceptions.ConflictError(u"This syntax key already exists: {}".format(key)) 365 raise exceptions.ConflictError(
271 syntaxes[key] = {"name": name, "to": to_xhtml_cb, "from": from_xhtml_cb, "flags": flags} 366 u"This syntax key already exists: {}".format(key)
367 )
368 syntaxes[key] = {
369 "name": name,
370 "to": to_xhtml_cb,
371 "from": from_xhtml_cb,
372 "flags": flags,
373 }
272 if TextSyntaxes.OPT_DEFAULT in flags: 374 if TextSyntaxes.OPT_DEFAULT in flags:
273 TextSyntaxes.default_syntaxe = key 375 TextSyntaxes.default_syntaxe = key
274 376
275 self._updateParamOptions() 377 self._updateParamOptions()
276 378
288 """Remove XHTML markups from the given string. 390 """Remove XHTML markups from the given string.
289 391
290 @param xhtml: the XHTML string to be cleaned 392 @param xhtml: the XHTML string to be cleaned
291 @return: the cleaned string 393 @return: the cleaned string
292 """ 394 """
293 cleaner = clean.Cleaner(kill_tags=['style']) 395 cleaner = clean.Cleaner(kill_tags=["style"])
294 cleaned = cleaner.clean_html(html.fromstring(xhtml)) 396 cleaned = cleaner.clean_html(html.fromstring(xhtml))
295 return html.tostring(cleaned, encoding=unicode, method="text") 397 return html.tostring(cleaned, encoding=unicode, method="text")