comparison sat/plugins/plugin_misc_text_syntaxes.py @ 3709:09f5ac48ffe3

merge bookmark @
author Goffi <goffi@goffi.org>
date Fri, 12 Nov 2021 17:21:24 +0100
parents f568f304c982 0bbdc50aa405
children 33d75cd3c371
comparison
equal deleted inserted replaced
3684:8353cc3b8db9 3709:09f5ac48ffe3
15 # GNU Affero General Public License for more details. 15 # GNU Affero General Public License for more details.
16 16
17 # You should have received a copy of the GNU Affero General Public License 17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>. 18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 19
20 from functools import partial
21 from html import escape
20 import re 22 import re
21 from html import escape 23 from typing import Set
22 from functools import partial
23 from sat.core.i18n import _, D_
24 from sat.core.constants import Const as C
25 from sat.core.log import getLogger
26 24
27 from twisted.internet import defer 25 from twisted.internet import defer
28 from twisted.internet.threads import deferToThread 26 from twisted.internet.threads import deferToThread
27
29 from sat.core import exceptions 28 from sat.core import exceptions
29 from sat.core.constants import Const as C
30 from sat.core.i18n import D_, _
31 from sat.core.log import getLogger
30 from sat.tools import xml_tools 32 from sat.tools import xml_tools
31 33
32 try: 34 try:
33 from lxml import html 35 from lxml import html
34 from lxml.html import clean 36 from lxml.html import clean
113 "param", 115 "param",
114 "source", 116 "source",
115 "track", 117 "track",
116 "wbr") 118 "wbr")
117 119
118 SAFE_ATTRS = html.defs.safe_attrs.union(("style", "poster", "controls")) 120 SAFE_ATTRS = html.defs.safe_attrs.union({"style", "poster", "controls"}) - {"id"}
121 SAFE_CLASSES = {
122 # those classes are used for code highlighting
123 "bp", "c", "ch", "cm", "cp", "cpf", "cs", "dl", "err", "fm", "gd", "ge", "get", "gh",
124 "gi", "go", "gp", "gr", "gs", "gt", "gu", "highlight", "hll", "il", "k", "kc", "kd",
125 "kn", "kp", "kr", "kt", "m", "mb", "mf", "mh", "mi", "mo", "na", "nb", "nc", "nd",
126 "ne", "nf", "ni", "nl", "nn", "no", "nt", "nv", "o", "ow", "s", "sa", "sb", "sc",
127 "sd", "se", "sh", "si", "sr", "ss", "sx", "vc", "vg", "vi", "vm", "w", "write",
128 }
119 STYLES_VALUES_REGEX = ( 129 STYLES_VALUES_REGEX = (
120 r"^(" 130 r"^("
121 + "|".join( 131 + "|".join(
122 [ 132 [
123 "([a-z-]+)", # alphabetical names 133 "([a-z-]+)", # alphabetical names
235 [TextSyntaxes.OPT_DEFAULT], 245 [TextSyntaxes.OPT_DEFAULT],
236 ) 246 )
237 except ImportError: 247 except ImportError:
238 log.warning("markdown or html2text not found, can't use Markdown syntax") 248 log.warning("markdown or html2text not found, can't use Markdown syntax")
239 log.info( 249 log.info(
240 "You can download/install them from https://pythonhosted.org/Markdown/ and https://github.com/Alir3z4/html2text/" 250 "You can download/install them from https://pythonhosted.org/Markdown/ "
251 "and https://github.com/Alir3z4/html2text/"
241 ) 252 )
242 host.bridge.addMethod( 253 host.bridge.addMethod(
243 "syntaxConvert", 254 "syntaxConvert",
244 ".plugin", 255 ".plugin",
245 in_sign="sssbs", 256 in_sign="sssbs",
286 log.error( 297 log.error(
287 "Error while {action}: {failure}".format(action=action, failure=failure) 298 "Error while {action}: {failure}".format(action=action, failure=failure)
288 ) 299 )
289 return failure 300 return failure
290 301
291 def cleanStyle(self, styles): 302 def cleanStyle(self, styles_raw: str) -> str:
292 """"Clean unsafe CSS styles 303 """"Clean unsafe CSS styles
293 304
294 Remove styles not in the whitelist, or where the value doesn't match the regex 305 Remove styles not in the whitelist, or where the value doesn't match the regex
295 @param styles_raw(unicode): CSS styles 306 @param styles_raw: CSS styles
296 @return (unicode): cleaned styles 307 @return: cleaned styles
297 """ 308 """
298 styles = styles.split(";") 309 styles: List[str] = styles_raw.split(";")
299 cleaned_styles = [] 310 cleaned_styles = []
300 for style in styles: 311 for style in styles:
301 try: 312 try:
302 key, value = style.split(":") 313 key, value = style.split(":")
303 except ValueError: 314 except ValueError:
313 cleaned_styles.append((key, value)) 324 cleaned_styles.append((key, value))
314 return "; ".join( 325 return "; ".join(
315 ["%s: %s" % (key_, value_) for key_, value_ in cleaned_styles] 326 ["%s: %s" % (key_, value_) for key_, value_ in cleaned_styles]
316 ) 327 )
317 328
329 def cleanClasses(self, classes_raw: str) -> str:
330 """Remove any non whitelisted class
331
332 @param classes_raw: classes set on an element
333 @return: remaining classes (can be empty string)
334 """
335 return " ".join(SAFE_CLASSES.intersection(classes_raw.split()))
336
318 def cleanXHTML(self, xhtml): 337 def cleanXHTML(self, xhtml):
319 """Clean XHTML text by removing potentially dangerous/malicious parts 338 """Clean XHTML text by removing potentially dangerous/malicious parts
320 339
321 @param xhtml(unicode, lxml.etree._Element): raw HTML/XHTML text to clean 340 @param xhtml(unicode, lxml.etree._Element): raw HTML/XHTML text to clean
322 @return (unicode): cleaned XHTML 341 @return (unicode): cleaned XHTML
339 style=False, add_nofollow=False, safe_attrs=SAFE_ATTRS 358 style=False, add_nofollow=False, safe_attrs=SAFE_ATTRS
340 ) 359 )
341 xhtml_elt = cleaner.clean_html(xhtml_elt) 360 xhtml_elt = cleaner.clean_html(xhtml_elt)
342 for elt in xhtml_elt.xpath("//*[@style]"): 361 for elt in xhtml_elt.xpath("//*[@style]"):
343 elt.set("style", self.cleanStyle(elt.get("style"))) 362 elt.set("style", self.cleanStyle(elt.get("style")))
363 for elt in xhtml_elt.xpath("//*[@class]"):
364 elt.set("class", self.cleanClasses(elt.get("class")))
344 # we remove self-closing elements for non-void elements 365 # we remove self-closing elements for non-void elements
345 for element in xhtml_elt.iter(tag=etree.Element): 366 for element in xhtml_elt.iter(tag=etree.Element):
346 if not element.text: 367 if not element.text:
347 if element.tag in VOID_ELEMENTS: 368 if element.tag in VOID_ELEMENTS:
348 element.text = None 369 element.text = None