comparison sat/plugins/plugin_misc_text_syntaxes.py @ 3693:0bbdc50aa405

plugin text syntaxes: remove `id` attributes and whitelist allowed classes: for now, the only allowed classes are the ones used for code highlighting. fix 389
author Goffi <goffi@goffi.org>
date Thu, 21 Oct 2021 11:55:39 +0200
parents be6d91572633
children 09f5ac48ffe3
comparison
equal deleted inserted replaced
3692:f35ba048687e 3693:0bbdc50aa405
15 # GNU Affero General Public License for more details. 15 # GNU Affero General Public License for more details.
16 16
17 # You should have received a copy of the GNU Affero General Public License 17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>. 18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19 19
20 from functools import partial
21 from html import escape
20 import re 22 import re
21 from html import escape 23 from typing import Set
22 from functools import partial
23 from sat.core.i18n import _, D_
24 from sat.core.constants import Const as C
25 from sat.core.log import getLogger
26 24
27 from twisted.internet import defer 25 from twisted.internet import defer
28 from twisted.internet.threads import deferToThread 26 from twisted.internet.threads import deferToThread
27
29 from sat.core import exceptions 28 from sat.core import exceptions
29 from sat.core.constants import Const as C
30 from sat.core.i18n import D_, _
31 from sat.core.log import getLogger
30 from sat.tools import xml_tools 32 from sat.tools import xml_tools
31 33
32 try: 34 try:
33 from lxml import html 35 from lxml import html
34 from lxml.html import clean 36 from lxml.html import clean
113 "param", 115 "param",
114 "source", 116 "source",
115 "track", 117 "track",
116 "wbr") 118 "wbr")
117 119
118 SAFE_ATTRS = html.defs.safe_attrs.union(("style", "poster", "controls")) 120 SAFE_ATTRS = html.defs.safe_attrs.union({"style", "poster", "controls"}) - {"id"}
121 SAFE_CLASSES = {
122 # those classes are used for code highlighting
123 "bp", "c", "ch", "cm", "cp", "cpf", "cs", "dl", "err", "fm", "gd", "ge", "get", "gh",
124 "gi", "go", "gp", "gr", "gs", "gt", "gu", "highlight", "hll", "il", "k", "kc", "kd",
125 "kn", "kp", "kr", "kt", "m", "mb", "mf", "mh", "mi", "mo", "na", "nb", "nc", "nd",
126 "ne", "nf", "ni", "nl", "nn", "no", "nt", "nv", "o", "ow", "s", "sa", "sb", "sc",
127 "sd", "se", "sh", "si", "sr", "ss", "sx", "vc", "vg", "vi", "vm", "w", "write",
128 }
119 STYLES_VALUES_REGEX = ( 129 STYLES_VALUES_REGEX = (
120 r"^(" 130 r"^("
121 + "|".join( 131 + "|".join(
122 [ 132 [
123 "([a-z-]+)", # alphabetical names 133 "([a-z-]+)", # alphabetical names
233 [TextSyntaxes.OPT_DEFAULT], 243 [TextSyntaxes.OPT_DEFAULT],
234 ) 244 )
235 except ImportError: 245 except ImportError:
236 log.warning("markdown or html2text not found, can't use Markdown syntax") 246 log.warning("markdown or html2text not found, can't use Markdown syntax")
237 log.info( 247 log.info(
238 "You can download/install them from https://pythonhosted.org/Markdown/ and https://github.com/Alir3z4/html2text/" 248 "You can download/install them from https://pythonhosted.org/Markdown/ "
249 "and https://github.com/Alir3z4/html2text/"
239 ) 250 )
240 host.bridge.addMethod( 251 host.bridge.addMethod(
241 "syntaxConvert", 252 "syntaxConvert",
242 ".plugin", 253 ".plugin",
243 in_sign="sssbs", 254 in_sign="sssbs",
284 log.error( 295 log.error(
285 "Error while {action}: {failure}".format(action=action, failure=failure) 296 "Error while {action}: {failure}".format(action=action, failure=failure)
286 ) 297 )
287 return failure 298 return failure
288 299
289 def cleanStyle(self, styles): 300 def cleanStyle(self, styles_raw: str) -> str:
290 """"Clean unsafe CSS styles 301 """"Clean unsafe CSS styles
291 302
292 Remove styles not in the whitelist, or where the value doesn't match the regex 303 Remove styles not in the whitelist, or where the value doesn't match the regex
293 @param styles_raw(unicode): CSS styles 304 @param styles_raw: CSS styles
294 @return (unicode): cleaned styles 305 @return: cleaned styles
295 """ 306 """
296 styles = styles.split(";") 307 styles: List[str] = styles_raw.split(";")
297 cleaned_styles = [] 308 cleaned_styles = []
298 for style in styles: 309 for style in styles:
299 try: 310 try:
300 key, value = style.split(":") 311 key, value = style.split(":")
301 except ValueError: 312 except ValueError:
311 cleaned_styles.append((key, value)) 322 cleaned_styles.append((key, value))
312 return "; ".join( 323 return "; ".join(
313 ["%s: %s" % (key_, value_) for key_, value_ in cleaned_styles] 324 ["%s: %s" % (key_, value_) for key_, value_ in cleaned_styles]
314 ) 325 )
315 326
327 def cleanClasses(self, classes_raw: str) -> str:
328 """Remove any non whitelisted class
329
330 @param classes_raw: classes set on an element
331 @return: remaining classes (can be empty string)
332 """
333 return " ".join(SAFE_CLASSES.intersection(classes_raw.split()))
334
316 def cleanXHTML(self, xhtml): 335 def cleanXHTML(self, xhtml):
317 """Clean XHTML text by removing potentially dangerous/malicious parts 336 """Clean XHTML text by removing potentially dangerous/malicious parts
318 337
319 @param xhtml(unicode, lxml.etree._Element): raw HTML/XHTML text to clean 338 @param xhtml(unicode, lxml.etree._Element): raw HTML/XHTML text to clean
320 @return (unicode): cleaned XHTML 339 @return (unicode): cleaned XHTML
337 style=False, add_nofollow=False, safe_attrs=SAFE_ATTRS 356 style=False, add_nofollow=False, safe_attrs=SAFE_ATTRS
338 ) 357 )
339 xhtml_elt = cleaner.clean_html(xhtml_elt) 358 xhtml_elt = cleaner.clean_html(xhtml_elt)
340 for elt in xhtml_elt.xpath("//*[@style]"): 359 for elt in xhtml_elt.xpath("//*[@style]"):
341 elt.set("style", self.cleanStyle(elt.get("style"))) 360 elt.set("style", self.cleanStyle(elt.get("style")))
361 for elt in xhtml_elt.xpath("//*[@class]"):
362 elt.set("class", self.cleanClasses(elt.get("class")))
342 # we remove self-closing elements for non-void elements 363 # we remove self-closing elements for non-void elements
343 for element in xhtml_elt.iter(tag=etree.Element): 364 for element in xhtml_elt.iter(tag=etree.Element):
344 if not element.text: 365 if not element.text:
345 if element.tag in VOID_ELEMENTS: 366 if element.tag in VOID_ELEMENTS:
346 element.text = None 367 element.text = None