# HG changeset patch # User Goffi # Date 1634810139 -7200 # Node ID 0bbdc50aa405858307d8133cf9d13407be20a3a2 # Parent f35ba048687e25f07284db1311ffd2daacdde02f plugin text syntaxes: remove `id` attributes and whitelist allowed classes: for now, the only allowed classes are the one used for code highlighting. fix 389 diff -r f35ba048687e -r 0bbdc50aa405 sat/plugins/plugin_misc_text_syntaxes.py --- a/sat/plugins/plugin_misc_text_syntaxes.py Thu Oct 21 11:53:56 2021 +0200 +++ b/sat/plugins/plugin_misc_text_syntaxes.py Thu Oct 21 11:55:39 2021 +0200 @@ -17,16 +17,18 @@ # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see . -import re +from functools import partial from html import escape -from functools import partial -from sat.core.i18n import _, D_ -from sat.core.constants import Const as C -from sat.core.log import getLogger +import re +from typing import Set from twisted.internet import defer from twisted.internet.threads import deferToThread + from sat.core import exceptions +from sat.core.constants import Const as C +from sat.core.i18n import D_, _ +from sat.core.log import getLogger from sat.tools import xml_tools try: @@ -115,7 +117,15 @@ "track", "wbr") -SAFE_ATTRS = html.defs.safe_attrs.union(("style", "poster", "controls")) +SAFE_ATTRS = html.defs.safe_attrs.union({"style", "poster", "controls"}) - {"id"} +SAFE_CLASSES = { + # those classes are used for code highlighting + "bp", "c", "ch", "cm", "cp", "cpf", "cs", "dl", "err", "fm", "gd", "ge", "get", "gh", + "gi", "go", "gp", "gr", "gs", "gt", "gu", "highlight", "hll", "il", "k", "kc", "kd", + "kn", "kp", "kr", "kt", "m", "mb", "mf", "mh", "mi", "mo", "na", "nb", "nc", "nd", + "ne", "nf", "ni", "nl", "nn", "no", "nt", "nv", "o", "ow", "s", "sa", "sb", "sc", + "sd", "se", "sh", "si", "sr", "ss", "sx", "vc", "vg", "vi", "vm", "w", "write", +} STYLES_VALUES_REGEX = ( r"^(" + "|".join( @@ -235,7 +245,8 @@ except ImportError: log.warning("markdown or html2text not found, can't use Markdown syntax") log.info( - "You can download/install them from https://pythonhosted.org/Markdown/ and https://github.com/Alir3z4/html2text/" + "You can download/install them from https://pythonhosted.org/Markdown/ " + "and https://github.com/Alir3z4/html2text/" ) host.bridge.addMethod( "syntaxConvert", @@ -286,14 +297,14 @@ ) return failure - def cleanStyle(self, styles): + def cleanStyle(self, styles_raw: str) -> str: """"Clean unsafe CSS styles Remove styles not in the whitelist, or where the value doesn't match the regex - @param styles_raw(unicode): CSS styles - @return (unicode): cleaned styles + @param styles_raw: CSS styles + @return: cleaned styles """ - styles = styles.split(";") + styles: List[str] = styles_raw.split(";") cleaned_styles = [] for style in styles: try: @@ -313,6 +324,14 @@ ["%s: %s" % (key_, value_) for key_, value_ in cleaned_styles] ) + def cleanClasses(self, classes_raw: str) -> str: + """Remove any non whitelisted class + + @param classes_raw: classes set on an element + @return: remaining classes (can be empty string) + """ + return " ".join(SAFE_CLASSES.intersection(classes_raw.split())) + def cleanXHTML(self, xhtml): """Clean XHTML text by removing potentially dangerous/malicious parts @@ -339,6 +358,8 @@ xhtml_elt = cleaner.clean_html(xhtml_elt) for elt in xhtml_elt.xpath("//*[@style]"): elt.set("style", self.cleanStyle(elt.get("style"))) + for elt in xhtml_elt.xpath("//*[@class]"): + elt.set("class", self.cleanClasses(elt.get("class"))) # we remove self-closing elements for non-void elements for element in xhtml_elt.iter(tag=etree.Element): if not element.text: