diff sat/plugins/plugin_misc_text_syntaxes.py @ 3709:09f5ac48ffe3

merge bookmark @
author Goffi <goffi@goffi.org>
date Fri, 12 Nov 2021 17:21:24 +0100
parents f568f304c982 0bbdc50aa405
children 33d75cd3c371
line wrap: on
line diff
--- a/sat/plugins/plugin_misc_text_syntaxes.py	Mon Sep 27 08:29:09 2021 +0200
+++ b/sat/plugins/plugin_misc_text_syntaxes.py	Fri Nov 12 17:21:24 2021 +0100
@@ -17,16 +17,18 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-import re
+from functools import partial
 from html import escape
-from functools import partial
-from sat.core.i18n import _, D_
-from sat.core.constants import Const as C
-from sat.core.log import getLogger
+import re
+from typing import List, Set
 
 from twisted.internet import defer
 from twisted.internet.threads import deferToThread
+
 from sat.core import exceptions
+from sat.core.constants import Const as C
+from sat.core.i18n import D_, _
+from sat.core.log import getLogger
 from sat.tools import xml_tools
 
 try:
@@ -115,7 +117,15 @@
     "track",
     "wbr")
 
-SAFE_ATTRS = html.defs.safe_attrs.union(("style", "poster", "controls"))
+SAFE_ATTRS = html.defs.safe_attrs.union({"style", "poster", "controls"}) - {"id"}
+SAFE_CLASSES = {
+    # whitelist of class names kept by cleanClasses, all used for code highlighting
+    "bp", "c", "ch", "cm", "cp", "cpf", "cs", "dl", "err", "fm", "gd", "ge", "get", "gh",
+    "gi", "go", "gp", "gr", "gs", "gt", "gu", "highlight", "hll", "il", "k", "kc", "kd",
+    "kn", "kp", "kr", "kt", "m", "mb", "mf", "mh", "mi", "mo", "na", "nb", "nc", "nd",
+    "ne", "nf", "ni", "nl", "nn", "no", "nt", "nv", "o", "ow", "s", "sa", "sb", "sc",
+    "sd", "se", "sh", "si", "sr", "ss", "sx", "vc", "vg", "vi", "vm", "w", "write",
+}
 STYLES_VALUES_REGEX = (
     r"^("
     + "|".join(
@@ -237,7 +247,8 @@
         except ImportError:
             log.warning("markdown or html2text not found, can't use Markdown syntax")
             log.info(
-                "You can download/install them from https://pythonhosted.org/Markdown/ and https://github.com/Alir3z4/html2text/"
+                "You can download/install them from https://pythonhosted.org/Markdown/ "
+                "and https://github.com/Alir3z4/html2text/"
             )
         host.bridge.addMethod(
             "syntaxConvert",
@@ -288,14 +299,14 @@
         )
         return failure
 
-    def cleanStyle(self, styles):
+    def cleanStyle(self, styles_raw: str) -> str:
         """"Clean unsafe CSS styles
 
         Remove styles not in the whitelist, or where the value doesn't match the regex
-        @param styles_raw(unicode): CSS styles
-        @return (unicode): cleaned styles
+        @param styles_raw: CSS styles
+        @return: cleaned styles
         """
-        styles = styles.split(";")
+        styles: List[str] = styles_raw.split(";")
         cleaned_styles = []
         for style in styles:
             try:
@@ -315,6 +326,14 @@
             ["%s: %s" % (key_, value_) for key_, value_ in cleaned_styles]
         )
 
+    def cleanClasses(self, classes_raw: str) -> str:
+        """Remove any non whitelisted class (result is sorted, without duplicates)
+
+        @param classes_raw: whitespace separated classes set on an element
+        @return: remaining classes joined with spaces (can be empty string)
+        """
+        return " ".join(sorted(SAFE_CLASSES.intersection(classes_raw.split())))
+
     def cleanXHTML(self, xhtml):
         """Clean XHTML text by removing potentially dangerous/malicious parts
 
@@ -341,6 +360,8 @@
         xhtml_elt = cleaner.clean_html(xhtml_elt)
         for elt in xhtml_elt.xpath("//*[@style]"):
             elt.set("style", self.cleanStyle(elt.get("style")))
+        for elt in xhtml_elt.xpath("//*[@class]"):
+            elt.set("class", self.cleanClasses(elt.get("class")))
         # we remove self-closing elements for non-void elements
         for element in xhtml_elt.iter(tag=etree.Element):
             if not element.text: