changeset 3693:0bbdc50aa405

plugin text syntaxes: remove `id` attributes and whitelist allowed classes: for now, the only allowed classes are the ones used for code highlighting. fix 389
author Goffi <goffi@goffi.org>
date Thu, 21 Oct 2021 11:55:39 +0200
parents f35ba048687e
children a6811543c7ff
files sat/plugins/plugin_misc_text_syntaxes.py
diffstat 1 files changed, 32 insertions(+), 11 deletions(-) [+]
line wrap: on
line diff
--- a/sat/plugins/plugin_misc_text_syntaxes.py	Thu Oct 21 11:53:56 2021 +0200
+++ b/sat/plugins/plugin_misc_text_syntaxes.py	Thu Oct 21 11:55:39 2021 +0200
@@ -17,16 +17,18 @@
 # You should have received a copy of the GNU Affero General Public License
 # along with this program.  If not, see <http://www.gnu.org/licenses/>.
 
-import re
+from functools import partial
 from html import escape
-from functools import partial
-from sat.core.i18n import _, D_
-from sat.core.constants import Const as C
-from sat.core.log import getLogger
+import re
+from typing import Set
 
 from twisted.internet import defer
 from twisted.internet.threads import deferToThread
+
 from sat.core import exceptions
+from sat.core.constants import Const as C
+from sat.core.i18n import D_, _
+from sat.core.log import getLogger
 from sat.tools import xml_tools
 
 try:
@@ -115,7 +117,15 @@
     "track",
     "wbr")
 
-SAFE_ATTRS = html.defs.safe_attrs.union(("style", "poster", "controls"))
+SAFE_ATTRS = html.defs.safe_attrs.union({"style", "poster", "controls"}) - {"id"}
+SAFE_CLASSES = {
+    # these classes are used for code highlighting
+    "bp", "c", "ch", "cm", "cp", "cpf", "cs", "dl", "err", "fm", "gd", "ge", "get", "gh",
+    "gi", "go", "gp", "gr", "gs", "gt", "gu", "highlight", "hll", "il", "k", "kc", "kd",
+    "kn", "kp", "kr", "kt", "m", "mb", "mf", "mh", "mi", "mo", "na", "nb", "nc", "nd",
+    "ne", "nf", "ni", "nl", "nn", "no", "nt", "nv", "o", "ow", "s", "sa", "sb", "sc",
+    "sd", "se", "sh", "si", "sr", "ss", "sx", "vc", "vg", "vi", "vm", "w", "write",
+}
 STYLES_VALUES_REGEX = (
     r"^("
     + "|".join(
@@ -235,7 +245,8 @@
         except ImportError:
             log.warning("markdown or html2text not found, can't use Markdown syntax")
             log.info(
-                "You can download/install them from https://pythonhosted.org/Markdown/ and https://github.com/Alir3z4/html2text/"
+                "You can download/install them from https://pythonhosted.org/Markdown/ "
+                "and https://github.com/Alir3z4/html2text/"
             )
         host.bridge.addMethod(
             "syntaxConvert",
@@ -286,14 +297,14 @@
         )
         return failure
 
-    def cleanStyle(self, styles):
+    def cleanStyle(self, styles_raw: str) -> str:
         """"Clean unsafe CSS styles
 
         Remove styles not in the whitelist, or where the value doesn't match the regex
-        @param styles_raw(unicode): CSS styles
-        @return (unicode): cleaned styles
+        @param styles_raw: CSS styles
+        @return: cleaned styles
         """
-        styles = styles.split(";")
+        styles: List[str] = styles_raw.split(";")
         cleaned_styles = []
         for style in styles:
             try:
@@ -313,6 +324,14 @@
             ["%s: %s" % (key_, value_) for key_, value_ in cleaned_styles]
         )
 
+    def cleanClasses(self, classes_raw: str) -> str:
+        """Remove any non-whitelisted class, preserving the original class order
+
+        @param classes_raw: whitespace-separated classes set on an element
+        @return: remaining classes (can be empty string)
+        """
+        return " ".join(name for name in classes_raw.split() if name in SAFE_CLASSES)
+
     def cleanXHTML(self, xhtml):
         """Clean XHTML text by removing potentially dangerous/malicious parts
 
@@ -339,6 +358,8 @@
         xhtml_elt = cleaner.clean_html(xhtml_elt)
         for elt in xhtml_elt.xpath("//*[@style]"):
             elt.set("style", self.cleanStyle(elt.get("style")))
+        for elt in xhtml_elt.xpath("//*[@class]"):
+            elt.set("class", self.cleanClasses(elt.get("class")))
         # we remove self-closing elements for non-void elements
         for element in xhtml_elt.iter(tag=etree.Element):
             if not element.text: