diff browser_side/tools.py @ 347:f1ba38043d78

browser_side: status panel is based on a new class LightTextEditor which uses the HTML5 "contenteditable" attribute
author souliane <souliane@mailoo.org>
date Fri, 07 Feb 2014 20:14:11 +0100
parents ce5b33f499c5
children f488692c4903
line wrap: on
line diff
--- a/browser_side/tools.py	Fri Feb 07 20:08:28 2014 +0100
+++ b/browser_side/tools.py	Fri Feb 07 20:14:11 2014 +0100
@@ -22,6 +22,7 @@
 from pyjamas import Window
 from nativedom import NativeDOM
 from sat_frontends.tools import xmltools
+import re
 
 dom = NativeDOM()
 
@@ -31,6 +32,30 @@
     return html.replace('<', '&lt;').replace('>', '&gt;')
 
 
+def html_clean(html):
+    """
+    Remove HTML markup from the given string.
+    Adapted from nltk.clean_html (http://www.nltk.org/)
+
+    @param html: the HTML string to be cleaned
+    @type html: C{string}
+    @return: the text with tags removed and runs of spaces collapsed
+    @rtype: C{string}
+    """
+    # Drop inline JavaScript/CSS elements together with their content.
+    cleaned = re.sub(r"(?is)<(script|style).*?>.*?(</\1>)", "", html.strip())
+    # Drop HTML comments before regular tags, since comments can contain '>'.
+    cleaned = re.sub(r"(?s)<!--(.*?)-->[\n]?", "", cleaned)
+    # Replace each remaining tag by a space so adjacent words stay separated.
+    cleaned = re.sub(r"(?s)<.*?>", " ", cleaned)
+    # Non-breaking space entities count as ordinary spaces.
+    cleaned = re.sub(r"&nbsp;", " ", cleaned)
+    # Collapse each run of spaces to one in a single pass; two pair-wise
+    # substitutions would leave runs longer than four spaces uncollapsed.
+    cleaned = re.sub(r" {2,}", " ", cleaned)
+    return cleaned.strip()
+
+
 def inlineRoot(xhtml):
     """ make root element inline """
     doc = dom.parseString(xhtml)