changeset 1807:0d3110341947

plugin syntax dc_wiki: added XHTML => dc_wiki converter, plus some bug fixes: - the escape_chars regex has been fixed (a \ was missing for backslash escaping) - escape_chars is now split into constants so that the XHTML => dc_wiki conversion escapes fewer characters, for better readability - fixed a bad length check in DCWikiParser.parser_link. The Dotclear wiki syntax should now be usable
author Goffi <goffi@goffi.org>
date Mon, 18 Jan 2016 17:04:29 +0100 (2016-01-18)
parents fd788d24277a
children 18561326a561
files src/plugins/plugin_syntax_dc_wiki.py
diffstat 1 files changed, 345 insertions(+), 3 deletions(-) [+]
line wrap: on
line diff
--- a/src/plugins/plugin_syntax_dc_wiki.py	Sun Jan 17 20:39:20 2016 +0100
+++ b/src/plugins/plugin_syntax_dc_wiki.py	Mon Jan 18 17:04:29 2016 +0100
@@ -23,8 +23,10 @@
 from sat.core.log import getLogger
 log = getLogger(__name__)
 from sat.core.constants import Const as C
+from sat.core import exceptions
 from twisted.words.xish import domish
 from sat.tools import xml_tools
+import copy
 import re
 
 SYNTAX_NAME = "dc_wiki"
@@ -42,8 +44,15 @@
 NOTE_TPL = u'[{}]' # Note template
 NOTE_A_REV_TPL = u'rev_note_{}'
 NOTE_A_TPL = u'note_{}'
+# Template of the regex matching one character to escape.
+# It is meant to be used with str.format: the "{}" placeholder receives additional
+# characters to put in the character class, and "{{}}" produces literal "{" and "}"
+ESCAPE_CHARS_BASE = r"(?P<escape_char>[][{}%|\\/*#@{{}}~$-])"
+ESCAPE_CHARS_EXTRA = r"!?_+'()" # These characters are not escaped in the XHTML => dc_wiki conversion,
+                                # but are handled in the other direction (dc_wiki => XHTML)
+# regex used for the XHTML => dc_wiki conversion (no extra characters)
+ESCAPE_CHARS = ESCAPE_CHARS_BASE.format('')
+# flags pushed while a list is being parsed, to know its type
+FLAG_UL = 'ul' # must be the name of the element
+FLAG_OL = 'ol' # same as FLAG_UL, for ordered lists
+ELT_WITH_STYLE = ('img', 'div') # elements where a style attribute is expected
 
-wiki = [r"\\(?P<escape_char>[][!_+%'|\/*#@{}~?$()-])",
+wiki = [r'\\' + ESCAPE_CHARS_BASE.format(ESCAPE_CHARS_EXTRA),
         r"^!!!!!(?P<h1_title>.+?)$",
         r"^!!!!(?P<h2_title>.+?)$",
         r"^!!!(?P<h3_title>.+?)$",
@@ -175,7 +184,7 @@
         url_data = string.split(u'|')
         a_elt = parent.addElement('a')
         length = len(url_data)
-        if length == 0:
+        if length == 1:
             url = url_data[0]
             a_elt['href'] = url
             a_elt.addContent(url)
@@ -289,12 +298,345 @@
         return div_elt
 
 
+class XHTMLParser(object):
+    """Convert an XHTML element tree to Dotclear wiki syntax
+
+    Each XHTML element is handled by a "parser_<element name>" method which
+    receives the element and the output buffer (a list of unicode chunks which
+    are joined at the end of the conversion).
+    Elements without dedicated parser, or with an unexpected style attribute,
+    are written as raw XHTML in a "///html" block.
+    """
+
+    def __init__(self):
+        self.flags = None # stack of the lists being parsed (FLAG_UL/FLAG_OL), reset by parse()
+        # NOTE(review): "toto" is not used anywhere in this class, it looks like
+        #               a debugging leftover; kept in case external code reads it
+        self.toto = 0
+        self.footnotes = None # will hold a map from note id to index of its placeholder in buffer
+        for i in xrange(1,6):
+            # level is bound as a default argument so each lambda keeps its own value
+            setattr(self,
+                'parser_h{}'.format(i),
+                lambda elt, buf, level=i: self.parserHeading(elt, buf, level)
+                )
+
+    def _getVisibleChildren(self, elt):
+        """Return children of elt, without the whitespace only text nodes
+
+        Elements are always kept: unicode(element) only gives the first text node
+        of a domish element, so testing it would wrongly discard elements
+        whose text is located in nested children.
+        @param elt: element to get children from
+        @return (list): elements and non blank text nodes, in document order
+        """
+        return [child for child in elt.children
+                if domish.IElement.providedBy(child) or unicode(child).strip()]
+
+    def parser_a(self, elt, buf):
+        """Convert a link to [name|url|lang|title], or an anchor to ~~id~~
+
+        An <a/> with neither "href" nor a non empty "id" is written as raw XHTML.
+        """
+        try:
+            url = elt['href']
+        except KeyError:
+            # probably an anchor
+            try:
+                id_ = elt['id']
+                if not id_:
+                    # we don't want empty values
+                    raise KeyError
+            except KeyError:
+                self.parserGeneric(elt, buf)
+            else:
+                buf.append(u'~~{}~~'.format(id_))
+            return
+
+        link_data = [url]
+        name = unicode(elt)
+        if name != url:
+            # the name is only needed when it differs from the url
+            link_data.insert(0, name)
+
+        lang = elt.getAttribute('lang')
+        title = elt.getAttribute('title')
+        if lang is not None:
+            link_data.append(lang)
+        elif title is not None:
+            # fields are positional, so lang must be present (even empty)
+            # when a title follows
+            link_data.append(u'')
+        if title is not None:
+            link_data.append(title)
+        buf.append(u'[')
+        buf.append(u'|'.join(link_data))
+        buf.append(u']')
+
+    def parser_acronym(self, elt, buf):
+        """Convert an acronym to ??acronym|title??
+
+        An <acronym/> without title is written as raw XHTML.
+        """
+        try:
+            title = elt['title']
+        except KeyError:
+            log.debug(u"Acronyme without title, using generic parser")
+            self.parserGeneric(elt, buf)
+            return
+        buf.append(u'??{}|{}??'.format(unicode(elt), title))
+
+    def parser_blockquote(self, elt, buf):
+        """Convert a blockquote by prefixing each of its lines with "> " """
+        # we remove wrapping <p> to avoid empty line with "> "
+        children = self._getVisibleChildren(elt)
+        if (len(children) == 1 and domish.IElement.providedBy(children[0])
+            and children[0].name == 'p'):
+            elt = children[0]
+        tmp_buf = []
+        self.parseChildren(elt, tmp_buf)
+        blockquote = u'> ' + u'\n> '.join(u''.join(tmp_buf).split('\n'))
+        buf.append(blockquote)
+
+    def parser_br(self, elt, buf):
+        """Convert a line break to %%%"""
+        buf.append(u'%%%')
+
+    def parser_code(self, elt, buf):
+        """Convert inline code to @@code@@"""
+        buf.append(u'@@')
+        self.parseChildren(elt, buf)
+        buf.append(u'@@')
+
+    def parser_del(self, elt, buf):
+        """Convert deleted text to --text--"""
+        buf.append(u'--')
+        self.parseChildren(elt, buf)
+        buf.append(u'--')
+
+    def parser_div(self, elt, buf):
+        """Parse children of a <div/>, with a special case for footnotes"""
+        if elt.getAttribute('class') == 'footnotes':
+            self.parserFootnote(elt, buf)
+        else:
+            self.parseChildren(elt, buf, block=True)
+
+    def parser_em(self, elt, buf):
+        """Convert emphasis to ''text''"""
+        buf.append(u"''")
+        self.parseChildren(elt, buf)
+        buf.append(u"''")
+
+    def parser_h6(self, elt, buf):
+        """Convert a <h6/> heading as if it were a <h5/>"""
+        # XXX: <h6/> heading is not managed by wiki syntax
+        #      so we handle it with a <h5/>
+        elt = copy.copy(elt) # we don't want to change the original element
+        elt.name = 'h5'
+        self._parse(elt, buf)
+
+    def parser_hr(self, elt, buf):
+        """Convert an horizontal rule to ---- on its own line"""
+        buf.append(u'\n----\n')
+
+    def parser_img(self, elt, buf):
+        """Convert an image to ((url|alt|position|description))
+
+        The position (L, R or C) is guessed from the style attribute.
+        An <img/> without "src" is ignored.
+        """
+        try:
+            url = elt['src']
+        except KeyError:
+            log.warning(u"Ignoring <img/> without src")
+            return
+
+        image_data=[url]
+
+        alt = elt.getAttribute('alt')
+        style = elt.getAttribute('style', '')
+        desc = elt.getAttribute('longdesc')
+
+        if '0 1em 1em 0' in style:
+            position = 'L'
+        elif '0 0 1em 1em' in style:
+            position = 'R'
+        elif 'auto' in style:
+            position = 'C'
+        else:
+            position = None
+
+        # fields are positional: an empty value is used
+        # each time a later field needs to be set
+        if alt:
+            image_data.append(alt)
+        elif position or desc:
+            image_data.append(u'')
+
+        if position:
+            image_data.append(position)
+        elif desc:
+            image_data.append(u'')
+
+        if desc:
+            image_data.append(desc)
+
+        buf.append(u'((')
+        buf.append(u'|'.join(image_data))
+        buf.append(u'))')
+
+    def parser_ins(self, elt, buf):
+        """Convert inserted text to ++text++"""
+        buf.append(u'++')
+        self.parseChildren(elt, buf)
+        buf.append(u'++')
+
+    def parser_li(self, elt, buf):
+        """Convert a list item, with one bullet per nesting level of its list type
+
+        "*" is used for unordered lists and "#" for ordered ones.
+        """
+        flag = None
+        current_flag = None
+        bullets = []
+        # we go from the innermost list to the outermost one,
+        # and stop as soon as the type of list changes
+        for flag in reversed(self.flags):
+            if flag in (FLAG_UL, FLAG_OL):
+                if current_flag is None:
+                    current_flag = flag
+                if flag == current_flag:
+                    bullets.append(u'*' if flag == FLAG_UL else u'#')
+                else:
+                    break
+
+        # buf may be empty if the list is the very first thing parsed
+        if flag != current_flag and buf and buf[-1] == u' ':
+            # this trick is to avoid a space when we switch
+            # from (un)ordered to the other type on the same row
+            # e.g. *# unorder + ordered item
+            del buf[-1]
+
+        buf.extend(bullets)
+
+        buf.append(u' ')
+        self.parseChildren(elt, buf)
+        buf.append(u'\n')
+
+    def parser_ol(self, elt, buf):
+        """Convert an ordered list"""
+        self.parserList(elt, buf, FLAG_OL)
+
+    def parser_p(self, elt, buf):
+        """Convert a paragraph, followed by an empty line"""
+        self.parseChildren(elt, buf)
+        buf.append(u'\n\n')
+
+    def parser_pre(self, elt, buf):
+        """Convert preformatted text by prefixing each line with a space
+
+        Child elements are kept as raw XML.
+        """
+        pre = u''.join([child.toXml() if domish.IElement.providedBy(child) else unicode(child) for child in elt.children])
+        pre = u' ' + u'\n '.join(pre.split('\n'))
+        buf.append(pre)
+
+    def parser_q(self, elt, buf):
+        """Convert an inline quote to {{quote|lang|source url}}"""
+        quote_data=[unicode(elt)]
+
+        lang = elt.getAttribute('lang')
+        # the XHTML attribute holding the source of a quote is "cite",
+        # "url" is still checked as a fallback to keep the previous behaviour
+        cite = elt.getAttribute('cite') or elt.getAttribute('url')
+
+        if lang:
+            quote_data.append(lang)
+        elif cite:
+            # fields are positional, lang must be there (even empty) before the url
+            quote_data.append(u'')
+
+        if cite:
+            quote_data.append(cite)
+
+        buf.append(u'{{')
+        buf.append(u'|'.join(quote_data))
+        buf.append(u'}}')
+
+    def parser_span(self, elt, buf):
+        """Parse children of a <span/>"""
+        self.parseChildren(elt, buf, block=True)
+
+    def parser_strong(self, elt, buf):
+        """Convert strong emphasis to __text__"""
+        buf.append(u'__')
+        self.parseChildren(elt, buf)
+        buf.append(u'__')
+
+    def parser_sup(self, elt, buf):
+        """Convert a footnote reference to $$note$$, with a placeholder as content
+
+        The placeholder is replaced by the text of the note in parserFootnote.
+        A <sup/> which doesn't look like a footnote reference is written as raw XHTML.
+        """
+        # sup is mainly used for footnotes, so we check if we have an anchor inside
+        children = self._getVisibleChildren(elt)
+        if (len(children) == 1 and domish.IElement.providedBy(children[0])
+            and children[0].name == 'a' and '#' in children[0].getAttribute('href', '')):
+            url = children[0]['href']
+            note_id = url[url.find('#')+1:]
+            if not note_id:
+                log.warning("bad link found in footnote")
+                self.parserGeneric(elt, buf)
+                return
+            # this looks like a footnote
+            buf.append(u'$$')
+            buf.append(u' ') # placeholder
+            self.footnotes[note_id] = len(buf) - 1
+            buf.append(u'$$')
+        else:
+            self.parserGeneric(elt, buf)
+
+    def parser_ul(self, elt, buf):
+        """Convert an unordered list"""
+        self.parserList(elt, buf, FLAG_UL)
+
+    def parserList(self, elt, buf, type_):
+        """Parse the items of a list
+
+        The type of the list is pushed on the flags stack while its children are parsed.
+        @param type_: FLAG_UL or FLAG_OL
+        @raise exceptions.InternalError: the flag is not in the stack anymore
+            once the children have been parsed
+        """
+        self.flags.append(type_)
+        self.parseChildren(elt, buf, block=True)
+        # we remove the innermost flag of this type
+        for idx in xrange(len(self.flags) - 1, -1, -1):
+            if self.flags[idx] == type_:
+                del self.flags[idx]
+                break
+        else:
+            raise exceptions.InternalError(u"flag has been removed by an other parser")
+
+    def parserHeading(self, elt, buf, level):
+        """Convert a heading: the lower the level, the more "!" are used
+
+        @param level(int): level of the heading (1 for <h1/>)
+        """
+        buf.append((6-level) * u'!')
+        for child in elt.children:
+            # we ignore other elements for a Hx title
+            self.parserText(child, buf)
+        buf.append(u'\n')
+
+    def parserFootnote(self, elt, buf):
+        """Replace the placeholders set by parser_sup with the text of the notes
+
+        @param elt: element with the "footnotes" class, with one <p/> per note
+        """
+        for p_elt in elt.elements():
+            # all children other than <p/> are ignored
+            if p_elt.name != 'p':
+                continue
+            a_elt = p_elt.a
+            if a_elt is None:
+                log.warning(u"<p/> element doesn't contain <a/> in footnote, ignoring it")
+                continue
+            try:
+                note_idx = self.footnotes[a_elt['id']]
+            except KeyError:
+                log.warning(u"Note id doesn't match any known note, ignoring it")
+                # without this, note_idx would be unbound or would be the one
+                # of the previous note
+                continue
+            # we create a dummy element to parse all children after the <a/>
+            dummy_elt = domish.Element((None, 'note'))
+            a_idx = p_elt.children.index(a_elt)
+            dummy_elt.children = p_elt.children[a_idx+1:]
+            note_buf = []
+            self.parseChildren(dummy_elt, note_buf)
+            # now we can replace the placeholder
+            buf[note_idx] = u''.join(note_buf)
+
+    def parserText(self, txt, buf, keep_whitespaces=False):
+        """Escape a text node and append it to the buffer if it is not empty
+
+        @param keep_whitespaces(bool): if False, each run of whitespaces
+            is replaced by a single space
+        @return (unicode): the escaped text (may be empty)
+        """
+        txt = unicode(txt)
+        if not keep_whitespaces:
+            # we get text and only let one inter word space
+            txt = u' '.join(txt.split())
+        txt = re.sub(ESCAPE_CHARS, r'\\\1', txt)
+        if txt:
+            buf.append(txt)
+        return txt
+
+    def parserGeneric(self, elt, buf):
+        """Write the element as raw XHTML, in a ///html block"""
+        # as dotclear wiki syntax handle arbitrary XHTML code
+        # we use this feature to add elements that we don't know
+        buf.append(u"\n\n///html\n{}\n///\n\n".format(elt.toXml()))
+
+    def parseChildren(self, elt, buf, block=False):
+        """Parse all children (elements and text nodes) of an element
+
+        @param block(bool): if False, a space is added between two children
+            when the buffer doesn't already end with a space or a new line
+        """
+        first_visible = True
+        for child in elt.children:
+            # [-1:] is used instead of [-1] because the last chunk may be an empty string
+            # (e.g. a footnote placeholder replaced by an empty note)
+            if not block and not first_visible and buf and buf[-1][-1:] not in (' ','\n'):
+                # we add separation if it isn't already there
+                buf.append(u' ')
+            if domish.IElement.providedBy(child):
+                self._parse(child, buf)
+                first_visible = False
+            else:
+                appended = self.parserText(child, buf)
+                if appended:
+                    first_visible = False
+
+    def _parse(self, elt, buf):
+        """Call the parser corresponding to the element
+
+        The generic parser is used if there is no "parser_<name>" method,
+        or if the element has an unexpected style attribute.
+        """
+        elt_name = elt.name.lower()
+        style = elt.getAttribute('style')
+        if style and elt_name not in ELT_WITH_STYLE:
+            # if we have style we use generic parser to put raw HTML
+            # to avoid losing it
+            parser = self.parserGeneric
+        else:
+            try:
+                parser = getattr(self, "parser_{}".format(elt_name))
+            except AttributeError:
+                log.debug("Can't find parser for {} element, using generic one".format(elt.name))
+                parser = self.parserGeneric
+        parser(elt, buf)
+
+    def parse(self, elt):
+        """Convert an element to Dotclear wiki syntax
+
+        The state of the parser (flags and footnotes) is reset before parsing.
+        @param elt: root element to convert
+        @return (unicode): wiki text
+        """
+        self.flags = []
+        self.footnotes = {}
+        buf = []
+        self._parse(elt, buf)
+        return u''.join(buf)
+
+    def parseString(self, string):
+        """Convert an XHTML string to Dotclear wiki syntax
+
+        @param string(unicode): XHTML to convert, it is wrapped in a <div/> before parsing
+        @return (unicode, None): wiki text, or None if the XHTML can't be parsed
+        """
+        wrapped_html = u"<div>{}</div>".format(string)
+        try:
+            div_elt = xml_tools.ElementParser()(wrapped_html)
+        except domish.ParserError as e:
+            log.warning(u"Error while parsing HTML content: {}".format(e))
+            return
+        children = list(div_elt.elements())
+        if len(children) == 1 and children[0].name == 'div':
+            # the content was already wrapped in a single <div/>, we use it directly
+            div_elt = children[0]
+        return self.parse(div_elt)
+
+
 class DCWikiSyntax(object):
 
     def __init__(self, host):
+        """Create the parsers and register the syntax with the TEXT-SYNTAXES plugin
+
+        @param host: object giving access to the loaded plugins through its "plugins" mapping
+        """
         log.info(_(u"Dotclear wiki syntax plugin initialization"))
         self.host = host
         self._dc_parser = DCWikiParser()
+        self._xhtml_parser = XHTMLParser()
         self._stx = self.host.plugins["TEXT-SYNTAXES"]
+        # NOTE(review): presumably the two callbacks are the "to XHTML" and "from XHTML"
+        #               converters, in this order; to be confirmed in the TEXT-SYNTAXES plugin
         self._stx.addSyntax(SYNTAX_NAME, self.parseWiki, self.parseXHTML, [self._stx.OPT_NO_THREAD])
 
@@ -303,4 +645,4 @@
         return div_elt.toXml()
 
     def parseXHTML(self, xhtml):
+        """Convert XHTML to Dotclear wiki syntax
+
+        @param xhtml: XHTML string, given as is to XHTMLParser.parseString
+        @return: result of XHTMLParser.parseString
+            (wiki text, or None if the XHTML can't be parsed)
+        """
-        raise NotImplementedError
+        return self._xhtml_parser.parseString(xhtml)