Mercurial > libervia-backend
changeset 1807:0d3110341947
plugin syntax dc_wiki: added XHTML => dc_wiki converter, plus some bug fixes:
- escape_chars regex has been fixed (a \ was missing for backslash escaping)
- escape_chars is now split into constants so that the XHTML => dc_wiki conversion escapes fewer characters, for better readability
- fixed bad length check in DCWikiParser.parser_link
dotclear wiki syntax should now be usable
author | Goffi <goffi@goffi.org> |
---|---|
date | Mon, 18 Jan 2016 17:04:29 +0100 (2016-01-18) |
parents | fd788d24277a |
children | 18561326a561 |
files | src/plugins/plugin_syntax_dc_wiki.py |
diffstat | 1 files changed, 345 insertions(+), 3 deletions(-) [+] |
line wrap: on
line diff
--- a/src/plugins/plugin_syntax_dc_wiki.py Sun Jan 17 20:39:20 2016 +0100
+++ b/src/plugins/plugin_syntax_dc_wiki.py Mon Jan 18 17:04:29 2016 +0100
@@ -23,8 +23,10 @@
 from sat.core.log import getLogger
 log = getLogger(__name__)
 from sat.core.constants import Const as C
+from sat.core import exceptions
 from twisted.words.xish import domish
 from sat.tools import xml_tools
+import copy
 import re
 
 SYNTAX_NAME = "dc_wiki"
@@ -42,8 +44,15 @@
 NOTE_TPL = u'[{}]' # Note template
 NOTE_A_REV_TPL = u'rev_note_{}'
 NOTE_A_TPL = u'note_{}'
+ESCAPE_CHARS_BASE = r"(?P<escape_char>[][{}%|\\/*#@{{}}~$-])"
+ESCAPE_CHARS_EXTRA = r"!?_+'()" # These chars are not escaped in XHTML => dc_wiki conversion,
+                                # but are used in the other direction
+ESCAPE_CHARS = ESCAPE_CHARS_BASE.format('')
+FLAG_UL = 'ul' # must be the name of the element
+FLAG_OL = 'ol'
+ELT_WITH_STYLE = ('img', 'div') # elements where a style attribute is expected
 
-wiki = [r"\\(?P<escape_char>[][!_+%'|\/*#@{}~?$()-])",
+wiki = [r'\\' + ESCAPE_CHARS_BASE.format(ESCAPE_CHARS_EXTRA),
         r"^!!!!!(?P<h1_title>.+?)$",
         r"^!!!!(?P<h2_title>.+?)$",
         r"^!!!(?P<h3_title>.+?)$",
@@ -175,7 +184,7 @@
         url_data = string.split(u'|')
         a_elt = parent.addElement('a')
         length = len(url_data)
-        if length == 0:
+        if length == 1:
             url = url_data[0]
             a_elt['href'] = url
             a_elt.addContent(url)
@@ -289,12 +298,379 @@
         return div_elt
 
 
+class XHTMLParser(object):
+    """Convert XHTML elements to dotclear wiki syntax
+
+    Each parser_<tag> method serializes the element with this tag name, by appending
+    text chunks to the buf list which is given as argument
+    """
+
+    def __init__(self):
+        self.flags = None
+        self.toto = 0 # NOTE(review): never read in this class, looks like a debug leftover
+        self.footnotes = None # will hold a map from url to buffer id
+        # level=i binds the current value of i (a plain closure would always see the last one)
+        for i in xrange(1,6):
+            setattr(self,
+                'parser_h{}'.format(i),
+                lambda elt, buf, level=i: self.parserHeading(elt, buf, level)
+                )
+
+    def parser_a(self, elt, buf):
+        """Add a link as [name|url|lang|title], or an anchor as ~~id~~ if there is no href"""
+        try:
+            url = elt['href']
+        except KeyError:
+            # probably an anchor
+            try:
+                id_ = elt['id']
+                if not id_:
+                    # we don't want empty values
+                    raise KeyError
+            except KeyError:
+                self.parserGeneric(elt, buf)
+            else:
+                buf.append(u'~~{}~~'.format(id_))
+            return
+
+        link_data = [url]
+        name = unicode(elt)
+        if name != url:
+            link_data.insert(0, name)
+
+        lang = elt.getAttribute('lang')
+        title = elt.getAttribute('title')
+        if lang is not None:
+            link_data.append(lang)
+        elif title is not None:
+            link_data.append(u'') # empty lang, so title stays at the expected position
+        if title is not None:
+            link_data.append(title)
+        buf.append(u'[')
+        buf.append(u'|'.join(link_data))
+        buf.append(u']')
+
+    def parser_acronym(self, elt, buf):
+        try:
+            title = elt['title']
+        except KeyError:
+            log.debug(u"Acronyme without title, using generic parser")
+            self.parserGeneric(elt, buf)
+            return
+        buf.append(u'??{}|{}??'.format(unicode(elt), title))
+
+    def parser_blockquote(self, elt, buf):
+        # we remove wrapping <p> to avoid empty line with "> "
+        children = [child for child in elt.children if unicode(child).strip() not in ('', '\n')]
+        if len(children) == 1 and domish.IElement.providedBy(children[0]) and children[0].name == 'p':
+            elt = children[0]
+        tmp_buf = []
+        self.parseChildren(elt, tmp_buf)
+        blockquote = u'> ' + u'\n> '.join(u''.join(tmp_buf).split('\n'))
+        buf.append(blockquote)
+
+    def parser_br(self, elt, buf):
+        buf.append(u'%%%')
+
+    def parser_code(self, elt, buf):
+        buf.append(u'@@')
+        self.parseChildren(elt, buf)
+        buf.append(u'@@')
+
+    def parser_del(self, elt, buf):
+        buf.append(u'--')
+        self.parseChildren(elt, buf)
+        buf.append(u'--')
+
+    def parser_div(self, elt, buf):
+        if elt.getAttribute('class') == 'footnotes':
+            self.parserFootnote(elt, buf)
+        else:
+            self.parseChildren(elt, buf, block=True)
+
+    def parser_em(self, elt, buf):
+        buf.append(u"''")
+        self.parseChildren(elt, buf)
+        buf.append(u"''")
+
+    def parser_h6(self, elt, buf):
+        # XXX: <h6/> heading is not managed by wiki syntax
+        #      so we handle it with a <h5/>
+        elt = copy.copy(elt) # we don't want to change to original element
+        elt.name = 'h5'
+        self._parse(elt, buf)
+
+    def parser_hr(self, elt, buf):
+        buf.append(u'\n----\n')
+
+    def parser_img(self, elt, buf):
+        try:
+            url = elt['src']
+        except KeyError:
+            log.warning(u"Ignoring <img/> without src")
+            return
+
+        image_data=[url]
+
+        alt = elt.getAttribute('alt')
+        style = elt.getAttribute('style', '')
+        desc = elt.getAttribute('longdesc')
+
+        if '0 1em 1em 0' in style:
+            position = 'L'
+        elif '0 0 1em 1em' in style:
+            position = 'R'
+        elif 'auto' in style:
+            position = 'C'
+        else:
+            position = None
+
+        if alt:
+            image_data.append(alt)
+        elif position or desc:
+            image_data.append(u'')
+
+        if position:
+            image_data.append(position)
+        elif desc:
+            image_data.append(u'')
+
+        if desc:
+            image_data.append(desc)
+
+        buf.append(u'((')
+        buf.append(u'|'.join(image_data))
+        buf.append(u'))')
+
+    def parser_ins(self, elt, buf):
+        buf.append(u'++')
+        self.parseChildren(elt, buf)
+        buf.append(u'++')
+
+    def parser_li(self, elt, buf):
+        """Add a list item, with one bullet ("*" or "#") per enclosing list of the same type"""
+        flag = None
+        current_flag = None
+        bullets = []
+        for flag in reversed(self.flags):
+            if flag in (FLAG_UL, FLAG_OL):
+                if current_flag is None:
+                    current_flag = flag
+                if flag == current_flag:
+                    bullets.append(u'*' if flag == FLAG_UL else u'#')
+                else:
+                    break
+
+        if flag != current_flag and buf and buf[-1] == u' ': # "buf and": buf may still be empty
+            # this trick is to avoid a space when we switch
+            # from (un)ordered to the other type on the same row
+            # e.g. *# unorder + ordered item
+            del buf[-1]
+
+        buf.extend(bullets)
+
+        buf.append(u' ')
+        self.parseChildren(elt, buf)
+        buf.append(u'\n')
+
+    def parser_ol(self, elt, buf):
+        self.parserList(elt, buf, FLAG_OL)
+
+    def parser_p(self, elt, buf):
+        self.parseChildren(elt, buf)
+        buf.append(u'\n\n')
+
+    def parser_pre(self, elt, buf):
+        pre = u''.join([child.toXml() if domish.IElement.providedBy(child) else unicode(child) for child in elt.children])
+        pre = u' ' + u'\n '.join(pre.split('\n'))
+        buf.append(pre)
+
+    def parser_q(self, elt, buf):
+        quote_data=[unicode(elt)]
+
+        lang = elt.getAttribute('lang')
+        # NOTE(review): XHTML <q/> usually gives its source in a "cite" attribute,
+        #               confirm that "url" is really what the dc_wiki => XHTML direction sets
+        cite = elt.getAttribute('url')
+
+        if lang:
+            quote_data.append(lang)
+        elif cite:
+            quote_data.append(u'')
+
+        if cite:
+            quote_data.append(cite)
+
+        buf.append(u'{{')
+        buf.append(u'|'.join(quote_data))
+        buf.append(u'}}')
+
+    def parser_span(self, elt, buf):
+        self.parseChildren(elt, buf, block=True)
+
+    def parser_strong(self, elt, buf):
+        buf.append(u'__')
+        self.parseChildren(elt, buf)
+        buf.append(u'__')
+
+    def parser_sup(self, elt, buf):
+        # sup is mainly used for footnotes, so we check if we have an anchor inside
+        children = [child for child in elt.children if unicode(child).strip() not in ('', '\n')]
+        if (len(children) == 1 and domish.IElement.providedBy(children[0])
+            and children[0].name == 'a' and '#' in children[0].getAttribute('href', '')):
+            url = children[0]['href']
+            note_id = url[url.find('#')+1:]
+            if not note_id:
+                log.warning("bad link found in footnote")
+                self.parserGeneric(elt, buf)
+                return
+            # this looks like a footnote
+            buf.append(u'$$')
+            buf.append(u' ') # placeholder
+            self.footnotes[note_id] = len(buf) - 1
+            buf.append(u'$$')
+        else:
+            self.parserGeneric(elt, buf)
+
+    def parser_ul(self, elt, buf):
+        self.parserList(elt, buf, FLAG_UL)
+
+    def parserList(self, elt, buf, type_):
+        """Parse the children of a list, with the list type pushed on self.flags meanwhile
+
+        @param type_: FLAG_UL or FLAG_OL
+        @raise exceptions.InternalError: the flag pushed here is not in self.flags anymore
+        """
+        self.flags.append(type_)
+        self.parseChildren(elt, buf, block=True)
+        idx = 0
+        for flag in reversed(self.flags):
+            idx -= 1
+            if flag == type_:
+                del self.flags[idx]
+                break
+        else:
+            # for/else: we get here only when no flag of this type has been found
+            raise exceptions.InternalError(u"flag has been removed by an other parser")
+
+    def parserHeading(self, elt, buf, level):
+        """Add a title, prefixed with (6 - level) "!" characters"""
+        buf.append((6-level) * u'!')
+        for child in elt.children:
+            # we ignore other elements for a Hx title
+            self.parserText(child, buf)
+        buf.append(u'\n')
+
+    def parserFootnote(self, elt, buf):
+        """Replace the placeholders set by parser_sup with the content of the footnotes"""
+        for elt in elt.elements():
+            # all children other than <p/> are ignored
+            if elt.name == 'p':
+                a_elt = elt.a
+                if a_elt is None:
+                    log.warning(u"<p/> element doesn't contain <a/> in footnote, ignoring it")
+                    continue
+                try:
+                    note_idx = self.footnotes[a_elt['id']]
+                except KeyError:
+                    log.warning(u"Note id doesn't match any known note, ignoring it")
+                    continue # without this, note_idx would be unbound or come from a previous note
+                # we create a dummy element to parse all children after the <a/>
+                dummy_elt = domish.Element((None, 'note'))
+                a_idx = elt.children.index(a_elt)
+                dummy_elt.children = elt.children[a_idx+1:]
+                note_buf = []
+                self.parseChildren(dummy_elt, note_buf)
+                # now we can replace the placeholder
+                buf[note_idx] = u''.join(note_buf)
+
+    def parserText(self, txt, buf, keep_whitespaces=False):
+        """Escape wiki special characters in txt and append the result to buf if not empty
+
+        @return (unicode): the escaped text (may be empty, then nothing has been appended)
+        """
+        txt = unicode(txt)
+        if not keep_whitespaces:
+            # we get text and only let one inter word space
+            txt = u' '.join(txt.split())
+        txt = re.sub(ESCAPE_CHARS, r'\\\1', txt)
+        if txt:
+            buf.append(txt)
+        return txt
+
+    def parserGeneric(self, elt, buf):
+        # as dotclear wiki syntax handle arbitrary XHTML code
+        # we use this feature to add elements that we don't know
+        buf.append(u"\n\n///html\n{}\n///\n\n".format(elt.toXml()))
+
+    def parseChildren(self, elt, buf, block=False):
+        """Parse all children (elements and text nodes) of elt
+
+        @param block(bool): if False, a space is added between children when the buffer
+            doesn't already end with a space or a new line
+        """
+        first_visible = True
+        for child in elt.children:
+            if not block and not first_visible and buf and buf[-1][-1] not in (' ','\n'):
+                # we add separation if it isn't already there
+                buf.append(u' ')
+            if domish.IElement.providedBy(child):
+                self._parse(child, buf)
+                first_visible = False
+            else:
+                appended = self.parserText(child, buf)
+                if appended:
+                    first_visible = False
+
+    def _parse(self, elt, buf):
+        """Call the parser_<name> method matching elt, or the generic parser if there is none"""
+        elt_name = elt.name.lower()
+        style = elt.getAttribute('style')
+        if style and elt_name not in ELT_WITH_STYLE:
+            # if we have style we use generic parser to put raw HTML
+            # to avoid losing it
+            parser = self.parserGeneric
+        else:
+            try:
+                parser = getattr(self, "parser_{}".format(elt_name))
+            except AttributeError:
+                log.debug("Can't find parser for {} element, using generic one".format(elt.name))
+                parser = self.parserGeneric
+        parser(elt, buf)
+
+    def parse(self, elt):
+        """Reset flags and footnotes, then return elt converted to wiki syntax"""
+        self.flags = []
+        self.footnotes = {}
+        buf = []
+        self._parse(elt, buf)
+        return u''.join(buf)
+
+    def parseString(self, string):
+        """Convert a XHTML string to dotclear wiki syntax
+
+        @param string(unicode): XHTML content, it is wrapped in a <div/> before parsing
+        @return (unicode, None): wiki text, or None if the XHTML can't be parsed
+        """
+        wrapped_html = u"<div>{}</div>".format(string)
+        try:
+            div_elt = xml_tools.ElementParser()(wrapped_html)
+        except domish.ParserError as e:
+            log.warning(u"Error while parsing HTML content: {}".format(e))
+            return
+        children = list(div_elt.elements())
+        if len(children) == 1 and children[0].name == 'div':
+            div_elt = children[0]
+        return self.parse(div_elt)
+
+
 class DCWikiSyntax(object):
 
     def __init__(self, host):
         log.info(_(u"Dotclear wiki syntax plugin initialization"))
         self.host = host
         self._dc_parser = DCWikiParser()
+        self._xhtml_parser = XHTMLParser()
         self._stx = self.host.plugins["TEXT-SYNTAXES"]
         self._stx.addSyntax(SYNTAX_NAME, self.parseWiki, self.parseXHTML, [self._stx.OPT_NO_THREAD])
 
@@ -303,4 +679,4 @@
         return div_elt.toXml()
 
     def parseXHTML(self, xhtml):
-        raise NotImplementedError
+        return self._xhtml_parser.parseString(xhtml)