view sat/plugins/plugin_syntax_wiki_dotclear.py @ 3250:e4d3ba75b1b2

core (memory/disco): fixed types of disco extensions: typeCheck() is not automatically called on reception by Wokkel, as a result extensions fields values may be strings instead of the field type. TypeCheck is now explicitly called in memory.disco to avoid that. It is not called immediately on reception as the string value is needed to calculate the capability hash
author Goffi <goffi@goffi.org>
date Tue, 14 Apr 2020 20:25:05 +0200
parents 559a625a236b
children be6d91572633
line wrap: on
line source

#!/usr/bin/env python3


# SàT plugin for Dotclear Wiki Syntax
# Copyright (C) 2009-2020 Jérôme Poisson (goffi@goffi.org)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# XXX: ref used: http://dotclear.org/documentation/2.0/usage/syntaxes#wiki-syntax-and-xhtml-equivalent

from sat.core.i18n import _
from sat.core.log import getLogger

log = getLogger(__name__)
from sat.core.constants import Const as C
from sat.core import exceptions
from twisted.words.xish import domish
from sat.tools import xml_tools
import copy
import re

PLUGIN_INFO = {
    C.PI_NAME: "Dotclear Wiki Syntax Plugin",
    C.PI_IMPORT_NAME: "SYNT_DC_WIKI",
    C.PI_TYPE: C.PLUG_TYPE_SYNTAXE,
    C.PI_DEPENDENCIES: ["TEXT_SYNTAXES"],
    C.PI_MAIN: "DCWikiSyntax",
    C.PI_HANDLER: "",
    C.PI_DESCRIPTION: _("""Implementation of Dotclear wiki syntax"""),
}

NOTE_TPL = "[{}]"  # Note template
NOTE_A_REV_TPL = "rev_note_{}"
NOTE_A_TPL = "note_{}"
ESCAPE_CHARS_BASE = r"(?P<escape_char>[][{}%|\\/*#@{{}}~$-])"
ESCAPE_CHARS_EXTRA = (
    r"!?_+'()"
)  # These chars are not escaped in XHTML => dc_wiki conversion,
# but are used in the other direction
ESCAPE_CHARS = ESCAPE_CHARS_BASE.format("")
FLAG_UL = "ul"  # must be the name of the element
FLAG_OL = "ol"
ELT_WITH_STYLE = ("img", "div")  # elements where a style attribute is expected

wiki = [
    r"\\" + ESCAPE_CHARS_BASE.format(ESCAPE_CHARS_EXTRA),
    r"^!!!!!(?P<h1_title>.+?)$",
    r"^!!!!(?P<h2_title>.+?)$",
    r"^!!!(?P<h3_title>.+?)$",
    r"^!!(?P<h4_title>.+?)$",
    r"^!(?P<h5_title>.+?)$",
    r"^----$(?P<horizontal_rule>)",
    r"^\*(?P<list_bullet>.*?)$",
    r"^#(?P<list_ordered>.*?)$",
    r"^ (?P<preformated>.*?)$",
    r"^> +?(?P<quote>.*?)$",
    r"''(?P<emphasis>.+?)''",
    r"__(?P<strong_emphasis>.+?)__",
    r"%%%(?P<line_break>)",
    r"\+\+(?P<insertion>.+?)\+\+",
    r"--(?P<deletion>.+?)--",
    r"\[(?P<link>.+?)\]",
    r"\(\((?P<image>.+?)\)\)",
    r"~(?P<anchor>.+?)~",
    r"\?\?(?P<acronym>.+?\|.+?)\?\?",
    r"{{(?P<inline_quote>.+?)}}",
    r"@@(?P<code>.+?)@@",
    r"\$\$(?P<footnote>.+?)\$\$",
    r"(?P<text>.+?)",
]

wiki_re = re.compile("|".join(wiki), re.MULTILINE | re.DOTALL)
wiki_block_level_re = re.compile(
    r"^///html(?P<html>.+?)///\n\n|(?P<paragraph>.+?)(?:\n{2,}|\Z)",
    re.MULTILINE | re.DOTALL,
)


class DCWikiParser(object):
    def __init__(self):
        self._footnotes = None
        for i in range(5):
            setattr(
                self,
                "parser_h{}_title".format(i),
                lambda string, parent, i=i: self._parser_title(
                    string, parent, "h{}".format(i)
                ),
            )

    def parser_paragraph(self, string, parent):
        p_elt = parent.addElement("p")
        self._parse(string, p_elt)

    def parser_html(self, string, parent):
        wrapped_html = "<div>{}</div>".format(string)
        try:
            div_elt = xml_tools.ElementParser()(wrapped_html)
        except domish.ParserError as e:
            log.warning("Error while parsing HTML content, ignoring it: {}".format(e))
            return
        children = list(div_elt.elements())
        if len(children) == 1 and children[0].name == "div":
            div_elt = children[0]
        parent.addChild(div_elt)

    def parser_escape_char(self, string, parent):
        parent.addContent(string)

    def _parser_title(self, string, parent, name):
        elt = parent.addElement(name)
        elt.addContent(string)

    def parser_horizontal_rule(self, string, parent):
        parent.addElement("hr")

    def _parser_list(self, string, parent, list_type):
        depth = 0
        while string[depth : depth + 1] == "*":
            depth += 1

        string = string[depth:].lstrip()

        for i in range(depth + 1):
            list_elt = getattr(parent, list_type)
            if not list_elt:
                parent = parent.addElement(list_type)
            else:
                parent = list_elt

        li_elt = parent.addElement("li")
        self._parse(string, li_elt)

    def parser_list_bullet(self, string, parent):
        self._parser_list(string, parent, "ul")

    def parser_list_ordered(self, string, parent):
        self._parser_list(string, parent, "ol")

    def parser_preformated(self, string, parent):
        pre_elt = parent.pre
        if pre_elt is None:
            pre_elt = parent.addElement("pre")
        else:
            # we are on a new line, and this is important for <pre/>
            pre_elt.addContent("\n")
        pre_elt.addContent(string)

    def parser_quote(self, string, parent):
        blockquote_elt = parent.blockquote
        if blockquote_elt is None:
            blockquote_elt = parent.addElement("blockquote")
        p_elt = blockquote_elt.p
        if p_elt is None:
            p_elt = blockquote_elt.addElement("p")
        else:
            string = "\n" + string

        self._parse(string, p_elt)

    def parser_emphasis(self, string, parent):
        em_elt = parent.addElement("em")
        self._parse(string, em_elt)

    def parser_strong_emphasis(self, string, parent):
        strong_elt = parent.addElement("strong")
        self._parse(string, strong_elt)

    def parser_line_break(self, string, parent):
        parent.addElement("br")

    def parser_insertion(self, string, parent):
        ins_elt = parent.addElement("ins")
        self._parse(string, ins_elt)

    def parser_deletion(self, string, parent):
        del_elt = parent.addElement("del")
        self._parse(string, del_elt)

    def parser_link(self, string, parent):
        url_data = string.split("|")
        a_elt = parent.addElement("a")
        length = len(url_data)
        if length == 1:
            url = url_data[0]
            a_elt["href"] = url
            a_elt.addContent(url)
        else:
            name = url_data[0]
            url = url_data[1]
            a_elt["href"] = url
            a_elt.addContent(name)
            if length >= 3:
                a_elt["lang"] = url_data[2]
            if length >= 4:
                a_elt["title"] = url_data[3]
            if length > 4:
                log.warning("too much data for url, ignoring extra data")

    def parser_image(self, string, parent):
        image_data = string.split("|")
        img_elt = parent.addElement("img")

        for idx, attribute in enumerate(("src", "alt", "position", "longdesc")):
            try:
                data = image_data[idx]
            except IndexError:
                break

            if attribute != "position":
                img_elt[attribute] = data
            else:
                data = data.lower()
                if data in ("l", "g"):
                    img_elt["style"] = "display:block; float:left; margin:0 1em 1em 0"
                elif data in ("r", "d"):
                    img_elt["style"] = "display:block; float:right; margin:0 0 1em 1em"
                elif data == "c":
                    img_elt[
                        "style"
                    ] = "display:block; margin-left:auto; margin-right:auto"
                else:
                    log.warning("bad position argument for image, ignoring it")

    def parser_anchor(self, string, parent):
        a_elt = parent.addElement("a")
        a_elt["id"] = string

    def parser_acronym(self, string, parent):
        acronym, title = string.split("|", 1)
        acronym_elt = parent.addElement("acronym", content=acronym)
        acronym_elt["title"] = title

    def parser_inline_quote(self, string, parent):
        quote_data = string.split("|")
        quote = quote_data[0]
        q_elt = parent.addElement("q", content=quote)
        for idx, attribute in enumerate(("lang", "cite"), 1):
            try:
                data = quote_data[idx]
            except IndexError:
                break
            q_elt[attribute] = data

    def parser_code(self, string, parent):
        parent.addElement("code", content=string)

    def parser_footnote(self, string, parent):
        idx = len(self._footnotes) + 1
        note_txt = NOTE_TPL.format(idx)
        sup_elt = parent.addElement("sup")
        sup_elt["class"] = "note"
        a_elt = sup_elt.addElement("a", content=note_txt)
        a_elt["id"] = NOTE_A_REV_TPL.format(idx)
        a_elt["href"] = "#{}".format(NOTE_A_TPL.format(idx))

        p_elt = domish.Element((None, "p"))
        a_elt = p_elt.addElement("a", content=note_txt)
        a_elt["id"] = NOTE_A_TPL.format(idx)
        a_elt["href"] = "#{}".format(NOTE_A_REV_TPL.format(idx))
        self._parse(string, p_elt)
        # footnotes are actually added at the end of the parsing
        self._footnotes.append(p_elt)

    def parser_text(self, string, parent):
        parent.addContent(string)

    def _parse(self, string, parent, block_level=False):
        regex = wiki_block_level_re if block_level else wiki_re

        for match in regex.finditer(string):
            if match.lastgroup is None:
                parent.addContent(string)
                return
            matched = match.group(match.lastgroup)
            try:
                parser = getattr(self, "parser_{}".format(match.lastgroup))
            except AttributeError:
                log.warning("No parser found for {}".format(match.lastgroup))
                # parent.addContent(string)
                continue
            parser(matched, parent)

    def parse(self, string):
        self._footnotes = []
        div_elt = domish.Element((None, "div"))
        self._parse(string, parent=div_elt, block_level=True)
        if self._footnotes:
            foot_div_elt = div_elt.addElement("div")
            foot_div_elt["class"] = "footnotes"
            # we add a simple horizontal rule which can be customized
            # with footnotes class, instead of a text which would need
            # to be translated
            foot_div_elt.addElement("hr")
            for elt in self._footnotes:
                foot_div_elt.addChild(elt)
        return div_elt


class XHTMLParser(object):
    def __init__(self):
        self.flags = None
        self.toto = 0
        self.footnotes = None  # will hold a map from url to buffer id
        for i in range(1, 6):
            setattr(
                self,
                "parser_h{}".format(i),
                lambda elt, buf, level=i: self.parserHeading(elt, buf, level),
            )

    def parser_a(self, elt, buf):
        try:
            url = elt["href"]
        except KeyError:
            # probably an anchor
            try:
                id_ = elt["id"]
                if not id_:
                    # we don't want empty values
                    raise KeyError
            except KeyError:
                self.parserGeneric(elt, buf)
            else:
                buf.append("~~{}~~".format(id_))
            return

        link_data = [url]
        name = str(elt)
        if name != url:
            link_data.insert(0, name)

        lang = elt.getAttribute("lang")
        title = elt.getAttribute("title")
        if lang is not None:
            link_data.append(lang)
        elif title is not None:
            link_data.appand("")
        if title is not None:
            link_data.append(title)
        buf.append("[")
        buf.append("|".join(link_data))
        buf.append("]")

    def parser_acronym(self, elt, buf):
        try:
            title = elt["title"]
        except KeyError:
            log.debug("Acronyme without title, using generic parser")
            self.parserGeneric(elt, buf)
            return
        buf.append("??{}|{}??".format(str(elt), title))

    def parser_blockquote(self, elt, buf):
        # we remove wrapping <p> to avoid empty line with "> "
        children = list(
            [child for child in elt.children if str(child).strip() not in ("", "\n")]
        )
        if len(children) == 1 and children[0].name == "p":
            elt = children[0]
        tmp_buf = []
        self.parseChildren(elt, tmp_buf)
        blockquote = "> " + "\n> ".join("".join(tmp_buf).split("\n"))
        buf.append(blockquote)

    def parser_br(self, elt, buf):
        buf.append("%%%")

    def parser_code(self, elt, buf):
        buf.append("@@")
        self.parseChildren(elt, buf)
        buf.append("@@")

    def parser_del(self, elt, buf):
        buf.append("--")
        self.parseChildren(elt, buf)
        buf.append("--")

    def parser_div(self, elt, buf):
        if elt.getAttribute("class") == "footnotes":
            self.parserFootnote(elt, buf)
        else:
            self.parseChildren(elt, buf, block=True)

    def parser_em(self, elt, buf):
        buf.append("''")
        self.parseChildren(elt, buf)
        buf.append("''")

    def parser_h6(self, elt, buf):
        # XXX: <h6/> heading is not managed by wiki syntax
        #      so we handle it with a <h5/>
        elt = copy.copy(elt)  # we don't want to change to original element
        elt.name = "h5"
        self._parse(elt, buf)

    def parser_hr(self, elt, buf):
        buf.append("\n----\n")

    def parser_img(self, elt, buf):
        try:
            url = elt["src"]
        except KeyError:
            log.warning("Ignoring <img/> without src")
            return

        image_data = [url]

        alt = elt.getAttribute("alt")
        style = elt.getAttribute("style", "")
        desc = elt.getAttribute("longdesc")

        if "0 1em 1em 0" in style:
            position = "L"
        elif "0 0 1em 1em" in style:
            position = "R"
        elif "auto" in style:
            position = "C"
        else:
            position = None

        if alt:
            image_data.append(alt)
        elif position or desc:
            image_data.append("")

        if position:
            image_data.append(position)
        elif desc:
            image_data.append("")

        if desc:
            image_data.append(desc)

        buf.append("((")
        buf.append("|".join(image_data))
        buf.append("))")

    def parser_ins(self, elt, buf):
        buf.append("++")
        self.parseChildren(elt, buf)
        buf.append("++")

    def parser_li(self, elt, buf):
        flag = None
        current_flag = None
        bullets = []
        for flag in reversed(self.flags):
            if flag in (FLAG_UL, FLAG_OL):
                if current_flag is None:
                    current_flag = flag
                if flag == current_flag:
                    bullets.append("*" if flag == FLAG_UL else "#")
                else:
                    break

        if flag != current_flag and buf[-1] == " ":
            # this trick is to avoid a space when we switch
            # from (un)ordered to the other type on the same row
            # e.g. *# unorder + ordered item
            del buf[-1]

        buf.extend(bullets)

        buf.append(" ")
        self.parseChildren(elt, buf)
        buf.append("\n")

    def parser_ol(self, elt, buf):
        self.parserList(elt, buf, FLAG_OL)

    def parser_p(self, elt, buf):
        self.parseChildren(elt, buf)
        buf.append("\n\n")

    def parser_pre(self, elt, buf):
        pre = "".join(
            [
                child.toXml() if domish.IElement.providedBy(child) else str(child)
                for child in elt.children
            ]
        )
        pre = " " + "\n ".join(pre.split("\n"))
        buf.append(pre)

    def parser_q(self, elt, buf):
        quote_data = [str(elt)]

        lang = elt.getAttribute("lang")
        cite = elt.getAttribute("url")

        if lang:
            quote_data.append(lang)
        elif cite:
            quote_data.append("")

        if cite:
            quote_data.append(cite)

        buf.append("{{")
        buf.append("|".join(quote_data))
        buf.append("}}")

    def parser_span(self, elt, buf):
        self.parseChildren(elt, buf, block=True)

    def parser_strong(self, elt, buf):
        buf.append("__")
        self.parseChildren(elt, buf)
        buf.append("__")

    def parser_sup(self, elt, buf):
        # sup is mainly used for footnotes, so we check if we have an anchor inside
        children = list(
            [child for child in elt.children if str(child).strip() not in ("", "\n")]
        )
        if (
            len(children) == 1
            and domish.IElement.providedBy(children[0])
            and children[0].name == "a"
            and "#" in children[0].getAttribute("href", "")
        ):
            url = children[0]["href"]
            note_id = url[url.find("#") + 1 :]
            if not note_id:
                log.warning("bad link found in footnote")
                self.parserGeneric(elt, buf)
                return
            # this looks like a footnote
            buf.append("$$")
            buf.append(" ")  # placeholder
            self.footnotes[note_id] = len(buf) - 1
            buf.append("$$")
        else:
            self.parserGeneric(elt, buf)

    def parser_ul(self, elt, buf):
        self.parserList(elt, buf, FLAG_UL)

    def parserList(self, elt, buf, type_):
        self.flags.append(type_)
        self.parseChildren(elt, buf, block=True)
        idx = 0
        for flag in reversed(self.flags):
            idx -= 1
            if flag == type_:
                del self.flags[idx]
                break

        if idx == 0:
            raise exceptions.InternalError("flag has been removed by an other parser")

    def parserHeading(self, elt, buf, level):
        buf.append((6 - level) * "!")
        for child in elt.children:
            # we ignore other elements for a Hx title
            self.parserText(child, buf)
        buf.append("\n")

    def parserFootnote(self, elt, buf):
        for elt in elt.elements():
            # all children other than <p/> are ignored
            if elt.name == "p":
                a_elt = elt.a
                if a_elt is None:
                    log.warning(
                        "<p/> element doesn't contain <a/> in footnote, ignoring it"
                    )
                    continue
                try:
                    note_idx = self.footnotes[a_elt["id"]]
                except KeyError:
                    log.warning("Note id doesn't match any known note, ignoring it")
                # we create a dummy element to parse all children after the <a/>
                dummy_elt = domish.Element((None, "note"))
                a_idx = elt.children.index(a_elt)
                dummy_elt.children = elt.children[a_idx + 1 :]
                note_buf = []
                self.parseChildren(dummy_elt, note_buf)
                # now we can replace the placeholder
                buf[note_idx] = "".join(note_buf)

    def parserText(self, txt, buf, keep_whitespaces=False):
        txt = str(txt)
        if not keep_whitespaces:
            # we get text and only let one inter word space
            txt = " ".join(txt.split())
        txt = re.sub(ESCAPE_CHARS, r"\\\1", txt)
        if txt:
            buf.append(txt)
        return txt

    def parserGeneric(self, elt, buf):
        # as dotclear wiki syntax handle arbitrary XHTML code
        # we use this feature to add elements that we don't know
        buf.append("\n\n///html\n{}\n///\n\n".format(elt.toXml()))

    def parseChildren(self, elt, buf, block=False):
        first_visible = True
        for child in elt.children:
            if not block and not first_visible and buf and buf[-1][-1] not in (" ", "\n"):
                # we add separation if it isn't already there
                buf.append(" ")
            if domish.IElement.providedBy(child):
                self._parse(child, buf)
                first_visible = False
            else:
                appended = self.parserText(child, buf)
                if appended:
                    first_visible = False

    def _parse(self, elt, buf):
        elt_name = elt.name.lower()
        style = elt.getAttribute("style")
        if style and elt_name not in ELT_WITH_STYLE:
            # if we have style we use generic parser to put raw HTML
            # to avoid losing it
            parser = self.parserGeneric
        else:
            try:
                parser = getattr(self, "parser_{}".format(elt_name))
            except AttributeError:
                log.debug(
                    "Can't find parser for {} element, using generic one".format(elt.name)
                )
                parser = self.parserGeneric
        parser(elt, buf)

    def parse(self, elt):
        self.flags = []
        self.footnotes = {}
        buf = []
        self._parse(elt, buf)
        return "".join(buf)

    def parseString(self, string):
        wrapped_html = "<div>{}</div>".format(string)
        try:
            div_elt = xml_tools.ElementParser()(wrapped_html)
        except domish.ParserError as e:
            log.warning("Error while parsing HTML content: {}".format(e))
            return
        children = list(div_elt.elements())
        if len(children) == 1 and children[0].name == "div":
            div_elt = children[0]
        return self.parse(div_elt)


class DCWikiSyntax(object):
    SYNTAX_NAME = "wiki_dotclear"

    def __init__(self, host):
        log.info(_("Dotclear wiki syntax plugin initialization"))
        self.host = host
        self._dc_parser = DCWikiParser()
        self._xhtml_parser = XHTMLParser()
        self._stx = self.host.plugins["TEXT_SYNTAXES"]
        self._stx.addSyntax(
            self.SYNTAX_NAME, self.parseWiki, self.parseXHTML, [self._stx.OPT_NO_THREAD]
        )

    def parseWiki(self, wiki_stx):
        div_elt = self._dc_parser.parse(wiki_stx)
        return div_elt.toXml()

    def parseXHTML(self, xhtml):
        return self._xhtml_parser.parseString(xhtml)