Mercurial > libervia-backend
diff libervia/backend/plugins/plugin_syntax_wiki_dotclear.py @ 4071:4b842c1fb686
refactoring: renamed `sat` package to `libervia.backend`
author | Goffi <goffi@goffi.org> |
---|---|
date | Fri, 02 Jun 2023 11:49:51 +0200 |
parents | sat/plugins/plugin_syntax_wiki_dotclear.py@524856bd7b19 |
children | 0d7bb4df2343 |
line wrap: on
line diff
#!/usr/bin/env python3


# SàT plugin for Dotclear Wiki Syntax
# Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# XXX: ref used: http://dotclear.org/documentation/2.0/usage/syntaxes#wiki-syntax-and-xhtml-equivalent

from libervia.backend.core.i18n import _
from libervia.backend.core.log import getLogger

log = getLogger(__name__)
from libervia.backend.core.constants import Const as C
from libervia.backend.core import exceptions
from twisted.words.xish import domish
from libervia.backend.tools import xml_tools
import copy
import re

PLUGIN_INFO = {
    C.PI_NAME: "Dotclear Wiki Syntax Plugin",
    C.PI_IMPORT_NAME: "SYNT_DC_WIKI",
    C.PI_TYPE: C.PLUG_TYPE_SYNTAXE,
    C.PI_DEPENDENCIES: ["TEXT_SYNTAXES"],
    C.PI_MAIN: "DCWikiSyntax",
    C.PI_HANDLER: "",
    C.PI_DESCRIPTION: _("""Implementation of Dotclear wiki syntax"""),
}

NOTE_TPL = "[{}]"  # Note template
NOTE_A_REV_TPL = "rev_note_{}"
NOTE_A_TPL = "note_{}"
# the "{}" placeholder receives extra characters to add to the character class
ESCAPE_CHARS_BASE = r"(?P<escape_char>[][{}%|\\/*#@{{}}~$-])"
ESCAPE_CHARS_EXTRA = (
    r"!?_+'()"
)  # These chars are not escaped in XHTML => dc_wiki conversion,
# but are used in the other direction
ESCAPE_CHARS = ESCAPE_CHARS_BASE.format("")
FLAG_UL = "ul"  # must be the name of the element
FLAG_OL = "ol"
ELT_WITH_STYLE = ("img", "div")  # elements where a style attribute is expected

# Inline/line level patterns. Each named group "<name>" is dispatched to
# DCWikiParser.parser_<name>; order matters because the patterns are joined
# as alternatives and "text" is the catch-all.
wiki = [
    r"\\" + ESCAPE_CHARS_BASE.format(ESCAPE_CHARS_EXTRA),
    r"^!!!!!(?P<h1_title>.+?)$",
    r"^!!!!(?P<h2_title>.+?)$",
    r"^!!!(?P<h3_title>.+?)$",
    r"^!!(?P<h4_title>.+?)$",
    r"^!(?P<h5_title>.+?)$",
    r"^----$(?P<horizontal_rule>)",
    r"^\*(?P<list_bullet>.*?)$",
    r"^#(?P<list_ordered>.*?)$",
    r"^ (?P<preformated>.*?)$",
    r"^> +?(?P<quote>.*?)$",
    r"''(?P<emphasis>.+?)''",
    r"__(?P<strong_emphasis>.+?)__",
    r"%%%(?P<line_break>)",
    r"\+\+(?P<insertion>.+?)\+\+",
    r"--(?P<deletion>.+?)--",
    r"\[(?P<link>.+?)\]",
    r"\(\((?P<image>.+?)\)\)",
    r"~(?P<anchor>.+?)~",
    r"\?\?(?P<acronym>.+?\|.+?)\?\?",
    r"{{(?P<inline_quote>.+?)}}",
    r"@@(?P<code>.+?)@@",
    r"\$\$(?P<footnote>.+?)\$\$",
    r"(?P<text>.+?)",
]

wiki_re = re.compile("|".join(wiki), re.MULTILINE | re.DOTALL)
# Block level: either a raw "///html ... ///" section or a paragraph ended by
# an empty line (or the end of the input).
wiki_block_level_re = re.compile(
    r"^///html(?P<html>.+?)///\n\n|(?P<paragraph>.+?)(?:\n{2,}|\Z)",
    re.MULTILINE | re.DOTALL,
)


class DCWikiParser(object):
    """Parser converting Dotclear wiki syntax to a domish element tree.

    Each named group of ``wiki_re`` / ``wiki_block_level_re`` is handled by the
    method named ``parser_<group name>``, looked up dynamically in ``_parse``.
    """

    def __init__(self):
        # list of <p/> elements, one per footnote; reset on each parse() call
        self._footnotes = None
        # The regex groups are named h1_title..h5_title, so the generated
        # parsers must cover 1..5 (range(5) would create h0..h4 and leave
        # "h5_title" without any parser).
        for i in range(1, 6):
            setattr(
                self,
                "parser_h{}_title".format(i),
                # "i=i" binds the current value (avoids late-binding closure)
                lambda string, parent, i=i: self._parser_title(
                    string, parent, "h{}".format(i)
                ),
            )

    def parser_paragraph(self, string, parent):
        """Add a <p/> to parent and parse inline syntax of string into it."""
        p_elt = parent.addElement("p")
        self._parse(string, p_elt)

    def parser_html(self, string, parent):
        """Parse a raw HTML section and add the resulting element to parent.

        The content is wrapped in a <div/> for parsing; if the result is a
        single <div/> child, that child is used directly. Unparsable content
        is logged and ignored.
        """
        wrapped_html = "<div>{}</div>".format(string)
        try:
            div_elt = xml_tools.ElementParser()(wrapped_html)
        except domish.ParserError as e:
            log.warning("Error while parsing HTML content, ignoring it: {}".format(e))
            return
        children = list(div_elt.elements())
        if len(children) == 1 and children[0].name == "div":
            div_elt = children[0]
        parent.addChild(div_elt)

    def parser_escape_char(self, string, parent):
        """Add an escaped character as plain content."""
        parent.addContent(string)

    def _parser_title(self, string, parent, name):
        """Add a heading element called name with string as content."""
        elt = parent.addElement(name)
        elt.addContent(string)

    def parser_horizontal_rule(self, string, parent):
        """Add a <hr/> element to parent."""
        parent.addElement("hr")

    def _parser_list(self, string, parent, list_type):
        """Add a list item, creating or reusing (nested) list elements.

        @param string: item text, after the first bullet has been removed by
            the regex
        @param parent: element where the list must be added
        @param list_type: name of the list element ("ul" or "ol")
        """
        # NOTE(review): only "*" is counted as an additional depth marker,
        # whatever list_type is; remaining "#" markers stay in the item text
        # which is passed to _parse below. Confirm this is intended.
        depth = 0
        while string[depth : depth + 1] == "*":
            depth += 1

        string = string[depth:].lstrip()

        for _ in range(depth + 1):
            # reuse an existing list element if there is one, else create it
            list_elt = getattr(parent, list_type)
            if not list_elt:
                parent = parent.addElement(list_type)
            else:
                parent = list_elt

        li_elt = parent.addElement("li")
        self._parse(string, li_elt)

    def parser_list_bullet(self, string, parent):
        """Handle an unordered list item."""
        self._parser_list(string, parent, "ul")

    def parser_list_ordered(self, string, parent):
        """Handle an ordered list item."""
        self._parser_list(string, parent, "ol")

    def parser_preformated(self, string, parent):
        """Append a line to the <pre/> child of parent, creating it if needed."""
        pre_elt = parent.pre
        if pre_elt is None:
            pre_elt = parent.addElement("pre")
        else:
            # we are on a new line, and this is important for <pre/>
            pre_elt.addContent("\n")
        pre_elt.addContent(string)

    def parser_quote(self, string, parent):
        """Append a quoted line to the <blockquote><p/></blockquote> of parent.

        The <blockquote/> and its <p/> are created on the first quoted line;
        following lines are added to the same <p/>, separated by a line feed.
        """
        blockquote_elt = parent.blockquote
        if blockquote_elt is None:
            blockquote_elt = parent.addElement("blockquote")
        p_elt = blockquote_elt.p
        if p_elt is None:
            p_elt = blockquote_elt.addElement("p")
        else:
            string = "\n" + string

        self._parse(string, p_elt)

    def parser_emphasis(self, string, parent):
        """Add an <em/> element and parse string into it."""
        em_elt = parent.addElement("em")
        self._parse(string, em_elt)

    def parser_strong_emphasis(self, string, parent):
        """Add a <strong/> element and parse string into it."""
        strong_elt = parent.addElement("strong")
        self._parse(string, strong_elt)

    def parser_line_break(self, string, parent):
        """Add a <br/> element to parent."""
        parent.addElement("br")

    def parser_insertion(self, string, parent):
        """Add an <ins/> element and parse string into it."""
        ins_elt = parent.addElement("ins")
        self._parse(string, ins_elt)

    def parser_deletion(self, string, parent):
        """Add a <del/> element and parse string into it."""
        del_elt = parent.addElement("del")
        self._parse(string, del_elt)

    def parser_link(self, string, parent):
        """Add an <a/> element from "url" or "name|url[|lang[|title]]" data."""
        url_data = string.split("|")
        a_elt = parent.addElement("a")
        length = len(url_data)
        if length == 1:
            # only the URL is specified, it is also used as link text
            url = url_data[0]
            a_elt["href"] = url
            a_elt.addContent(url)
        else:
            name = url_data[0]
            url = url_data[1]
            a_elt["href"] = url
            a_elt.addContent(name)
            if length >= 3:
                a_elt["lang"] = url_data[2]
            if length >= 4:
                a_elt["title"] = url_data[3]
            if length > 4:
                log.warning("too much data for url, ignoring extra data")

    def parser_image(self, string, parent):
        """Add an <img/> element from "src[|alt[|position[|longdesc]]]" data.

        The position field is translated to a "style" attribute:
        "l"/"g" float left, "r"/"d" float right, "c" centered (case
        insensitive). Any other value is logged and ignored.
        """
        image_data = string.split("|")
        img_elt = parent.addElement("img")

        for idx, attribute in enumerate(("src", "alt", "position", "longdesc")):
            try:
                data = image_data[idx]
            except IndexError:
                break

            if attribute != "position":
                img_elt[attribute] = data
            else:
                data = data.lower()
                if data in ("l", "g"):
                    img_elt["style"] = "display:block; float:left; margin:0 1em 1em 0"
                elif data in ("r", "d"):
                    img_elt["style"] = "display:block; float:right; margin:0 0 1em 1em"
                elif data == "c":
                    img_elt[
                        "style"
                    ] = "display:block; margin-left:auto; margin-right:auto"
                else:
                    log.warning("bad position argument for image, ignoring it")

    def parser_anchor(self, string, parent):
        """Add an <a/> element with string as "id" attribute."""
        a_elt = parent.addElement("a")
        a_elt["id"] = string

    def parser_acronym(self, string, parent):
        """Add an <acronym/> element from "acronym|title" data."""
        acronym, title = string.split("|", 1)
        acronym_elt = parent.addElement("acronym", content=acronym)
        acronym_elt["title"] = title

    def parser_inline_quote(self, string, parent):
        """Add a <q/> element from "quote[|lang[|cite]]" data."""
        quote_data = string.split("|")
        quote = quote_data[0]
        q_elt = parent.addElement("q", content=quote)
        for idx, attribute in enumerate(("lang", "cite"), 1):
            try:
                data = quote_data[idx]
            except IndexError:
                break
            q_elt[attribute] = data

    def parser_code(self, string, parent):
        """Add a <code/> element with string as raw content."""
        parent.addElement("code", content=string)

    def parser_footnote(self, string, parent):
        """Add a footnote reference to parent and store the note itself.

        A <sup class="note"><a/></sup> linking to the note is added in place,
        and a <p/> holding a back link and the parsed note is appended to
        self._footnotes.
        """
        idx = len(self._footnotes) + 1
        note_txt = NOTE_TPL.format(idx)
        sup_elt = parent.addElement("sup")
        sup_elt["class"] = "note"
        a_elt = sup_elt.addElement("a", content=note_txt)
        a_elt["id"] = NOTE_A_REV_TPL.format(idx)
        a_elt["href"] = "#{}".format(NOTE_A_TPL.format(idx))

        p_elt = domish.Element((None, "p"))
        a_elt = p_elt.addElement("a", content=note_txt)
        a_elt["id"] = NOTE_A_TPL.format(idx)
        a_elt["href"] = "#{}".format(NOTE_A_REV_TPL.format(idx))
        self._parse(string, p_elt)
        # footnotes are actually added at the end of the parsing
        self._footnotes.append(p_elt)

    def parser_text(self, string, parent):
        """Add plain text to parent."""
        parent.addContent(string)

    def _parse(self, string, parent, block_level=False):
        """Parse string and fill parent using the matching parser_* methods.

        @param string: wiki text to parse
        @param parent: element receiving the generated content
        @param block_level: if True use the block level regex (paragraphs and
            raw HTML sections), else the inline/line level one
        """
        regex = wiki_block_level_re if block_level else wiki_re

        for match in regex.finditer(string):
            if match.lastgroup is None:
                # no named group matched: the whole string is added as is
                parent.addContent(string)
                return
            matched = match.group(match.lastgroup)
            try:
                parser = getattr(self, "parser_{}".format(match.lastgroup))
            except AttributeError:
                log.warning("No parser found for {}".format(match.lastgroup))
                continue
            parser(matched, parent)

    def parse(self, string):
        """Convert wiki text to XHTML.

        @param string: Dotclear wiki text
        @return: <div/> domish element holding the converted content, with a
            <div class="footnotes"/> appended when there are footnotes
        """
        self._footnotes = []
        div_elt = domish.Element((None, "div"))
        self._parse(string, parent=div_elt, block_level=True)
        if self._footnotes:
            foot_div_elt = div_elt.addElement("div")
            foot_div_elt["class"] = "footnotes"
            # we add a simple horizontal rule which can be customized
            # with footnotes class, instead of a text which would need
            # to be translated
            foot_div_elt.addElement("hr")
            for elt in self._footnotes:
                foot_div_elt.addChild(elt)
        return div_elt


class XHTMLParser(object):
    """Parser converting an XHTML element tree to Dotclear wiki syntax.

    Each element is handled by the method named ``parser_<element name>``
    (see ``_parse``); elements without a dedicated parser, or with an
    unexpected "style" attribute, are output as raw HTML sections.
    Output is accumulated as strings in a list buffer ("buf").
    """

    def __init__(self):
        # stack of list types (FLAG_UL/FLAG_OL) currently being parsed
        self.flags = None
        # not used anywhere in this module; kept so the attribute still exists
        self.toto = 0
        # map from footnote id to the index of its placeholder in the buffer
        self.footnotes = None
        for i in range(1, 6):
            setattr(
                self,
                "parser_h{}".format(i),
                # "level=i" binds the current value (avoids late-binding closure)
                lambda elt, buf, level=i: self.parser_heading(elt, buf, level),
            )

    def parser_a(self, elt, buf):
        """Convert <a/> to a link ("[name|url|lang|title]") or an anchor."""
        try:
            url = elt["href"]
        except KeyError:
            # probably an anchor
            try:
                id_ = elt["id"]
                if not id_:
                    # we don't want empty values
                    raise KeyError
            except KeyError:
                self.parser_generic(elt, buf)
            else:
                # anchors are delimited by a single "~" on each side, as
                # expected by the "anchor" pattern of the wiki regex
                buf.append("~{}~".format(id_))
            return

        link_data = [url]
        name = str(elt)
        if name != url:
            link_data.insert(0, name)

        lang = elt.getAttribute("lang")
        title = elt.getAttribute("title")
        if lang is not None:
            link_data.append(lang)
        elif title is not None:
            # empty placeholder so title stays in the right field
            link_data.append("")
        if title is not None:
            link_data.append(title)
        buf.append("[")
        buf.append("|".join(link_data))
        buf.append("]")

    def parser_acronym(self, elt, buf):
        """Convert <acronym/> to "??acronym|title??" (raw HTML if no title)."""
        try:
            title = elt["title"]
        except KeyError:
            log.debug("Acronyme without title, using generic parser")
            self.parser_generic(elt, buf)
            return
        buf.append("??{}|{}??".format(str(elt), title))

    def parser_blockquote(self, elt, buf):
        """Convert <blockquote/> to lines prefixed with "> "."""
        # we remove wrapping <p> to avoid empty line with "> "
        children = [child for child in elt.children if str(child).strip()]
        if (
            len(children) == 1
            # a text node has no "name" attribute, only elements can be <p/>
            and domish.IElement.providedBy(children[0])
            and children[0].name == "p"
        ):
            elt = children[0]
        tmp_buf = []
        self.parse_children(elt, tmp_buf)
        blockquote = "> " + "\n> ".join("".join(tmp_buf).split("\n"))
        buf.append(blockquote)

    def parser_br(self, elt, buf):
        """Convert <br/> to "%%%"."""
        buf.append("%%%")

    def parser_code(self, elt, buf):
        """Convert <code/> to "@@...@@"."""
        buf.append("@@")
        self.parse_children(elt, buf)
        buf.append("@@")

    def parser_del(self, elt, buf):
        """Convert <del/> to "--...--"."""
        buf.append("--")
        self.parse_children(elt, buf)
        buf.append("--")

    def parser_div(self, elt, buf):
        """Convert <div/>: footnotes container or generic block container."""
        if elt.getAttribute("class") == "footnotes":
            self.parser_footnote(elt, buf)
        else:
            self.parse_children(elt, buf, block=True)

    def parser_em(self, elt, buf):
        """Convert <em/> to "''...''"."""
        buf.append("''")
        self.parse_children(elt, buf)
        buf.append("''")

    def parser_h6(self, elt, buf):
        """Convert <h6/> as if it were a <h5/>."""
        # XXX: <h6/> heading is not managed by wiki syntax
        #      so we handle it with a <h5/>
        elt = copy.copy(elt)  # we don't want to change to original element
        elt.name = "h5"
        self._parse(elt, buf)

    def parser_hr(self, elt, buf):
        """Convert <hr/> to a "----" line."""
        buf.append("\n----\n")

    def parser_img(self, elt, buf):
        """Convert <img/> to "((src|alt|position|longdesc))".

        Position is deduced from the "style" attribute; <img/> without "src"
        is ignored.
        """
        try:
            url = elt["src"]
        except KeyError:
            log.warning("Ignoring <img/> without src")
            return

        image_data = [url]

        alt = elt.getAttribute("alt")
        style = elt.getAttribute("style", "")
        desc = elt.getAttribute("longdesc")

        # these markers are the margins used by DCWikiParser.parser_image
        if "0 1em 1em 0" in style:
            position = "L"
        elif "0 0 1em 1em" in style:
            position = "R"
        elif "auto" in style:
            position = "C"
        else:
            position = None

        # empty placeholders are added when a later field is set
        if alt:
            image_data.append(alt)
        elif position or desc:
            image_data.append("")

        if position:
            image_data.append(position)
        elif desc:
            image_data.append("")

        if desc:
            image_data.append(desc)

        buf.append("((")
        buf.append("|".join(image_data))
        buf.append("))")

    def parser_ins(self, elt, buf):
        """Convert <ins/> to "++...++"."""
        buf.append("++")
        self.parse_children(elt, buf)
        buf.append("++")

    def parser_li(self, elt, buf):
        """Convert <li/> to a bullet line, using the current list flags."""
        flag = None
        current_flag = None
        bullets = []
        # one bullet per innermost consecutive list flag of the same type
        for flag in reversed(self.flags):
            if flag in (FLAG_UL, FLAG_OL):
                if current_flag is None:
                    current_flag = flag
                if flag == current_flag:
                    bullets.append("*" if flag == FLAG_UL else "#")
                else:
                    break

        if flag != current_flag and buf and buf[-1] == " ":
            # this trick is to avoid a space when we switch
            # from (un)ordered to the other type on the same row
            # e.g. *# unorder + ordered item
            del buf[-1]

        buf.extend(bullets)

        buf.append(" ")
        self.parse_children(elt, buf)
        buf.append("\n")

    def parser_ol(self, elt, buf):
        """Convert <ol/>."""
        self.parser_list(elt, buf, FLAG_OL)

    def parser_p(self, elt, buf):
        """Convert <p/> to its content followed by an empty line."""
        self.parse_children(elt, buf)
        buf.append("\n\n")

    def parser_pre(self, elt, buf):
        """Convert <pre/> to lines prefixed with a space.

        Child elements are kept as serialised XML, text is kept as is.
        """
        pre = "".join(
            [
                child.toXml() if domish.IElement.providedBy(child) else str(child)
                for child in elt.children
            ]
        )
        pre = " " + "\n ".join(pre.split("\n"))
        buf.append(pre)

    def parser_q(self, elt, buf):
        """Convert <q/> to "{{quote|lang|cite}}"."""
        quote_data = [str(elt)]

        lang = elt.getAttribute("lang")
        # "cite" is the attribute set by DCWikiParser.parser_inline_quote
        cite = elt.getAttribute("cite")

        if lang:
            quote_data.append(lang)
        elif cite:
            # empty placeholder so cite stays in the right field
            quote_data.append("")

        if cite:
            quote_data.append(cite)

        buf.append("{{")
        buf.append("|".join(quote_data))
        buf.append("}}")

    def parser_span(self, elt, buf):
        """Convert <span/> by parsing its children."""
        self.parse_children(elt, buf, block=True)

    def parser_strong(self, elt, buf):
        """Convert <strong/> to "__...__"."""
        buf.append("__")
        self.parse_children(elt, buf)
        buf.append("__")

    def parser_sup(self, elt, buf):
        """Convert <sup/>: footnote reference if it wraps an internal link.

        For a footnote, "$$ $$" is added with a placeholder whose buffer index
        is stored in self.footnotes; parser_footnote replaces it later.
        Other <sup/> are output as raw HTML.
        """
        # sup is mainly used for footnotes, so we check if we have an anchor inside
        children = [child for child in elt.children if str(child).strip()]
        if (
            len(children) == 1
            and domish.IElement.providedBy(children[0])
            and children[0].name == "a"
            and "#" in children[0].getAttribute("href", "")
        ):
            url = children[0]["href"]
            note_id = url[url.find("#") + 1 :]
            if not note_id:
                log.warning("bad link found in footnote")
                self.parser_generic(elt, buf)
                return
            # this looks like a footnote
            buf.append("$$")
            buf.append(" ")  # placeholder
            self.footnotes[note_id] = len(buf) - 1
            buf.append("$$")
        else:
            self.parser_generic(elt, buf)

    def parser_ul(self, elt, buf):
        """Convert <ul/>."""
        self.parser_list(elt, buf, FLAG_UL)

    def parser_list(self, elt, buf, type_):
        """Parse children of a list element with its flag pushed on the stack.

        @param type_: FLAG_UL or FLAG_OL
        @raise exceptions.InternalError: the flag pushed by this call is not
            in self.flags anymore once children have been parsed
        """
        self.flags.append(type_)
        self.parse_children(elt, buf, block=True)
        # remove the last flag of this type
        for idx in range(len(self.flags) - 1, -1, -1):
            if self.flags[idx] == type_:
                del self.flags[idx]
                break
        else:
            raise exceptions.InternalError("flag has been removed by an other parser")

    def parser_heading(self, elt, buf, level):
        """Convert a heading to a line starting with (6 - level) "!"."""
        buf.append((6 - level) * "!")
        for child in elt.children:
            # we ignore other elements for a Hx title
            self.parser_text(child, buf)
        buf.append("\n")

    def parser_footnote(self, elt, buf):
        """Replace footnote placeholders of buf with the notes found in elt.

        @param elt: footnotes container; each <p/> child is expected to start
            with the <a/> whose "id" matches a reference seen by parser_sup
        """
        for p_elt in elt.elements():
            # all children other than <p/> are ignored
            if p_elt.name == "p":
                a_elt = p_elt.a
                if a_elt is None:
                    log.warning(
                        "<p/> element doesn't contain <a/> in footnote, ignoring it"
                    )
                    continue
                try:
                    note_idx = self.footnotes[a_elt["id"]]
                except KeyError:
                    log.warning("Note id doesn't match any known note, ignoring it")
                    # without index there is no placeholder to replace
                    continue
                # we create a dummy element to parse all children after the <a/>
                dummy_elt = domish.Element((None, "note"))
                a_idx = p_elt.children.index(a_elt)
                dummy_elt.children = p_elt.children[a_idx + 1 :]
                note_buf = []
                self.parse_children(dummy_elt, note_buf)
                # now we can replace the placeholder
                buf[note_idx] = "".join(note_buf)

    def parser_text(self, txt, buf, keep_whitespaces=False):
        """Escape text and append it to buf if it is not empty.

        @param txt: text node (converted with str())
        @param keep_whitespaces: if False, runs of whitespace are reduced to
            a single space and leading/trailing whitespace is removed
        @return: the escaped text (may be empty)
        """
        txt = str(txt)
        if not keep_whitespaces:
            # we get text and only let one inter word space
            txt = " ".join(txt.split())
        txt = re.sub(ESCAPE_CHARS, r"\\\1", txt)
        if txt:
            buf.append(txt)
        return txt

    def parser_generic(self, elt, buf):
        """Output elt as a raw "///html" section."""
        # as dotclear wiki syntax handle arbitrary XHTML code
        # we use this feature to add elements that we don't know
        buf.append("\n\n///html\n{}\n///\n\n".format(elt.toXml()))

    def parse_children(self, elt, buf, block=False):
        """Parse all children (elements and text) of elt into buf.

        @param block: if False, a space is inserted between children when the
            buffer doesn't already end with a space or a line feed
        """
        first_visible = True
        for child in elt.children:
            if (
                not block
                and not first_visible
                and buf
                # endswith() also copes with an empty string in the buffer
                and not buf[-1].endswith((" ", "\n"))
            ):
                # we add separation if it isn't already there
                buf.append(" ")
            if domish.IElement.providedBy(child):
                self._parse(child, buf)
                first_visible = False
            else:
                appended = self.parser_text(child, buf)
                if appended:
                    first_visible = False

    def _parse(self, elt, buf):
        """Dispatch elt to its parser_<name> method, or to the generic one."""
        elt_name = elt.name.lower()
        style = elt.getAttribute("style")
        if style and elt_name not in ELT_WITH_STYLE:
            # if we have style we use generic parser to put raw HTML
            # to avoid losing it
            parser = self.parser_generic
        else:
            try:
                parser = getattr(self, "parser_{}".format(elt_name))
            except AttributeError:
                log.debug(
                    "Can't find parser for {} element, using generic one".format(elt.name)
                )
                parser = self.parser_generic
        parser(elt, buf)

    def parse(self, elt):
        """Convert an element to wiki syntax.

        @param elt: root element to convert
        @return: wiki text
        """
        self.flags = []
        self.footnotes = {}
        buf = []
        self._parse(elt, buf)
        return "".join(buf)

    def parseString(self, string):
        """Convert an XHTML string to wiki syntax.

        @param string: XHTML content (wrapped in a <div/> for parsing)
        @return: wiki text, or None if the content can't be parsed
        """
        wrapped_html = "<div>{}</div>".format(string)
        try:
            div_elt = xml_tools.ElementParser()(wrapped_html)
        except domish.ParserError as e:
            log.warning("Error while parsing HTML content: {}".format(e))
            return
        children = list(div_elt.elements())
        if len(children) == 1 and children[0].name == "div":
            div_elt = children[0]
        return self.parse(div_elt)


class DCWikiSyntax(object):
    """Plugin registering Dotclear wiki syntax with the TEXT_SYNTAXES plugin."""

    SYNTAX_NAME = "wiki_dotclear"

    def __init__(self, host):
        log.info(_("Dotclear wiki syntax plugin initialization"))
        self.host = host
        self._dc_parser = DCWikiParser()
        self._xhtml_parser = XHTMLParser()
        self._stx = self.host.plugins["TEXT_SYNTAXES"]
        self._stx.add_syntax(
            self.SYNTAX_NAME, self.parse_wiki, self.parse_xhtml, [self._stx.OPT_NO_THREAD]
        )

    def parse_wiki(self, wiki_stx):
        """Convert wiki text to serialised XHTML."""
        div_elt = self._dc_parser.parse(wiki_stx)
        return div_elt.toXml()

    def parse_xhtml(self, xhtml):
        """Convert an XHTML string to wiki text."""
        return self._xhtml_parser.parseString(xhtml)