libervia-backend: libervia/backend/plugins/plugin_syntax_wiki

comparison libervia/backend/plugins/plugin_syntax_wiki_dotclear.py @ 4071:4b842c1fb686

refactoring: renamed `sat` package to `libervia.backend`

author	Goffi <goffi@goffi.org>
date	Fri, 02 Jun 2023 11:49:51 +0200
parents	sat/plugins/plugin_syntax_wiki_dotclear.py@524856bd7b19
children	0d7bb4df2343

comparison

equal deleted inserted replaced

-:d10748475025
+:4b842c1fb686
+#!/usr/bin/env python3
+# SàT plugin for Dotclear Wiki Syntax
+# Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+# XXX: ref used: http://dotclear.org/documentation/2.0/usage/syntaxes#wiki-syntax-and-xhtml-equivalent
+from libervia.backend.core.i18n import _
+from libervia.backend.core.log import getLogger
+log = getLogger(__name__)
+from libervia.backend.core.constants import Const as C
+from libervia.backend.core import exceptions
+from twisted.words.xish import domish
+from libervia.backend.tools import xml_tools
+import copy
+import re
+PLUGIN_INFO = {
+C.PI_NAME: "Dotclear Wiki Syntax Plugin",
+C.PI_IMPORT_NAME: "SYNT_DC_WIKI",
+C.PI_TYPE: C.PLUG_TYPE_SYNTAXE,
+C.PI_DEPENDENCIES: ["TEXT_SYNTAXES"],
+C.PI_MAIN: "DCWikiSyntax",
+C.PI_HANDLER: "",
+C.PI_DESCRIPTION: _("""Implementation of Dotclear wiki syntax"""),
+}
+NOTE_TPL = "[{}]"  # Note template
+NOTE_A_REV_TPL = "rev_note_{}"
+NOTE_A_TPL = "note_{}"
+ESCAPE_CHARS_BASE = r"(?P<escape_char>[][{}%|\\/*#@{{}}~$-])"
+ESCAPE_CHARS_EXTRA = (
+r"!?_+'()"
+)  # These chars are not escaped in XHTML => dc_wiki conversion,
+# but are used in the other direction
+ESCAPE_CHARS = ESCAPE_CHARS_BASE.format("")
+FLAG_UL = "ul"  # must be the name of the element
+FLAG_OL = "ol"
+ELT_WITH_STYLE = ("img", "div")  # elements where a style attribute is expected
+wiki = [
+r"\\" + ESCAPE_CHARS_BASE.format(ESCAPE_CHARS_EXTRA),
+r"^!!!!!(?P<h1_title>.+?)$",
+r"^!!!!(?P<h2_title>.+?)$",
+r"^!!!(?P<h3_title>.+?)$",
+r"^!!(?P<h4_title>.+?)$",
+r"^!(?P<h5_title>.+?)$",
+r"^----$(?P<horizontal_rule>)",
+r"^\*(?P<list_bullet>.*?)$",
+r"^#(?P<list_ordered>.*?)$",
+r"^ (?P<preformated>.*?)$",
+r"^> +?(?P<quote>.*?)$",
+r"''(?P<emphasis>.+?)''",
+r"__(?P<strong_emphasis>.+?)__",
+r"%%%(?P<line_break>)",
+r"\+\+(?P<insertion>.+?)\+\+",
+r"--(?P<deletion>.+?)--",
+r"\[(?P<link>.+?)\]",
+r"\(\((?P<image>.+?)\)\)",
+r"~(?P<anchor>.+?)~",
+r"\?\?(?P<acronym>.+?\|.+?)\?\?",
+r"{{(?P<inline_quote>.+?)}}",
+r"@@(?P<code>.+?)@@",
+r"\$\$(?P<footnote>.+?)\$\$",
+r"(?P<text>.+?)",
+]
+wiki_re = re.compile("|".join(wiki), re.MULTILINE | re.DOTALL)
+wiki_block_level_re = re.compile(
+r"^///html(?P<html>.+?)///\n\n|(?P<paragraph>.+?)(?:\n{2,}|\Z)",
+re.MULTILINE | re.DOTALL,
+)
+class DCWikiParser(object):
+def __init__(self):
+self._footnotes = None
+for i in range(5):
+setattr(
+self,
+"parser_h{}_title".format(i),
+lambda string, parent, i=i: self._parser_title(
+string, parent, "h{}".format(i)
+),
+)
+def parser_paragraph(self, string, parent):
+p_elt = parent.addElement("p")
+self._parse(string, p_elt)
+def parser_html(self, string, parent):
+wrapped_html = "<div>{}</div>".format(string)
+try:
+div_elt = xml_tools.ElementParser()(wrapped_html)
+except domish.ParserError as e:
+log.warning("Error while parsing HTML content, ignoring it: {}".format(e))
+return
+children = list(div_elt.elements())
+if len(children) == 1 and children[0].name == "div":
+div_elt = children[0]
+parent.addChild(div_elt)
+def parser_escape_char(self, string, parent):
+parent.addContent(string)
+def _parser_title(self, string, parent, name):
+elt = parent.addElement(name)
+elt.addContent(string)
+def parser_horizontal_rule(self, string, parent):
+parent.addElement("hr")
+def _parser_list(self, string, parent, list_type):
+depth = 0
+while string[depth : depth + 1] == "*":
+depth += 1
+string = string[depth:].lstrip()
+for i in range(depth + 1):
+list_elt = getattr(parent, list_type)
+if not list_elt:
+parent = parent.addElement(list_type)
+else:
+parent = list_elt
+li_elt = parent.addElement("li")
+self._parse(string, li_elt)
+def parser_list_bullet(self, string, parent):
+self._parser_list(string, parent, "ul")
+def parser_list_ordered(self, string, parent):
+self._parser_list(string, parent, "ol")
+def parser_preformated(self, string, parent):
+pre_elt = parent.pre
+if pre_elt is None:
+pre_elt = parent.addElement("pre")
+else:
+# we are on a new line, and this is important for <pre/>
+pre_elt.addContent("\n")
+pre_elt.addContent(string)
+def parser_quote(self, string, parent):
+blockquote_elt = parent.blockquote
+if blockquote_elt is None:
+blockquote_elt = parent.addElement("blockquote")
+p_elt = blockquote_elt.p
+if p_elt is None:
+p_elt = blockquote_elt.addElement("p")
+else:
+string = "\n" + string
+self._parse(string, p_elt)
+def parser_emphasis(self, string, parent):
+em_elt = parent.addElement("em")
+self._parse(string, em_elt)
+def parser_strong_emphasis(self, string, parent):
+strong_elt = parent.addElement("strong")
+self._parse(string, strong_elt)
+def parser_line_break(self, string, parent):
+parent.addElement("br")
+def parser_insertion(self, string, parent):
+ins_elt = parent.addElement("ins")
+self._parse(string, ins_elt)
+def parser_deletion(self, string, parent):
+del_elt = parent.addElement("del")
+self._parse(string, del_elt)
+def parser_link(self, string, parent):
+url_data = string.split("|")
+a_elt = parent.addElement("a")
+length = len(url_data)
+if length == 1:
+url = url_data[0]
+a_elt["href"] = url
+a_elt.addContent(url)
+else:
+name = url_data[0]
+url = url_data[1]
+a_elt["href"] = url
+a_elt.addContent(name)
+if length >= 3:
+a_elt["lang"] = url_data[2]
+if length >= 4:
+a_elt["title"] = url_data[3]
+if length > 4:
+log.warning("too much data for url, ignoring extra data")
+def parser_image(self, string, parent):
+image_data = string.split("|")
+img_elt = parent.addElement("img")
+for idx, attribute in enumerate(("src", "alt", "position", "longdesc")):
+try:
+data = image_data[idx]
+except IndexError:
+break
+if attribute != "position":
+img_elt[attribute] = data
+else:
+data = data.lower()
+if data in ("l", "g"):
+img_elt["style"] = "display:block; float:left; margin:0 1em 1em 0"
+elif data in ("r", "d"):
+img_elt["style"] = "display:block; float:right; margin:0 0 1em 1em"
+elif data == "c":
+img_elt[
+"style"
+] = "display:block; margin-left:auto; margin-right:auto"
+else:
+log.warning("bad position argument for image, ignoring it")
+def parser_anchor(self, string, parent):
+a_elt = parent.addElement("a")
+a_elt["id"] = string
+def parser_acronym(self, string, parent):
+acronym, title = string.split("|", 1)
+acronym_elt = parent.addElement("acronym", content=acronym)
+acronym_elt["title"] = title
+def parser_inline_quote(self, string, parent):
+quote_data = string.split("|")
+quote = quote_data[0]
+q_elt = parent.addElement("q", content=quote)
+for idx, attribute in enumerate(("lang", "cite"), 1):
+try:
+data = quote_data[idx]
+except IndexError:
+break
+q_elt[attribute] = data
+def parser_code(self, string, parent):
+parent.addElement("code", content=string)
+def parser_footnote(self, string, parent):
+idx = len(self._footnotes) + 1
+note_txt = NOTE_TPL.format(idx)
+sup_elt = parent.addElement("sup")
+sup_elt["class"] = "note"
+a_elt = sup_elt.addElement("a", content=note_txt)
+a_elt["id"] = NOTE_A_REV_TPL.format(idx)
+a_elt["href"] = "#{}".format(NOTE_A_TPL.format(idx))
+p_elt = domish.Element((None, "p"))
+a_elt = p_elt.addElement("a", content=note_txt)
+a_elt["id"] = NOTE_A_TPL.format(idx)
+a_elt["href"] = "#{}".format(NOTE_A_REV_TPL.format(idx))
+self._parse(string, p_elt)
+# footnotes are actually added at the end of the parsing
+self._footnotes.append(p_elt)
+def parser_text(self, string, parent):
+parent.addContent(string)
+def _parse(self, string, parent, block_level=False):
+regex = wiki_block_level_re if block_level else wiki_re
+for match in regex.finditer(string):
+if match.lastgroup is None:
+parent.addContent(string)
+return
+matched = match.group(match.lastgroup)
+try:
+parser = getattr(self, "parser_{}".format(match.lastgroup))
+except AttributeError:
+log.warning("No parser found for {}".format(match.lastgroup))
+# parent.addContent(string)
+continue
+parser(matched, parent)
+def parse(self, string):
+self._footnotes = []
+div_elt = domish.Element((None, "div"))
+self._parse(string, parent=div_elt, block_level=True)
+if self._footnotes:
+foot_div_elt = div_elt.addElement("div")
+foot_div_elt["class"] = "footnotes"
+# we add a simple horizontal rule which can be customized
+# with footnotes class, instead of a text which would need
+# to be translated
+foot_div_elt.addElement("hr")
+for elt in self._footnotes:
+foot_div_elt.addChild(elt)
+return div_elt
+class XHTMLParser(object):
+def __init__(self):
+self.flags = None
+self.toto = 0
+self.footnotes = None  # will hold a map from url to buffer id
+for i in range(1, 6):
+setattr(
+self,
+"parser_h{}".format(i),
+lambda elt, buf, level=i: self.parser_heading(elt, buf, level),
+)
+def parser_a(self, elt, buf):
+try:
+url = elt["href"]
+except KeyError:
+# probably an anchor
+try:
+id_ = elt["id"]
+if not id_:
+# we don't want empty values
+raise KeyError
+except KeyError:
+self.parser_generic(elt, buf)
+else:
+buf.append("~~{}~~".format(id_))
+return
+link_data = [url]
+name = str(elt)
+if name != url:
+link_data.insert(0, name)
+lang = elt.getAttribute("lang")
+title = elt.getAttribute("title")
+if lang is not None:
+link_data.append(lang)
+elif title is not None:
+link_data.appand("")
+if title is not None:
+link_data.append(title)
+buf.append("[")
+buf.append("|".join(link_data))
+buf.append("]")
+def parser_acronym(self, elt, buf):
+try:
+title = elt["title"]
+except KeyError:
+log.debug("Acronyme without title, using generic parser")
+self.parser_generic(elt, buf)
+return
+buf.append("??{}|{}??".format(str(elt), title))
+def parser_blockquote(self, elt, buf):
+# we remove wrapping <p> to avoid empty line with "> "
+children = list(
+[child for child in elt.children if str(child).strip() not in ("", "\n")]
+)
+if len(children) == 1 and children[0].name == "p":
+elt = children[0]
+tmp_buf = []
+self.parse_children(elt, tmp_buf)
+blockquote = "> " + "\n> ".join("".join(tmp_buf).split("\n"))
+buf.append(blockquote)
+def parser_br(self, elt, buf):
+buf.append("%%%")
+def parser_code(self, elt, buf):
+buf.append("@@")
+self.parse_children(elt, buf)
+buf.append("@@")
+def parser_del(self, elt, buf):
+buf.append("--")
+self.parse_children(elt, buf)
+buf.append("--")
+def parser_div(self, elt, buf):
+if elt.getAttribute("class") == "footnotes":
+self.parser_footnote(elt, buf)
+else:
+self.parse_children(elt, buf, block=True)
+def parser_em(self, elt, buf):
+buf.append("''")
+self.parse_children(elt, buf)
+buf.append("''")
+def parser_h6(self, elt, buf):
+# XXX: <h6/> heading is not managed by wiki syntax
+#      so we handle it with a <h5/>
+elt = copy.copy(elt)  # we don't want to change to original element
+elt.name = "h5"
+self._parse(elt, buf)
+def parser_hr(self, elt, buf):
+buf.append("\n----\n")
+def parser_img(self, elt, buf):
+try:
+url = elt["src"]
+except KeyError:
+log.warning("Ignoring <img/> without src")
+return
+image_data = [url]
+alt = elt.getAttribute("alt")
+style = elt.getAttribute("style", "")
+desc = elt.getAttribute("longdesc")
+if "0 1em 1em 0" in style:
+position = "L"
+elif "0 0 1em 1em" in style:
+position = "R"
+elif "auto" in style:
+position = "C"
+else:
+position = None
+if alt:
+image_data.append(alt)
+elif position or desc:
+image_data.append("")
+if position:
+image_data.append(position)
+elif desc:
+image_data.append("")
+if desc:
+image_data.append(desc)
+buf.append("((")
+buf.append("|".join(image_data))
+buf.append("))")
+def parser_ins(self, elt, buf):
+buf.append("++")
+self.parse_children(elt, buf)
+buf.append("++")
+def parser_li(self, elt, buf):
+flag = None
+current_flag = None
+bullets = []
+for flag in reversed(self.flags):
+if flag in (FLAG_UL, FLAG_OL):
+if current_flag is None:
+current_flag = flag
+if flag == current_flag:
+bullets.append("*" if flag == FLAG_UL else "#")
+else:
+break
+if flag != current_flag and buf[-1] == " ":
+# this trick is to avoid a space when we switch
+# from (un)ordered to the other type on the same row
+# e.g. *# unorder + ordered item
+del buf[-1]
+buf.extend(bullets)
+buf.append(" ")
+self.parse_children(elt, buf)
+buf.append("\n")
+def parser_ol(self, elt, buf):
+self.parser_list(elt, buf, FLAG_OL)
+def parser_p(self, elt, buf):
+self.parse_children(elt, buf)
+buf.append("\n\n")
+def parser_pre(self, elt, buf):
+pre = "".join(
+[
+child.toXml() if domish.IElement.providedBy(child) else str(child)
+for child in elt.children
+]
+)
+pre = " " + "\n ".join(pre.split("\n"))
+buf.append(pre)
+def parser_q(self, elt, buf):
+quote_data = [str(elt)]
+lang = elt.getAttribute("lang")
+cite = elt.getAttribute("url")
+if lang:
+quote_data.append(lang)
+elif cite:
+quote_data.append("")
+if cite:
+quote_data.append(cite)
+buf.append("{{")
+buf.append("|".join(quote_data))
+buf.append("}}")
+def parser_span(self, elt, buf):
+self.parse_children(elt, buf, block=True)
+def parser_strong(self, elt, buf):
+buf.append("__")
+self.parse_children(elt, buf)
+buf.append("__")
+def parser_sup(self, elt, buf):
+# sup is mainly used for footnotes, so we check if we have an anchor inside
+children = list(
+[child for child in elt.children if str(child).strip() not in ("", "\n")]
+)
+if (
+len(children) == 1
+and domish.IElement.providedBy(children[0])
+and children[0].name == "a"
+and "#" in children[0].getAttribute("href", "")
+):
+url = children[0]["href"]
+note_id = url[url.find("#") + 1 :]
+if not note_id:
+log.warning("bad link found in footnote")
+self.parser_generic(elt, buf)
+return
+# this looks like a footnote
+buf.append("$$")
+buf.append(" ")  # placeholder
+self.footnotes[note_id] = len(buf) - 1
+buf.append("$$")
+else:
+self.parser_generic(elt, buf)
+def parser_ul(self, elt, buf):
+self.parser_list(elt, buf, FLAG_UL)
+def parser_list(self, elt, buf, type_):
+self.flags.append(type_)
+self.parse_children(elt, buf, block=True)
+idx = 0
+for flag in reversed(self.flags):
+idx -= 1
+if flag == type_:
+del self.flags[idx]
+break
+if idx == 0:
+raise exceptions.InternalError("flag has been removed by an other parser")
+def parser_heading(self, elt, buf, level):
+buf.append((6 - level) * "!")
+for child in elt.children:
+# we ignore other elements for a Hx title
+self.parser_text(child, buf)
+buf.append("\n")
+def parser_footnote(self, elt, buf):
+for elt in elt.elements():
+# all children other than <p/> are ignored
+if elt.name == "p":
+a_elt = elt.a
+if a_elt is None:
+log.warning(
+"<p/> element doesn't contain <a/> in footnote, ignoring it"
+)
+continue
+try:
+note_idx = self.footnotes[a_elt["id"]]
+except KeyError:
+log.warning("Note id doesn't match any known note, ignoring it")
+# we create a dummy element to parse all children after the <a/>
+dummy_elt = domish.Element((None, "note"))
+a_idx = elt.children.index(a_elt)
+dummy_elt.children = elt.children[a_idx + 1 :]
+note_buf = []
+self.parse_children(dummy_elt, note_buf)
+# now we can replace the placeholder
+buf[note_idx] = "".join(note_buf)
+def parser_text(self, txt, buf, keep_whitespaces=False):
+txt = str(txt)
+if not keep_whitespaces:
+# we get text and only let one inter word space
+txt = " ".join(txt.split())
+txt = re.sub(ESCAPE_CHARS, r"\\\1", txt)
+if txt:
+buf.append(txt)
+return txt
+def parser_generic(self, elt, buf):
+# as dotclear wiki syntax handle arbitrary XHTML code
+# we use this feature to add elements that we don't know
+buf.append("\n\n///html\n{}\n///\n\n".format(elt.toXml()))
+def parse_children(self, elt, buf, block=False):
+first_visible = True
+for child in elt.children:
+if not block and not first_visible and buf and buf[-1][-1] not in (" ", "\n"):
+# we add separation if it isn't already there
+buf.append(" ")
+if domish.IElement.providedBy(child):
+self._parse(child, buf)
+first_visible = False
+else:
+appended = self.parser_text(child, buf)
+if appended:
+first_visible = False
+def _parse(self, elt, buf):
+elt_name = elt.name.lower()
+style = elt.getAttribute("style")
+if style and elt_name not in ELT_WITH_STYLE:
+# if we have style we use generic parser to put raw HTML
+# to avoid losing it
+parser = self.parser_generic
+else:
+try:
+parser = getattr(self, "parser_{}".format(elt_name))
+except AttributeError:
+log.debug(
+"Can't find parser for {} element, using generic one".format(elt.name)
+)
+parser = self.parser_generic
+parser(elt, buf)
+def parse(self, elt):
+self.flags = []
+self.footnotes = {}
+buf = []
+self._parse(elt, buf)
+return "".join(buf)
+def parseString(self, string):
+wrapped_html = "<div>{}</div>".format(string)
+try:
+div_elt = xml_tools.ElementParser()(wrapped_html)
+except domish.ParserError as e:
+log.warning("Error while parsing HTML content: {}".format(e))
+return
+children = list(div_elt.elements())
+if len(children) == 1 and children[0].name == "div":
+div_elt = children[0]
+return self.parse(div_elt)
+class DCWikiSyntax(object):
+SYNTAX_NAME = "wiki_dotclear"
+def __init__(self, host):
+log.info(_("Dotclear wiki syntax plugin initialization"))
+self.host = host
+self._dc_parser = DCWikiParser()
+self._xhtml_parser = XHTMLParser()
+self._stx = self.host.plugins["TEXT_SYNTAXES"]
+self._stx.add_syntax(
+self.SYNTAX_NAME, self.parse_wiki, self.parse_xhtml, [self._stx.OPT_NO_THREAD]
+)
+def parse_wiki(self, wiki_stx):
+div_elt = self._dc_parser.parse(wiki_stx)
+return div_elt.toXml()
+def parse_xhtml(self, xhtml):
+return self._xhtml_parser.parseString(xhtml)

Mercurial > libervia-backend

comparison libervia/backend/plugins/plugin_syntax_wiki_dotclear.py @ 4071:4b842c1fb686