Mercurial > libervia-backend
view libervia/backend/plugins/plugin_syntax_wiki_dotclear.py @ 4095:684ba556a617
core (memory/sqla_mapping): fix legacy pickled values:
following packages refactoring, legacy pickled values could not be unpickled (due to use of
old classes). This temporary workaround fixes it, but the right thing to do will be to move
from pickle to JSON at some point.
author | Goffi <goffi@goffi.org> |
---|---|
date | Mon, 12 Jun 2023 14:57:27 +0200 |
parents | 4b842c1fb686 |
children | 0d7bb4df2343 |
line wrap: on
line source
#!/usr/bin/env python3

# SàT plugin for Dotclear Wiki Syntax
# Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

# XXX: ref used: http://dotclear.org/documentation/2.0/usage/syntaxes#wiki-syntax-and-xhtml-equivalent

from libervia.backend.core.i18n import _
from libervia.backend.core.log import getLogger

log = getLogger(__name__)
from libervia.backend.core.constants import Const as C
from libervia.backend.core import exceptions
from twisted.words.xish import domish
from libervia.backend.tools import xml_tools
import copy
import re

PLUGIN_INFO = {
    C.PI_NAME: "Dotclear Wiki Syntax Plugin",
    C.PI_IMPORT_NAME: "SYNT_DC_WIKI",
    C.PI_TYPE: C.PLUG_TYPE_SYNTAXE,
    C.PI_DEPENDENCIES: ["TEXT_SYNTAXES"],
    C.PI_MAIN: "DCWikiSyntax",
    C.PI_HANDLER: "",
    C.PI_DESCRIPTION: _("""Implementation of Dotclear wiki syntax"""),
}

NOTE_TPL = "[{}]"  # Note template
NOTE_A_REV_TPL = "rev_note_{}"
NOTE_A_TPL = "note_{}"
ESCAPE_CHARS_BASE = r"(?P<escape_char>[][{}%|\\/*#@{{}}~$-])"
ESCAPE_CHARS_EXTRA = (
    r"!?_+'()"
)  # These chars are not escaped in XHTML => dc_wiki conversion,
# but are used in the other direction
ESCAPE_CHARS = ESCAPE_CHARS_BASE.format("")
FLAG_UL = "ul"  # must be the name of the element
FLAG_OL = "ol"
ELT_WITH_STYLE = ("img", "div")  # elements where a style attribute is expected

# Inline-level alternatives. Each named group maps to a ``parser_<group>``
# method of DCWikiParser; empty groups (e.g. ``horizontal_rule``) only exist
# so that ``match.lastgroup`` is set for constructs without a payload.
wiki = [
    r"\\" + ESCAPE_CHARS_BASE.format(ESCAPE_CHARS_EXTRA),
    r"^!!!!!(?P<h1_title>.+?)$",
    r"^!!!!(?P<h2_title>.+?)$",
    r"^!!!(?P<h3_title>.+?)$",
    r"^!!(?P<h4_title>.+?)$",
    r"^!(?P<h5_title>.+?)$",
    r"^----$(?P<horizontal_rule>)",
    r"^\*(?P<list_bullet>.*?)$",
    r"^#(?P<list_ordered>.*?)$",
    r"^ (?P<preformated>.*?)$",
    r"^> +?(?P<quote>.*?)$",
    r"''(?P<emphasis>.+?)''",
    r"__(?P<strong_emphasis>.+?)__",
    r"%%%(?P<line_break>)",
    r"\+\+(?P<insertion>.+?)\+\+",
    r"--(?P<deletion>.+?)--",
    r"\[(?P<link>.+?)\]",
    r"\(\((?P<image>.+?)\)\)",
    r"~(?P<anchor>.+?)~",
    r"\?\?(?P<acronym>.+?\|.+?)\?\?",
    r"{{(?P<inline_quote>.+?)}}",
    r"@@(?P<code>.+?)@@",
    r"\$\$(?P<footnote>.+?)\$\$",
    r"(?P<text>.+?)",
]

wiki_re = re.compile("|".join(wiki), re.MULTILINE | re.DOTALL)
wiki_block_level_re = re.compile(
    r"^///html(?P<html>.+?)///\n\n|(?P<paragraph>.+?)(?:\n{2,}|\Z)",
    re.MULTILINE | re.DOTALL,
)


class DCWikiParser(object):
    """Convert Dotclear wiki syntax to a domish XHTML tree.

    Dispatching is name based: for every regex match, the name of the last
    matched group selects the ``parser_<group>`` method to call.
    """

    def __init__(self):
        self._footnotes = None
        # The regex groups are h1_title..h5_title, so the parsers must be
        # registered for levels 1 to 5 (range(5) left h5_title without parser).
        for i in range(1, 6):
            setattr(
                self,
                "parser_h{}_title".format(i),
                lambda string, parent, i=i: self._parser_title(
                    string, parent, "h{}".format(i)
                ),
            )

    def parser_paragraph(self, string, parent):
        """Add a <p/> to parent and parse inline syntax into it."""
        p_elt = parent.addElement("p")
        self._parse(string, p_elt)

    def parser_html(self, string, parent):
        """Parse a raw ``///html`` section and attach it to parent.

        Content which can't be parsed is logged and ignored.
        """
        wrapped_html = "<div>{}</div>".format(string)
        try:
            div_elt = xml_tools.ElementParser()(wrapped_html)
        except domish.ParserError as e:
            log.warning("Error while parsing HTML content, ignoring it: {}".format(e))
            return
        children = list(div_elt.elements())
        if len(children) == 1 and children[0].name == "div":
            # the content is already wrapped in a single <div/>, we use it
            div_elt = children[0]
        parent.addChild(div_elt)

    def parser_escape_char(self, string, parent):
        """Add the escaped character as plain content."""
        parent.addContent(string)

    def _parser_title(self, string, parent, name):
        """Add a heading element called name with string as content."""
        elt = parent.addElement(name)
        elt.addContent(string)

    def parser_horizontal_rule(self, string, parent):
        parent.addElement("hr")

    def _parser_list(self, string, parent, list_type):
        """Add a list item, creating or reusing list_type elements as needed.

        Leading "*" characters of string are counted to get the nesting depth.
        """
        depth = 0
        while string[depth : depth + 1] == "*":
            depth += 1

        string = string[depth:].lstrip()

        for _idx in range(depth + 1):
            list_elt = getattr(parent, list_type)
            if not list_elt:
                parent = parent.addElement(list_type)
            else:
                parent = list_elt

        li_elt = parent.addElement("li")
        self._parse(string, li_elt)

    def parser_list_bullet(self, string, parent):
        self._parser_list(string, parent, "ul")

    def parser_list_ordered(self, string, parent):
        self._parser_list(string, parent, "ol")

    def parser_preformated(self, string, parent):
        """Append a line to the <pre/> child of parent, creating it if needed."""
        pre_elt = parent.pre
        if pre_elt is None:
            pre_elt = parent.addElement("pre")
        else:
            # we are on a new line, and this is important for <pre/>
            pre_elt.addContent("\n")
        pre_elt.addContent(string)

    def parser_quote(self, string, parent):
        """Append a line to the <blockquote/><p/> of parent, creating them if needed."""
        blockquote_elt = parent.blockquote
        if blockquote_elt is None:
            blockquote_elt = parent.addElement("blockquote")
        p_elt = blockquote_elt.p
        if p_elt is None:
            p_elt = blockquote_elt.addElement("p")
        else:
            string = "\n" + string

        self._parse(string, p_elt)

    def parser_emphasis(self, string, parent):
        em_elt = parent.addElement("em")
        self._parse(string, em_elt)

    def parser_strong_emphasis(self, string, parent):
        strong_elt = parent.addElement("strong")
        self._parse(string, strong_elt)

    def parser_line_break(self, string, parent):
        parent.addElement("br")

    def parser_insertion(self, string, parent):
        ins_elt = parent.addElement("ins")
        self._parse(string, ins_elt)

    def parser_deletion(self, string, parent):
        del_elt = parent.addElement("del")
        self._parse(string, del_elt)

    def parser_link(self, string, parent):
        """Add an <a/> from ``url`` or ``name|url[|lang[|title]]`` data."""
        url_data = string.split("|")
        a_elt = parent.addElement("a")
        length = len(url_data)
        if length == 1:
            url = url_data[0]
            a_elt["href"] = url
            a_elt.addContent(url)
        else:
            name = url_data[0]
            url = url_data[1]
            a_elt["href"] = url
            a_elt.addContent(name)
            if length >= 3:
                a_elt["lang"] = url_data[2]
            if length >= 4:
                a_elt["title"] = url_data[3]
            if length > 4:
                log.warning("too much data for url, ignoring extra data")

    def parser_image(self, string, parent):
        """Add an <img/> from ``src[|alt[|position[|longdesc]]]`` data."""
        image_data = string.split("|")
        img_elt = parent.addElement("img")

        for idx, attribute in enumerate(("src", "alt", "position", "longdesc")):
            try:
                data = image_data[idx]
            except IndexError:
                break

            if attribute != "position":
                img_elt[attribute] = data
            else:
                # position is translated to an inline style
                data = data.lower()
                if data in ("l", "g"):
                    img_elt["style"] = "display:block; float:left; margin:0 1em 1em 0"
                elif data in ("r", "d"):
                    img_elt["style"] = "display:block; float:right; margin:0 0 1em 1em"
                elif data == "c":
                    img_elt[
                        "style"
                    ] = "display:block; margin-left:auto; margin-right:auto"
                else:
                    log.warning("bad position argument for image, ignoring it")

    def parser_anchor(self, string, parent):
        a_elt = parent.addElement("a")
        a_elt["id"] = string

    def parser_acronym(self, string, parent):
        acronym, title = string.split("|", 1)
        acronym_elt = parent.addElement("acronym", content=acronym)
        acronym_elt["title"] = title

    def parser_inline_quote(self, string, parent):
        """Add a <q/> from ``quote[|lang[|cite]]`` data."""
        quote_data = string.split("|")
        quote = quote_data[0]
        q_elt = parent.addElement("q", content=quote)
        for idx, attribute in enumerate(("lang", "cite"), 1):
            try:
                data = quote_data[idx]
            except IndexError:
                break
            q_elt[attribute] = data

    def parser_code(self, string, parent):
        parent.addElement("code", content=string)

    def parser_footnote(self, string, parent):
        """Add a footnote reference to parent and store the note itself.

        The note is kept in self._footnotes and attached to the document at
        the end of parse().
        """
        idx = len(self._footnotes) + 1
        note_txt = NOTE_TPL.format(idx)
        sup_elt = parent.addElement("sup")
        sup_elt["class"] = "note"
        a_elt = sup_elt.addElement("a", content=note_txt)
        a_elt["id"] = NOTE_A_REV_TPL.format(idx)
        a_elt["href"] = "#{}".format(NOTE_A_TPL.format(idx))

        p_elt = domish.Element((None, "p"))
        a_elt = p_elt.addElement("a", content=note_txt)
        a_elt["id"] = NOTE_A_TPL.format(idx)
        a_elt["href"] = "#{}".format(NOTE_A_REV_TPL.format(idx))
        self._parse(string, p_elt)
        # footnotes are actually added at the end of the parsing
        self._footnotes.append(p_elt)

    def parser_text(self, string, parent):
        parent.addContent(string)

    def _parse(self, string, parent, block_level=False):
        """Run the block level or inline regex on string and dispatch matches.

        @param string: wiki text to parse
        @param parent: element receiving the parsed content
        @param block_level: True to use wiki_block_level_re instead of wiki_re
        """
        regex = wiki_block_level_re if block_level else wiki_re

        for match in regex.finditer(string):
            if match.lastgroup is None:
                parent.addContent(string)
                return
            matched = match.group(match.lastgroup)
            try:
                parser = getattr(self, "parser_{}".format(match.lastgroup))
            except AttributeError:
                log.warning("No parser found for {}".format(match.lastgroup))
                continue
            parser(matched, parent)

    def parse(self, string):
        """Parse wiki text and return the resulting <div/> element."""
        self._footnotes = []
        div_elt = domish.Element((None, "div"))
        self._parse(string, parent=div_elt, block_level=True)
        if self._footnotes:
            foot_div_elt = div_elt.addElement("div")
            foot_div_elt["class"] = "footnotes"
            # we add a simple horizontal rule which can be customized
            # with footnotes class, instead of a text which would need
            # to be translated
            foot_div_elt.addElement("hr")
            for elt in self._footnotes:
                foot_div_elt.addChild(elt)
        return div_elt


class XHTMLParser(object):
    """Convert a domish XHTML tree to Dotclear wiki syntax.

    Each element is handled by the ``parser_<element name>`` method when it
    exists, and by parser_generic (raw ``///html`` section) otherwise. Parsers
    append their output to a shared list of strings (``buf``).
    """

    def __init__(self):
        self.flags = None
        self.toto = 0  # not read anywhere in this module, kept as is
        self.footnotes = None  # will hold a map from url to buffer id
        for i in range(1, 6):
            setattr(
                self,
                "parser_h{}".format(i),
                lambda elt, buf, level=i: self.parser_heading(elt, buf, level),
            )

    def parser_a(self, elt, buf):
        """Convert <a/> to a wiki link, or to an anchor when there is no href."""
        try:
            url = elt["href"]
        except KeyError:
            # probably an anchor
            try:
                id_ = elt["id"]
                if not id_:
                    # we don't want empty values
                    raise KeyError
            except KeyError:
                self.parser_generic(elt, buf)
            else:
                # single tildes, to match the "~(?P<anchor>.+?)~" wiki regex
                buf.append("~{}~".format(id_))
            return

        lang = elt.getAttribute("lang")
        title = elt.getAttribute("title")

        link_data = [url]
        name = str(elt)
        # Fields are positional (name|url|lang|title): as soon as lang or
        # title follows, the name must be present even if it equals the url,
        # otherwise the url would be read back as the name.
        if name != url or lang is not None or title is not None:
            link_data.insert(0, name)

        if lang is not None:
            link_data.append(lang)
        elif title is not None:
            # empty lang placeholder, so title stays at its position
            link_data.append("")
        if title is not None:
            link_data.append(title)
        buf.append("[")
        buf.append("|".join(link_data))
        buf.append("]")

    def parser_acronym(self, elt, buf):
        try:
            title = elt["title"]
        except KeyError:
            log.debug("Acronyme without title, using generic parser")
            self.parser_generic(elt, buf)
            return
        buf.append("??{}|{}??".format(str(elt), title))

    def parser_blockquote(self, elt, buf):
        """Convert <blockquote/> to lines prefixed with "> "."""
        # we remove wrapping <p> to avoid empty line with "> "
        children = [child for child in elt.children if str(child).strip()]
        if (
            len(children) == 1
            and domish.IElement.providedBy(children[0])
            and children[0].name == "p"
        ):
            elt = children[0]
        tmp_buf = []
        self.parse_children(elt, tmp_buf)
        blockquote = "> " + "\n> ".join("".join(tmp_buf).split("\n"))
        buf.append(blockquote)

    def parser_br(self, elt, buf):
        buf.append("%%%")

    def parser_code(self, elt, buf):
        buf.append("@@")
        self.parse_children(elt, buf)
        buf.append("@@")

    def parser_del(self, elt, buf):
        buf.append("--")
        self.parse_children(elt, buf)
        buf.append("--")

    def parser_div(self, elt, buf):
        if elt.getAttribute("class") == "footnotes":
            self.parser_footnote(elt, buf)
        else:
            self.parse_children(elt, buf, block=True)

    def parser_em(self, elt, buf):
        buf.append("''")
        self.parse_children(elt, buf)
        buf.append("''")

    def parser_h6(self, elt, buf):
        # XXX: <h6/> heading is not managed by wiki syntax
        #      so we handle it with a <h5/>
        elt = copy.copy(elt)  # we don't want to change to original element
        elt.name = "h5"
        self._parse(elt, buf)

    def parser_hr(self, elt, buf):
        buf.append("\n----\n")

    def parser_img(self, elt, buf):
        """Convert <img/> to ``((src|alt|position|longdesc))``.

        Position is deduced from the style attribute; empty placeholders are
        added when a later field is set while an earlier one is missing.
        """
        try:
            url = elt["src"]
        except KeyError:
            log.warning("Ignoring <img/> without src")
            return

        image_data = [url]

        alt = elt.getAttribute("alt")
        style = elt.getAttribute("style", "")
        desc = elt.getAttribute("longdesc")

        if "0 1em 1em 0" in style:
            position = "L"
        elif "0 0 1em 1em" in style:
            position = "R"
        elif "auto" in style:
            position = "C"
        else:
            position = None

        if alt:
            image_data.append(alt)
        elif position or desc:
            image_data.append("")

        if position:
            image_data.append(position)
        elif desc:
            image_data.append("")

        if desc:
            image_data.append(desc)

        buf.append("((")
        buf.append("|".join(image_data))
        buf.append("))")

    def parser_ins(self, elt, buf):
        buf.append("++")
        self.parse_children(elt, buf)
        buf.append("++")

    def parser_li(self, elt, buf):
        """Convert <li/> to a wiki list line, using self.flags for the markers."""
        flag = None
        current_flag = None
        bullets = []
        for flag in reversed(self.flags):
            if flag in (FLAG_UL, FLAG_OL):
                if current_flag is None:
                    current_flag = flag
                if flag == current_flag:
                    bullets.append("*" if flag == FLAG_UL else "#")
                else:
                    break

        if flag != current_flag and buf and buf[-1] == " ":
            # this trick is to avoid a space when we switch
            # from (un)ordered to the other type on the same row
            # e.g. *# unorder + ordered item
            del buf[-1]

        buf.extend(bullets)

        buf.append(" ")
        self.parse_children(elt, buf)
        buf.append("\n")

    def parser_ol(self, elt, buf):
        self.parser_list(elt, buf, FLAG_OL)

    def parser_p(self, elt, buf):
        self.parse_children(elt, buf)
        buf.append("\n\n")

    def parser_pre(self, elt, buf):
        """Convert <pre/> to lines prefixed with a space."""
        pre = "".join(
            [
                child.toXml() if domish.IElement.providedBy(child) else str(child)
                for child in elt.children
            ]
        )
        pre = " " + "\n ".join(pre.split("\n"))
        buf.append(pre)

    def parser_q(self, elt, buf):
        """Convert <q/> to ``{{quote|lang|cite}}``."""
        quote_data = [str(elt)]
        lang = elt.getAttribute("lang")
        # DCWikiParser.parser_inline_quote stores the source in "cite";
        # "url" was read here before and is kept as a fallback.
        cite = elt.getAttribute("cite") or elt.getAttribute("url")
        if lang:
            quote_data.append(lang)
        elif cite:
            # empty lang placeholder, so cite stays at its position
            quote_data.append("")
        if cite:
            quote_data.append(cite)
        buf.append("{{")
        buf.append("|".join(quote_data))
        buf.append("}}")

    def parser_span(self, elt, buf):
        self.parse_children(elt, buf, block=True)

    def parser_strong(self, elt, buf):
        buf.append("__")
        self.parse_children(elt, buf)
        buf.append("__")

    def parser_sup(self, elt, buf):
        """Convert a footnote reference, or fall back to the generic parser.

        For a footnote, a placeholder is added to buf and its index is stored
        in self.footnotes; parser_footnote replaces it later.
        """
        # sup is mainly used for footnotes, so we check if we have an anchor inside
        children = [child for child in elt.children if str(child).strip()]
        if (
            len(children) == 1
            and domish.IElement.providedBy(children[0])
            and children[0].name == "a"
            and "#" in children[0].getAttribute("href", "")
        ):
            url = children[0]["href"]
            note_id = url[url.find("#") + 1 :]
            if not note_id:
                log.warning("bad link found in footnote")
                self.parser_generic(elt, buf)
                return
            # this looks like a footnote
            buf.append("$$")
            buf.append(" ")  # placeholder
            self.footnotes[note_id] = len(buf) - 1
            buf.append("$$")
        else:
            self.parser_generic(elt, buf)

    def parser_ul(self, elt, buf):
        self.parser_list(elt, buf, FLAG_UL)

    def parser_list(self, elt, buf, type_):
        """Parse children of a list with type_ pushed on self.flags.

        @raise exceptions.InternalError: the flag is not in self.flags anymore
            once the children have been parsed
        """
        self.flags.append(type_)
        self.parse_children(elt, buf, block=True)
        idx = 0
        for flag in reversed(self.flags):
            idx -= 1
            if flag == type_:
                del self.flags[idx]
                break
        else:
            # for/else also covers a non empty list which lacks the flag
            raise exceptions.InternalError("flag has been removed by an other parser")

    def parser_heading(self, elt, buf, level):
        buf.append((6 - level) * "!")
        for child in elt.children:
            # we ignore other elements for a Hx title
            self.parser_text(child, buf)
        buf.append("\n")

    def parser_footnote(self, elt, buf):
        """Replace footnote placeholders of buf with the content of the notes."""
        for p_elt in elt.elements():
            # all children other than <p/> are ignored
            if p_elt.name != "p":
                continue
            a_elt = p_elt.a
            if a_elt is None:
                log.warning(
                    "<p/> element doesn't contain <a/> in footnote, ignoring it"
                )
                continue
            try:
                note_idx = self.footnotes[a_elt["id"]]
            except KeyError:
                log.warning("Note id doesn't match any known note, ignoring it")
                # without placeholder index there is nothing to replace
                continue
            # we create a dummy element to parse all children after the <a/>
            dummy_elt = domish.Element((None, "note"))
            a_idx = p_elt.children.index(a_elt)
            dummy_elt.children = p_elt.children[a_idx + 1 :]
            note_buf = []
            self.parse_children(dummy_elt, note_buf)
            # now we can replace the placeholder
            buf[note_idx] = "".join(note_buf)

    def parser_text(self, txt, buf, keep_whitespaces=False):
        """Escape txt and append it to buf if it is not empty.

        @param keep_whitespaces: if False, runs of whitespace are replaced by
            a single space
        @return: the escaped text (may be empty)
        """
        txt = str(txt)
        if not keep_whitespaces:
            # we get text and only let one inter word space
            txt = " ".join(txt.split())
        txt = re.sub(ESCAPE_CHARS, r"\\\1", txt)
        if txt:
            buf.append(txt)
        return txt

    def parser_generic(self, elt, buf):
        # as dotclear wiki syntax handle arbitrary XHTML code
        # we use this feature to add elements that we don't know
        buf.append("\n\n///html\n{}\n///\n\n".format(elt.toXml()))

    def parse_children(self, elt, buf, block=False):
        """Parse all children of elt (elements and text) into buf.

        @param block: if False, a space is added between children when the
            buffer doesn't already end with a space or a new line
        """
        first_visible = True
        for child in elt.children:
            if (
                not block
                and not first_visible
                and buf
                # endswith is used as the last chunk may be an empty string
                and not buf[-1].endswith((" ", "\n"))
            ):
                # we add separation if it isn't already there
                buf.append(" ")
            if domish.IElement.providedBy(child):
                self._parse(child, buf)
                first_visible = False
            else:
                appended = self.parser_text(child, buf)
                if appended:
                    first_visible = False

    def _parse(self, elt, buf):
        """Select the parser for elt and run it."""
        elt_name = elt.name.lower()
        style = elt.getAttribute("style")
        if style and elt_name not in ELT_WITH_STYLE:
            # if we have style we use generic parser to put raw HTML
            # to avoid losing it
            parser = self.parser_generic
        else:
            try:
                parser = getattr(self, "parser_{}".format(elt_name))
            except AttributeError:
                log.debug(
                    "Can't find parser for {} element, using generic one".format(elt.name)
                )
                parser = self.parser_generic
        parser(elt, buf)

    def parse(self, elt):
        """Convert elt to wiki syntax and return it as a string."""
        self.flags = []
        self.footnotes = {}
        buf = []
        self._parse(elt, buf)
        return "".join(buf)

    def parseString(self, string):
        """Parse an XHTML string and return its wiki syntax conversion.

        @return: wiki text, or None if the XHTML can't be parsed
        """
        wrapped_html = "<div>{}</div>".format(string)
        try:
            div_elt = xml_tools.ElementParser()(wrapped_html)
        except domish.ParserError as e:
            log.warning("Error while parsing HTML content: {}".format(e))
            return
        children = list(div_elt.elements())
        if len(children) == 1 and children[0].name == "div":
            # the content is already wrapped in a single <div/>, we use it
            div_elt = children[0]
        return self.parse(div_elt)


class DCWikiSyntax(object):
    """Plugin entry point: registers the Dotclear wiki syntax converters."""

    SYNTAX_NAME = "wiki_dotclear"

    def __init__(self, host):
        log.info(_("Dotclear wiki syntax plugin initialization"))
        self.host = host
        self._dc_parser = DCWikiParser()
        self._xhtml_parser = XHTMLParser()
        self._stx = self.host.plugins["TEXT_SYNTAXES"]
        self._stx.add_syntax(
            self.SYNTAX_NAME, self.parse_wiki, self.parse_xhtml, [self._stx.OPT_NO_THREAD]
        )

    def parse_wiki(self, wiki_stx):
        """Convert wiki syntax to serialised XHTML."""
        div_elt = self._dc_parser.parse(wiki_stx)
        return div_elt.toXml()

    def parse_xhtml(self, xhtml):
        """Convert an XHTML string to wiki syntax."""
        return self._xhtml_parser.parseString(xhtml)