Mercurial > libervia-backend
view sat/plugins/plugin_syntax_wiki_dotclear.py @ 2721:4aaa47f62d8d
core (memory/sqlite): fixed v7 update performance issue:
Performance issue was due to bad ordering in table dropping, this has been fixed.
"infos" message are also deleted by this update as they are containing only presence data and take a lot of space for a barely useful data.
A config option may be available in the future to store presence data in logs.
author | Goffi <goffi@goffi.org> |
---|---|
date | Tue, 11 Dec 2018 23:53:27 +0100 |
parents | 56f94936df1e |
children | 003b8b4b56a7 |
line wrap: on
line source
#!/usr/bin/env python2 # -*- coding: utf-8 -*- # SàT plugin for Dotclear Wiki Syntax # Copyright (C) 2009-2018 Jérôme Poisson (goffi@goffi.org) # This program is free software: you can redistribute it and/or modify # it under the terms of the GNU Affero General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU Affero General Public License for more details. # You should have received a copy of the GNU Affero General Public License # along with this program. If not, see <http://www.gnu.org/licenses/>. # XXX: ref used: http://dotclear.org/documentation/2.0/usage/syntaxes#wiki-syntax-and-xhtml-equivalent from sat.core.i18n import _ from sat.core.log import getLogger log = getLogger(__name__) from sat.core.constants import Const as C from sat.core import exceptions from twisted.words.xish import domish from sat.tools import xml_tools import copy import re PLUGIN_INFO = { C.PI_NAME: "Dotclear Wiki Syntax Plugin", C.PI_IMPORT_NAME: "SYNT_DC_WIKI", C.PI_TYPE: C.PLUG_TYPE_SYNTAXE, C.PI_DEPENDENCIES: ["TEXT-SYNTAXES"], C.PI_MAIN: "DCWikiSyntax", C.PI_HANDLER: "", C.PI_DESCRIPTION: _("""Implementation of Dotclear wiki syntax"""), } NOTE_TPL = u"[{}]" # Note template NOTE_A_REV_TPL = u"rev_note_{}" NOTE_A_TPL = u"note_{}" ESCAPE_CHARS_BASE = r"(?P<escape_char>[][{}%|\\/*#@{{}}~$-])" ESCAPE_CHARS_EXTRA = ( r"!?_+'()" ) # These chars are not escaped in XHTML => dc_wiki conversion, # but are used in the other direction ESCAPE_CHARS = ESCAPE_CHARS_BASE.format("") FLAG_UL = "ul" # must be the name of the element FLAG_OL = "ol" ELT_WITH_STYLE = ("img", "div") # elements where a style attribute is expected wiki = [ r"\\" + ESCAPE_CHARS_BASE.format(ESCAPE_CHARS_EXTRA), r"^!!!!!(?P<h1_title>.+?)$", r"^!!!!(?P<h2_title>.+?)$", r"^!!!(?P<h3_title>.+?)$", r"^!!(?P<h4_title>.+?)$", r"^!(?P<h5_title>.+?)$", r"^----$(?P<horizontal_rule>)", r"^\*(?P<list_bullet>.*?)$", r"^#(?P<list_ordered>.*?)$", r"^ (?P<preformated>.*?)$", r"^> +?(?P<quote>.*?)$", r"''(?P<emphasis>.+?)''", r"__(?P<strong_emphasis>.+?)__", r"%%%(?P<line_break>)", r"\+\+(?P<insertion>.+?)\+\+", r"--(?P<deletion>.+?)--", r"\[(?P<link>.+?)\]", r"\(\((?P<image>.+?)\)\)", r"~(?P<anchor>.+?)~", r"\?\?(?P<acronym>.+?\|.+?)\?\?", r"{{(?P<inline_quote>.+?)}}", r"@@(?P<code>.+?)@@", r"\$\$(?P<footnote>.+?)\$\$", r"(?P<text>.+?)", ] wiki_re = re.compile("|".join(wiki), re.MULTILINE | re.DOTALL) wiki_block_level_re = re.compile( r"^///html(?P<html>.+?)///\n\n|(?P<paragraph>.+?)(?:\n{2,}|\Z)", re.MULTILINE | re.DOTALL, ) class DCWikiParser(object): def __init__(self): self._footnotes = None for i in xrange(5): setattr( self, "parser_h{}_title".format(i), lambda string, parent, i=i: self._parser_title( string, parent, "h{}".format(i) ), ) def parser_paragraph(self, string, parent): p_elt = parent.addElement("p") self._parse(string, p_elt) def parser_html(self, string, parent): wrapped_html = "<div>{}</div>".format(string) try: div_elt = xml_tools.ElementParser()(wrapped_html) except domish.ParserError as e: log.warning(u"Error while parsing HTML content, ignoring it: {}".format(e)) return children = list(div_elt.elements()) if len(children) == 1 and children[0].name == "div": div_elt = children[0] parent.addChild(div_elt) def parser_escape_char(self, string, parent): parent.addContent(string) def _parser_title(self, string, parent, name): elt = parent.addElement(name) elt.addContent(string) def parser_horizontal_rule(self, string, parent): parent.addElement("hr") def _parser_list(self, string, parent, list_type): depth = 0 while string[depth : depth + 1] == "*": depth += 1 string = string[depth:].lstrip() for i in xrange(depth + 1): list_elt = getattr(parent, list_type) if not list_elt: parent = parent.addElement(list_type) else: parent = list_elt li_elt = parent.addElement("li") self._parse(string, li_elt) def parser_list_bullet(self, string, parent): self._parser_list(string, parent, "ul") def parser_list_ordered(self, string, parent): self._parser_list(string, parent, "ol") def parser_preformated(self, string, parent): pre_elt = parent.pre if pre_elt is None: pre_elt = parent.addElement("pre") else: # we are on a new line, and this is important for <pre/> pre_elt.addContent("\n") pre_elt.addContent(string) def parser_quote(self, string, parent): blockquote_elt = parent.blockquote if blockquote_elt is None: blockquote_elt = parent.addElement("blockquote") p_elt = blockquote_elt.p if p_elt is None: p_elt = blockquote_elt.addElement("p") else: string = u"\n" + string self._parse(string, p_elt) def parser_emphasis(self, string, parent): em_elt = parent.addElement("em") self._parse(string, em_elt) def parser_strong_emphasis(self, string, parent): strong_elt = parent.addElement("strong") self._parse(string, strong_elt) def parser_line_break(self, string, parent): parent.addElement("br") def parser_insertion(self, string, parent): ins_elt = parent.addElement("ins") self._parse(string, ins_elt) def parser_deletion(self, string, parent): del_elt = parent.addElement("del") self._parse(string, del_elt) def parser_link(self, string, parent): url_data = string.split(u"|") a_elt = parent.addElement("a") length = len(url_data) if length == 1: url = url_data[0] a_elt["href"] = url a_elt.addContent(url) else: name = url_data[0] url = url_data[1] a_elt["href"] = url a_elt.addContent(name) if length >= 3: a_elt["lang"] = url_data[2] if length >= 4: a_elt["title"] = url_data[3] if length > 4: log.warning(u"too much data for url, ignoring extra data") def parser_image(self, string, parent): image_data = string.split(u"|") img_elt = parent.addElement("img") for idx, attribute in enumerate(("src", "alt", "position", "longdesc")): try: data = image_data[idx] except IndexError: break if attribute != "position": img_elt[attribute] = data else: data = data.lower() if data in ("l", "g"): img_elt["style"] = "display:block; float:left; margin:0 1em 1em 0" elif data in ("r", "d"): img_elt["style"] = "display:block; float:right; margin:0 0 1em 1em" elif data == "c": img_elt[ "style" ] = "display:block; margin-left:auto; margin-right:auto" else: log.warning(u"bad position argument for image, ignoring it") def parser_anchor(self, string, parent): a_elt = parent.addElement("a") a_elt["id"] = string def parser_acronym(self, string, parent): acronym, title = string.split(u"|", 1) acronym_elt = parent.addElement("acronym", content=acronym) acronym_elt["title"] = title def parser_inline_quote(self, string, parent): quote_data = string.split(u"|") quote = quote_data[0] q_elt = parent.addElement("q", content=quote) for idx, attribute in enumerate(("lang", "cite"), 1): try: data = quote_data[idx] except IndexError: break q_elt[attribute] = data def parser_code(self, string, parent): parent.addElement("code", content=string) def parser_footnote(self, string, parent): idx = len(self._footnotes) + 1 note_txt = NOTE_TPL.format(idx) sup_elt = parent.addElement("sup") sup_elt["class"] = "note" a_elt = sup_elt.addElement("a", content=note_txt) a_elt["id"] = NOTE_A_REV_TPL.format(idx) a_elt["href"] = u"#{}".format(NOTE_A_TPL.format(idx)) p_elt = domish.Element((None, "p")) a_elt = p_elt.addElement("a", content=note_txt) a_elt["id"] = NOTE_A_TPL.format(idx) a_elt["href"] = u"#{}".format(NOTE_A_REV_TPL.format(idx)) self._parse(string, p_elt) # footnotes are actually added at the end of the parsing self._footnotes.append(p_elt) def parser_text(self, string, parent): parent.addContent(string) def _parse(self, string, parent, block_level=False): regex = wiki_block_level_re if block_level else wiki_re for match in regex.finditer(string): if match.lastgroup is None: parent.addContent(string) return matched = match.group(match.lastgroup) try: parser = getattr(self, "parser_{}".format(match.lastgroup)) except AttributeError: log.warning(u"No parser found for {}".format(match.lastgroup)) # parent.addContent(string) continue parser(matched, parent) def parse(self, string): self._footnotes = [] div_elt = domish.Element((None, "div")) self._parse(string, parent=div_elt, block_level=True) if self._footnotes: foot_div_elt = div_elt.addElement("div") foot_div_elt["class"] = "footnotes" # we add a simple horizontal rule which can be customized # with footnotes class, instead of a text which would need # to be translated foot_div_elt.addElement("hr") for elt in self._footnotes: foot_div_elt.addChild(elt) return div_elt class XHTMLParser(object): def __init__(self): self.flags = None self.toto = 0 self.footnotes = None # will hold a map from url to buffer id for i in xrange(1, 6): setattr( self, "parser_h{}".format(i), lambda elt, buf, level=i: self.parserHeading(elt, buf, level), ) def parser_a(self, elt, buf): try: url = elt["href"] except KeyError: # probably an anchor try: id_ = elt["id"] if not id_: # we don't want empty values raise KeyError except KeyError: self.parserGeneric(elt, buf) else: buf.append(u"~~{}~~".format(id_)) return link_data = [url] name = unicode(elt) if name != url: link_data.insert(0, name) lang = elt.getAttribute("lang") title = elt.getAttribute("title") if lang is not None: link_data.append(lang) elif title is not None: link_data.appand(u"") if title is not None: link_data.append(title) buf.append(u"[") buf.append(u"|".join(link_data)) buf.append(u"]") def parser_acronym(self, elt, buf): try: title = elt["title"] except KeyError: log.debug(u"Acronyme without title, using generic parser") self.parserGeneric(elt, buf) return buf.append(u"??{}|{}??".format(unicode(elt), title)) def parser_blockquote(self, elt, buf): # we remove wrapping <p> to avoid empty line with "> " children = list( [child for child in elt.children if unicode(child).strip() not in ("", "\n")] ) if len(children) == 1 and children[0].name == "p": elt = children[0] tmp_buf = [] self.parseChildren(elt, tmp_buf) blockquote = u"> " + u"\n> ".join(u"".join(tmp_buf).split("\n")) buf.append(blockquote) def parser_br(self, elt, buf): buf.append(u"%%%") def parser_code(self, elt, buf): buf.append(u"@@") self.parseChildren(elt, buf) buf.append(u"@@") def parser_del(self, elt, buf): buf.append(u"--") self.parseChildren(elt, buf) buf.append(u"--") def parser_div(self, elt, buf): if elt.getAttribute("class") == "footnotes": self.parserFootnote(elt, buf) else: self.parseChildren(elt, buf, block=True) def parser_em(self, elt, buf): buf.append(u"''") self.parseChildren(elt, buf) buf.append(u"''") def parser_h6(self, elt, buf): # XXX: <h6/> heading is not managed by wiki syntax # so we handle it with a <h5/> elt = copy.copy(elt) # we don't want to change to original element elt.name = "h5" self._parse(elt, buf) def parser_hr(self, elt, buf): buf.append(u"\n----\n") def parser_img(self, elt, buf): try: url = elt["src"] except KeyError: log.warning(u"Ignoring <img/> without src") return image_data = [url] alt = elt.getAttribute("alt") style = elt.getAttribute("style", "") desc = elt.getAttribute("longdesc") if "0 1em 1em 0" in style: position = "L" elif "0 0 1em 1em" in style: position = "R" elif "auto" in style: position = "C" else: position = None if alt: image_data.append(alt) elif position or desc: image_data.append(u"") if position: image_data.append(position) elif desc: image_data.append(u"") if desc: image_data.append(desc) buf.append(u"((") buf.append(u"|".join(image_data)) buf.append(u"))") def parser_ins(self, elt, buf): buf.append(u"++") self.parseChildren(elt, buf) buf.append(u"++") def parser_li(self, elt, buf): flag = None current_flag = None bullets = [] for flag in reversed(self.flags): if flag in (FLAG_UL, FLAG_OL): if current_flag is None: current_flag = flag if flag == current_flag: bullets.append(u"*" if flag == FLAG_UL else u"#") else: break if flag != current_flag and buf[-1] == u" ": # this trick is to avoid a space when we switch # from (un)ordered to the other type on the same row # e.g. *# unorder + ordered item del buf[-1] buf.extend(bullets) buf.append(u" ") self.parseChildren(elt, buf) buf.append(u"\n") def parser_ol(self, elt, buf): self.parserList(elt, buf, FLAG_OL) def parser_p(self, elt, buf): self.parseChildren(elt, buf) buf.append(u"\n\n") def parser_pre(self, elt, buf): pre = u"".join( [ child.toXml() if domish.IElement.providedBy(child) else unicode(child) for child in elt.children ] ) pre = u" " + u"\n ".join(pre.split("\n")) buf.append(pre) def parser_q(self, elt, buf): quote_data = [unicode(elt)] lang = elt.getAttribute("lang") cite = elt.getAttribute("url") if lang: quote_data.append(lang) elif cite: quote_data.append(u"") if cite: quote_data.append(cite) buf.append(u"{{") buf.append(u"|".join(quote_data)) buf.append(u"}}") def parser_span(self, elt, buf): self.parseChildren(elt, buf, block=True) def parser_strong(self, elt, buf): buf.append(u"__") self.parseChildren(elt, buf) buf.append(u"__") def parser_sup(self, elt, buf): # sup is mainly used for footnotes, so we check if we have an anchor inside children = list( [child for child in elt.children if unicode(child).strip() not in ("", "\n")] ) if ( len(children) == 1 and domish.IElement.providedBy(children[0]) and children[0].name == "a" and "#" in children[0].getAttribute("href", "") ): url = children[0]["href"] note_id = url[url.find("#") + 1 :] if not note_id: log.warning("bad link found in footnote") self.parserGeneric(elt, buf) return # this looks like a footnote buf.append(u"$$") buf.append(u" ") # placeholder self.footnotes[note_id] = len(buf) - 1 buf.append(u"$$") else: self.parserGeneric(elt, buf) def parser_ul(self, elt, buf): self.parserList(elt, buf, FLAG_UL) def parserList(self, elt, buf, type_): self.flags.append(type_) self.parseChildren(elt, buf, block=True) idx = 0 for flag in reversed(self.flags): idx -= 1 if flag == type_: del self.flags[idx] break if idx == 0: raise exceptions.InternalError(u"flag has been removed by an other parser") def parserHeading(self, elt, buf, level): buf.append((6 - level) * u"!") for child in elt.children: # we ignore other elements for a Hx title self.parserText(child, buf) buf.append(u"\n") def parserFootnote(self, elt, buf): for elt in elt.elements(): # all children other than <p/> are ignored if elt.name == "p": a_elt = elt.a if a_elt is None: log.warning( u"<p/> element doesn't contain <a/> in footnote, ignoring it" ) continue try: note_idx = self.footnotes[a_elt["id"]] except KeyError: log.warning(u"Note id doesn't match any known note, ignoring it") # we create a dummy element to parse all children after the <a/> dummy_elt = domish.Element((None, "note")) a_idx = elt.children.index(a_elt) dummy_elt.children = elt.children[a_idx + 1 :] note_buf = [] self.parseChildren(dummy_elt, note_buf) # now we can replace the placeholder buf[note_idx] = u"".join(note_buf) def parserText(self, txt, buf, keep_whitespaces=False): txt = unicode(txt) if not keep_whitespaces: # we get text and only let one inter word space txt = u" ".join(txt.split()) txt = re.sub(ESCAPE_CHARS, r"\\\1", txt) if txt: buf.append(txt) return txt def parserGeneric(self, elt, buf): # as dotclear wiki syntax handle arbitrary XHTML code # we use this feature to add elements that we don't know buf.append(u"\n\n///html\n{}\n///\n\n".format(elt.toXml())) def parseChildren(self, elt, buf, block=False): first_visible = True for child in elt.children: if not block and not first_visible and buf and buf[-1][-1] not in (" ", "\n"): # we add separation if it isn't already there buf.append(u" ") if domish.IElement.providedBy(child): self._parse(child, buf) first_visible = False else: appended = self.parserText(child, buf) if appended: first_visible = False def _parse(self, elt, buf): elt_name = elt.name.lower() style = elt.getAttribute("style") if style and elt_name not in ELT_WITH_STYLE: # if we have style we use generic parser to put raw HTML # to avoid losing it parser = self.parserGeneric else: try: parser = getattr(self, "parser_{}".format(elt_name)) except AttributeError: log.debug( "Can't find parser for {} element, using generic one".format(elt.name) ) parser = self.parserGeneric parser(elt, buf) def parse(self, elt): self.flags = [] self.footnotes = {} buf = [] self._parse(elt, buf) return u"".join(buf) def parseString(self, string): wrapped_html = u"<div>{}</div>".format(string) try: div_elt = xml_tools.ElementParser()(wrapped_html) except domish.ParserError as e: log.warning(u"Error while parsing HTML content: {}".format(e)) return children = list(div_elt.elements()) if len(children) == 1 and children[0].name == "div": div_elt = children[0] return self.parse(div_elt) class DCWikiSyntax(object): SYNTAX_NAME = "wiki_dotclear" def __init__(self, host): log.info(_(u"Dotclear wiki syntax plugin initialization")) self.host = host self._dc_parser = DCWikiParser() self._xhtml_parser = XHTMLParser() self._stx = self.host.plugins["TEXT-SYNTAXES"] self._stx.addSyntax( self.SYNTAX_NAME, self.parseWiki, self.parseXHTML, [self._stx.OPT_NO_THREAD] ) def parseWiki(self, wiki_stx): div_elt = self._dc_parser.parse(wiki_stx) return div_elt.toXml() def parseXHTML(self, xhtml): return self._xhtml_parser.parseString(xhtml)