comparison libervia/backend/plugins/plugin_syntax_wiki_dotclear.py @ 4071:4b842c1fb686

refactoring: renamed `sat` package to `libervia.backend`
author Goffi <goffi@goffi.org>
date Fri, 02 Jun 2023 11:49:51 +0200
parents sat/plugins/plugin_syntax_wiki_dotclear.py@524856bd7b19
children 0d7bb4df2343
comparison
equal deleted inserted replaced
4070:d10748475025 4071:4b842c1fb686
1 #!/usr/bin/env python3
2
3
4 # SàT plugin for Dotclear Wiki Syntax
5 # Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
16
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
20 # XXX: ref used: http://dotclear.org/documentation/2.0/usage/syntaxes#wiki-syntax-and-xhtml-equivalent
21
22 from libervia.backend.core.i18n import _
23 from libervia.backend.core.log import getLogger
24
25 log = getLogger(__name__)
26 from libervia.backend.core.constants import Const as C
27 from libervia.backend.core import exceptions
28 from twisted.words.xish import domish
29 from libervia.backend.tools import xml_tools
30 import copy
31 import re
32
# Plugin metadata; the keys are constants provided by the backend ``Const``
# class (imported as ``C``).
PLUGIN_INFO = {
    C.PI_NAME: "Dotclear Wiki Syntax Plugin",
    C.PI_IMPORT_NAME: "SYNT_DC_WIKI",
    C.PI_TYPE: C.PLUG_TYPE_SYNTAXE,
    # DCWikiSyntax.__init__ looks up this plugin in host.plugins
    C.PI_DEPENDENCIES: ["TEXT_SYNTAXES"],
    C.PI_MAIN: "DCWikiSyntax",
    C.PI_HANDLER: "",
    C.PI_DESCRIPTION: _("Implementation of Dotclear wiki syntax"),
}
42
# Footnote templates: visible marker, then the ids of the two anchors which
# link a note reference and the note itself to each other.
NOTE_TPL = "[{}]"  # Note template
NOTE_A_REV_TPL = "rev_note_{}"
NOTE_A_TPL = "note_{}"

# Character class of escapable characters; the ``{}`` placeholder receives
# optional additional characters.
ESCAPE_CHARS_BASE = r"(?P<escape_char>[][{}%|\\/*#@{{}}~$-])"
# These chars are not escaped in XHTML => dc_wiki conversion,
# but are used in the other direction
ESCAPE_CHARS_EXTRA = r"!?_+'()"
ESCAPE_CHARS = ESCAPE_CHARS_BASE.format("")

FLAG_UL = "ul"  # must be the name of the element
FLAG_OL = "ol"
ELT_WITH_STYLE = ("img", "div")  # elements where a style attribute is expected

# Inline level patterns. Order matters: they are joined as alternatives of a
# single regex, so the first matching alternative wins. The name of each group
# selects the ``parser_<group>`` method of DCWikiParser.
wiki = (
    [r"\\" + ESCAPE_CHARS_BASE.format(ESCAPE_CHARS_EXTRA)]
    # headings: 5 "!" for h1 down to a single "!" for h5, longest prefix first
    + [
        r"^{}(?P<h{}_title>.+?)$".format("!" * (6 - level), level)
        for level in range(1, 6)
    ]
    + [
        r"^----$(?P<horizontal_rule>)",
        r"^\*(?P<list_bullet>.*?)$",
        r"^#(?P<list_ordered>.*?)$",
        r"^ (?P<preformated>.*?)$",
        r"^> +?(?P<quote>.*?)$",
        r"''(?P<emphasis>.+?)''",
        r"__(?P<strong_emphasis>.+?)__",
        r"%%%(?P<line_break>)",
        r"\+\+(?P<insertion>.+?)\+\+",
        r"--(?P<deletion>.+?)--",
        r"\[(?P<link>.+?)\]",
        r"\(\((?P<image>.+?)\)\)",
        r"~(?P<anchor>.+?)~",
        r"\?\?(?P<acronym>.+?\|.+?)\?\?",
        r"{{(?P<inline_quote>.+?)}}",
        r"@@(?P<code>.+?)@@",
        r"\$\$(?P<footnote>.+?)\$\$",
        # fallback: anything else is plain text, consumed one character at a time
        r"(?P<text>.+?)",
    ]
)

wiki_re = re.compile("|".join(wiki), flags=re.MULTILINE | re.DOTALL)

# Block level: either a raw "///html ... ///" section, or a paragraph ended by
# at least one empty line or by the end of the text.
wiki_block_level_re = re.compile(
    r"^///html(?P<html>.+?)///\n\n"
    r"|(?P<paragraph>.+?)(?:\n{2,}|\Z)",
    flags=re.MULTILINE | re.DOTALL,
)
88
89
class DCWikiParser(object):
    """Convert Dotclear wiki syntax to an XHTML element tree.

    The text is first split in blocks with ``wiki_block_level_re``, then each
    block is tokenised with ``wiki_re``. For every match, the name of the last
    matched group selects the ``parser_<group>`` method, which is called with
    the matched string and the element where the result must be added.
    """

    def __init__(self):
        # footnote <p/> elements collected during a parse() call
        self._footnotes = None
        # heading groups of ``wiki`` are named h1_title to h5_title, so the
        # matching parsers must be registered for levels 1 to 5 (not 0 to 4)
        for level in range(1, 6):
            setattr(
                self,
                "parser_h{}_title".format(level),
                lambda string, parent, level=level: self._parser_title(
                    string, parent, "h{}".format(level)
                ),
            )

    def parser_paragraph(self, string, parent):
        """Add a <p/> to parent and parse inline syntax of string inside it."""
        p_elt = parent.addElement("p")
        self._parse(string, p_elt)

    def parser_html(self, string, parent):
        """Add raw (X)HTML content to parent.

        The content is wrapped in a <div/> to get a single root element. If
        the content was already a lone <div/>, the wrapper is discarded.
        Content which can't be parsed is ignored with a warning.
        """
        wrapped_html = "<div>{}</div>".format(string)
        try:
            div_elt = xml_tools.ElementParser()(wrapped_html)
        except domish.ParserError as e:
            log.warning("Error while parsing HTML content, ignoring it: {}".format(e))
            return
        children = list(div_elt.elements())
        if len(children) == 1 and children[0].name == "div":
            div_elt = children[0]
        parent.addChild(div_elt)

    def parser_escape_char(self, string, parent):
        """Add an escaped character as plain text."""
        parent.addContent(string)

    def _parser_title(self, string, parent, name):
        """Add a heading element called name with string as content."""
        elt = parent.addElement(name)
        elt.addContent(string)

    def parser_horizontal_rule(self, string, parent):
        """Add a <hr/> to parent."""
        parent.addElement("hr")

    def _parser_list(self, string, parent, list_type):
        """Add a list item, creating or reusing the enclosing list elements.

        @param string: text following the first list marker
        @param parent: element where the list must be located
        @param list_type: name of the list element ("ul" or "ol")
        """
        # each extra leading "*" adds one level of nesting
        depth = 0
        while string[depth : depth + 1] == "*":
            depth += 1

        string = string[depth:].lstrip()

        # go down the existing lists, creating the missing levels
        for _ in range(depth + 1):
            list_elt = getattr(parent, list_type)
            if not list_elt:
                parent = parent.addElement(list_type)
            else:
                parent = list_elt

        li_elt = parent.addElement("li")
        self._parse(string, li_elt)

    def parser_list_bullet(self, string, parent):
        """Add an item of unordered list."""
        self._parser_list(string, parent, "ul")

    def parser_list_ordered(self, string, parent):
        """Add an item of ordered list."""
        self._parser_list(string, parent, "ol")

    def parser_preformated(self, string, parent):
        """Add a line to the <pre/> of parent, creating it if needed."""
        pre_elt = parent.pre
        if pre_elt is None:
            pre_elt = parent.addElement("pre")
        else:
            # we are on a new line, and this is important for <pre/>
            pre_elt.addContent("\n")
        pre_elt.addContent(string)

    def parser_quote(self, string, parent):
        """Add a line to the <blockquote/> of parent, creating it if needed.

        All lines go to a single <p/> inside the blockquote; following lines
        are separated with a line feed.
        """
        blockquote_elt = parent.blockquote
        if blockquote_elt is None:
            blockquote_elt = parent.addElement("blockquote")
        p_elt = blockquote_elt.p
        if p_elt is None:
            p_elt = blockquote_elt.addElement("p")
        else:
            string = "\n" + string

        self._parse(string, p_elt)

    def parser_emphasis(self, string, parent):
        """Add an <em/> with parsed content."""
        em_elt = parent.addElement("em")
        self._parse(string, em_elt)

    def parser_strong_emphasis(self, string, parent):
        """Add a <strong/> with parsed content."""
        strong_elt = parent.addElement("strong")
        self._parse(string, strong_elt)

    def parser_line_break(self, string, parent):
        """Add a <br/> to parent."""
        parent.addElement("br")

    def parser_insertion(self, string, parent):
        """Add an <ins/> with parsed content."""
        ins_elt = parent.addElement("ins")
        self._parse(string, ins_elt)

    def parser_deletion(self, string, parent):
        """Add a <del/> with parsed content."""
        del_elt = parent.addElement("del")
        self._parse(string, del_elt)

    def parser_link(self, string, parent):
        """Add an <a/> from "url" or "name|url[|lang[|title]]" data."""
        url_data = string.split("|")
        a_elt = parent.addElement("a")
        length = len(url_data)
        if length == 1:
            # only the URL is specified, it is also used as link text
            url = url_data[0]
            a_elt["href"] = url
            a_elt.addContent(url)
        else:
            name = url_data[0]
            url = url_data[1]
            a_elt["href"] = url
            a_elt.addContent(name)
            if length >= 3:
                a_elt["lang"] = url_data[2]
            if length >= 4:
                a_elt["title"] = url_data[3]
            if length > 4:
                log.warning("too much data for url, ignoring extra data")

    def parser_image(self, string, parent):
        """Add an <img/> from "src[|alt[|position[|longdesc]]]" data.

        Position is translated to a style attribute: "l"/"g" for left,
        "r"/"d" for right, "c" for centered (case insensitive).
        """
        image_data = string.split("|")
        img_elt = parent.addElement("img")

        # zip stops at the last specified field, extra fields are ignored
        for attribute, data in zip(("src", "alt", "position", "longdesc"), image_data):
            if attribute != "position":
                img_elt[attribute] = data
            else:
                data = data.lower()
                if data in ("l", "g"):
                    img_elt["style"] = "display:block; float:left; margin:0 1em 1em 0"
                elif data in ("r", "d"):
                    img_elt["style"] = "display:block; float:right; margin:0 0 1em 1em"
                elif data == "c":
                    img_elt[
                        "style"
                    ] = "display:block; margin-left:auto; margin-right:auto"
                else:
                    log.warning("bad position argument for image, ignoring it")

    def parser_anchor(self, string, parent):
        """Add an empty <a/> with string as id."""
        a_elt = parent.addElement("a")
        a_elt["id"] = string

    def parser_acronym(self, string, parent):
        """Add an <acronym/> from "acronym|title" data."""
        # the regex guarantees that at least one "|" is present
        acronym, title = string.split("|", 1)
        acronym_elt = parent.addElement("acronym", content=acronym)
        acronym_elt["title"] = title

    def parser_inline_quote(self, string, parent):
        """Add a <q/> from "quote[|lang[|cite]]" data."""
        quote_data = string.split("|")
        q_elt = parent.addElement("q", content=quote_data[0])
        for attribute, data in zip(("lang", "cite"), quote_data[1:]):
            q_elt[attribute] = data

    def parser_code(self, string, parent):
        """Add a <code/> with string as raw content."""
        parent.addElement("code", content=string)

    def parser_footnote(self, string, parent):
        """Add a footnote reference to parent and store the note itself.

        The reference is a <sup class="note"/> with a link to the note. The
        note is a <p/> starting with a link back to the reference; it is kept
        in self._footnotes and added to the document by parse().
        """
        idx = len(self._footnotes) + 1
        note_txt = NOTE_TPL.format(idx)
        sup_elt = parent.addElement("sup")
        sup_elt["class"] = "note"
        a_elt = sup_elt.addElement("a", content=note_txt)
        a_elt["id"] = NOTE_A_REV_TPL.format(idx)
        a_elt["href"] = "#{}".format(NOTE_A_TPL.format(idx))

        p_elt = domish.Element((None, "p"))
        a_elt = p_elt.addElement("a", content=note_txt)
        a_elt["id"] = NOTE_A_TPL.format(idx)
        a_elt["href"] = "#{}".format(NOTE_A_REV_TPL.format(idx))
        self._parse(string, p_elt)
        # footnotes are actually added at the end of the parsing
        self._footnotes.append(p_elt)

    def parser_text(self, string, parent):
        """Add plain text to parent."""
        parent.addContent(string)

    def _parse(self, string, parent, block_level=False):
        """Tokenise string and dispatch each token to its parser.

        @param string: wiki text to parse
        @param parent: element where the result is added
        @param block_level: True to split in blocks (paragraphs and raw HTML)
            instead of parsing inline syntax
        """
        regex = wiki_block_level_re if block_level else wiki_re

        for match in regex.finditer(string):
            if match.lastgroup is None:
                # no named group matched: the whole string is kept as text
                parent.addContent(string)
                return
            matched = match.group(match.lastgroup)
            try:
                parser = getattr(self, "parser_{}".format(match.lastgroup))
            except AttributeError:
                log.warning("No parser found for {}".format(match.lastgroup))
                continue
            parser(matched, parent)

    def parse(self, string):
        """Convert wiki text to XHTML.

        @param string: text using Dotclear wiki syntax
        @return: <div/> element containing the converted content, followed by
            a <div class="footnotes"/> if at least one footnote was found
        """
        self._footnotes = []
        div_elt = domish.Element((None, "div"))
        self._parse(string, parent=div_elt, block_level=True)
        if self._footnotes:
            foot_div_elt = div_elt.addElement("div")
            foot_div_elt["class"] = "footnotes"
            # we add a simple horizontal rule which can be customized
            # with footnotes class, instead of a text which would need
            # to be translated
            foot_div_elt.addElement("hr")
            for elt in self._footnotes:
                foot_div_elt.addChild(elt)
        return div_elt
309
310
class XHTMLParser(object):
    """Convert an XHTML element tree to Dotclear wiki syntax.

    Each element is handled by the ``parser_<element name>`` method, which
    appends chunks of wiki text to a buffer (a list of strings joined at the
    end of the conversion). Elements without a dedicated parser, or with a
    style attribute which would otherwise be lost, are output as raw HTML.
    """

    def __init__(self):
        # stack of the list types ("ul"/"ol") currently being parsed
        self.flags = None
        self.toto = 0
        # map from note id (fragment of the href of the note reference) to the
        # index in the buffer of the placeholder reserved for the note text
        self.footnotes = None
        for i in range(1, 6):
            setattr(
                self,
                "parser_h{}".format(i),
                lambda elt, buf, level=i: self.parser_heading(elt, buf, level),
            )

    @staticmethod
    def _significant_children(elt):
        """Return children of elt without whitespace-only text nodes.

        Elements are always kept, even when they have no direct text content.
        """
        return [
            child
            for child in elt.children
            if domish.IElement.providedBy(child) or str(child).strip()
        ]

    def parser_a(self, elt, buf):
        """Convert a link to "[name|url|lang|title]" or an anchor to "~id~".

        An <a/> with neither href nor a non empty id is output as raw HTML.
        """
        try:
            url = elt["href"]
        except KeyError:
            # probably an anchor
            id_ = elt.getAttribute("id")
            if id_:
                # the anchor syntax uses single tildes (cf. "anchor" in wiki)
                buf.append("~{}~".format(id_))
            else:
                # we don't want empty values
                self.parser_generic(elt, buf)
            return

        lang = elt.getAttribute("lang")
        title = elt.getAttribute("title")

        link_data = [url]
        name = str(elt)
        # fields are positional (name|url|lang|title): the name can only be
        # omitted when it is the same as the URL and nothing follows the URL
        if name != url or lang is not None or title is not None:
            link_data.insert(0, name)

        if lang is not None:
            link_data.append(lang)
        elif title is not None:
            # empty lang is needed to keep title at the right position
            link_data.append("")
        if title is not None:
            link_data.append(title)
        buf.append("[")
        buf.append("|".join(link_data))
        buf.append("]")

    def parser_acronym(self, elt, buf):
        """Convert to "??acronym|title??", or to raw HTML if title is missing."""
        try:
            title = elt["title"]
        except KeyError:
            log.debug("Acronyme without title, using generic parser")
            self.parser_generic(elt, buf)
            return
        buf.append("??{}|{}??".format(str(elt), title))

    def parser_blockquote(self, elt, buf):
        """Convert to lines prefixed with "> "."""
        # we remove wrapping <p> to avoid empty line with "> "
        children = self._significant_children(elt)
        if (
            len(children) == 1
            # a lone text node has no "name" attribute
            and domish.IElement.providedBy(children[0])
            and children[0].name == "p"
        ):
            elt = children[0]
        tmp_buf = []
        self.parse_children(elt, tmp_buf)
        buf.append("> " + "".join(tmp_buf).replace("\n", "\n> "))

    def parser_br(self, elt, buf):
        """Convert to the "%%%" line break."""
        buf.append("%%%")

    def parser_code(self, elt, buf):
        """Convert to "@@code@@"."""
        buf.append("@@")
        self.parse_children(elt, buf)
        buf.append("@@")

    def parser_del(self, elt, buf):
        """Convert to "--deletion--"."""
        buf.append("--")
        self.parse_children(elt, buf)
        buf.append("--")

    def parser_div(self, elt, buf):
        """Convert children of a <div/>; the footnotes div fills the notes."""
        if elt.getAttribute("class") == "footnotes":
            self.parser_footnote(elt, buf)
        else:
            self.parse_children(elt, buf, block=True)

    def parser_em(self, elt, buf):
        """Convert to "''emphasis''"."""
        buf.append("''")
        self.parse_children(elt, buf)
        buf.append("''")

    def parser_h6(self, elt, buf):
        """Convert a <h6/> like a <h5/>."""
        # XXX: <h6/> heading is not managed by wiki syntax
        #      so we handle it with a <h5/>
        elt = copy.copy(elt)  # we don't want to change to original element
        elt.name = "h5"
        self._parse(elt, buf)

    def parser_hr(self, elt, buf):
        """Convert to a "----" line."""
        buf.append("\n----\n")

    def parser_img(self, elt, buf):
        """Convert to "((src|alt|position|longdesc))".

        Position is deduced from the style attribute. Empty fields are only
        output when a following field is set. <img/> without src is ignored.
        """
        try:
            url = elt["src"]
        except KeyError:
            log.warning("Ignoring <img/> without src")
            return

        image_data = [url]

        alt = elt.getAttribute("alt")
        style = elt.getAttribute("style", "")
        desc = elt.getAttribute("longdesc")

        # those values are the ones set by DCWikiParser.parser_image
        if "0 1em 1em 0" in style:
            position = "L"
        elif "0 0 1em 1em" in style:
            position = "R"
        elif "auto" in style:
            position = "C"
        else:
            position = None

        if alt:
            image_data.append(alt)
        elif position or desc:
            image_data.append("")

        if position:
            image_data.append(position)
        elif desc:
            image_data.append("")

        if desc:
            image_data.append(desc)

        buf.append("((")
        buf.append("|".join(image_data))
        buf.append("))")

    def parser_ins(self, elt, buf):
        """Convert to "++insertion++"."""
        buf.append("++")
        self.parse_children(elt, buf)
        buf.append("++")

    def parser_li(self, elt, buf):
        """Convert a list item to a line starting with its list markers.

        One marker is output for each innermost enclosing list of the same
        type as the closest one.
        """
        flag = None
        current_flag = None
        bullets = []
        for flag in reversed(self.flags):
            if flag in (FLAG_UL, FLAG_OL):
                if current_flag is None:
                    current_flag = flag
                if flag == current_flag:
                    bullets.append("*" if flag == FLAG_UL else "#")
                else:
                    break

        if flag != current_flag and buf and buf[-1] == " ":
            # this trick is to avoid a space when we switch
            # from (un)ordered to the other type on the same row
            # e.g. *# unorder + ordered item
            del buf[-1]

        buf.extend(bullets)

        buf.append(" ")
        self.parse_children(elt, buf)
        buf.append("\n")

    def parser_ol(self, elt, buf):
        """Convert an ordered list."""
        self.parser_list(elt, buf, FLAG_OL)

    def parser_p(self, elt, buf):
        """Convert a paragraph, ended by an empty line."""
        self.parse_children(elt, buf)
        buf.append("\n\n")

    def parser_pre(self, elt, buf):
        """Convert to lines prefixed with a space, content is kept as is."""
        pre = "".join(
            [
                child.toXml() if domish.IElement.providedBy(child) else str(child)
                for child in elt.children
            ]
        )
        pre = " " + "\n ".join(pre.split("\n"))
        buf.append(pre)

    def parser_q(self, elt, buf):
        """Convert to "{{quote|lang|cite}}"."""
        quote_data = [str(elt)]

        lang = elt.getAttribute("lang")
        # "cite" is the attribute set by DCWikiParser.parser_inline_quote
        cite = elt.getAttribute("cite")

        if lang:
            quote_data.append(lang)
        elif cite:
            # empty lang is needed to keep cite at the right position
            quote_data.append("")

        if cite:
            quote_data.append(cite)

        buf.append("{{")
        buf.append("|".join(quote_data))
        buf.append("}}")

    def parser_span(self, elt, buf):
        """Convert children of a <span/>."""
        self.parse_children(elt, buf, block=True)

    def parser_strong(self, elt, buf):
        """Convert to "__strong emphasis__"."""
        buf.append("__")
        self.parse_children(elt, buf)
        buf.append("__")

    def parser_sup(self, elt, buf):
        """Convert a footnote reference to "$$ $$", other <sup/> to raw HTML.

        The space between the markers is a placeholder, its index in buf is
        stored in self.footnotes so parser_footnote can put the note there.
        """
        # sup is mainly used for footnotes, so we check if we have an anchor inside
        children = self._significant_children(elt)
        if (
            len(children) == 1
            and domish.IElement.providedBy(children[0])
            and children[0].name == "a"
            and "#" in children[0].getAttribute("href", "")
        ):
            url = children[0]["href"]
            note_id = url[url.find("#") + 1 :]
            if not note_id:
                log.warning("bad link found in footnote")
                self.parser_generic(elt, buf)
                return
            # this looks like a footnote
            buf.append("$$")
            buf.append(" ")  # placeholder
            self.footnotes[note_id] = len(buf) - 1
            buf.append("$$")
        else:
            self.parser_generic(elt, buf)

    def parser_ul(self, elt, buf):
        """Convert an unordered list."""
        self.parser_list(elt, buf, FLAG_UL)

    def parser_list(self, elt, buf, type_):
        """Convert children of a list with the list type pushed on self.flags.

        @param type_: FLAG_UL or FLAG_OL
        @raise exceptions.InternalError: the flag is not in self.flags anymore
            once children are converted
        """
        self.flags.append(type_)
        self.parse_children(elt, buf, block=True)
        # we remove the last flag of this type
        for idx in range(len(self.flags) - 1, -1, -1):
            if self.flags[idx] == type_:
                del self.flags[idx]
                break
        else:
            raise exceptions.InternalError("flag has been removed by an other parser")

    def parser_heading(self, elt, buf, level):
        """Convert a heading: 5 "!" for level 1 down to one "!" for level 5."""
        buf.append((6 - level) * "!")
        for child in elt.children:
            # we ignore other elements for a Hx title
            self.parser_text(child, buf)
        buf.append("\n")

    def parser_footnote(self, elt, buf):
        """Replace footnote placeholders with the notes found in elt.

        @param elt: element containing one <p/> per note, each one starting
            with an <a/> whose id is the id of the note
        """
        for p_elt in elt.elements():
            # all children other than <p/> are ignored
            if p_elt.name != "p":
                continue
            a_elt = p_elt.a
            if a_elt is None:
                log.warning(
                    "<p/> element doesn't contain <a/> in footnote, ignoring it"
                )
                continue
            try:
                note_idx = self.footnotes[a_elt["id"]]
            except KeyError:
                log.warning("Note id doesn't match any known note, ignoring it")
                # without placeholder, there is nowhere to put this note
                continue
            # we create a dummy element to parse all children after the <a/>
            dummy_elt = domish.Element((None, "note"))
            a_idx = p_elt.children.index(a_elt)
            dummy_elt.children = p_elt.children[a_idx + 1 :]
            note_buf = []
            self.parse_children(dummy_elt, note_buf)
            # now we can replace the placeholder
            buf[note_idx] = "".join(note_buf)

    def parser_text(self, txt, buf, keep_whitespaces=False):
        """Escape text and append it to buf if it is not empty.

        @param txt: text node (converted with str())
        @param keep_whitespaces: if False, every run of whitespaces is replaced
            by a single space, and leading/trailing whitespaces are removed
        @return: the escaped text
        """
        txt = str(txt)
        if not keep_whitespaces:
            # we get text and only let one inter word space
            txt = " ".join(txt.split())
        txt = re.sub(ESCAPE_CHARS, r"\\\1", txt)
        if txt:
            buf.append(txt)
        return txt

    def parser_generic(self, elt, buf):
        """Append elt as raw HTML in a "///html" block."""
        # as dotclear wiki syntax handle arbitrary XHTML code
        # we use this feature to add elements that we don't know
        buf.append("\n\n///html\n{}\n///\n\n".format(elt.toXml()))

    def parse_children(self, elt, buf, block=False):
        """Convert all children of elt.

        @param block: if False, a space is added between children when the
            buffer doesn't already end with a space or a line feed
        """
        first_visible = True
        for child in elt.children:
            if (
                not block
                and not first_visible
                and buf
                and not buf[-1].endswith((" ", "\n"))
            ):
                # we add separation if it isn't already there
                buf.append(" ")
            if domish.IElement.providedBy(child):
                self._parse(child, buf)
                first_visible = False
            else:
                appended = self.parser_text(child, buf)
                if appended:
                    first_visible = False

    def _parse(self, elt, buf):
        """Convert elt with the parser matching its name."""
        elt_name = elt.name.lower()
        style = elt.getAttribute("style")
        if style and elt_name not in ELT_WITH_STYLE:
            # if we have style we use generic parser to put raw HTML
            # to avoid losing it
            parser = self.parser_generic
        else:
            try:
                parser = getattr(self, "parser_{}".format(elt_name))
            except AttributeError:
                log.debug(
                    "Can't find parser for {} element, using generic one".format(elt.name)
                )
                parser = self.parser_generic
        parser(elt, buf)

    def parse(self, elt):
        """Convert an element to wiki syntax.

        @param elt: root element to convert
        @return: text using Dotclear wiki syntax
        """
        self.flags = []
        self.footnotes = {}
        buf = []
        self._parse(elt, buf)
        return "".join(buf)

    def parseString(self, string):
        """Convert serialised XHTML to wiki syntax.

        @param string: XHTML to convert; it is wrapped in a <div/>, which is
            discarded if the content was already a lone <div/>
        @return: text using Dotclear wiki syntax, or None if the XHTML can't
            be parsed
        """
        wrapped_html = "<div>{}</div>".format(string)
        try:
            div_elt = xml_tools.ElementParser()(wrapped_html)
        except domish.ParserError as e:
            log.warning("Error while parsing HTML content: {}".format(e))
            return None
        children = list(div_elt.elements())
        if len(children) == 1 and children[0].name == "div":
            div_elt = children[0]
        return self.parse(div_elt)
658
659
class DCWikiSyntax(object):
    """Plugin entry point: registers the Dotclear wiki syntax converters."""

    SYNTAX_NAME = "wiki_dotclear"

    def __init__(self, host):
        """Create both parsers and declare the syntax to TEXT_SYNTAXES plugin.

        @param host: backend instance, its ``plugins`` mapping must contain
            the "TEXT_SYNTAXES" plugin
        """
        log.info(_("Dotclear wiki syntax plugin initialization"))
        self.host = host
        self._dc_parser = DCWikiParser()
        self._xhtml_parser = XHTMLParser()
        text_syntaxes = self.host.plugins["TEXT_SYNTAXES"]
        self._stx = text_syntaxes
        options = [text_syntaxes.OPT_NO_THREAD]
        text_syntaxes.add_syntax(
            self.SYNTAX_NAME, self.parse_wiki, self.parse_xhtml, options
        )

    def parse_wiki(self, wiki_stx):
        """Convert Dotclear wiki text to serialised XHTML."""
        return self._dc_parser.parse(wiki_stx).toXml()

    def parse_xhtml(self, xhtml):
        """Convert serialised XHTML to Dotclear wiki text."""
        converted = self._xhtml_parser.parseString(xhtml)
        return converted