comparison sat/plugins/plugin_syntax_wiki_dotclear.py @ 2562:26edcf3a30eb

core, setup: huge cleaning: - moved directories from src and frontends/src to sat and sat_frontends, which is the recommended naming convention - moved twisted directory to root - removed all hacks from setup.py, and added missing dependencies, it is now clean - use https URL for website in setup.py - removed "Environment :: X11 Applications :: GTK", as wix is deprecated and removed - renamed sat.sh to sat and fixed its installation - added python_requires to specify the Python version needed - replaced glib2reactor, which uses deprecated code, by gtk3reactor. sat can now be installed directly from a virtualenv without using --system-site-packages anymore \o/
author Goffi <goffi@goffi.org>
date Mon, 02 Apr 2018 19:44:50 +0200
parents src/plugins/plugin_syntax_wiki_dotclear.py@0046283a285d
children 56f94936df1e
comparison
equal deleted inserted replaced
2561:bd30dc3ffe5a 2562:26edcf3a30eb
1 #!/usr/bin/env python2
2 # -*- coding: utf-8 -*-
3
4 # SàT plugin for Dotclear Wiki Syntax
5 # Copyright (C) 2009-2018 Jérôme Poisson (goffi@goffi.org)
6
7 # This program is free software: you can redistribute it and/or modify
8 # it under the terms of the GNU Affero General Public License as published by
9 # the Free Software Foundation, either version 3 of the License, or
10 # (at your option) any later version.
11
12 # This program is distributed in the hope that it will be useful,
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 # GNU Affero General Public License for more details.
16
17 # You should have received a copy of the GNU Affero General Public License
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
19
20 # XXX: ref used: http://dotclear.org/documentation/2.0/usage/syntaxes#wiki-syntax-and-xhtml-equivalent
21
22 from sat.core.i18n import _
23 from sat.core.log import getLogger
24 log = getLogger(__name__)
25 from sat.core.constants import Const as C
26 from sat.core import exceptions
27 from twisted.words.xish import domish
28 from sat.tools import xml_tools
29 import copy
30 import re
31
# Plugin metadata: the keys are constants taken from sat.core.constants.Const
PLUGIN_INFO = {
    C.PI_NAME: "Dotclear Wiki Syntax Plugin",
    C.PI_IMPORT_NAME: "SYNT_DC_WIKI",
    C.PI_TYPE: C.PLUG_TYPE_SYNTAXE,
    # DCWikiSyntax.__init__ looks up host.plugins["TEXT-SYNTAXES"]
    C.PI_DEPENDENCIES: ["TEXT-SYNTAXES"],
    # name of the class defined at the end of this module
    C.PI_MAIN: "DCWikiSyntax",
    C.PI_HANDLER: "",
    C.PI_DESCRIPTION: _("Implementation of Dotclear wiki syntax"),
}
41
# --- footnotes ---------------------------------------------------------------
# Text of a footnote marker, and ids of the two anchors linking the marker in
# the text and the note itself to each other.
NOTE_TPL = u'[{}]'
NOTE_A_REV_TPL = u'rev_note_{}'
NOTE_A_TPL = u'note_{}'

# --- escaping ----------------------------------------------------------------
# ESCAPE_CHARS_BASE is a *format template*: the "{}" placeholder receives extra
# characters to put in the character class, "{{}}" stands for literal braces.
ESCAPE_CHARS_BASE = r"(?P<escape_char>[][{}%|\\/*#@{{}}~$-])"
# These chars are not escaped in XHTML => dc_wiki conversion,
# but are used in the other direction
ESCAPE_CHARS_EXTRA = r"!?_+'()"
ESCAPE_CHARS = ESCAPE_CHARS_BASE.format('')

# --- XHTML => wiki helpers ---------------------------------------------------
FLAG_UL = 'ul'  # must be the name of the element
FLAG_OL = 'ol'
ELT_WITH_STYLE = ('img', 'div')  # elements where a style attribute is expected

# --- wiki => XHTML patterns --------------------------------------------------
# Each alternative owns exactly one named group; the parser dispatches on the
# name of the group which matched.  Order matters: earlier alternatives win.
wiki = [
    # escaped character (backslash followed by a special char)
    r'\\' + ESCAPE_CHARS_BASE.format(ESCAPE_CHARS_EXTRA),
]
wiki += [
    # constructs anchored at the beginning of a line
    r"^!!!!!(?P<h1_title>.+?)$",
    r"^!!!!(?P<h2_title>.+?)$",
    r"^!!!(?P<h3_title>.+?)$",
    r"^!!(?P<h4_title>.+?)$",
    r"^!(?P<h5_title>.+?)$",
    r"^----$(?P<horizontal_rule>)",
    r"^\*(?P<list_bullet>.*?)$",
    r"^#(?P<list_ordered>.*?)$",
    r"^ (?P<preformated>.*?)$",
    r"^> +?(?P<quote>.*?)$",
]
wiki += [
    # inline constructs
    r"''(?P<emphasis>.+?)''",
    r"__(?P<strong_emphasis>.+?)__",
    r"%%%(?P<line_break>)",
    r"\+\+(?P<insertion>.+?)\+\+",
    r"--(?P<deletion>.+?)--",
    r"\[(?P<link>.+?)\]",
    r"\(\((?P<image>.+?)\)\)",
    r"~(?P<anchor>.+?)~",
    r"\?\?(?P<acronym>.+?\|.+?)\?\?",
    r"{{(?P<inline_quote>.+?)}}",
    r"@@(?P<code>.+?)@@",
    r"\$\$(?P<footnote>.+?)\$\$",
    # fallback: any single character is plain text
    r"(?P<text>.+?)",
]

wiki_re = re.compile('|'.join(wiki), re.DOTALL | re.MULTILINE)
# Block level: either a raw "///html ... ///" section, or a paragraph ended by
# a blank line or by the end of the input.
wiki_block_level_re = re.compile(
    r"^///html(?P<html>.+?)///\n\n"
    r"|(?P<paragraph>.+?)(?:\n{2,}|\Z)",
    re.DOTALL | re.MULTILINE)
81
82
class DCWikiParser(object):
    """Convert Dotclear wiki syntax to an XHTML tree of domish elements.

    The text is matched against ``wiki_block_level_re`` (top level) or
    ``wiki_re`` (inside a block); for each match, the method named
    ``parser_<name of the matching group>`` is called with the matched text
    and the element which must receive the result.
    """

    def __init__(self):
        # list of footnote <p/> elements, (re)initialised by parse()
        self._footnotes = None
        # The heading groups of the wiki patterns are named h1_title..h5_title,
        # so the generated parsers must cover levels 1 to 5 (not 0 to 4).
        for level in range(1, 6):
            setattr(self,
                    'parser_h{}_title'.format(level),
                    lambda string, parent, level=level:
                        self._parser_title(string, parent, 'h{}'.format(level)))

    def parser_paragraph(self, string, parent):
        """Add a <p/> to parent and parse the inline syntax inside it."""
        p_elt = parent.addElement('p')
        self._parse(string, p_elt)

    def parser_html(self, string, parent):
        """Add the raw (X)HTML of a ``///html`` section to parent.

        Content which can't be parsed is ignored, with a warning.
        """
        wrapped_html = "<div>{}</div>".format(string)
        try:
            div_elt = xml_tools.ElementParser()(wrapped_html)
        except domish.ParserError as e:
            log.warning(u"Error while parsing HTML content, ignoring it: {}".format(e))
            return
        children = list(div_elt.elements())
        if len(children) == 1 and children[0].name == 'div':
            # the content was already wrapped, we don't need our own <div/>
            div_elt = children[0]
        parent.addChild(div_elt)

    def parser_escape_char(self, string, parent):
        """Add an escaped character as plain text."""
        parent.addContent(string)

    def _parser_title(self, string, parent, name):
        """Add a heading element called name (e.g. "h3") with string as text."""
        elt = parent.addElement(name)
        elt.addContent(string)

    def parser_horizontal_rule(self, string, parent):
        parent.addElement('hr')

    def _parser_list(self, string, parent, list_type):
        """Add a list item, creating the (nested) list elements when needed.

        @param string: text following the first list marker of the line
        @param parent: element receiving the list
        @param list_type: name of the list element ('ul' or 'ol')
        """
        # The nesting depth is the number of extra markers *of this kind of
        # list*: "*" for <ul/>, "#" for <ol/>.  A marker of the other kind is
        # kept in the text, and is handled when the item itself is parsed.
        marker = u'*' if list_type == FLAG_UL else u'#'
        depth = 0
        while string[depth:depth + 1] == marker:
            depth += 1

        string = string[depth:].lstrip()

        for _ in range(depth + 1):
            # we reuse an existing list if there is one at this level
            list_elt = getattr(parent, list_type)
            if list_elt is None:
                parent = parent.addElement(list_type)
            else:
                parent = list_elt

        li_elt = parent.addElement('li')
        self._parse(string, li_elt)

    def parser_list_bullet(self, string, parent):
        self._parser_list(string, parent, 'ul')

    def parser_list_ordered(self, string, parent):
        self._parser_list(string, parent, 'ol')

    def parser_preformated(self, string, parent):
        """Append a line to the <pre/> of parent (created on first line)."""
        pre_elt = parent.pre
        if pre_elt is None:
            pre_elt = parent.addElement('pre')
        else:
            # we are on a new line, and this is important for <pre/>
            pre_elt.addContent('\n')
        pre_elt.addContent(string)

    def parser_quote(self, string, parent):
        """Append a line to the <blockquote><p/></blockquote> of parent."""
        blockquote_elt = parent.blockquote
        if blockquote_elt is None:
            blockquote_elt = parent.addElement('blockquote')
        p_elt = blockquote_elt.p
        if p_elt is None:
            p_elt = blockquote_elt.addElement('p')
        else:
            # not the first line of the quote: keep the line separation
            string = u'\n' + string

        self._parse(string, p_elt)

    def parser_emphasis(self, string, parent):
        em_elt = parent.addElement('em')
        self._parse(string, em_elt)

    def parser_strong_emphasis(self, string, parent):
        strong_elt = parent.addElement('strong')
        self._parse(string, strong_elt)

    def parser_line_break(self, string, parent):
        parent.addElement('br')

    def parser_insertion(self, string, parent):
        ins_elt = parent.addElement('ins')
        self._parse(string, ins_elt)

    def parser_deletion(self, string, parent):
        del_elt = parent.addElement('del')
        self._parse(string, del_elt)

    def parser_link(self, string, parent):
        """Add a <a/> from ``url`` or ``name|url[|lang[|title]]``."""
        url_data = string.split(u'|')
        a_elt = parent.addElement('a')
        length = len(url_data)
        if length == 1:
            # only an URL: it is also used as link text
            url = url_data[0]
            a_elt['href'] = url
            a_elt.addContent(url)
        else:
            name = url_data[0]
            url = url_data[1]
            a_elt['href'] = url
            a_elt.addContent(name)
            if length >= 3:
                a_elt['lang'] = url_data[2]
            if length >= 4:
                a_elt['title'] = url_data[3]
            if length > 4:
                log.warning(u"too much data for url, ignoring extra data")

    def parser_image(self, string, parent):
        """Add a <img/> from ``src[|alt[|position[|longdesc]]]``.

        position is translated to a style attribute: L or G for left,
        R or D for right, C for centered (case insensitive).
        """
        image_data = string.split(u'|')
        img_elt = parent.addElement('img')

        for idx, attribute in enumerate(('src', 'alt', 'position', 'longdesc')):
            try:
                data = image_data[idx]
            except IndexError:
                break

            if attribute != 'position':
                img_elt[attribute] = data
            else:
                data = data.lower()
                if data in ('l', 'g'):
                    img_elt['style'] = "display:block; float:left; margin:0 1em 1em 0"
                elif data in ('r', 'd'):
                    img_elt['style'] = "display:block; float:right; margin:0 0 1em 1em"
                elif data == 'c':
                    img_elt['style'] = "display:block; margin-left:auto; margin-right:auto"
                else:
                    log.warning(u"bad position argument for image, ignoring it")

    def parser_anchor(self, string, parent):
        a_elt = parent.addElement('a')
        a_elt['id'] = string

    def parser_acronym(self, string, parent):
        """Add an <acronym/> from ``acronym|title``."""
        acronym, title = string.split(u'|', 1)
        acronym_elt = parent.addElement('acronym', content=acronym)
        acronym_elt['title'] = title

    def parser_inline_quote(self, string, parent):
        """Add a <q/> from ``quote[|lang[|cite]]``."""
        quote_data = string.split(u'|')
        quote = quote_data[0]
        q_elt = parent.addElement('q', content=quote)
        for idx, attribute in enumerate(('lang', 'cite'), 1):
            try:
                data = quote_data[idx]
            except IndexError:
                break
            q_elt[attribute] = data

    def parser_code(self, string, parent):
        parent.addElement('code', content=string)

    def parser_footnote(self, string, parent):
        """Add the marker of a footnote, and store the note for later.

        The marker and the note link to each other with anchors.
        """
        idx = len(self._footnotes) + 1
        note_txt = NOTE_TPL.format(idx)
        sup_elt = parent.addElement('sup')
        sup_elt['class'] = 'note'
        a_elt = sup_elt.addElement('a', content=note_txt)
        a_elt['id'] = NOTE_A_REV_TPL.format(idx)
        a_elt['href'] = u'#{}'.format(NOTE_A_TPL.format(idx))

        p_elt = domish.Element((None, 'p'))
        a_elt = p_elt.addElement('a', content=note_txt)
        a_elt['id'] = NOTE_A_TPL.format(idx)
        a_elt['href'] = u'#{}'.format(NOTE_A_REV_TPL.format(idx))
        self._parse(string, p_elt)
        # footnotes are actually added at the end of the parsing
        self._footnotes.append(p_elt)

    def parser_text(self, string, parent):
        parent.addContent(string)

    def _parse(self, string, parent, block_level=False):
        """Parse string and add the result to parent.

        @param block_level: if True, split the text in blocks (paragraphs and
            raw HTML sections), else parse the inline syntax
        """
        regex = wiki_block_level_re if block_level else wiki_re

        for match in regex.finditer(string):
            if match.lastgroup is None:
                # no named group matched: the text is added unparsed
                parent.addContent(string)
                return
            matched = match.group(match.lastgroup)
            try:
                parser = getattr(self, 'parser_{}'.format(match.lastgroup))
            except AttributeError:
                log.warning(u"No parser found for {}".format(match.lastgroup))
                continue
            parser(matched, parent)

    def parse(self, string):
        """Convert a text using Dotclear wiki syntax.

        @param string: text to convert
        @return: <div/> element with the converted content, followed by a
            <div class="footnotes"/> if the text has footnotes
        """
        self._footnotes = []
        div_elt = domish.Element((None, 'div'))
        self._parse(string, parent=div_elt, block_level=True)
        if self._footnotes:
            foot_div_elt = div_elt.addElement('div')
            foot_div_elt['class'] = 'footnotes'
            # we add a simple horizontal rule which can be customized
            # with footnotes class, instead of a text which would need
            # to be translated
            foot_div_elt.addElement('hr')
            for elt in self._footnotes:
                foot_div_elt.addChild(elt)
        return div_elt
297
298
class XHTMLParser(object):
    """Convert an XHTML tree of domish elements to Dotclear wiki syntax.

    Each element is handled by the method named ``parser_<element name>``,
    which appends pieces of text to a buffer (a list of unicode strings).
    Elements without a dedicated method, and elements with a style attribute
    (except those listed in ELT_WITH_STYLE), are kept as raw XHTML in a
    ``///html`` section.
    """

    def __init__(self):
        self.flags = None  # stack of the lists being parsed, set by parse()
        self.toto = 0  # NOTE(review): not used anywhere in this module
        self.footnotes = None  # will hold a map from url to buffer id
        for i in range(1, 6):
            setattr(self,
                    'parser_h{}'.format(i),
                    lambda elt, buf, level=i: self.parserHeading(elt, buf, level)
                    )

    def parser_a(self, elt, buf):
        """Convert a link to ``[name|url|lang|title]`` or an anchor to ``~id~``."""
        try:
            url = elt['href']
        except KeyError:
            # probably an anchor (we don't want empty ids)
            id_ = elt.getAttribute('id')
            if id_:
                # same form as the one recognised by the wiki "anchor" pattern
                buf.append(u'~{}~'.format(id_))
            else:
                self.parserGeneric(elt, buf)
            return

        lang = elt.getAttribute('lang')
        title = elt.getAttribute('title')

        link_data = [url]
        name = unicode(elt)
        # with several fields, the first one is always read as the name, so it
        # can only be skipped when the URL is alone
        if name != url or lang is not None or title is not None:
            link_data.insert(0, name)

        if lang is not None:
            link_data.append(lang)
        elif title is not None:
            # empty lang, to keep title at its position
            link_data.append(u'')
        if title is not None:
            link_data.append(title)
        buf.append(u'[')
        buf.append(u'|'.join(link_data))
        buf.append(u']')

    def parser_acronym(self, elt, buf):
        try:
            title = elt['title']
        except KeyError:
            log.debug(u"Acronyme without title, using generic parser")
            self.parserGeneric(elt, buf)
            return
        buf.append(u'??{}|{}??'.format(unicode(elt), title))

    def parser_blockquote(self, elt, buf):
        """Convert a quote, prefixing each line with "> "."""
        # we remove wrapping <p> to avoid empty line with "> "
        children = [child for child in elt.children if unicode(child).strip()]
        if (len(children) == 1
                and domish.IElement.providedBy(children[0])  # may be a text node
                and children[0].name == 'p'):
            elt = children[0]
        tmp_buf = []
        self.parseChildren(elt, tmp_buf)
        blockquote = u'> ' + u'\n> '.join(u''.join(tmp_buf).split('\n'))
        buf.append(blockquote)

    def parser_br(self, elt, buf):
        buf.append(u'%%%')

    def parser_code(self, elt, buf):
        buf.append(u'@@')
        self.parseChildren(elt, buf)
        buf.append(u'@@')

    def parser_del(self, elt, buf):
        buf.append(u'--')
        self.parseChildren(elt, buf)
        buf.append(u'--')

    def parser_div(self, elt, buf):
        if elt.getAttribute('class') == 'footnotes':
            self.parserFootnote(elt, buf)
        else:
            self.parseChildren(elt, buf, block=True)

    def parser_em(self, elt, buf):
        buf.append(u"''")
        self.parseChildren(elt, buf)
        buf.append(u"''")

    def parser_h6(self, elt, buf):
        # XXX: <h6/> heading is not managed by wiki syntax
        #      so we handle it with a <h5/>
        elt = copy.copy(elt)  # we don't want to change the original element
        elt.name = 'h5'
        self._parse(elt, buf)

    def parser_hr(self, elt, buf):
        buf.append(u'\n----\n')

    def parser_img(self, elt, buf):
        """Convert an image to ``((src|alt|position|longdesc))``.

        The position (L, R or C) is guessed from the style attribute.
        """
        try:
            url = elt['src']
        except KeyError:
            log.warning(u"Ignoring <img/> without src")
            return

        image_data = [url]

        alt = elt.getAttribute('alt')
        style = elt.getAttribute('style', '')
        desc = elt.getAttribute('longdesc')

        if '0 1em 1em 0' in style:
            position = 'L'
        elif '0 0 1em 1em' in style:
            position = 'R'
        elif 'auto' in style:
            position = 'C'
        else:
            position = None

        # fields are positional: an empty value is needed for each missing
        # field which is followed by an other one
        if alt:
            image_data.append(alt)
        elif position or desc:
            image_data.append(u'')

        if position:
            image_data.append(position)
        elif desc:
            image_data.append(u'')

        if desc:
            image_data.append(desc)

        buf.append(u'((')
        buf.append(u'|'.join(image_data))
        buf.append(u'))')

    def parser_ins(self, elt, buf):
        buf.append(u'++')
        self.parseChildren(elt, buf)
        buf.append(u'++')

    def parser_li(self, elt, buf):
        """Convert a list item, prefixed with one marker per nesting level."""
        flag = None
        current_flag = None
        bullets = []
        # we count the consecutive innermost lists of the same type
        for flag in reversed(self.flags):
            if flag in (FLAG_UL, FLAG_OL):
                if current_flag is None:
                    current_flag = flag
                if flag == current_flag:
                    bullets.append(u'*' if flag == FLAG_UL else u'#')
                else:
                    break

        if flag != current_flag and buf and buf[-1] == u' ':
            # this trick is to avoid a space when we switch
            # from (un)ordered to the other type on the same row
            # e.g. *# unorder + ordered item
            del buf[-1]

        buf.extend(bullets)

        buf.append(u' ')
        self.parseChildren(elt, buf)
        buf.append(u'\n')

    def parser_ol(self, elt, buf):
        self.parserList(elt, buf, FLAG_OL)

    def parser_p(self, elt, buf):
        self.parseChildren(elt, buf)
        buf.append(u'\n\n')

    def parser_pre(self, elt, buf):
        """Convert preformatted content, prefixing each line with a space."""
        pre = u''.join([child.toXml() if domish.IElement.providedBy(child) else unicode(child)
                        for child in elt.children])
        pre = u' ' + u'\n '.join(pre.split('\n'))
        buf.append(pre)

    def parser_q(self, elt, buf):
        """Convert an inline quote to ``{{quote|lang|cite}}``."""
        quote_data = [unicode(elt)]

        lang = elt.getAttribute('lang')
        # "cite" is the attribute set by DCWikiParser.parser_inline_quote;
        # "url" was the name previously read here, it is kept as a fallback
        cite = elt.getAttribute('cite') or elt.getAttribute('url')

        if lang:
            quote_data.append(lang)
        elif cite:
            # empty lang, to keep cite at its position
            quote_data.append(u'')

        if cite:
            quote_data.append(cite)

        buf.append(u'{{')
        buf.append(u'|'.join(quote_data))
        buf.append(u'}}')

    def parser_span(self, elt, buf):
        self.parseChildren(elt, buf, block=True)

    def parser_strong(self, elt, buf):
        buf.append(u'__')
        self.parseChildren(elt, buf)
        buf.append(u'__')

    def parser_sup(self, elt, buf):
        """Convert a footnote marker to ``$$ $$`` with a placeholder.

        The placeholder is replaced by parserFootnote when the note is found.
        """
        # sup is mainly used for footnotes, so we check if we have an anchor inside
        children = [child for child in elt.children if unicode(child).strip()]
        if (len(children) == 1 and domish.IElement.providedBy(children[0])
                and children[0].name == 'a' and '#' in children[0].getAttribute('href', '')):
            url = children[0]['href']
            note_id = url[url.find('#') + 1:]
            if not note_id:
                log.warning("bad link found in footnote")
                self.parserGeneric(elt, buf)
                return
            # this looks like a footnote
            buf.append(u'$$')
            buf.append(u' ')  # placeholder
            self.footnotes[note_id] = len(buf) - 1
            buf.append(u'$$')
        else:
            self.parserGeneric(elt, buf)

    def parser_ul(self, elt, buf):
        self.parserList(elt, buf, FLAG_UL)

    def parserList(self, elt, buf, type_):
        """Parse the children of a list, with its type pushed on self.flags.

        @raise exceptions.InternalError: the flag is not found after parsing
        """
        self.flags.append(type_)
        self.parseChildren(elt, buf, block=True)
        # we remove the innermost flag of this type
        for idx in range(len(self.flags) - 1, -1, -1):
            if self.flags[idx] == type_:
                del self.flags[idx]
                break
        else:
            raise exceptions.InternalError(u"flag has been removed by an other parser")

    def parserHeading(self, elt, buf, level):
        """Convert a heading: level 1 gets five "!", level 5 gets one."""
        buf.append((6 - level) * u'!')
        for child in elt.children:
            # we ignore other elements for a Hx title
            self.parserText(child, buf)
        buf.append(u'\n')

    def parserFootnote(self, elt, buf):
        """Replace the placeholders set by parser_sup with the notes content."""
        for p_elt in elt.elements():
            # all children other than <p/> are ignored
            if p_elt.name != 'p':
                continue
            a_elt = p_elt.a
            if a_elt is None:
                log.warning(u"<p/> element doesn't contain <a/> in footnote, ignoring it")
                continue
            try:
                note_idx = self.footnotes[a_elt['id']]
            except KeyError:
                log.warning(u"Note id doesn't match any known note, ignoring it")
                # without index, there is no placeholder to replace
                continue
            # we create a dummy element to parse all children after the <a/>
            dummy_elt = domish.Element((None, 'note'))
            a_idx = p_elt.children.index(a_elt)
            dummy_elt.children = p_elt.children[a_idx + 1:]
            note_buf = []
            self.parseChildren(dummy_elt, note_buf)
            # now we can replace the placeholder
            buf[note_idx] = u''.join(note_buf)

    def parserText(self, txt, buf, keep_whitespaces=False):
        """Escape a text and append it to buf if it is not empty.

        @param keep_whitespaces: if False, each run of whitespaces is replaced
            by a single space, and leading/trailing whitespaces are removed
        @return: the escaped text
        """
        txt = unicode(txt)
        if not keep_whitespaces:
            # we get text and only let one inter word space
            txt = u' '.join(txt.split())
        txt = re.sub(ESCAPE_CHARS, r'\\\1', txt)
        if txt:
            buf.append(txt)
        return txt

    def parserGeneric(self, elt, buf):
        # as dotclear wiki syntax handle arbitrary XHTML code
        # we use this feature to add elements that we don't know
        buf.append(u"\n\n///html\n{}\n///\n\n".format(elt.toXml()))

    def parseChildren(self, elt, buf, block=False):
        """Parse the children (elements and texts) of elt.

        @param block: if False, a space is added between two children when
            the buffer doesn't already end with a space or a new line
        """
        first_visible = True
        for child in elt.children:
            # [-1:] is used because the last item of buf may be an empty string
            if (not block and not first_visible and buf
                    and buf[-1][-1:] not in (u' ', u'\n')):
                # we add separation if it isn't already there
                buf.append(u' ')
            if domish.IElement.providedBy(child):
                self._parse(child, buf)
                first_visible = False
            else:
                appended = self.parserText(child, buf)
                if appended:
                    first_visible = False

    def _parse(self, elt, buf):
        """Call the parser corresponding to elt."""
        elt_name = elt.name.lower()
        style = elt.getAttribute('style')
        if style and elt_name not in ELT_WITH_STYLE:
            # if we have style we use generic parser to put raw HTML
            # to avoid losing it
            parser = self.parserGeneric
        else:
            try:
                parser = getattr(self, "parser_{}".format(elt_name))
            except AttributeError:
                log.debug("Can't find parser for {} element, using generic one".format(elt.name))
                parser = self.parserGeneric
        parser(elt, buf)

    def parse(self, elt):
        """Convert an element to Dotclear wiki syntax.

        @param elt: root element to convert
        @return (unicode): text using wiki syntax
        """
        self.flags = []
        self.footnotes = {}
        buf = []
        self._parse(elt, buf)
        return u''.join(buf)

    def parseString(self, string):
        """Convert a string of XHTML to Dotclear wiki syntax.

        @param string: XHTML to convert
        @return (unicode, None): text using wiki syntax,
            or None if the XHTML can't be parsed (a warning is logged)
        """
        wrapped_html = u"<div>{}</div>".format(string)
        try:
            div_elt = xml_tools.ElementParser()(wrapped_html)
        except domish.ParserError as e:
            log.warning(u"Error while parsing HTML content: {}".format(e))
            return
        children = list(div_elt.elements())
        if len(children) == 1 and children[0].name == 'div':
            # the content was already wrapped, we don't need our own <div/>
            div_elt = children[0]
        return self.parse(div_elt)
629
630
class DCWikiSyntax(object):
    """Plugin entry point: registers the "wiki_dotclear" syntax.

    The conversions are delegated to a DCWikiParser (wiki => XHTML) and to
    a XHTMLParser (XHTML => wiki), both created once at initialisation.
    """
    SYNTAX_NAME = "wiki_dotclear"

    def __init__(self, host):
        log.info(_(u"Dotclear wiki syntax plugin initialization"))
        self.host = host
        self._dc_parser = DCWikiParser()
        self._xhtml_parser = XHTMLParser()
        # the syntax is registered with the "TEXT-SYNTAXES" plugin
        text_syntaxes = self.host.plugins["TEXT-SYNTAXES"]
        self._stx = text_syntaxes
        options = [text_syntaxes.OPT_NO_THREAD]
        text_syntaxes.addSyntax(self.SYNTAX_NAME, self.parseWiki, self.parseXHTML, options)

    def parseWiki(self, wiki_stx):
        """Convert a text using Dotclear wiki syntax to serialised XHTML."""
        return self._dc_parser.parse(wiki_stx).toXml()

    def parseXHTML(self, xhtml):
        """Convert a string of XHTML to Dotclear wiki syntax."""
        wiki_stx = self._xhtml_parser.parseString(xhtml)
        return wiki_stx