Mercurial > libervia-backend
annotate libervia/backend/plugins/plugin_comp_email_gateway/cleaning.py @ 4401:ae26233b655f default tip
doc (components): Add message cleaning section to email gateway doc:
fix 464
author | Goffi <goffi@goffi.org> |
---|---|
date | Thu, 11 Sep 2025 21:17:51 +0200 |
parents | fe09446a09ce |
children |
rev | line source |
---|---|
4399
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
1 #!/usr/bin/env python3 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
2 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
3 # Libervia Email Gateway Component |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
4 # Copyright (C) 2009-2025 Jérôme Poisson (goffi@goffi.org) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
5 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
6 # This program is free software: you can redistribute it and/or modify |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
7 # it under the terms of the GNU Affero General Public License as published by |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
8 # the Free Software Foundation, either version 3 of the License, or |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
9 # (at your option) any later version. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
10 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
11 # This program is distributed in the hope that it will be useful, |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
12 # but WITHOUT ANY WARRANTY; without even the implied warranty of |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
13 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
14 # GNU Affero General Public License for more details. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
15 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
16 # You should have received a copy of the GNU Affero General Public License |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
17 # along with this program. If not, see <http://www.gnu.org/licenses/>. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
18 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
19 from html import escape |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
20 import re |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
21 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
22 from lxml import etree, html |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
23 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
24 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
25 class TextToHtmlConverter: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
26 """Convert plain text to semantic HTML with proper quote and list handling.""" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
27 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
28 def __init__(self) -> None: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
29 """Initialize converter with patterns.""" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
30 self.ordered_list_pattern = re.compile(r"^\s*(\d+[\.\)])\s+(.+)$") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
31 self.unordered_list_pattern = re.compile(r"^\s*([*\-\+])\s+(.+)$") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
32 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
33 def text_to_html(self, text: str) -> str: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
34 """Convert plain text to HTML. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
35 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
36 Plain text with formatting often seen in emails is converted to HTML to make |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
37 text-only email consistent with HTML ones. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
38 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
39 @param text: Plain text to convert. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
40 @return: HTML version. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
41 """ |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
42 lines = text.split("\n") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
43 html_parts = [] |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
44 current_paragraph_lines = [] |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
45 in_ordered_list = False |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
46 in_unordered_list = False |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
47 ordered_list_items = [] |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
48 unordered_list_items = [] |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
49 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
50 i = 0 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
51 while i < len(lines): |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
52 line = lines[i] |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
53 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
54 # Match line type and handle accordingly. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
55 line_type = self._classify_line(line) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
56 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
57 # If we're in a list and encounter a non-list item, close the list. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
58 if in_ordered_list and line_type != "ordered_list": |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
59 html_parts.append(self._create_ordered_list(ordered_list_items)) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
60 ordered_list_items.clear() |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
61 in_ordered_list = False |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
62 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
63 if in_unordered_list and line_type != "unordered_list": |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
64 html_parts.append(self._create_unordered_list(unordered_list_items)) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
65 unordered_list_items.clear() |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
66 in_unordered_list = False |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
67 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
68 match line_type: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
69 case "empty": |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
70 if current_paragraph_lines: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
71 paragraph_content = "<br />".join( |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
72 escape(line) for line in current_paragraph_lines |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
73 ) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
74 html_parts.append(f"<p>{paragraph_content}</p>") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
75 current_paragraph_lines = [] |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
76 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
77 # Handle consecutive empty lines. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
78 if not html_parts or html_parts[-1] != "<br />": |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
79 html_parts.append("<br />") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
80 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
81 case "blockquote": |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
82 if current_paragraph_lines: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
83 paragraph_content = "<br />".join( |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
84 escape(line) for line in current_paragraph_lines |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
85 ) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
86 html_parts.append(f"<p>{paragraph_content}</p>") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
87 current_paragraph_lines = [] |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
88 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
89 quoted_lines, next_index = self._collect_quoted_lines(lines, i) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
90 html_parts.append(self._create_blockquote_html(quoted_lines)) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
91 # Adjust for loop increment. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
92 i = next_index - 1 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
93 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
94 case "ordered_list": |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
95 if current_paragraph_lines: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
96 paragraph_content = "<br />".join( |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
97 escape(line) for line in current_paragraph_lines |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
98 ) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
99 html_parts.append(f"<p>{paragraph_content}</p>") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
100 current_paragraph_lines = [] |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
101 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
102 match = self.ordered_list_pattern.match(line) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
103 if match: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
104 ordered_list_items.append(match.group(2)) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
105 in_ordered_list = True |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
106 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
107 case "unordered_list": |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
108 if current_paragraph_lines: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
109 paragraph_content = "<br />".join( |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
110 escape(line) for line in current_paragraph_lines |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
111 ) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
112 html_parts.append(f"<p>{paragraph_content}</p>") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
113 current_paragraph_lines = [] |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
114 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
115 match = self.unordered_list_pattern.match(line) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
116 if match: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
117 unordered_list_items.append(match.group(2)) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
118 in_unordered_list = True |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
119 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
120 case "regular": |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
121 if in_ordered_list: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
122 html_parts.append(self._create_ordered_list(ordered_list_items)) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
123 ordered_list_items = [] |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
124 in_ordered_list = False |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
125 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
126 if in_unordered_list: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
127 html_parts.append( |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
128 self._create_unordered_list(unordered_list_items) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
129 ) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
130 unordered_list_items = [] |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
131 in_unordered_list = False |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
132 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
133 current_paragraph_lines.append(line) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
134 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
135 i += 1 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
136 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
137 # We now handle remaining paragraphs and lists. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
138 if current_paragraph_lines: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
139 paragraph_content = "<br />".join( |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
140 escape(line) for line in current_paragraph_lines |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
141 ) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
142 html_parts.append(f"<p>{paragraph_content}</p>") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
143 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
144 if in_ordered_list: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
145 html_parts.append(self._create_ordered_list(ordered_list_items)) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
146 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
147 if in_unordered_list: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
148 html_parts.append(self._create_unordered_list(unordered_list_items)) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
149 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
150 # Remove trailing <br /> tags. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
151 while html_parts and html_parts[-1] == "<br />": |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
152 html_parts.pop() |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
153 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
154 return "\n".join(html_parts) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
155 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
156 def _classify_line(self, line: str) -> str: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
157 """Classify a line type for processing. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
158 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
159 @param line: Line to classify. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
160 @return: Line type classification. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
161 """ |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
162 stripped = line.strip() |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
163 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
164 if not stripped: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
165 return "empty" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
166 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
167 if line.lstrip().startswith(">"): |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
168 return "blockquote" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
169 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
170 if self.ordered_list_pattern.match(line): |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
171 return "ordered_list" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
172 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
173 if self.unordered_list_pattern.match(line): |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
174 return "unordered_list" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
175 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
176 return "regular" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
177 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
178 def _collect_quoted_lines( |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
179 self, lines: list[str], start_index: int |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
180 ) -> tuple[list[str], int]: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
181 """Collect consecutive quoted lines. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
182 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
183 @param lines: All lines. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
184 @param start_index: Starting index for collection. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
185 @return: Tuple of (quoted_lines, next_index). |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
186 """ |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
187 quoted_lines = [] |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
188 i = start_index |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
189 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
190 while i < len(lines) and lines[i].lstrip().startswith(">"): |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
191 quoted_lines.append(lines[i]) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
192 i += 1 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
193 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
194 return quoted_lines, i |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
195 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
196 def _create_blockquote_html(self, quoted_lines: list[str]) -> str: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
197 """Create properly nested blockquote HTML. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
198 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
199 @param quoted_lines: Lines to convert to blockquotes. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
200 @return: HTML with nested blockquote elements. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
201 """ |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
202 if not quoted_lines: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
203 return "" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
204 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
205 # Parse lines to determine nesting structure. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
206 parsed_lines = [] |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
207 for line in quoted_lines: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
208 level = 0 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
209 content = line.lstrip() |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
210 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
211 # Count and remove quote markers. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
212 while content.startswith(">"): |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
213 level += 1 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
214 # Remove first '>'. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
215 content = content[1:] |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
216 content = content.lstrip() |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
217 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
218 parsed_lines.append((level, content)) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
219 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
220 return self._build_nested_blockquotes(parsed_lines) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
221 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
222 def _build_nested_blockquotes(self, parsed_lines: list[tuple[int, str]]) -> str: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
223 """Build properly nested blockquote elements. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
224 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
225 @param parsed_lines: List of (level, content) tuples. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
226 @return: Nested blockquote HTML. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
227 """ |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
228 if not parsed_lines: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
229 return "" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
230 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
231 html_parts = [] |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
232 current_level = 0 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
233 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
234 for level, content in parsed_lines: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
235 # Close blockquotes if we're going to a lower level. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
236 while current_level > level: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
237 html_parts.append("</blockquote>") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
238 current_level -= 1 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
239 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
240 # Open new blockquotes if we're going to a higher level. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
241 while current_level < level: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
242 html_parts.append("<blockquote>") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
243 current_level += 1 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
244 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
245 # Add the content as a paragraph if it's not empty. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
246 if content.strip(): |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
247 # Handle line breaks within quote content. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
248 escaped_content = escape(content) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
249 html_parts.append(f"<p>{escaped_content}</p>") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
250 else: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
251 html_parts.append("<br />") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
252 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
253 # Close remaining blockquotes. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
254 while current_level > 0: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
255 html_parts.append("</blockquote>") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
256 current_level -= 1 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
257 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
258 return "".join(html_parts) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
259 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
260 def _create_ordered_list(self, items: list[str]) -> str: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
261 """Create an ordered list from items.""" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
262 if not items: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
263 return "" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
264 list_items = "\n".join(f"<li>{escape(item)}</li>" for item in items) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
265 return f"<ol>\n{list_items}\n</ol>" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
266 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
267 def _create_unordered_list(self, items: list[str]) -> str: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
268 """Create an unordered list from items.""" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
269 if not items: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
270 return "" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
271 list_items = "\n".join(f"<li>{escape(item)}</li>" for item in items) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
272 return f"<ul>\n{list_items}\n</ul>" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
273 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
274 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
275 class MailingListHtmlCleaner: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
276 """Clean mailing list content by converting to HTML and adding semantic classes.""" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
277 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
def __init__(self) -> None:
    """Create the cleaner together with its plain-text to HTML converter."""
    converter = TextToHtmlConverter()
    self.converter = converter
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
281 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
def clean_message(self, text: str, is_html: bool = False) -> str:
    """Clean a message body, converting it to HTML first when it is plain text.

    @param text: The message content (text or HTML).
    @param is_html: Whether the input is already HTML; when false the content goes
        through ``self.converter.text_to_html`` first.
    @return: The result of ``clean_html`` applied to the HTML content.
    """
    html_content = text if is_html else self.converter.text_to_html(text)
    return self.clean_html(html_content)
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
296 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
def clean_html(self, html_content: str) -> str:
    """Clean HTML by adding noise classification classes.

    The content is wrapped in a temporary ``<div>`` so that a fragment with several
    top-level nodes can be parsed as one tree; the wrapper is stripped again before
    returning.

    @param html_content: HTML to clean.
    @return: HTML with noise classification classes added. Empty input yields an
        empty string.
    """
    try:
        # Try strict XML parsing first.
        doc = etree.fromstring(f"<div>{html_content}</div>")
    except etree.XMLSyntaxError:
        # Not well-formed XML: parse with the HTML parser, serialise the result
        # as XML, then parse that inside the wrapper.
        doc = html.fromstring(html_content)
        xhtml = html.tostring(doc, encoding="unicode", method="xml")
        doc = etree.fromstring(f"<div>{xhtml}</div>")

    # Detect and mark noise elements.
    self._detect_and_mark_noise(doc)

    result = etree.tostring(doc, encoding="unicode", method="xml")

    # Remove the wrapper div.
    if result == "<div/>":
        # A wrapper without content is serialised self-closed, which the prefix and
        # suffix check below cannot recognise; there is nothing to return.
        return ""
    if result.startswith("<div>") and result.endswith("</div>"):
        result = result[5:-6]
    return result
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
323 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
324 def _detect_and_mark_noise(self, doc: etree.Element) -> None: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
325 """Detect noise elements in the document and add appropriate classes. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
326 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
327 @param doc: Parsed HTML document. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
328 """ |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
329 # Detect long blockquotes at top or bottom. Those are often the result of user. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
330 # pressing the "reply" button of their client, and not removing the old message. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
331 self._detect_long_blockquotes(doc) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
332 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
333 # Detect "On XXX YYY wrote" patterns (reply context). |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
334 self._detect_reply_context(doc) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
335 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
336 # Detect signatures. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
337 self._detect_signatures(doc) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
338 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
339 def _detect_long_blockquotes(self, doc: etree.Element) -> None: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
340 """Detect long blockquotes at beginning or end of message.""" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
341 blockquotes = doc.xpath("//blockquote") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
342 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
343 if not blockquotes: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
344 return |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
345 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
346 # Check if first element is a blockquote. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
347 first_element = None |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
348 for child in doc: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
349 if child.tag in ["p", "blockquote", "div"]: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
350 first_element = child |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
351 break |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
352 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
353 # Check if last element is a blockquote. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
354 last_element = None |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
355 for child in reversed(list(doc)): |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
356 if child.tag in ["p", "blockquote", "div"]: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
357 last_element = child |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
358 break |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
359 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
360 for blockquote in blockquotes: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
361 # Count the text content length. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
362 text_content = "".join(blockquote.itertext()) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
363 # Only mark as noise if it's a long blockquote at the beginning or end. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
364 if len(text_content) > 500 and ( |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
365 blockquote is first_element or blockquote is last_element |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
366 ): |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
367 self._add_class(blockquote, "noise-old-quote") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
368 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
369 def _detect_reply_context(self, doc: etree.Element) -> None: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
370 """Detect reply context patterns like "On XXX YYY wrote".""" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
371 # Look for paragraphs that might contain reply context. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
372 paragraphs = doc.xpath("//p") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
373 reply_pattern = re.compile( |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
374 r"On\s+.+?\s+(?:wrote|said|ecrit|a écrit).*?:\s*$", re.IGNORECASE |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
375 ) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
376 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
377 # Check if first element is a paragraph with reply context. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
378 first_element = None |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
379 for child in doc: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
380 if child.tag in ["p", "blockquote", "div"]: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
381 first_element = child |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
382 break |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
383 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
384 # Check if last element is a paragraph with reply context. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
385 last_element = None |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
386 for child in reversed(list(doc)): |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
387 if child.tag in ["p", "blockquote", "div"]: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
388 last_element = child |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
389 break |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
390 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
391 for p in paragraphs: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
392 text = "".join(p.itertext()).strip() |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
393 if reply_pattern.search(text): |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
394 # Only mark as noise if it's at the beginning or end of the message. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
395 if p is first_element or p is last_element: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
396 self._add_class(p, "noise-reply-quote") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
397 # Also check for next sibling blockquote. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
398 next_sibling = p.getnext() |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
399 if next_sibling is not None and next_sibling.tag == "blockquote": |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
400 self._add_class(next_sibling, "noise-reply-quote") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
401 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
402 def _detect_signatures(self, doc: etree.Element) -> None: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
403 """Detect mailing list signatures.""" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
404 # Only consider elements at the end of the document as potential signatures. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
405 # Find the last few elements that could be signatures. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
406 potential_signature_elements = [] |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
407 for child in reversed(list(doc)): |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
408 if child.tag in ["p", "div"]: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
409 potential_signature_elements.append(child) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
410 # Check last 8 elements max. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
411 if len(potential_signature_elements) >= 8: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
412 break |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
413 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
414 # Put them back in order. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
415 potential_signature_elements.reverse() |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
416 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
417 # More specific pattern for mailing list signatures (unsubscribe links, etc.). |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
418 mailing_list_signature_pattern = re.compile( |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
419 r"(?:^--\s*$)" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
420 r"|(?:^__+\s*$)" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
421 r"|(?:^==+\s*$)" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
422 r"|(?:.*(?:" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
423 r"unsubscribe" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
424 r"|subscribe" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
425 r"|list info" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
426 r"|mailing list" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
427 r"|archives?" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
428 r"|digest" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
429 # We must have an URL or an email after one of the previous keywords |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
430 r").*(?:https?:|@).+\..+" r")", |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
431 re.IGNORECASE, |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
432 ) |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
433 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
434 for element in potential_signature_elements: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
435 text = " ".join(element.itertext()).strip() |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
436 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
437 # Check for mailing list signature patterns. |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
438 if mailing_list_signature_pattern.search(text): |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
439 self._add_class(element, "noise-signature") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
440 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
441 def _add_class(self, element: etree.Element, class_name: str) -> None: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
442 """Add a CSS class to an element.""" |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
443 current_class = element.get("class", "") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
444 if current_class: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
445 if class_name not in current_class: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
446 element.set("class", f"{current_class} noise {class_name}") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
447 else: |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
448 element.set("class", f"noise {class_name}") |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
449 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
450 |
fe09446a09ce
component email message: Mailing list messages cleaning:
Goffi <goffi@goffi.org>
parents:
diff
changeset
|
def convert_to_html_and_detect_noise(text: str, is_html: bool = False) -> str:
    """Turn a message body into HTML with its "noise" parts flagged.

    "Noise" covers whatever makes a message harder to read, such as old blockquoted
    messages left in a reply or generic mailing list signatures.

    Each element detected as noise gets the "noise" class, plus a more specific
    "noise-*" class describing the kind of noise found (old quote, signature).

    This is a convenience wrapper: it creates a fresh ``MailingListHtmlCleaner`` and
    returns the result of its ``clean_message`` method.

    @param text: Text or HTML content to clean.
    @param is_html: True if ``text`` is HTML, False if it is plain text.
    @return: Cleaned HTML with noise elements classified.
    """
    return MailingListHtmlCleaner().clean_message(text, is_html)