annotate libervia/backend/tools/common/regex.py @ 4318:27bb22eace65

tests (unit/email gateway): add test for XEP-0131 handling: rel 451
author Goffi <goffi@goffi.org>
date Sat, 28 Sep 2024 15:59:48 +0200
parents 0d7bb4df2343
children
Ignore whitespace changes - Everywhere: Within whitespace: At end of lines:
rev   line source
3028
ab2696e34d29 Python 3 port:
Goffi <goffi@goffi.org>
parents: 2771
diff changeset
1 #!/usr/bin/env python3
3137
559a625a236b fixed shebangs
Goffi <goffi@goffi.org>
parents: 3136
diff changeset
2
1920
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
3
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
4 # Salut à Toi: an XMPP client
3479
be6d91572633 date update
Goffi <goffi@goffi.org>
parents: 3137
diff changeset
5 # Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)
1920
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
6
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
7 # This program is free software: you can redistribute it and/or modify
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
8 # it under the terms of the GNU Affero General Public License as published by
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
9 # the Free Software Foundation, either version 3 of the License, or
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
10 # (at your option) any later version.
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
11
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
12 # This program is distributed in the hope that it will be useful,
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
13 # but WITHOUT ANY WARRANTY; without even the implied warranty of
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
14 # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
15 # GNU Affero General Public License for more details.
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
16
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
17 # You should have received a copy of the GNU Affero General Public License
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
18 # along with this program. If not, see <http://www.gnu.org/licenses/>.
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
19
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
20 """ regex tools common to backend and frontends """
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
21
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
22 import re
3501
85b8a899f407 tools (common/regex): move code to make user friendly URLs from web frontend
Goffi <goffi@goffi.org>
parents: 3479
diff changeset
23 import unicodedata
2624
56f94936df1e code style reformatting using black
Goffi <goffi@goffi.org>
parents: 2562
diff changeset
24
56f94936df1e code style reformatting using black
Goffi <goffi@goffi.org>
parents: 2562
diff changeset
25 path_escape = {"%": "%25", "/": "%2F", "\\": "%5c"}
3028
ab2696e34d29 Python 3 port:
Goffi <goffi@goffi.org>
parents: 2771
diff changeset
26 path_escape_rev = {re.escape(v): k for k, v in path_escape.items()}
ab2696e34d29 Python 3 port:
Goffi <goffi@goffi.org>
parents: 2771
diff changeset
27 path_escape = {re.escape(k): v for k, v in path_escape.items()}
2624
56f94936df1e code style reformatting using black
Goffi <goffi@goffi.org>
parents: 2562
diff changeset
28 #  thanks to Martijn Pieters (https://stackoverflow.com/a/14693789)
4270
0d7bb4df2343 Reformatted code base using black.
Goffi <goffi@goffi.org>
parents: 4071
diff changeset
29 RE_ANSI_REMOVE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
0d7bb4df2343 Reformatted code base using black.
Goffi <goffi@goffi.org>
parents: 4071
diff changeset
30 RE_TEXT_URL = re.compile(r"[^a-zA-Z0-9,_]+")
3501
85b8a899f407 tools (common/regex): move code to make user friendly URLs from web frontend
Goffi <goffi@goffi.org>
parents: 3479
diff changeset
31 TEXT_MAX_LEN = 60
85b8a899f407 tools (common/regex): move code to make user friendly URLs from web frontend
Goffi <goffi@goffi.org>
parents: 3479
diff changeset
32 # min lenght is currently deactivated
85b8a899f407 tools (common/regex): move code to make user friendly URLs from web frontend
Goffi <goffi@goffi.org>
parents: 3479
diff changeset
33 TEXT_WORD_MIN_LENGHT = 0
1920
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
34
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
35
4037
524856bd7b19 massive refactoring to switch from camelCase to snake_case:
Goffi <goffi@goffi.org>
parents: 3501
diff changeset
36 def re_join(exps):
1920
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
37 """Join (OR) various regexes"""
2624
56f94936df1e code style reformatting using black
Goffi <goffi@goffi.org>
parents: 2562
diff changeset
38 return re.compile("|".join(exps))
1920
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
39
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
40
4037
524856bd7b19 massive refactoring to switch from camelCase to snake_case:
Goffi <goffi@goffi.org>
parents: 3501
diff changeset
41 def re_sub_dict(pattern, repl_dict, string):
1920
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
42 """Replace key, value found in dict according to pattern
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
43
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
44 @param pattern(basestr): pattern using keys found in repl_dict
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
45 @repl_dict(dict): keys found in this dict will be replaced by
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
46 corresponding values
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
47 @param string(basestr): string to use for the replacement
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
48 """
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
49 return pattern.sub(lambda m: repl_dict[re.escape(m.group(0))], string)
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
50
2624
56f94936df1e code style reformatting using black
Goffi <goffi@goffi.org>
parents: 2562
diff changeset
51
4037
524856bd7b19 massive refactoring to switch from camelCase to snake_case:
Goffi <goffi@goffi.org>
parents: 3501
diff changeset
52 path_escape_re = re_join(list(path_escape.keys()))
524856bd7b19 massive refactoring to switch from camelCase to snake_case:
Goffi <goffi@goffi.org>
parents: 3501
diff changeset
53 path_escape_rev_re = re_join(list(path_escape_rev.keys()))
1920
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
54
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
55
4037
524856bd7b19 massive refactoring to switch from camelCase to snake_case:
Goffi <goffi@goffi.org>
parents: 3501
diff changeset
56 def path_escape(string):
1920
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
57 """Escape string so it can be use in a file path
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
58
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
59 @param string(basestr): string to escape
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
60 @return (str, unicode): escaped string, usable in a file path
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
61 """
4037
524856bd7b19 massive refactoring to switch from camelCase to snake_case:
Goffi <goffi@goffi.org>
parents: 3501
diff changeset
62 return re_sub_dict(path_escape_re, path_escape, string)
1920
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
63
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
64
4037
524856bd7b19 massive refactoring to switch from camelCase to snake_case:
Goffi <goffi@goffi.org>
parents: 3501
diff changeset
65 def path_unescape(string):
1920
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
66 """Unescape string from value found in file path
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
67
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
68 @param string(basestr): string found in file path
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
69 @return (str, unicode): unescaped string
03526c8abeb0 tools (common): added regex module with path (un)escaping methods
Goffi <goffi@goffi.org>
parents:
diff changeset
70 """
4037
524856bd7b19 massive refactoring to switch from camelCase to snake_case:
Goffi <goffi@goffi.org>
parents: 3501
diff changeset
71 return re_sub_dict(path_escape_rev_re, path_escape_rev, string)
2297
ad2a8e8b52da core (tools/common/regex): new ansiRemove method to remove ANSI escape codes from a string
Goffi <goffi@goffi.org>
parents: 1934
diff changeset
72
2624
56f94936df1e code style reformatting using black
Goffi <goffi@goffi.org>
parents: 2562
diff changeset
73
4037
524856bd7b19 massive refactoring to switch from camelCase to snake_case:
Goffi <goffi@goffi.org>
parents: 3501
diff changeset
74 def ansi_remove(string):
2297
ad2a8e8b52da core (tools/common/regex): new ansiRemove method to remove ANSI escape codes from a string
Goffi <goffi@goffi.org>
parents: 1934
diff changeset
75 """Remove ANSI escape codes from string
ad2a8e8b52da core (tools/common/regex): new ansiRemove method to remove ANSI escape codes from a string
Goffi <goffi@goffi.org>
parents: 1934
diff changeset
76
ad2a8e8b52da core (tools/common/regex): new ansiRemove method to remove ANSI escape codes from a string
Goffi <goffi@goffi.org>
parents: 1934
diff changeset
77 @param string(basestr): string to filter
ad2a8e8b52da core (tools/common/regex): new ansiRemove method to remove ANSI escape codes from a string
Goffi <goffi@goffi.org>
parents: 1934
diff changeset
78 @return (str, unicode): string without ANSI escape codes
ad2a8e8b52da core (tools/common/regex): new ansiRemove method to remove ANSI escape codes from a string
Goffi <goffi@goffi.org>
parents: 1934
diff changeset
79 """
2624
56f94936df1e code style reformatting using black
Goffi <goffi@goffi.org>
parents: 2562
diff changeset
80 return RE_ANSI_REMOVE.sub("", string)
3501
85b8a899f407 tools (common/regex): move code to make user friendly URLs from web frontend
Goffi <goffi@goffi.org>
parents: 3479
diff changeset
81
85b8a899f407 tools (common/regex): move code to make user friendly URLs from web frontend
Goffi <goffi@goffi.org>
parents: 3479
diff changeset
82
4037
524856bd7b19 massive refactoring to switch from camelCase to snake_case:
Goffi <goffi@goffi.org>
parents: 3501
diff changeset
83 def url_friendly_text(text):
3501
85b8a899f407 tools (common/regex): move code to make user friendly URLs from web frontend
Goffi <goffi@goffi.org>
parents: 3479
diff changeset
84 """Convert text to url-friendly one"""
85b8a899f407 tools (common/regex): move code to make user friendly URLs from web frontend
Goffi <goffi@goffi.org>
parents: 3479
diff changeset
85 # we change special chars to ascii one,
85b8a899f407 tools (common/regex): move code to make user friendly URLs from web frontend
Goffi <goffi@goffi.org>
parents: 3479
diff changeset
86 # trick found at https://stackoverflow.com/a/3194567
4270
0d7bb4df2343 Reformatted code base using black.
Goffi <goffi@goffi.org>
parents: 4071
diff changeset
87 text = unicodedata.normalize("NFD", text).encode("ascii", "ignore").decode("utf-8")
0d7bb4df2343 Reformatted code base using black.
Goffi <goffi@goffi.org>
parents: 4071
diff changeset
88 text = RE_TEXT_URL.sub(" ", text).lower()
0d7bb4df2343 Reformatted code base using black.
Goffi <goffi@goffi.org>
parents: 4071
diff changeset
89 text = "-".join([t for t in text.split() if t and len(t) >= TEXT_WORD_MIN_LENGHT])
3501
85b8a899f407 tools (common/regex): move code to make user friendly URLs from web frontend
Goffi <goffi@goffi.org>
parents: 3479
diff changeset
90 while len(text) > TEXT_MAX_LEN:
4270
0d7bb4df2343 Reformatted code base using black.
Goffi <goffi@goffi.org>
parents: 4071
diff changeset
91 if "-" in text:
0d7bb4df2343 Reformatted code base using black.
Goffi <goffi@goffi.org>
parents: 4071
diff changeset
92 text = text.rsplit("-", 1)[0]
3501
85b8a899f407 tools (common/regex): move code to make user friendly URLs from web frontend
Goffi <goffi@goffi.org>
parents: 3479
diff changeset
93 else:
85b8a899f407 tools (common/regex): move code to make user friendly URLs from web frontend
Goffi <goffi@goffi.org>
parents: 3479
diff changeset
94 text = text[:TEXT_MAX_LEN]
85b8a899f407 tools (common/regex): move code to make user friendly URLs from web frontend
Goffi <goffi@goffi.org>
parents: 3479
diff changeset
95 return text