view libervia/backend/tools/common/regex.py @ 4314:6a70fcd93a7a

plugin XEP-0131: Stanza Headers and Internet Metadata implementation: - SHIM is now supported and put in `msg_data["extra"]["headers"]`. - `Keywords` are converted from and to list of string in `msg_data["extra"]["keywords"]` field (if present in headers on message sending, values are merged). - Python minimal version upgraded to 3.11 due to use of `StrEnum`. rel 451
author Goffi <goffi@goffi.org>
date Sat, 28 Sep 2024 15:56:04 +0200
parents 0d7bb4df2343
children
line wrap: on
line source

#!/usr/bin/env python3


# Salut à Toi: an XMPP client
# Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

""" regex tools common to backend and frontends """

import re
import unicodedata

# Characters replaced when escaping a string for use in a file path, with
# their replacement.
# NOTE: the ``path_escape`` name is rebound further down in this module by the
# function of the same name.
path_escape = {"%": "%25", "/": "%2F", "\\": "%5c"}
# both tables end up keyed by the regex-escaped form of the text to match
path_escape_rev = dict(zip(map(re.escape, path_escape.values()), path_escape))
path_escape = dict(zip(map(re.escape, path_escape), path_escape.values()))
# pattern matching ANSI escape sequences,
# thanks to Martijn Pieters (https://stackoverflow.com/a/14693789)
RE_ANSI_REMOVE = re.compile(r"\x1B(?:[@-Z\\-_]|\[[0-?]*[ -/]*[@-~])")
# any run of characters other than ASCII letters, digits, comma and underscore
RE_TEXT_URL = re.compile(r"[^a-zA-Z0-9,_]+")
TEXT_MAX_LEN = 60
# min length is currently deactivated
TEXT_WORD_MIN_LENGHT = 0


def re_join(exps):
    """Build a single compiled regex matching any of the given expressions

    @param exps(iterable[str]): regex sources to combine
    @return: compiled pattern in which the sources are alternatives ("|")
    """
    alternation = "|".join(exps)
    return re.compile(alternation)


def re_sub_dict(pattern, repl_dict, string):
    """Substitute every match of pattern using a lookup table

    @param pattern: compiled regex whose matches are looked up in repl_dict
    @param repl_dict(dict): replacement table; each key is the regex-escaped
        form (as returned by re.escape) of a text the pattern can match
    @param string(str): string in which the replacements are done
    @return (str): string with every match replaced by the corresponding value
    """

    def replacement(match):
        # the table is keyed by the escaped form of the matched text
        return repl_dict[re.escape(match.group(0))]

    return pattern.sub(replacement, string)


# compiled alternations built from the keys of the two escaping tables
path_escape_re = re_join(list(path_escape))
path_escape_rev_re = re_join(list(path_escape_rev))


# The function below rebinds the module-level ``path_escape`` name, so it can
# not look its replacement table up through that name (doing so would hand the
# function itself to the substitution and fail with a TypeError on the first
# match). It uses a private table and pattern instead; they must stay in sync
# with the reverse table used by path_unescape.
_PATH_ESCAPE_MAP = {"%": "%25", "/": "%2F", "\\": "%5c"}
_PATH_ESCAPE_RE = re.compile("|".join(re.escape(char) for char in _PATH_ESCAPE_MAP))


def path_escape(string):
    """Escape string so it can be used in a file path

    "%", "/" and "\\" are replaced by "%25", "%2F" and "%5c" respectively.
    "%" itself is escaped so that the result can be unambiguously reverted.

    @param string(str): string to escape
    @return (str): escaped string, usable in a file path
    """
    return _PATH_ESCAPE_RE.sub(lambda match: _PATH_ESCAPE_MAP[match.group(0)], string)


def path_unescape(string):
    """Revert the escaping of a string found in a file path

    @param string(str): escaped string, as found in a file path
    @return (str): string where escape sequences are replaced by the
        characters they stand for
    """

    def restore(match):
        # the reverse table is keyed by the regex-escaped sequence
        return path_escape_rev[re.escape(match.group(0))]

    return path_escape_rev_re.sub(restore, string)


def ansi_remove(string):
    """Strip ANSI escape sequences from a string

    @param string(str): string to clean
    @return (str): the string with every match of RE_ANSI_REMOVE removed
    """
    cleaned = re.sub(RE_ANSI_REMOVE, "", string)
    return cleaned


def url_friendly_text(text):
    """Build a lowercase, dash-separated version of text usable in a URL

    @param text(str): text to convert
    @return (str): converted text, at most TEXT_MAX_LEN characters long
    """
    # decomposing (NFD) then dropping what is not ASCII maps special chars to
    # their ASCII base, trick found at https://stackoverflow.com/a/3194567
    ascii_bytes = unicodedata.normalize("NFD", text).encode("ascii", "ignore")
    lowered = RE_TEXT_URL.sub(" ", ascii_bytes.decode("utf-8")).lower()
    words = [word for word in lowered.split() if len(word) >= TEXT_WORD_MIN_LENGHT]
    slug = "-".join(words)
    while len(slug) > TEXT_MAX_LEN:
        # drop trailing words one at a time; a lone oversized word is cut
        head, dash, _ = slug.rpartition("-")
        slug = head if dash else slug[:TEXT_MAX_LEN]
    return slug