diff libervia/backend/memory/cache.py @ 4212:5f2d496c633f
core: get rid of `pickle`:
Use of `pickle` to serialise data was a technical legacy that caused trouble for database
storage, for updates (if a serialised class changed, updates could break), and for
security (pickle can lead to arbitrary code execution).
This patch removes all use of pickle in favour of JSON, notably:
- for caching data, a Pydantic model is now used instead
- for SQLAlchemy models, `LegacyPickle` is replaced by JSON serialisation
- in XEP-0373, a `PublicKeyMetadata` class was serialised; new `from_dict` and `to_dict`
  methods have been implemented to handle serialisation (see the sketch after this list)
- new methods to (de)serialise data can now be specified with Identity data types; this is
  notably used to (de)serialise the `path` of avatars
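
For illustration, the `to_dict`/`from_dict` pattern replacing pickle looks roughly like
this (a minimal sketch: the real `PublicKeyMetadata` fields are not shown in this
changeset, so the attributes below are placeholders):

    from dataclasses import dataclass
    from typing import Any

    @dataclass
    class PublicKeyMetadata:
        fingerprint: str  # placeholder attribute
        timestamp: int    # placeholder attribute

        def to_dict(self) -> dict[str, Any]:
            # Only JSON-compatible types, so the result can be stored as JSON.
            return {"fingerprint": self.fingerprint, "timestamp": self.timestamp}

        @classmethod
        def from_dict(cls, data: dict[str, Any]) -> "PublicKeyMetadata":
            # Rebuilds the instance from plain data, without the arbitrary
            # code execution risk of unpickling.
            return cls(**data)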
A migration script has been created to convert data (for upgrade or downgrade), with
special care for the XEP-0373 case; the core conversion step is sketched below. Depending
on the size of the database, this migration script can take a long time to run.
rel 443
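
The migration script itself is not part of this file's diff; a minimal sketch of the core
conversion step in the upgrade direction, assuming a value previously stored as a pickled
blob (the helper below is illustrative, not the actual script):

    import json
    import pickle

    def pickle_blob_to_json(raw: bytes) -> str:
        # Load the legacy pickled value, then re-serialise it as JSON.
        # This only works for values made of JSON-compatible types; the
        # XEP-0373 case needs the to_dict/from_dict handling described above.
        return json.dumps(pickle.loads(raw))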
| author   | Goffi <goffi@goffi.org>         |
|----------|---------------------------------|
| date     | Fri, 23 Feb 2024 13:31:04 +0100 |
| parents  | 4b842c1fb686                    |
| children | e11b13418ba6                    |
--- a/libervia/backend/memory/cache.py	Fri Feb 16 18:46:06 2024 +0100
+++ b/libervia/backend/memory/cache.py	Fri Feb 23 13:31:04 2024 +0100
@@ -1,8 +1,7 @@
 #!/usr/bin/env python3
-
-# SAT: a jabber client
-# Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)
+# Libervia: an XMPP client
+# Copyright (C) 2009-2024 Jérôme Poisson (goffi@goffi.org)
 
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as published by
@@ -20,9 +19,10 @@
 from io import BufferedIOBase
 import mimetypes
 from pathlib import Path
-import pickle as pickle
 import time
-from typing import Any, Dict, Optional
+from typing import Any
+
+from pydantic import BaseModel, ValidationError
 
 from libervia.backend.core import exceptions
 from libervia.backend.core.constants import Const as C
@@ -33,11 +33,24 @@
 
 log = getLogger(__name__)
 
+CACHE_METADATA_EXT = ".cache.json"
 DEFAULT_EXT = ".raw"
 
 
-class Cache(object):
-    """generic file caching"""
+class CacheMetadata(BaseModel):
+    source: str
+    uid: str
+    filename: str
+    creation: int
+    eol: int
+    max_age: int = C.DEFAULT_MAX_AGE
+    original_filename: str | None = None
+    mime_type: str | None = None
+    last_access: int | None = None
+
+
+class Cache:
+    """Generic file caching."""
 
     def __init__(self, host, profile):
         """
@@ -56,60 +69,52 @@
         self.purge()
 
     def purge(self):
-        # remove expired files from cache
+        # Remove expired, unreadable, and unrelated files from cache
         # TODO: this should not be called only on startup, but at regular interval
         #   (e.g. once a day)
-        purged = set()
-        # we sort files to have metadata files first
-        for cache_file in sorted(self.cache_dir.iterdir()):
-            if cache_file in purged:
-                continue
+        to_delete = set()
+        seen = set()
+        now = time.time()
+        for cache_data_file in self.cache_dir.glob(f"*{CACHE_METADATA_EXT}"):
             try:
-                with cache_file.open('rb') as f:
-                    cache_data = pickle.load(f)
-            except IOError:
+                with cache_data_file.open("r") as f:
+                    cache_data = CacheMetadata.model_validate_json(f.read())
+            except (IOError, ValidationError):
                 log.warning(
-                    _("Can't read metadata file at {path}")
-                    .format(path=cache_file))
-                continue
-            except (pickle.UnpicklingError, EOFError):
-                log.debug(f"File at {cache_file} is not a metadata file")
+                    _("Can't read metadata file at {path}, deleting it.").format(
+                        path=cache_data_file
+                    )
+                )
+                to_delete.add(cache_data_file)
                 continue
-            try:
-                eol = cache_data['eol']
-                filename = cache_data['filename']
-            except KeyError:
-                log.warning(
-                    _("Invalid cache metadata at {path}")
-                    .format(path=cache_file))
-                continue
+            else:
+                cached_file = self.get_path(cache_data.filename)
+                if not cached_file.exists():
+                    log.warning(
+                        f"Cache file {cache_data_file!r} references a non-existent file "
+                        f"and will be deleted: {cached_file!r}."
+                    )
+                    to_delete.add(cache_data_file)
+                elif cache_data.eol < now:
+                    log.debug(
+                        f"Purging expired cache file {cache_data_file!r} (expired for "
+                        f"{int(now - cache_data.eol)}s)."
+                    )
+                    to_delete.add(cache_data_file)
+                seen.add(cached_file)
+                seen.add(cache_data_file)
 
-            filepath = self.getPath(filename)
+        for file in to_delete:
+            log.debug(f"Deleting cache file: {file}")
+            file.unlink()
 
-            if not filepath.exists():
-                log.warning(_(
-                    "cache {cache_file!r} references an inexisting file: {filepath!r}"
-                ).format(cache_file=str(cache_file), filepath=str(filepath)))
-                log.debug("purging cache with missing file")
-                cache_file.unlink()
-            elif eol < time.time():
-                log.debug(
-                    "purging expired cache {filepath!r} (expired for {time}s)"
-                    .format(filepath=str(filepath), time=int(time.time() - eol))
-                )
-                cache_file.unlink()
-                try:
-                    filepath.unlink()
-                except FileNotFoundError:
-                    log.warning(
-                        _("following file is missing while purging cache: {path}")
-                        .format(path=filepath)
-                    )
-                purged.add(cache_file)
-                purged.add(filepath)
+        for file in self.cache_dir.iterdir():
+            if file not in seen:
+                log.debug(f"Deleting irrelevant file in cache dir: {file}")
+                file.unlink()
 
-    def getPath(self, filename: str) -> Path:
-        """return cached file URL
+    def get_path(self, filename: str) -> Path:
+        """Return cached file URL.
 
         @param filename: cached file name (cache data or actual file)
         @return: path to the cached file
@@ -121,62 +126,58 @@
             raise exceptions.DataError("Invalid char found")
         return self.cache_dir / filename
 
-    def get_metadata(self, uid: str, update_eol: bool = True) -> Optional[Dict[str, Any]]:
-        """Retrieve metadata for cached data
+    def get_metadata(self, uid: str, update_eol: bool = True) -> dict[str, Any] | None:
+        """Retrieve metadata for cached data.
 
-        @param uid(unicode): unique identifier of file
-        @param update_eol(bool): True if eol must extended
+        @param uid: unique identifier of cache metadata.
+        @param update_eol: True if eol must be extended
             if True, max_age will be added to eol (only if it is not already expired)
-        @return (dict, None): metadata with following keys:
-            see [cache_data] for data details, an additional "path" key is the full path to
-            cached file.
-            None if file is not in cache (or cache is invalid)
+        @return: metadata, see [cache_data] for data details, an additional "path" key is
            the full path to the cached file.
+            None if file is not in cache (or cache is invalid).
         """
-
         uid = uid.strip()
         if not uid:
             raise exceptions.InternalError("uid must not be empty")
-        cache_url = self.getPath(uid)
+        cache_url = self.get_path(f"{uid}{CACHE_METADATA_EXT}")
        if not cache_url.exists():
             return None
 
         try:
-            with cache_url.open("rb") as f:
-                cache_data = pickle.load(f)
+            with cache_url.open("r") as f:
+                cache_data = CacheMetadata.model_validate_json(f.read())
         except (IOError, EOFError) as e:
-            log.warning(f"can't read cache at {cache_url}: {e}")
+            log.warning(f"Can't read cache at {cache_url}: {e}")
             return None
-        except pickle.UnpicklingError:
-            log.warning(f"invalid cache found at {cache_url}")
+        except ValidationError:
+            log.warning(f"Invalid cache found at {cache_url}")
+            return None
+        except UnicodeDecodeError as e:
+            log.warning(f"Invalid encoding, {cache_url} is not a cache metadata file: {e}")
             return None
 
-        try:
-            eol = cache_data["eol"]
-        except KeyError:
-            log.warning("no End Of Life found for cached file {}".format(uid))
-            eol = 0
-        if eol < time.time():
+        if cache_data.eol < time.time():
             log.debug(
-                "removing expired cache (expired for {}s)".format(time.time() - eol)
+                "removing expired cache (expired for {}s)".format(
+                    time.time() - cache_data.eol
+                )
             )
             return None
 
         if update_eol:
-            try:
-                max_age = cache_data["max_age"]
-            except KeyError:
-                log.warning(f"no max_age found for cache at {cache_url}, using default")
-                max_age = cache_data["max_age"] = C.DEFAULT_MAX_AGE
             now = int(time.time())
-            cache_data["last_access"] = now
-            cache_data["eol"] = now + max_age
-            with cache_url.open("wb") as f:
-                pickle.dump(cache_data, f, protocol=2)
+            cache_data.last_access = now
+            cache_data.eol = now + cache_data.max_age
+            with cache_url.open("w") as f:
+                f.write(cache_data.model_dump_json(exclude_none=True))
 
-        cache_data["path"] = self.getPath(cache_data["filename"])
-        return cache_data
+        # FIXME: we convert to dict to be compatible with the former (pre-Pydantic)
+        #   method. All calls to get_metadata should use the Pydantic model directly
+        #   in the future.
+        cache_data_dict = cache_data.model_dump()
+        cache_data_dict["path"] = self.get_path(cache_data.filename)
+        return cache_data_dict
 
-    def get_file_path(self, uid: str) -> Path:
+    def get_file_path(self, uid: str) -> Path | None:
         """Retrieve absolute path to file
 
         @param uid(unicode): unique identifier of file
@@ -187,7 +188,7 @@
         if metadata is not None:
             return metadata["path"]
 
-    def remove_from_cache(self, uid, metadata=None):
+    def remove_from_cache(self, uid: str, metadata=None) -> None:
         """Remove data from cache
 
         @param uid(unicode): unique identifier cache file
@@ -198,32 +199,33 @@
             return
 
         try:
-            filename = cache_data['filename']
+            filename = cache_data["filename"]
         except KeyError:
-            log.warning(_("missing filename for cache {uid!r}") .format(uid=uid))
+            log.warning(_("missing filename for cache {uid!r}").format(uid=uid))
         else:
-            filepath = self.getPath(filename)
+            filepath = self.get_path(filename)
             try:
                 filepath.unlink()
             except FileNotFoundError:
                 log.warning(
-                    _("missing file referenced in cache {uid!r}: {filename}")
-                    .format(uid=uid, filename=filename)
+                    _("missing file referenced in cache {uid!r}: {filename}").format(
+                        uid=uid, filename=filename
+                    )
                 )
 
-        cache_file = self.getPath(uid)
+        cache_file = self.get_path(f"{uid}{CACHE_METADATA_EXT}")
         cache_file.unlink()
-        log.debug(f"cache with uid {uid!r} has been removed")
+        log.debug(f"Cache with uid {uid!r} has been removed.")
 
     def cache_data(
         self,
         source: str,
         uid: str,
-        mime_type: Optional[str] = None,
-        max_age: Optional[int] = None,
-        original_filename: Optional[str] = None
+        mime_type: str | None = None,
+        max_age: int = C.DEFAULT_MAX_AGE,
+        original_filename: str | None = None,
     ) -> BufferedIOBase:
-        """create cache metadata and file object to use for actual data
+        """Create cache metadata and file object to use for actual data.
 
         @param source: source of the cache (should be plugin's import_name)
         @param uid: an identifier of the file which must be unique
@@ -235,47 +237,42 @@
            None to use default value
            0 to ignore cache (file will be re-downloaded on each access)
         @param original_filename: if not None, will be used to retrieve file extension and
-            guess
-            mime type, and stored in "original_filename"
+            guess mime type, and stored in "original_filename"
         @return: file object opened in write mode
            you have to close it yourself (hint: use ``with`` statement)
         """
-        if max_age is None:
-            max_age = C.DEFAULT_MAX_AGE
-        cache_data = {
-            "source": source,
-            # we also store max_age for updating eol
-            "max_age": max_age,
-        }
-        cache_url = self.getPath(uid)
-        if original_filename is not None:
-            cache_data["original_filename"] = original_filename
-            if mime_type is None:
-                # we have original_filename but not MIME type, we try to guess the later
-                mime_type = mimetypes.guess_type(original_filename, strict=False)[0]
+        if original_filename is not None and mime_type is None:
+            # we have original_filename but not MIME type, we try to guess the latter
+            mime_type = mimetypes.guess_type(original_filename, strict=False)[0]
+
         if mime_type:
             ext = mimetypes.guess_extension(mime_type, strict=False)
             if ext is None:
-                log.warning(
-                    "can't find extension for MIME type {}".format(mime_type)
-                )
+                log.warning("can't find extension for MIME type {}".format(mime_type))
                ext = DEFAULT_EXT
            elif ext == ".jpe":
                ext = ".jpg"
         else:
             ext = DEFAULT_EXT
             mime_type = None
+
         filename = uid + ext
         now = int(time.time())
-        cache_data.update({
-            "filename": filename,
-            "creation": now,
-            "eol": now + max_age,
-            "mime_type": mime_type,
-        })
-        file_path = self.getPath(filename)
+        metadata = CacheMetadata(
+            source=source,
+            uid=uid,
+            mime_type=mime_type,
+            max_age=max_age,
+            original_filename=original_filename,
+            filename=filename,
+            creation=now,
+            eol=now + max_age,
+        )
 
-        with open(cache_url, "wb") as f:
-            pickle.dump(cache_data, f, protocol=2)
+        cache_metadata_file = self.get_path(f"{uid}{CACHE_METADATA_EXT}")
+        file_path = self.get_path(filename)
 
-        return file_path.open("wb")
+        with open(cache_metadata_file, "w") as f:
+            f.write(metadata.model_dump_json(exclude_none=True))
+
+        return open(file_path, "wb")
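
As a usage sketch of the new JSON-backed cache (assuming `host` and `profile` objects as
expected by `Cache.__init__`, and placeholder values for the other names):

    from libervia.backend.memory.cache import Cache

    cache = Cache(host, profile)

    # cache_data() writes a "<uid>.cache.json" metadata file and returns the
    # actual cache file opened in write mode; closing it is the caller's job.
    with cache.cache_data(
        source="some_plugin", uid="some-unique-id", mime_type="image/jpeg"
    ) as f:
        f.write(image_data)

    # get_metadata() validates and deserialises the JSON metadata (extending
    # its eol by max_age) and returns a dict with an extra "path" key.
    metadata = cache.get_metadata("some-unique-id")
    if metadata is not None:
        data = metadata["path"].read_bytes()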