diff libervia/backend/memory/cache.py @ 4212:5f2d496c633f

core: get rid of `pickle`: Use of `pickle` to serialise data was a technical legacy that was causing trouble: it made data hard to store in the database, fragile to update (if a class was serialised, a change to that class could break the upgrade), and it was a security risk (unpickling can lead to arbitrary code execution). This patch removes all use of `pickle` in favour of JSON, notably:
- for caching data, a Pydantic model is now used instead
- for SQLAlchemy models, `LegacyPickle` is replaced by JSON serialisation
- in XEP-0373, a `PublicKeyMetadata` class was serialised; new `from_dict` and `to_dict` methods have been implemented to handle serialisation
- new methods to (de)serialise data can now be specified with Identity data types; this is notably used to (de)serialise the `path` of avatars
A migration script has been created to convert data (for upgrade or downgrade), with special care for the XEP-0373 case. Depending on the size of the database, this migration script can take a long time to run. rel 443
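The migration script itself is not part of this file's diff. As a rough illustration only, converting one cache directory from the old pickled metadata (removed below) to the new JSON format could look like the following sketch; the filename conventions come from this patch, while the default max age value and the error handling are assumptions:

```python
# Hypothetical sketch, NOT the actual migration script shipped with this patch.
# Old metadata files were pickled dicts named after their uid (no extension);
# new ones are JSON files named "<uid>.cache.json".
import json
import pickle
import time
from pathlib import Path

CACHE_METADATA_EXT = ".cache.json"
DEFAULT_MAX_AGE = 1209600  # assumption, stands in for C.DEFAULT_MAX_AGE

def migrate_cache_dir(cache_dir: Path) -> None:
    for path in cache_dir.iterdir():
        if not path.is_file() or path.suffix:
            continue  # data files have an extension, old metadata files don't
        try:
            with path.open("rb") as f:
                data = pickle.load(f)
        except (pickle.UnpicklingError, EOFError):
            continue  # not a pickled metadata file, leave it alone
        now = int(time.time())
        new_data = {
            "source": data.get("source", ""),
            "uid": path.name,
            "filename": data["filename"],
            "creation": data.get("creation", now),
            "eol": data.get("eol", now),
            "max_age": data.get("max_age", DEFAULT_MAX_AGE),
        }
        if data.get("mime_type"):
            new_data["mime_type"] = data["mime_type"]
        if data.get("original_filename"):
            new_data["original_filename"] = data["original_filename"]
        path.with_name(f"{path.name}{CACHE_METADATA_EXT}").write_text(
            json.dumps(new_data)
        )
        path.unlink()
```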
author Goffi <goffi@goffi.org>
date Fri, 23 Feb 2024 13:31:04 +0100
parents 4b842c1fb686
children e11b13418ba6
line wrap: on
line diff
--- a/libervia/backend/memory/cache.py	Fri Feb 16 18:46:06 2024 +0100
+++ b/libervia/backend/memory/cache.py	Fri Feb 23 13:31:04 2024 +0100
@@ -1,8 +1,7 @@
 #!/usr/bin/env python3
 
-
-# SAT: a jabber client
-# Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)
+# Libervia: an XMPP client
+# Copyright (C) 2009-2024 Jérôme Poisson (goffi@goffi.org)
 
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as published by
@@ -20,9 +19,10 @@
 from io import BufferedIOBase
 import mimetypes
 from pathlib import Path
-import pickle as pickle
 import time
-from typing import Any, Dict, Optional
+from typing import Any
+
+from pydantic import BaseModel, ValidationError
 
 from libervia.backend.core import exceptions
 from libervia.backend.core.constants import Const as C
@@ -33,11 +33,24 @@
 
 log = getLogger(__name__)
 
+CACHE_METADATA_EXT = ".cache.json"
 DEFAULT_EXT = ".raw"
 
 
-class Cache(object):
-    """generic file caching"""
+class CacheMetadata(BaseModel):
+    source: str
+    uid: str
+    filename: str
+    creation: int
+    eol: int
+    max_age: int = C.DEFAULT_MAX_AGE
+    original_filename: str | None = None
+    mime_type: str | None = None
+    last_access: int | None = None
+
+
+class Cache:
+    """Generic file caching."""
 
     def __init__(self, host, profile):
         """
@@ -56,60 +69,52 @@
         self.purge()
 
     def purge(self):
-        # remove expired files from cache
+        # Remove expired, unreadable, and unrelated files from cache
         # TODO: this should not be called only on startup, but at regular interval
         #   (e.g. once a day)
-        purged = set()
-        # we sort files to have metadata files first
-        for cache_file in sorted(self.cache_dir.iterdir()):
-            if cache_file in purged:
-                continue
+        to_delete = set()
+        seen = set()
+        now = time.time()
+        for cache_data_file in self.cache_dir.glob(f"*{CACHE_METADATA_EXT}"):
             try:
-                with cache_file.open('rb') as f:
-                    cache_data = pickle.load(f)
-            except IOError:
+                with cache_data_file.open("r") as f:
+                    cache_data = CacheMetadata.model_validate_json(f.read())
+            except (IOError, ValidationError):
                 log.warning(
-                    _("Can't read metadata file at {path}")
-                    .format(path=cache_file))
-                continue
-            except (pickle.UnpicklingError, EOFError):
-                log.debug(f"File at {cache_file} is not a metadata file")
+                    _("Can't read metadata file at {path}, deleting it.").format(
+                        path=cache_data_file
+                    )
+                )
+                to_delete.add(cache_data_file)
                 continue
-            try:
-                eol = cache_data['eol']
-                filename = cache_data['filename']
-            except KeyError:
-                log.warning(
-                    _("Invalid cache metadata at {path}")
-                    .format(path=cache_file))
-                continue
+            else:
+                cached_file = self.get_path(cache_data.filename)
+                if not cached_file.exists():
+                    log.warning(
+                        f"Cache file {cache_data_file!r} references a non-existent file "
+                        f"and will be deleted: {cache_data_file!r}."
+                    )
+                    to_delete.add(cache_data_file)
+                elif cache_data.eol < now:
+                    log.debug(
+                        f"Purging expired cache file {cache_data_file!r} (expired for "
+                        f"{int(now - cache_data.eol)}s)."
+                    )
+                    to_delete.add(cache_data_file)
+                seen.add(cached_file)
+            seen.add(cache_data_file)
 
-            filepath = self.getPath(filename)
+        for file in to_delete:
+            log.debug(f"Deleting cache file: {file}")
+            file.unlink()
 
-            if not filepath.exists():
-                log.warning(_(
-                    "cache {cache_file!r} references an inexisting file: {filepath!r}"
-                ).format(cache_file=str(cache_file), filepath=str(filepath)))
-                log.debug("purging cache with missing file")
-                cache_file.unlink()
-            elif eol < time.time():
-                log.debug(
-                    "purging expired cache {filepath!r} (expired for {time}s)"
-                    .format(filepath=str(filepath), time=int(time.time() - eol))
-                )
-                cache_file.unlink()
-                try:
-                    filepath.unlink()
-                except FileNotFoundError:
-                    log.warning(
-                        _("following file is missing while purging cache: {path}")
-                        .format(path=filepath)
-                    )
-                purged.add(cache_file)
-                purged.add(filepath)
+        for file in self.cache_dir.iterdir():
+            if file not in seen:
+                log.debug(f"Deleting irrelevant file in cache dir: {file}")
+                file.unlink()
 
-    def getPath(self, filename: str) -> Path:
-        """return cached file URL
+    def get_path(self, filename: str) -> Path:
+        """Return cached file URL.
 
         @param filename: cached file name (cache data or actual file)
         @return: path to the cached file
@@ -121,62 +126,58 @@
             raise exceptions.DataError("Invalid char found")
         return self.cache_dir / filename
 
-    def get_metadata(self, uid: str, update_eol: bool = True) -> Optional[Dict[str, Any]]:
-        """Retrieve metadata for cached data
+    def get_metadata(self, uid: str, update_eol: bool = True) -> dict[str, Any] | None:
+        """Retrieve metadata for cached data.
 
-        @param uid(unicode): unique identifier of file
-        @param update_eol(bool): True if eol must extended
+        @param uid: unique identifier of cache metadata.
+        @param update_eol: True if eol must be extended;
             if True, max_age will be added to eol (only if it is not already expired)
-        @return (dict, None): metadata with following keys:
-            see [cache_data] for data details, an additional "path" key is the full path to
-            cached file.
-            None if file is not in cache (or cache is invalid)
+        @return: metadata, see [cache_data] for data details; an additional "path" key
+            gives the full path to the cached file.
+            None if file is not in cache (or cache is invalid).
         """
-
         uid = uid.strip()
         if not uid:
             raise exceptions.InternalError("uid must not be empty")
-        cache_url = self.getPath(uid)
+        cache_url = self.get_path(f"{uid}{CACHE_METADATA_EXT}")
         if not cache_url.exists():
             return None
 
         try:
-            with cache_url.open("rb") as f:
-                cache_data = pickle.load(f)
+            with cache_url.open("r") as f:
+                cache_data = CacheMetadata.model_validate_json(f.read())
         except (IOError, EOFError) as e:
-            log.warning(f"can't read cache at {cache_url}: {e}")
+            log.warning(f"Can't read cache at {cache_url}: {e}")
             return None
-        except pickle.UnpicklingError:
-            log.warning(f"invalid cache found at {cache_url}")
+        except ValidationError:
+            log.warning(f"Invalid cache found at {cache_url}")
+            return None
+        except UnicodeDecodeError as e:
+            log.warning(f"Invalid encoding, this is not a cache metadata file.")
             return None
 
-        try:
-            eol = cache_data["eol"]
-        except KeyError:
-            log.warning("no End Of Life found for cached file {}".format(uid))
-            eol = 0
-        if eol < time.time():
+        if cache_data.eol < time.time():
             log.debug(
-                "removing expired cache (expired for {}s)".format(time.time() - eol)
+                "removing expired cache (expired for {}s)".format(
+                    time.time() - cache_data.eol
+                )
             )
             return None
 
         if update_eol:
-            try:
-                max_age = cache_data["max_age"]
-            except KeyError:
-                log.warning(f"no max_age found for cache at {cache_url}, using default")
-                max_age = cache_data["max_age"] = C.DEFAULT_MAX_AGE
             now = int(time.time())
-            cache_data["last_access"] = now
-            cache_data["eol"] = now + max_age
-            with cache_url.open("wb") as f:
-                pickle.dump(cache_data, f, protocol=2)
+            cache_data.last_access = now
+            cache_data.eol = now + cache_data.max_age
+            with cache_url.open("w") as f:
+                f.write(cache_data.model_dump_json(exclude_none=True))
 
-        cache_data["path"] = self.getPath(cache_data["filename"])
-        return cache_data
+        # FIXME: we convert to a dict to stay compatible with the pre-Pydantic method.
+        #   All calls to get_metadata should use the Pydantic model directly in the future.
+        cache_data_dict = cache_data.model_dump()
+        cache_data_dict["path"] = self.get_path(cache_data.filename)
+        return cache_data_dict
 
-    def get_file_path(self, uid: str) -> Path:
+    def get_file_path(self, uid: str) -> Path | None:
         """Retrieve absolute path to file
 
         @param uid(unicode): unique identifier of file
@@ -187,7 +188,7 @@
         if metadata is not None:
             return metadata["path"]
 
-    def remove_from_cache(self, uid, metadata=None):
+    def remove_from_cache(self, uid: str, metadata=None) -> None:
         """Remove data from cache
 
         @param uid(unicode): unique identifier cache file
@@ -198,32 +199,33 @@
             return
 
         try:
-            filename = cache_data['filename']
+            filename = cache_data["filename"]
         except KeyError:
-            log.warning(_("missing filename for cache {uid!r}") .format(uid=uid))
+            log.warning(_("missing filename for cache {uid!r}").format(uid=uid))
         else:
-            filepath = self.getPath(filename)
+            filepath = self.get_path(filename)
             try:
                 filepath.unlink()
             except FileNotFoundError:
                 log.warning(
-                    _("missing file referenced in cache {uid!r}: {filename}")
-                    .format(uid=uid, filename=filename)
+                    _("missing file referenced in cache {uid!r}: {filename}").format(
+                        uid=uid, filename=filename
+                    )
                 )
 
-        cache_file = self.getPath(uid)
+        cache_file = self.get_path(f"{uid}{CACHE_METADATA_EXT}")
         cache_file.unlink()
-        log.debug(f"cache with uid {uid!r} has been removed")
+        log.debug(f"Cache with uid {uid!r} has been removed.")
 
     def cache_data(
         self,
         source: str,
         uid: str,
-        mime_type: Optional[str] = None,
-        max_age: Optional[int] = None,
-        original_filename: Optional[str] = None
+        mime_type: str | None = None,
+        max_age: int = C.DEFAULT_MAX_AGE,
+        original_filename: str | None = None,
     ) -> BufferedIOBase:
-        """create cache metadata and file object to use for actual data
+        """Create cache metadata and file object to use for actual data.
 
         @param source: source of the cache (should be plugin's import_name)
         @param uid: an identifier of the file which must be unique
@@ -235,47 +237,42 @@
             None to use default value
             0 to ignore cache (file will be re-downloaded on each access)
         @param original_filename: if not None, will be used to retrieve file extension and
-            guess
-            mime type, and stored in "original_filename"
+            guess mime type, and stored in "original_filename"
         @return: file object opened in write mode
             you have to close it yourself (hint: use ``with`` statement)
         """
-        if max_age is None:
-            max_age = C.DEFAULT_MAX_AGE
-        cache_data = {
-            "source": source,
-            # we also store max_age for updating eol
-            "max_age": max_age,
-        }
-        cache_url = self.getPath(uid)
-        if original_filename is not None:
-            cache_data["original_filename"] = original_filename
-            if mime_type is None:
-                # we have original_filename but not MIME type, we try to guess the later
-                mime_type = mimetypes.guess_type(original_filename, strict=False)[0]
+        if original_filename is not None and mime_type is None:
+            # we have original_filename but not MIME type, so we try to guess the latter
+            mime_type = mimetypes.guess_type(original_filename, strict=False)[0]
+
         if mime_type:
             ext = mimetypes.guess_extension(mime_type, strict=False)
             if ext is None:
-                log.warning(
-                    "can't find extension for MIME type {}".format(mime_type)
-                )
+                log.warning("can't find extension for MIME type {}".format(mime_type))
                 ext = DEFAULT_EXT
             elif ext == ".jpe":
                 ext = ".jpg"
         else:
             ext = DEFAULT_EXT
             mime_type = None
+
         filename = uid + ext
         now = int(time.time())
-        cache_data.update({
-            "filename": filename,
-            "creation": now,
-            "eol": now + max_age,
-            "mime_type": mime_type,
-        })
-        file_path = self.getPath(filename)
+        metadata = CacheMetadata(
+            source=source,
+            uid=uid,
+            mime_type=mime_type,
+            max_age=max_age,
+            original_filename=original_filename,
+            filename=filename,
+            creation=now,
+            eol=now + max_age,
+        )
 
-        with open(cache_url, "wb") as f:
-            pickle.dump(cache_data, f, protocol=2)
+        cache_metadata_file = self.get_path(f"{uid}{CACHE_METADATA_EXT}")
+        file_path = self.get_path(filename)
 
-        return file_path.open("wb")
+        with open(cache_metadata_file, "w") as f:
+            f.write(metadata.model_dump_json(exclude_none=True))
+
+        return open(file_path, "wb")
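Taken together, a typical write/read cycle with the reworked API could look like the sketch below; `host`, `profile`, the plugin name and the payload are placeholders:

```python
# Illustrative usage of the reworked cache (names are placeholders).
cache = Cache(host, profile)

# cache_data() writes a "<uid>.cache.json" metadata file and returns a file
# object for the payload, which the caller must close (hence the ``with``).
with cache.cache_data(
    "some_plugin", "some-uid", mime_type="image/png", original_filename="avatar.png"
) as f:
    f.write(image_bytes)

# get_metadata() still returns a dict (see the FIXME above); "path" points to
# the cached payload.
metadata = cache.get_metadata("some-uid")
if metadata is not None:
    data = metadata["path"].read_bytes()
```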