comparison libervia/backend/memory/cache.py @ 4212:5f2d496c633f

core: get rid of `pickle`:

Use of `pickle` to serialise data was a technical legacy that caused trouble for
storage in the database, for updates (if a serialised class changed, the update could
break), and for security (unpickling untrusted data can lead to arbitrary code
execution). This patch removes all use of pickle in favour of JSON, notably:

- for caching data, a Pydantic model is now used instead;
- for SQLAlchemy models, LegacyPickle is replaced by JSON serialisation;
- in XEP-0373, a `PublicKeyMetadata` class was serialised; new `from_dict` and
  `to_dict` methods have been implemented to handle serialisation;
- new methods to (de)serialise data can now be specified with Identity data types;
  this is notably used to (de)serialise the `path` of avatars.

A migration script has been created to convert data (for upgrade or downgrade), with
special care for the XEP-0373 case. Depending on the size of the database, this
migration script can take a long time to run.

rel 443
author Goffi <goffi@goffi.org>
date Fri, 23 Feb 2024 13:31:04 +0100
parents 4b842c1fb686
children e11b13418ba6
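The key pattern of the change, sketched minimally for reference (the sample values
are hypothetical and not part of the changeset; `CacheMetadata` is the model
introduced in cache.py below):

    from pydantic import BaseModel, ValidationError

    class CacheMetadata(BaseModel):
        # Subset of the fields defined in cache.py below.
        source: str
        uid: str
        filename: str
        creation: int
        eol: int

    # Before: metadata was read with pickle.load(); unpickling untrusted bytes can
    # execute arbitrary code.
    # After: JSON is parsed into a validated model; malformed input raises
    # ValidationError instead of running code, and is treated as a cache miss.
    raw = '{"source": "test", "uid": "123", "filename": "123.raw", "creation": 0, "eol": 0}'
    try:
        metadata = CacheMetadata.model_validate_json(raw)
    except ValidationError:
        metadata = None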
diff -r be89ab1cbca4 -r 5f2d496c633f libervia/backend/memory/cache.py
--- a/libervia/backend/memory/cache.py
+++ b/libervia/backend/memory/cache.py
@@ -1,10 +1,9 @@
 #!/usr/bin/env python3
 
-
-# SAT: a jabber client
-# Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)
+# Libervia: an XMPP client
+# Copyright (C) 2009-2024 Jérôme Poisson (goffi@goffi.org)
 
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
@@ -18,28 +17,42 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 from io import BufferedIOBase
 import mimetypes
 from pathlib import Path
-import pickle as pickle
 import time
-from typing import Any, Dict, Optional
+from typing import Any
+
+from pydantic import BaseModel, ValidationError
 
 from libervia.backend.core import exceptions
 from libervia.backend.core.constants import Const as C
 from libervia.backend.core.i18n import _
 from libervia.backend.core.log import getLogger
 from libervia.backend.tools.common import regex
 
 
 log = getLogger(__name__)
 
+CACHE_METADATA_EXT = ".cache.json"
 DEFAULT_EXT = ".raw"
 
 
-class Cache(object):
-    """generic file caching"""
+class CacheMetadata(BaseModel):
+    source: str
+    uid: str
+    filename: str
+    creation: int
+    eol: int
+    max_age: int = C.DEFAULT_MAX_AGE
+    original_filename: str | None = None
+    mime_type: str | None = None
+    last_access: int | None = None
+
+
+class Cache:
+    """Generic file caching."""
 
     def __init__(self, host, profile):
         """
         @param profile(unicode, None): name of the profile to set the cache for
             if None, the cache will be common for all profiles
@@ -54,178 +67,167 @@
 
         self.cache_dir.mkdir(0o700, parents=True, exist_ok=True)
         self.purge()
 
     def purge(self):
-        # remove expired files from cache
+        # Remove expired, unreadable, and unrelated files from cache
         # TODO: this should not be called only on startup, but at regular interval
         #   (e.g. once a day)
-        purged = set()
-        # we sort files to have metadata files first
-        for cache_file in sorted(self.cache_dir.iterdir()):
-            if cache_file in purged:
+        to_delete = set()
+        seen = set()
+        now = time.time()
+        for cache_data_file in self.cache_dir.glob(f"*{CACHE_METADATA_EXT}"):
+            try:
+                with cache_data_file.open("r") as f:
+                    cache_data = CacheMetadata.model_validate_json(f.read())
+            except (IOError, ValidationError):
+                log.warning(
+                    _("Can't read metadata file at {path}, deleting it.").format(
+                        path=cache_data_file
+                    )
+                )
+                to_delete.add(cache_data_file)
                 continue
-            try:
-                with cache_file.open('rb') as f:
-                    cache_data = pickle.load(f)
-            except IOError:
-                log.warning(
-                    _("Can't read metadata file at {path}")
-                    .format(path=cache_file))
-                continue
-            except (pickle.UnpicklingError, EOFError):
-                log.debug(f"File at {cache_file} is not a metadata file")
-                continue
-            try:
-                eol = cache_data['eol']
-                filename = cache_data['filename']
-            except KeyError:
-                log.warning(
-                    _("Invalid cache metadata at {path}")
-                    .format(path=cache_file))
-                continue
-
-            filepath = self.getPath(filename)
-
-            if not filepath.exists():
-                log.warning(_(
-                    "cache {cache_file!r} references an inexisting file: {filepath!r}"
-                ).format(cache_file=str(cache_file), filepath=str(filepath)))
-                log.debug("purging cache with missing file")
-                cache_file.unlink()
-            elif eol < time.time():
-                log.debug(
-                    "purging expired cache {filepath!r} (expired for {time}s)"
-                    .format(filepath=str(filepath), time=int(time.time() - eol))
-                )
-                cache_file.unlink()
-                try:
-                    filepath.unlink()
-                except FileNotFoundError:
-                    log.warning(
-                        _("following file is missing while purging cache: {path}")
-                        .format(path=filepath)
-                    )
-            purged.add(cache_file)
-            purged.add(filepath)
+            else:
+                cached_file = self.get_path(cache_data.filename)
+                if not cached_file.exists():
+                    log.warning(
+                        f"Cache metadata file {cache_data_file!r} references a "
+                        f"non-existent file {str(cached_file)!r} and will be deleted."
+                    )
+                    to_delete.add(cache_data_file)
+                elif cache_data.eol < now:
+                    log.debug(
+                        f"Purging expired cache file {cache_data_file!r} (expired for "
+                        f"{int(now - cache_data.eol)}s)"
+                    )
+                    to_delete.add(cache_data_file)
+                seen.add(cached_file)
+                seen.add(cache_data_file)
+
+        for file in to_delete:
+            log.debug(f"Deleting cache file: {file}")
+            file.unlink()
+
+        for file in self.cache_dir.iterdir():
+            if file not in seen:
+                log.debug(f"Deleting irrelevant file in cache dir: {file}")
+                file.unlink()
 
-    def getPath(self, filename: str) -> Path:
-        """return cached file URL
+    def get_path(self, filename: str) -> Path:
+        """Return cached file URL.
 
         @param filename: cached file name (cache data or actual file)
         @return: path to the cached file
         """
         if not filename or "/" in filename:
             log.warning(
                 "invalid char found in file name, hack attempt? name:{}".format(filename)
             )
             raise exceptions.DataError("Invalid char found")
         return self.cache_dir / filename
 
-    def get_metadata(self, uid: str, update_eol: bool = True) -> Optional[Dict[str, Any]]:
-        """Retrieve metadata for cached data
+    def get_metadata(self, uid: str, update_eol: bool = True) -> dict[str, Any] | None:
+        """Retrieve metadata for cached data.
 
-        @param uid(unicode): unique identifier of file
-        @param update_eol(bool): True if eol must extended
+        @param uid: unique identifier of cache metadata.
+        @param update_eol: True if eol must be extended
             if True, max_age will be added to eol (only if it is not already expired)
-        @return (dict, None): metadata with following keys:
-            see [cache_data] for data details, an additional "path" key is the full path to
-            cached file.
-            None if file is not in cache (or cache is invalid)
+        @return: metadata (see [cache_data] for details); an additional "path" key is
+            the full path to the cached file.
+            None if file is not in cache (or cache is invalid).
         """
-
         uid = uid.strip()
         if not uid:
             raise exceptions.InternalError("uid must not be empty")
-        cache_url = self.getPath(uid)
+        cache_url = self.get_path(f"{uid}{CACHE_METADATA_EXT}")
         if not cache_url.exists():
            return None
 
         try:
-            with cache_url.open("rb") as f:
-                cache_data = pickle.load(f)
+            with cache_url.open("r") as f:
+                cache_data = CacheMetadata.model_validate_json(f.read())
         except (IOError, EOFError) as e:
-            log.warning(f"can't read cache at {cache_url}: {e}")
+            log.warning(f"Can't read cache at {cache_url}: {e}")
             return None
-        except pickle.UnpicklingError:
-            log.warning(f"invalid cache found at {cache_url}")
+        except ValidationError:
+            log.warning(f"Invalid cache found at {cache_url}")
             return None
-
-        try:
-            eol = cache_data["eol"]
-        except KeyError:
-            log.warning("no End Of Life found for cached file {}".format(uid))
-            eol = 0
-        if eol < time.time():
+        except UnicodeDecodeError:
+            log.warning(f"Invalid encoding, {cache_url} is not a cache metadata file.")
+            return None
+
+        if cache_data.eol < time.time():
             log.debug(
-                "removing expired cache (expired for {}s)".format(time.time() - eol)
+                "removing expired cache (expired for {}s)".format(
+                    time.time() - cache_data.eol
+                )
             )
             return None
 
         if update_eol:
-            try:
-                max_age = cache_data["max_age"]
-            except KeyError:
-                log.warning(f"no max_age found for cache at {cache_url}, using default")
-                max_age = cache_data["max_age"] = C.DEFAULT_MAX_AGE
             now = int(time.time())
-            cache_data["last_access"] = now
-            cache_data["eol"] = now + max_age
-            with cache_url.open("wb") as f:
-                pickle.dump(cache_data, f, protocol=2)
+            cache_data.last_access = now
+            cache_data.eol = now + cache_data.max_age
+            with cache_url.open("w") as f:
+                f.write(cache_data.model_dump_json(exclude_none=True))
 
-        cache_data["path"] = self.getPath(cache_data["filename"])
-        return cache_data
-
-    def get_file_path(self, uid: str) -> Path:
+        # FIXME: we convert to a dict to stay compatible with the former method (pre
+        #   Pydantic). All calls to get_metadata should use the Pydantic model directly.
+        cache_data_dict = cache_data.model_dump()
+        cache_data_dict["path"] = self.get_path(cache_data.filename)
+        return cache_data_dict
+
+    def get_file_path(self, uid: str) -> Path | None:
         """Retrieve absolute path to file
 
         @param uid(unicode): unique identifier of file
         @return (unicode, None): absolute path to cached file
             None if file is not in cache (or cache is invalid)
         """
         metadata = self.get_metadata(uid)
         if metadata is not None:
             return metadata["path"]
 
-    def remove_from_cache(self, uid, metadata=None):
+    def remove_from_cache(self, uid: str, metadata=None) -> None:
         """Remove data from cache
 
         @param uid(unicode): unique identifier cache file
         """
         cache_data = self.get_metadata(uid, update_eol=False)
         if cache_data is None:
             log.debug(f"cache with uid {uid!r} has already expired or been removed")
             return
 
         try:
-            filename = cache_data['filename']
+            filename = cache_data["filename"]
         except KeyError:
-            log.warning(_("missing filename for cache {uid!r}") .format(uid=uid))
+            log.warning(_("missing filename for cache {uid!r}").format(uid=uid))
         else:
-            filepath = self.getPath(filename)
+            filepath = self.get_path(filename)
             try:
                 filepath.unlink()
             except FileNotFoundError:
                 log.warning(
-                    _("missing file referenced in cache {uid!r}: {filename}")
-                    .format(uid=uid, filename=filename)
+                    _("missing file referenced in cache {uid!r}: {filename}").format(
+                        uid=uid, filename=filename
+                    )
                 )
 
-        cache_file = self.getPath(uid)
+        cache_file = self.get_path(f"{uid}{CACHE_METADATA_EXT}")
         cache_file.unlink()
-        log.debug(f"cache with uid {uid!r} has been removed")
+        log.debug(f"Cache with uid {uid!r} has been removed.")
 
     def cache_data(
         self,
         source: str,
         uid: str,
-        mime_type: Optional[str] = None,
-        max_age: Optional[int] = None,
-        original_filename: Optional[str] = None
+        mime_type: str | None = None,
+        max_age: int = C.DEFAULT_MAX_AGE,
+        original_filename: str | None = None,
     ) -> BufferedIOBase:
-        """create cache metadata and file object to use for actual data
+        """Create cache metadata and file object to use for actual data.
 
         @param source: source of the cache (should be plugin's import_name)
         @param uid: an identifier of the file which must be unique
         @param mime_type: MIME type of the file to cache
             it will be used notably to guess file extension
@@ -233,49 +235,44 @@
         @param max_age: maximum age in seconds
             the cache metadata will have an "eol" (end of life)
-            None to use default value
+            by default, C.DEFAULT_MAX_AGE is used
             0 to ignore cache (file will be re-downloaded on each access)
         @param original_filename: if not None, will be used to retrieve file extension and
-            guess
-            mime type, and stored in "original_filename"
+            guess mime type, and stored in "original_filename"
         @return: file object opened in write mode
             you have to close it yourself (hint: use ``with`` statement)
         """
-        if max_age is None:
-            max_age = C.DEFAULT_MAX_AGE
-        cache_data = {
-            "source": source,
-            # we also store max_age for updating eol
-            "max_age": max_age,
-        }
-        cache_url = self.getPath(uid)
-        if original_filename is not None:
-            cache_data["original_filename"] = original_filename
-            if mime_type is None:
-                # we have original_filename but not MIME type, we try to guess the later
-                mime_type = mimetypes.guess_type(original_filename, strict=False)[0]
+        if original_filename is not None and mime_type is None:
+            # we have original_filename but not MIME type, we try to guess the latter
+            mime_type = mimetypes.guess_type(original_filename, strict=False)[0]
+
         if mime_type:
             ext = mimetypes.guess_extension(mime_type, strict=False)
             if ext is None:
-                log.warning(
-                    "can't find extension for MIME type {}".format(mime_type)
-                )
+                log.warning("can't find extension for MIME type {}".format(mime_type))
                 ext = DEFAULT_EXT
             elif ext == ".jpe":
                 ext = ".jpg"
         else:
             ext = DEFAULT_EXT
             mime_type = None
+
         filename = uid + ext
         now = int(time.time())
-        cache_data.update({
-            "filename": filename,
-            "creation": now,
-            "eol": now + max_age,
-            "mime_type": mime_type,
-        })
-        file_path = self.getPath(filename)
-
-        with open(cache_url, "wb") as f:
-            pickle.dump(cache_data, f, protocol=2)
+        metadata = CacheMetadata(
+            source=source,
+            uid=uid,
+            mime_type=mime_type,
+            max_age=max_age,
+            original_filename=original_filename,
+            filename=filename,
+            creation=now,
+            eol=now + max_age,
+        )
 
-        return file_path.open("wb")
+        cache_metadata_file = self.get_path(f"{uid}{CACHE_METADATA_EXT}")
+        file_path = self.get_path(filename)
+
+        with open(cache_metadata_file, "w") as f:
+            f.write(metadata.model_dump_json(exclude_none=True))
+
+        return open(file_path, "wb")
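For reference, a hypothetical usage of the new API (assumes a configured `host`;
`avatar_data` and the uid are placeholders, not values from this changeset):

    cache = Cache(host, profile=None)

    # cache_data() writes "<uid>.cache.json" metadata next to the data file and
    # returns the data file object; the caller is responsible for closing it.
    with cache.cache_data("XEP-0084", "avatar-uid", mime_type="image/png") as f:
        f.write(avatar_data)

    # Lookups go through the JSON metadata; expired or invalid entries yield None.
    path = cache.get_file_path("avatar-uid")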