libervia-backend: comparison of libervia/backend/memory/cache.py @ 4212:5f2d496c633f
core: get rid of `pickle`:
Use of `pickle` to serialise data was a technical legacy that caused trouble for storage
in the database, for updates (if a serialised class changed, the change could break
upgrades), and for security (unpickling can lead to code execution).
This patch removes all use of pickle in favour of JSON, notably:
- for caching data, a Pydantic model is now used instead (see the sketch below)
- for SQLAlchemy models, `LegacyPickle` is replaced by JSON serialisation
- in XEP-0373, a `PublicKeyMetadata` class was serialised. New `from_dict` and `to_dict`
  methods have been implemented to handle serialisation.
- new methods to (de)serialise data can now be specified with Identity data types. This
  is notably used to (de)serialise the `path` of avatars.
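The cache change in the diff below illustrates the general pattern: a typed Pydantic model serialised to JSON text replaces a pickled `dict`. A minimal sketch of the round-trip (field values here are made up for illustration):

```python
from pydantic import BaseModel, ValidationError


class CacheMetadata(BaseModel):
    source: str
    uid: str
    filename: str
    creation: int
    eol: int


metadata = CacheMetadata(
    source="some_plugin",
    uid="abc123",
    filename="abc123.jpg",
    creation=1708000000,
    eol=1708604800,
)

# Writing: JSON text replaces an opaque pickle blob.
serialised = metadata.model_dump_json()

# Reading: validation replaces pickle.load(), so malformed data raises
# ValidationError instead of silently loading (or executing) anything.
try:
    restored = CacheMetadata.model_validate_json(serialised)
except ValidationError:
    restored = None  # treat as an invalid/corrupt cache entry
```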
A migration script has been created to convert data (for upgrade or downgrade), with
special care for the XEP-0373 case. Depending on the size of the database, this migration
script can take a long time to run.
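The migration script itself is not shown on this page; as a rough, hypothetical sketch, the per-row conversion it has to perform looks something like this (which is why it can be slow on large databases):

```python
import json
import pickle


def pickle_to_json(raw: bytes) -> str:
    """Upgrade one column value from a pickle blob to JSON text.

    Unpickling is only acceptable here because we are reading our own
    legacy data one last time, during the migration.
    """
    return json.dumps(pickle.loads(raw))


def json_to_pickle(raw: str) -> bytes:
    """Downgrade one column value from JSON text back to a pickle blob."""
    return pickle.dumps(json.loads(raw))
```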
rel 443
author    Goffi <goffi@goffi.org>
date      Fri, 23 Feb 2024 13:31:04 +0100
parents   4b842c1fb686
children  e11b13418ba6
```diff
--- a/libervia/backend/memory/cache.py  4211:be89ab1cbca4
+++ b/libervia/backend/memory/cache.py  4212:5f2d496c633f
@@ -1,10 +1,9 @@
 #!/usr/bin/env python3
 
-
-# SAT: a jabber client
-# Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)
+# Libervia: an XMPP client
+# Copyright (C) 2009-2024 Jérôme Poisson (goffi@goffi.org)
 
 # This program is free software: you can redistribute it and/or modify
 # it under the terms of the GNU Affero General Public License as published by
 # the Free Software Foundation, either version 3 of the License, or
 # (at your option) any later version.
@@ -18,28 +17,42 @@
 # along with this program. If not, see <http://www.gnu.org/licenses/>.
 
 from io import BufferedIOBase
 import mimetypes
 from pathlib import Path
-import pickle as pickle
 import time
-from typing import Any, Dict, Optional
+from typing import Any
+
+from pydantic import BaseModel, ValidationError
 
 from libervia.backend.core import exceptions
 from libervia.backend.core.constants import Const as C
 from libervia.backend.core.i18n import _
 from libervia.backend.core.log import getLogger
 from libervia.backend.tools.common import regex
 
 
 log = getLogger(__name__)
 
+CACHE_METADATA_EXT = ".cache.json"
 DEFAULT_EXT = ".raw"
 
 
-class Cache(object):
-    """generic file caching"""
+class CacheMetadata(BaseModel):
+    source: str
+    uid: str
+    filename: str
+    creation: int
+    eol: int
+    max_age: int = C.DEFAULT_MAX_AGE
+    original_filename: str | None = None
+    mime_type: str | None = None
+    last_access: int | None = None
+
+
+class Cache:
+    """Generic file caching."""
 
     def __init__(self, host, profile):
         """
         @param profile(unicode, None): name of the profile to set the cache for
             if None, the cache will be common for all profiles
@@ -54,64 +67,56 @@
 
         self.cache_dir.mkdir(0o700, parents=True, exist_ok=True)
         self.purge()
 
     def purge(self):
-        # remove expired files from cache
+        # Remove expired, unreadable, and unrelated files from cache
        # TODO: this should not be called only on startup, but at regular interval
         #   (e.g. once a day)
-        purged = set()
-        # we sort files to have metadata files first
-        for cache_file in sorted(self.cache_dir.iterdir()):
-            if cache_file in purged:
+        to_delete = set()
+        seen = set()
+        now = time.time()
+        for cache_data_file in self.cache_dir.glob(f"*{CACHE_METADATA_EXT}"):
+            try:
+                with cache_data_file.open("r") as f:
+                    cache_data = CacheMetadata.model_validate_json(f.read())
+            except (IOError, ValidationError):
+                log.warning(
+                    _("Can't read metadata file at {path}, deleting it.").format(
+                        path=cache_data_file
+                    )
+                )
+                to_delete.add(cache_data_file)
                 continue
-            try:
-                with cache_file.open('rb') as f:
-                    cache_data = pickle.load(f)
-            except IOError:
-                log.warning(
-                    _("Can't read metadata file at {path}")
-                    .format(path=cache_file))
-                continue
-            except (pickle.UnpicklingError, EOFError):
-                log.debug(f"File at {cache_file} is not a metadata file")
-                continue
-            try:
-                eol = cache_data['eol']
-                filename = cache_data['filename']
-            except KeyError:
-                log.warning(
-                    _("Invalid cache metadata at {path}")
-                    .format(path=cache_file))
-                continue
-
-            filepath = self.getPath(filename)
-
-            if not filepath.exists():
-                log.warning(_(
-                    "cache {cache_file!r} references an inexisting file: {filepath!r}"
-                ).format(cache_file=str(cache_file), filepath=str(filepath)))
-                log.debug("purging cache with missing file")
-                cache_file.unlink()
-            elif eol < time.time():
-                log.debug(
-                    "purging expired cache {filepath!r} (expired for {time}s)"
-                    .format(filepath=str(filepath), time=int(time.time() - eol))
-                )
-                cache_file.unlink()
-                try:
-                    filepath.unlink()
-                except FileNotFoundError:
-                    log.warning(
-                        _("following file is missing while purging cache: {path}")
-                        .format(path=filepath)
-                    )
-            purged.add(cache_file)
-            purged.add(filepath)
-
-    def getPath(self, filename: str) -> Path:
-        """return cached file URL
+            else:
+                cached_file = self.get_path(cache_data.filename)
+                if not cached_file.exists():
+                    log.warning(
+                        f"Cache file {cache_data_file!r} references a non-existent file "
+                        f"and will be deleted: {cache_data_file!r}."
+                    )
+                    to_delete.add(cache_data_file)
+                elif cache_data.eol < now:
+                    log.debug(
+                        "Purging expired cache file {cache_data_file!r} (expired for "
+                        "{time}s)".format(time=int(time.time() - cache_data.eol))
+                    )
+                    to_delete.add(cache_data_file)
+                seen.add(cached_file)
+                seen.add(cache_data_file)
+
+        for file in to_delete:
+            log.debug(f"Deleting cache file: {file}")
+            file.unlink()
+
+        for file in self.cache_dir.iterdir():
+            if file not in seen:
+                log.debug(f"Deleting irrelevant file in cache dir: {file}")
+                file.unlink()
+
+    def get_path(self, filename: str) -> Path:
+        """Return cached file URL.
 
         @param filename: cached file name (cache data or actual file)
         @return: path to the cached file
         """
         if not filename or "/" in filename:
@@ -119,113 +124,110 @@
                 "invalid char found in file name, hack attempt? name:{}".format(filename)
             )
             raise exceptions.DataError("Invalid char found")
         return self.cache_dir / filename
 
-    def get_metadata(self, uid: str, update_eol: bool = True) -> Optional[Dict[str, Any]]:
-        """Retrieve metadata for cached data
+    def get_metadata(self, uid: str, update_eol: bool = True) -> dict[str, Any] | None:
+        """Retrieve metadata for cached data.
 
-        @param uid(unicode): unique identifier of file
-        @param update_eol(bool): True if eol must extended
+        @param uid: unique identifier of cache metadata.
+        @param update_eol: True if eol must extended
             if True, max_age will be added to eol (only if it is not already expired)
-        @return (dict, None): metadata with following keys:
-            see [cache_data] for data details, an additional "path" key is the full path to
-            cached file.
-            None if file is not in cache (or cache is invalid)
-        """
-
+        @return: metadata, see [cache_data] for data details, an additional "path" key is
+            the full path to cached file.
+            None if file is not in cache (or cache is invalid).
+        """
         uid = uid.strip()
         if not uid:
             raise exceptions.InternalError("uid must not be empty")
-        cache_url = self.getPath(uid)
+        cache_url = self.get_path(f"{uid}{CACHE_METADATA_EXT}")
         if not cache_url.exists():
             return None
 
         try:
-            with cache_url.open("rb") as f:
-                cache_data = pickle.load(f)
+            with cache_url.open("r") as f:
+                cache_data = CacheMetadata.model_validate_json(f.read())
         except (IOError, EOFError) as e:
-            log.warning(f"can't read cache at {cache_url}: {e}")
+            log.warning(f"Can't read cache at {cache_url}: {e}")
             return None
-        except pickle.UnpicklingError:
-            log.warning(f"invalid cache found at {cache_url}")
+        except ValidationError:
+            log.warning(f"Invalid cache found at {cache_url}")
             return None
-
-        try:
-            eol = cache_data["eol"]
-        except KeyError:
-            log.warning("no End Of Life found for cached file {}".format(uid))
-            eol = 0
-        if eol < time.time():
-            log.debug(
-                "removing expired cache (expired for {}s)".format(time.time() - eol)
+        except UnicodeDecodeError as e:
+            log.warning(f"Invalid encoding, this is not a cache metadata file.")
+            return None
+
+        if cache_data.eol < time.time():
+            log.debug(
+                "removing expired cache (expired for {}s)".format(
+                    time.time() - cache_data.eol
+                )
             )
             return None
 
         if update_eol:
-            try:
-                max_age = cache_data["max_age"]
-            except KeyError:
-                log.warning(f"no max_age found for cache at {cache_url}, using default")
-                max_age = cache_data["max_age"] = C.DEFAULT_MAX_AGE
             now = int(time.time())
-            cache_data["last_access"] = now
-            cache_data["eol"] = now + max_age
-            with cache_url.open("wb") as f:
-                pickle.dump(cache_data, f, protocol=2)
+            cache_data.last_access = now
+            cache_data.eol = now + cache_data.max_age
+            with cache_url.open("w") as f:
+                f.write(cache_data.model_dump_json(exclude_none=True))
 
-        cache_data["path"] = self.getPath(cache_data["filename"])
-        return cache_data
-
-    def get_file_path(self, uid: str) -> Path:
+        # FIXME: we convert to dict to be compatible with former method (pre Pydantic).
+        #   All call to get_metadata should use directly the Pydantic model in the future.
+        cache_data_dict = cache_data.model_dump()
+        cache_data_dict["path"] = self.get_path(cache_data.filename)
+        return cache_data_dict
+
+    def get_file_path(self, uid: str) -> Path | None:
         """Retrieve absolute path to file
 
         @param uid(unicode): unique identifier of file
         @return (unicode, None): absolute path to cached file
             None if file is not in cache (or cache is invalid)
         """
         metadata = self.get_metadata(uid)
         if metadata is not None:
             return metadata["path"]
 
-    def remove_from_cache(self, uid, metadata=None):
+    def remove_from_cache(self, uid: str, metadata=None) -> None:
        """Remove data from cache
 
         @param uid(unicode): unique identifier cache file
         """
         cache_data = self.get_metadata(uid, update_eol=False)
         if cache_data is None:
             log.debug(f"cache with uid {uid!r} has already expired or been removed")
             return
 
         try:
-            filename = cache_data['filename']
+            filename = cache_data["filename"]
         except KeyError:
-            log.warning(_("missing filename for cache {uid!r}") .format(uid=uid))
+            log.warning(_("missing filename for cache {uid!r}").format(uid=uid))
         else:
-            filepath = self.getPath(filename)
+            filepath = self.get_path(filename)
             try:
                 filepath.unlink()
             except FileNotFoundError:
                 log.warning(
-                    _("missing file referenced in cache {uid!r}: {filename}")
-                    .format(uid=uid, filename=filename)
+                    _("missing file referenced in cache {uid!r}: {filename}").format(
+                        uid=uid, filename=filename
+                    )
                 )
 
-        cache_file = self.getPath(uid)
+        cache_file = self.get_path(f"{uid}{CACHE_METADATA_EXT}")
         cache_file.unlink()
-        log.debug(f"cache with uid {uid!r} has been removed")
+        log.debug(f"Cache with uid {uid!r} has been removed.")
 
     def cache_data(
         self,
         source: str,
         uid: str,
-        mime_type: Optional[str] = None,
-        max_age: Optional[int] = None,
-        original_filename: Optional[str] = None
+        mime_type: str | None = None,
+        max_age: int = C.DEFAULT_MAX_AGE,
+        original_filename: str | None = None,
     ) -> BufferedIOBase:
-        """create cache metadata and file object to use for actual data
+        """Create cache metadata and file object to use for actual data.
 
         @param source: source of the cache (should be plugin's import_name)
         @param uid: an identifier of the file which must be unique
         @param mime_type: MIME type of the file to cache
             it will be used notably to guess file extension
@@ -233,49 +235,44 @@
         @param max_age: maximum age in seconds
             the cache metadata will have an "eol" (end of life)
             None to use default value
             0 to ignore cache (file will be re-downloaded on each access)
         @param original_filename: if not None, will be used to retrieve file extension and
-            guess
-            mime type, and stored in "original_filename"
+            guess mime type, and stored in "original_filename"
         @return: file object opened in write mode
             you have to close it yourself (hint: use ``with`` statement)
         """
-        if max_age is None:
-            max_age = C.DEFAULT_MAX_AGE
-        cache_data = {
-            "source": source,
-            # we also store max_age for updating eol
-            "max_age": max_age,
-        }
-        cache_url = self.getPath(uid)
-        if original_filename is not None:
-            cache_data["original_filename"] = original_filename
-            if mime_type is None:
-                # we have original_filename but not MIME type, we try to guess the later
-                mime_type = mimetypes.guess_type(original_filename, strict=False)[0]
+        if original_filename is not None and mime_type is None:
+            # we have original_filename but not MIME type, we try to guess the later
+            mime_type = mimetypes.guess_type(original_filename, strict=False)[0]
+
         if mime_type:
             ext = mimetypes.guess_extension(mime_type, strict=False)
             if ext is None:
-                log.warning(
-                    "can't find extension for MIME type {}".format(mime_type)
-                )
+                log.warning("can't find extension for MIME type {}".format(mime_type))
                 ext = DEFAULT_EXT
             elif ext == ".jpe":
                 ext = ".jpg"
         else:
             ext = DEFAULT_EXT
             mime_type = None
+
         filename = uid + ext
         now = int(time.time())
-        cache_data.update({
-            "filename": filename,
-            "creation": now,
-            "eol": now + max_age,
-            "mime_type": mime_type,
-        })
-        file_path = self.getPath(filename)
-
-        with open(cache_url, "wb") as f:
-            pickle.dump(cache_data, f, protocol=2)
+        metadata = CacheMetadata(
+            source=source,
+            uid=uid,
+            mime_type=mime_type,
+            max_age=max_age,
+            original_filename=original_filename,
+            filename=filename,
+            creation=now,
+            eol=now + max_age,
+        )
 
-        return file_path.open("wb")
+        cache_metadata_file = self.get_path(f"{uid}{CACHE_METADATA_EXT}")
+        file_path = self.get_path(filename)
+
+        with open(cache_metadata_file, "w") as f:
+            f.write(metadata.model_dump_json(exclude_none=True))
+
+        return open(file_path, "wb")
```
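For reference, a typical write/read cycle with the new API might look as follows (`host`, `profile`, and the payload bytes are stand-ins; in the real backend they come from the caller's context):

```python
cache = Cache(host, profile)

# cache_data() writes a "<uid>.cache.json" metadata file and returns a
# writable file object for the payload itself.
with cache.cache_data(source="my_plugin", uid="avatar_abc", mime_type="image/jpeg") as f:
    f.write(b"...image bytes...")

# Later: get_file_path() returns the payload path, or None if the entry
# is absent, expired, or invalid.
path = cache.get_file_path("avatar_abc")
if path is not None:
    data = path.read_bytes()
```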