diff libervia/backend/memory/migration/versions/fe3a02cb4bec_convert_legacypickle_columns_to_json.py @ 4212:5f2d496c633f

core: get rid of `pickle`: Using `pickle` to serialise data was a technical legacy that caused trouble for database storage, for updates (if a class was serialised, a change to that class could break the update), and for security (unpickling untrusted data can lead to arbitrary code execution). This patch removes all use of pickle in favour of JSON, notably:
- for caching data, a Pydantic model is now used instead;
- for SQLAlchemy models, the LegacyPickle type is replaced by JSON serialisation;
- in XEP-0373, a `PublicKeyMetadata` class was being serialised. New `to_dict` and `from_dict` methods have been implemented to handle (de)serialisation;
- new methods to (de)serialise data can now be specified with Identity data types. This is notably used to (de)serialise the `path` of avatars.
A migration script has been created to convert the data (for upgrade or downgrade), with special care for the XEP-0373 case. Depending on the size of the database, this migration script can take a long time to run. rel 443
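The `to_dict`/`from_dict` pair mentioned above turns `PublicKeyMetadata` instances into plain JSON-compatible dicts and back. The real class lives in `libervia.backend.plugins.plugin_xep_0373`; the sketch below only illustrates the expected shape, with hypothetical field names:

    from dataclasses import asdict, dataclass

    @dataclass(frozen=True)
    class PublicKeyMetadata:
        # Hypothetical fields, for illustration only; the real class in
        # plugin_xep_0373 defines its own attributes.
        fingerprint: str
        timestamp: float

        def to_dict(self) -> dict:
            # Flatten the instance into JSON-compatible primitives.
            return asdict(self)

        @classmethod
        def from_dict(cls, data: dict) -> "PublicKeyMetadata":
            # Rebuild an instance from a JSON-decoded mapping.
            return cls(**data)

`frozen=True` matters here: instances must stay hashable, because the plugin stores them in a set, which is exactly the shape the migration script below checks for.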
author Goffi <goffi@goffi.org>
date Fri, 23 Feb 2024 13:31:04 +0100
parents
children 1a7a3e4b52a4
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libervia/backend/memory/migration/versions/fe3a02cb4bec_convert_legacypickle_columns_to_json.py	Fri Feb 23 13:31:04 2024 +0100
@@ -0,0 +1,142 @@
+"""convert LegacyPickle columns to JSON
+
+Revision ID: fe3a02cb4bec
+Revises: 610345f77e75
+Create Date: 2024-02-22 14:55:59.993983
+
+"""
+from alembic import op
+import sqlalchemy as sa
+import pickle
+import json
+from libervia.backend.plugins.plugin_xep_0373 import PublicKeyMetadata
+
+# revision identifiers, used by Alembic.
+revision = "fe3a02cb4bec"
+down_revision = "610345f77e75"
+branch_labels = None
+depends_on = None
+
+
+def convert_pickle_to_json(value, table, primary_keys):
+    """Convert pickled data to JSON, handling potential errors."""
+    if value is None:
+        return None
+    try:
+        # some values were stored as strings by LegacyPickle; pickle.loads() needs bytes
+        if isinstance(value, str):
+            value = value.encode()
+        try:
+            deserialized = pickle.loads(value, encoding="utf-8")
+        except ModuleNotFoundError:  # data pickled under the old "sat.plugins" path
+            deserialized = pickle.loads(
+                value.replace(b"sat.plugins", b"libervia.backend.plugins"),
+                encoding="utf-8",
+            )
+        if (
+            table == "private_ind_bin"
+            and primary_keys[0] == "XEP-0373"
+            and not primary_keys[1].startswith("/trust")
+            and isinstance(deserialized, set)
+            and deserialized
+            and isinstance(next(iter(deserialized)), PublicKeyMetadata)
+        ):
+            # The XEP-0373 plugin was pickling an internal class; this can't be
+            # converted directly to JSON, so we apply a special treatment using the
+            # newly added `to_dict` and `from_dict` methods.
+            deserialized = [pkm.to_dict() for pkm in deserialized]
+
+        ret = json.dumps(deserialized, ensure_ascii=False, default=str)
+        if table == "history" and ret == "{}":
+            # For history, we can remove empty data, but for other tables it may be
+            # significant.
+            ret = None
+        return ret
+    except Exception as e:
+        print(
+            f"Warning: Failed to convert pickle to JSON, using NULL instead. Error: {e}"
+        )
+        return None
+
+
+def upgrade():
+    print(
+        "This migration may take a very long time; please be patient and don't stop the process."
+    )
+    connection = op.get_bind()
+
+    tables_and_columns = [
+        ("history", "extra", "uid"),
+        ("private_gen_bin", "value", "namespace", "key"),
+        ("private_ind_bin", "value", "namespace", "key", "profile_id"),
+    ]
+
+    for table, column, *primary_keys in tables_and_columns:
+        primary_key_clause = " AND ".join(f"{pk} = :{pk}" for pk in primary_keys)
+        select_stmt = sa.text(f"SELECT {', '.join(primary_keys)}, {column} FROM {table}")
+        update_stmt = sa.text(
+            f"UPDATE {table} SET {column} = :{column} WHERE {primary_key_clause}"
+        )
+
+        result = connection.execute(select_stmt)
+        for row in result:
+            value = row[-1]
+            if value is None:
+                continue
+            data = {pk: row[idx] for idx, pk in enumerate(primary_keys)}
+            data[column] = convert_pickle_to_json(value, table, row[:-1])
+            connection.execute(update_stmt.bindparams(**data))
+
+
+def convert_json_to_pickle(value, table, primary_keys):
+    """Convert JSON data back to pickled data, handling potential errors."""
+    if value is None:
+        return None
+    try:
+        deserialized = json.loads(value)
+        # Check for the specific table and primary key conditions that require special
+        # handling
+        if (
+            table == "private_ind_bin"
+            and primary_keys[0] == "XEP-0373"
+            and not primary_keys[1].startswith("/trust")
+        ):
+            # Convert list of dicts back to set of PublicKeyMetadata objects
+            if isinstance(deserialized, list):
+                deserialized = {PublicKeyMetadata.from_dict(d) for d in deserialized}
+        return pickle.dumps(deserialized, 0)  # protocol 0 (ASCII), matching LegacyPickle
+    except Exception as e:
+        print(
+            f"Warning: Failed to convert JSON to pickle, using NULL instead. Error: {e}"
+        )
+        return None
+
+
+def downgrade():
+    print(
+        "Reverting JSON columns to LegacyPickle format. This may take a while, please be "
+        "patient."
+    )
+    connection = op.get_bind()
+
+    tables_and_columns = [
+        ("history", "extra", "uid"),
+        ("private_gen_bin", "value", "namespace", "key"),
+        ("private_ind_bin", "value", "namespace", "key", "profile_id"),
+    ]
+
+    for table, column, *primary_keys in tables_and_columns:
+        primary_key_clause = " AND ".join(f"{pk} = :{pk}" for pk in primary_keys)
+        select_stmt = sa.text(f"SELECT {', '.join(primary_keys)}, {column} FROM {table}")
+        update_stmt = sa.text(
+            f"UPDATE {table} SET {column} = :{column} WHERE {primary_key_clause}"
+        )
+
+        result = connection.execute(select_stmt)
+        for row in result:
+            value = row[-1]
+            if value is None:
+                continue
+            data = {pk: row[idx] for idx, pk in enumerate(primary_keys)}
+            data[column] = convert_json_to_pickle(value, table, row[:-1])
+            connection.execute(update_stmt.bindparams(**data))
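A quick round trip through the two converters above is an easy way to sanity-check the migration logic. The snippet below is a minimal sketch, assuming it runs in the same module as `convert_pickle_to_json` and `convert_json_to_pickle`; the `uid` and payload values are arbitrary:

    import json
    import pickle

    # Arbitrary non-empty "extra" payload, as stored in the history table.
    original = {"thread": "some-thread-id"}

    pickled = pickle.dumps(original, 0)
    as_json = convert_pickle_to_json(pickled, "history", ("some-uid",))
    assert json.loads(as_json) == original

    # And back again, as the downgrade path would do.
    back = convert_json_to_pickle(as_json, "history", ("some-uid",))
    assert pickle.loads(back) == original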