changeset 3288:780fb8dd07ef

core (memory/sqlite): new database schema (v9): this new schema brings the new `public_id` column, which will be used to serve files via HTTP, splits `mime_type` in 2 (`media_type` and `media_subtype`) to make indexing/media filtering easier, and fixes indexes for `files` table.
author Goffi <goffi@goffi.org>
date Fri, 29 May 2020 21:50:49 +0200
parents a4b8c9bcfb57
children 9057713ab124
files sat/memory/memory.py sat/memory/sqlite.py sat/plugins/plugin_comp_file_sharing_management.py
diffstat 3 files changed, 127 insertions(+), 35 deletions(-) [+]
line wrap: on
line diff
--- a/sat/memory/memory.py	Fri May 29 21:07:10 2020 +0200
+++ b/sat/memory/memory.py	Fri May 29 21:50:49 2020 +0200
@@ -22,6 +22,7 @@
 import shortuuid
 import mimetypes
 import time
+from pathlib import Path
 from uuid import uuid4
 from collections import namedtuple
 from twisted.python import failure
@@ -1293,8 +1294,8 @@
     def getFiles(
         self, client, peer_jid, file_id=None, version=None, parent=None, path=None,
         type_=None, file_hash=None, hash_algo=None, name=None, namespace=None,
-        mime_type=None, owner=None, access=None, projection=None, unique=False,
-        perms_to_check=(C.ACCESS_PERM_READ,)):
+        mime_type=None, public_id=None, owner=None, access=None, projection=None,
+        unique=False, perms_to_check=(C.ACCESS_PERM_READ,)):
         """Retrieve files with with given filters
 
         @param peer_jid(jid.JID, None): jid trying to access the file
@@ -1315,6 +1316,7 @@
         @param name(unicode, None): name of the file to retrieve
         @param namespace(unicode, None): namespace of the files to retrieve
         @param mime_type(unicode, None): filter on this mime type
+        @param public_id(unicode, None): filter on this public id
         @param owner(jid.JID, None): if not None, only get files from this owner
         @param access(dict, None): get file with given access (see [setFile])
         @param projection(list[unicode], None): name of columns to retrieve
@@ -1369,6 +1371,7 @@
             name=name,
             namespace=namespace,
             mime_type=mime_type,
+            public_id=public_id,
             owner=owner,
             access=access,
             projection=projection,
@@ -1389,11 +1392,12 @@
 
     @defer.inlineCallbacks
     def setFile(
-            self, client, name, file_id=None, version="", parent=None, path=None,
-            type_=C.FILE_TYPE_FILE, file_hash=None, hash_algo=None, size=None,
-            namespace=None, mime_type=None, created=None, modified=None, owner=None,
-            access=None, extra=None, peer_jid=None, perms_to_check=(C.ACCESS_PERM_WRITE,)
-            ):
+        self, client, name, file_id=None, version="", parent=None, path=None,
+        type_=C.FILE_TYPE_FILE, file_hash=None, hash_algo=None, size=None,
+        namespace=None, mime_type=None, public_id=None, created=None, modified=None,
+        owner=None, access=None, extra=None, peer_jid=None,
+        perms_to_check=(C.ACCESS_PERM_WRITE,)
+    ):
         """Set a file metadata
 
         @param name(unicode): basename of the file
@@ -1411,6 +1415,7 @@
                                          files
             For instance, namespace could be used to group files in a specific photo album
         @param mime_type(unicode): MIME type of the file, or None if not known/guessed
+        @param public_id(unicode): id used to share publicly the file via HTTP
         @param created(int): UNIX time of creation
         @param modified(int,None): UNIX time of last modification, or None to use
                                    created date
@@ -1448,7 +1453,11 @@
         ):
             raise ValueError("file_hash and hash_algo must be set at the same time")
         if mime_type is None:
-            mime_type, file_encoding = mimetypes.guess_type(name)
+            mime_type, __ = mimetypes.guess_type(name)
+        else:
+            mime_type = mime_type.lower()
+        if public_id is not None:
+            assert len(public_id)>0
         if created is None:
             created = time.time()
         if namespace is not None:
@@ -1477,6 +1486,7 @@
                     parent=parent,
                     type_=C.FILE_TYPE_DIRECTORY,
                     namespace=namespace,
+                    public_id=public_id,
                     created=time.time(),
                     owner=owner,
                     access=access,
@@ -1498,6 +1508,7 @@
             size=size,
             namespace=namespace,
             mime_type=mime_type,
+            public_id=public_id,
             created=created,
             modified=modified,
             owner=owner,
@@ -1518,7 +1529,14 @@
         return self.storage.fileUpdate(file_id, column, update_cb)
 
     @defer.inlineCallbacks
-    def _deleteFile(self, client, peer_jid, recursive, files_path, file_data):
+    def _deleteFile(
+        self,
+        client,
+        peer_jid: jid.JID,
+        recursive: bool,
+        files_path: Path,
+        file_data: dict
+    ):
         """Internal method to delete files/directories recursively
 
         @param peer_jid(jid.JID): entity requesting the deletion (must be owner of files
--- a/sat/memory/sqlite.py	Fri May 29 21:07:10 2020 +0200
+++ b/sat/memory/sqlite.py	Fri May 29 21:50:49 2020 +0200
@@ -38,7 +38,7 @@
 
 log = getLogger(__name__)
 
-CURRENT_DB_VERSION = 8
+CURRENT_DB_VERSION = 9
 
 # XXX: DATABASE schemas are used in the following way:
 #      - 'current' key is for the actual database schema, for a new base
@@ -82,14 +82,16 @@
                                                    ("PRIMARY KEY (namespace, key)",))),
                               ('private_ind_bin', (("namespace TEXT", "key TEXT", "profile_id INTEGER", "value BLOB"),
                                                    ("PRIMARY KEY (profile_id, namespace, key)", "FOREIGN KEY(profile_id) REFERENCES profiles(id) ON DELETE CASCADE"))),
-                              ('files',           (("id TEXT NOT NULL", "version TEXT NOT NULL", "parent TEXT NOT NULL",
+                              ('files',           (("id TEXT NOT NULL", "public_id TEXT", "version TEXT NOT NULL",
+                                                    "parent TEXT NOT NULL",
                                                     "type TEXT CHECK(type in ('{file}', '{directory}')) NOT NULL DEFAULT '{file}'".format(
                                                         file=C.FILE_TYPE_FILE, directory=C.FILE_TYPE_DIRECTORY),
                                                     "file_hash TEXT", "hash_algo TEXT", "name TEXT NOT NULL", "size INTEGER",
-                                                    "namespace TEXT", "mime_type TEXT",
+                                                    "namespace TEXT", "media_type TEXT", "media_subtype TEXT",
                                                     "created DATETIME NOT NULL", "modified DATETIME",
                                                     "owner TEXT", "access TEXT", "extra TEXT", "profile_id INTEGER"),
-                                                   ("PRIMARY KEY (id, version)", "FOREIGN KEY(profile_id) REFERENCES profiles(id) ON DELETE CASCADE"))),
+                                                   ("PRIMARY KEY (id, version)", "FOREIGN KEY(profile_id) REFERENCES profiles(id) ON DELETE CASCADE",
+                                                    "UNIQUE (public_id)"))),
                               )),
                     'INSERT': OrderedDict((
                               ('message_types', (("'chat'",),
@@ -105,8 +107,12 @@
                               ('message', ('history_uid',)),
                               ('subject', ('history_uid',)),
                               ('thread', ('history_uid',)),
-                              ('files', ('profile_id', 'mime_type', 'owner', 'parent'))),
+                              ('files', (('profile_id', 'owner', 'media_type', 'media_subtype'),
+                                         ('profile_id', 'owner', 'parent'))),
+                             )
                     },
+        9:         {'specific': 'update_v9'
+                   },
         8:         {'specific': 'update_v8'
                    },
         7:         {'specific': 'update_v7'
@@ -834,7 +840,7 @@
     @defer.inlineCallbacks
     def getFiles(self, client, file_id=None, version='', parent=None, type_=None,
                  file_hash=None, hash_algo=None, name=None, namespace=None, mime_type=None,
-                 owner=None, access=None, projection=None, unique=False):
+                 public_id=None, owner=None, access=None, projection=None, unique=False):
         """retrieve files with with given filters
 
         @param file_id(unicode, None): id of the file
@@ -856,12 +862,19 @@
             query_parts.append('DISTINCT')
         if projection is None:
             projection = ['id', 'version', 'parent', 'type', 'file_hash', 'hash_algo', 'name',
-                          'size', 'namespace', 'mime_type', 'created', 'modified', 'owner',
+                          'size', 'namespace', 'media_type', 'media_subtype', 'public_id', 'created', 'modified', 'owner',
                           'access', 'extra']
         query_parts.append(','.join(projection))
         query_parts.append("FROM files WHERE")
-        filters = ['profile_id=?']
-        args = [self.profiles[client.profile]]
+        if client is not None:
+            filters = ['profile_id=?']
+            args = [self.profiles[client.profile]]
+        else:
+            if public_id is None:
+                raise exceptions.InternalError(
+                    "client can only be omitted when public_id is set")
+            filters = []
+            args = []
 
         if file_id is not None:
             filters.append('id=?')
@@ -888,8 +901,15 @@
             filters.append('namespace=?')
             args.append(namespace)
         if mime_type is not None:
-            filters.append('mime_type=?')
-            args.append(mime_type)
+            if '/' in mime_type:
+                filters.extend('media_type=?', 'media_subtype=?')
+                args.extend(mime_type.split('/', 1))
+            else:
+                filters.append('media_type=?')
+                args.append(mime_type)
+        if public_id is not None:
+            filters.append('public_id=?')
+            args.append(public_id)
         if owner is not None:
             filters.append('owner=?')
             args.append(owner.full())
@@ -917,24 +937,25 @@
 
     def setFile(self, client, name, file_id, version='', parent=None, type_=C.FILE_TYPE_FILE,
                 file_hash=None, hash_algo=None, size=None, namespace=None, mime_type=None,
-                created=None, modified=None, owner=None, access=None, extra=None):
+                public_id=None, created=None, modified=None, owner=None, access=None, extra=None):
         """set a file metadata
 
         @param client(SatXMPPClient): client owning the file
-        @param name(unicode): name of the file (must not contain "/")
-        @param file_id(unicode): unique id of the file
-        @param version(unicode): version of this file
-        @param parent(unicode): id of the directory containing this file
+        @param name(str): name of the file (must not contain "/")
+        @param file_id(str): unique id of the file
+        @param version(str): version of this file
+        @param parent(str): id of the directory containing this file
             None if it is a root file/directory
-        @param type_(unicode): one of:
+        @param type_(str): one of:
             - file
             - directory
-        @param file_hash(unicode): unique hash of the payload
-        @param hash_algo(unicode): algorithm used for hashing the file (usually sha-256)
+        @param file_hash(str): unique hash of the payload
+        @param hash_algo(str): algorithm used for hashing the file (usually sha-256)
         @param size(int): size in bytes
-        @param namespace(unicode, None): identifier (human readable is better) to group files
+        @param namespace(str, None): identifier (human readable is better) to group files
             for instance, namespace could be used to group files in a specific photo album
-        @param mime_type(unicode): MIME type of the file, or None if not known/guessed
+        @param mime_type(str): media type of the file, or None if not known/guessed
+        @param public_id(str): ID used to server the file publicly via HTTP
         @param created(int): UNIX time of creation
         @param modified(int,None): UNIX time of last modification, or None to use created date
         @param owner(jid.JID, None): jid of the owner of the file (mainly useful for component)
@@ -943,13 +964,21 @@
             will be encoded to json in database
         """
         if extra is not None:
-         assert isinstance(extra, dict)
+            assert isinstance(extra, dict)
+
+        if mime_type is None:
+            media_type = media_subtype = None
+        elif '/' in mime_type:
+            media_type, media_subtype = mime_type.split('/', 1)
+        else:
+            media_type, media_subtype = mime_type, None
+
         query = ('INSERT INTO files(id, version, parent, type, file_hash, hash_algo, name, size, namespace, '
-                 'mime_type, created, modified, owner, access, extra, profile_id) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)')
+                 'media_type, media_subtype, public_id, created, modified, owner, access, extra, profile_id) VALUES (?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?,?)')
         d = self.dbpool.runQuery(query, (file_id, version.strip(), parent, type_,
                                          file_hash, hash_algo,
                                          name, size, namespace,
-                                         mime_type, created, modified,
+                                         media_type, media_subtype, public_id, created, modified,
                                          owner.full() if owner is not None else None,
                                          json.dumps(access) if access else None,
                                          json.dumps(extra) if extra else None,
@@ -1357,6 +1386,51 @@
             ret.extend(cmds or [])
         defer.returnValue(ret)
 
+    def update_v9(self):
+        """Update database from v8 to v9
+
+        (public_id on file with UNIQUE constraint, files indexes fix, media_type split)
+        """
+        # we have to do a specific update because we can't set UNIQUE constraint when adding a column
+        # (see https://sqlite.org/lang_altertable.html#alter_table_add_column)
+        log.info("Database update to v9")
+
+        create = {
+            'files': (("id TEXT NOT NULL", "public_id TEXT", "version TEXT NOT NULL",
+                       "parent TEXT NOT NULL",
+                       "type TEXT CHECK(type in ('{file}', '{directory}')) NOT NULL DEFAULT '{file}'".format(
+                           file=C.FILE_TYPE_FILE, directory=C.FILE_TYPE_DIRECTORY),
+                       "file_hash TEXT", "hash_algo TEXT", "name TEXT NOT NULL", "size INTEGER",
+                       "namespace TEXT", "media_type TEXT", "media_subtype TEXT",
+                       "created DATETIME NOT NULL", "modified DATETIME",
+                       "owner TEXT", "access TEXT", "extra TEXT", "profile_id INTEGER"),
+                      ("PRIMARY KEY (id, version)", "FOREIGN KEY(profile_id) REFERENCES profiles(id) ON DELETE CASCADE",
+                       "UNIQUE (public_id)")),
+
+        }
+        index = tuple({'files': (('profile_id', 'owner', 'media_type', 'media_subtype'),
+                                 ('profile_id', 'owner', 'parent'))}.items())
+        # XXX: Sqlite doc recommends to do the other way around (i.e. create new table,
+        #   copy, drop old table then rename), but the RENAME would then add
+        #   "IF NOT EXISTS" which breaks the (admittely fragile) schema comparison.
+        # TODO: rework sqlite update management, don't try to automatically detect
+        #   update, the database version is now enough.
+        statements = ["ALTER TABLE files RENAME TO files_old"]
+        statements.extend(Updater.createData2Raw(create))
+        cols = ','.join([col_stmt.split()[0] for col_stmt in create['files'][0] if "public_id" not in col_stmt])
+        old_cols = cols[:]
+        # we need to split mime_type to the new media_type and media_subtype
+        old_cols = old_cols.replace(
+            'media_type,media_subtype',
+            "substr(mime_type, 0, instr(mime_type,'/')),substr(mime_type, instr(mime_type,'/')+1)"
+        )
+        statements.extend([
+            f"INSERT INTO files({cols}) SELECT {old_cols} FROM files_old",
+            f"DROP TABLE files_old",
+        ])
+        statements.extend(Updater.indexData2Raw(index))
+        return statements
+
     def update_v8(self):
         """Update database from v7 to v8 (primary keys order changes + indexes)"""
         log.info("Database update to v8")
--- a/sat/plugins/plugin_comp_file_sharing_management.py	Fri May 29 21:07:10 2020 +0200
+++ b/sat/plugins/plugin_comp_file_sharing_management.py	Fri May 29 21:50:49 2020 +0200
@@ -381,9 +381,9 @@
                 yield self._genThumbs(client, requestor, namespace, sub_file_data)
 
         elif file_data['type'] == C.FILE_TYPE_FILE:
-            mime_type = file_data['mime_type']
+            media_type = file_data['media_type']
             file_path = os.path.join(self.files_path, file_data['file_hash'])
-            if mime_type is not None and mime_type.startswith("image"):
+            if media_type == 'image':
                 thumbnails = []
 
                 for max_thumb_size in (self._t.SIZE_SMALL, self._t.SIZE_MEDIUM):