Mercurial > libervia-backend

#!/usr/bin/env python3

# Libervia: an XMPP client
# Copyright (C) 2009-2021 Jérôme Poisson (goffi@goffi.org)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

import sys
import time
import asyncio
import copy
from datetime import datetime
from asyncio.subprocess import PIPE
from pathlib import Path
from typing import Union, Dict, List, Tuple, Iterable, Any, Callable, Optional
from sqlalchemy.ext.asyncio import AsyncSession, AsyncEngine, create_async_engine
from sqlalchemy.exc import IntegrityError, NoResultFound
from sqlalchemy.orm import (
    sessionmaker, subqueryload, joinedload, selectinload, contains_eager
)
from sqlalchemy.orm.decl_api import DeclarativeMeta
from sqlalchemy.future import select
from sqlalchemy.engine import Engine, Connection
from sqlalchemy import update, delete, and_, or_, event, func
from sqlalchemy.sql.functions import coalesce, sum as sum_, now, count
from sqlalchemy.dialects.sqlite import insert
from sqlalchemy import text, literal_column, Integer
from alembic import script as al_script, config as al_config
from alembic.runtime import migration as al_migration
from twisted.internet import defer
from twisted.words.protocols.jabber import jid
from twisted.words.xish import domish
from sat.core.i18n import _
from sat.core import exceptions
from sat.core.log import getLogger
from sat.core.constants import Const as C
from sat.core.core_types import SatXMPPEntity
from sat.tools.utils import aio
from sat.tools.common import uri
from sat.memory import migration
from sat.memory import sqla_config
from sat.memory.sqla_mapping import (
    NOT_IN_EXTRA,
    SyncState,
    Base,
    Profile,
    Component,
    History,
    Message,
    Subject,
    Thread,
    ParamGen,
    ParamInd,
    PrivateGen,
    PrivateInd,
    PrivateGenBin,
    PrivateIndBin,
    File,
    PubsubNode,
    PubsubItem,
)


log = getLogger(__name__)
migration_path = Path(migration.__file__).parent
#: mapping of Libervia search query operators to SQLAlchemy method name
OP_MAP = {
    "==": "__eq__",
    "eq": "__eq__",
    "!=": "__ne__",
    "ne": "__ne__",
    ">": "__gt__",
    "gt": "__gt__",
    "<": "__le__",
    "le": "__le__",
    "between": "between",
    "in": "in_",
    "not_in": "not_in",
    "overlap": "in_",
    "ioverlap": "in_",
    "disjoint": "in_",
    "idisjoint": "in_",
    "like": "like",
    "ilike": "ilike",
    "not_like": "notlike",
    "not_ilike": "notilike",
}


@event.listens_for(Engine, "connect")
def set_sqlite_pragma(dbapi_connection, connection_record):
    cursor = dbapi_connection.cursor()
    cursor.execute("PRAGMA foreign_keys=ON")
    cursor.close()


class Storage:

    def __init__(self):
        self.initialized = defer.Deferred()
        # we keep cache for the profiles (key: profile name, value: profile id)
        # profile id to name
        self.profiles: Dict[str, int] = {}
        # profile id to component entry point
        self.components: Dict[int, str] = {}

    def getProfileById(self, profile_id):
        return self.profiles.get(profile_id)

    async def migrateApply(self, *args: str, log_output: bool = False) -> None:
        """Do a migration command

        Commands are applied by running Alembic in a subprocess.
        Arguments are alembic executables commands

        @param log_output: manage stdout and stderr:
            - if False, stdout and stderr are buffered, and logged only in case of error
            - if True, stdout and stderr will be logged during the command execution
        @raise exceptions.DatabaseError: something went wrong while running the
            process
        """
        stdout, stderr = 2 * (None,) if log_output else 2 * (PIPE,)
        proc = await asyncio.create_subprocess_exec(
            sys.executable, "-m", "alembic", *args,
            stdout=stdout, stderr=stderr, cwd=migration_path
        )
        log_out, log_err = await proc.communicate()
        if proc.returncode != 0:
            msg = _(
                "Can't {operation} database (exit code {exit_code})"
            ).format(
                operation=args[0],
                exit_code=proc.returncode
            )
            if log_out or log_err:
                msg += f":\nstdout: {log_out.decode()}\nstderr: {log_err.decode()}"
            log.error(msg)

            raise exceptions.DatabaseError(msg)

    async def createDB(self, engine: AsyncEngine, db_config: dict) -> None:
        """Create a new database

        The database is generated from SQLAlchemy model, then stamped by Alembic
        """
        # the dir may not exist if it's not the XDG recommended one
        db_config["path"].parent.mkdir(0o700, True, True)
        async with engine.begin() as conn:
            await conn.run_sync(Base.metadata.create_all)

        log.debug("stamping the database")
        await self.migrateApply("stamp", "head")
        log.debug("stamping done")

    def _checkDBIsUpToDate(self, conn: Connection) -> bool:
        al_ini_path = migration_path / "alembic.ini"
        al_cfg = al_config.Config(al_ini_path)
        directory = al_script.ScriptDirectory.from_config(al_cfg)
        context = al_migration.MigrationContext.configure(conn)
        return set(context.get_current_heads()) == set(directory.get_heads())

    async def checkAndUpdateDB(self, engine: AsyncEngine, db_config: dict) -> None:
        """Check that database is up-to-date, and update if necessary"""
        async with engine.connect() as conn:
            up_to_date = await conn.run_sync(self._checkDBIsUpToDate)
        if up_to_date:
            log.debug("Database is up-to-date")
        else:
            log.info("Database needs to be updated")
            log.info("updating…")
            await self.migrateApply("upgrade", "head", log_output=True)
            log.info("Database is now up-to-date")

    @aio
    async def initialise(self) -> None:
        log.info(_("Connecting database"))

        db_config = sqla_config.getDbConfig()
        engine = create_async_engine(
            db_config["url"],
            future=True,
        )

        new_base = not db_config["path"].exists()
        if new_base:
            log.info(_("The database is new, creating the tables"))
            await self.createDB(engine, db_config)
        else:
            await self.checkAndUpdateDB(engine, db_config)

        self.session = sessionmaker(
            engine, expire_on_commit=False, class_=AsyncSession
        )

        async with self.session() as session:
            result = await session.execute(select(Profile))
            for p in result.scalars():
                self.profiles[p.name] = p.id
            result = await session.execute(select(Component))
            for c in result.scalars():
                self.components[c.profile_id] = c.entry_point

        self.initialized.callback(None)

    ## Generic

    @aio
    async def add(self, db_obj: DeclarativeMeta) -> None:
        """Add an object to database"""
        async with self.session() as session:
            async with session.begin():
                session.add(db_obj)

    @aio
    async def delete(self, db_obj: DeclarativeMeta) -> None:
        """Delete an object from database"""
        async with self.session() as session:
            async with session.begin():
                await session.delete(db_obj)
                await session.commit()

    ## Profiles

    def getProfilesList(self) -> List[str]:
        """"Return list of all registered profiles"""
        return list(self.profiles.keys())

    def hasProfile(self, profile_name: str) -> bool:
        """return True if profile_name exists

        @param profile_name: name of the profile to check
        """
        return profile_name in self.profiles

    def profileIsComponent(self, profile_name: str) -> bool:
        try:
            return self.profiles[profile_name] in self.components
        except KeyError:
            raise exceptions.NotFound("the requested profile doesn't exists")

    def getEntryPoint(self, profile_name: str) -> str:
        try:
            return self.components[self.profiles[profile_name]]
        except KeyError:
            raise exceptions.NotFound("the requested profile doesn't exists or is not a component")

    @aio
    async def createProfile(self, name: str, component_ep: Optional[str] = None) -> None:
        """Create a new profile

        @param name: name of the profile
        @param component: if not None, must point to a component entry point
        """
        async with self.session() as session:
            profile = Profile(name=name)
            async with session.begin():
                session.add(profile)
            self.profiles[profile.name] = profile.id
            if component_ep is not None:
                async with session.begin():
                    component = Component(profile=profile, entry_point=component_ep)
                    session.add(component)
                self.components[profile.id] = component_ep
        return profile

    @aio
    async def deleteProfile(self, name: str) -> None:
        """Delete profile

        @param name: name of the profile
        """
        async with self.session() as session:
            result = await session.execute(select(Profile).where(Profile.name == name))
            profile = result.scalar()
            await session.delete(profile)
            await session.commit()
        del self.profiles[profile.name]
        if profile.id in self.components:
            del self.components[profile.id]
        log.info(_("Profile {name!r} deleted").format(name = name))

    ## Params

    @aio
    async def loadGenParams(self, params_gen: dict) -> None:
        """Load general parameters

        @param params_gen: dictionary to fill
        """
        log.debug(_("loading general parameters from database"))
        async with self.session() as session:
            result = await session.execute(select(ParamGen))
        for p in result.scalars():
            params_gen[(p.category, p.name)] = p.value

    @aio
    async def loadIndParams(self, params_ind: dict, profile: str) -> None:
        """Load individual parameters

        @param params_ind: dictionary to fill
        @param profile: a profile which *must* exist
        """
        log.debug(_("loading individual parameters from database"))
        async with self.session() as session:
            result = await session.execute(
                select(ParamInd).where(ParamInd.profile_id == self.profiles[profile])
            )
        for p in result.scalars():
            params_ind[(p.category, p.name)] = p.value

    @aio
    async def getIndParam(self, category: str, name: str, profile: str) -> Optional[str]:
        """Ask database for the value of one specific individual parameter

        @param category: category of the parameter
        @param name: name of the parameter
        @param profile: %(doc_profile)s
        """
        async with self.session() as session:
            result = await session.execute(
                select(ParamInd.value)
                .filter_by(
                    category=category,
                    name=name,
                    profile_id=self.profiles[profile]
                )
            )
        return result.scalar_one_or_none()

    @aio
    async def getIndParamValues(self, category: str, name: str) -> Dict[str, str]:
        """Ask database for the individual values of a parameter for all profiles

        @param category: category of the parameter
        @param name: name of the parameter
        @return dict: profile => value map
        """
        async with self.session() as session:
            result = await session.execute(
                select(ParamInd)
                .filter_by(
                    category=category,
                    name=name
                )
                .options(subqueryload(ParamInd.profile))
            )
        return {param.profile.name: param.value for param in result.scalars()}

    @aio
    async def setGenParam(self, category: str, name: str, value: Optional[str]) -> None:
        """Save the general parameters in database

        @param category: category of the parameter
        @param name: name of the parameter
        @param value: value to set
        """
        async with self.session() as session:
            stmt = insert(ParamGen).values(
                category=category,
                name=name,
                value=value
            ).on_conflict_do_update(
                index_elements=(ParamGen.category, ParamGen.name),
                set_={
                    ParamGen.value: value
                }
            )
            await session.execute(stmt)
            await session.commit()

    @aio
    async def setIndParam(
        self,
        category:str,
        name: str,
        value: Optional[str],
        profile: str
    ) -> None:
        """Save the individual parameters in database

        @param category: category of the parameter
        @param name: name of the parameter
        @param value: value to set
        @param profile: a profile which *must* exist
        """
        async with self.session() as session:
            stmt = insert(ParamInd).values(
                category=category,
                name=name,
                profile_id=self.profiles[profile],
                value=value
            ).on_conflict_do_update(
                index_elements=(ParamInd.category, ParamInd.name, ParamInd.profile_id),
                set_={
                    ParamInd.value: value
                }
            )
            await session.execute(stmt)
            await session.commit()

    def _jid_filter(self, jid_: jid.JID, dest: bool = False):
        """Generate condition to filter on a JID, using relevant columns

        @param dest: True if it's the destinee JID, otherwise it's the source one
        @param jid_: JID to filter by
        """
        if jid_.resource:
            if dest:
                return and_(
                    History.dest == jid_.userhost(),
                    History.dest_res == jid_.resource
                )
            else:
                return and_(
                    History.source == jid_.userhost(),
                    History.source_res == jid_.resource
                )
        else:
            if dest:
                return History.dest == jid_.userhost()
            else:
                return History.source == jid_.userhost()

    @aio
    async def historyGet(
        self,
        from_jid: Optional[jid.JID],
        to_jid: Optional[jid.JID],
        limit: Optional[int] = None,
        between: bool = True,
        filters: Optional[Dict[str, str]] = None,
        profile: Optional[str] = None,
    ) -> List[Tuple[
        str, int, str, str, Dict[str, str], Dict[str, str], str, str, str]
    ]:
        """Retrieve messages in history

        @param from_jid: source JID (full, or bare for catchall)
        @param to_jid: dest JID (full, or bare for catchall)
        @param limit: maximum number of messages to get:
            - 0 for no message (returns the empty list)
            - None for unlimited
        @param between: confound source and dest (ignore the direction)
        @param filters: pattern to filter the history results
        @return: list of messages as in [messageNew], minus the profile which is already
            known.
        """
        # we have to set a default value to profile because it's last argument
        # and thus follow other keyword arguments with default values
        # but None should not be used for it
        assert profile is not None
        if limit == 0:
            return []
        if filters is None:
            filters = {}

        stmt = (
            select(History)
            .filter_by(
                profile_id=self.profiles[profile]
            )
            .outerjoin(History.messages)
            .outerjoin(History.subjects)
            .outerjoin(History.thread)
            .options(
                contains_eager(History.messages),
                contains_eager(History.subjects),
                contains_eager(History.thread),
            )
            .order_by(
                # timestamp may be identical for 2 close messages (specially when delay is
                # used) that's why we order ties by received_timestamp. We'll reverse the
                # order when returning the result. We use DESC here so LIMIT keep the last
                # messages
                History.timestamp.desc(),
                History.received_timestamp.desc()
            )
        )


        if not from_jid and not to_jid:
            # no jid specified, we want all one2one communications
            pass
        elif between:
            if not from_jid or not to_jid:
                # we only have one jid specified, we check all messages
                # from or to this jid
                jid_ = from_jid or to_jid
                stmt = stmt.where(
                    or_(
                        self._jid_filter(jid_),
                        self._jid_filter(jid_, dest=True)
                    )
                )
            else:
                # we have 2 jids specified, we check all communications between
                # those 2 jids
                stmt = stmt.where(
                    or_(
                        and_(
                            self._jid_filter(from_jid),
                            self._jid_filter(to_jid, dest=True),
                        ),
                        and_(
                            self._jid_filter(to_jid),
                            self._jid_filter(from_jid, dest=True),
                        )
                    )
                )
        else:
            # we want one communication in specific direction (from somebody or
            # to somebody).
            if from_jid is not None:
                stmt = stmt.where(self._jid_filter(from_jid))
            if to_jid is not None:
                stmt = stmt.where(self._jid_filter(to_jid, dest=True))

        if filters:
            if 'timestamp_start' in filters:
                stmt = stmt.where(History.timestamp >= float(filters['timestamp_start']))
            if 'before_uid' in filters:
                # orignially this query was using SQLITE's rowid. This has been changed
                # to use coalesce(received_timestamp, timestamp) to be SQL engine independant
                stmt = stmt.where(
                    coalesce(
                        History.received_timestamp,
                        History.timestamp
                    ) < (
                        select(coalesce(History.received_timestamp, History.timestamp))
                        .filter_by(uid=filters["before_uid"])
                    ).scalar_subquery()
                )
            if 'body' in filters:
                # TODO: use REGEXP (function to be defined) instead of GLOB: https://www.sqlite.org/lang_expr.html
                stmt = stmt.where(Message.message.like(f"%{filters['body']}%"))
            if 'search' in filters:
                search_term = f"%{filters['search']}%"
                stmt = stmt.where(or_(
                    Message.message.like(search_term),
                    History.source_res.like(search_term)
                ))
            if 'types' in filters:
                types = filters['types'].split()
                stmt = stmt.where(History.type.in_(types))
            if 'not_types' in filters:
                types = filters['not_types'].split()
                stmt = stmt.where(History.type.not_in(types))
            if 'last_stanza_id' in filters:
                # this request get the last message with a "stanza_id" that we
                # have in history. This is mainly used to retrieve messages sent
                # while we were offline, using MAM (XEP-0313).
                if (filters['last_stanza_id'] is not True
                    or limit != 1):
                    raise ValueError("Unexpected values for last_stanza_id filter")
                stmt = stmt.where(History.stanza_id.is_not(None))

        if limit is not None:
            stmt = stmt.limit(limit)

        async with self.session() as session:
            result = await session.execute(stmt)

        result = result.scalars().unique().all()
        result.reverse()
        return [h.as_tuple() for h in result]

    @aio
    async def addToHistory(self, data: dict, profile: str) -> None:
        """Store a new message in history

        @param data: message data as build by SatMessageProtocol.onMessage
        """
        extra = {k: v for k, v in data["extra"].items() if k not in NOT_IN_EXTRA}
        messages = [Message(message=mess, language=lang)
                    for lang, mess in data["message"].items()]
        subjects = [Subject(subject=mess, language=lang)
                    for lang, mess in data["subject"].items()]
        if "thread" in data["extra"]:
            thread = Thread(thread_id=data["extra"]["thread"],
                            parent_id=data["extra"].get["thread_parent"])
        else:
            thread = None
        try:
            async with self.session() as session:
                async with session.begin():
                    session.add(History(
                        uid=data["uid"],
                        stanza_id=data["extra"].get("stanza_id"),
                        update_uid=data["extra"].get("update_uid"),
                        profile_id=self.profiles[profile],
                        source_jid=data["from"],
                        dest_jid=data["to"],
                        timestamp=data["timestamp"],
                        received_timestamp=data.get("received_timestamp"),
                        type=data["type"],
                        extra=extra,
                        messages=messages,
                        subjects=subjects,
                        thread=thread,
                    ))
        except IntegrityError as e:
            if "unique" in str(e.orig).lower():
                log.debug(
                    f"message {data['uid']!r} is already in history, not storing it again"
                )
            else:
                log.error(f"Can't store message {data['uid']!r} in history: {e}")
        except Exception as e:
            log.critical(
                f"Can't store message, unexpected exception (uid: {data['uid']}): {e}"
            )

    ## Private values

    def _getPrivateClass(self, binary, profile):
        """Get ORM class to use for private values"""
        if profile is None:
            return PrivateGenBin if binary else PrivateGen
        else:
            return PrivateIndBin if binary else PrivateInd


    @aio
    async def getPrivates(
        self,
        namespace:str,
        keys: Optional[Iterable[str]] = None,
        binary: bool = False,
        profile: Optional[str] = None
    ) -> Dict[str, Any]:
        """Get private value(s) from databases

        @param namespace: namespace of the values
        @param keys: keys of the values to get None to get all keys/values
        @param binary: True to deserialise binary values
        @param profile: profile to use for individual values
            None to use general values
        @return: gotten keys/values
        """
        if keys is not None:
            keys = list(keys)
        log.debug(
            f"getting {'general' if profile is None else 'individual'}"
            f"{' binary' if binary else ''} private values from database for namespace "
            f"{namespace}{f' with keys {keys!r}' if keys is not None else ''}"
        )
        cls = self._getPrivateClass(binary, profile)
        stmt = select(cls).filter_by(namespace=namespace)
        if keys:
            stmt = stmt.where(cls.key.in_(list(keys)))
        if profile is not None:
            stmt = stmt.filter_by(profile_id=self.profiles[profile])
        async with self.session() as session:
            result = await session.execute(stmt)
        return {p.key: p.value for p in result.scalars()}

    @aio
    async def setPrivateValue(
        self,
        namespace: str,
        key:str,
        value: Any,
        binary: bool = False,
        profile: Optional[str] = None
    ) -> None:
        """Set a private value in database

        @param namespace: namespace of the values
        @param key: key of the value to set
        @param value: value to set
        @param binary: True if it's a binary values
            binary values need to be serialised, used for everything but strings
        @param profile: profile to use for individual value
            if None, it's a general value
        """
        cls = self._getPrivateClass(binary, profile)

        values = {
            "namespace": namespace,
            "key": key,
            "value": value
        }
        index_elements = [cls.namespace, cls.key]

        if profile is not None:
            values["profile_id"] = self.profiles[profile]
            index_elements.append(cls.profile_id)

        async with self.session() as session:
            await session.execute(
                insert(cls).values(**values).on_conflict_do_update(
                    index_elements=index_elements,
                    set_={
                        cls.value: value
                    }
                )
            )
            await session.commit()

    @aio
    async def delPrivateValue(
        self,
        namespace: str,
        key: str,
        binary: bool = False,
        profile: Optional[str] = None
    ) -> None:
        """Delete private value from database

        @param category: category of the privateeter
        @param key: key of the private value
        @param binary: True if it's a binary values
        @param profile: profile to use for individual value
            if None, it's a general value
        """
        cls = self._getPrivateClass(binary, profile)

        stmt = delete(cls).filter_by(namespace=namespace, key=key)

        if profile is not None:
            stmt = stmt.filter_by(profile_id=self.profiles[profile])

        async with self.session() as session:
            await session.execute(stmt)
            await session.commit()

    @aio
    async def delPrivateNamespace(
        self,
        namespace: str,
        binary: bool = False,
        profile: Optional[str] = None
    ) -> None:
        """Delete all data from a private namespace

        Be really cautious when you use this method, as all data with given namespace are
        removed.
        Params are the same as for delPrivateValue
        """
        cls = self._getPrivateClass(binary, profile)

        stmt = delete(cls).filter_by(namespace=namespace)

        if profile is not None:
            stmt = stmt.filter_by(profile_id=self.profiles[profile])

        async with self.session() as session:
            await session.execute(stmt)
            await session.commit()

    ## Files

    @aio
    async def getFiles(
        self,
        client: Optional[SatXMPPEntity],
        file_id: Optional[str] = None,
        version: Optional[str] = '',
        parent: Optional[str] = None,
        type_: Optional[str] = None,
        file_hash: Optional[str] = None,
        hash_algo: Optional[str] = None,
        name: Optional[str] = None,
        namespace: Optional[str] = None,
        mime_type: Optional[str] = None,
        public_id: Optional[str] = None,
        owner: Optional[jid.JID] = None,
        access: Optional[dict] = None,
        projection: Optional[List[str]] = None,
        unique: bool = False
    ) -> List[dict]:
        """Retrieve files with with given filters

        @param file_id: id of the file
            None to ignore
        @param version: version of the file
            None to ignore
            empty string to look for current version
        @param parent: id of the directory containing the files
            None to ignore
            empty string to look for root files/directories
        @param projection: name of columns to retrieve
            None to retrieve all
        @param unique: if True will remove duplicates
        other params are the same as for [setFile]
        @return: files corresponding to filters
        """
        if projection is None:
            projection = [
                'id', 'version', 'parent', 'type', 'file_hash', 'hash_algo', 'name',
                'size', 'namespace', 'media_type', 'media_subtype', 'public_id',
                'created', 'modified', 'owner', 'access', 'extra'
            ]

        stmt = select(*[getattr(File, f) for f in projection])

        if unique:
            stmt = stmt.distinct()

        if client is not None:
            stmt = stmt.filter_by(profile_id=self.profiles[client.profile])
        else:
            if public_id is None:
                raise exceptions.InternalError(
                    "client can only be omitted when public_id is set"
                )
        if file_id is not None:
            stmt = stmt.filter_by(id=file_id)
        if version is not None:
            stmt = stmt.filter_by(version=version)
        if parent is not None:
            stmt = stmt.filter_by(parent=parent)
        if type_ is not None:
            stmt = stmt.filter_by(type=type_)
        if file_hash is not None:
            stmt = stmt.filter_by(file_hash=file_hash)
        if hash_algo is not None:
            stmt = stmt.filter_by(hash_algo=hash_algo)
        if name is not None:
            stmt = stmt.filter_by(name=name)
        if namespace is not None:
            stmt = stmt.filter_by(namespace=namespace)
        if mime_type is not None:
            if '/' in mime_type:
                media_type, media_subtype = mime_type.split("/", 1)
                stmt = stmt.filter_by(media_type=media_type, media_subtype=media_subtype)
            else:
                stmt = stmt.filter_by(media_type=mime_type)
        if public_id is not None:
            stmt = stmt.filter_by(public_id=public_id)
        if owner is not None:
            stmt = stmt.filter_by(owner=owner)
        if access is not None:
            raise NotImplementedError('Access check is not implemented yet')
            # a JSON comparison is needed here

        async with self.session() as session:
            result = await session.execute(stmt)

        return [dict(r) for r in result]

    @aio
    async def setFile(
        self,
        client: SatXMPPEntity,
        name: str,
        file_id: str,
        version: str = "",
        parent: str = "",
        type_: str = C.FILE_TYPE_FILE,
        file_hash: Optional[str] = None,
        hash_algo: Optional[str] = None,
        size: int = None,
        namespace: Optional[str] = None,
        mime_type: Optional[str] = None,
        public_id: Optional[str] = None,
        created: Optional[float] = None,
        modified: Optional[float] = None,
        owner: Optional[jid.JID] = None,
        access: Optional[dict] = None,
        extra: Optional[dict] = None
    ) -> None:
        """Set a file metadata

        @param client: client owning the file
        @param name: name of the file (must not contain "/")
        @param file_id: unique id of the file
        @param version: version of this file
        @param parent: id of the directory containing this file
            Empty string if it is a root file/directory
        @param type_: one of:
            - file
            - directory
        @param file_hash: unique hash of the payload
        @param hash_algo: algorithm used for hashing the file (usually sha-256)
        @param size: size in bytes
        @param namespace: identifier (human readable is better) to group files
            for instance, namespace could be used to group files in a specific photo album
        @param mime_type: media type of the file, or None if not known/guessed
        @param public_id: ID used to server the file publicly via HTTP
        @param created: UNIX time of creation
        @param modified: UNIX time of last modification, or None to use created date
        @param owner: jid of the owner of the file (mainly useful for component)
        @param access: serialisable dictionary with access rules. See [memory.memory] for
            details
        @param extra: serialisable dictionary of any extra data
            will be encoded to json in database
        """
        if mime_type is None:
            media_type = media_subtype = None
        elif '/' in mime_type:
            media_type, media_subtype = mime_type.split('/', 1)
        else:
            media_type, media_subtype = mime_type, None

        async with self.session() as session:
            async with session.begin():
                session.add(File(
                    id=file_id,
                    version=version.strip(),
                    parent=parent,
                    type=type_,
                    file_hash=file_hash,
                    hash_algo=hash_algo,
                    name=name,
                    size=size,
                    namespace=namespace,
                    media_type=media_type,
                    media_subtype=media_subtype,
                    public_id=public_id,
                    created=time.time() if created is None else created,
                    modified=modified,
                    owner=owner,
                    access=access,
                    extra=extra,
                    profile_id=self.profiles[client.profile]
                ))

    @aio
    async def fileGetUsedSpace(self, client: SatXMPPEntity, owner: jid.JID) -> int:
        async with self.session() as session:
            result = await session.execute(
                select(sum_(File.size)).filter_by(
                    owner=owner,
                    type=C.FILE_TYPE_FILE,
                    profile_id=self.profiles[client.profile]
                ))
        return result.scalar_one_or_none() or 0

    @aio
    async def fileDelete(self, file_id: str) -> None:
        """Delete file metadata from the database

        @param file_id: id of the file to delete
        NOTE: file itself must still be removed, this method only handle metadata in
            database
        """
        async with self.session() as session:
            await session.execute(delete(File).filter_by(id=file_id))
            await session.commit()

    @aio
    async def fileUpdate(
        self,
        file_id: str,
        column: str,
        update_cb: Callable[[dict], None]
    ) -> None:
        """Update a column value using a method to avoid race conditions

        the older value will be retrieved from database, then update_cb will be applied to
        update it, and file will be updated checking that older value has not been changed
        meanwhile by an other user. If it has changed, it tries again a couple of times
        before failing
        @param column: column name (only "access" or "extra" are allowed)
        @param update_cb: method to update the value of the colum
            the method will take older value as argument, and must update it in place
            update_cb must not care about serialization,
            it get the deserialized data (i.e. a Python object) directly
        @raise exceptions.NotFound: there is not file with this id
        """
        if column not in ('access', 'extra'):
            raise exceptions.InternalError('bad column name')
        orm_col = getattr(File, column)

        for i in range(5):
            async with self.session() as session:
                try:
                    value = (await session.execute(
                        select(orm_col).filter_by(id=file_id)
                    )).scalar_one()
                except NoResultFound:
                    raise exceptions.NotFound
                old_value = copy.deepcopy(value)
                update_cb(value)
                stmt = update(File).filter_by(id=file_id).values({column: value})
                if not old_value:
                    # because JsonDefaultDict convert NULL to an empty dict, we have to
                    # test both for empty dict and None when we have an empty dict
                    stmt = stmt.where((orm_col == None) | (orm_col == old_value))
                else:
                    stmt = stmt.where(orm_col == old_value)
                result = await session.execute(stmt)
                await session.commit()

            if result.rowcount == 1:
                break

            log.warning(
                _("table not updated, probably due to race condition, trying again "
                  "({tries})").format(tries=i+1)
            )

        else:
            raise exceptions.DatabaseError(
                _("Can't update file {file_id} due to race condition")
                .format(file_id=file_id)
            )

    @aio
    async def getPubsubNode(
        self,
        client: SatXMPPEntity,
        service: jid.JID,
        name: str,
        with_items: bool = False,
        with_subscriptions: bool = False,
    ) -> Optional[PubsubNode]:
        """
        """
        async with self.session() as session:
            stmt = (
                select(PubsubNode)
                .filter_by(
                    service=service,
                    name=name,
                    profile_id=self.profiles[client.profile],
                )
            )
            if with_items:
                stmt = stmt.options(
                    joinedload(PubsubNode.items)
                )
            if with_subscriptions:
                stmt = stmt.options(
                    joinedload(PubsubNode.subscriptions)
                )
            result = await session.execute(stmt)
        return result.unique().scalar_one_or_none()

    @aio
    async def setPubsubNode(
        self,
        client: SatXMPPEntity,
        service: jid.JID,
        name: str,
        analyser: Optional[str] = None,
        type_: Optional[str] = None,
        subtype: Optional[str] = None,
        subscribed: bool = False,
    ) -> PubsubNode:
        node = PubsubNode(
            profile_id=self.profiles[client.profile],
            service=service,
            name=name,
            subscribed=subscribed,
            analyser=analyser,
            type_=type_,
            subtype=subtype,
            subscriptions=[],
        )
        async with self.session() as session:
            async with session.begin():
                session.add(node)
        return node

    @aio
    async def updatePubsubNodeSyncState(
        self,
        node: PubsubNode,
        state: SyncState
    ) -> None:
        async with self.session() as session:
            async with session.begin():
                await session.execute(
                    update(PubsubNode)
                    .filter_by(id=node.id)
                    .values(
                        sync_state=state,
                        sync_state_updated=time.time(),
                    )
                )

    @aio
    async def deletePubsubNode(
        self,
        profiles: Optional[List[str]],
        services: Optional[List[jid.JID]],
        names: Optional[List[str]]
    ) -> None:
        """Delete items cached for a node

        @param profiles: profile names from which nodes must be deleted.
            None to remove nodes from ALL profiles
        @param services: JIDs of pubsub services from which nodes must be deleted.
            None to remove nodes from ALL services
        @param names: names of nodes which must be deleted.
            None to remove ALL nodes whatever is their names
        """
        stmt = delete(PubsubNode)
        if profiles is not None:
            stmt = stmt.where(
                PubsubNode.profile.in_(
                    [self.profiles[p] for p in profiles]
                )
            )
        if services is not None:
            stmt = stmt.where(PubsubNode.service.in_(services))
        if names is not None:
            stmt = stmt.where(PubsubNode.name.in_(names))
        async with self.session() as session:
            await session.execute(stmt)
            await session.commit()

    @aio
    async def cachePubsubItems(
        self,
        client: SatXMPPEntity,
        node: PubsubNode,
        items: List[domish.Element],
        parsed_items: Optional[List[dict]] = None,
    ) -> None:
        """Add items to database, using an upsert taking care of "updated" field"""
        if parsed_items is not None and len(items) != len(parsed_items):
            raise exceptions.InternalError(
                "parsed_items must have the same lenght as items"
            )
        async with self.session() as session:
            async with session.begin():
                for idx, item in enumerate(items):
                    parsed = parsed_items[idx] if parsed_items else None
                    stmt = insert(PubsubItem).values(
                        node_id = node.id,
                        name = item["id"],
                        data = item,
                        parsed = parsed,
                    ).on_conflict_do_update(
                        index_elements=(PubsubItem.node_id, PubsubItem.name),
                        set_={
                            PubsubItem.data: item,
                            PubsubItem.parsed: parsed,
                            PubsubItem.updated: now()
                        }
                    )
                    await session.execute(stmt)
                await session.commit()

    @aio
    async def deletePubsubItems(
        self,
        node: PubsubNode,
        items_names: Optional[List[str]] = None
    ) -> None:
        """Delete items cached for a node

        @param node: node from which items must be deleted
        @param items_names: names of items to delete
            if None, ALL items will be deleted
        """
        stmt = delete(PubsubItem)
        if node is not None:
            if isinstance(node, list):
                stmt = stmt.where(PubsubItem.node_id.in_([n.id for n in node]))
            else:
                stmt = stmt.filter_by(node_id=node.id)
        if items_names is not None:
            stmt = stmt.where(PubsubItem.name.in_(items_names))
        async with self.session() as session:
            await session.execute(stmt)
            await session.commit()

    @aio
    async def purgePubsubItems(
        self,
        services: Optional[List[jid.JID]] = None,
        names: Optional[List[str]] = None,
        types: Optional[List[str]] = None,
        subtypes: Optional[List[str]] = None,
        profiles: Optional[List[str]] = None,
        created_before: Optional[datetime] = None,
        updated_before: Optional[datetime] = None,
    ) -> None:
        """Delete items cached for a node

        @param node: node from which items must be deleted
        @param items_names: names of items to delete
            if None, ALL items will be deleted
        """
        stmt = delete(PubsubItem)
        node_fields = {
            "service": services,
            "name": names,
            "type_": types,
            "subtype": subtypes,
        }
        if profiles is not None:
            node_fields["profile_id"] = [self.profiles[p] for p in profiles]

        if any(x is not None for x in node_fields.values()):
            sub_q = select(PubsubNode.id)
            for col, values in node_fields.items():
                if values is None:
                    continue
                sub_q = sub_q.where(getattr(PubsubNode, col).in_(values))
            stmt = (
                stmt
                .where(PubsubItem.node_id.in_(sub_q))
                .execution_options(synchronize_session=False)
            )

        if created_before is not None:
            stmt = stmt.where(PubsubItem.created < created_before)

        if updated_before is not None:
            stmt = stmt.where(PubsubItem.updated < updated_before)

        async with self.session() as session:
            await session.execute(stmt)
            await session.commit()

    @aio
    async def getItems(
        self,
        node: PubsubNode,
        max_items: Optional[int] = None,
        item_ids: Optional[list[str]] = None,
        before: Optional[str] = None,
        after: Optional[str] = None,
        from_index: Optional[int] = None,
        order_by: Optional[List[str]] = None,
        desc: bool = True,
        force_rsm: bool = False,
    ) -> Tuple[List[PubsubItem], dict]:
        """Get Pubsub Items from cache

        @param node: retrieve items from this node (must be synchronised)
        @param max_items: maximum number of items to retrieve
        @param before: get items which are before the item with this name in given order
            empty string is not managed here, use desc order to reproduce RSM
            behaviour.
        @param after: get items which are after the item with this name in given order
        @param from_index: get items with item index (as defined in RSM spec)
            starting from this number
        @param order_by: sorting order of items (one of C.ORDER_BY_*)
        @param desc: direction or ordering
        @param force_rsm: if True, force the use of RSM worklow.
            RSM workflow is automatically used if any of before, after or
            from_index is used, but if only RSM max_items is used, it won't be
            used by default. This parameter let's use RSM workflow in this
            case. Note that in addition to RSM metadata, the result will not be
            the same (max_items without RSM will returns most recent items,
            i.e. last items in modification order, while max_items with RSM
            will return the oldest ones (i.e. first items in modification
            order).
            to be used when max_items is used from RSM
        """

        metadata = {
            "service": node.service,
            "node": node.name,
            "uri": uri.buildXMPPUri(
                "pubsub",
                path=node.service.full(),
                node=node.name,
            ),
        }
        if max_items is None:
            max_items = 20

        use_rsm = any((before, after, from_index is not None))
        if force_rsm and not use_rsm:
            #
            use_rsm = True
            from_index = 0

        stmt = (
            select(PubsubItem)
            .filter_by(node_id=node.id)
            .limit(max_items)
        )

        if item_ids is not None:
            stmt = stmt.where(PubsubItem.name.in_(item_ids))

        if not order_by:
            order_by = [C.ORDER_BY_MODIFICATION]

        order = []
        for order_type in order_by:
            if order_type == C.ORDER_BY_MODIFICATION:
                if desc:
                    order.extend((PubsubItem.updated.desc(), PubsubItem.id.desc()))
                else:
                    order.extend((PubsubItem.updated.asc(), PubsubItem.id.asc()))
            elif order_type == C.ORDER_BY_CREATION:
                if desc:
                    order.append(PubsubItem.id.desc())
                else:
                    order.append(PubsubItem.id.asc())
            else:
                raise exceptions.InternalError(f"Unknown order type {order_type!r}")

        stmt = stmt.order_by(*order)

        if use_rsm:
            # CTE to have result row numbers
            row_num_q = select(
                PubsubItem.id,
                PubsubItem.name,
                # row_number starts from 1, but RSM index must start from 0
                (func.row_number().over(order_by=order)-1).label("item_index")
            ).filter_by(node_id=node.id)

            row_num_cte = row_num_q.cte()

            if max_items > 0:
                # as we can't simply use PubsubItem.id when we order by modification,
                # we need to use row number
                item_name = before or after
                row_num_limit_q = (
                    select(row_num_cte.c.item_index)
                    .where(row_num_cte.c.name==item_name)
                ).scalar_subquery()

                stmt = (
                    select(row_num_cte.c.item_index, PubsubItem)
                    .join(row_num_cte, PubsubItem.id == row_num_cte.c.id)
                    .limit(max_items)
                )
                if before:
                    stmt = (
                        stmt
                        .where(row_num_cte.c.item_index<row_num_limit_q)
                        .order_by(row_num_cte.c.item_index.desc())
                    )
                elif after:
                    stmt = (
                        stmt
                        .where(row_num_cte.c.item_index>row_num_limit_q)
                        .order_by(row_num_cte.c.item_index.asc())
                    )
                else:
                    stmt = (
                        stmt
                        .where(row_num_cte.c.item_index>=from_index)
                        .order_by(row_num_cte.c.item_index.asc())
                    )
                    # from_index is used

            async with self.session() as session:
                if max_items == 0:
                    items = result = []
                else:
                    result = await session.execute(stmt)
                    result = result.all()
                    if before:
                        result.reverse()
                    items = [row[-1] for row in result]
                rows_count = (
                    await session.execute(row_num_q.with_only_columns(count()))
                ).scalar_one()

            try:
                index = result[0][0]
            except IndexError:
                index = None

            try:
                first = result[0][1].name
            except IndexError:
                first = None
                last = None
            else:
                last = result[-1][1].name

            metadata["rsm"] = {
                k: v for k, v in {
                    "index": index,
                    "count": rows_count,
                    "first": first,
                    "last": last,
                }.items() if v is not None
            }
            metadata["complete"] = (index or 0) + len(result) == rows_count

            return items, metadata

        async with self.session() as session:
            result = await session.execute(stmt)

        result = result.scalars().all()
        if desc:
            result.reverse()
        return result, metadata

    def _getSqlitePath(
        self,
        path: List[Union[str, int]]
    ) -> str:
        """generate path suitable to query JSON element with SQLite"""
        return f"${''.join(f'[{p}]' if isinstance(p, int) else f'.{p}' for p in path)}"

    @aio
    async def searchPubsubItems(
        self,
        query: dict,
    ) -> Tuple[List[PubsubItem]]:
        """Search for pubsub items in cache

        @param query: search terms. Keys can be:
            :fts (str):
                Full-Text Search query. Currently SQLite FT5 engine is used, its query
                syntax can be used, see `FTS5 Query documentation
                <https://sqlite.org/fts5.html#full_text_query_syntax>`_
            :profiles (list[str]):
                filter on nodes linked to those profiles
            :nodes (list[str]):
                filter on nodes with those names
            :services (list[jid.JID]):
                filter on nodes from those services
            :types (list[str|None]):
                filter on nodes with those types. None can be used to filter on nodes with
                no type set
            :subtypes (list[str|None]):
                filter on nodes with those subtypes. None can be used to filter on nodes with
                no subtype set
            :names (list[str]):
                filter on items with those names
            :parsed (list[dict]):
                Filter on a parsed data field. The dict must contain 3 keys: ``path``
                which is a list of str or int giving the path to the field of interest
                (str for a dict key, int for a list index), ``operator`` with indicate the
                operator to use to check the condition, and ``value`` which depends of
                field type and operator.

                See documentation for details on operators (it's currently explained at
                ``doc/libervia-cli/pubsub_cache.rst`` in ``search`` command
                documentation).

            :order-by (list[dict]):
                Indicates how to order results. The dict can contain either a ``order``
                for a well-know order or a ``path`` for a parsed data field path
                (``order`` and ``path`` can't be used at the same time), an an optional
                ``direction`` which can be ``asc`` or ``desc``. See documentation for
                details on well-known orders (it's currently explained at
                ``doc/libervia-cli/pubsub_cache.rst`` in ``search`` command
                documentation).

            :index (int):
                starting index of items to return from the query result. It's translated
                to SQL's OFFSET

            :limit (int):
                maximum number of items to return. It's translated to SQL's LIMIT.

        @result: found items (the ``node`` attribute will be filled with suitable
            PubsubNode)
        """
        # TODO: FTS and parsed data filters use SQLite specific syntax
        #   when other DB engines will be used, this will have to be adapted
        stmt = select(PubsubItem)

        # Full-Text Search
        fts = query.get("fts")
        if fts:
            fts_select = text(
                "SELECT rowid, rank FROM pubsub_items_fts(:fts_query)"
            ).bindparams(fts_query=fts).columns(rowid=Integer).subquery()
            stmt = (
                stmt
                .select_from(fts_select)
                .outerjoin(PubsubItem, fts_select.c.rowid == PubsubItem.id)
            )

        # node related filters
        profiles = query.get("profiles")
        if (profiles
            or any(query.get(k) for k in ("nodes", "services", "types", "subtypes"))
        ):
            stmt = stmt.join(PubsubNode).options(contains_eager(PubsubItem.node))
            if profiles:
                try:
                    stmt = stmt.where(
                        PubsubNode.profile_id.in_(self.profiles[p] for p in profiles)
                    )
                except KeyError as e:
                    raise exceptions.ProfileUnknownError(
                        f"This profile doesn't exist: {e.args[0]!r}"
                    )
            for key, attr in (
                ("nodes", "name"),
                ("services", "service"),
                ("types", "type_"),
                ("subtypes", "subtype")
            ):
                value = query.get(key)
                if not value:
                    continue
                if key in ("types", "subtypes") and None in value:
                    # NULL can't be used with SQL's IN, so we have to add a condition with
                    # IS NULL, and use a OR if there are other values to check
                    value.remove(None)
                    condition = getattr(PubsubNode, attr).is_(None)
                    if value:
                        condition = or_(
                            getattr(PubsubNode, attr).in_(value),
                            condition
                        )
                else:
                    condition = getattr(PubsubNode, attr).in_(value)
                stmt = stmt.where(condition)
        else:
            stmt = stmt.options(selectinload(PubsubItem.node))

        # names
        names = query.get("names")
        if names:
            stmt = stmt.where(PubsubItem.name.in_(names))

        # parsed data filters
        parsed = query.get("parsed", [])
        for filter_ in parsed:
            try:
                path = filter_["path"]
                operator = filter_["op"]
                value = filter_["value"]
            except KeyError as e:
                raise ValueError(
                    f'missing mandatory key {e.args[0]!r} in "parsed" filter'
                )
            try:
                op_attr = OP_MAP[operator]
            except KeyError:
                raise ValueError(f"invalid operator: {operator!r}")
            sqlite_path = self._getSqlitePath(path)
            if operator in ("overlap", "ioverlap", "disjoint", "idisjoint"):
                col = literal_column("json_each.value")
                if operator[0] == "i":
                    col = func.lower(col)
                    value = [str(v).lower() for v in value]
                condition = (
                    select(1)
                    .select_from(func.json_each(PubsubItem.parsed, sqlite_path))
                    .where(col.in_(value))
                ).scalar_subquery()
                if operator in ("disjoint", "idisjoint"):
                    condition = condition.is_(None)
                stmt = stmt.where(condition)
            elif operator == "between":
                try:
                    left, right = value
                except (ValueError, TypeError):
                    raise ValueError(_(
                        "invalid value for \"between\" filter, you must use a 2 items "
                        "array: {value!r}"
                    ).format(value=value))
                col = func.json_extract(PubsubItem.parsed, sqlite_path)
                stmt = stmt.where(col.between(left, right))
            else:
                # we use func.json_extract instead of generic JSON way because SQLAlchemy
                # add a JSON_QUOTE to the value, and we want SQL value
                col = func.json_extract(PubsubItem.parsed, sqlite_path)
                stmt = stmt.where(getattr(col, op_attr)(value))

        # order
        order_by = query.get("order-by") or [{"order": "creation"}]

        for order_data in order_by:
            order, path = order_data.get("order"), order_data.get("path")
            if order and path:
                raise ValueError(_(
                    '"order" and "path" can\'t be used at the same time in '
                    '"order-by" data'
                ))
            if order:
                if order == "creation":
                    col = PubsubItem.id
                elif order == "modification":
                    col = PubsubItem.updated
                elif order == "item_id":
                    col = PubsubItem.name
                elif order == "rank":
                    if not fts:
                        raise ValueError(
                            "'rank' order can only be used with Full-Text Search (fts)"
                        )
                    col = literal_column("rank")
                else:
                    raise NotImplementedError(f"Unknown {order!r} order")
            else:
                # we have a JSON path
                # sqlite_path = self._getSqlitePath(path)
                col = PubsubItem.parsed[path]
            direction = order_data.get("direction", "ASC").lower()
            if not direction in ("asc", "desc"):
                raise ValueError(f"Invalid order-by direction: {direction!r}")
            stmt = stmt.order_by(getattr(col, direction)())

        # offset, limit
        index = query.get("index")
        if index:
            stmt = stmt.offset(index)
        limit = query.get("limit")
        if limit:
            stmt = stmt.limit(limit)

        async with self.session() as session:
            result = await session.execute(stmt)

        return result.scalars().all()
author	Goffi <goffi@goffi.org>
date	Sun, 15 May 2022 14:15:24 +0200
parents	af485e8afe03
children	cc653b2685f0