# HG changeset patch # User Goffi # Date 1687873695 -7200 # Node ID eaa0daa7f83400a1e0e0787bb71066eef13b777f # Parent c0bb4b3fdccfd62d0983083241981e6bd6b5b26c plugin URL preview: URL preview first draft diff -r c0bb4b3fdccf -r eaa0daa7f834 libervia/backend/plugins/plugin_misc_url_preview.py --- /dev/null Thu Jan 01 00:00:00 1970 +0000 +++ b/libervia/backend/plugins/plugin_misc_url_preview.py Tue Jun 27 15:48:15 2023 +0200 @@ -0,0 +1,503 @@ +#!/usr/bin/env python3 + + +# Libervia plugin to handle URL previews +# Copyright (C) 2009-2022 Jérôme Poisson (goffi@goffi.org) + +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU Affero General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. + +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU Affero General Public License for more details. + +# You should have received a copy of the GNU Affero General Public License +# along with this program. If not, see <https://www.gnu.org/licenses/>.
+ +from dataclasses import dataclass +import json +from textwrap import dedent +from typing import Callable, Dict, List, Optional, Union +from urllib import parse +import fnmatch + +from lxml import etree +import treq +from twisted.internet import defer + +from libervia.backend.core.constants import Const as C +from libervia.backend.core.core_types import SatXMPPEntity +from libervia.backend.core.exceptions import ConflictError +from libervia.backend.core.i18n import _ +from libervia.backend.core.log import getLogger +from libervia.backend.tools.common import data_format +from libervia.backend.tools.common.async_utils import async_lru + +log = getLogger(__name__) + +PLUGIN_INFO = { + C.PI_NAME: "Preview", + C.PI_IMPORT_NAME: "Preview", + C.PI_TYPE: C.PLUG_TYPE_MISC, + C.PI_PROTOCOLS: ["Open Graph", "oEmbed"], + C.PI_DEPENDENCIES: ["TEXT_SYNTAXES"], + C.PI_MAIN: "Preview", + C.PI_HANDLER: "no", + C.PI_DESCRIPTION: dedent( + _( + """\ + Retrieves and provides a preview of URLs using various protocols. Initially, it + uses the Open Graph protocol for most web pages. Specialized handlers are + implemented for YouTube using the oEmbed protocol. 
+ """ + ) + ), +} + +OG_TAGS = [ + "title", + "type", + "image", + "url", + "audio", + "description", + "determiner", + "locale", + "locale:alternate", + "site_name", + "video", +] + + +class PreviewFetchError(Exception): + pass + + +@dataclass +class Protocol: + name: str + callback: Callable + priority: int + + +class Preview: + protocols: Dict[str, Protocol] = {} + domain_protocols: Dict[str, str] = {} + + def __init__(self, host): + log.info(_("Preview plugin initialization")) + self.host = host + + # generic protocols + + self.register("open_graph", self.fetch_open_graph_data, priority=100) + self.register("oembed", self.fetch_generic_oembed_data, priority=50) + self.register("generic", self.fetch_generic_data, priority=0) + + # domain specific protocols + + self.register("oembed-youtube", self.fetch_youtube_oembed_data, priority=-100) + self.register_domain_protocol( + ["www.youtube.com", "youtu.be", "m.youtube.com"], "oembed-youtube" + ) + + self.register("wikipedia", self.fetch_wikipedia_data, priority=-80) + self.register_domain_protocol(["*.wikipedia.org"], "wikipedia") + + self.register("invidious", self.fetch_invidious_data, priority=-90) + self.register_domain_protocol( + ["yewtu.be", "www.yewtu.be", "invidious.fdn.fr"], + "invidious" + ) + + # bridge methods + + host.bridge.add_method( + "url_preview_get", + ".plugin", + in_sign="sss", + out_sign="s", + method=self._url_preview_get, + async_=True, + ) + + # API + + def _url_preview_get(self, url: str, options: str, profile_key: str) -> defer.Deferred: + client = self.host.get_client(profile_key) + d = defer.ensureDeferred( + self.get_preview_data(client, url, data_format.deserialise(options)) + ) + d.addCallback(data_format.serialise) + return d + + @async_lru() + async def get_preview_data( + self, client: SatXMPPEntity, url: str, options: dict + ) -> Optional[dict]: + """Fetch preview data from a url using registered protocols + + @param url: The url to fetch the preview data from + @param options: 
Additional options that may be used while fetching preview data + @return: A dictionary containing the preview data or None if no data could be + fetched + """ + parsed_url = parse.urlparse(url) + domain = parsed_url.netloc + + preview_data: Optional[dict] = None + matched_protocol = None + for registered_domain, registered_protocol in self.domain_protocols.items(): + if fnmatch.fnmatch(domain, registered_domain): + matched_protocol = registered_protocol + break + + if matched_protocol is not None: + callback = self.protocols[matched_protocol].callback + preview_data = await callback(client, url, options) + else: + for name, protocol in sorted( + self.protocols.items(), key=lambda item: item[1].priority, reverse=True + ): + try: + preview_data = await protocol.callback(client, url, options) + except Exception as e: + log.warning(f"Can't run protocol {name} for {url}: {e}") + else: + if preview_data is not None: + matched_protocol = protocol.name + break + + if preview_data is not None: + preview_data["protocol"] = matched_protocol + # we don't clean html for youtube as we need Javascript to make it work, and + # for invidious as we generate it ourself + if "html" in preview_data: + if matched_protocol in ("oembed-youtube", "invidious"): + # this flag indicate that we know the source of HTML and we should be + # able to trust it. This will add `allow-scripts` and + # `allow-same-origin` in the preview ' + ) + # structure the data to be returned + data = { + "title": video_data.get("title"), + "description": video_data.get("description"), + "url": url, + "image": video_data.get("videoThumbnails", [{}])[0].get("url"), + "provider_name": "Invidious", + "provider_url": f"https://{parsed_url.netloc}", + "html": html, + "author_name": video_data.get("author"), + "author_url": f"https://{parsed_url.netloc}/channel/{video_data.get('authorId')}", + } + return data + else: + log.warning(f"Unable to fetch video data from Invidious API for {video_id}") + return None