diff libervia/backend/plugins/plugin_misc_url_preview.py @ 4103:eaa0daa7f834

plugin URL preview: URL preview first draft
author Goffi <goffi@goffi.org>
date Tue, 27 Jun 2023 15:48:15 +0200
parents
children 0d7bb4df2343
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/libervia/backend/plugins/plugin_misc_url_preview.py	Tue Jun 27 15:48:15 2023 +0200
@@ -0,0 +1,503 @@
+#!/usr/bin/env python3
+
+
+# Libervia plugin to retrieve URL previews
+# Copyright (C) 2009-2022 Jérôme Poisson (goffi@goffi.org)
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from dataclasses import dataclass
+import json
+from textwrap import dedent
+from typing import Callable, Dict, List, Optional, Union
+from urllib import parse
+import fnmatch
+
+from lxml import etree
+import treq
+from twisted.internet import defer
+
+from libervia.backend.core.constants import Const as C
+from libervia.backend.core.core_types import SatXMPPEntity
+from libervia.backend.core.exceptions import ConflictError
+from libervia.backend.core.i18n import _
+from libervia.backend.core.log import getLogger
+from libervia.backend.tools.common import data_format
+from libervia.backend.tools.common.async_utils import async_lru
+
+log = getLogger(__name__)
+
+# Plugin metadata, keyed by the ``C.PI_*`` constants.
+# NOTE(review): presumably consumed by the Libervia plugin loader — confirm.
+# "TEXT_SYNTAXES" is a dependency because ``Preview.get_preview_data`` uses its
+# ``clean_xhtml`` method on HTML returned by the protocols.
+PLUGIN_INFO = {
+    C.PI_NAME: "Preview",
+    C.PI_IMPORT_NAME: "Preview",
+    C.PI_TYPE: C.PLUG_TYPE_MISC,
+    C.PI_PROTOCOLS: ["Open Graph", "oEmbed"],
+    C.PI_DEPENDENCIES: ["TEXT_SYNTAXES"],
+    C.PI_MAIN: "Preview",
+    C.PI_HANDLER: "no",
+    C.PI_DESCRIPTION: dedent(
+        _(
+            """\
+    Retrieves and provides a preview of URLs using various protocols. Initially, it
+    uses the Open Graph protocol for most web pages. Specialized handlers are
+    implemented for YouTube using the oEmbed protocol.
+    """
+        )
+    ),
+}
+
+# Open Graph property names, without their "og:" prefix, which are looked for in
+# the <meta> elements of a page by ``Preview.fetch_open_graph_data``.
+OG_TAGS = [
+    "title",
+    "type",
+    "image",
+    "url",
+    "audio",
+    "description",
+    "determiner",
+    "locale",
+    "locale:alternate",
+    "site_name",
+    "video",
+]
+
+
+class PreviewFetchError(Exception):
+    """Raised when the data needed for a preview can't be retrieved
+
+    In this module it is raised when an HTTP request doesn't answer with a 200 status
+    code.
+    """
+    pass
+
+
+@dataclass
+class Protocol:
+    """Description of a protocol registered to retrieve preview data"""
+    # unique name under which the protocol is registered
+    name: str
+    # async callable, called with (client, url, options); it returns the preview data
+    # as a dict, or None if no data is available
+    callback: Callable
+    # when no domain specific protocol matches, protocols are tried by decreasing
+    # priority
+    priority: int
+
+class Preview:
+    """Plugin retrieving preview data of URLs through registered protocols"""
+    # registered protocols, by name.
+    # NOTE: both mappings are class attributes, they are shared by all instances and
+    # filled through the ``register`` and ``register_domain_protocol`` class methods
+    protocols: Dict[str, Protocol] = {}
+    # domain pattern (matched with ``fnmatch``) to name of the protocol to use for it
+    domain_protocols: Dict[str, str] = {}
+
+    def __init__(self, host):
+        """Register the built-in protocols and expose the bridge method
+
+        @param host: object giving access to the bridge, the clients and the other
+            plugins
+        """
+        log.info(_("Preview plugin initialization"))
+        self.host = host
+
+        # generic protocols: (name, callback, priority)
+        generic_protocols = (
+            ("open_graph", self.fetch_open_graph_data, 100),
+            ("oembed", self.fetch_generic_oembed_data, 50),
+            ("generic", self.fetch_generic_data, 0),
+        )
+        for proto_name, proto_cb, proto_priority in generic_protocols:
+            self.register(proto_name, proto_cb, priority=proto_priority)
+
+        # domain specific protocols: (name, callback, priority, domains)
+        domain_specific_protocols = (
+            (
+                "oembed-youtube",
+                self.fetch_youtube_oembed_data,
+                -100,
+                ["www.youtube.com", "youtu.be", "m.youtube.com"],
+            ),
+            ("wikipedia", self.fetch_wikipedia_data, -80, ["*.wikipedia.org"]),
+            (
+                "invidious",
+                self.fetch_invidious_data,
+                -90,
+                ["yewtu.be", "www.yewtu.be", "invidious.fdn.fr"],
+            ),
+        )
+        for proto_name, proto_cb, proto_priority, domains in domain_specific_protocols:
+            self.register(proto_name, proto_cb, priority=proto_priority)
+            self.register_domain_protocol(domains, proto_name)
+
+        # bridge methods
+        bridge_kwargs = {
+            "in_sign": "sss",
+            "out_sign": "s",
+            "method": self._url_preview_get,
+            "async_": True,
+        }
+        host.bridge.add_method("url_preview_get", ".plugin", **bridge_kwargs)
+
+    # API
+
+    def _url_preview_get(self, url: str, options: str, profile_key: str) -> defer.Deferred:
+        """Bridge front-end of ``get_preview_data``
+
+        @param url: URL to get a preview of
+        @param options: serialised options
+        @param profile_key: key used to retrieve the client
+        @return: Deferred firing with the serialised preview data
+        """
+        client = self.host.get_client(profile_key)
+        parsed_options = data_format.deserialise(options)
+        preview_d = defer.ensureDeferred(
+            self.get_preview_data(client, url, parsed_options)
+        )
+        preview_d.addCallback(data_format.serialise)
+        return preview_d
+
+    @async_lru()
+    async def get_preview_data(
+        self, client: SatXMPPEntity, url: str, options: dict
+    ) -> Optional[dict]:
+        """Fetch preview data from a url using registered protocols
+
+        If the host of the URL matches a registered domain pattern, only the protocol
+        associated with this domain is used (its exceptions are propagated). Otherwise
+        all registered protocols are tried by decreasing priority until one returns
+        data, a failing protocol being logged and skipped.
+
+        @param client: client given to the protocol callback
+        @param url: The url to fetch the preview data from
+        @param options: Additional options that may be used while fetching preview data
+        @return: A dictionary containing the preview data or None if no data could be
+            fetched. The name of the protocol used is set in the "protocol" key. If
+            "html" is present, "html_known" tells if it comes from a trusted source
+            (it is cleaned otherwise).
+        """
+        # ``hostname`` is used rather than ``netloc``: it is lower-cased and doesn't
+        # include port or credentials, which must not prevent a domain from matching
+        domain = parse.urlparse(url).hostname or ""
+
+        preview_data: Optional[dict] = None
+        matched_protocol = None
+        for registered_domain, registered_protocol in self.domain_protocols.items():
+            if fnmatch.fnmatch(domain, registered_domain.lower()):
+                matched_protocol = registered_protocol
+                break
+        domain_matched = matched_protocol is not None
+
+        if domain_matched:
+            callback = self.protocols[matched_protocol].callback
+            preview_data = await callback(client, url, options)
+        else:
+            for name, protocol in sorted(
+                self.protocols.items(), key=lambda item: item[1].priority, reverse=True
+            ):
+                try:
+                    preview_data = await protocol.callback(client, url, options)
+                except Exception as e:
+                    log.warning(f"Can't run protocol {name} for {url}: {e}")
+                else:
+                    if preview_data is not None:
+                        matched_protocol = protocol.name
+                        break
+
+        if preview_data is None:
+            return None
+
+        preview_data["protocol"] = matched_protocol
+        if "html" in preview_data:
+            # we don't clean html for youtube as we need Javascript to make it work, and
+            # for invidious as we generate it ourself.
+            # The HTML is only trusted when the URL belongs to a registered domain: a
+            # domain specific protocol reached through the priority fallback has been
+            # run against an arbitrary host, which must not get script permissions.
+            if domain_matched and matched_protocol in ("oembed-youtube", "invidious"):
+                # this flag indicate that we know the source of HTML and we should be
+                # able to trust it. This will add `allow-scripts` and
+                # `allow-same-origin` in the preview <iframe> "sandbox" attribute
+                preview_data["html_known"] = True
+            else:
+                preview_data["html_known"] = False
+                clean_xhtml = self.host.plugins["TEXT_SYNTAXES"].clean_xhtml
+                try:
+                    preview_data["html"] = clean_xhtml(preview_data["html"])
+                except Exception as e:
+                    log.warning(f"Can't clean html data: {e}\n{preview_data}")
+                    del preview_data["html"]
+
+        return preview_data
+
+    @classmethod
+    def register(cls, name: str, callback: Callable, priority: int = 0):
+        """Add a protocol to the ones usable to retrieve preview data
+
+        The callback is awaited with (client, url, options). It must return the preview
+        data as a dictionary, or None when no data is available.
+
+        @param name: Unique name of the protocol
+        @param callback: Async callback function to fetch preview data
+        @param priority: Priority of the protocol, protocols with a higher number are
+            tried first
+        @return: None
+        @raise ConflictError: a protocol is already registered with this name
+        """
+        if name not in cls.protocols:
+            cls.protocols[name] = Protocol(
+                name=name, callback=callback, priority=priority
+            )
+            return
+
+        raise ConflictError(f"Protocol with the name {name} is already registered.")
+
+    @classmethod
+    def register_domain_protocol(cls, domains: Union[str, List[str]], protocol_name: str):
+        """Register a protocol for a specific domain or list of domains
+
+        @param domains: The domain name or list of domain names. ``fnmatch`` patterns
+            can be used (e.g. "*.example.org")
+        @param protocol_name: The name of the protocol to be associated with the
+            domain(s). The name is first looked up as given, then with spaces removed
+            and lower-cased.
+        @return: None
+        @raise ConflictError: no protocol is registered with this name
+        """
+        if protocol_name not in cls.protocols:
+            # ``register`` stores names as given, thus the exact name is checked first.
+            # The normalised form is kept as a fallback for backward compatibility.
+            protocol_name = protocol_name.replace(" ", "").lower()
+            if protocol_name not in cls.protocols:
+                raise ConflictError(
+                    f"Protocol with the name {protocol_name} is not registered."
+                )
+
+        if isinstance(domains, str):
+            domains = [domains]
+
+        for domain in domains:
+            domain = domain.strip()
+            if not domain:
+                log.warning("empty string used as domain, ignoring")
+                continue
+            cls.domain_protocols[domain] = protocol_name
+
+    # Open Graph
+
+    async def fetch_open_graph_data(
+        self, client: SatXMPPEntity, url: str, options: dict
+    ) -> Optional[dict]:
+        """Fetch Open Graph data from a url
+
+        This method implements the Open Graph protocol, details of which can be found at:
+        http://ogp.me/
+
+        @param client: unused by this protocol
+        @param url: The url to fetch the Open Graph data from
+        @param options: Additional options that may be used while fetching data
+        @return: A dictionary containing the Open Graph data or None if no data could be
+            fetched
+        @raise PreviewFetchError: the page request didn't return a 200 status code
+        """
+        resp = await treq.get(url)
+        if resp.code != 200:
+            raise PreviewFetchError(
+                f"Failed to fetch preview for {url}, status code: {resp.code}"
+            )
+
+        html = await resp.text()
+        tree = etree.fromstring(html, etree.HTMLParser())
+        if tree is None:
+            return None
+
+        # Extract Open Graph data
+        metadata = {}
+        for tag in OG_TAGS:
+            og_el = tree.find(f'.//meta[@property="og:{tag}"]')
+            if og_el is None:
+                continue
+            content = og_el.get("content")
+            # a <meta> without "content" (or with an empty one) carries no data, it
+            # must not end up as a None value in the preview
+            if content:
+                metadata[tag] = content
+
+        if not metadata:
+            return None
+
+        if "site_name" in metadata:
+            metadata.setdefault("provider_name", metadata["site_name"])
+        return metadata
+
+    # oEmbed
+
+    async def _fetch_oembed_data(self, oembed_url: str) -> Optional[dict]:
+        """Request an oEmbed endpoint and decode its JSON answer
+
+        @param oembed_url: The url to fetch the oEmbed data from
+        @return: the decoded JSON payload
+        @raise PreviewFetchError: the request didn't return a 200 status code
+        """
+        resp = await treq.get(oembed_url)
+        if resp.code != 200:
+            raise PreviewFetchError(
+                f"Failed to fetch oEmbed preview for {oembed_url}, status code: "
+                f"{resp.code}"
+            )
+        payload = await resp.text()
+        return json.loads(payload)
+
+    async def fetch_youtube_oembed_data(
+        self, client: SatXMPPEntity, url: str, options: dict
+    ) -> Optional[dict]:
+        """Retrieve the preview of a YouTube URL through YouTube's oEmbed endpoint
+
+        When the returned data contain HTML with an <iframe>, the style of this iframe
+        is set so that it fills its container.
+
+        @param client: unused by this protocol
+        @param url: The url to fetch the YouTube oEmbed data from
+        @param options: Additional options that may be used while fetching data
+        @return: A dictionary containing the YouTube oEmbed data or None if no data could
+            be fetched
+        """
+        quoted_url = parse.quote(url)
+        data = await self._fetch_oembed_data(
+            f"https://www.youtube.com/oembed?url={quoted_url}&format=json"
+        )
+        if data is None or 'html' not in data:
+            return data
+
+        root = etree.HTML(data['html'])
+        iframes = root.xpath('//iframe')
+        if not iframes:
+            log.warning("No <iframe> found in the YouTube oEmbed response")
+            return data
+
+        iframes[0].attrib['style'] = (
+            'position: absolute; top: 0; left: 0; width: 100%; height: 100%;'
+        )
+        # the whole parsed tree is serialised, not only the <iframe>
+        data['html'] = etree.tostring(root, method='html', encoding='unicode')
+        return data
+
+    async def fetch_generic_oembed_data(
+        self, client: SatXMPPEntity, url: str, options: dict
+    ) -> Optional[dict]:
+        """Fetch generic oEmbed data from a url
+
+        The page is retrieved to discover its oEmbed endpoint, which is advertised with
+        a <link type="application/json+oembed"> element.
+
+        @param client: unused by this protocol
+        @param url: The url to fetch the oEmbed data from
+        @param options: Additional options that may be used while fetching data
+        @return: A dictionary containing the oEmbed data or None if no data could be
+            fetched
+        @raise PreviewFetchError: a request didn't return a 200 status code
+        """
+        resp = await treq.get(url)
+        if resp.code != 200:
+            raise PreviewFetchError(
+                f"Failed to fetch preview for {url}, status code: {resp.code}"
+            )
+
+        html = await resp.text()
+        tree = etree.fromstring(html, etree.HTMLParser())
+        if tree is None:
+            return None
+
+        # Find oEmbed URL
+        oembed_link = tree.find('.//link[@type="application/json+oembed"]')
+        href = oembed_link.get("href") if oembed_link is not None else None
+        if not href or not href.strip():
+            # no endpoint advertised, or a <link> without usable "href"
+            return None
+
+        # the endpoint may be advertised with a relative URL, it is resolved against
+        # the URL of the page (an absolute href is left untouched)
+        oembed_url = parse.urljoin(url, href.strip())
+        return await self._fetch_oembed_data(oembed_url)
+
+
+    async def fetch_generic_data(
+        self, client: SatXMPPEntity, url: str, options: dict
+    ) -> Optional[dict]:
+        """Fetch generic data from a url
+
+        This method attempts to extract the title, description, and author metadata from
+        the HTML of the page. If none of these data can be found, the method will return
+        None.
+
+        @param client: unused by this protocol
+        @param url: The url to fetch the generic data from
+        @param options: Additional options that may be used while fetching data
+        @return: A dictionary containing the generic data or None if no data could be
+            fetched
+        @raise PreviewFetchError: the page request didn't return a 200 status code
+        """
+        resp = await treq.get(url)
+        if resp.code != 200:
+            raise PreviewFetchError(
+                f"Failed to fetch generic preview for {url}, status code: {resp.code}"
+            )
+
+        html = await resp.text()
+        tree = etree.fromstring(html, etree.HTMLParser())
+        if tree is None:
+            return None
+
+        # Find title, description, and author metadata
+        title_el = tree.find(".//title")
+        desc_el = tree.find('.//meta[@name="description"]')
+        author_el = tree.find('.//meta[@name="author"]')
+
+        # ``text`` is None for an empty <title>, and ``get`` returns None when the
+        # "content" attribute is missing
+        title = (title_el.text or "").strip() if title_el is not None else ""
+        description = (desc_el.get("content") or "") if desc_el is not None else ""
+        author_name = (author_el.get("content") or "") if author_el is not None else ""
+
+        # "url" and "provider_*" are always set, they must not be taken into account
+        # to decide if something useful has been found
+        if not (title or description or author_name):
+            return None
+
+        parsed_url = parse.urlparse(url)
+        return {
+            "title": title,
+            "description": description,
+            "author_name": author_name,
+            "url": url,
+            "provider_name": parsed_url.netloc,
+            "provider_url": f"{parsed_url.scheme}://{parsed_url.netloc}",
+        }
+
+    # Wikipedia
+
+    async def fetch_wikipedia_data(
+        self, client: SatXMPPEntity, url: str, options: dict
+    ) -> Optional[dict]:
+        """Fetch Wikipedia data from a url
+
+        This method implements the Wikipedia API, details of which can be found at:
+        https://www.mediawiki.org/wiki/API:Main_page
+
+        @param client: unused by this protocol
+        @param url: The url to fetch the Wikipedia data from
+        @param options: Additional options that may be used while fetching data
+        @return: A dictionary containing the Wikipedia data or None if no data could be
+            fetched
+        @raise PreviewFetchError: the API request didn't return a 200 status code
+        """
+        parsed_url = parse.urlparse(url)
+        path = parsed_url.path
+        if path.startswith("/wiki/"):
+            # everything after "/wiki/" is the title, it may itself contain "/"
+            page_name = parse.unquote(path[len("/wiki/"):])
+        else:
+            # e.g. "/w/index.php?title=Page_name" (``parse_qs`` already unquotes)
+            page_name = parse.parse_qs(parsed_url.query).get("title", [""])[0]
+            if not page_name:
+                page_name = parse.unquote(path.split("/")[-1])
+
+        if not page_name:
+            log.warning(f"Can't extract page name from {url}")
+            return None
+
+        # Use the Wikipedia API to get a summary of the page and a preview image.
+        # The title is quoted so that characters such as "&", "+" or "#" can't alter
+        # the query.
+        api_url = (
+            f"https://{parsed_url.netloc}/w/api.php?format=json&action=query&"
+            f"prop=extracts|pageimages&exintro&explaintext&redirects=1&piprop=thumbnail"
+            f"&pithumbsize=300&titles={parse.quote(page_name, safe='')}"
+        )
+
+        resp = await treq.get(api_url)
+        if resp.code != 200:
+            raise PreviewFetchError(
+                f"Failed to fetch Wikipedia preview for {url}, status code: {resp.code}"
+            )
+
+        data = json.loads(await resp.text())
+        pages = (data.get("query") or {}).get("pages")
+        if not pages:
+            # the API answered without any page (e.g. an error payload)
+            return None
+
+        # a single title is requested: the page is the first value of "pages"
+        page = next(iter(pages.values()))
+
+        # The API flags pages which don't exist as "missing", and titles which can't
+        # be used as "invalid"
+        if "missing" in page or "invalid" in page:
+            return None
+
+        thumbnail = page.get("thumbnail") or {}
+        return {
+            "provider_name": "Wikipedia",
+            "provider_url": "https://www.wikipedia.org",
+            "title": page.get("title"),
+            "description": page.get("extract"),
+            "url": url,
+            "image": thumbnail.get("source"),
+        }
+
+    # Invidious
+
+    async def fetch_invidious_data(self, client: SatXMPPEntity, url: str, options: dict) -> Optional[dict]:
+        """
+        Fetch Invidious data from a url and generate HTML iframe.
+
+        The video ID is taken from the "v" query argument for "watch" URLs, and from the
+        last segment of the path otherwise (e.g. "/<video_id>" or "/embed/<video_id>").
+
+        @param client: unused by this protocol
+        @param url: The url to fetch the Invidious data from.
+        @param options: Additional options that may be used while fetching data.
+        @return: A dictionary containing the Invidious data or None if no data could be fetched.
+        """
+        # local import, as the module name would otherwise clash with the ``html``
+        # variable name used in other methods
+        from html import escape
+
+        parsed_url = parse.urlparse(url)
+        if 'watch' in parsed_url.path:
+            video_id = parse.parse_qs(parsed_url.query).get('v', [None])[0]
+        else:
+            video_id = parsed_url.path.strip('/').rsplit('/', 1)[-1]
+        if not video_id:
+            log.warning(f"Can't extract video ID from {url}")
+            return None
+
+        # The ID and the host come from the URL, which is untrusted: the ID is quoted
+        # before being used in URLs, and the generated URL is escaped before being
+        # put in the HTML, so that no markup can be injected in the iframe code.
+        quoted_id = parse.quote(video_id, safe='')
+        base_url = f"https://{parsed_url.netloc}"
+        invidious_api_url = f"{base_url}/api/v1/videos/{quoted_id}"
+
+        resp = await treq.get(invidious_api_url)
+        if resp.code != 200:
+            log.warning(f"Unable to fetch video data from Invidious API for {video_id}")
+            return None
+
+        video_data = await resp.json()
+        embed_url = escape(f"{base_url}/embed/{quoted_id}", quote=True)
+        # construct the iframe html code
+        iframe_html = (
+            '<iframe'
+            '    width="100%"'
+            '    height="auto"'
+            f'    src="{embed_url}"'
+            '    frameborder="0" '
+            '    allow="'
+            '        accelerometer;'
+            '        autoplay;'
+            '        clipboard-write;'
+            '        encrypted-media;'
+            '        gyroscope;'
+            '        picture-in-picture"'
+            '    style="'
+            '        position: absolute;'
+            '        top: 0;'
+            '        left: 0;'
+            '        width: 100%;'
+            '        height: 100%;"'
+            '    allowfullscreen></iframe>'
+        )
+
+        # "videoThumbnails" may be missing or empty
+        thumbnails = video_data.get("videoThumbnails") or [{}]
+        author_id = video_data.get("authorId")
+
+        # structure the data to be returned
+        return {
+            "title": video_data.get("title"),
+            "description": video_data.get("description"),
+            "url": url,
+            "image": thumbnails[0].get("url"),
+            "provider_name": "Invidious",
+            "provider_url": base_url,
+            "html": iframe_html,
+            "author_name": video_data.get("author"),
+            # no channel URL can be built without the author ID
+            "author_url": f"{base_url}/channel/{author_id}" if author_id else None,
+        }