-- HG changeset patch
-- User Kim Alvefur
-- Date 1612397561 -3600
-- Node ID 3fe2c264aac47871da973b8d2298a2208944af67
-- Parent b3e0295e14a3d26c3387953bdcc8e16db6e21786
--
-- mod_pubsub_summary: Mangle HTML payloads in Atom/RSS feeds
--
-- This module is meant for use with mod_pubsub_feeds and tries to improve on
-- mod_pubsub's built-in Atom summary generator.
--
-- diff -r b3e0295e14a3 -r 3fe2c264aac4 mod_pubsub_summary/mod_pubsub_summary.lua
-- --- /dev/null  Thu Jan 01 00:00:00 1970 +0000
-- +++ b/mod_pubsub_summary/mod_pubsub_summary.lua  Thu Feb 04 01:12:41 2021 +0100

-- No, not trying to parse HTML here. It's an illusion. Just trying to read RSS feeds.
--
-- Compose a textual representation of Atom payloads

-- Named character references decoded once the markup has been removed.
-- References that are not listed here are left in the text untouched.
local html_entities = {
	apos = "'";
	quot = '"';
	lt = "<";
	gt = ">";
	amp = "&";
	-- The utf8 library is not available on every Lua version; fall back to
	-- an ordinary space when it is missing.
	nbsp = utf8 and utf8.char(0xa0) or " ";
};

-- Reduce an HTML fragment to readable plain text using pattern substitutions.
-- This is deliberately not an HTML parser; it only covers the markup commonly
-- seen in feed entries.
--
-- html: string holding the HTML fragment
-- returns: string with the markup replaced or removed
local function html_to_text(html)
	local text = html;
	-- Paragraphs become blocks separated by a blank line.
	text = text:gsub("\n*<p[^>]*>\n*(.-)\n*</p>\n*", "%1\n\n");
	-- List items become "* item" lines.
	text = text:gsub("<li>(.-)</li>\n", "* %1\n");
	-- Links become "label <url>".
	text = text:gsub("<a[^>]*href=[\"'](.-)[\"'][^>]*>(.-)</a>", "%2 <%1>");
	-- Bold and italic markup all collapse to *emphasis*.
	text = text:gsub("<b>(.-)</b>", "*%1*");
	text = text:gsub("<strong>(.-)</strong>", "*%1*");
	text = text:gsub("<em>(.-)</em>", "*%1*");
	text = text:gsub("<i>(.-)</i>", "*%1*");
	text = text:gsub("<img[^>]*src=[\"'](.-)[\"'][^>]*>", " %1 "); -- TODO alt= would have been nice to grab
	text = text:gsub("<br[^>]*>", "\n");
	-- Drop whatever tags are left.
	text = text:gsub("<[^>]+>", "");
	-- Trim surrounding whitespace and collapse runs of blank lines.
	text = text:gsub("^%s*", ""):gsub("%s*$", "");
	text = text:gsub("\n\n\n+", "\n\n");
	-- Entities are decoded last so that escaped markup such as "&lt;b&gt;"
	-- is not mistaken for a real tag by the substitutions above.
	text = text:gsub("&(%w+);", html_entities);
	return text;
end

-- Handler for Atom payloads: builds a summary out of the entry title, its
-- content (or summary) text and the href of its link element.
--
-- event.payload: the Atom entry element
-- returns: the summary string, or nil when the entry has no title, no content
--          text and no link href
module:hook("pubsub-summary/http://www.w3.org/2005/Atom", function (event)
	local payload = event.payload;
	local title = payload:get_child_text("title");
	local content_tag = payload:get_child("content") or payload:get_child("summary");
	local content;
	-- An entry may carry neither <content> nor <summary>; previously that
	-- raised an "attempt to index a nil value" error instead of falling
	-- back to the title and link.
	if content_tag then
		content = content_tag:get_text();
		-- NOTE(review): get_text() presumably returns nil when the element
		-- has child tags instead of plain text - guard before using gsub.
		if content and content_tag.attr.type == "html" then
			content = html_to_text(content);
		end
	end
	local link = payload:get_child("link");
	local summary;
	if title and content then
		summary = title .. "\n\n" .. content;
	elseif title or content then
		summary = content or title;
	end
	-- Skip the link when the content is nothing but that same URL.
	if link and link.attr.href and link.attr.href ~= content then
		summary = (summary and summary .. "\n" or "") .. link.attr.href;
	end
	return summary;
end, 1);