changeset 5571:ca3c2d11823c

mod_pubsub_feeds: Track latest timestamp seen in feeds instead of last poll. This should ensure that an entry that has a publish timestamp after the previously oldest post, but before the time of the last poll check, is published to the node. Previously, an entry would be skipped if it was published at 13:00 with a timestamp of 12:30, where the last poll was at 12:45. For feeds that lack a timestamp, it now looks for the first post that is not published, assuming that the feed is in reverse chronological order, then iterates back up from there.
author Kim Alvefur <zash@zash.se>
date Sun, 25 Jun 2023 16:27:55 +0200
parents f93b1fc1aa31
children fd1c535dcb92
files mod_pubsub_feeds/mod_pubsub_feeds.lua
diffstat 1 files changed, 39 insertions(+), 27 deletions(-) [+]
line wrap: on
line diff
--- a/mod_pubsub_feeds/mod_pubsub_feeds.lua	Sun Jun 25 16:24:12 2023 +0200
+++ b/mod_pubsub_feeds/mod_pubsub_feeds.lua	Sun Jun 25 16:27:55 2023 +0200
@@ -98,36 +98,50 @@
 		end
 		items = {};
 	end
-	for i = #entries, 1, -1 do -- Feeds are usually in reverse order
+
+	local start_from = #entries;
+	for i, entry in ipairs(entries) do
+		local id = entry:get_child_text("id");
+		if not id then
+			local link = entry:get_child("link");
+			if link then
+				module:log("debug", "Feed %q item %s is missing an id, using <link> instead", feed.url, entry:top_tag());
+				id = link and link.attr.href;
+			else
+				module:log("debug", "Feed %q item %s is missing an id, using a HMAC of the item instead", feed.url, entry:top_tag());
+				id = feed.url .. "#" .. hmac_sha1(feed.url, tostring(entry), true) .. "@" .. dt_datetime(timestamp);
+			end
+			entry:text_tag("id", id);
+		end
+
+		if items[id] then
+			-- This should be the first item that we already have.
+			start_from = i-1;
+			break
+		end
+	end
+
+	for i = start_from, 1, -1 do -- Feeds are usually in reverse order
 		local entry = entries[i];
 		entry.attr.xmlns = xmlns_atom;
 
-		local e_published = entry:get_child_text("published");
-		e_published = e_published and dt_parse(e_published);
-		local e_updated = entry:get_child_text("updated");
-		e_updated = e_updated and dt_parse(e_updated);
+		local id = entry:get_child_text("id");
 
-		local timestamp = e_updated or e_published or nil;
-		--module:log("debug", "timestamp is %s, item.last_update is %s", tostring(timestamp), tostring(item.last_update));
+		local timestamp = dt_parse(entry:get_child_text("published"));
+		if not timestamp then
+			timestamp = time();
+			entry:text_tag("published", dt_datetime(timestamp));
+		end
+
 		if not timestamp or not item.last_update or timestamp > item.last_update then
-			local id = entry:get_child_text("id");
-			if not id then
-				local link = entry:get_child("link");
-				id = link and link.attr.href;
-			end
-			if not id then
-				-- Sigh, no link?
-				id = feed.url .. "#" .. hmac_sha1(feed.url, tostring(entry), true) .. "@" .. dt_datetime(timestamp);
-			end
-			if not items[id] then
-				local xitem = st.stanza("item", { id = id, xmlns = "http://jabber.org/protocol/pubsub" }):add_child(entry);
-				-- TODO Put data from /feed into item/source
+			local xitem = st.stanza("item", { id = id, xmlns = "http://jabber.org/protocol/pubsub" }):add_child(entry);
+			-- TODO Put data from /feed into item/source
 
-				--module:log("debug", "publishing to %s, id %s", node, id);
-				local ok, err = pubsub.service:publish(node, true, id, xitem);
-				if not ok then
-					module:log("error", "Publishing to node %s failed: %s", node, err);
-				end
+			local ok, err = pubsub.service:publish(node, true, id, xitem);
+			if not ok then
+				module:log("error", "Publishing to node %s failed: %s", node, err);
+			elseif timestamp then
+				item.last_update = timestamp;
 			end
 		end
 	end
@@ -157,12 +171,11 @@
 		if code == 200 then
 			item.data = data;
 			if callback then callback(item) end
-			item.last_update = time();
 			if resp.headers then
 				item.etag = resp.headers.etag
 			end
 		elseif code == 304 then
-			item.last_update = time();
+			module:log("debug", "No updates to %q", item.url);
 		elseif code == 301 and resp.headers.location then
 			module:log("info", "Feed %q has moved to %q", item.url, resp.headers.location);
 		elseif code <= 100 then
@@ -271,7 +284,6 @@
 			end
 			feed.data = body;
 			update_entry(feed);
-			feed.last_update = time();
 			return 202;
 		end
 		return 400;