# HG changeset patch # User Kim Alvefur # Date 1687703275 -7200 # Node ID ca3c2d11823c5ba6159cf8c876cf6de93f12d08c # Parent f93b1fc1aa31cf066aa122a34f828a536c7988ce mod_pubsub_feeds: Track latest timestamp seen in feeds instead of last poll This should ensure that an entry that has a publish timestamp after the previously oldest post, but before the time of the last poll check, is published to the node. Previously an entry would be skipped if it was published at 13:00 with a timestamp of 12:30, where the last poll was at 12:45. For feeds that lack a timestamp, it now looks for the first post that is not published, assuming that the feed is in reverse chronological order, then iterates back up from there. diff -r f93b1fc1aa31 -r ca3c2d11823c mod_pubsub_feeds/mod_pubsub_feeds.lua --- a/mod_pubsub_feeds/mod_pubsub_feeds.lua Sun Jun 25 16:24:12 2023 +0200 +++ b/mod_pubsub_feeds/mod_pubsub_feeds.lua Sun Jun 25 16:27:55 2023 +0200 @@ -98,36 +98,50 @@ end items = {}; end - for i = #entries, 1, -1 do -- Feeds are usually in reverse order + + local start_from = #entries; + for i, entry in ipairs(entries) do + local id = entry:get_child_text("id"); + if not id then + local link = entry:get_child("link"); + if link then + module:log("debug", "Feed %q item %s is missing an id, using instead", feed.url, entry:top_tag()); + id = link and link.attr.href; + else + module:log("debug", "Feed %q item %s is missing an id, using a HMAC of the item instead", feed.url, entry:top_tag()); + id = feed.url .. "#" .. hmac_sha1(feed.url, tostring(entry), true) .. "@" .. dt_datetime(timestamp); + end + entry:text_tag("id", id); + end + + if items[id] then + -- This should be the first item that we already have. 
+ start_from = i-1; + break + end + end + + for i = start_from, 1, -1 do -- Feeds are usually in reverse order local entry = entries[i]; entry.attr.xmlns = xmlns_atom; - local e_published = entry:get_child_text("published"); - e_published = e_published and dt_parse(e_published); - local e_updated = entry:get_child_text("updated"); - e_updated = e_updated and dt_parse(e_updated); + local id = entry:get_child_text("id"); - local timestamp = e_updated or e_published or nil; - --module:log("debug", "timestamp is %s, item.last_update is %s", tostring(timestamp), tostring(item.last_update)); + local timestamp = dt_parse(entry:get_child_text("published")); + if not timestamp then + timestamp = time(); + entry:text_tag("published", dt_datetime(timestamp)); + end + if not timestamp or not item.last_update or timestamp > item.last_update then - local id = entry:get_child_text("id"); - if not id then - local link = entry:get_child("link"); - id = link and link.attr.href; - end - if not id then - -- Sigh, no link? - id = feed.url .. "#" .. hmac_sha1(feed.url, tostring(entry), true) .. "@" .. 
dt_datetime(timestamp); - end - if not items[id] then - local xitem = st.stanza("item", { id = id, xmlns = "http://jabber.org/protocol/pubsub" }):add_child(entry); - -- TODO Put data from /feed into item/source + local xitem = st.stanza("item", { id = id, xmlns = "http://jabber.org/protocol/pubsub" }):add_child(entry); + -- TODO Put data from /feed into item/source - --module:log("debug", "publishing to %s, id %s", node, id); - local ok, err = pubsub.service:publish(node, true, id, xitem); - if not ok then - module:log("error", "Publishing to node %s failed: %s", node, err); - end + local ok, err = pubsub.service:publish(node, true, id, xitem); + if not ok then + module:log("error", "Publishing to node %s failed: %s", node, err); + elseif timestamp then + item.last_update = timestamp; end end end @@ -157,12 +171,11 @@ if code == 200 then item.data = data; if callback then callback(item) end - item.last_update = time(); if resp.headers then item.etag = resp.headers.etag end elseif code == 304 then - item.last_update = time(); + module:log("debug", "No updates to %q", item.url); elseif code == 301 and resp.headers.location then module:log("info", "Feed %q has moved to %q", item.url, resp.headers.location); elseif code <= 100 then @@ -271,7 +284,6 @@ end feed.data = body; update_entry(feed); - feed.last_update = time(); return 202; end return 400;