comparison mod_pubsub_feeds/mod_pubsub_feeds.lua @ 5571:ca3c2d11823c

mod_pubsub_feeds: Track latest timestamp seen in feeds instead of last poll. This should ensure that an entry that has a publish timestamp after the previously oldest post, but before the time of the last poll check, is published to the node. Previously, an entry would be skipped if it was published at 13:00 with a timestamp of 12:30, where the last poll was at 12:45. For feeds that lack a timestamp, it now looks for the first post that is not published, assuming that the feed is in reverse chronological order, then iterates back up from there.
author Kim Alvefur <zash@zash.se>
date Sun, 25 Jun 2023 16:27:55 +0200
parents f93b1fc1aa31
children fd1c535dcb92
comparison
equal deleted inserted replaced
5570:f93b1fc1aa31 5571:ca3c2d11823c
96 module:log("error", "Could not create node %s: %s", node, err); 96 module:log("error", "Could not create node %s: %s", node, err);
97 return; 97 return;
98 end 98 end
99 items = {}; 99 items = {};
100 end 100 end
101 for i = #entries, 1, -1 do -- Feeds are usually in reverse order 101
102 local start_from = #entries;
103 for i, entry in ipairs(entries) do
104 local id = entry:get_child_text("id");
105 if not id then
106 local link = entry:get_child("link");
107 if link then
108 module:log("debug", "Feed %q item %s is missing an id, using <link> instead", feed.url, entry:top_tag());
109 id = link and link.attr.href;
110 else
111 module:log("debug", "Feed %q item %s is missing an id, using a HMAC of the item instead", feed.url, entry:top_tag());
112 id = feed.url .. "#" .. hmac_sha1(feed.url, tostring(entry), true) .. "@" .. dt_datetime(timestamp);
113 end
114 entry:text_tag("id", id);
115 end
116
117 if items[id] then
118 -- This should be the first item that we already have.
119 start_from = i-1;
120 break
121 end
122 end
123
124 for i = start_from, 1, -1 do -- Feeds are usually in reverse order
102 local entry = entries[i]; 125 local entry = entries[i];
103 entry.attr.xmlns = xmlns_atom; 126 entry.attr.xmlns = xmlns_atom;
104 127
105 local e_published = entry:get_child_text("published"); 128 local id = entry:get_child_text("id");
106 e_published = e_published and dt_parse(e_published); 129
107 local e_updated = entry:get_child_text("updated"); 130 local timestamp = dt_parse(entry:get_child_text("published"));
108 e_updated = e_updated and dt_parse(e_updated); 131 if not timestamp then
109 132 timestamp = time();
110 local timestamp = e_updated or e_published or nil; 133 entry:text_tag("published", dt_datetime(timestamp));
111 --module:log("debug", "timestamp is %s, item.last_update is %s", tostring(timestamp), tostring(item.last_update)); 134 end
135
112 if not timestamp or not item.last_update or timestamp > item.last_update then 136 if not timestamp or not item.last_update or timestamp > item.last_update then
113 local id = entry:get_child_text("id"); 137 local xitem = st.stanza("item", { id = id, xmlns = "http://jabber.org/protocol/pubsub" }):add_child(entry);
114 if not id then 138 -- TODO Put data from /feed into item/source
115 local link = entry:get_child("link"); 139
116 id = link and link.attr.href; 140 local ok, err = pubsub.service:publish(node, true, id, xitem);
117 end 141 if not ok then
118 if not id then 142 module:log("error", "Publishing to node %s failed: %s", node, err);
119 -- Sigh, no link? 143 elseif timestamp then
120 id = feed.url .. "#" .. hmac_sha1(feed.url, tostring(entry), true) .. "@" .. dt_datetime(timestamp); 144 item.last_update = timestamp;
121 end
122 if not items[id] then
123 local xitem = st.stanza("item", { id = id, xmlns = "http://jabber.org/protocol/pubsub" }):add_child(entry);
124 -- TODO Put data from /feed into item/source
125
126 --module:log("debug", "publishing to %s, id %s", node, id);
127 local ok, err = pubsub.service:publish(node, true, id, xitem);
128 if not ok then
129 module:log("error", "Publishing to node %s failed: %s", node, err);
130 end
131 end 145 end
132 end 146 end
133 end 147 end
134 148
135 if item.lease_expires and item.lease_expires > time() then 149 if item.lease_expires and item.lease_expires > time() then
155 end 169 end
156 http.request(item.url, { headers = headers }, function(data, code, resp) 170 http.request(item.url, { headers = headers }, function(data, code, resp)
157 if code == 200 then 171 if code == 200 then
158 item.data = data; 172 item.data = data;
159 if callback then callback(item) end 173 if callback then callback(item) end
160 item.last_update = time();
161 if resp.headers then 174 if resp.headers then
162 item.etag = resp.headers.etag 175 item.etag = resp.headers.etag
163 end 176 end
164 elseif code == 304 then 177 elseif code == 304 then
165 item.last_update = time(); 178 module:log("debug", "No updates to %q", item.url);
166 elseif code == 301 and resp.headers.location then 179 elseif code == 301 and resp.headers.location then
167 module:log("info", "Feed %q has moved to %q", item.url, resp.headers.location); 180 module:log("info", "Feed %q has moved to %q", item.url, resp.headers.location);
168 elseif code <= 100 then 181 elseif code <= 100 then
169 module:log("error", "Error fetching %q: %q[%d]", item.url, data, code); 182 module:log("error", "Error fetching %q: %q[%d]", item.url, data, code);
170 else 183 else
269 end 282 end
270 module:log("debug", "Valid signature"); 283 module:log("debug", "Valid signature");
271 end 284 end
272 feed.data = body; 285 feed.data = body;
273 update_entry(feed); 286 update_entry(feed);
274 feed.last_update = time();
275 return 202; 287 return 202;
276 end 288 end
277 return 400; 289 return 400;
278 end 290 end
279 return 501; 291 return 501;