changeset 5859:259ffdbf8906

mod_anti_spam: New module for spam filtering (pre-alpha)
author Matthew Wild <mwild1@gmail.com>
date Tue, 05 Mar 2024 18:26:29 +0000
parents 761142ee0ff2
children fdff8cb54302
files mod_anti_spam/mod_anti_spam.lua mod_anti_spam/rtbl.lib.lua mod_anti_spam/trie.lib.lua
diffstat 3 files changed, 455 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mod_anti_spam/mod_anti_spam.lua	Tue Mar 05 18:26:29 2024 +0000
@@ -0,0 +1,165 @@
+local ip = require "util.ip";
+local jid_bare = require "util.jid".bare;
+local jid_split = require "util.jid".split;
+local set = require "util.set";
+local sha256 = require "util.hashes".sha256;
+local st = require"util.stanza";
+local is_contact_subscribed = require "core.rostermanager".is_contact_subscribed;
+local full_sessions = prosody.full_sessions;
+
+local user_exists = require "core.usermanager".user_exists;
+
+local new_rtbl_subscription = module:require("rtbl").new_rtbl_subscription;
+local trie = module:require("trie");
+
+-- Domains reported as spam sources; maintained by the "spam_source_domains" RTBL handlers below
+local spam_source_domains = set.new();
+-- IP subnets reported as spam sources; maintained by the "spam_source_ips" RTBL handlers below
+local spam_source_ips = trie.new();
+-- Sender JID hashes from the "spam_source_jids_sha256" node; queried with sha256(jid, true)
+local spam_source_jids = set.new();
+
+-- Counter incremented once per blocked stanza, labelled with the reason given to block_spam()
+local count_spam_blocked = module:metric("counter", "anti_spam_blocked", "stanzas", "Stanzas blocked as spam", {"reason"});
+
+-- Act on a stanza that has been classified as spam.
+--   event:  the stanza event; needs .stanza and .origin. The fields
+--           spam_reason and spam_action are set on it before it is re-fired.
+--   reason: short identifier of why it was blocked (also used as metric label)
+--   action: "bounce" replies to the sender with an error, any other value
+--           discards the stanza silently
+-- The "spam-blocked" event is fired first; a handler returning false vetoes
+-- the block. Returns true when the stanza was blocked, nothing otherwise.
+function block_spam(event, reason, action)
+	event.spam_reason = reason;
+	event.spam_action = action;
+	if module:fire_event("spam-blocked", event) == false then
+		module:log("debug", "Spam allowed by another module");
+		return;
+	end
+
+	count_spam_blocked:with_labels(reason):add(1);
+
+	if action == "bounce" then
+		module:log("debug", "Bouncing likely spam %s from %s (%s)", event.stanza.name, event.stanza.attr.from, reason);
+		-- error_reply() builds the reply from the original stanza, which must be its first argument
+		event.origin.send(st.error_reply(event.stanza, "cancel", "policy-violation", "Rejected as spam"));
+	else
+		module:log("debug", "Discarding likely spam %s from %s (%s)", event.stanza.name, event.stanza.attr.from, reason);
+	end
+
+	return true;
+end
+
+-- Decide whether the stanza in `event` was sent by someone unknown to its
+-- recipient. `from_jid` is the sender's bare JID.
+-- Returns true for strangers, false for the explicit pass-through cases and
+-- nothing when is_contact_subscribed() reports a subscription.
+function is_from_stranger(from_jid, event)
+	local stanza = event.stanza;
+	local recipient = stanza.attr.to;
+	local to_user, to_host, to_resource = jid_split(recipient);
+
+	-- No user part, or addressed to a full JID that currently has a session
+	if not to_user or full_sessions[recipient] then
+		return false;
+	end
+
+	if is_contact_subscribed(to_user, to_host, from_jid) then
+		return;
+	end
+
+	-- Allow all messages from your own jid
+	local own_jid = to_user.."@"..to_host;
+	if from_jid == own_jid then
+		return false; -- Pass through
+	end
+
+	local is_groupchat = stanza.attr.type == "groupchat";
+	if to_resource and is_groupchat then
+		return false; -- Pass through
+	end
+
+	return true; -- Stranger danger
+end
+
+-- Check whether a session belongs to a known spam source.
+-- Returns true if session.from_host is in spam_source_domains or session.ip
+-- falls inside a subnet stored in spam_source_ips; returns nothing otherwise.
+function is_spammy_server(session)
+	if spam_source_domains:contains(session.from_host) then
+		return true;
+	end
+	-- session.ip may be absent (presumably for sessions that are not backed by
+	-- a network connection), and new_ip() returns nil for unparseable input
+	local origin_ip = session.ip and ip.new_ip(session.ip);
+	-- The trie library in this module exposes the subnet lookup as has_ip()
+	if origin_ip and spam_source_ips:has_ip(origin_ip) then
+		return true;
+	end
+end
+
+-- Look up sha256(sender_jid, true) in the set of reported sender hashes
+function is_spammy_sender(sender_jid)
+	local jid_hash = sha256(sender_jid, true);
+	return spam_source_jids:contains(jid_hash);
+end
+
+-- Optional configuration: literal substrings and Lua patterns that mark a
+-- message body as spam. Either may be unset.
+local spammy_strings = module:get_option_array("anti_spam_block_strings");
+local spammy_patterns = module:get_option_array("anti_spam_block_patterns");
+
+-- Check a stanza's body text against the configured strings and patterns.
+-- Returns true on the first match; returns nothing for non-message stanzas,
+-- when nothing is configured, when there is no body, or when nothing matches.
+function is_spammy_content(stanza)
+	-- Only support message content
+	if stanza.name ~= "message" then return; end
+	if not (spammy_strings or spammy_patterns) then return; end
+
+	local body = stanza:get_child_text("body");
+	-- Messages without a <body/> have no text to inspect
+	if not body then return; end
+
+	if spammy_strings then
+		for _, s in ipairs(spammy_strings) do
+			-- Plain (non-pattern) substring search
+			if body:find(s, 1, true) then
+				return true;
+			end
+		end
+	end
+	if spammy_patterns then
+		for _, s in ipairs(spammy_patterns) do
+			if body:find(s) then
+				return true;
+			end
+		end
+	end
+end
+
+-- Set up RTBLs
+
+-- List of RTBL service JIDs to subscribe to; nil when the option is not set
+local anti_spam_services = module:get_option_array("anti_spam_services");
+
+-- For every configured service, follow the three nodes that feed the lookup
+-- structures above. Without any configured service nothing is subscribed.
+for _, rtbl_service_jid in ipairs(anti_spam_services or {}) do
+	new_rtbl_subscription(rtbl_service_jid, "spam_source_domains", {
+		added = function (item)
+			spam_source_domains:add(item);
+		end;
+		removed = function (item)
+			spam_source_domains:remove(item);
+		end;
+	});
+	new_rtbl_subscription(rtbl_service_jid, "spam_source_ips", {
+		added = function (item)
+			-- parse_cidr() results are forwarded as (address, prefix length)
+			spam_source_ips:add_subnet(ip.parse_cidr(item));
+		end;
+		removed = function (item)
+			spam_source_ips:remove_subnet(ip.parse_cidr(item));
+		end;
+	});
+	new_rtbl_subscription(rtbl_service_jid, "spam_source_jids_sha256", {
+		added = function (item)
+			spam_source_jids:add(item);
+		end;
+		removed = function (item)
+			spam_source_jids:remove(item);
+		end;
+	});
+end
+
+-- Filter messages addressed to the bare JID of an existing user. Only
+-- messages from strangers are examined; they are dropped when the sending
+-- server, the sender JID or the message content is recognised as spam.
+module:hook("message/bare", function (event)
+	-- user_exists() takes the username and the host as separate arguments
+	local to_user, to_host = jid_split(event.stanza.attr.to);
+
+	if not (to_user and user_exists(to_user, to_host)) then return; end
+
+	local from_bare = jid_bare(event.stanza.attr.from);
+	if not is_from_stranger(from_bare, event) then return; end
+
+	if is_spammy_server(event.origin) then
+		return block_spam(event, "known-spam-source", "drop");
+	end
+
+	if is_spammy_sender(from_bare) then
+		return block_spam(event, "known-spam-jid", "drop");
+	end
+
+	if is_spammy_content(event.stanza) then
+		return block_spam(event, "spam-content", "drop");
+	end
+end, 500);
+
+-- Filter subscription requests: drop them when the sending server or the
+-- sender's bare JID is a known spam source. Other presence types pass.
+module:hook("presence/bare", function (event)
+	-- The stanza type is an attribute, not a field of the stanza object
+	if event.stanza.attr.type ~= "subscribe" then
+		return;
+	end
+
+	if is_spammy_server(event.origin) then
+		return block_spam(event, "known-spam-source", "drop");
+	end
+
+	-- is_spammy_sender() hashes a JID string, so pass the sender's bare JID
+	if is_spammy_sender(jid_bare(event.stanza.attr.from)) then
+		return block_spam(event, "known-spam-jid", "drop");
+	end
+end, 500);
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mod_anti_spam/rtbl.lib.lua	Tue Mar 05 18:26:29 2024 +0000
@@ -0,0 +1,122 @@
+local array = require "util.array";
+local id = require "util.id";
+local it = require "util.iterators";
+local set = require "util.set";
+local st = require "util.stanza";
+
+module:depends("pubsub_subscription");
+
+-- Follow one node of a real-time block list (RTBL) pubsub service and keep a
+-- local record of the item ids it contains.
+--   rtbl_service_jid: JID of the pubsub service
+--   rtbl_node:        name of the node to follow
+--   handlers:         table with optional `added(id)` and `removed(id)`
+--                     callbacks, invoked whenever the local record changes
+-- Registers a "pubsub-subscription" item for live updates, hooks the result
+-- of a full-list request and schedules that request with a zero-delay timer.
+local function new_rtbl_subscription(rtbl_service_jid, rtbl_node, handlers)
+	-- Ids currently known for this node (id -> true)
+	local items = {};
+
+	-- Invoke the caller's handler for event_type ("added"/"removed"), if any
+	local function notify(event_type, hash)
+		local handler = handlers[event_type];
+		if not handler then return; end
+		handler(hash);
+	end
+
+	module:add_item("pubsub-subscription", {
+		service = rtbl_service_jid;
+		node = rtbl_node;
+
+		-- Callbacks:
+		on_subscribed = function()
+			module:log("info", "RTBL active: %s:%s", rtbl_service_jid, rtbl_node);
+		end;
+
+		on_error = function(err)
+			module:log(
+				"error",
+				"Failed to subscribe to RTBL: %s:%s %s::%s:  %s",
+				rtbl_service_jid,
+				rtbl_node,
+				err.type,
+				err.condition,
+				err.text
+			);
+		end;
+
+		-- A published item: its id attribute is the list entry
+		on_item = function(event)
+			local hash = event.item.attr.id;
+			if not hash then return; end
+			module:log("debug", "Received new hash from %s:%s: %s", rtbl_service_jid, rtbl_node, hash);
+			items[hash] = true;
+			notify("added", hash);
+		end;
+
+		-- A retracted item: forget the entry
+		on_retract = function (event)
+			local hash = event.item.attr.id;
+			if not hash then return; end
+			module:log("debug", "Retracted hash from %s:%s: %s", rtbl_service_jid, rtbl_node, hash);
+			items[hash] = nil;
+			notify("removed", hash);
+		end;
+
+		-- NOTE(review): unlike the other callbacks this key has no "on_" prefix;
+		-- confirm that mod_pubsub_subscription actually invokes it.
+		purge = function()
+			module:log("debug", "Purge all hashes from %s:%s", rtbl_service_jid, rtbl_node);
+			-- Clearing existing keys while traversing with pairs() is permitted
+			for hash in pairs(items) do
+				items[hash] = nil;
+				notify("removed", hash);
+			end
+		end;
+	});
+
+	-- The id ties the hooked iq-result below to the request sent by request_list()
+	local request_id = "rtbl-request-"..id.short();
+
+	-- Ask the service for the complete item list of the node
+	local function request_list()
+		local items_request = st.iq({ to = rtbl_service_jid, from = module.host, type = "get", id = request_id })
+			:tag("pubsub", { xmlns = "http://jabber.org/protocol/pubsub" })
+				:tag("items", { node = rtbl_node }):up()
+			:up();
+		module:send(items_request);
+	end
+
+	-- Reconcile the local record with a full item list received from the service
+	local function update_list(event)
+		local from_jid = event.stanza.attr.from;
+		if from_jid ~= rtbl_service_jid then
+			module:log("debug", "Ignoring RTBL response from unknown sender: %s", from_jid);
+			return;
+		end
+		local items_el = event.stanza:find("{http://jabber.org/protocol/pubsub}pubsub/items");
+		if not items_el then
+			module:log("warn", "Invalid items response from RTBL service %s:%s", rtbl_service_jid, rtbl_node);
+			return;
+		end
+
+		-- Start from everything we knew; whatever is left at the end has disappeared
+		local old_entries = set.new(array.collect(it.keys(items)));
+
+		local n_added, n_removed, n_total = 0, 0, 0;
+		for item in items_el:childtags("item") do
+			local hash = item.attr.id;
+			if hash then
+				n_total = n_total + 1;
+				if not old_entries:contains(hash) then
+					-- New entry
+					n_added = n_added + 1;
+					items[hash] = true;
+					notify("added", hash);
+				else
+					-- Entry already existed
+					old_entries:remove(hash);
+				end
+			end
+		end
+
+		-- Remove old entries that weren't in the received list
+		for hash in old_entries do
+			n_removed = n_removed + 1;
+			items[hash] = nil;
+			notify("removed", hash);
+		end
+
+		module:log("info", "%d RTBL entries received from %s:%s (%d added, %d removed)", n_total, from_jid, rtbl_node, n_added, n_removed);
+		return true;
+	end
+
+	module:hook("iq-result/host/"..request_id, update_list);
+	-- Send the initial list request from a timer rather than inline
+	module:add_timer(0, request_list);
+end
+
+-- Library exports
+return {
+	new_rtbl_subscription = new_rtbl_subscription;
+}
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mod_anti_spam/trie.lib.lua	Tue Mar 05 18:26:29 2024 +0000
@@ -0,0 +1,168 @@
+local bit = require "prosody.util.bitcompat";
+
+-- Methods shared by all tries, reached through the metatable's __index
+local trie_methods = {};
+local trie_mt = { __index = trie_methods };
+
+-- Create an empty trie node. Children are stored under numeric byte keys;
+-- nodes holding a stored key additionally carry `terminal` and `value`.
+local function new_node()
+	local node = {};
+	return node;
+end
+
+-- Store `value` under the string key `item`, creating one node per byte of
+-- the key where none exists yet. An existing value is overwritten.
+function trie_methods:set(item, value)
+	local current = self.root;
+	for pos = 1, #item do
+		local byte = item:byte(pos);
+		local child = current[byte];
+		if not child then
+			child = new_node();
+			current[byte] = child;
+		end
+		current = child;
+	end
+	current.value = value;
+	current.terminal = true;
+end
+
+-- Recursively delete the key `item` from the subtree rooted at `node`.
+-- `i` is the 1-based index of the next byte of `item` to follow.
+-- Returns `node` when it still holds anything, or nil when it has become
+-- empty so that the caller drops its reference to it.
+local function _remove(node, item, i)
+	if i > #item then
+		-- End of the key: this is the node that stores it (if stored at all)
+		if node.terminal then
+			node.terminal = nil;
+			node.value = nil;
+		end
+		-- Keep the node only if it still has children
+		if next(node) ~= nil then
+			return node;
+		end
+		return nil;
+	end
+	local c = item:byte(i);
+	local child = node[c];
+	local ret;
+	if child then
+		-- Replace the child by whatever survives of it (possibly nil)
+		ret = _remove(child, item, i+1);
+		node[c] = ret;
+	end
+	-- Prune this node too if nothing is left in it
+	if ret == nil and next(node) == nil then
+		return nil;
+	end
+	return node;
+end
+
+-- Delete the key `item`, pruning nodes that become empty.
+-- Returns what _remove() returns for the root: the root, or nil if it is now empty.
+function trie_methods:remove(item)
+	local root = self.root;
+	return _remove(root, item, 1);
+end
+
+-- Look up the string key `item`.
+--   partial: when true, the value of the longest stored prefix of `item` is
+--            returned if `item` itself is not stored.
+-- Returns the value (or nil) and the number of bytes of `item` that could be
+-- followed in the trie.
+function trie_methods:get(item, partial)
+	local value;
+	local node = self.root;
+	local len = #item;
+	for i = 1, len do
+		-- Remember the most recent stored prefix passed on the way down
+		if partial and node.terminal then
+			value = node.value;
+		end
+		local c = item:byte(i);
+		node = node[c];
+		if not node then
+			return value, i - 1;
+		end
+	end
+	if partial and not node.terminal then
+		-- The whole key exists only as part of a longer key: fall back to
+		-- the prefix value found above instead of discarding it
+		return value, len;
+	end
+	return node.value, len;
+end
+
+-- Store `item` as a plain member (its value is `true`)
+function trie_methods:add(item)
+	local marker = true;
+	return self:set(item, marker);
+end
+
+-- True if get(item, partial) yields a non-nil value
+function trie_methods:contains(item, partial)
+	local found = self:get(item, partial);
+	return found ~= nil;
+end
+
+-- Number of leading bytes of `item` that can be followed in the trie
+-- (the second return value of get())
+function trie_methods:longest_prefix(item)
+	local _, matched = self:get(item);
+	return matched;
+end
+
+-- Record a subnet.
+--   item: address object; its `packed` field is the address as a byte string
+--   bits: prefix length. When nil the full address length is used
+--         (presumably the case for a bare address without "/len"; verify
+--         against the caller).
+-- The trie key is the address truncated to the bytes touched by the prefix;
+-- the value stored there is an ascending list of prefix lengths.
+function trie_methods:add_subnet(item, bits)
+	local packed = item.packed;
+	bits = bits or #packed * 8;
+	local key = packed:sub(1, math.ceil(bits/8));
+	local existing = self:get(key);
+	if not existing then
+		return self:set(key, { bits });
+	end
+
+	-- Find the sorted position; default to appending after the last entry
+	local pos = #existing + 1;
+	for i = 1, #existing do
+		local v = existing[i];
+		if v == bits then
+			return; -- Already in there
+		elseif v > bits then
+			pos = i;
+			break;
+		end
+	end
+	-- table.insert takes (list, position, value)
+	table.insert(existing, pos, bits);
+end
+
+-- Forget a subnet previously recorded with add_subnet().
+--   item: address object; its `packed` field is the address as a byte string
+--   bits: prefix length. When nil the full address length is used, mirroring
+--         what a nil prefix length would mean for a bare address.
+-- The key is dropped from the trie once its list of prefix lengths is empty.
+function trie_methods:remove_subnet(item, bits)
+	local packed = item.packed;
+	bits = bits or #packed * 8;
+	local key = packed:sub(1, math.ceil(bits/8));
+	local existing = self:get(key);
+	if not existing then
+		return;
+	end
+
+	-- Linear search through the ascending list of prefix lengths
+	for i = 1, #existing do
+		local v = existing[i];
+		if v == bits then
+			table.remove(existing, i);
+			break;
+		elseif v > bits then
+			return; -- Stop search
+		end
+	end
+
+	if #existing == 0 then
+		self:remove(key);
+	end
+end
+
+-- Test whether an address lies inside any subnet recorded with add_subnet().
+--   item: address object; its `packed` field is the address as a byte string
+-- Returns true on a match, false otherwise.
+--
+-- A subnet is stored under the address bytes its prefix touches, so a subnet
+-- stored at depth d constrains bytes 1..d-1 completely and byte d only in its
+-- leading bits. At every depth the stored children are therefore compared
+-- against the current address byte under the mask implied by each prefix
+-- length, whether or not an exact child for that byte exists.
+function trie_methods:has_ip(item)
+	item = item.packed;
+	local node = self.root;
+	-- Only an empty key (prefix length 0) is stored on the root: matches everything
+	if node.terminal then
+		return true;
+	end
+	for i = 1, #item do
+		local c = item:byte(i);
+		local prefix_bits = (i-1)*8; -- bits already matched exactly by earlier bytes
+		for child_byte, child_node in pairs(node) do
+			if type(child_byte) == "number" and child_node.terminal then
+				local bits = child_node.value;
+				if type(bits) ~= "table" then
+					-- Stored without a prefix list (e.g. via add()): exact byte match only
+					if child_byte == c then
+						return true;
+					end
+				else
+					for j = 1, #bits do
+						local b = bits[j] - prefix_bits; -- significant bits within this byte
+						if b >= 8 then
+							if child_byte == c then
+								return true;
+							end
+						elseif b > 0 then
+							-- Keep the b most significant bits of the byte
+							local mask = 0xFF - (2^(8-b) - 1);
+							if bit.band(bit.bxor(c, child_byte), mask) == 0 then
+								return true;
+							end
+						end
+					end
+				end
+			end
+		end
+		node = node[c];
+		if not node then
+			return false;
+		end
+	end
+	return false;
+end
+
+-- Create an empty trie
+local function new()
+	local instance = { root = new_node() };
+	return setmetatable(instance, trie_mt);
+end
+
+-- True if `o` carries the trie metatable, i.e. was created by new()
+local function is_trie(o)
+	local mt = getmetatable(o);
+	return mt == trie_mt;
+end
+
+-- Library exports
+return {
+	new = new;
+	is_trie = is_trie;
+};