changeset 4815:9c2af2146ee2

mod_export_skeletons: Command to aid in analysis of archive contents
author Kim Alvefur <zash@zash.se>
date Thu, 09 Dec 2021 23:48:25 +0100
parents 5f12c75fd210
children e7d1d68f0279
files mod_export_skeletons/README.md mod_export_skeletons/mod_export_skeletons.lua
diffstat 2 files changed, 112 insertions(+), 0 deletions(-) [+]
line wrap: on
line diff
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mod_export_skeletons/README.md	Thu Dec 09 23:48:25 2021 +0100
@@ -0,0 +1,41 @@
+---
+summary: Export message archives in sanitized minimal form for analysis
+---
+
+Exports message archives in a format stripped of private information
+and message content.
+
+# Usage
+
+    prosodyctl mod_export_skeletons [options] user@host*
+
+Multiple user JIDs can be given.
+
+Some storage drivers such as [SQL][doc:modules:mod_storage_sql] allow
+exporting all users at once by giving the special username `*`, i.e.
+`prosodyctl mod_export_skeletons \*@example.com`.
+
+`--start=timestamp`
+:	Start of the time span to export, given in [XEP-0082] format
+
+`--end=timestamp`
+:	End of the time span to export, given in [XEP-0082] format
+
+# Output
+
+All content is stripped, leaving only the basic XML structure, with
+child tags sorted.
+
+Top level attributes are given special treatment since they carry
+protocol semantics. Notably the `@to` and `@from` JIDs are replaced by
+symbolic labels to convey what form (bare, full or host) they had. The
+`@id` attribute is replaced with a string of the same length.
+
+## Example
+
+```xml
+<message from='full' id='xxxxxxxxxxxxxxxx' type='chat' to='bare'><body/><x xmlns='jabber:x:oob'><url/></x></message>
+<message from='bare' id='xxxxxxxxxxxxxxxx' type='error' to='full'><error><remote-server-not-found xmlns='urn:ietf:params:xml:ns:xmpp-stanzas'/><text xmlns='urn:ietf:params:xml:ns:xmpp-stanzas'/></error></message>
+<message from='full' id='xxxxxxxxxxxxxxxx' type='chat' to='bare'><body/><x xmlns='jabber:x:oob'><url/></x></message>
+<message from='full' id='xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx' type='normal' to='bare'><x xmlns='jabber:x:conference'/></message>
+```
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/mod_export_skeletons/mod_export_skeletons.lua	Thu Dec 09 23:48:25 2021 +0100
@@ -0,0 +1,71 @@
+
+local t_insert = table.insert;
+local t_sort = table.sort;
+
+local sm = require "core.storagemanager";
+local um = require "core.usermanager";
+
+local argparse = require "util.argparse";
+local dt = require "util.datetime";
+local jid = require "util.jid";
+local st = require "util.stanza";
+
+--- Build a content-free copy of a stanza.
+-- Keeps only the element name and the xmlns attribute of `s` and, recursively,
+-- of every entry in `s.tags`; all other attributes are left out. Only `s.tags`
+-- is visited (presumably the element children, so text content is dropped).
+-- Children are emitted sorted by namespace and then by name, so the original
+-- element order does not show in the output.
+-- @param s stanza to reduce (reads `s.name`, `s.attr.xmlns` and `s.tags`)
+-- @return a new stanza created with st.stanza
+local function skeleton(s)
+	local o = st.stanza(s.name, { xmlns = s.attr.xmlns });
+
+	local children = {};
+	for _, child in ipairs(s.tags) do t_insert(children, skeleton(child)) end
+	t_sort(children, function(a, b)
+		-- Treat a missing xmlns as "" for both the equality test and the
+		-- ordering, so a nil xmlns and an empty one land in the same group
+		-- and the comparator stays a consistent ordering for table.sort.
+		local a_ns, b_ns = a.attr.xmlns or "", b.attr.xmlns or "";
+		if a_ns == b_ns then return a.name < b.name; end
+		return a_ns < b_ns;
+	end);
+	for _, child in ipairs(children) do o:add_direct_child(child); end
+	return o;
+end
+
+--- Replace a JID with a label describing its form.
+-- @param s JID string, or nil/false when the attribute was absent
+-- @return "" when `s` is absent; otherwise "full" if jid.split yields a
+-- resource, "bare" if it yields a node, "host" if it yields only a host,
+-- and "invalid" if it yields none of them
+local function classify_jid(s)
+	if not s then return ""; end
+
+	local node, host, resource = jid.split(s);
+	-- Checked from most to least specific part.
+	if resource then return "full"; end
+	if node then return "bare"; end
+	if host then return "host"; end
+	return "invalid";
+end
+
+--- prosodyctl entry point: print a skeleton of every archived stanza.
+-- Options are consumed from `arg` by argparse.parse: --store (archive store
+-- name, default "archive") and the query filters --with, --start and --end.
+-- Each remaining argument is a JID whose archive is exported; the username
+-- `*` is passed to the store as `false` instead of a user name.
+-- One sanitized stanza is printed per archive item; progress and error
+-- messages go to stderr.
+-- @param arg array of command line arguments
+-- @return 1 when the arguments are rejected, nothing otherwise
+function module.command(arg)
+	local opts, err, where = argparse.parse(arg, { value_params = { store = true; with = true; start = true; ["end"] = true } });
+	if not opts then
+		-- NOTE(review): assumes argparse.parse reports failure as nil, error code, offending parameter -- confirm
+		io.stderr:write(string.format("Error parsing arguments: %s (%s)\n", tostring(err), tostring(where)));
+		return 1;
+	end
+	local store = opts.store or "archive"; -- so you can pass 'archive2'
+	opts.store = nil;
+	local query = { with = jid.prep(opts.with); start = dt.parse(opts.start); ["end"] = dt.parse(opts["end"]) };
+	-- A filter that was given but did not parse would otherwise vanish from
+	-- the query and silently widen the export.
+	for _, name in ipairs({ "with", "start", "end" }) do
+		if opts[name] ~= nil and query[name] == nil then
+			io.stderr:write(string.format("Invalid value for --%s: %s\n", name, tostring(opts[name])));
+			return 1;
+		end
+	end
+	local host_initialized = {};
+	for _, export_jid in ipairs(arg) do
+
+		local username, host = jid.split(export_jid);
+		if not host then
+			-- Without a host there is no store to open, and nil cannot be
+			-- used as a key when recording the host in host_initialized.
+			io.stderr:write(string.format("Invalid JID: %s\n", tostring(export_jid)));
+			return 1;
+		end
+		-- Set up storage and user management once per host.
+		if not host_initialized[host] then
+			sm.initialize_host(host);
+			um.initialize_host(host);
+			host_initialized[host] = true;
+		end
+
+		local archive = module:context(host):open_store(store, "archive");
+		local iter, total = assert(archive:find(username ~= "*" and username, query))
+		if total then io.stderr:write(string.format("Processing %d entries\n", total)); end
+		for _, item in iter do
+			local clean = skeleton(item);
+
+			-- Normalize top level attributes
+			clean.attr.type = item.attr.type;
+			if clean.attr.type == nil and clean.name == "message" then clean.attr.type = "normal"; end
+			clean.attr.id = string.rep("x", #(item.attr.id or "")); -- worth rounding to nearest power of two or so?
+			clean.attr.from = classify_jid(item.attr.from);
+			clean.attr.to = classify_jid(item.attr.to);
+			print(clean);
+		end
+
+	end
+end