Mercurial > libervia-backend

--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/tests/unit/components/test_cleaning.py	Thu Sep 11 21:17:51 2025 +0200
@@ -0,0 +1,287 @@
+#!/usr/bin/env python3
+
+# Libervia: an XMPP client
+# Copyright (C) 2009-2025 Jérôme Poisson (goffi@goffi.org)
+
+# This program is free software: you can redistribute it and/or modify
+# it under the terms of the GNU Affero General Public License as published by
+# the Free Software Foundation, either version 3 of the License, or
+# (at your option) any later version.
+
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU Affero General Public License for more details.
+
+# You should have received a copy of the GNU Affero General Public License
+# along with this program.  If not, see <http://www.gnu.org/licenses/>.
+
+from lxml import etree
+
+from libervia.backend.plugins.plugin_comp_email_gateway.cleaning import (
+    TextToHtmlConverter,
+    MailingListHtmlCleaner,
+    convert_to_html_and_detect_noise,
+)
+
+
+class TestTextToHtmlConverter:
+    """Checks for the plain-text to HTML conversion of TextToHtmlConverter.
+
+    Each test wraps the converter output in a <div> and parses it with lxml.
+    """
+
+    def test_simple_text_conversion(self):
+        """A single newline yields one paragraph containing a line break."""
+        source = "Hello world\nThis is a test"
+        html = TextToHtmlConverter().text_to_html(source)
+
+        tree = etree.fromstring(f"<div>{html}</div>")
+
+        # The two lines must share one <p> rather than being split.
+        found = tree.xpath("//p")
+        assert len(found) == 1
+
+        # Both lines and the <br> marker must appear in the serialized <p>.
+        serialized = etree.tostring(found[0], encoding="unicode")
+        for fragment in ("Hello world", "This is a test", "<br"):
+            assert fragment in serialized
+
+    def test_paragraphs_with_empty_lines(self):
+        """A blank line between two texts produces two <p> elements."""
+        source = "First paragraph\n\nSecond paragraph"
+        html = TextToHtmlConverter().text_to_html(source)
+
+        tree = etree.fromstring(f"<div>{html}</div>")
+
+        found = tree.xpath("//p")
+        assert len(found) == 2
+
+        # Paragraph order must follow the order of the source text.
+        expected = ("First paragraph", "Second paragraph")
+        for element, fragment in zip(found, expected):
+            assert fragment in etree.tostring(element, encoding="unicode")
+
+    def test_simple_blockquote(self):
+        """Lines prefixed with ">" end up inside a <blockquote>."""
+        source = "> This is a quote\n> Another quote line"
+        html = TextToHtmlConverter().text_to_html(source)
+
+        tree = etree.fromstring(f"<div>{html}</div>")
+
+        # At least one blockquote is expected.
+        quotes = tree.xpath("//blockquote")
+        assert len(quotes) >= 1
+
+        # The first blockquote must hold at least one paragraph.
+        inner_paragraphs = quotes[0].xpath(".//p")
+        assert len(inner_paragraphs) >= 1
+
+    def test_nested_blockquotes(self):
+        """Nested ">" / ">>" quoting produces at least one <blockquote>."""
+        source = "> First level\n>> Second level\n> Back to first level"
+        html = TextToHtmlConverter().text_to_html(source)
+
+        tree = etree.fromstring(f"<div>{html}</div>")
+
+        # Only the presence of blockquotes is checked, not their nesting depth.
+        quotes = tree.xpath("//blockquote")
+        assert len(quotes) > 0
+
+    def test_ordered_list_conversion(self):
+        """Numbered lines become the <li> items of a single <ol>."""
+        source = "Here are steps:\n1. First step\n2. Second step\n3. Third step"
+        html = TextToHtmlConverter().text_to_html(source)
+
+        tree = etree.fromstring(f"<div>{html}</div>")
+
+        assert len(tree.xpath("//ol")) == 1
+
+        entries = tree.xpath("//li")
+        assert len(entries) == 3
+
+        # Items must keep the order of the source lines.
+        expected = ("First step", "Second step", "Third step")
+        for entry, fragment in zip(entries, expected):
+            assert fragment in entry.text
+
+    def test_unordered_list_conversion(self):
+        """Dash-prefixed lines become the <li> items of a single <ul>."""
+        source = "Here are points:\n- First point\n- Second point\n- Third point"
+        html = TextToHtmlConverter().text_to_html(source)
+
+        tree = etree.fromstring(f"<div>{html}</div>")
+
+        assert len(tree.xpath("//ul")) == 1
+
+        entries = tree.xpath("//li")
+        assert len(entries) == 3
+
+        # Items must keep the order of the source lines.
+        expected = ("First point", "Second point", "Third point")
+        for entry, fragment in zip(entries, expected):
+            assert fragment in entry.text
+
+
+class TestMailingListHtmlCleaner:
+    """Test the MailingListHtmlCleaner class."""
+
+    def test_clean_simple_text(self):
+        """Plain text without quotes yields at least one paragraph."""
+        cleaner = MailingListHtmlCleaner()
+        text = "Hello world\nThis is a test"
+        result = cleaner.clean_message(text)
+
+        root = etree.fromstring(f"<div>{result}</div>")
+
+        # Should contain a paragraph
+        paragraphs = root.xpath("//p")
+        assert len(paragraphs) > 0
+
+    def test_clean_with_quotes(self):
+        """Text containing a ">" quoted line yields at least one blockquote."""
+        cleaner = MailingListHtmlCleaner()
+        text = "Hello\n> This is quoted\ntext"
+        result = cleaner.clean_message(text)
+
+        root = etree.fromstring(f"<div>{result}</div>")
+
+        # Should contain blockquote
+        blockquotes = root.xpath("//blockquote")
+        assert len(blockquotes) > 0
+
+    def test_clean_mailing_list_message(self):
+        """Cleaning of a realistic XMPP mailing list message."""
+        cleaner = MailingListHtmlCleaner()
+        text = """On Wed, Mar 15, 2024 at 2:34 PM Louise <louise@example.org> wrote:
+
+> Hi everyone,
+>
+> I wanted to share some thoughts on XMPP server optimization we've been discussing internally.
+>
+> Key points:
+> - Use message carbons for better message synchronization
+> - Implement proper roster versioning to reduce bandwidth
+> - Consider using XEP-0313 (Message Archive Management) for history
+> - Don't forget to enable compression on S2S connections
+>
+> What are your experiences with these approaches? I'd love to hear what strategies have worked well for your deployments.
+>
+> Thanks,
+> Louise
+
+---
+
+Best practices for XMPP server optimization
+
+I've been working on some performance improvements lately and wanted to share a few thoughts on XMPP server optimization.
+
+I think we should be focusing on:
+1. Proper XEP implementation based on use cases
+2. Monitoring connection statistics
+3. Regular maintenance of database indexes
+
+Let me know if anyone has faced similar challenges or have different approaches they'd like to discuss.
+
+Thanks,
+Louise
+
+--
+Louise
+Senior XMPP Engineer
+example.org
+Email: louise@example.org
+Phone: (555) 123-4567
+
+This message was sent to dev@example.org
+To unsubscribe, visit: https://example.org/unsubscribe/dev
+For archives and more information: https://example.org/archives/dev"""
+        result = cleaner.clean_message(text)
+
+        root = etree.fromstring(f"<div>{result}</div>")
+
+        # Should contain proper HTML structure
+        paragraphs = root.xpath("//p")
+        assert len(paragraphs) > 0
+
+        # Should contain blockquotes for the quoted message
+        blockquotes = root.xpath("//blockquote")
+        assert len(blockquotes) > 0
+
+        # Should contain noise elements (reply context and mailing list signature).
+        # Only elements whose class mentions "noise" are kept: an element carrying
+        # any other class is not a noise marker and must not affect the checks.
+        classed_elements = root.xpath("//*[@class]")
+        noise_elements = [
+            elem for elem in classed_elements if "noise" in elem.get("class", "")
+        ]
+        assert len(noise_elements) > 0
+
+        # Check that common sign-offs like "Thanks," are not marked as noise:
+        # no noise-marked element may contain them.
+        thanks_elements = [
+            elem for elem in noise_elements if "Thanks" in "".join(elem.itertext())
+        ]
+        signature_noise_elements = [
+            elem for elem in noise_elements if "noise-signature" in elem.get("class", "")
+        ]
+        assert (
+            len(thanks_elements) == 0
+        ), 'Common sign-offs like "Thanks," should not be marked as noise'
+        assert (
+            len(signature_noise_elements) == 1
+        ), "Mailing-list signature should be marked as noise."
+
+
+def test_clean_mailing_list_content_function():
+    """Check convert_to_html_and_detect_noise on a short quoted message."""
+    message = """Hello
+> Quoted text
+
+--
+This message was sent to dev@example.org
+To unsubscribe, visit: https://example.org/unsubscribe/dev
+For archives and more information: https://example.org/archives/dev"""
+    html = convert_to_html_and_detect_noise(message)
+
+    tree = etree.fromstring(f"<div>{html}</div>")
+
+    # At least one paragraph and one blockquote are expected.
+    assert len(tree.xpath("//p")) > 0
+    assert len(tree.xpath("//blockquote")) > 0
+
+    # Gather the class of every element flagged as noise.
+    flagged = []
+    for element in tree.xpath("//*[@class]"):
+        css_class = element.get("class", "")
+        if "noise" in css_class:
+            flagged.append(css_class)
+
+    # The trailing mailing list footer of the message must be flagged with a
+    # "noise-signature" class.
+    assert any("noise-signature" in css_class for css_class in flagged)
+
+
+def test_common_signoffs_not_marked_as_noise():
+    """A closing such as "Best regards," must not be flagged as a signature."""
+    message = """Hello everyone,
+
+I wanted to share some thoughts with you.
+
+Best regards,
+John Doe"""
+    html = convert_to_html_and_detect_noise(message)
+
+    tree = etree.fromstring(f"<div>{html}</div>")
+
+    # The message must still be rendered as paragraphs.
+    assert len(tree.xpath("//p")) > 0
+
+    # Count the elements whose class marks them as a noise signature.
+    signature_count = sum(
+        "noise-signature" in element.get("class", "")
+        for element in tree.xpath("//*[@class]")
+    )
+
+    # This message has no mailing list footer, so nothing may be flagged.
+    assert signature_count == 0