Mercurial > libervia-backend
changeset 4400:b591c7dff8ab
tests (unit/components): Add tests for email gateway's cleaning module:
rel 464
author | Goffi <goffi@goffi.org> |
---|---|
date | Thu, 11 Sep 2025 21:17:51 +0200 |
parents | fe09446a09ce |
children | ae26233b655f |
files | tests/unit/components/test_cleaning.py |
diffstat | 1 files changed, 287 insertions(+), 0 deletions(-) [+] |
line wrap: on
line diff
#!/usr/bin/env python3

# Libervia: an XMPP client
# Copyright (C) 2009-2025 Jérôme Poisson (goffi@goffi.org)

# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU Affero General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.

# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
# GNU Affero General Public License for more details.

# You should have received a copy of the GNU Affero General Public License
# along with this program.  If not, see <http://www.gnu.org/licenses/>.

"""Unit tests for the email gateway's cleaning module."""

from lxml import etree

from libervia.backend.plugins.plugin_comp_email_gateway.cleaning import (
    TextToHtmlConverter,
    MailingListHtmlCleaner,
    convert_to_html_and_detect_noise,
)


def _parse_fragment(html):
    """Parse an HTML fragment and return its root element.

    The fragment is wrapped in a single ``<div>`` so that output made of
    several sibling elements can still be parsed as one XML document.

    @param html: serialised fragment returned by the code under test.
    @return: the wrapping ``<div>`` element.
    """
    return etree.fromstring(f"<div>{html}</div>")


def _text_of(element):
    """Return all the text found in ``element`` and its descendants.

    ``element.text`` only holds the text located before the first child, and is
    ``None`` when there is none; joining ``itertext()`` always yields a string.
    """
    return "".join(element.itertext())


def _elements_with_class(root, token):
    """Return the elements whose ``class`` attribute contains ``token``.

    @param root: element used as XPath context; the whole document is searched.
    @param token: substring looked for in the ``class`` attribute value.
    @return: list of matching elements, in document order.
    """
    return [
        elem for elem in root.xpath("//*[@class]") if token in elem.get("class", "")
    ]


class TestTextToHtmlConverter:
    """Test the TextToHtmlConverter class."""

    def test_simple_text_conversion(self):
        """Simple text to HTML conversion."""
        converter = TextToHtmlConverter()
        text = "Hello world\nThis is a test"
        result = converter.text_to_html(text)

        root = _parse_fragment(result)

        # Should create a single paragraph with line break
        paragraphs = root.xpath("//p")
        assert len(paragraphs) == 1

        # Check that both parts of the text are in the paragraph
        paragraph_html = etree.tostring(paragraphs[0], encoding="unicode")
        assert "Hello world" in paragraph_html
        assert "This is a test" in paragraph_html
        assert "<br" in paragraph_html

    def test_paragraphs_with_empty_lines(self):
        """Several paragraphs are converted to several <p> elements."""
        converter = TextToHtmlConverter()
        text = "First paragraph\n\nSecond paragraph"
        result = converter.text_to_html(text)

        root = _parse_fragment(result)

        paragraphs = root.xpath("//p")
        assert len(paragraphs) == 2
        first_p_html = etree.tostring(paragraphs[0], encoding="unicode")
        second_p_html = etree.tostring(paragraphs[1], encoding="unicode")
        assert "First paragraph" in first_p_html
        assert "Second paragraph" in second_p_html

    def test_simple_blockquote(self):
        """Simple blockquote is converted."""
        converter = TextToHtmlConverter()
        text = "> This is a quote\n> Another quote line"
        result = converter.text_to_html(text)

        root = _parse_fragment(result)

        # Should contain blockquote
        blockquotes = root.xpath("//blockquote")
        assert len(blockquotes) >= 1

        # Should contain the quoted content
        assert len(blockquotes[0].xpath(".//p")) >= 1

    def test_nested_blockquotes(self):
        """Nested blockquotes are converted."""
        converter = TextToHtmlConverter()
        text = "> First level\n>> Second level\n> Back to first level"
        result = converter.text_to_html(text)

        root = _parse_fragment(result)

        # Only the presence of blockquotes is checked here, not their nesting.
        blockquotes = root.xpath("//blockquote")
        assert blockquotes

    def test_ordered_list_conversion(self):
        """Ordered lists are converted."""
        converter = TextToHtmlConverter()
        text = "Here are steps:\n1. First step\n2. Second step\n3. Third step"
        result = converter.text_to_html(text)

        root = _parse_fragment(result)

        lists = root.xpath("//ol")
        assert len(lists) == 1

        items = root.xpath("//li")
        assert len(items) == 3
        # Full text content is used so that the check fails cleanly (instead of
        # raising TypeError) if an item wraps its text in a child element.
        assert "First step" in _text_of(items[0])
        assert "Second step" in _text_of(items[1])
        assert "Third step" in _text_of(items[2])

    def test_unordered_list_conversion(self):
        """Unordered lists are converted."""
        converter = TextToHtmlConverter()
        text = "Here are points:\n- First point\n- Second point\n- Third point"
        result = converter.text_to_html(text)

        root = _parse_fragment(result)

        lists = root.xpath("//ul")
        assert len(lists) == 1

        items = root.xpath("//li")
        assert len(items) == 3
        # See test_ordered_list_conversion for why full text content is used.
        assert "First point" in _text_of(items[0])
        assert "Second point" in _text_of(items[1])
        assert "Third point" in _text_of(items[2])


class TestMailingListHtmlCleaner:
    """Test the MailingListHtmlCleaner class."""

    def test_clean_simple_text(self):
        """Cleaning of simple text."""
        cleaner = MailingListHtmlCleaner()
        text = "Hello world\nThis is a test"
        result = cleaner.clean_message(text)

        root = _parse_fragment(result)

        # Should contain a paragraph
        paragraphs = root.xpath("//p")
        assert paragraphs

    def test_clean_with_quotes(self):
        """Cleaning of text with quotes."""
        cleaner = MailingListHtmlCleaner()
        text = "Hello\n> This is quoted\ntext"
        result = cleaner.clean_message(text)

        root = _parse_fragment(result)

        # Should contain blockquote
        blockquotes = root.xpath("//blockquote")
        assert blockquotes

    def test_clean_mailing_list_message(self):
        """Cleaning of a realistic XMPP mailing list message."""
        cleaner = MailingListHtmlCleaner()
        text = """On Wed, Mar 15, 2024 at 2:34 PM Louise <louise@example.org> wrote:

> Hi everyone,
>
> I wanted to share some thoughts on XMPP server optimization we've been discussing internally.
>
> Key points:
> - Use message carbons for better message synchronization
> - Implement proper roster versioning to reduce bandwidth
> - Consider using XEP-0313 (Message Archive Management) for history
> - Don't forget to enable compression on S2S connections
>
> What are your experiences with these approaches? I'd love to hear what strategies have worked well for your deployments.
>
> Thanks,
> Louise

---

Best practices for XMPP server optimization

I've been working on some performance improvements lately and wanted to share a few thoughts on XMPP server optimization.

I think we should be focusing on:
1. Proper XEP implementation based on use cases
2. Monitoring connection statistics
3. Regular maintenance of database indexes

Let me know if anyone has faced similar challenges or have different approaches they'd like to discuss.

Thanks,
Louise

--
Louise
Senior XMPP Engineer
example.org
Email: louise@example.org
Phone: (555) 123-4567

This message was sent to dev@example.org
To unsubscribe, visit: https://example.org/unsubscribe/dev
For archives and more information: https://example.org/archives/dev"""
        result = cleaner.clean_message(text)

        root = _parse_fragment(result)

        # Should contain proper HTML structure
        paragraphs = root.xpath("//p")
        assert paragraphs

        # Should contain blockquotes for the quoted message
        blockquotes = root.xpath("//blockquote")
        assert blockquotes

        # Should contain noise elements (reply context and mailing list signature)
        assert _elements_with_class(root, "noise")

        # Check that common sign-offs like "Thanks," are not marked as noise:
        # no element carrying a class attribute may contain them.
        classed_elements = root.xpath("//*[@class]")
        thanks_elements = [
            elem for elem in classed_elements if "Thanks" in _text_of(elem)
        ]
        signature_noise_elements = _elements_with_class(root, "noise-signature")
        assert (
            not thanks_elements
        ), 'Common sign-offs like "Thanks," should not be marked as noise'
        assert (
            len(signature_noise_elements) == 1
        ), "Mailing-list signature should be marked as noise."


def test_clean_mailing_list_content_function():
    """Test the main convert_to_html_and_detect_noise function."""
    text = """Hello
> Quoted text

--
This message was sent to dev@example.org
To unsubscribe, visit: https://example.org/unsubscribe/dev
For archives and more information: https://example.org/archives/dev"""
    result = convert_to_html_and_detect_noise(text)

    root = _parse_fragment(result)

    # Should contain proper HTML structure
    paragraphs = root.xpath("//p")
    assert paragraphs

    blockquotes = root.xpath("//blockquote")
    assert blockquotes

    # The mailing list signature should be detected as noise
    assert _elements_with_class(root, "noise-signature")


def test_common_signoffs_not_marked_as_noise():
    """Test that common sign-offs like 'regards' are not marked as noise."""
    text = """Hello everyone,

I wanted to share some thoughts with you.

Best regards,
John Doe"""
    result = convert_to_html_and_detect_noise(text)

    root = _parse_fragment(result)

    # Should contain proper HTML structure
    paragraphs = root.xpath("//p")
    assert paragraphs

    # Common sign-offs like "Best regards," should not be marked as noise
    assert not _elements_with_class(root, "noise-signature")