""" Tests for the content_source parameter in markdown generation. """ import unittest import asyncio from unittest.mock import patch, MagicMock from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy from crawl4ai.async_webcrawler import AsyncWebCrawler from crawl4ai.async_configs import CrawlerRunConfig from crawl4ai.models import MarkdownGenerationResult HTML_SAMPLE = """ Test Page

Test Content

This is a test paragraph.

This is content within a container.

""" class TestContentSourceParameter(unittest.TestCase): """Test cases for the content_source parameter in markdown generation.""" def setUp(self): """Set up test fixtures.""" self.loop = asyncio.new_event_loop() asyncio.set_event_loop(self.loop) def tearDown(self): """Tear down test fixtures.""" self.loop.close() def test_default_content_source(self): """Test that the default content_source is 'cleaned_html'.""" # Can't directly instantiate abstract class, so just test DefaultMarkdownGenerator generator = DefaultMarkdownGenerator() self.assertEqual(generator.content_source, "cleaned_html") def test_custom_content_source(self): """Test that content_source can be customized.""" generator = DefaultMarkdownGenerator(content_source="fit_html") self.assertEqual(generator.content_source, "fit_html") @patch('crawl4ai.markdown_generation_strategy.CustomHTML2Text') def test_html_processing_using_input_html(self, mock_html2text): """Test that generate_markdown uses input_html parameter.""" # Setup mock mock_instance = MagicMock() mock_instance.handle.return_value = "# Test Content\n\nThis is a test paragraph." mock_html2text.return_value = mock_instance # Create generator and call generate_markdown generator = DefaultMarkdownGenerator() result = generator.generate_markdown(input_html="

Test Content

This is a test paragraph.

") # Verify input_html was passed to HTML2Text handler mock_instance.handle.assert_called_once() # Get the first positional argument args, _ = mock_instance.handle.call_args self.assertEqual(args[0], "

Test Content

This is a test paragraph.

") # Check result self.assertIsInstance(result, MarkdownGenerationResult) self.assertEqual(result.raw_markdown, "# Test Content\n\nThis is a test paragraph.") def test_html_source_selection_logic(self): """Test that the HTML source selection logic works correctly.""" # We'll test the dispatch pattern directly to avoid async complexities # Create test data raw_html = "

Raw HTML

" cleaned_html = "

Cleaned HTML

" fit_html = "

Preprocessed HTML

" # Test the dispatch pattern html_source_selector = { "raw_html": lambda: raw_html, "cleaned_html": lambda: cleaned_html, "fit_html": lambda: fit_html, } # Test Case 1: content_source="cleaned_html" source_lambda = html_source_selector.get("cleaned_html") self.assertEqual(source_lambda(), cleaned_html) # Test Case 2: content_source="raw_html" source_lambda = html_source_selector.get("raw_html") self.assertEqual(source_lambda(), raw_html) # Test Case 3: content_source="fit_html" source_lambda = html_source_selector.get("fit_html") self.assertEqual(source_lambda(), fit_html) # Test Case 4: Invalid content_source falls back to cleaned_html source_lambda = html_source_selector.get("invalid_source", lambda: cleaned_html) self.assertEqual(source_lambda(), cleaned_html) if __name__ == '__main__': unittest.main()