feat(markdown): add content source selection for markdown generation

Adds a new content_source parameter to MarkdownGenerationStrategy that allows
selecting which HTML content to use for markdown generation:
- cleaned_html (default): uses post-processed HTML
- raw_html: uses original webpage HTML
- fit_html: uses preprocessed HTML for schema extraction

Changes include:
- Added content_source parameter to MarkdownGenerationStrategy
- Updated AsyncWebCrawler to handle HTML source selection
- Added examples and tests for the new feature
- Updated documentation with new parameter details

BREAKING CHANGE: Renamed cleaned_html parameter to input_html in generate_markdown()
method signature to better reflect its generalized purpose
This commit is contained in:
UncleCode
2025-04-17 20:13:53 +08:00
parent 94d486579c
commit 7db6b468d9
9 changed files with 383 additions and 24 deletions

View File

@@ -0,0 +1,106 @@
"""
Tests for the content_source parameter in markdown generation.
"""
import unittest
import asyncio
from unittest.mock import patch, MagicMock
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
from crawl4ai.async_webcrawler import AsyncWebCrawler
from crawl4ai.async_configs import CrawlerRunConfig
from crawl4ai.models import MarkdownGenerationResult
HTML_SAMPLE = """
<html>
<head><title>Test Page</title></head>
<body>
<h1>Test Content</h1>
<p>This is a test paragraph.</p>
<div class="container">
<p>This is content within a container.</p>
</div>
</body>
</html>
"""
class TestContentSourceParameter(unittest.TestCase):
"""Test cases for the content_source parameter in markdown generation."""
def setUp(self):
"""Set up test fixtures."""
self.loop = asyncio.new_event_loop()
asyncio.set_event_loop(self.loop)
def tearDown(self):
"""Tear down test fixtures."""
self.loop.close()
def test_default_content_source(self):
"""Test that the default content_source is 'cleaned_html'."""
# Can't directly instantiate abstract class, so just test DefaultMarkdownGenerator
generator = DefaultMarkdownGenerator()
self.assertEqual(generator.content_source, "cleaned_html")
def test_custom_content_source(self):
"""Test that content_source can be customized."""
generator = DefaultMarkdownGenerator(content_source="fit_html")
self.assertEqual(generator.content_source, "fit_html")
@patch('crawl4ai.markdown_generation_strategy.CustomHTML2Text')
def test_html_processing_using_input_html(self, mock_html2text):
"""Test that generate_markdown uses input_html parameter."""
# Setup mock
mock_instance = MagicMock()
mock_instance.handle.return_value = "# Test Content\n\nThis is a test paragraph."
mock_html2text.return_value = mock_instance
# Create generator and call generate_markdown
generator = DefaultMarkdownGenerator()
result = generator.generate_markdown(input_html="<h1>Test Content</h1><p>This is a test paragraph.</p>")
# Verify input_html was passed to HTML2Text handler
mock_instance.handle.assert_called_once()
# Get the first positional argument
args, _ = mock_instance.handle.call_args
self.assertEqual(args[0], "<h1>Test Content</h1><p>This is a test paragraph.</p>")
# Check result
self.assertIsInstance(result, MarkdownGenerationResult)
self.assertEqual(result.raw_markdown, "# Test Content\n\nThis is a test paragraph.")
def test_html_source_selection_logic(self):
"""Test that the HTML source selection logic works correctly."""
# We'll test the dispatch pattern directly to avoid async complexities
# Create test data
raw_html = "<html><body><h1>Raw HTML</h1></body></html>"
cleaned_html = "<html><body><h1>Cleaned HTML</h1></body></html>"
fit_html = "<html><body><h1>Preprocessed HTML</h1></body></html>"
# Test the dispatch pattern
html_source_selector = {
"raw_html": lambda: raw_html,
"cleaned_html": lambda: cleaned_html,
"fit_html": lambda: fit_html,
}
# Test Case 1: content_source="cleaned_html"
source_lambda = html_source_selector.get("cleaned_html")
self.assertEqual(source_lambda(), cleaned_html)
# Test Case 2: content_source="raw_html"
source_lambda = html_source_selector.get("raw_html")
self.assertEqual(source_lambda(), raw_html)
# Test Case 3: content_source="fit_html"
source_lambda = html_source_selector.get("fit_html")
self.assertEqual(source_lambda(), fit_html)
# Test Case 4: Invalid content_source falls back to cleaned_html
source_lambda = html_source_selector.get("invalid_source", lambda: cleaned_html)
self.assertEqual(source_lambda(), cleaned_html)
if __name__ == '__main__':
unittest.main()