From edd0b576b103df4acb251e41cfd4d55b53e25ae7 Mon Sep 17 00:00:00 2001 From: rbushria Date: Thu, 28 Aug 2025 10:46:44 +0300 Subject: [PATCH] Fix: Use correct URL variable for raw HTML extraction (#1116) - Prevents full HTML content from being passed as URL to extraction strategies - Added unit tests to verify raw HTML and regular URL processing Fix: Wrong URL variable used for extraction of raw html --- crawl4ai/async_webcrawler.py | 2 +- tests/general/test_async_webcrawler.py | 81 +++++++++++++++++++++++++- 2 files changed, 81 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index ebd2859d..359aa73c 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -615,7 +615,7 @@ class AsyncWebCrawler: else config.chunking_strategy ) sections = chunking.chunk(content) - extracted_content = config.extraction_strategy.run(url, sections) + extracted_content = config.extraction_strategy.run(_url, sections) extracted_content = json.dumps( extracted_content, indent=4, default=str, ensure_ascii=False ) diff --git a/tests/general/test_async_webcrawler.py b/tests/general/test_async_webcrawler.py index 4d7aa815..80d4acbe 100644 --- a/tests/general/test_async_webcrawler.py +++ b/tests/general/test_async_webcrawler.py @@ -9,6 +9,21 @@ from crawl4ai import ( RateLimiter, CacheMode ) +from crawl4ai.extraction_strategy import ExtractionStrategy + +class MockExtractionStrategy(ExtractionStrategy): + """Mock extraction strategy for testing URL parameter handling""" + + def __init__(self): + super().__init__() + self.run_calls = [] + + def extract(self, url: str, html: str, *args, **kwargs): + return [{"test": "data"}] + + def run(self, url: str, sections: List[str], *args, **kwargs): + self.run_calls.append(url) + return super().run(url, sections, *args, **kwargs) @pytest.mark.asyncio @pytest.mark.parametrize("viewport", [ @@ -142,8 +157,72 @@ async def test_error_handling(error_url): assert not result.success 
assert result.error_message is not None +@pytest.mark.asyncio +async def test_extraction_strategy_run_with_regular_url(): + """ + Regression test for extraction_strategy.run URL parameter handling with regular URLs. + + This test verifies that when is_raw_html=False (regular URL), + extraction_strategy.run is called with the actual URL. + """ + browser_config = BrowserConfig( + browser_type="chromium", + headless=True + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + mock_strategy = MockExtractionStrategy() + + # Test regular URL (is_raw_html=False) + regular_url = "https://example.com" + result = await crawler.arun( + url=regular_url, + config=CrawlerRunConfig( + page_timeout=30000, + extraction_strategy=mock_strategy, + cache_mode=CacheMode.BYPASS + ) + ) + + assert result.success + assert len(mock_strategy.run_calls) == 1 + assert mock_strategy.run_calls[0] == regular_url, f"Expected '{regular_url}', got '{mock_strategy.run_calls[0]}'" + +@pytest.mark.asyncio +async def test_extraction_strategy_run_with_raw_html(): + """ + Regression test for extraction_strategy.run URL parameter handling with raw HTML. + + This test verifies that when is_raw_html=True (URL starts with "raw:"), + extraction_strategy.run is called with "Raw HTML" instead of the actual URL. + """ + browser_config = BrowserConfig( + browser_type="chromium", + headless=True + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + mock_strategy = MockExtractionStrategy() + + # Test raw HTML URL (is_raw_html=True automatically set) + raw_html_url = "raw:<html><body><h1>Test HTML</h1><p>This is a test.</p></body></html>" + result = await crawler.arun( + url=raw_html_url, + config=CrawlerRunConfig( + page_timeout=30000, + extraction_strategy=mock_strategy, + cache_mode=CacheMode.BYPASS + ) + ) + + assert result.success + assert len(mock_strategy.run_calls) == 1 + assert mock_strategy.run_calls[0] == "Raw HTML", f"Expected 'Raw HTML', got '{mock_strategy.run_calls[0]}'" + if __name__ == "__main__": asyncio.run(test_viewport_config((1024, 768))) asyncio.run(test_memory_management()) asyncio.run(test_rate_limiting()) - asyncio.run(test_javascript_execution()) \ No newline at end of file + asyncio.run(test_javascript_execution()) + asyncio.run(test_extraction_strategy_run_with_regular_url()) + asyncio.run(test_extraction_strategy_run_with_raw_html())