Merge pull request #1447 from rbushri/fix/wrong_url_raw
Fix: Wrong URL variable used for extraction of raw html
This commit is contained in:
@@ -617,11 +617,11 @@ class AsyncWebCrawler:
|
|||||||
else config.chunking_strategy
|
else config.chunking_strategy
|
||||||
)
|
)
|
||||||
sections = chunking.chunk(content)
|
sections = chunking.chunk(content)
|
||||||
# extracted_content = config.extraction_strategy.run(url, sections)
|
# extracted_content = config.extraction_strategy.run(_url, sections)
|
||||||
|
|
||||||
# Use async version if available for better parallelism
|
# Use async version if available for better parallelism
|
||||||
if hasattr(config.extraction_strategy, 'arun'):
|
if hasattr(config.extraction_strategy, 'arun'):
|
||||||
extracted_content = await config.extraction_strategy.arun(url, sections)
|
extracted_content = await config.extraction_strategy.arun(_url, sections)
|
||||||
else:
|
else:
|
||||||
# Fallback to sync version run in thread pool to avoid blocking
|
# Fallback to sync version run in thread pool to avoid blocking
|
||||||
extracted_content = await asyncio.to_thread(
|
extracted_content = await asyncio.to_thread(
|
||||||
|
|||||||
@@ -9,6 +9,21 @@ from crawl4ai import (
|
|||||||
RateLimiter,
|
RateLimiter,
|
||||||
CacheMode
|
CacheMode
|
||||||
)
|
)
|
||||||
|
from crawl4ai.extraction_strategy import ExtractionStrategy
|
||||||
|
|
||||||
|
class MockExtractionStrategy(ExtractionStrategy):
    """Extraction strategy stub that records the URL passed to each ``run`` call.

    ``extract`` returns a fixed payload, so the interesting state is
    ``run_calls``: the ``url`` arguments received by ``run``, in call order.
    """

    def __init__(self):
        super().__init__()
        # URLs received by run(), in call order; inspected by the tests.
        self.run_calls = []

    def extract(self, url: str, html: str, *args, **kwargs):
        """Return a constant single-item result, ignoring every argument."""
        canned_item = {"test": "data"}
        return [canned_item]

    def run(self, url: str, sections: List[str], *args, **kwargs):
        """Record ``url``, then delegate to the parent class's ``run``."""
        self.run_calls.append(url)
        outcome = super().run(url, sections, *args, **kwargs)
        return outcome
|
||||||
|
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
@pytest.mark.parametrize("viewport", [
|
@pytest.mark.parametrize("viewport", [
|
||||||
@@ -142,8 +157,72 @@ async def test_error_handling(error_url):
|
|||||||
assert not result.success
|
assert not result.success
|
||||||
assert result.error_message is not None
|
assert result.error_message is not None
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_regular_url():
    """
    Regression test: a regular URL must reach ``extraction_strategy.run`` unchanged.

    Crawls ``https://example.com`` with a recording mock strategy and checks
    that ``run`` was invoked exactly once, with that same URL as its argument.
    """
    browser_config = BrowserConfig(browser_type="chromium", headless=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        mock_strategy = MockExtractionStrategy()

        # Plain http(s) URL, i.e. not the "raw:" form.
        regular_url = "https://example.com"
        run_config = CrawlerRunConfig(
            page_timeout=30000,
            extraction_strategy=mock_strategy,
            cache_mode=CacheMode.BYPASS,
        )
        result = await crawler.arun(url=regular_url, config=run_config)

        assert result.success
        assert len(mock_strategy.run_calls) == 1
        assert mock_strategy.run_calls[0] == regular_url, f"Expected '{regular_url}', got '{mock_strategy.run_calls[0]}'"
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_raw_html():
    """
    Regression test: raw HTML input must reach ``extraction_strategy.run`` as "Raw HTML".

    Crawls a ``raw:``-prefixed HTML string with a recording mock strategy and
    checks that ``run`` was invoked exactly once, with the literal string
    "Raw HTML" rather than the ``raw:...`` value passed as ``url``.
    """
    browser_config = BrowserConfig(browser_type="chromium", headless=True)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        mock_strategy = MockExtractionStrategy()

        # Inline document passed through the url argument via the "raw:" prefix.
        raw_html_url = "raw:<html><body><h1>Test HTML</h1><p>This is a test.</p></body></html>"
        run_config = CrawlerRunConfig(
            page_timeout=30000,
            extraction_strategy=mock_strategy,
            cache_mode=CacheMode.BYPASS,
        )
        result = await crawler.arun(url=raw_html_url, config=run_config)

        assert result.success
        assert len(mock_strategy.run_calls) == 1
        assert mock_strategy.run_calls[0] == "Raw HTML", f"Expected 'Raw HTML', got '{mock_strategy.run_calls[0]}'"
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Direct-execution entry point: run the selected tests one after another,
    # each in its own event loop, without going through pytest.
    _direct_runs = (
        (test_viewport_config, ((1024, 768),)),
        (test_memory_management, ()),
        (test_rate_limiting, ()),
        (test_javascript_execution, ()),
        (test_extraction_strategy_run_with_regular_url, ()),
        (test_extraction_strategy_run_with_raw_html, ()),
    )
    for _test_fn, _test_args in _direct_runs:
        asyncio.run(_test_fn(*_test_args))
|
||||||
|
|||||||
Reference in New Issue
Block a user