* Fix: Use correct URL variable for raw HTML extraction (#1116) - Prevents full HTML content from being passed as URL to extraction strategies - Added unit tests to verify raw HTML and regular URL processing Fix: Wrong URL variable used for extraction of raw html * Fix #1181: Preserve whitespace in code blocks during HTML scraping The remove_empty_elements_fast() method was removing whitespace-only span elements inside <pre> and <code> tags, causing import statements like "import torch" to become "importtorch". Now skips elements inside code blocks where whitespace is significant. * Refactor Pydantic model configuration to use ConfigDict for arbitrary types * Fix EmbeddingStrategy: Uncomment response handling for the variations and clean up mock data. ref #1621 * Fix: permission issues with .cache/url_seeder and other runtime cache dirs. ref #1638 * fix: ensure BrowserConfig.to_dict serializes proxy_config * feat: make LLM backoff configurable end-to-end - extend LLMConfig with backoff delay/attempt/factor fields and thread them through LLMExtractionStrategy, LLMContentFilter, table extraction, and Docker API handlers - expose the backoff parameter knobs on perform_completion_with_backoff/aperform_completion_with_backoff and document them in the md_v2 guides * reproduced AttributeError from #1642 * pass timeout parameter to docker client request * added missing deep crawling objects to init * generalized query in ContentRelevanceFilter to be a str or list * import modules from enhanceable deserialization * parameterized tests * Fix: capture current page URL to reflect JavaScript navigation and add test for delayed redirects. ref #1268 * refactor: replace PyPDF2 with pypdf across the codebase. ref #1412 * announcement: add application form for cloud API closed beta * Release v0.7.8: Stability & Bug Fix Release - Updated version to 0.7.8 - Introduced focused stability release addressing 11 community-reported bugs. 
- Key fixes include Docker API improvements, LLM extraction enhancements, URL handling corrections, and dependency updates. - Added detailed release notes for v0.7.8 in the blog and created a dedicated verification script to ensure all fixes are functioning as intended. - Updated documentation to reflect recent changes and improvements. * docs: add section for Crawl4AI Cloud API closed beta with application link * fix: add disk cleanup step to Docker workflow --------- Co-authored-by: rbushria <rbushri@gmail.com> Co-authored-by: AHMET YILMAZ <tawfik@kidocode.com> Co-authored-by: Soham Kukreti <kukretisoham@gmail.com> Co-authored-by: Chris Murphy <chris.murphy@klaviyo.com> Co-authored-by: Aravind Karnam <aravind.karanam@gmail.com>
229 lines
6.9 KiB
Python
import asyncio
|
|
import pytest
|
|
from typing import List
|
|
from crawl4ai import (
|
|
AsyncWebCrawler,
|
|
BrowserConfig,
|
|
CrawlerRunConfig,
|
|
MemoryAdaptiveDispatcher,
|
|
RateLimiter,
|
|
CacheMode
|
|
)
|
|
from crawl4ai.extraction_strategy import ExtractionStrategy
|
|
|
|
class MockExtractionStrategy(ExtractionStrategy):
    """Test double that records every URL handed to ``run``.

    The recorded values let tests verify that the crawler forwards the
    correct URL (a real URL, or the "Raw HTML" placeholder for raw-HTML
    crawls) to extraction strategies instead of the page content.
    """

    def __init__(self):
        super().__init__()
        # Chronological record of the ``url`` argument seen by run().
        self.run_calls: List[str] = []

    def extract(self, url: str, html: str, *args, **kwargs):
        # Extraction output is irrelevant here; return a fixed payload.
        return [{"test": "data"}]

    def run(self, url: str, sections: List[str], *args, **kwargs):
        # Capture the URL before delegating to the real implementation.
        self.run_calls.append(url)
        return super().run(url, sections, *args, **kwargs)
@pytest.mark.asyncio
@pytest.mark.parametrize("viewport", [
    (800, 600),
    (1024, 768),
    (1920, 1080),
])
async def test_viewport_config(viewport):
    """Test different viewport configurations.

    Parametrized over common resolutions; asserts the crawl of a simple
    page succeeds with each viewport size.
    """
    width, height = viewport
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True,
        viewport_width=width,
        viewport_height=height,
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=CrawlerRunConfig(
                # Bypass the cache so every parametrized case performs a
                # real fetch (consistent with the other tests in this file);
                # a cached result would not exercise the new viewport.
                cache_mode=CacheMode.BYPASS,
                page_timeout=30000,  # 30 seconds
            ),
        )
        assert result.success
@pytest.mark.asyncio
async def test_memory_management():
    """Test memory-adaptive dispatching.

    Crawls a small batch through a MemoryAdaptiveDispatcher and checks
    that one result is produced per input URL.
    """
    cfg = BrowserConfig(
        browser_type="chromium",
        headless=True,
        viewport_width=1024,
        viewport_height=768,
    )

    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=5,
    )

    # Duplicated targets exercise concurrent dispatch of identical URLs.
    targets = ["https://example.com"] * 3

    async with AsyncWebCrawler(config=cfg) as crawler:
        outcomes = await crawler.arun_many(
            urls=targets,
            config=CrawlerRunConfig(page_timeout=30000),
            dispatcher=dispatcher,
        )
        assert len(outcomes) == len(targets)
@pytest.mark.asyncio
async def test_rate_limiting():
    """Test rate limiting functionality.

    Attaches a RateLimiter to the dispatcher and verifies a multi-URL
    batch still yields one result per input.
    """
    cfg = BrowserConfig(browser_type="chromium", headless=True)

    limiter = RateLimiter(
        base_delay=(1.0, 2.0),
        max_delay=5.0,
        max_retries=2,
    )
    dispatcher = MemoryAdaptiveDispatcher(
        rate_limiter=limiter,
        memory_threshold_percent=70.0,
    )

    targets = [
        "https://example.com",
        "https://example.org",
        "https://example.net",
    ]

    async with AsyncWebCrawler(config=cfg) as crawler:
        outcomes = await crawler.arun_many(
            urls=targets,
            config=CrawlerRunConfig(page_timeout=30000),
            dispatcher=dispatcher,
        )
        assert len(outcomes) == len(targets)
@pytest.mark.asyncio
async def test_javascript_execution():
    """Test JavaScript execution capabilities.

    Runs a snippet that mutates the page and asserts the crawl succeeds
    with JS enabled.
    """
    cfg = BrowserConfig(
        browser_type="chromium",
        headless=True,
        java_script_enabled=True,
    )

    # Mutate the DOM and read the change back so execution is observable.
    js_code = """
    document.body.style.backgroundColor = 'red';
    return document.body.style.backgroundColor;
    """

    async with AsyncWebCrawler(config=cfg) as crawler:
        outcome = await crawler.arun(
            url="https://example.com",
            config=CrawlerRunConfig(
                js_code=js_code,
                page_timeout=30000,
            ),
        )
        assert outcome.success
@pytest.mark.asyncio
@pytest.mark.parametrize("error_url", [
    "https://invalid.domain.test",
    "https://httpbin.org/status/404",
    "https://httpbin.org/status/503",
    "https://httpbin.org/status/403",
])
async def test_error_handling(error_url):
    """Test error handling for various failure scenarios.

    Covers DNS failure plus 4xx/5xx responses; each must report a failed
    crawl with a diagnostic message.
    """
    cfg = BrowserConfig(browser_type="chromium", headless=True)

    async with AsyncWebCrawler(config=cfg) as crawler:
        outcome = await crawler.arun(
            url=error_url,
            config=CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                page_timeout=10000,  # Short timeout for error cases
            ),
        )
        # Failures must be reported, never raised, and must carry a message.
        assert not outcome.success
        assert outcome.error_message is not None
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_regular_url():
    """Regression test for extraction_strategy.run URL handling (regular URL).

    With is_raw_html=False (an ordinary URL), extraction_strategy.run must
    be invoked with the actual URL rather than the page HTML.
    """
    cfg = BrowserConfig(browser_type="chromium", headless=True)

    async with AsyncWebCrawler(config=cfg) as crawler:
        spy = MockExtractionStrategy()

        # Ordinary URL crawl: is_raw_html stays False.
        regular_url = "https://example.com"
        outcome = await crawler.arun(
            url=regular_url,
            config=CrawlerRunConfig(
                page_timeout=30000,
                extraction_strategy=spy,
                cache_mode=CacheMode.BYPASS,
            ),
        )

        assert outcome.success
        assert len(spy.run_calls) == 1
        assert spy.run_calls[0] == regular_url, f"Expected '{regular_url}', got '{spy.run_calls[0]}'"
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_raw_html():
    """Regression test for extraction_strategy.run URL handling (raw HTML).

    With a "raw:"-prefixed URL (is_raw_html=True), extraction_strategy.run
    must receive the "Raw HTML" placeholder, not the full HTML content.
    """
    cfg = BrowserConfig(browser_type="chromium", headless=True)

    async with AsyncWebCrawler(config=cfg) as crawler:
        spy = MockExtractionStrategy()

        # The "raw:" prefix switches the crawler into raw-HTML mode.
        raw_html_url = "raw:<html><body><h1>Test HTML</h1><p>This is a test.</p></body></html>"
        outcome = await crawler.arun(
            url=raw_html_url,
            config=CrawlerRunConfig(
                page_timeout=30000,
                extraction_strategy=spy,
                cache_mode=CacheMode.BYPASS,
            ),
        )

        assert outcome.success
        assert len(spy.run_calls) == 1
        assert spy.run_calls[0] == "Raw HTML", f"Expected 'Raw HTML', got '{spy.run_calls[0]}'"
if __name__ == "__main__":
    # Convenience runner for executing this module directly (outside pytest).
    # Each coroutine gets its own event loop via asyncio.run; parametrized
    # tests are invoked with one representative argument.
    asyncio.run(test_viewport_config((1024, 768)))
    asyncio.run(test_memory_management())
    asyncio.run(test_rate_limiting())
    asyncio.run(test_javascript_execution())
    # Previously omitted here: the error-path test never ran in direct mode.
    asyncio.run(test_error_handling("https://httpbin.org/status/404"))
    asyncio.run(test_extraction_strategy_run_with_regular_url())
    asyncio.run(test_extraction_strategy_run_with_raw_html())