Files
crawl4ai/tests/general/test_async_webcrawler.py
Nasrin a87e8c1c9e Release/v0.7.8 (#1662)
* Fix: Use correct URL variable for raw HTML extraction (#1116)

- Prevents full HTML content from being passed as URL to extraction strategies
- Added unit tests to verify raw HTML and regular URL processing

Fix: Wrong URL variable used for extraction of raw html

* Fix #1181: Preserve whitespace in code blocks during HTML scraping

  The remove_empty_elements_fast() method was removing whitespace-only
  span elements inside <pre> and <code> tags, causing import statements
  like "import torch" to become "importtorch". Now skips elements inside
  code blocks where whitespace is significant.

* Refactor Pydantic model configuration to use ConfigDict for arbitrary types

* Fix EmbeddingStrategy: Uncomment response handling for the variations and clean up mock data. ref #1621

* Fix: permission issues with .cache/url_seeder and other runtime cache dirs. ref #1638

* fix: ensure BrowserConfig.to_dict serializes proxy_config

* feat: make LLM backoff configurable end-to-end

- extend LLMConfig with backoff delay/attempt/factor fields and thread them
  through LLMExtractionStrategy, LLMContentFilter, table extraction, and
  Docker API handlers
- expose the backoff parameter knobs on perform_completion_with_backoff/aperform_completion_with_backoff
  and document them in the md_v2 guides

* reproduced AttributeError from #1642

* pass timeout parameter to docker client request

* added missing deep crawling objects to init

* generalized query in ContentRelevanceFilter to be a str or list

* import modules from enhanceable deserialization

* parameterized tests

* Fix: capture current page URL to reflect JavaScript navigation and add test for delayed redirects. ref #1268

* refactor: replace PyPDF2 with pypdf across the codebase. ref #1412

* announcement: add application form for cloud API closed beta

* Release v0.7.8: Stability & Bug Fix Release

- Updated version to 0.7.8
- Introduced focused stability release addressing 11 community-reported bugs.
- Key fixes include Docker API improvements, LLM extraction enhancements, URL handling corrections, and dependency updates.
- Added detailed release notes for v0.7.8 in the blog and created a dedicated verification script to ensure all fixes are functioning as intended.
- Updated documentation to reflect recent changes and improvements.

* docs: add section for Crawl4AI Cloud API closed beta with application link

* fix: add disk cleanup step to Docker workflow

---------

Co-authored-by: rbushria <rbushri@gmail.com>
Co-authored-by: AHMET YILMAZ <tawfik@kidocode.com>
Co-authored-by: Soham Kukreti <kukretisoham@gmail.com>
Co-authored-by: Chris Murphy <chris.murphy@klaviyo.com>
Co-authored-by: Aravind Karnam <aravind.karanam@gmail.com>
2025-12-11 11:04:52 +01:00

229 lines
6.9 KiB
Python

import asyncio
import pytest
from typing import List
from crawl4ai import (
AsyncWebCrawler,
BrowserConfig,
CrawlerRunConfig,
MemoryAdaptiveDispatcher,
RateLimiter,
CacheMode
)
from crawl4ai.extraction_strategy import ExtractionStrategy
class MockExtractionStrategy(ExtractionStrategy):
    """Extraction strategy stub that remembers every URL handed to ``run``.

    ``extract`` always yields the same fixed payload, so tests can focus
    purely on which ``url`` value the crawler passes into ``run``.
    """

    def __init__(self):
        super().__init__()
        # URLs received by run(), in call order.
        self.run_calls = []

    def extract(self, url: str, html: str, *args, **kwargs):
        """Return a constant single-item result; all arguments are ignored."""
        canned_result = [{"test": "data"}]
        return canned_result

    def run(self, url: str, sections: List[str], *args, **kwargs):
        """Record ``url``, then hand the call over to the base-class ``run``."""
        self.run_calls.append(url)
        base_result = super().run(url, sections, *args, **kwargs)
        return base_result
@pytest.mark.asyncio
@pytest.mark.parametrize("viewport", [
    (800, 600),
    (1024, 768),
    (1920, 1080)
])
async def test_viewport_config(viewport):
    """Crawl example.com with each (width, height) viewport and expect success."""
    # NOTE: cache_mode is left at the CrawlerRunConfig default here
    # (there is no CacheMode.BYPASS override in this test).
    run_settings = CrawlerRunConfig(
        page_timeout=30000  # 30 seconds
    )
    chromium_settings = BrowserConfig(
        browser_type="chromium",
        headless=True,
        viewport_width=viewport[0],
        viewport_height=viewport[1]
    )

    async with AsyncWebCrawler(config=chromium_settings) as crawler:
        outcome = await crawler.arun(url="https://example.com", config=run_settings)
        assert outcome.success
@pytest.mark.asyncio
async def test_memory_management():
    """Run ``arun_many`` through a MemoryAdaptiveDispatcher and count the results.

    The same URL is submitted three times; the test only checks that one
    result comes back per submitted URL.
    """
    chromium_settings = BrowserConfig(
        browser_type="chromium",
        headless=True,
        viewport_width=1024,
        viewport_height=768
    )
    memory_dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,
        check_interval=1.0,
        max_session_permit=5
    )
    # Three identical entries: the batch deliberately repeats one URL.
    url_batch = ["https://example.com" for _ in range(3)]
    run_settings = CrawlerRunConfig(page_timeout=30000)

    async with AsyncWebCrawler(config=chromium_settings) as crawler:
        crawl_results = await crawler.arun_many(
            urls=url_batch,
            config=run_settings,
            dispatcher=memory_dispatcher
        )
        assert len(crawl_results) == len(url_batch)
@pytest.mark.asyncio
async def test_rate_limiting():
    """Crawl three URLs via a dispatcher that carries a RateLimiter.

    Only the number of returned results is checked: one per requested URL.
    """
    chromium_settings = BrowserConfig(
        browser_type="chromium",
        headless=True
    )
    limiter = RateLimiter(
        base_delay=(1.0, 2.0),
        max_delay=5.0,
        max_retries=2
    )
    throttled_dispatcher = MemoryAdaptiveDispatcher(
        rate_limiter=limiter,
        memory_threshold_percent=70.0
    )
    target_urls = [
        "https://example.com",
        "https://example.org",
        "https://example.net"
    ]
    run_settings = CrawlerRunConfig(page_timeout=30000)

    async with AsyncWebCrawler(config=chromium_settings) as crawler:
        crawl_results = await crawler.arun_many(
            urls=target_urls,
            config=run_settings,
            dispatcher=throttled_dispatcher
        )
        assert len(crawl_results) == len(target_urls)
@pytest.mark.asyncio
async def test_javascript_execution():
    """Crawl example.com with a ``js_code`` snippet and expect a successful result.

    The snippet sets the page background colour and returns it; the test
    asserts only ``result.success``, not the returned value.
    """
    # Assembled line by line; the leading and trailing newlines are part of
    # the snippet text.
    background_script = (
        "\n"
        "document.body.style.backgroundColor = 'red';\n"
        "return document.body.style.backgroundColor;\n"
    )
    chromium_settings = BrowserConfig(
        browser_type="chromium",
        headless=True,
        java_script_enabled=True
    )
    run_settings = CrawlerRunConfig(
        js_code=background_script,
        page_timeout=30000
    )

    async with AsyncWebCrawler(config=chromium_settings) as crawler:
        outcome = await crawler.arun(url="https://example.com", config=run_settings)
        assert outcome.success
@pytest.mark.asyncio
@pytest.mark.parametrize("error_url", [
    "https://invalid.domain.test",
    "https://httpbin.org/status/404",
    "https://httpbin.org/status/503",
    "https://httpbin.org/status/403"
])
async def test_error_handling(error_url):
    """Crawl a URL expected to fail and check that the failure is reported.

    The result must be flagged unsuccessful and must carry an error message.
    """
    run_settings = CrawlerRunConfig(
        page_timeout=10000,  # Short timeout for error cases
        cache_mode=CacheMode.BYPASS
    )
    chromium_settings = BrowserConfig(
        browser_type="chromium",
        headless=True
    )

    async with AsyncWebCrawler(config=chromium_settings) as crawler:
        outcome = await crawler.arun(url=error_url, config=run_settings)
        assert not outcome.success
        assert outcome.error_message is not None
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_regular_url():
    """
    Regression check: ``extraction_strategy.run`` receives the real URL.

    For an ordinary (non-raw) URL, the recording strategy must see exactly
    one ``run`` call, and the URL it was given must equal the crawled URL.
    """
    chromium_settings = BrowserConfig(
        browser_type="chromium",
        headless=True
    )

    async with AsyncWebCrawler(config=chromium_settings) as crawler:
        recorder = MockExtractionStrategy()

        # Ordinary URL, i.e. the non-raw-HTML path.
        regular_url = "https://example.com"
        run_settings = CrawlerRunConfig(
            page_timeout=30000,
            extraction_strategy=recorder,
            cache_mode=CacheMode.BYPASS
        )
        outcome = await crawler.arun(url=regular_url, config=run_settings)

        assert outcome.success
        assert len(recorder.run_calls) == 1
        seen_url = recorder.run_calls[0]
        assert seen_url == regular_url, f"Expected '{regular_url}', got '{seen_url}'"
@pytest.mark.asyncio
async def test_extraction_strategy_run_with_raw_html():
    """
    Regression check: raw HTML input is reported to ``run`` as "Raw HTML".

    When the crawl target starts with "raw:", the recording strategy must see
    exactly one ``run`` call whose URL argument is the literal "Raw HTML",
    not the HTML payload itself.
    """
    chromium_settings = BrowserConfig(
        browser_type="chromium",
        headless=True
    )

    async with AsyncWebCrawler(config=chromium_settings) as crawler:
        recorder = MockExtractionStrategy()

        # "raw:" prefix marks the remainder as inline HTML rather than an address.
        raw_html_url = "raw:<html><body><h1>Test HTML</h1><p>This is a test.</p></body></html>"
        run_settings = CrawlerRunConfig(
            page_timeout=30000,
            extraction_strategy=recorder,
            cache_mode=CacheMode.BYPASS
        )
        outcome = await crawler.arun(url=raw_html_url, config=run_settings)

        assert outcome.success
        assert len(recorder.run_calls) == 1
        seen_url = recorder.run_calls[0]
        assert seen_url == "Raw HTML", f"Expected 'Raw HTML', got '{seen_url}'"
if __name__ == "__main__":
    # Manual runner: each entry pairs a test coroutine function with the
    # positional arguments to call it with. Every test gets its own
    # asyncio.run() call, executed in the listed order.
    manual_runs = [
        (test_viewport_config, ((1024, 768),)),
        (test_memory_management, ()),
        (test_rate_limiting, ()),
        (test_javascript_execution, ()),
        (test_extraction_strategy_run_with_regular_url, ()),
        (test_extraction_strategy_run_with_raw_html, ()),
    ]
    for test_fn, test_args in manual_runs:
        asyncio.run(test_fn(*test_args))