#!/usr/bin/env python3
"""
Crawl4AI v0.7.8 Release Demo - Verification Tests
==================================================

This demo ACTUALLY RUNS and VERIFIES the bug fixes in v0.7.8.
Each test executes real code and validates the fix is working.

Bug Fixes Verified:
1. ProxyConfig JSON serialization (#1629)
2. Configurable backoff parameters (#1269)
3. LLM Strategy input_format support (#1178)
4. Raw HTML URL variable (#1116)
5. Relative URLs after redirects (#1268)
6. pypdf migration (#1412)
7. Pydantic v2 ConfigDict (#678)
8. Docker ContentRelevanceFilter (#1642) - requires Docker
9. Docker .cache permissions (#1638) - requires Docker
10. AdaptiveCrawler query expansion (#1621) - requires LLM API key
11. Import statement formatting (#1181)

Usage:
    python docs/releases_review/demo_v0.7.8.py

    For Docker tests:
    docker run -d -p 11235:11235 --shm-size=1g unclecode/crawl4ai:0.7.8
    python docs/releases_review/demo_v0.7.8.py
"""

import asyncio
import json
import sys
import warnings
import os
import tempfile
from typing import Tuple, Optional
from dataclasses import dataclass


# Test results tracking
@dataclass
class TestResult:
    # Human-readable test name, e.g. "ProxyConfig Serialization".
    name: str
    # GitHub issue reference, e.g. "#1629", or "Overall"/"Unknown".
    issue: str
    passed: bool
    message: str
    skipped: bool = False


# Module-level accumulator; every test appends into it via record_result().
results: list[TestResult] = []


def print_header(title: str):
    """Print a 70-char banner used to delimit major sections of output."""
    print(f"\n{'=' * 70}")
    print(f"{title}")
    print(f"{'=' * 70}")


def print_test(name: str, issue: str):
    """Announce the start of a single test case on stdout."""
    print(f"\n[TEST] {name} ({issue})")
    print("-" * 50)


def record_result(name: str, issue: str, passed: bool, message: str,
                  skipped: bool = False):
    """Append a TestResult to the global list and echo its status."""
    results.append(TestResult(name, issue, passed, message, skipped))
    if skipped:
        print(f" SKIPPED: {message}")
    elif passed:
        print(f" PASSED: {message}")
    else:
        print(f" FAILED: {message}")


# =============================================================================
# TEST 1: ProxyConfig JSON Serialization (#1629)
# =============================================================================

async def test_proxy_config_serialization():
    """
    Verify BrowserConfig.to_dict() properly serializes ProxyConfig to JSON.

    BEFORE: ProxyConfig was included as object, causing JSON serialization to fail
    AFTER: ProxyConfig.to_dict() is called, producing valid JSON
    """
    print_test("ProxyConfig JSON Serialization", "#1629")
    try:
        from crawl4ai import BrowserConfig
        from crawl4ai.async_configs import ProxyConfig

        # Create config with ProxyConfig
        proxy = ProxyConfig(
            server="http://proxy.example.com:8080",
            username="testuser",
            password="testpass"
        )
        browser_config = BrowserConfig(headless=True, proxy_config=proxy)

        # Test 1: to_dict() should return dict for proxy_config
        config_dict = browser_config.to_dict()
        proxy_dict = config_dict.get('proxy_config')
        if not isinstance(proxy_dict, dict):
            record_result("ProxyConfig Serialization", "#1629", False,
                          f"proxy_config is {type(proxy_dict)}, expected dict")
            return

        # Test 2: Should be JSON serializable
        try:
            json_str = json.dumps(config_dict)
            json.loads(json_str)  # Verify valid JSON
        except (TypeError, json.JSONDecodeError) as e:
            record_result("ProxyConfig Serialization", "#1629", False,
                          f"JSON serialization failed: {e}")
            return

        # Test 3: Verify proxy data is preserved
        if proxy_dict.get('server') != "http://proxy.example.com:8080":
            record_result("ProxyConfig Serialization", "#1629", False,
                          "Proxy server not preserved in serialization")
            return

        record_result("ProxyConfig Serialization", "#1629", True,
                      "BrowserConfig with ProxyConfig serializes to valid JSON")
    except Exception as e:
        # Any import/runtime failure counts as a test failure, not a crash.
        record_result("ProxyConfig Serialization", "#1629", False, f"Exception: {e}")


# =============================================================================
# TEST 2: Configurable Backoff Parameters (#1269)
# =============================================================================

async def test_configurable_backoff():
    """
    Verify LLMConfig accepts and stores backoff configuration parameters.

    BEFORE: Backoff was hardcoded (delay=2, attempts=3, factor=2)
    AFTER: LLMConfig accepts backoff_base_delay, backoff_max_attempts,
           backoff_exponential_factor
    """
    print_test("Configurable Backoff Parameters", "#1269")
    try:
        from crawl4ai import LLMConfig

        # Test 1: Default values
        default_config = LLMConfig(provider="openai/gpt-4o-mini")
        if default_config.backoff_base_delay != 2:
            record_result("Configurable Backoff", "#1269", False,
                          f"Default base_delay is {default_config.backoff_base_delay}, expected 2")
            return
        if default_config.backoff_max_attempts != 3:
            record_result("Configurable Backoff", "#1269", False,
                          f"Default max_attempts is {default_config.backoff_max_attempts}, expected 3")
            return
        if default_config.backoff_exponential_factor != 2:
            record_result("Configurable Backoff", "#1269", False,
                          f"Default exponential_factor is {default_config.backoff_exponential_factor}, expected 2")
            return

        # Test 2: Custom values
        custom_config = LLMConfig(
            provider="openai/gpt-4o-mini",
            backoff_base_delay=5,
            backoff_max_attempts=10,
            backoff_exponential_factor=3
        )
        if custom_config.backoff_base_delay != 5:
            record_result("Configurable Backoff", "#1269", False,
                          f"Custom base_delay is {custom_config.backoff_base_delay}, expected 5")
            return
        if custom_config.backoff_max_attempts != 10:
            record_result("Configurable Backoff", "#1269", False,
                          f"Custom max_attempts is {custom_config.backoff_max_attempts}, expected 10")
            return
        if custom_config.backoff_exponential_factor != 3:
            record_result("Configurable Backoff", "#1269", False,
                          f"Custom exponential_factor is {custom_config.backoff_exponential_factor}, expected 3")
            return

        # Test 3: to_dict() includes backoff params
        config_dict = custom_config.to_dict()
        if 'backoff_base_delay' not in config_dict:
            record_result("Configurable Backoff", "#1269", False,
                          "backoff_base_delay missing from to_dict()")
            return

        record_result("Configurable Backoff", "#1269", True,
                      "LLMConfig accepts and stores custom backoff parameters")
    except Exception as e:
        record_result("Configurable Backoff", "#1269", False, f"Exception: {e}")


# =============================================================================
# TEST 3: LLM Strategy Input Format (#1178)
# =============================================================================

async def test_llm_input_format():
    """
    Verify LLMExtractionStrategy accepts input_format parameter.

    BEFORE: Always used markdown input
    AFTER: Supports "markdown", "html", "fit_markdown", "cleaned_html", "fit_html"
    """
    print_test("LLM Strategy Input Format", "#1178")
    try:
        from crawl4ai import LLMExtractionStrategy, LLMConfig

        llm_config = LLMConfig(provider="openai/gpt-4o-mini")

        # Test 1: Default is markdown
        default_strategy = LLMExtractionStrategy(
            llm_config=llm_config,
            instruction="Extract data"
        )
        if default_strategy.input_format != "markdown":
            record_result("LLM Input Format", "#1178", False,
                          f"Default input_format is '{default_strategy.input_format}', expected 'markdown'")
            return

        # Test 2: Can set to html
        html_strategy = LLMExtractionStrategy(
            llm_config=llm_config,
            instruction="Extract data",
            input_format="html"
        )
        if html_strategy.input_format != "html":
            record_result("LLM Input Format", "#1178", False,
                          f"HTML input_format is '{html_strategy.input_format}', expected 'html'")
            return

        # Test 3: Can set to fit_markdown
        fit_strategy = LLMExtractionStrategy(
            llm_config=llm_config,
            instruction="Extract data",
            input_format="fit_markdown"
        )
        if fit_strategy.input_format != "fit_markdown":
            record_result("LLM Input Format", "#1178", False,
                          f"fit_markdown input_format is '{fit_strategy.input_format}'")
            return

        record_result("LLM Input Format", "#1178", True,
                      "LLMExtractionStrategy accepts all input_format options")
    except Exception as e:
        record_result("LLM Input Format", "#1178", False, f"Exception: {e}")


# =============================================================================
# TEST 4: Raw HTML URL Variable (#1116)
# =============================================================================
async def test_raw_html_url_variable(): """ Verify that raw: prefix URLs pass "Raw HTML" to extraction strategy. BEFORE: Entire HTML blob was passed as URL parameter AFTER: "Raw HTML" string is passed as URL parameter """ print_test("Raw HTML URL Variable", "#1116") try: from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai.extraction_strategy import ExtractionStrategy # Custom strategy to capture what URL is passed class URLCapturingStrategy(ExtractionStrategy): captured_url = None def extract(self, url: str, html: str, *args, **kwargs): URLCapturingStrategy.captured_url = url return [{"content": "test"}] html_content = "
import os
import sys
from pathlib import Path
from typing import List, Dict
def main():
pass
"""
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url=f"raw:{html_with_code}",
config=CrawlerRunConfig()
)
markdown = result.markdown.raw_markdown if result.markdown else ""
# Check that imports are not concatenated on the same line
# Bad: "import osimport sys" (no newline between statements)
# This is the actual bug - statements getting merged on same line
bad_patterns = [
"import os import sys", # Space but no newline
"import osimport sys", # No space or newline
"import os from pathlib", # Space but no newline
"import osfrom pathlib", # No space or newline
]
markdown_single_line = markdown.replace('\n', ' ') # Convert newlines to spaces
for pattern in bad_patterns:
# Check if pattern exists without proper line separation
if pattern.replace(' ', '') in markdown_single_line.replace(' ', ''):
# Verify it's actually on same line (not just adjacent after newline removal)
lines = markdown.split('\n')
for line in lines:
if 'import' in line.lower():
# Count import statements on this line
import_count = line.lower().count('import ')
if import_count > 1:
record_result("Import Formatting", "#1181", False,
f"Multiple imports on same line: {line[:60]}...")
return
# Verify imports are present
if "import" in markdown.lower():
record_result("Import Formatting", "#1181", True,
"Import statements are properly line-separated")
else:
record_result("Import Formatting", "#1181", True,
"No import statements found to verify (test HTML may have changed)")
except Exception as e:
record_result("Import Formatting", "#1181", False, f"Exception: {e}")
# =============================================================================
# COMPREHENSIVE CRAWL TEST
# =============================================================================
async def test_comprehensive_crawl():
    """Run a comprehensive crawl to verify overall stability."""
    print_test("Comprehensive Crawl Test", "Overall")
    try:
        from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig

        browser_cfg = BrowserConfig(headless=True)
        async with AsyncWebCrawler(config=browser_cfg) as crawler:
            result = await crawler.arun(
                url="https://httpbin.org/html",
                config=CrawlerRunConfig()
            )
            # Bail out immediately if the crawl itself failed.
            if not result.success:
                record_result("Comprehensive Crawl", "Overall", False,
                              f"Crawl failed: {result.error_message}")
                return

            # Collect a short description of each check that held.
            checks = ["success=True"]
            if result.html and len(result.html) > 100:
                checks.append(f"html={len(result.html)} chars")
            if result.markdown and result.markdown.raw_markdown:
                checks.append(f"markdown={len(result.markdown.raw_markdown)} chars")
            if result.redirected_url:
                checks.append("redirected_url present")

            record_result("Comprehensive Crawl", "Overall", True,
                          f"All checks passed: {', '.join(checks)}")
    except Exception as e:
        record_result("Comprehensive Crawl", "Overall", False, f"Exception: {e}")
# =============================================================================
# MAIN
# =============================================================================
def print_summary():
    """Summarize all recorded results; return True when nothing failed."""
    print_header("TEST RESULTS SUMMARY")

    # Partition results once instead of re-scanning per category.
    skipped_results = [r for r in results if r.skipped]
    failed_results = [r for r in results if not r.skipped and not r.passed]
    skipped = len(skipped_results)
    failed = len(failed_results)
    passed = len(results) - skipped - failed

    print(f"\nTotal: {len(results)} tests")
    print(f" Passed: {passed}")
    print(f" Failed: {failed}")
    print(f" Skipped: {skipped}")

    if failed_results:
        print("\nFailed Tests:")
        for r in failed_results:
            print(f" - {r.name} ({r.issue}): {r.message}")
    if skipped_results:
        print("\nSkipped Tests:")
        for r in skipped_results:
            print(f" - {r.name} ({r.issue}): {r.message}")

    print("\n" + "=" * 70)
    if failed == 0:
        print("All tests passed! v0.7.8 bug fixes verified.")
    else:
        print(f"WARNING: {failed} test(s) failed!")
    print("=" * 70)
    return failed == 0
async def main():
    """Run all verification tests"""
    print_header("Crawl4AI v0.7.8 - Bug Fix Verification Tests")
    print("Running actual tests to verify bug fixes...")

    # One entry per verified GitHub issue (see module docstring for details).
    tests = (
        test_proxy_config_serialization,   # #1629
        test_configurable_backoff,         # #1269
        test_llm_input_format,             # #1178
        test_raw_html_url_variable,        # #1116
        test_redirect_url_handling,        # #1268
        test_pypdf_migration,              # #1412
        test_pydantic_configdict,          # #678
        test_docker_content_filter,        # #1642
        test_docker_cache_permissions,     # #1638
        test_adaptive_crawler_embedding,   # #1621
        test_import_formatting,            # #1181
        test_comprehensive_crawl,          # Overall
    )
    for test_func in tests:
        try:
            await test_func()
        except Exception as e:
            # A crash inside one test must not abort the whole suite;
            # record it as a failure and continue with the next test.
            print(f"\nTest {test_func.__name__} crashed: {e}")
            results.append(TestResult(
                test_func.__name__,
                "Unknown",
                False,
                f"Crashed: {e}"
            ))

    # Exit code 0 only when every non-skipped test passed.
    return 0 if print_summary() else 1
if __name__ == "__main__":
    # Script entry point: propagate main()'s exit code to the shell.
    try:
        sys.exit(asyncio.run(main()))
    except KeyboardInterrupt:
        print("\n\nTests interrupted by user.")
        sys.exit(1)
    except Exception as e:
        # Unexpected top-level failure: show the traceback for debugging.
        print(f"\n\nTest suite failed: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)