#!/usr/bin/env python3
"""
Crawl4AI v0.7.8 Release Demo - Verification Tests
==================================================

This demo ACTUALLY RUNS and VERIFIES the bug fixes in v0.7.8.
Each test executes real code and validates the fix is working.

Bug Fixes Verified:
1. ProxyConfig JSON serialization (#1629)
2. Configurable backoff parameters (#1269)
3. LLM Strategy input_format support (#1178)
4. Raw HTML URL variable (#1116)
5. Relative URLs after redirects (#1268)
6. pypdf migration (#1412)
7. Pydantic v2 ConfigDict (#678)
8. Docker ContentRelevanceFilter (#1642) - requires Docker
9. Docker .cache permissions (#1638) - requires Docker
10. AdaptiveCrawler query expansion (#1621) - requires LLM API key
11. Import statement formatting (#1181)

Usage:
    python docs/releases_review/demo_v0.7.8.py

    For Docker tests:
    docker run -d -p 11235:11235 --shm-size=1g unclecode/crawl4ai:0.7.8
    python docs/releases_review/demo_v0.7.8.py
"""

import asyncio
import json
import sys
import warnings
import os
import tempfile
from typing import Tuple, Optional
from dataclasses import dataclass


# Test results tracking
@dataclass
class TestResult:
    """One verification outcome: which test, which GitHub issue, and how it ended."""
    name: str       # human-readable test name
    issue: str      # GitHub issue reference, e.g. "#1629"
    passed: bool    # True when the fix was verified
    message: str    # detail shown in the summary
    skipped: bool = False  # True when prerequisites (Docker, API key) were absent


# Global accumulator inspected by print_summary() at the end of the run.
results: list[TestResult] = []


def print_header(title: str):
    """Print *title* framed by 70-char '=' rules."""
    rule = '=' * 70
    print(f"\n{rule}")
    print(title)
    print(rule)


def print_test(name: str, issue: str):
    """Announce an individual test with its issue number."""
    print(f"\n[TEST] {name} ({issue})")
    print("-" * 50)


def record_result(name: str, issue: str, passed: bool, message: str,
                  skipped: bool = False):
    """Append a TestResult to the global list and echo its status line."""
    entry = TestResult(name, issue, passed, message, skipped)
    results.append(entry)
    if skipped:
        status = f" SKIPPED: {message}"
    elif passed:
        status = f" PASSED: {message}"
    else:
        status = f" FAILED: {message}"
    print(status)


# =============================================================================
# TEST 1: ProxyConfig JSON Serialization (#1629)
# =============================================================================

async def test_proxy_config_serialization():
    """
    Verify BrowserConfig.to_dict() properly serializes ProxyConfig to JSON.

    BEFORE: ProxyConfig was included as object, causing JSON serialization to fail
    AFTER: ProxyConfig.to_dict() is called, producing valid JSON
    """
    print_test("ProxyConfig JSON Serialization", "#1629")

    try:
        from crawl4ai import BrowserConfig
        from crawl4ai.async_configs import ProxyConfig

        # Build a BrowserConfig that embeds a ProxyConfig object.
        proxy_cfg = ProxyConfig(
            server="http://proxy.example.com:8080",
            username="testuser",
            password="testpass"
        )
        cfg = BrowserConfig(headless=True, proxy_config=proxy_cfg)

        # Check 1: to_dict() must flatten the proxy into a plain dict.
        serialized = cfg.to_dict()
        proxy_entry = serialized.get('proxy_config')
        if not isinstance(proxy_entry, dict):
            record_result("ProxyConfig Serialization", "#1629", False,
                          f"proxy_config is {type(proxy_entry)}, expected dict")
            return

        # Check 2: the whole config must survive a JSON round-trip.
        try:
            payload = json.dumps(serialized)
            json.loads(payload)  # Verify valid JSON
        except (TypeError, json.JSONDecodeError) as e:
            record_result("ProxyConfig Serialization", "#1629", False,
                          f"JSON serialization failed: {e}")
            return

        # Check 3: the proxy server value must be preserved verbatim.
        if proxy_entry.get('server') != "http://proxy.example.com:8080":
            record_result("ProxyConfig Serialization", "#1629", False,
                          "Proxy server not preserved in serialization")
            return

        record_result("ProxyConfig Serialization", "#1629", True,
                      "BrowserConfig with ProxyConfig serializes to valid JSON")

    except Exception as e:
        record_result("ProxyConfig Serialization", "#1629", False,
                      f"Exception: {e}")


# =============================================================================
# TEST 2: Configurable Backoff Parameters (#1269)
# =============================================================================

async def test_configurable_backoff():
    """
    Verify LLMConfig accepts and stores backoff configuration parameters.

    BEFORE: Backoff was hardcoded (delay=2, attempts=3, factor=2)
    AFTER: LLMConfig accepts backoff_base_delay, backoff_max_attempts,
           backoff_exponential_factor
    """
    print_test("Configurable Backoff Parameters", "#1269")

    try:
        from crawl4ai import LLMConfig

        # (attribute name, label used in failure messages)
        fields = [
            ("backoff_base_delay", "base_delay"),
            ("backoff_max_attempts", "max_attempts"),
            ("backoff_exponential_factor", "exponential_factor"),
        ]

        # Check 1: defaults match the previously hardcoded values.
        default_config = LLMConfig(provider="openai/gpt-4o-mini")
        for (attr, label), expected in zip(fields, (2, 3, 2)):
            actual = getattr(default_config, attr)
            if actual != expected:
                record_result("Configurable Backoff", "#1269", False,
                              f"Default {label} is {actual}, expected {expected}")
                return

        # Check 2: explicitly supplied values are stored as given.
        custom_config = LLMConfig(
            provider="openai/gpt-4o-mini",
            backoff_base_delay=5,
            backoff_max_attempts=10,
            backoff_exponential_factor=3
        )
        for (attr, label), expected in zip(fields, (5, 10, 3)):
            actual = getattr(custom_config, attr)
            if actual != expected:
                record_result("Configurable Backoff", "#1269", False,
                              f"Custom {label} is {actual}, expected {expected}")
                return

        # Check 3: to_dict() carries the backoff parameters along.
        config_dict = custom_config.to_dict()
        if 'backoff_base_delay' not in config_dict:
            record_result("Configurable Backoff", "#1269", False,
                          "backoff_base_delay missing from to_dict()")
            return

        record_result("Configurable Backoff", "#1269", True,
                      "LLMConfig accepts and stores custom backoff parameters")

    except Exception as e:
        record_result("Configurable Backoff", "#1269", False,
                      f"Exception: {e}")


# =============================================================================
# TEST 3: LLM Strategy Input Format (#1178)
# =============================================================================

async def test_llm_input_format():
    """
    Verify LLMExtractionStrategy accepts input_format parameter.

    BEFORE: Always used markdown input
    AFTER: Supports "markdown", "html", "fit_markdown", "cleaned_html", "fit_html"
    """
    print_test("LLM Strategy Input Format", "#1178")

    try:
        from crawl4ai import LLMExtractionStrategy, LLMConfig

        llm_config = LLMConfig(provider="openai/gpt-4o-mini")

        # Check 1: omitting input_format defaults to markdown.
        default_strategy = LLMExtractionStrategy(
            llm_config=llm_config,
            instruction="Extract data"
        )
        if default_strategy.input_format != "markdown":
            record_result("LLM Input Format", "#1178", False,
                          f"Default input_format is '{default_strategy.input_format}', expected 'markdown'")
            return

        # Check 2: "html" is accepted and stored.
        html_strategy = LLMExtractionStrategy(
            llm_config=llm_config,
            instruction="Extract data",
            input_format="html"
        )
        if html_strategy.input_format != "html":
            record_result("LLM Input Format", "#1178", False,
                          f"HTML input_format is '{html_strategy.input_format}', expected 'html'")
            return

        # Check 3: "fit_markdown" is accepted and stored.
        fit_strategy = LLMExtractionStrategy(
            llm_config=llm_config,
            instruction="Extract data",
            input_format="fit_markdown"
        )
        if fit_strategy.input_format != "fit_markdown":
            record_result("LLM Input Format", "#1178", False,
                          f"fit_markdown input_format is '{fit_strategy.input_format}'")
            return

        record_result("LLM Input Format", "#1178", True,
                      "LLMExtractionStrategy accepts all input_format options")

    except Exception as e:
        record_result("LLM Input Format", "#1178", False, f"Exception: {e}")


# =============================================================================
# TEST 4: Raw HTML URL Variable (#1116)
# =============================================================================
async def test_raw_html_url_variable(): """ Verify that raw: prefix URLs pass "Raw HTML" to extraction strategy. BEFORE: Entire HTML blob was passed as URL parameter AFTER: "Raw HTML" string is passed as URL parameter """ print_test("Raw HTML URL Variable", "#1116") try: from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai.extraction_strategy import ExtractionStrategy # Custom strategy to capture what URL is passed class URLCapturingStrategy(ExtractionStrategy): captured_url = None def extract(self, url: str, html: str, *args, **kwargs): URLCapturingStrategy.captured_url = url return [{"content": "test"}] html_content = "

Test

" strategy = URLCapturingStrategy() async with AsyncWebCrawler() as crawler: result = await crawler.arun( url=f"raw:{html_content}", config=CrawlerRunConfig( extraction_strategy=strategy ) ) captured = URLCapturingStrategy.captured_url if captured is None: record_result("Raw HTML URL Variable", "#1116", False, "Extraction strategy was not called") return if captured == html_content or captured.startswith("

import os
import sys
from pathlib import Path
from typing import List, Dict

def main():
    pass
        
""" async with AsyncWebCrawler() as crawler: result = await crawler.arun( url=f"raw:{html_with_code}", config=CrawlerRunConfig() ) markdown = result.markdown.raw_markdown if result.markdown else "" # Check that imports are not concatenated on the same line # Bad: "import osimport sys" (no newline between statements) # This is the actual bug - statements getting merged on same line bad_patterns = [ "import os import sys", # Space but no newline "import osimport sys", # No space or newline "import os from pathlib", # Space but no newline "import osfrom pathlib", # No space or newline ] markdown_single_line = markdown.replace('\n', ' ') # Convert newlines to spaces for pattern in bad_patterns: # Check if pattern exists without proper line separation if pattern.replace(' ', '') in markdown_single_line.replace(' ', ''): # Verify it's actually on same line (not just adjacent after newline removal) lines = markdown.split('\n') for line in lines: if 'import' in line.lower(): # Count import statements on this line import_count = line.lower().count('import ') if import_count > 1: record_result("Import Formatting", "#1181", False, f"Multiple imports on same line: {line[:60]}...") return # Verify imports are present if "import" in markdown.lower(): record_result("Import Formatting", "#1181", True, "Import statements are properly line-separated") else: record_result("Import Formatting", "#1181", True, "No import statements found to verify (test HTML may have changed)") except Exception as e: record_result("Import Formatting", "#1181", False, f"Exception: {e}") # ============================================================================= # COMPREHENSIVE CRAWL TEST # ============================================================================= async def test_comprehensive_crawl(): """ Run a comprehensive crawl to verify overall stability. 
""" print_test("Comprehensive Crawl Test", "Overall") try: from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BrowserConfig async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler: result = await crawler.arun( url="https://httpbin.org/html", config=CrawlerRunConfig() ) # Verify result checks = [] if result.success: checks.append("success=True") else: record_result("Comprehensive Crawl", "Overall", False, f"Crawl failed: {result.error_message}") return if result.html and len(result.html) > 100: checks.append(f"html={len(result.html)} chars") if result.markdown and result.markdown.raw_markdown: checks.append(f"markdown={len(result.markdown.raw_markdown)} chars") if result.redirected_url: checks.append("redirected_url present") record_result("Comprehensive Crawl", "Overall", True, f"All checks passed: {', '.join(checks)}") except Exception as e: record_result("Comprehensive Crawl", "Overall", False, f"Exception: {e}") # ============================================================================= # MAIN # ============================================================================= def print_summary(): """Print test results summary""" print_header("TEST RESULTS SUMMARY") passed = sum(1 for r in results if r.passed and not r.skipped) failed = sum(1 for r in results if not r.passed and not r.skipped) skipped = sum(1 for r in results if r.skipped) print(f"\nTotal: {len(results)} tests") print(f" Passed: {passed}") print(f" Failed: {failed}") print(f" Skipped: {skipped}") if failed > 0: print("\nFailed Tests:") for r in results: if not r.passed and not r.skipped: print(f" - {r.name} ({r.issue}): {r.message}") if skipped > 0: print("\nSkipped Tests:") for r in results: if r.skipped: print(f" - {r.name} ({r.issue}): {r.message}") print("\n" + "=" * 70) if failed == 0: print("All tests passed! 
v0.7.8 bug fixes verified.") else: print(f"WARNING: {failed} test(s) failed!") print("=" * 70) return failed == 0 async def main(): """Run all verification tests""" print_header("Crawl4AI v0.7.8 - Bug Fix Verification Tests") print("Running actual tests to verify bug fixes...") # Run all tests tests = [ test_proxy_config_serialization, # #1629 test_configurable_backoff, # #1269 test_llm_input_format, # #1178 test_raw_html_url_variable, # #1116 test_redirect_url_handling, # #1268 test_pypdf_migration, # #1412 test_pydantic_configdict, # #678 test_docker_content_filter, # #1642 test_docker_cache_permissions, # #1638 test_adaptive_crawler_embedding, # #1621 test_import_formatting, # #1181 test_comprehensive_crawl, # Overall ] for test_func in tests: try: await test_func() except Exception as e: print(f"\nTest {test_func.__name__} crashed: {e}") results.append(TestResult( test_func.__name__, "Unknown", False, f"Crashed: {e}" )) # Print summary all_passed = print_summary() return 0 if all_passed else 1 if __name__ == "__main__": try: exit_code = asyncio.run(main()) sys.exit(exit_code) except KeyboardInterrupt: print("\n\nTests interrupted by user.") sys.exit(1) except Exception as e: print(f"\n\nTest suite failed: {e}") import traceback traceback.print_exc() sys.exit(1)