# crawl4ai/tests/docker/test_hooks_client.py

#!/usr/bin/env python3
"""
Test client for demonstrating user-provided hooks in Crawl4AI Docker API
"""

import requests
import json
from typing import Dict, Any


# Base URL that every request in this file is sent to; each test appends its
# endpoint path (e.g. "/crawl") to it.
API_BASE_URL = "http://localhost:11234"  # Adjust if needed


def test_hooks_info() -> None:
    """Query ``GET /hooks/info`` and print each hook point the server reports.

    On a 200 response the ``available_hooks`` mapping of the JSON body is
    walked and every hook's parameters and description are printed. Any other
    status code is reported together with the raw response body.

    Raises:
        requests.exceptions.RequestException: If the server cannot be reached
            or does not answer within the timeout.
    """
    print("=" * 70)
    print("Testing: GET /hooks/info")
    print("=" * 70)

    # requests applies no timeout by default; bound the wait so an
    # unresponsive server cannot hang the script indefinitely.
    response = requests.get(f"{API_BASE_URL}/hooks/info", timeout=30)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return

    data = response.json()
    print("Available Hook Points:")
    for hook, info in data['available_hooks'].items():
        print(f"\n{hook}:")
        print(f"  Parameters: {', '.join(info['parameters'])}")
        print(f"  Description: {info['description']}")


def test_basic_crawl_with_hooks() -> None:
    """Crawl one page via ``POST /crawl`` with three user-provided hooks.

    The hooks are sent as Python source strings keyed by hook point
    (``on_page_context_created``, ``before_retrieve_html``, ``before_goto``).
    On a 200 response the hook status, the optional execution statistics,
    execution log and hook errors, and the per-URL crawl results are printed.
    Any other status code is reported together with the raw response body.

    Raises:
        requests.exceptions.RequestException: If the server cannot be reached
            or does not answer within the timeout.
    """
    print("\n" + "=" * 70)
    print("Testing: POST /crawl with hooks")
    print("=" * 70)

    # Define hooks as Python code strings
    hooks_code = {
        "on_page_context_created": """
async def hook(page, context, **kwargs):
    print("Hook: Setting up page context")
    # Block images to speed up crawling
    await context.route("**/*.{png,jpg,jpeg,gif,webp}", lambda route: route.abort())
    print("Hook: Images blocked")
    return page
""",

        "before_retrieve_html": """
async def hook(page, context, **kwargs):
    print("Hook: Before retrieving HTML")
    # Scroll to bottom to load lazy content
    await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")
    await page.wait_for_timeout(1000)
    print("Hook: Scrolled to bottom")
    return page
""",

        "before_goto": """
async def hook(page, context, url, **kwargs):
    print(f"Hook: About to navigate to {url}")
    # Add custom headers
    await page.set_extra_http_headers({
        'X-Test-Header': 'crawl4ai-hooks-test'
    })
    return page
"""
    }

    # Create request payload
    payload = {
        "urls": ["https://httpbin.org/html"],
        "hooks": {
            "code": hooks_code,
            "timeout": 30
        }
    }

    print("Sending request with hooks...")
    # requests applies no timeout by default; bound the wait so a stuck crawl
    # cannot hang the script indefinitely.
    response = requests.post(f"{API_BASE_URL}/crawl", json=payload, timeout=120)

    if response.status_code != 200:
        print(f"❌ Error: {response.status_code}")
        print(response.text)
        return

    data = response.json()
    print("\n✅ Crawl successful!")

    # Check hooks status
    if 'hooks' in data:
        hooks_info = data['hooks']
        status = hooks_info['status']
        print("\nHooks Execution Summary:")
        print(f"  Status: {status['status']}")
        print(f"  Attached hooks: {', '.join(status['attached_hooks'])}")

        if status['validation_errors']:
            print("\n⚠️ Validation Errors:")
            for error in status['validation_errors']:
                print(f"  - {error['hook_point']}: {error['error']}")

        if 'summary' in hooks_info:
            summary = hooks_info['summary']
            print("\nExecution Statistics:")
            print(f"  Total executions: {summary['total_executions']}")
            print(f"  Successful: {summary['successful']}")
            print(f"  Failed: {summary['failed']}")
            print(f"  Timed out: {summary['timed_out']}")
            print(f"  Success rate: {summary['success_rate']:.1f}%")

        # Like 'summary', the log and error sections are treated as optional
        # so a response without them does not abort the report with KeyError.
        execution_log = hooks_info.get('execution_log')
        if execution_log:
            print("\nExecution Log:")
            for log_entry in execution_log:
                status_icon = "✅" if log_entry['status'] == 'success' else "❌"
                print(f"  {status_icon} {log_entry['hook_point']}: {log_entry['status']} ({log_entry.get('execution_time', 0):.2f}s)")

        hook_errors = hooks_info.get('errors')
        if hook_errors:
            print("\n❌ Hook Errors:")
            for error in hook_errors:
                print(f"  - {error['hook_point']}: {error['error']}")

    # Show crawl results
    if 'results' in data:
        print(f"\nCrawled {len(data['results'])} URL(s)")
        for result in data['results']:
            print(f"  - {result['url']}: {'✅' if result['success'] else '❌'}")


def test_invalid_hook() -> None:
    """Send deliberately broken hooks to ``POST /crawl`` and print the outcome.

    Two faulty hooks are submitted: one defined without ``async`` and one that
    awaits a method that does not exist on the page. On a 200 response the
    hook status, any validation errors, any runtime hook errors, and whether
    the response's ``success`` flag is set are printed. Any other status code
    is reported together with the raw response body.

    Raises:
        requests.exceptions.RequestException: If the server cannot be reached
            or does not answer within the timeout.
    """
    print("\n" + "=" * 70)
    print("Testing: Invalid hook handling")
    print("=" * 70)

    # Intentionally broken hook
    hooks_code = {
        "on_page_context_created": """
def hook(page, context):  # Missing async!
    return page
""",

        "before_retrieve_html": """
async def hook(page, context, **kwargs):
    # This will cause an error
    await page.non_existent_method()
    return page
"""
    }

    payload = {
        "urls": ["https://httpbin.org/html"],
        "hooks": {
            "code": hooks_code,
            "timeout": 5
        }
    }

    print("Sending request with invalid hooks...")
    # requests applies no timeout by default; bound the wait so a stuck crawl
    # cannot hang the script indefinitely.
    response = requests.post(f"{API_BASE_URL}/crawl", json=payload, timeout=120)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        print(response.text)
        return

    data = response.json()
    if 'hooks' not in data:
        return

    hooks_info = data['hooks']
    print(f"\nHooks Status: {hooks_info['status']['status']}")

    if hooks_info['status']['validation_errors']:
        print("\n✅ Validation caught errors (as expected):")
        for error in hooks_info['status']['validation_errors']:
            print(f"  - {error['hook_point']}: {error['error']}")

    # The runtime error list may be absent; treat that the same as empty
    # instead of failing with KeyError.
    runtime_errors = hooks_info.get('errors')
    if runtime_errors:
        print("\n✅ Runtime errors handled gracefully:")
        for error in runtime_errors:
            print(f"  - {error['hook_point']}: {error['error']}")

    # The crawl should still succeed despite hook errors
    if data.get('success'):
        print("\n✅ Crawl succeeded despite hook errors (error isolation working!)")


def test_authentication_hook() -> None:
    """Crawl a basic-auth protected page using hooks that supply credentials.

    A ``before_goto`` hook sets a Basic ``Authorization`` header and an
    ``on_page_context_created`` hook adds a session cookie. On a 200 response
    the function prints whether the request succeeded, how many hooks ran
    successfully, any hook errors, and whether the first result's HTML
    contains the word ``authenticated``. On any other status code the error
    body is printed as JSON when it parses, otherwise as truncated text.

    Raises:
        requests.exceptions.RequestException: If the server cannot be reached
            or does not answer within the timeout.
    """
    print("\n" + "=" * 70)
    print("Testing: Authentication with hooks")
    print("=" * 70)

    hooks_code = {
        "before_goto": """
async def hook(page, context, url, **kwargs):
    # For httpbin.org basic auth test, set Authorization header
    import base64

    # httpbin.org/basic-auth/user/passwd expects username="user" and password="passwd"
    credentials = base64.b64encode(b"user:passwd").decode('ascii')

    await page.set_extra_http_headers({
        'Authorization': f'Basic {credentials}'
    })

    print(f"Hook: Set Authorization header for {url}")
    return page
""",
        "on_page_context_created": """
async def hook(page, context, **kwargs):
    # Example: Add cookies for session tracking
    await context.add_cookies([
        {
            'name': 'session_id',
            'value': 'test_session_123',
            'domain': '.httpbin.org',
            'path': '/',
            'httpOnly': True,
            'secure': True
        }
    ])

    print("Hook: Added session cookie")
    return page
"""
    }

    payload = {
        "urls": ["https://httpbin.org/basic-auth/user/passwd"],
        "hooks": {
            "code": hooks_code,
            "timeout": 30
        }
    }

    print("Sending request with authentication hook...")
    # requests applies no timeout by default; bound the wait so a stuck crawl
    # cannot hang the script indefinitely.
    response = requests.post(f"{API_BASE_URL}/crawl", json=payload, timeout=120)

    if response.status_code != 200:
        print(f"❌ Error: {response.status_code}")
        try:
            error_data = response.json()
        except ValueError:
            # The body is not valid JSON (requests' JSON decode error is a
            # ValueError subclass); fall back to a truncated text dump.
            # Catching only ValueError keeps Ctrl-C and SystemExit working.
            print(f"Error text: {response.text[:500]}")
        else:
            print(f"Error details: {json.dumps(error_data, indent=2)}")
        return

    data = response.json()
    if not data.get('success'):
        print("❌ Request failed")
        print(f"Response: {json.dumps(data, indent=2)}")
        return

    print("✅ Crawl with authentication hook successful")

    # Check if hooks executed
    if 'hooks' in data:
        hooks_info = data['hooks']
        if hooks_info.get('summary', {}).get('successful', 0) > 0:
            print(f"✅ Authentication hooks executed: {hooks_info['summary']['successful']} successful")

        # Check for any hook errors
        if hooks_info.get('errors'):
            print("⚠️ Hook errors:")
            for error in hooks_info['errors']:
                print(f"  - {error}")

    # Check if authentication worked by looking at the result
    if 'results' in data and len(data['results']) > 0:
        result = data['results'][0]
        if result.get('success'):
            print("✅ Page crawled successfully (authentication worked!)")
            # httpbin.org/basic-auth returns JSON with authenticated=true when successful
            if 'authenticated' in str(result.get('html', '')):
                print("✅ Authentication confirmed in response content")
        else:
            print(f"❌ Crawl failed: {result.get('error_message', 'Unknown error')}")


def test_streaming_with_hooks() -> None:
    """Stream a two-URL crawl from ``POST /crawl/stream`` with one hook.

    The ``before_retrieve_html`` hook removes all ``<img>`` elements from the
    page. On a 200 response the ``X-Hooks-Status`` header (if present) is
    printed, then each non-empty streamed line is parsed as JSON and its
    ``url`` or ``status`` field is printed. Lines that are not JSON objects
    are printed raw. Any other status code is reported together with the raw
    response body.

    Raises:
        requests.exceptions.RequestException: If the server cannot be reached
            or stops sending data for longer than the timeout.
    """
    print("\n" + "=" * 70)
    print("Testing: POST /crawl/stream with hooks")
    print("=" * 70)

    hooks_code = {
        "before_retrieve_html": """
async def hook(page, context, **kwargs):
    await page.evaluate("document.querySelectorAll('img').forEach(img => img.remove())")
    return page
"""
    }

    payload = {
        "urls": ["https://httpbin.org/html", "https://httpbin.org/json"],
        "hooks": {
            "code": hooks_code,
            "timeout": 10
        }
    }

    print("Sending streaming request with hooks...")

    # requests applies no timeout by default. With stream=True the read
    # timeout limits the gap between chunks, not the total stream duration.
    with requests.post(
        f"{API_BASE_URL}/crawl/stream", json=payload, stream=True, timeout=120
    ) as response:
        if response.status_code != 200:
            print(f"Error: {response.status_code}")
            print(response.text)
            return

        # Check headers for hooks status
        hooks_status = response.headers.get('X-Hooks-Status')
        if hooks_status:
            print(f"Hooks Status (from header): {hooks_status}")

        print("\nStreaming results:")
        for line in response.iter_lines():
            if not line:
                continue  # skip blank lines in the stream

            try:
                result = json.loads(line)
            except ValueError:
                # ValueError covers both malformed JSON and bytes that fail to
                # decode; errors="replace" keeps the fallback print from
                # raising on those same bytes.
                print(f"  Raw: {line.decode(errors='replace')}")
                continue

            if not isinstance(result, dict):
                # Valid JSON but not an object, so there are no fields to show.
                print(f"  Raw: {line.decode(errors='replace')}")
            elif 'url' in result:
                print(f"  Received: {result['url']}")
            elif 'status' in result:
                print(f"  Stream status: {result['status']}")


def test_basic_without_hooks() -> None:
    """Crawl two URLs via ``POST /crawl`` without sending any hooks.

    On a 200 response the full JSON body is pretty-printed. Any other status
    code is reported together with the raw response body.

    Raises:
        requests.exceptions.RequestException: If the server cannot be reached
            or does not answer within the timeout.
    """
    print("\n" + "=" * 70)
    print("Testing: POST /crawl with no hooks")
    print("=" * 70)

    payload = {
        "urls": ["https://httpbin.org/html", "https://httpbin.org/json"]
    }

    # requests applies no timeout by default; bound the wait so a stuck crawl
    # cannot hang the script indefinitely.
    response = requests.post(f"{API_BASE_URL}/crawl", json=payload, timeout=120)

    if response.status_code != 200:
        print(f"Error: {response.status_code}")
        # Show the body as the other tests do, so failures can be diagnosed.
        print(response.text)
        return

    data = response.json()
    print(f"Response: {json.dumps(data, indent=2)}")


def main():
    """Print the banner, run the enabled test scenarios, print the footer.

    Only the invalid-hook scenario is currently enabled; the other scenarios
    are listed below but commented out.
    """
    divider = "=" * 70

    print("🔧 Crawl4AI Docker API - Hooks Testing")
    print(divider)

    # Scenarios in their intended order. Uncomment an entry to enable it.
    enabled_scenarios = (
        # test_hooks_info,              # Test 1: Get hooks information
        # test_basic_crawl_with_hooks,  # Test 2: Basic crawl with hooks
        test_invalid_hook,              # Test 3: Invalid hooks (error handling)
        # test_authentication_hook,     # Test 4: Authentication hook
        # test_streaming_with_hooks,    # Test 5: Streaming with hooks
        # test_basic_without_hooks,     # Test 6: Basic crawl without hooks
    )
    for run_scenario in enabled_scenarios:
        run_scenario()

    print("\n" + divider)
    print("✅ All tests completed!")
    print(divider)


# Run main() only when this file is executed as a script, not when imported.
if __name__ == "__main__":
    main()