# crawl4ai/tests/test_virtual_scroll_api_simple.py

#!/usr/bin/env python3
"""
Test VirtualScrollConfig with the /crawl API using existing test assets
"""

import requests
import json
import os
import http.server
import socketserver
import threading
import time
from pathlib import Path

def test_virtual_scroll_api():
    """Smoke-test the /crawl endpoint with a VirtualScrollConfig payload.

    Starts a quiet static HTTP server for the example assets directory,
    POSTs a crawl request carrying a ``VirtualScrollConfig`` to
    ``http://localhost:11234/crawl`` and prints a human-readable report
    (status code, content length, number of ``"Post #"`` occurrences and a
    content preview).

    The function is best-effort and never raises: a missing assets
    directory, a non-200 response, a timeout, or any other exception is
    reported on stdout.

    Returns:
        None
    """

    # Use the existing test assets
    assets_dir = Path(__file__).parent / "docs" / "examples" / "assets"
    if not assets_dir.exists():
        print(f"❌ Assets directory not found: {assets_dir}")
        return

    port = 8081

    class QuietHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
        """Static-file handler rooted at the assets directory, logging off."""

        def __init__(self, *args, **kwargs):
            # Root the handler at assets_dir explicitly instead of calling
            # os.chdir(): chdir mutates process-wide state and would leak
            # into every test that runs after this one.
            super().__init__(*args, directory=str(assets_dir), **kwargs)

        def log_message(self, format, *args):
            pass  # Suppress log messages

    # Test with Twitter-like virtual scroll.
    # NOTE(review): the crawl target is the hosted docs URL, not the local
    # server started below — presumably because the API server cannot reach
    # this host's localhost (e.g. when it runs in a container); confirm.
    url = "http://docs.crawl4ai.com/examples/assets/virtual_scroll_twitter_like.html"

    payload = {
        "urls": [url],
        "browser_config": {
            "type": "BrowserConfig",
            "params": {
                "headless": True,
                "viewport_width": 1280,
                "viewport_height": 800
            }
        },
        "crawler_config": {
            "type": "CrawlerRunConfig",
            "params": {
                "virtual_scroll_config": {
                    "type": "VirtualScrollConfig",
                    "params": {
                        "container_selector": "#timeline",
                        "scroll_count": 10,
                        "scroll_by": "container_height",
                        "wait_after_scroll": 0.3
                    }
                },
                "cache_mode": "bypass",
                "extraction_strategy": {
                    "type": "NoExtractionStrategy",
                    "params": {}
                }
            }
        }
    }

    try:
        with socketserver.TCPServer(("", port), QuietHTTPRequestHandler) as httpd:
            server_thread = threading.Thread(
                target=httpd.serve_forever, daemon=True
            )
            server_thread.start()

            try:
                time.sleep(0.5)  # Give server time to start

                print("Testing VirtualScrollConfig with /crawl endpoint...")
                print(f"Test URL: {url}")
                print("Payload:")
                print(json.dumps(payload, indent=2))

                response = requests.post(
                    "http://localhost:11234/crawl",
                    json=payload,
                    headers={"Content-Type": "application/json"},
                    timeout=60  # Longer timeout for virtual scroll
                )

                print(f"\nResponse Status: {response.status_code}")

                if response.status_code == 200:
                    result = response.json()
                    print("✅ Success! VirtualScrollConfig is working with the API.")

                    # NOTE(review): assumes the body is a list whose first
                    # item exposes content.raw_content; any other shape is
                    # reported by the generic handler below — confirm
                    # against the API's response schema.
                    content = result[0]['content']['raw_content']
                    print(f"Content length: {len(content)} characters")

                    # Check if we captured multiple posts (indicating virtual scroll worked)
                    post_count = content.count("Post #")
                    print(f"Found {post_count} posts in the content")

                    if post_count > 5:  # Should capture more than just the initial posts
                        print("✅ Virtual scroll successfully captured additional content!")
                    else:
                        print("⚠️  Virtual scroll may not have captured much additional content")

                    # Print a snippet of the content
                    content_preview = content[:1000] + "..." if len(content) > 1000 else content
                    print(f"\nContent preview:\n{content_preview}")

                else:
                    print(f"❌ Error: {response.status_code}")
                    print(f"Response: {response.text}")
            finally:
                # Stop serve_forever() before the `with` block closes the
                # listening socket; otherwise the background thread keeps
                # polling a closed socket.
                httpd.shutdown()
                server_thread.join()

    except requests.exceptions.Timeout:
        print("❌ Request timed out - virtual scroll may be taking too long")
    except Exception as e:
        # Top-level boundary of a script-style check: report, don't crash.
        print(f"❌ Test failed with error: {e}")

# Allow running this file directly as a script, outside of a test runner.
if __name__ == "__main__":
    test_virtual_scroll_api()