Files
crawl4ai/tests/test_virtual_scroll_api_simple.py

117 lines
4.3 KiB
Python

#!/usr/bin/env python3
"""
Test VirtualScrollConfig with the /crawl API using existing test assets
"""
import requests
import json
import os
import http.server
import socketserver
import threading
import time
from pathlib import Path
def test_virtual_scroll_api():
"""Test the /crawl endpoint with VirtualScrollConfig using test assets"""
# Use the existing test assets
assets_dir = Path(__file__).parent / "docs" / "examples" / "assets"
if not assets_dir.exists():
print(f"❌ Assets directory not found: {assets_dir}")
return
# Start local server for assets
os.chdir(assets_dir)
port = 8081
class QuietHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
def log_message(self, format, *args):
pass # Suppress log messages
try:
with socketserver.TCPServer(("", port), QuietHTTPRequestHandler) as httpd:
server_thread = threading.Thread(target=httpd.serve_forever)
server_thread.daemon = True
server_thread.start()
time.sleep(0.5) # Give server time to start
# Test with Twitter-like virtual scroll
url = f"http://docs.crawl4ai.com/examples/assets/virtual_scroll_twitter_like.html"
payload = {
"urls": [url],
"browser_config": {
"type": "BrowserConfig",
"params": {
"headless": True,
"viewport_width": 1280,
"viewport_height": 800
}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"virtual_scroll_config": {
"type": "VirtualScrollConfig",
"params": {
"container_selector": "#timeline",
"scroll_count": 10,
"scroll_by": "container_height",
"wait_after_scroll": 0.3
}
},
"cache_mode": "bypass",
"extraction_strategy": {
"type": "NoExtractionStrategy",
"params": {}
}
}
}
}
print("Testing VirtualScrollConfig with /crawl endpoint...")
print(f"Test URL: {url}")
print("Payload:")
print(json.dumps(payload, indent=2))
response = requests.post(
"http://localhost:11234/crawl",
json=payload,
headers={"Content-Type": "application/json"},
timeout=60 # Longer timeout for virtual scroll
)
print(f"\nResponse Status: {response.status_code}")
if response.status_code == 200:
result = response.json()
print("✅ Success! VirtualScrollConfig is working with the API.")
print(f"Content length: {len(result[0]['content']['raw_content'])} characters")
# Check if we captured multiple posts (indicating virtual scroll worked)
content = result[0]['content']['raw_content']
post_count = content.count("Post #")
print(f"Found {post_count} posts in the content")
if post_count > 5: # Should capture more than just the initial posts
print("✅ Virtual scroll successfully captured additional content!")
else:
print("⚠️ Virtual scroll may not have captured much additional content")
# Print a snippet of the content
content_preview = content[:1000] + "..." if len(content) > 1000 else content
print(f"\nContent preview:\n{content_preview}")
else:
print(f"❌ Error: {response.status_code}")
print(f"Response: {response.text}")
except requests.exceptions.Timeout:
print("❌ Request timed out - virtual scroll may be taking too long")
except Exception as e:
print(f"❌ Test failed with error: {e}")
if __name__ == "__main__":
test_virtual_scroll_api()