feat: Add comprehensive tests for URL discovery and virtual scroll functionality

2025-10-16 10:35:48 +08:00
parent 674d0741da
commit 74eeff4c51
6 changed files with 287 additions and 0 deletions
--- a/tests/test_virtual_scroll_api_simple.py
+++ b/tests/test_virtual_scroll_api_simple.py
@@ -0,0 +1,117 @@
+#!/usr/bin/env python3
+"""
+Test VirtualScrollConfig with the /crawl API using existing test assets
+"""
+
+import requests
+import json
+import os
+import http.server
+import socketserver
+import threading
+import time
+from pathlib import Path
+
+def test_virtual_scroll_api():
+    """Test the /crawl endpoint with VirtualScrollConfig using test assets"""
+
+    # Use the existing test assets
+    assets_dir = Path(__file__).parent / "docs" / "examples" / "assets"
+    if not assets_dir.exists():
+        print(f"❌ Assets directory not found: {assets_dir}")
+        return
+
+    # Start local server for assets
+    os.chdir(assets_dir)
+    port = 8081
+
+    class QuietHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
+        def log_message(self, format, *args):
+            pass  # Suppress log messages
+
+    try:
+        with socketserver.TCPServer(("", port), QuietHTTPRequestHandler) as httpd:
+            server_thread = threading.Thread(target=httpd.serve_forever)
+            server_thread.daemon = True
+            server_thread.start()
+
+            time.sleep(0.5)  # Give server time to start
+
+            # Test with Twitter-like virtual scroll
+            url = f"http://docs.crawl4ai.com/examples/assets/virtual_scroll_twitter_like.html"
+
+            payload = {
+                "urls": [url],
+                "browser_config": {
+                    "type": "BrowserConfig",
+                    "params": {
+                        "headless": True,
+                        "viewport_width": 1280,
+                        "viewport_height": 800
+                    }
+                },
+                "crawler_config": {
+                    "type": "CrawlerRunConfig",
+                    "params": {
+                        "virtual_scroll_config": {
+                            "type": "VirtualScrollConfig",
+                            "params": {
+                                "container_selector": "#timeline",
+                                "scroll_count": 10,
+                                "scroll_by": "container_height",
+                                "wait_after_scroll": 0.3
+                            }
+                        },
+                        "cache_mode": "bypass",
+                        "extraction_strategy": {
+                            "type": "NoExtractionStrategy",
+                            "params": {}
+                        }
+                    }
+                }
+            }
+
+            print("Testing VirtualScrollConfig with /crawl endpoint...")
+            print(f"Test URL: {url}")
+            print("Payload:")
+            print(json.dumps(payload, indent=2))
+
+            response = requests.post(
+                "http://localhost:11234/crawl",
+                json=payload,
+                headers={"Content-Type": "application/json"},
+                timeout=60  # Longer timeout for virtual scroll
+            )
+
+            print(f"\nResponse Status: {response.status_code}")
+
+            if response.status_code == 200:
+                result = response.json()
+                print("✅ Success! VirtualScrollConfig is working with the API.")
+                print(f"Content length: {len(result[0]['content']['raw_content'])} characters")
+
+                # Check if we captured multiple posts (indicating virtual scroll worked)
+                content = result[0]['content']['raw_content']
+                post_count = content.count("Post #")
+                print(f"Found {post_count} posts in the content")
+
+                if post_count > 5:  # Should capture more than just the initial posts
+                    print("✅ Virtual scroll successfully captured additional content!")
+                else:
+                    print("⚠️  Virtual scroll may not have captured much additional content")
+
+                # Print a snippet of the content
+                content_preview = content[:1000] + "..." if len(content) > 1000 else content
+                print(f"\nContent preview:\n{content_preview}")
+
+            else:
+                print(f"❌ Error: {response.status_code}")
+                print(f"Response: {response.text}")
+
+    except requests.exceptions.Timeout:
+        print("❌ Request timed out - virtual scroll may be taking too long")
+    except Exception as e:
+        print(f"❌ Test failed with error: {e}")
+
+if __name__ == "__main__":  # Run the check directly when this file is executed as a script
+    test_virtual_scroll_api()