#!/usr/bin/env python3
"""
Test script for VirtualScrollConfig with the /crawl API endpoint.

Serves a small virtual-scroll HTML page from a local HTTP server, then asks a
locally running crawl4ai API server (http://localhost:11234) to crawl it with
a VirtualScrollConfig and checks that scrolled-in content was captured.
"""

import contextlib
import http.server
import json
import os
import socketserver
import tempfile
import threading
import time

import requests

# NOTE(review): the original page markup was partially lost in transit; this is
# a minimal reconstruction with a scrollable #container holding a few items —
# confirm against the intended fixture.
TEST_HTML = """\
<!DOCTYPE html>
<html>
<head><title>Virtual Scroll Test</title></head>
<body>
<h1>Virtual Scroll Test</h1>
<div id="container" style="height: 200px; overflow-y: scroll;">
    <div class="item">Item 1</div>
    <div class="item">Item 2</div>
    <div class="item">Item 3</div>
    <div class="item">Item 4</div>
    <div class="item">Item 5</div>
</div>
</body>
</html>
"""


def test_virtual_scroll_api():
    """Test the /crawl endpoint with VirtualScrollConfig.

    Writes TEST_HTML to a temp file, serves it locally, posts a crawl request
    for it, and prints whether virtual scroll captured the expected content.
    Purely side-effecting: all results are reported via print().
    """
    # Create a temporary HTML file for the local server to serve.
    with tempfile.NamedTemporaryFile(mode="w", suffix=".html", delete=False) as f:
        f.write(TEST_HTML)
        temp_file = f.name

    port = 8080

    class QuietHTTPRequestHandler(http.server.SimpleHTTPRequestHandler):
        """Serve the temp file's directory quietly, without os.chdir()."""

        def __init__(self, *args, **kwargs):
            # BUG FIX: the original os.chdir()'d the whole process; the
            # `directory` parameter scopes serving to the handler instead.
            super().__init__(*args, directory=os.path.dirname(temp_file), **kwargs)

        def log_message(self, format, *args):
            pass  # Suppress per-request log noise.

    try:
        with socketserver.TCPServer(("", port), QuietHTTPRequestHandler) as httpd:
            server_thread = threading.Thread(target=httpd.serve_forever, daemon=True)
            server_thread.start()
            time.sleep(0.5)  # Give the server time to start.

            try:
                # BUG FIX: the original crawled a hard-coded crawl4ai.com URL,
                # so the locally served fixture was never exercised. Point the
                # crawl at the local server instead.
                url = f"http://localhost:{port}/{os.path.basename(temp_file)}"

                payload = {
                    "urls": [url],
                    "browser_config": {
                        "type": "BrowserConfig",
                        "params": {
                            "headless": True,
                            "viewport_width": 1920,
                            "viewport_height": 1080,
                        },
                    },
                    "crawler_config": {
                        "type": "CrawlerRunConfig",
                        "params": {
                            "virtual_scroll_config": {
                                "type": "VirtualScrollConfig",
                                "params": {
                                    "container_selector": "#container",
                                    "scroll_count": 3,
                                    "scroll_by": "container_height",
                                    "wait_after_scroll": 0.5,
                                },
                            },
                            "cache_mode": "bypass",
                            "extraction_strategy": {
                                "type": "NoExtractionStrategy",
                                "params": {},
                            },
                        },
                    },
                }

                print("Testing VirtualScrollConfig with /crawl endpoint...")
                print(f"Test URL: {url}")
                print("Payload:")
                print(json.dumps(payload, indent=2))

                response = requests.post(
                    "http://localhost:11234/crawl",
                    json=payload,
                    headers={"Content-Type": "application/json"},
                    # BUG FIX: no timeout meant a hung API server blocked forever.
                    timeout=60,
                )

                print(f"\nResponse Status: {response.status_code}")

                if response.status_code == 200:
                    result = response.json()
                    print("✅ Success! VirtualScrollConfig is working.")
                    content = result[0]["content"]["raw_content"]
                    print(f"Content length: {len(content)} characters")

                    # Check whether virtual scroll captured additional content.
                    if "Item 10" in content:
                        print("✅ Virtual scroll successfully captured additional content!")
                    else:
                        print("⚠️ Virtual scroll may not have worked as expected")

                    content_preview = content[:500] + "..."
                    print(f"\nContent preview:\n{content_preview}")
                else:
                    print(f"❌ Error: {response.status_code}")
                    print(f"Response: {response.text}")
            finally:
                # BUG FIX: stop serve_forever so the daemon thread can exit
                # cleanly before the socket is closed.
                httpd.shutdown()
    except Exception as e:
        print(f"❌ Test failed with error: {e}")
    finally:
        # Cleanup the temp fixture; narrowed from the original bare `except:`.
        with contextlib.suppress(OSError):
            os.unlink(temp_file)


if __name__ == "__main__":
    test_virtual_scroll_api()
VirtualScrollConfig is working.") + print(f"Content length: {len(result[0]['content']['raw_content'])} characters") + + # Check if virtual scroll captured more content + if "Item 10" in result[0]['content']['raw_content']: + print("✅ Virtual scroll successfully captured additional content!") + else: + print("⚠️ Virtual scroll may not have worked as expected") + + # Print a snippet of the content + content_preview = result[0]['content']['raw_content'][:500] + "..." + print(f"\nContent preview:\n{content_preview}") + + else: + print(f"❌ Error: {response.status_code}") + print(f"Response: {response.text}") + + except Exception as e: + print(f"❌ Test failed with error: {e}") + finally: + # Cleanup + try: + os.unlink(temp_file) + except: + pass + +if __name__ == "__main__": + test_virtual_scroll_api() \ No newline at end of file diff --git a/tests/test_virtual_scroll_api_simple.py b/tests/test_virtual_scroll_api_simple.py new file mode 100644 index 00000000..24be8772 --- /dev/null +++ b/tests/test_virtual_scroll_api_simple.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +""" +Test VirtualScrollConfig with the /crawl API using existing test assets +""" + +import requests +import json +import os +import http.server +import socketserver +import threading +import time +from pathlib import Path + +def test_virtual_scroll_api(): + """Test the /crawl endpoint with VirtualScrollConfig using test assets""" + + # Use the existing test assets + assets_dir = Path(__file__).parent / "docs" / "examples" / "assets" + if not assets_dir.exists(): + print(f"❌ Assets directory not found: {assets_dir}") + return + + # Start local server for assets + os.chdir(assets_dir) + port = 8081 + + class QuietHTTPRequestHandler(http.server.SimpleHTTPRequestHandler): + def log_message(self, format, *args): + pass # Suppress log messages + + try: + with socketserver.TCPServer(("", port), QuietHTTPRequestHandler) as httpd: + server_thread = threading.Thread(target=httpd.serve_forever) + 
server_thread.daemon = True + server_thread.start() + + time.sleep(0.5) # Give server time to start + + # Test with Twitter-like virtual scroll + url = f"http://docs.crawl4ai.com/examples/assets/virtual_scroll_twitter_like.html" + + payload = { + "urls": [url], + "browser_config": { + "type": "BrowserConfig", + "params": { + "headless": True, + "viewport_width": 1280, + "viewport_height": 800 + } + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "virtual_scroll_config": { + "type": "VirtualScrollConfig", + "params": { + "container_selector": "#timeline", + "scroll_count": 10, + "scroll_by": "container_height", + "wait_after_scroll": 0.3 + } + }, + "cache_mode": "bypass", + "extraction_strategy": { + "type": "NoExtractionStrategy", + "params": {} + } + } + } + } + + print("Testing VirtualScrollConfig with /crawl endpoint...") + print(f"Test URL: {url}") + print("Payload:") + print(json.dumps(payload, indent=2)) + + response = requests.post( + "http://localhost:11234/crawl", + json=payload, + headers={"Content-Type": "application/json"}, + timeout=60 # Longer timeout for virtual scroll + ) + + print(f"\nResponse Status: {response.status_code}") + + if response.status_code == 200: + result = response.json() + print("✅ Success! VirtualScrollConfig is working with the API.") + print(f"Content length: {len(result[0]['content']['raw_content'])} characters") + + # Check if we captured multiple posts (indicating virtual scroll worked) + content = result[0]['content']['raw_content'] + post_count = content.count("Post #") + print(f"Found {post_count} posts in the content") + + if post_count > 5: # Should capture more than just the initial posts + print("✅ Virtual scroll successfully captured additional content!") + else: + print("⚠️ Virtual scroll may not have captured much additional content") + + # Print a snippet of the content + content_preview = content[:1000] + "..." 
if len(content) > 1000 else content + print(f"\nContent preview:\n{content_preview}") + + else: + print(f"❌ Error: {response.status_code}") + print(f"Response: {response.text}") + + except requests.exceptions.Timeout: + print("❌ Request timed out - virtual scroll may be taking too long") + except Exception as e: + print(f"❌ Test failed with error: {e}") + +if __name__ == "__main__": + test_virtual_scroll_api() \ No newline at end of file