Enhance AsyncWebCrawler with smart waiting and screenshot capabilities

- Implement smart_wait function in AsyncPlaywrightCrawlerStrategy
- Add screenshot support to AsyncCrawlResponse and AsyncWebCrawler
- Improve error handling and timeout management in crawling process
- Fix typo in CrawlResult model (responser_headers -> response_headers)
- Update .gitignore to exclude additional files
- Adjust import path in test_basic_crawling.py
2024-10-02 17:34:56 +08:00
parent e0e0db4247
commit 4750810a67
10 changed files with 281 additions and 21 deletions
--- a/tests/async/test_basic_crawling.py
+++ b/tests/async/test_basic_crawling.py
@@ -5,7 +5,7 @@ import asyncio
 import time

 # Add the parent directory to the Python path
-parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
 sys.path.append(parent_dir)

 from crawl4ai.async_webcrawler import AsyncWebCrawler
--- a/tests/async/test_screenshot.py
+++ b/tests/async/test_screenshot.py
@@ -0,0 +1,124 @@
+import os
+import sys
+import pytest
+import asyncio
+import base64
+from PIL import Image
+import io
+
+# Add the repository root (three levels above tests/async/<this file>) to the Python path
+parent_dir = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+sys.path.append(parent_dir)
+
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+
+@pytest.mark.asyncio
+async def test_basic_screenshot():
+    """Crawl a static page with screenshot=True and check the payload decodes to a PNG."""
+    target = "https://example.com"  # A static website
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        outcome = await crawler.arun(url=target, bypass_cache=True, screenshot=True)
+
+        assert outcome.success
+        assert outcome.screenshot is not None
+
+        # Base64-decode the screenshot and let Pillow identify the image format
+        png_stream = io.BytesIO(base64.b64decode(outcome.screenshot))
+        assert Image.open(png_stream).format == "PNG"
+
+@pytest.mark.asyncio
+async def test_screenshot_with_wait_for():
+    """Capture a screenshot of a dynamic page while passing a CSS ``wait_for``.
+
+    Passes ``wait_for="css:#content"`` to ``arun`` and checks that the
+    returned screenshot base64-decodes to a PNG image.
+    """
+    crawl_options = {
+        "url": "https://www.youtube.com",  # Using a website with dynamic content
+        "bypass_cache": True,
+        "screenshot": True,
+        "wait_for": "css:#content",  # Wait for the main content to load
+    }
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        outcome = await crawler.arun(**crawl_options)
+
+        assert outcome.success
+        assert outcome.screenshot is not None
+
+        # Verify the screenshot is a valid image
+        png_stream = io.BytesIO(base64.b64decode(outcome.screenshot))
+        assert Image.open(png_stream).format == "PNG"
+        # You might want to add more specific checks here, like image dimensions
+        # or even use image recognition to verify certain elements are present
+
+@pytest.mark.asyncio
+async def test_screenshot_with_js_wait_for():
+    """Capture a screenshot while passing a ``js:`` predicate as ``wait_for``.
+
+    The returned screenshot must base64-decode to a PNG image.
+    """
+    # JavaScript predicate handed to the crawler via the "js:" prefix
+    js_condition = "js:() => document.querySelector('#nav-logo-sprites') !== null"
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        outcome = await crawler.arun(
+            url="https://www.amazon.com",
+            bypass_cache=True,
+            screenshot=True,
+            wait_for=js_condition,
+        )
+        assert outcome.success
+        assert outcome.screenshot is not None
+        png_stream = io.BytesIO(base64.b64decode(outcome.screenshot))
+        assert Image.open(png_stream).format == "PNG"
+
+@pytest.mark.asyncio
+async def test_screenshot_without_wait_for():
+    """Capture a screenshot with no wait_for condition and check it decodes to a PNG."""
+    target = "https://www.nytimes.com"  # A website with lots of dynamic content
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        outcome = await crawler.arun(url=target, bypass_cache=True, screenshot=True)
+
+        assert outcome.success
+        assert outcome.screenshot is not None
+
+        # Base64-decode the screenshot and let Pillow report the image format
+        png_stream = io.BytesIO(base64.b64decode(outcome.screenshot))
+        assert Image.open(png_stream).format == "PNG"
+
+@pytest.mark.asyncio
+async def test_screenshot_comparison():
+    """Compare screenshots of one page taken without and with a wait_for selector.
+
+    Both crawls must succeed and return a screenshot; the image captured with
+    ``wait_for`` must be at least as wide and as tall as the one without it.
+    """
+    target = "https://www.reddit.com"
+    selector = "css:#SHORTCUT_FOCUSABLE_DIV"
+
+    def decode(result):
+        # Turn the base64 screenshot payload into a Pillow image
+        return Image.open(io.BytesIO(base64.b64decode(result.screenshot)))
+
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        # First crawl: no wait_for condition
+        plain = await crawler.arun(url=target, bypass_cache=True, screenshot=True)
+        # Second crawl: same page, passing the selector as wait_for
+        waited = await crawler.arun(
+            url=target, bypass_cache=True, screenshot=True, wait_for=selector
+        )
+
+        assert plain.success and waited.success
+        assert plain.screenshot is not None
+        assert waited.screenshot is not None
+
+        plain_width, plain_height = decode(plain).size
+        waited_width, waited_height = decode(waited).size
+
+        # This is a simple size comparison. In a real-world scenario, you might want to use
+        # more sophisticated image comparison techniques.
+        assert waited_width >= plain_width
+        assert waited_height >= plain_height
+
+# Entry point for debugging: running this file directly invokes pytest on it in verbose mode
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])