Enhance AsyncWebCrawler with smart waiting and screenshot capabilities

- Implement smart_wait function in AsyncPlaywrightCrawlerStrategy
- Add screenshot support to AsyncCrawlResponse and AsyncWebCrawler
- Improve error handling and timeout management in crawling process
- Fix typo in CrawlResult model (responser_headers -> response_headers)
- Update .gitignore to exclude additional files
- Adjust import path in test_basic_crawling.py
2024-10-02 17:34:56 +08:00
parent e0e0db4247
commit 4750810a67
10 changed files with 281 additions and 21 deletions
--- a/crawl4ai/__init__.py
+++ b/crawl4ai/__init__.py
@@ -3,7 +3,7 @@
 from .async_webcrawler import AsyncWebCrawler
 from .models import CrawlResult

-__version__ = "0.3.4"
+__version__ = "0.3.5"

 __all__ = [
    "AsyncWebCrawler",
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -12,10 +12,12 @@ import hashlib
 from pathlib import Path
 from playwright.async_api import ProxySettings
 from pydantic import BaseModel
+
 class AsyncCrawlResponse(BaseModel):
    html: str
    response_headers: Dict[str, str]
    status_code: int
+    screenshot: Optional[str] = None

 class AsyncCrawlerStrategy(ABC):
    @abstractmethod
@@ -139,6 +141,45 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            asyncio.create_task(self.kill_session(sid))
            
            
+    async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000):
+        wait_for = wait_for.strip()
+        
+        if wait_for.startswith('js:'):
+            # Explicitly specified JavaScript
+            js_code = wait_for[3:].strip()
+            return await self.csp_compliant_wait(page, js_code, timeout)
+        elif wait_for.startswith('css:'):
+            # Explicitly specified CSS selector
+            css_selector = wait_for[4:].strip()
+            try:
+                await page.wait_for_selector(css_selector, timeout=timeout)
+            except Error as e:
+                if 'Timeout' in str(e):
+                    raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{css_selector}'")
+                else:
+                    raise ValueError(f"Invalid CSS selector: '{css_selector}'")
+        else:
+            # Auto-detect based on content
+            if wait_for.startswith('()') or wait_for.startswith('function'):
+                # It's likely a JavaScript function
+                return await self.csp_compliant_wait(page, wait_for, timeout)
+            else:
+                # Assume it's a CSS selector first
+                try:
+                    await page.wait_for_selector(wait_for, timeout=timeout)
+                except Error as e:
+                    if 'Timeout' in str(e):
+                        raise TimeoutError(f"Timeout after {timeout}ms waiting for selector '{wait_for}'")
+                    else:
+                        # If it's not a timeout error, it might be an invalid selector
+                        # Let's try to evaluate it as a JavaScript function as a fallback
+                        try:
+                            return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout)
+                        except Error:
+                            raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. "
+                                            "It should be either a valid CSS selector, a JavaScript function, "
+                                            "or explicitly prefixed with 'js:' or 'css:'.")
+    
    async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000):
        wrapper_js = f"""
        async () => {{
@@ -250,19 +291,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            wait_for = kwargs.get("wait_for")
            if wait_for:
                try:
-                    await self.csp_compliant_wait(page, wait_for, timeout=kwargs.get("timeout", 30000))
+                    await self.smart_wait(page, wait_for, timeout=kwargs.get("timeout", 30000))
                except Exception as e:
-                    raise RuntimeError(f"Custom wait condition failed: {str(e)}")                
-                # try:
-                #     await page.wait_for_function(wait_for)
-                #     # if callable(wait_for):
-                #     #     await page.wait_for_function(wait_for)
-                #     # elif isinstance(wait_for, str):
-                #     #     await page.wait_for_selector(wait_for)
-                #     # else:
-                #     #     raise ValueError("wait_for must be either a callable or a CSS selector string")
-                # except Error as e:
-                #     raise Error(f"Custom wait condition failed: {str(e)}")
+                    raise RuntimeError(f"Wait condition failed: {str(e)}")

            html = await page.content()
            page = await self.execute_hook('before_return_html', page, html)
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -80,7 +80,7 @@ class AsyncWebCrawler:
            
            word_count_threshold = max(word_count_threshold, MIN_WORD_THRESHOLD)

-            async_response : AsyncCrawlResponse = None
+            async_response: AsyncCrawlResponse = None
            cached = None
            screenshot_data = None
            extracted_content = None
@@ -102,15 +102,14 @@ class AsyncWebCrawler:
                t1 = time.time()
                if user_agent:
                    self.crawler_strategy.update_user_agent(user_agent)
-                async_response : AsyncCrawlResponse = await self.crawler_strategy.crawl(url, **kwargs)
+                async_response: AsyncCrawlResponse = await self.crawler_strategy.crawl(url, screenshot=screenshot, **kwargs)
                html = sanitize_input_encode(async_response.html)
+                screenshot_data = async_response.screenshot
                t2 = time.time()
                if verbose:
                    print(
                        f"[LOG] 🚀 Crawling done for {url}, success: {bool(html)}, time taken: {t2 - t1:.2f} seconds"
                    )
-                if screenshot:
-                    screenshot_data = await self.crawler_strategy.take_screenshot(url)

            crawl_result = await self.aprocess_html(
                url,
@@ -127,7 +126,7 @@ class AsyncWebCrawler:
                **kwargs,
            )
            crawl_result.status_code = async_response.status_code if async_response else 200
-            crawl_result.responser_headers = async_response.response_headers if async_response else {}
+            crawl_result.response_headers = async_response.response_headers if async_response else {}
            crawl_result.success = bool(html)
            crawl_result.session_id = kwargs.get("session_id", None)
            return crawl_result
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -18,5 +18,5 @@ class CrawlResult(BaseModel):
    metadata: Optional[dict] = None
    error_message: Optional[str] = None
    session_id: Optional[str] = None
-    responser_headers: Optional[dict] = None
+    response_headers: Optional[dict] = None
    status_code: Optional[int] = None