feat(crawler): Enhance stealth and flexibility, improve error handling

- Implement playwright_stealth for better bot detection avoidance - Add user simulation and navigator override options - Improve iframe processing and browser selection - Enhance error reporting and debugging capabilities - Optimize image processing and parallel crawling - Add new example for user simulation feature - Add support for including links in Markdown content by defining a new flag, `include_links_on_markdown`, in the `crawl` method.
2024-10-17 21:37:48 +08:00
parent 9ffa34b697
commit 768aa06ceb
8 changed files with 777 additions and 102 deletions
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -1,17 +1,35 @@
 import asyncio
-import base64, time
+import base64
+import time
 from abc import ABC, abstractmethod
 from typing import Callable, Dict, Any, List, Optional, Awaitable
 import os
 from playwright.async_api import async_playwright, Page, Browser, Error
 from io import BytesIO
 from PIL import Image, ImageDraw, ImageFont
-from .utils import sanitize_input_encode, calculate_semaphore_count
-import json, uuid
-import hashlib
 from pathlib import Path
 from playwright.async_api import ProxySettings
 from pydantic import BaseModel
+import hashlib
+import json
+import uuid
+from playwright_stealth import StealthConfig, stealth_async
+
+stealth_config = StealthConfig(
+    webdriver=True,
+    chrome_app=True,
+    chrome_csi=True,
+    chrome_load_times=True,
+    chrome_runtime=True,
+    navigator_languages=True,
+    navigator_plugins=True,
+    navigator_permissions=True,
+    webgl_vendor=True,
+    outerdimensions=True,
+    navigator_hardware_concurrency=True,
+    media_codecs=True,
+)
+

 class AsyncCrawlResponse(BaseModel):
    html: str
@@ -47,10 +65,14 @@ class AsyncCrawlerStrategy(ABC):
 class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
    def __init__(self, use_cached_html=False, js_code=None, **kwargs):
        self.use_cached_html = use_cached_html
-        self.user_agent = kwargs.get("user_agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36")
+        self.user_agent = kwargs.get(
+            "user_agent",
+            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 "
+            "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
+        )
        self.proxy = kwargs.get("proxy")
        self.headless = kwargs.get("headless", True)
-        self.browser_type = kwargs.get("browser_type", "chromium")  # New parameter
+        self.browser_type = kwargs.get("browser_type", "chromium")
        self.headers = kwargs.get("headers", {})
        self.sessions = {}
        self.session_ttl = 1800 
@@ -83,9 +105,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                "headless": self.headless,
                "args": [
                    "--disable-gpu",
-                    "--disable-dev-shm-usage",
-                    "--disable-setuid-sandbox",
                    "--no-sandbox",
+                    "--disable-dev-shm-usage",
+                    "--disable-blink-features=AutomationControlled",
+                    "--disable-infobars",
+                    "--window-position=0,0",
+                    "--ignore-certificate-errors",
+                    "--ignore-certificate-errors-spki-list",
+                    # "--headless=new",  # Use the new headless mode
                ]
            }
            
@@ -94,7 +121,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                proxy_settings = ProxySettings(server=self.proxy)
                browser_args["proxy"] = proxy_settings
                
-                
            # Select the appropriate browser based on the browser_type
            if self.browser_type == "firefox":
                self.browser = await self.playwright.firefox.launch(**browser_args)
@@ -147,8 +173,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):

    def _cleanup_expired_sessions(self):
        current_time = time.time()
-        expired_sessions = [sid for sid, (_, _, last_used) in self.sessions.items() 
-                            if current_time - last_used > self.session_ttl]
+        expired_sessions = [
+            sid for sid, (_, _, last_used) in self.sessions.items() 
+            if current_time - last_used > self.session_ttl
+        ]
        for sid in expired_sessions:
            asyncio.create_task(self.kill_session(sid))
            
@@ -188,8 +216,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                            return await self.csp_compliant_wait(page, f"() => {{{wait_for}}}", timeout)
                        except Error:
                            raise ValueError(f"Invalid wait_for parameter: '{wait_for}'. "
-                                            "It should be either a valid CSS selector, a JavaScript function, "
-                                            "or explicitly prefixed with 'js:' or 'css:'.")
+                                             "It should be either a valid CSS selector, a JavaScript function, "
+                                             "or explicitly prefixed with 'js:' or 'css:'.")
    
    async def csp_compliant_wait(self, page: Page, user_wait_function: str, timeout: float = 30000):
        wrapper_js = f"""
@@ -254,8 +282,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                print(f"Error processing iframe {i}: {str(e)}")

        # Return the page object
-        return page
-    
+        return page  
    
    async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse:
        response_headers = {}
@@ -268,6 +295,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            if not context:
                context = await self.browser.new_context(
                    user_agent=self.user_agent,
+                    viewport={"width": 1920, "height": 1080},
                    proxy={"server": self.proxy} if self.proxy else None
                )
                await context.set_extra_http_headers(self.headers)
@@ -275,18 +303,58 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                self.sessions[session_id] = (context, page, time.time())
        else:
            context = await self.browser.new_context(
-                    user_agent=self.user_agent,
-                    proxy={"server": self.proxy} if self.proxy else None
+                user_agent=self.user_agent,
+                viewport={"width": 1920, "height": 1080},
+                proxy={"server": self.proxy} if self.proxy else None
            )
            await context.set_extra_http_headers(self.headers)
+            
+            if kwargs.get("override_navigator", False):
+                # Inject scripts to override navigator properties
+                await context.add_init_script("""
+                    // Pass the Permissions Test.
+                    const originalQuery = window.navigator.permissions.query;
+                    window.navigator.permissions.query = (parameters) => (
+                        parameters.name === 'notifications' ?
+                            Promise.resolve({ state: Notification.permission }) :
+                            originalQuery(parameters)
+                    );
+                    Object.defineProperty(navigator, 'webdriver', {
+                        get: () => undefined
+                    });
+                    window.navigator.chrome = {
+                        runtime: {},
+                        // Add other properties if necessary
+                    };
+                    Object.defineProperty(navigator, 'plugins', {
+                        get: () => [1, 2, 3, 4, 5],
+                    });
+                    Object.defineProperty(navigator, 'languages', {
+                        get: () => ['en-US', 'en'],
+                    });
+                    Object.defineProperty(document, 'hidden', {
+                        get: () => false
+                    });
+                    Object.defineProperty(document, 'visibilityState', {
+                        get: () => 'visible'
+                    });
+                """)
+            
            page = await context.new_page()
+            # await stealth_async(page) #, stealth_config)

+        # Add console message and error logging
+        page.on("console", lambda msg: print(f"Console: {msg.text}"))
+        page.on("pageerror", lambda exc: print(f"Page Error: {exc}"))
+        
        try:
            if self.verbose:
                print(f"[LOG] 🕸️ Crawling {url} using AsyncPlaywrightCrawlerStrategy...")

            if self.use_cached_html:
-                cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
+                cache_file_path = os.path.join(
+                    Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
+                )
                if os.path.exists(cache_file_path):
                    html = ""
                    with open(cache_file_path, "r") as f:
@@ -296,12 +364,21 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                        meta = json.load(f)
                        response_headers = meta.get("response_headers", {})
                        status_code = meta.get("status_code")
-                    response = AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
+                    response = AsyncCrawlResponse(
+                        html=html, response_headers=response_headers, status_code=status_code
+                    )
                    return response

            if not kwargs.get("js_only", False):
                await self.execute_hook('before_goto', page)
-                response = await page.goto(url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000))
+                
+                response = await page.goto(
+                    url, wait_until="domcontentloaded", timeout=kwargs.get("page_timeout", 60000)
+                )
+                
+                # response = await page.goto("about:blank")
+                # await page.evaluate(f"window.location.href = '{url}'")
+                
                await self.execute_hook('after_goto', page)
                
                # Get status code and headers
@@ -311,37 +388,29 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                status_code = 200
                response_headers = {}

-
            await page.wait_for_selector('body')
            await page.evaluate("window.scrollTo(0, document.body.scrollHeight)")

            js_code = kwargs.get("js_code", kwargs.get("js", self.js_code))
            if js_code:
                if isinstance(js_code, str):
-                    r = await page.evaluate(js_code)
+                    await page.evaluate(js_code)
                elif isinstance(js_code, list):
                    for js in js_code:
                        await page.evaluate(js)
                
-                # await page.wait_for_timeout(100)
                await page.wait_for_load_state('networkidle')
-                # Check for on execution even
+                # Check for on execution event
                await self.execute_hook('on_execution_started', page)
                
-            # New code to handle the wait_for parameter
-            # Example usage:
-            # await crawler.crawl(
-            #     url,
-            #     js_code="// some JavaScript code",
-            #     wait_for="""() => {
-            #         return document.querySelector('#my-element') !== null;
-            #     }"""
-            # )
-            # Example of using a CSS selector:
-            # await crawler.crawl(
-            #     url,
-            #     wait_for="#my-element"
-            # )
+            if kwargs.get("simulate_user", False):
+                # Simulate user interactions
+                await page.mouse.move(100, 100)
+                await page.mouse.down()
+                await page.mouse.up()
+                await page.keyboard.press('ArrowDown')
+
+            # Handle the wait_for parameter
            wait_for = kwargs.get("wait_for")
            if wait_for:
                try:
@@ -349,13 +418,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                except Exception as e:
                    raise RuntimeError(f"Wait condition failed: {str(e)}")

-            # Check if kwargs has screenshot=True then take screenshot
-            screenshot_data = None
-            if kwargs.get("screenshot"):
-                screenshot_data = await self.take_screenshot(url)
+
            
-            
-            # New code to update image dimensions
+            # Update image dimensions
            update_image_dimensions_js = """
            () => {
                return new Promise((resolve) => {
@@ -428,12 +493,19 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                
            html = await page.content()
            await self.execute_hook('before_return_html', page, html)
+            
+            # Check if kwargs has screenshot=True then take screenshot
+            screenshot_data = None
+            if kwargs.get("screenshot"):
+                screenshot_data = await self.take_screenshot(url)            

            if self.verbose:
                print(f"[LOG] ✅ Crawled {url} successfully!")

            if self.use_cached_html:
-                cache_file_path = os.path.join(Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest())
+                cache_file_path = os.path.join(
+                    Path.home(), ".crawl4ai", "cache", hashlib.md5(url.encode()).hexdigest()
+                )
                with open(cache_file_path, "w", encoding="utf-8") as f:
                    f.write(html)
                # store response headers and status code in cache
@@ -443,7 +515,6 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                        "status_code": status_code
                    }, f)

-            
            async def get_delayed_content(delay: float = 5.0) -> str:
                if self.verbose:
                    print(f"[LOG] Waiting for {delay} seconds before retrieving content for {url}")
@@ -463,59 +534,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        finally:
            if not session_id:
                await page.close()
+                await context.close()

-        # try:
-        #     html = await _crawl()
-        #     return sanitize_input_encode(html)
-        # except Error as e:
-        #     raise Error(f"Failed to crawl {url}: {str(e)}")
-        # except Exception as e:
-        #     raise Exception(f"Failed to crawl {url}: {str(e)}")
-
-    async def execute_js(self, session_id: str, js_code: str, wait_for_js: str = None, wait_for_css: str = None) -> AsyncCrawlResponse:
-        """
-        Execute JavaScript code in a specific session and optionally wait for a condition.
-        
-        :param session_id: The ID of the session to execute the JS code in.
-        :param js_code: The JavaScript code to execute.
-        :param wait_for_js: JavaScript condition to wait for after execution.
-        :param wait_for_css: CSS selector to wait for after execution.
-        :return: AsyncCrawlResponse containing the page's HTML and other information.
-        :raises ValueError: If the session does not exist.
-        """
-        if not session_id:
-            raise ValueError("Session ID must be provided")
-        
-        if session_id not in self.sessions:
-            raise ValueError(f"No active session found for session ID: {session_id}")
-        
-        context, page, last_used = self.sessions[session_id]
-        
-        try:
-            await page.evaluate(js_code)
-            
-            if wait_for_js:
-                await page.wait_for_function(wait_for_js)
-            
-            if wait_for_css:
-                await page.wait_for_selector(wait_for_css)
-            
-            # Get the updated HTML content
-            html = await page.content()
-            
-            # Get response headers and status code (assuming these are available)
-            response_headers = await page.evaluate("() => JSON.stringify(performance.getEntriesByType('resource')[0].responseHeaders)")
-            status_code = await page.evaluate("() => performance.getEntriesByType('resource')[0].responseStatus")
-            
-            # Update the last used time for this session
-            self.sessions[session_id] = (context, page, time.time())
-            
-            return AsyncCrawlResponse(html=html, response_headers=response_headers, status_code=status_code)
-        except Error as e:
-            raise Error(f"Failed to execute JavaScript or wait for condition in session {session_id}: {str(e)}")
-    
    async def crawl_many(self, urls: List[str], **kwargs) -> List[AsyncCrawlResponse]:
-        semaphore_count = kwargs.get('semaphore_count', calculate_semaphore_count())
+        semaphore_count = kwargs.get('semaphore_count', 5)  # Adjust as needed
        semaphore = asyncio.Semaphore(semaphore_count)

        async def crawl_with_semaphore(url):
@@ -526,7 +548,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
        results = await asyncio.gather(*tasks, return_exceptions=True)
        return [result if not isinstance(result, Exception) else str(result) for result in results]

-    async def take_screenshot(self, url: str, wait_time = 1000) -> str:
+    async def take_screenshot(self, url: str, wait_time=1000) -> str:
        async with await self.browser.new_context(user_agent=self.user_agent) as context:
            page = await context.new_page()
            try:
@@ -549,4 +571,5 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                img.save(buffered, format="JPEG")
                return base64.b64encode(buffered.getvalue()).decode('utf-8')
            finally:
-                await page.close()
+                await page.close()
+