diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 7a4f373d..4bd06783 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -88,6 +88,13 @@ from .script import ( ErrorDetail ) +# Browser Adapters +from .browser_adapter import ( + BrowserAdapter, + PlaywrightAdapter, + UndetectedAdapter +) + from .utils import ( start_colab_display_server, setup_colab_environment @@ -174,6 +181,10 @@ __all__ = [ "CompilationResult", "ValidationResult", "ErrorDetail", + # Browser Adapters + "BrowserAdapter", + "PlaywrightAdapter", + "UndetectedAdapter", "LinkPreviewConfig" ] diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 174ec3e4..3c70d634 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -390,6 +390,8 @@ class BrowserConfig: light_mode (bool): Disables certain background features for performance gains. Default: False. extra_args (list): Additional command-line arguments passed to the browser. Default: []. + enable_stealth (bool): If True, applies playwright-stealth to bypass basic bot detection. + Cannot be used with use_undetected browser mode. Default: False. """ def __init__( @@ -430,6 +432,7 @@ class BrowserConfig: extra_args: list = None, debugging_port: int = 9222, host: str = "localhost", + enable_stealth: bool = False, ): self.browser_type = browser_type self.headless = headless @@ -470,6 +473,7 @@ class BrowserConfig: self.verbose = verbose self.debugging_port = debugging_port self.host = host + self.enable_stealth = enable_stealth fa_user_agenr_generator = ValidUAGenerator() if self.user_agent_mode == "random": @@ -501,6 +505,13 @@ class BrowserConfig: # If persistent context is requested, ensure managed browser is enabled if self.use_persistent_context: self.use_managed_browser = True + + # Validate stealth configuration + if self.enable_stealth and self.use_managed_browser and self.browser_mode == "builtin": + raise ValueError( + "enable_stealth cannot be used with browser_mode='builtin'. " + "Stealth mode requires a dedicated browser instance." 
+ ) @staticmethod def from_kwargs(kwargs: dict) -> "BrowserConfig": @@ -537,6 +548,7 @@ class BrowserConfig: extra_args=kwargs.get("extra_args", []), debugging_port=kwargs.get("debugging_port", 9222), host=kwargs.get("host", "localhost"), + enable_stealth=kwargs.get("enable_stealth", False), ) def to_dict(self): @@ -571,6 +583,7 @@ class BrowserConfig: "verbose": self.verbose, "debugging_port": self.debugging_port, "host": self.host, + "enable_stealth": self.enable_stealth, } diff --git a/crawl4ai/async_crawler_strategy.back.py b/crawl4ai/async_crawler_strategy.back.py new file mode 100644 index 00000000..9fdb0fe2 --- /dev/null +++ b/crawl4ai/async_crawler_strategy.back.py @@ -0,0 +1,2450 @@ +from __future__ import annotations + +import asyncio +import base64 +import time +from abc import ABC, abstractmethod +from typing import Callable, Dict, Any, List, Union +from typing import Optional, AsyncGenerator, Final +import os +from playwright.async_api import Page, Error +from playwright.async_api import TimeoutError as PlaywrightTimeoutError +from io import BytesIO +from PIL import Image, ImageDraw, ImageFont +import hashlib +import uuid +from .js_snippet import load_js_script +from .models import AsyncCrawlResponse +from .config import SCREENSHOT_HEIGHT_TRESHOLD +from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig +from .async_logger import AsyncLogger +from .ssl_certificate import SSLCertificate +from .user_agent_generator import ValidUAGenerator +from .browser_manager import BrowserManager + +import aiofiles +import aiohttp +import chardet +from aiohttp.client import ClientTimeout +from urllib.parse import urlparse +from types import MappingProxyType +import contextlib +from functools import partial + +class AsyncCrawlerStrategy(ABC): + """ + Abstract base class for crawler strategies. + Subclasses must implement the crawl method. + """ + + @abstractmethod + async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse: + pass # 4 + 3 + +class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): + """ + Crawler strategy using Playwright. + + Attributes: + browser_config (BrowserConfig): Configuration object containing browser settings. + logger (AsyncLogger): Logger instance for recording events and errors. + _downloaded_files (List[str]): List of downloaded file paths. + hooks (Dict[str, Callable]): Dictionary of hooks for custom behavior. + browser_manager (BrowserManager): Manager for browser creation and management. + + Methods: + __init__(self, browser_config=None, logger=None, **kwargs): + Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration. + __aenter__(self): + Start the browser and initialize the browser manager. + __aexit__(self, exc_type, exc_val, exc_tb): + Close the browser and clean up resources. + start(self): + Start the browser and initialize the browser manager. + close(self): + Close the browser and clean up resources. + kill_session(self, session_id): + Kill a browser session and clean up resources. + crawl(self, url, **kwargs): + Run the crawler for a single URL. + + """ + + def __init__( + self, browser_config: BrowserConfig = None, logger: AsyncLogger = None, **kwargs + ): + """ + Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration. + + Args: + browser_config (BrowserConfig): Configuration object containing browser settings. + If None, will be created from kwargs for backwards compatibility. + logger: Logger instance for recording events and errors. 
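A minimal usage sketch for the new enable_stealth flag. The AsyncWebCrawler wiring is the package's existing public API rather than part of this diff; from_kwargs and to_dict round-trip the flag as shown above, and the constructor check rejects it for the managed "builtin" browser mode.

```python
import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

async def main():
    # enable_stealth applies playwright-stealth patches before navigation;
    # combining it with the managed "builtin" browser mode raises ValueError
    # at construction time, per the validation added above.
    browser_config = BrowserConfig(headless=True, enable_stealth=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun("https://example.com", config=CrawlerRunConfig())
        print(result.status_code)

asyncio.run(main())
```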
+ **kwargs: Additional arguments for backwards compatibility and extending functionality. + """ + # Initialize browser config, either from provided object or kwargs + self.browser_config = browser_config or BrowserConfig.from_kwargs(kwargs) + self.logger = logger + + # Initialize session management + self._downloaded_files = [] + + # Initialize hooks system + self.hooks = { + "on_browser_created": None, + "on_page_context_created": None, + "on_user_agent_updated": None, + "on_execution_started": None, + "on_execution_ended": None, + "before_goto": None, + "after_goto": None, + "before_return_html": None, + "before_retrieve_html": None, + } + + # Initialize browser manager with config + self.browser_manager = BrowserManager( + browser_config=self.browser_config, logger=self.logger + ) + + async def __aenter__(self): + await self.start() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb): + await self.close() + + async def start(self): + """ + Start the browser and initialize the browser manager. + """ + await self.browser_manager.start() + await self.execute_hook( + "on_browser_created", + self.browser_manager.browser, + context=self.browser_manager.default_context, + ) + + async def close(self): + """ + Close the browser and clean up resources. + """ + await self.browser_manager.close() + # Explicitly reset the static Playwright instance + BrowserManager._playwright_instance = None + + async def kill_session(self, session_id: str): + """ + Kill a browser session and clean up resources. + + Args: + session_id (str): The ID of the session to kill. + + Returns: + None + """ + # Log a warning message and no need kill session, in new version auto kill session + self.logger.warning( + message="Session auto-kill is enabled in the new version. No need to manually kill sessions.", + tag="WARNING", + ) + await self.browser_manager.kill_session(session_id) + + def set_hook(self, hook_type: str, hook: Callable): + """ + Set a hook function for a specific hook type. Following are list of hook types: + - on_browser_created: Called when a new browser instance is created. + - on_page_context_created: Called when a new page context is created. + - on_user_agent_updated: Called when the user agent is updated. + - on_execution_started: Called when the execution starts. + - before_goto: Called before a goto operation. + - after_goto: Called after a goto operation. + - before_return_html: Called before returning HTML content. + - before_retrieve_html: Called before retrieving HTML content. + + All hooks except on_browser_created accepts a context and a page as arguments and **kwargs. However, on_browser_created accepts a browser and a context as arguments and **kwargs. + + Args: + hook_type (str): The type of the hook. + hook (Callable): The hook function to set. + + Returns: + None + """ + if hook_type in self.hooks: + self.hooks[hook_type] = hook + else: + raise ValueError(f"Invalid hook type: {hook_type}") + + async def execute_hook(self, hook_type: str, *args, **kwargs): + """ + Execute a hook function for a specific hook type. + + Args: + hook_type (str): The type of the hook. + *args: Variable length positional arguments. + **kwargs: Keyword arguments. + + Returns: + The return value of the hook function, if any. 
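A short sketch of registering one of the hooks listed above. The class and hook names come from this file; the import path is assumed to be the package's main strategy module, and the hook signature follows how execute_hook is invoked later in this file (page first, then keyword context/url/config).

```python
from crawl4ai import BrowserConfig
from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

strategy = AsyncPlaywrightCrawlerStrategy(browser_config=BrowserConfig(headless=True))

async def before_goto(page, context=None, url=None, config=None, **kwargs):
    # Runs just before page.goto(); e.g. attach extra request headers.
    await page.set_extra_http_headers({"X-Debug-Trace": "1"})
    return page

strategy.set_hook("before_goto", before_goto)
# Unknown hook names are rejected: strategy.set_hook("bogus", before_goto) raises ValueError.
```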
+ """ + hook = self.hooks.get(hook_type) + if hook: + if asyncio.iscoroutinefunction(hook): + return await hook(*args, **kwargs) + else: + return hook(*args, **kwargs) + return args[0] if args else None + + def update_user_agent(self, user_agent: str): + """ + Update the user agent for the browser. + + Args: + user_agent (str): The new user agent string. + + Returns: + None + """ + self.user_agent = user_agent + + def set_custom_headers(self, headers: Dict[str, str]): + """ + Set custom headers for the browser. + + Args: + headers (Dict[str, str]): A dictionary of headers to set. + + Returns: + None + """ + self.headers = headers + + async def smart_wait(self, page: Page, wait_for: str, timeout: float = 30000): + """ + Wait for a condition in a smart way. This functions works as below: + + 1. If wait_for starts with 'js:', it assumes it's a JavaScript function and waits for it to return true. + 2. If wait_for starts with 'css:', it assumes it's a CSS selector and waits for it to be present. + 3. Otherwise, it tries to evaluate wait_for as a JavaScript function and waits for it to return true. + 4. If it's not a JavaScript function, it assumes it's a CSS selector and waits for it to be present. + + This is a more advanced version of the wait_for parameter in CrawlerStrategy.crawl(). + Args: + page: Playwright page object + wait_for (str): The condition to wait for. Can be a CSS selector, a JavaScript function, or explicitly prefixed with 'js:' or 'css:'. + timeout (float): Maximum time to wait in milliseconds + + Returns: + None + """ + wait_for = wait_for.strip() + + if wait_for.startswith("js:"): + # Explicitly specified JavaScript + js_code = wait_for[3:].strip() + return await self.csp_compliant_wait(page, js_code, timeout) + elif wait_for.startswith("css:"): + # Explicitly specified CSS selector + css_selector = wait_for[4:].strip() + try: + await page.wait_for_selector(css_selector, timeout=timeout) + except Error as e: + if "Timeout" in str(e): + raise TimeoutError( + f"Timeout after {timeout}ms waiting for selector '{css_selector}'" + ) + else: + raise ValueError(f"Invalid CSS selector: '{css_selector}'") + else: + # Auto-detect based on content + if wait_for.startswith("()") or wait_for.startswith("function"): + # It's likely a JavaScript function + return await self.csp_compliant_wait(page, wait_for, timeout) + else: + # Assume it's a CSS selector first + try: + await page.wait_for_selector(wait_for, timeout=timeout) + except Error as e: + if "Timeout" in str(e): + raise TimeoutError( + f"Timeout after {timeout}ms waiting for selector '{wait_for}'" + ) + else: + # If it's not a timeout error, it might be an invalid selector + # Let's try to evaluate it as a JavaScript function as a fallback + try: + return await self.csp_compliant_wait( + page, f"() => {{{wait_for}}}", timeout + ) + except Error: + raise ValueError( + f"Invalid wait_for parameter: '{wait_for}'. " + "It should be either a valid CSS selector, a JavaScript function, " + "or explicitly prefixed with 'js:' or 'css:'." + ) + + async def csp_compliant_wait( + self, page: Page, user_wait_function: str, timeout: float = 30000 + ): + """ + Wait for a condition in a CSP-compliant way. 
+ + Args: + page: Playwright page object + user_wait_function: JavaScript function as string that returns boolean + timeout: Maximum time to wait in milliseconds + + Returns: + bool: True if condition was met, False if timed out + + Raises: + RuntimeError: If there's an error evaluating the condition + """ + wrapper_js = f""" + async () => {{ + const userFunction = {user_wait_function}; + const startTime = Date.now(); + try {{ + while (true) {{ + if (await userFunction()) {{ + return true; + }} + if (Date.now() - startTime > {timeout}) {{ + return false; // Return false instead of throwing + }} + await new Promise(resolve => setTimeout(resolve, 100)); + }} + }} catch (error) {{ + throw new Error(`Error evaluating condition: ${{error.message}}`); + }} + }} + """ + + try: + result = await page.evaluate(wrapper_js) + return result + except Exception as e: + if "Error evaluating condition" in str(e): + raise RuntimeError(f"Failed to evaluate wait condition: {str(e)}") + # For timeout or other cases, just return False + return False + + async def process_iframes(self, page): + """ + Process iframes on a page. This function will extract the content of each iframe and replace it with a div containing the extracted content. + + Args: + page: Playwright page object + + Returns: + Playwright page object + """ + # Find all iframes + iframes = await page.query_selector_all("iframe") + + for i, iframe in enumerate(iframes): + try: + # Add a unique identifier to the iframe + await iframe.evaluate(f'(element) => element.id = "iframe-{i}"') + + # Get the frame associated with this iframe + frame = await iframe.content_frame() + + if frame: + # Wait for the frame to load + await frame.wait_for_load_state( + "load", timeout=30000 + ) # 30 seconds timeout + + # Extract the content of the iframe's body + iframe_content = await frame.evaluate( + "() => document.body.innerHTML" + ) + + # Generate a unique class name for this iframe + class_name = f"extracted-iframe-content-{i}" + + # Replace the iframe with a div containing the extracted content + _iframe = iframe_content.replace("`", "\\`") + await page.evaluate( + f""" + () => {{ + const iframe = document.getElementById('iframe-{i}'); + const div = document.createElement('div'); + div.innerHTML = `{_iframe}`; + div.className = '{class_name}'; + iframe.replaceWith(div); + }} + """ + ) + else: + self.logger.warning( + message="Could not access content frame for iframe {index}", + tag="SCRAPE", + params={"index": i}, + ) + except Exception as e: + self.logger.error( + message="Error processing iframe {index}: {error}", + tag="ERROR", + params={"index": i, "error": str(e)}, + ) + + # Return the page object + return page + + async def create_session(self, **kwargs) -> str: + """ + Creates a new browser session and returns its ID. A browse session is a unique openned page can be reused for multiple crawls. + This function is asynchronous and returns a string representing the session ID. + + Args: + **kwargs: Optional keyword arguments to configure the session. + + Returns: + str: The session ID. 
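Session IDs created here can be reused across calls so that later crawls land on the same open page and context; a sketch using the session_id run option read elsewhere in this file (the AsyncWebCrawler entry point is the existing public API, not part of this diff):

```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

async def reuse_session():
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        # First call opens the page and keeps it alive under this session ID.
        first = await crawler.arun(
            "https://example.com/login",
            config=CrawlerRunConfig(session_id="account-session"),
        )
        # Second call reuses the same page/context, so cookies and JS state persist.
        second = await crawler.arun(
            "https://example.com/dashboard",
            config=CrawlerRunConfig(session_id="account-session"),
        )
        return first, second
```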
+ """ + await self.start() + + session_id = kwargs.get("session_id") or str(uuid.uuid4()) + + user_agent = kwargs.get("user_agent", self.user_agent) + # Use browser_manager to get a fresh page & context assigned to this session_id + page, context = await self.browser_manager.get_page(CrawlerRunConfig( + session_id=session_id, + user_agent=user_agent, + **kwargs, + )) + return session_id + + async def crawl( + self, url: str, config: CrawlerRunConfig, **kwargs + ) -> AsyncCrawlResponse: + """ + Crawls a given URL or processes raw HTML/local file content based on the URL prefix. + + Args: + url (str): The URL to crawl. Supported prefixes: + - 'http://' or 'https://': Web URL to crawl. + - 'file://': Local file path to process. + - 'raw://': Raw HTML content to process. + **kwargs: Additional parameters: + - 'screenshot' (bool): Whether to take a screenshot. + - ... [other existing parameters] + + Returns: + AsyncCrawlResponse: The response containing HTML, headers, status code, and optional screenshot. + """ + config = config or CrawlerRunConfig.from_kwargs(kwargs) + response_headers = {} + status_code = 200 # Default for local/raw HTML + screenshot_data = None + + if url.startswith(("http://", "https://", "view-source:")): + return await self._crawl_web(url, config) + + elif url.startswith("file://"): + # initialize empty lists for console messages + captured_console = [] + + # Process local file + local_file_path = url[7:] # Remove 'file://' prefix + if not os.path.exists(local_file_path): + raise FileNotFoundError(f"Local file not found: {local_file_path}") + with open(local_file_path, "r", encoding="utf-8") as f: + html = f.read() + if config.screenshot: + screenshot_data = await self._generate_screenshot_from_html(html) + if config.capture_console_messages: + page, context = await self.browser_manager.get_page(crawlerRunConfig=config) + captured_console = await self._capture_console_messages(page, url) + + return AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=None, + console_messages=captured_console, + ) + + ##### + # Since both "raw:" and "raw://" start with "raw:", the first condition is always true for both, so "raw://" will be sliced as "//...", which is incorrect. + # Fix: Check for "raw://" first, then "raw:" + # Also, the prefix "raw://" is actually 6 characters long, not 7, so it should be sliced accordingly: url[6:] + ##### + elif url.startswith("raw://") or url.startswith("raw:"): + # Process raw HTML content + # raw_html = url[4:] if url[:4] == "raw:" else url[7:] + raw_html = url[6:] if url.startswith("raw://") else url[4:] + html = raw_html + if config.screenshot: + screenshot_data = await self._generate_screenshot_from_html(html) + return AsyncCrawlResponse( + html=html, + response_headers=response_headers, + status_code=status_code, + screenshot=screenshot_data, + get_delayed_content=None, + ) + else: + raise ValueError( + "URL must start with 'http://', 'https://', 'file://', or 'raw:'" + ) + + async def _crawl_web( + self, url: str, config: CrawlerRunConfig + ) -> AsyncCrawlResponse: + """ + Internal method to crawl web URLs with the specified configuration. + Includes optional network and console capturing. 
+ + Args: + url (str): The web URL to crawl + config (CrawlerRunConfig): Configuration object controlling the crawl behavior + + Returns: + AsyncCrawlResponse: The response containing HTML, headers, status code, and optional data + """ + config.url = url + response_headers = {} + execution_result = None + status_code = None + redirected_url = url + + # Reset downloaded files list for new crawl + self._downloaded_files = [] + + # Initialize capture lists + captured_requests = [] + captured_console = [] + + # Handle user agent with magic mode + user_agent_to_override = config.user_agent + if user_agent_to_override: + self.browser_config.user_agent = user_agent_to_override + elif config.magic or config.user_agent_mode == "random": + self.browser_config.user_agent = ValidUAGenerator().generate( + **(config.user_agent_generator_config or {}) + ) + + # Get page for session + page, context = await self.browser_manager.get_page(crawlerRunConfig=config) + + # await page.goto(URL) + + # Add default cookie + # await context.add_cookies( + # [{"name": "cookiesEnabled", "value": "true", "url": url}] + # ) + + # Handle navigator overrides + if config.override_navigator or config.simulate_user or config.magic: + await context.add_init_script(load_js_script("navigator_overrider")) + + # Call hook after page creation + await self.execute_hook("on_page_context_created", page, context=context, config=config) + + # Network Request Capturing + if config.capture_network_requests: + async def handle_request_capture(request): + try: + post_data_str = None + try: + # Be cautious with large post data + post_data = request.post_data_buffer + if post_data: + # Attempt to decode, fallback to base64 or size indication + try: + post_data_str = post_data.decode('utf-8', errors='replace') + except UnicodeDecodeError: + post_data_str = f"[Binary data: {len(post_data)} bytes]" + except Exception: + post_data_str = "[Error retrieving post data]" + + captured_requests.append({ + "event_type": "request", + "url": request.url, + "method": request.method, + "headers": dict(request.headers), # Convert Header dict + "post_data": post_data_str, + "resource_type": request.resource_type, + "is_navigation_request": request.is_navigation_request(), + "timestamp": time.time() + }) + except Exception as e: + if self.logger: + self.logger.warning(f"Error capturing request details for {request.url}: {e}", tag="CAPTURE") + captured_requests.append({"event_type": "request_capture_error", "url": request.url, "error": str(e), "timestamp": time.time()}) + + async def handle_response_capture(response): + try: + try: + # body = await response.body() + # json_body = await response.json() + text_body = await response.text() + except Exception as e: + body = None + # json_body = None + # text_body = None + captured_requests.append({ + "event_type": "response", + "url": response.url, + "status": response.status, + "status_text": response.status_text, + "headers": dict(response.headers), # Convert Header dict + "from_service_worker": response.from_service_worker, + "request_timing": response.request.timing, # Detailed timing info + "timestamp": time.time(), + "body" : { + # "raw": body, + # "json": json_body, + "text": text_body + } + }) + except Exception as e: + if self.logger: + self.logger.warning(f"Error capturing response details for {response.url}: {e}", tag="CAPTURE") + captured_requests.append({"event_type": "response_capture_error", "url": response.url, "error": str(e), "timestamp": time.time()}) + + async def 
handle_request_failed_capture(request): + try: + captured_requests.append({ + "event_type": "request_failed", + "url": request.url, + "method": request.method, + "resource_type": request.resource_type, + "failure_text": str(request.failure) if request.failure else "Unknown failure", + "timestamp": time.time() + }) + except Exception as e: + if self.logger: + self.logger.warning(f"Error capturing request failed details for {request.url}: {e}", tag="CAPTURE") + captured_requests.append({"event_type": "request_failed_capture_error", "url": request.url, "error": str(e), "timestamp": time.time()}) + + page.on("request", handle_request_capture) + page.on("response", handle_response_capture) + page.on("requestfailed", handle_request_failed_capture) + + # Console Message Capturing + if config.capture_console_messages: + def handle_console_capture(msg): + try: + message_type = "unknown" + try: + message_type = msg.type + except: + pass + + message_text = "unknown" + try: + message_text = msg.text + except: + pass + + # Basic console message with minimal content + entry = { + "type": message_type, + "text": message_text, + "timestamp": time.time() + } + + captured_console.append(entry) + + except Exception as e: + if self.logger: + self.logger.warning(f"Error capturing console message: {e}", tag="CAPTURE") + # Still add something to the list even on error + captured_console.append({ + "type": "console_capture_error", + "error": str(e), + "timestamp": time.time() + }) + + def handle_pageerror_capture(err): + try: + error_message = "Unknown error" + try: + error_message = err.message + except: + pass + + error_stack = "" + try: + error_stack = err.stack + except: + pass + + captured_console.append({ + "type": "error", + "text": error_message, + "stack": error_stack, + "timestamp": time.time() + }) + except Exception as e: + if self.logger: + self.logger.warning(f"Error capturing page error: {e}", tag="CAPTURE") + captured_console.append({ + "type": "pageerror_capture_error", + "error": str(e), + "timestamp": time.time() + }) + + # Add event listeners directly + page.on("console", handle_console_capture) + page.on("pageerror", handle_pageerror_capture) + + # Set up console logging if requested + if config.log_console: + def log_consol( + msg, console_log_type="debug" + ): # Corrected the parameter syntax + if console_log_type == "error": + self.logger.error( + message=f"Console error: {msg}", # Use f-string for variable interpolation + tag="CONSOLE" + ) + elif console_log_type == "debug": + self.logger.debug( + message=f"Console: {msg}", # Use f-string for variable interpolation + tag="CONSOLE" + ) + + page.on("console", log_consol) + page.on("pageerror", lambda e: log_consol(e, "error")) + + try: + # Get SSL certificate information if requested and URL is HTTPS + ssl_cert = None + if config.fetch_ssl_certificate: + ssl_cert = SSLCertificate.from_url(url) + + # Set up download handling + if self.browser_config.accept_downloads: + page.on( + "download", + lambda download: asyncio.create_task( + self._handle_download(download) + ), + ) + + # Handle page navigation and content loading + if not config.js_only: + await self.execute_hook("before_goto", page, context=context, url=url, config=config) + + try: + # Generate a unique nonce for this request + if config.experimental.get("use_csp_nonce", False): + nonce = hashlib.sha256(os.urandom(32)).hexdigest() + + # Add CSP headers to the request + await page.set_extra_http_headers( + { + "Content-Security-Policy": f"default-src 'self'; script-src 'self' 
'nonce-{nonce}' 'strict-dynamic'" + } + ) + + response = await page.goto( + url, wait_until=config.wait_until, timeout=config.page_timeout + ) + redirected_url = page.url + except Error as e: + # Allow navigation to be aborted when downloading files + # This is expected behavior for downloads in some browser engines + if 'net::ERR_ABORTED' in str(e) and self.browser_config.accept_downloads: + self.logger.info( + message=f"Navigation aborted, likely due to file download: {url}", + tag="GOTO", + params={"url": url}, + ) + response = None + else: + raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") + + await self.execute_hook( + "after_goto", page, context=context, url=url, response=response, config=config + ) + + # ────────────────────────────────────────────────────────────── + # Walk the redirect chain. Playwright returns only the last + # hop, so we trace the `request.redirected_from` links until the + # first response that differs from the final one and surface its + # status-code. + # ────────────────────────────────────────────────────────────── + if response is None: + status_code = 200 + response_headers = {} + else: + first_resp = response + req = response.request + while req and req.redirected_from: + prev_req = req.redirected_from + prev_resp = await prev_req.response() + if prev_resp: # keep earliest + first_resp = prev_resp + req = prev_req + + status_code = first_resp.status + response_headers = first_resp.headers + # if response is None: + # status_code = 200 + # response_headers = {} + # else: + # status_code = response.status + # response_headers = response.headers + + else: + status_code = 200 + response_headers = {} + + # Wait for body element and visibility + try: + await page.wait_for_selector("body", state="attached", timeout=30000) + + # Use the new check_visibility function with csp_compliant_wait + is_visible = await self.csp_compliant_wait( + page, + """() => { + const element = document.body; + if (!element) return false; + const style = window.getComputedStyle(element); + const isVisible = style.display !== 'none' && + style.visibility !== 'hidden' && + style.opacity !== '0'; + return isVisible; + }""", + timeout=30000, + ) + + if not is_visible and not config.ignore_body_visibility: + visibility_info = await self.check_visibility(page) + raise Error(f"Body element is hidden: {visibility_info}") + + except Error: + visibility_info = await self.check_visibility(page) + + if self.browser_config.config.verbose: + self.logger.debug( + message="Body visibility info: {info}", + tag="DEBUG", + params={"info": visibility_info}, + ) + + if not config.ignore_body_visibility: + raise Error(f"Body element is hidden: {visibility_info}") + + # try: + # await page.wait_for_selector("body", state="attached", timeout=30000) + + # await page.wait_for_function( + # """ + # () => { + # const body = document.body; + # const style = window.getComputedStyle(body); + # return style.display !== 'none' && + # style.visibility !== 'hidden' && + # style.opacity !== '0'; + # } + # """, + # timeout=30000, + # ) + # except Error as e: + # visibility_info = await page.evaluate( + # """ + # () => { + # const body = document.body; + # const style = window.getComputedStyle(body); + # return { + # display: style.display, + # visibility: style.visibility, + # opacity: style.opacity, + # hasContent: body.innerHTML.length, + # classList: Array.from(body.classList) + # } + # } + # """ + # ) + + # if self.config.verbose: + # self.logger.debug( + # message="Body visibility info: {info}", + # 
tag="DEBUG", + # params={"info": visibility_info}, + # ) + + # if not config.ignore_body_visibility: + # raise Error(f"Body element is hidden: {visibility_info}") + + # Handle content loading and viewport adjustment + if not self.browser_config.text_mode and ( + config.wait_for_images or config.adjust_viewport_to_content + ): + await page.wait_for_load_state("domcontentloaded") + await asyncio.sleep(0.1) + + # Check for image loading with improved error handling + images_loaded = await self.csp_compliant_wait( + page, + "() => Array.from(document.getElementsByTagName('img')).every(img => img.complete)", + timeout=1000, + ) + + if not images_loaded and self.logger: + self.logger.warning( + message="Some images failed to load within timeout", + tag="SCRAPE", + ) + + # Adjust viewport if needed + if not self.browser_config.text_mode and config.adjust_viewport_to_content: + try: + dimensions = await self.get_page_dimensions(page) + page_height = dimensions["height"] + page_width = dimensions["width"] + # page_width = await page.evaluate( + # "document.documentElement.scrollWidth" + # ) + # page_height = await page.evaluate( + # "document.documentElement.scrollHeight" + # ) + + target_width = self.browser_config.viewport_width + target_height = int(target_width * page_width / page_height * 0.95) + await page.set_viewport_size( + {"width": target_width, "height": target_height} + ) + + scale = min(target_width / page_width, target_height / page_height) + cdp = await page.context.new_cdp_session(page) + await cdp.send( + "Emulation.setDeviceMetricsOverride", + { + "width": page_width, + "height": page_height, + "deviceScaleFactor": 1, + "mobile": False, + "scale": scale, + }, + ) + except Exception as e: + self.logger.warning( + message="Failed to adjust viewport to content: {error}", + tag="VIEWPORT", + params={"error": str(e)}, + ) + + # Handle full page scanning + if config.scan_full_page: + # await self._handle_full_page_scan(page, config.scroll_delay) + await self._handle_full_page_scan(page, config.scroll_delay, config.max_scroll_steps) + + # Handle virtual scroll if configured + if config.virtual_scroll_config: + await self._handle_virtual_scroll(page, config.virtual_scroll_config) + + # Execute JavaScript if provided + # if config.js_code: + # if isinstance(config.js_code, str): + # await page.evaluate(config.js_code) + # elif isinstance(config.js_code, list): + # for js in config.js_code: + # await page.evaluate(js) + + if config.js_code: + # execution_result = await self.execute_user_script(page, config.js_code) + execution_result = await self.robust_execute_user_script( + page, config.js_code + ) + + if not execution_result["success"]: + self.logger.warning( + message="User script execution had issues: {error}", + tag="JS_EXEC", + params={"error": execution_result.get("error")}, + ) + + await self.execute_hook("on_execution_started", page, context=context, config=config) + await self.execute_hook("on_execution_ended", page, context=context, config=config, result=execution_result) + + # Handle user simulation + if config.simulate_user or config.magic: + await page.mouse.move(100, 100) + await page.mouse.down() + await page.mouse.up() + await page.keyboard.press("ArrowDown") + + # Handle wait_for condition + # Todo: Decide how to handle this + if not config.wait_for and config.css_selector and False: + # if not config.wait_for and config.css_selector: + config.wait_for = f"css:{config.css_selector}" + + if config.wait_for: + try: + # Use wait_for_timeout if specified, otherwise fall back 
to page_timeout + timeout = config.wait_for_timeout if config.wait_for_timeout is not None else config.page_timeout + await self.smart_wait( + page, config.wait_for, timeout=timeout + ) + except Exception as e: + raise RuntimeError(f"Wait condition failed: {str(e)}") + + # Update image dimensions if needed + if not self.browser_config.text_mode: + update_image_dimensions_js = load_js_script("update_image_dimensions") + try: + try: + await page.wait_for_load_state("domcontentloaded", timeout=5) + except PlaywrightTimeoutError: + pass + await page.evaluate(update_image_dimensions_js) + except Exception as e: + self.logger.error( + message="Error updating image dimensions: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + # Process iframes if needed + if config.process_iframes: + page = await self.process_iframes(page) + + # Pre-content retrieval hooks and delay + await self.execute_hook("before_retrieve_html", page, context=context, config=config) + if config.delay_before_return_html: + await asyncio.sleep(config.delay_before_return_html) + + # Handle overlay removal + if config.remove_overlay_elements: + await self.remove_overlay_elements(page) + + if config.css_selector: + try: + # Handle comma-separated selectors by splitting them + selectors = [s.strip() for s in config.css_selector.split(',')] + html_parts = [] + + for selector in selectors: + try: + content = await page.evaluate( + f"""Array.from(document.querySelectorAll("{selector}")) + .map(el => el.outerHTML) + .join('')""" + ) + html_parts.append(content) + except Error as e: + print(f"Warning: Could not get content for selector '{selector}': {str(e)}") + + # Wrap in a div to create a valid HTML structure + html = f"
\n" + "\n".join(html_parts) + "\n
" + except Error as e: + raise RuntimeError(f"Failed to extract HTML content: {str(e)}") + else: + html = await page.content() + + # # Get final HTML content + # html = await page.content() + await self.execute_hook( + "before_return_html", page=page, html=html, context=context, config=config + ) + + # Handle PDF, MHTML and screenshot generation + start_export_time = time.perf_counter() + pdf_data = None + screenshot_data = None + mhtml_data = None + + if config.pdf: + pdf_data = await self.export_pdf(page) + + if config.capture_mhtml: + mhtml_data = await self.capture_mhtml(page) + + if config.screenshot: + if config.screenshot_wait_for: + await asyncio.sleep(config.screenshot_wait_for) + screenshot_data = await self.take_screenshot( + page, screenshot_height_threshold=config.screenshot_height_threshold + ) + + if screenshot_data or pdf_data or mhtml_data: + self.logger.info( + message="Exporting media (PDF/MHTML/screenshot) took {duration:.2f}s", + tag="EXPORT", + params={"duration": time.perf_counter() - start_export_time}, + ) + + # Define delayed content getter + async def get_delayed_content(delay: float = 5.0) -> str: + self.logger.info( + message="Waiting for {delay} seconds before retrieving content for {url}", + tag="INFO", + params={"delay": delay, "url": url}, + ) + await asyncio.sleep(delay) + return await page.content() + + # Return complete response + return AsyncCrawlResponse( + html=html, + response_headers=response_headers, + js_execution_result=execution_result, + status_code=status_code, + screenshot=screenshot_data, + pdf_data=pdf_data, + mhtml_data=mhtml_data, + get_delayed_content=get_delayed_content, + ssl_certificate=ssl_cert, + downloaded_files=( + self._downloaded_files if self._downloaded_files else None + ), + redirected_url=redirected_url, + # Include captured data if enabled + network_requests=captured_requests if config.capture_network_requests else None, + console_messages=captured_console if config.capture_console_messages else None, + ) + + except Exception as e: + raise e + + finally: + # If no session_id is given we should close the page + all_contexts = page.context.browser.contexts + total_pages = sum(len(context.pages) for context in all_contexts) + if config.session_id: + pass + elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless): + pass + else: + # Detach listeners before closing to prevent potential errors during close + if config.capture_network_requests: + page.remove_listener("request", handle_request_capture) + page.remove_listener("response", handle_response_capture) + page.remove_listener("requestfailed", handle_request_failed_capture) + if config.capture_console_messages: + page.remove_listener("console", handle_console_capture) + page.remove_listener("pageerror", handle_pageerror_capture) + + # Close the page + await page.close() + + # async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1): + async def _handle_full_page_scan(self, page: Page, scroll_delay: float = 0.1, max_scroll_steps: Optional[int] = None): + """ + Helper method to handle full page scanning. + + How it works: + 1. Get the viewport height. + 2. Scroll to the bottom of the page. + 3. Get the total height of the page. + 4. Scroll back to the top of the page. + 5. Scroll to the bottom of the page again. + 6. Continue scrolling until the bottom of the page is reached. 
+ + Args: + page (Page): The Playwright page object + scroll_delay (float): The delay between page scrolls + max_scroll_steps (Optional[int]): Maximum number of scroll steps to perform. If None, scrolls until end. + + """ + try: + viewport_size = page.viewport_size + if viewport_size is None: + await page.set_viewport_size( + {"width": self.browser_config.viewport_width, "height": self.browser_config.viewport_height} + ) + viewport_size = page.viewport_size + + viewport_height = viewport_size.get( + "height", self.browser_config.viewport_height + ) + current_position = viewport_height + + # await page.evaluate(f"window.scrollTo(0, {current_position})") + await self.safe_scroll(page, 0, current_position, delay=scroll_delay) + # await self.csp_scroll_to(page, 0, current_position) + # await asyncio.sleep(scroll_delay) + + # total_height = await page.evaluate("document.documentElement.scrollHeight") + dimensions = await self.get_page_dimensions(page) + total_height = dimensions["height"] + + scroll_step_count = 0 + while current_position < total_height: + #### + # NEW FEATURE: Check if we've reached the maximum allowed scroll steps + # This prevents infinite scrolling on very long pages or infinite scroll scenarios + # If max_scroll_steps is None, this check is skipped (unlimited scrolling - original behavior) + #### + if max_scroll_steps is not None and scroll_step_count >= max_scroll_steps: + break + current_position = min(current_position + viewport_height, total_height) + await self.safe_scroll(page, 0, current_position, delay=scroll_delay) + + # Increment the step counter for max_scroll_steps tracking + scroll_step_count += 1 + + # await page.evaluate(f"window.scrollTo(0, {current_position})") + # await asyncio.sleep(scroll_delay) + + # new_height = await page.evaluate("document.documentElement.scrollHeight") + dimensions = await self.get_page_dimensions(page) + new_height = dimensions["height"] + + if new_height > total_height: + total_height = new_height + + # await page.evaluate("window.scrollTo(0, 0)") + await self.safe_scroll(page, 0, 0) + + except Exception as e: + self.logger.warning( + message="Failed to perform full page scan: {error}", + tag="PAGE_SCAN", + params={"error": str(e)}, + ) + else: + # await page.evaluate("window.scrollTo(0, document.body.scrollHeight)") + await self.safe_scroll(page, 0, total_height) + + async def _handle_virtual_scroll(self, page: Page, config: "VirtualScrollConfig"): + """ + Handle virtual scroll containers (e.g., Twitter-like feeds) by capturing + content at different scroll positions and merging unique elements. + + Following the design: + 1. Get container HTML + 2. Scroll by container height + 3. Wait and check if container HTML changed + 4. Three cases: + - No change: continue scrolling + - New items added (appended): continue (items already in page) + - Items replaced: capture HTML chunk and add to list + 5. 
After N scrolls, merge chunks if any were captured + + Args: + page: The Playwright page object + config: Virtual scroll configuration + """ + try: + # Import VirtualScrollConfig to avoid circular import + from .async_configs import VirtualScrollConfig + + # Ensure config is a VirtualScrollConfig instance + if isinstance(config, dict): + config = VirtualScrollConfig.from_dict(config) + + self.logger.info( + message="Starting virtual scroll capture for container: {selector}", + tag="VSCROLL", + params={"selector": config.container_selector} + ) + + # JavaScript function to handle virtual scroll capture + virtual_scroll_js = """ + async (config) => { + const container = document.querySelector(config.container_selector); + if (!container) { + throw new Error(`Container not found: ${config.container_selector}`); + } + + // List to store HTML chunks when content is replaced + const htmlChunks = []; + let previousHTML = container.innerHTML; + let scrollCount = 0; + + // Determine scroll amount + let scrollAmount; + if (typeof config.scroll_by === 'number') { + scrollAmount = config.scroll_by; + } else if (config.scroll_by === 'page_height') { + scrollAmount = window.innerHeight; + } else { // container_height + scrollAmount = container.offsetHeight; + } + + // Perform scrolling + while (scrollCount < config.scroll_count) { + // Scroll the container + container.scrollTop += scrollAmount; + + // Wait for content to potentially load + await new Promise(resolve => setTimeout(resolve, config.wait_after_scroll * 1000)); + + // Get current HTML + const currentHTML = container.innerHTML; + + // Determine what changed + if (currentHTML === previousHTML) { + // Case 0: No change - continue scrolling + console.log(`Scroll ${scrollCount + 1}: No change in content`); + } else if (currentHTML.startsWith(previousHTML)) { + // Case 1: New items appended - content already in page + console.log(`Scroll ${scrollCount + 1}: New items appended`); + } else { + // Case 2: Items replaced - capture the previous HTML + console.log(`Scroll ${scrollCount + 1}: Content replaced, capturing chunk`); + htmlChunks.push(previousHTML); + } + + // Update previous HTML for next iteration + previousHTML = currentHTML; + scrollCount++; + + // Check if we've reached the end + if (container.scrollTop + container.clientHeight >= container.scrollHeight - 10) { + console.log(`Reached end of scrollable content at scroll ${scrollCount}`); + // Capture final chunk if content was replaced + if (htmlChunks.length > 0) { + htmlChunks.push(currentHTML); + } + break; + } + } + + // If we have chunks (case 2 occurred), merge them + if (htmlChunks.length > 0) { + console.log(`Merging ${htmlChunks.length} HTML chunks`); + + // Parse all chunks to extract unique elements + const tempDiv = document.createElement('div'); + const seenTexts = new Set(); + const uniqueElements = []; + + // Process each chunk + for (const chunk of htmlChunks) { + tempDiv.innerHTML = chunk; + const elements = tempDiv.children; + + for (let i = 0; i < elements.length; i++) { + const element = elements[i]; + // Normalize text for deduplication + const normalizedText = element.innerText + .toLowerCase() + .replace(/[\\s\\W]/g, ''); // Remove spaces and symbols + + if (!seenTexts.has(normalizedText)) { + seenTexts.add(normalizedText); + uniqueElements.push(element.outerHTML); + } + } + } + + // Replace container content with merged unique elements + container.innerHTML = uniqueElements.join('\\n'); + console.log(`Merged ${uniqueElements.length} unique elements from 
${htmlChunks.length} chunks`); + + return { + success: true, + chunksCount: htmlChunks.length, + uniqueCount: uniqueElements.length, + replaced: true + }; + } else { + console.log('No content replacement detected, all content remains in page'); + return { + success: true, + chunksCount: 0, + uniqueCount: 0, + replaced: false + }; + } + } + """ + + # Execute virtual scroll capture + result = await page.evaluate(virtual_scroll_js, config.to_dict()) + + if result.get("replaced", False): + self.logger.success( + message="Virtual scroll completed. Merged {unique} unique elements from {chunks} chunks", + tag="VSCROLL", + params={ + "unique": result.get("uniqueCount", 0), + "chunks": result.get("chunksCount", 0) + } + ) + else: + self.logger.info( + message="Virtual scroll completed. Content was appended, no merging needed", + tag="VSCROLL" + ) + + except Exception as e: + self.logger.error( + message="Virtual scroll capture failed: {error}", + tag="VSCROLL", + params={"error": str(e)} + ) + # Continue with normal flow even if virtual scroll fails + + async def _handle_download(self, download): + """ + Handle file downloads. + + How it works: + 1. Get the suggested filename. + 2. Get the download path. + 3. Log the download. + 4. Start the download. + 5. Save the downloaded file. + 6. Log the completion. + + Args: + download (Download): The Playwright download object + + Returns: + None + """ + try: + suggested_filename = download.suggested_filename + download_path = os.path.join(self.browser_config.downloads_path, suggested_filename) + + self.logger.info( + message="Downloading {filename} to {path}", + tag="FETCH", + params={"filename": suggested_filename, "path": download_path}, + ) + + start_time = time.perf_counter() + await download.save_as(download_path) + end_time = time.perf_counter() + self._downloaded_files.append(download_path) + + self.logger.success( + message="Downloaded {filename} successfully", + tag="COMPLETE", + params={ + "filename": suggested_filename, + "path": download_path, + "duration": f"{end_time - start_time:.2f}s", + }, + ) + except Exception as e: + self.logger.error( + message="Failed to handle download: {error}", + tag="ERROR", + params={"error": str(e)}, + ) + + async def remove_overlay_elements(self, page: Page) -> None: + """ + Removes popup overlays, modals, cookie notices, and other intrusive elements from the page. + + Args: + page (Page): The Playwright page instance + """ + remove_overlays_js = load_js_script("remove_overlay_elements") + + try: + await page.evaluate( + f""" + (() => {{ + try {{ + {remove_overlays_js} + return {{ success: true }}; + }} catch (error) {{ + return {{ + success: false, + error: error.toString(), + stack: error.stack + }}; + }} + }})() + """ + ) + await page.wait_for_timeout(500) # Wait for any animations to complete + except Exception as e: + self.logger.warning( + message="Failed to remove overlay elements: {error}", + tag="SCRAPE", + params={"error": str(e)}, + ) + + async def export_pdf(self, page: Page) -> bytes: + """ + Exports the current page as a PDF. + + Args: + page (Page): The Playwright page object + + Returns: + bytes: The PDF data + """ + pdf_data = await page.pdf(print_background=True) + return pdf_data + + async def capture_mhtml(self, page: Page) -> Optional[str]: + """ + Captures the current page as MHTML using CDP. + + MHTML (MIME HTML) is a web page archive format that combines the HTML content + with its resources (images, CSS, etc.) into a single MIME-encoded file. 
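The virtual scroll behaviour above is configured through VirtualScrollConfig and attached via the virtual_scroll_config run option. Field names follow the JavaScript above; keyword construction is assumed (from_dict() is the form used internally).

```python
from crawl4ai import CrawlerRunConfig
from crawl4ai.async_configs import VirtualScrollConfig

virtual_config = VirtualScrollConfig(
    container_selector="#timeline",   # element whose innerHTML is captured and merged
    scroll_count=20,                  # maximum number of scrolls
    scroll_by="container_height",     # or "page_height", or a pixel number
    wait_after_scroll=0.5,            # seconds to wait for new content after each scroll
)

run_config = CrawlerRunConfig(virtual_scroll_config=virtual_config)
```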
+ + Args: + page (Page): The Playwright page object + + Returns: + Optional[str]: The MHTML content as a string, or None if there was an error + """ + try: + # Ensure the page is fully loaded before capturing + try: + # Wait for DOM content and network to be idle + await page.wait_for_load_state("domcontentloaded", timeout=5000) + await page.wait_for_load_state("networkidle", timeout=5000) + + # Give a little extra time for JavaScript execution + await page.wait_for_timeout(1000) + + # Wait for any animations to complete + await page.evaluate(""" + () => new Promise(resolve => { + // First requestAnimationFrame gets scheduled after the next repaint + requestAnimationFrame(() => { + // Second requestAnimationFrame gets called after all animations complete + requestAnimationFrame(resolve); + }); + }) + """) + except Error as e: + if self.logger: + self.logger.warning( + message="Wait for load state timed out: {error}", + tag="MHTML", + params={"error": str(e)}, + ) + + # Create a new CDP session + cdp_session = await page.context.new_cdp_session(page) + + # Call Page.captureSnapshot with format "mhtml" + result = await cdp_session.send("Page.captureSnapshot", {"format": "mhtml"}) + + # The result contains a 'data' field with the MHTML content + mhtml_content = result.get("data") + + # Detach the CDP session to clean up resources + await cdp_session.detach() + + return mhtml_content + except Exception as e: + # Log the error but don't raise it - we'll just return None for the MHTML + if self.logger: + self.logger.error( + message="Failed to capture MHTML: {error}", + tag="MHTML", + params={"error": str(e)}, + ) + return None + + async def _capture_console_messages( + self, page: Page, file_path: str + ) -> List[Dict[str, Union[str, float]]]: + """ + Captures console messages from the page. + Args: + + page (Page): The Playwright page object + Returns: + List[Dict[str, Union[str, float]]]: A list of captured console messages + """ + captured_console = [] + + def handle_console_message(msg): + try: + message_type = msg.type + message_text = msg.text + + entry = { + "type": message_type, + "text": message_text, + "timestamp": time.time(), + } + captured_console.append(entry) + except Exception as e: + if self.logger: + self.logger.warning( + f"Error capturing console message: {e}", tag="CAPTURE" + ) + + page.on("console", handle_console_message) + + await page.goto(file_path) + + return captured_console + + async def take_screenshot(self, page, **kwargs) -> str: + """ + Take a screenshot of the current page. + + Args: + page (Page): The Playwright page object + kwargs: Additional keyword arguments + + Returns: + str: The base64-encoded screenshot data + """ + need_scroll = await self.page_need_scroll(page) + + if not need_scroll: + # Page is short enough, just take a screenshot + return await self.take_screenshot_naive(page) + else: + # Page is too long, try to take a full-page screenshot + return await self.take_screenshot_scroller(page, **kwargs) + # return await self.take_screenshot_from_pdf(await self.export_pdf(page)) + + async def take_screenshot_from_pdf(self, pdf_data: bytes) -> str: + """ + Convert the first page of the PDF to a screenshot. + + Requires pdf2image and poppler. 
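The export paths above (screenshot, PDF, MHTML) are reached through three run-config flags. A sketch, assuming the final result object carries the artifacts as screenshot, pdf and mhtml (the strategy response shown in this file names them screenshot, pdf_data and mhtml_data):

```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

async def export_page(url: str):
    run_config = CrawlerRunConfig(
        screenshot=True,      # base64-encoded image, stitched for very tall pages
        pdf=True,             # raw PDF bytes from page.pdf()
        capture_mhtml=True,   # single-file MHTML archive via CDP Page.captureSnapshot
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        result = await crawler.arun(url, config=run_config)
        return result.screenshot, result.pdf, result.mhtml
```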
+ + Args: + pdf_data (bytes): The PDF data + + Returns: + str: The base64-encoded screenshot data + """ + try: + from pdf2image import convert_from_bytes + + images = convert_from_bytes(pdf_data) + final_img = images[0].convert("RGB") + buffered = BytesIO() + final_img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode("utf-8") + except Exception as e: + error_message = f"Failed to take PDF-based screenshot: {str(e)}" + self.logger.error( + message="PDF Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message}, + ) + # Return error image as fallback + img = Image.new("RGB", (800, 600), color="black") + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode("utf-8") + + async def take_screenshot_scroller(self, page: Page, **kwargs) -> str: + """ + Attempt to set a large viewport and take a full-page screenshot. + If still too large, segment the page as before. + + Requires pdf2image and poppler. + + Args: + page (Page): The Playwright page object + kwargs: Additional keyword arguments + + Returns: + str: The base64-encoded screenshot data + """ + try: + # Get page height + dimensions = await self.get_page_dimensions(page) + page_width = dimensions["width"] + page_height = dimensions["height"] + # page_height = await page.evaluate("document.documentElement.scrollHeight") + # page_width = await page.evaluate("document.documentElement.scrollWidth") + + # Set a large viewport + large_viewport_height = min( + page_height, + kwargs.get("screenshot_height_threshold", SCREENSHOT_HEIGHT_TRESHOLD), + ) + await page.set_viewport_size( + {"width": page_width, "height": large_viewport_height} + ) + + # Page still too long, segment approach + segments = [] + viewport_size = page.viewport_size + viewport_height = viewport_size["height"] + + num_segments = (page_height // viewport_height) + 1 + for i in range(num_segments): + y_offset = i * viewport_height + # Special handling for the last segment + if i == num_segments - 1: + last_part_height = page_height % viewport_height + + # If page_height is an exact multiple of viewport_height, + # we don't need an extra segment + if last_part_height == 0: + # Skip last segment if page height is exact multiple of viewport + break + + # Adjust viewport to exactly match the remaining content height + await page.set_viewport_size({"width": page_width, "height": last_part_height}) + + await page.evaluate(f"window.scrollTo(0, {y_offset})") + await asyncio.sleep(0.01) # wait for render + + # Capture the current segment + # Note: Using compression options (format, quality) would go here + seg_shot = await page.screenshot(full_page=False, type="jpeg", quality=85) + # seg_shot = await page.screenshot(full_page=False) + img = Image.open(BytesIO(seg_shot)).convert("RGB") + segments.append(img) + + # Reset viewport to original size after capturing segments + await page.set_viewport_size({"width": page_width, "height": viewport_height}) + + total_height = sum(img.height for img in segments) + stitched = Image.new("RGB", (segments[0].width, total_height)) + offset = 0 + for img in segments: + # stitched.paste(img, (0, offset)) + stitched.paste(img.convert("RGB"), (0, offset)) + offset += img.height + + buffered = BytesIO() + stitched = stitched.convert("RGB") + stitched.save(buffered, format="BMP", quality=85) + encoded = 
base64.b64encode(buffered.getvalue()).decode("utf-8") + + return encoded + except Exception as e: + error_message = f"Failed to take large viewport screenshot: {str(e)}" + self.logger.error( + message="Large viewport screenshot failed: {error}", + tag="ERROR", + params={"error": error_message}, + ) + # return error image + img = Image.new("RGB", (800, 600), color="black") + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode("utf-8") + # finally: + # await page.close() + + async def take_screenshot_naive(self, page: Page) -> str: + """ + Takes a screenshot of the current page. + + Args: + page (Page): The Playwright page instance + + Returns: + str: Base64-encoded screenshot image + """ + try: + # The page is already loaded, just take the screenshot + screenshot = await page.screenshot(full_page=False) + return base64.b64encode(screenshot).decode("utf-8") + except Exception as e: + error_message = f"Failed to take screenshot: {str(e)}" + self.logger.error( + message="Screenshot failed: {error}", + tag="ERROR", + params={"error": error_message}, + ) + + # Generate an error image + img = Image.new("RGB", (800, 600), color="black") + draw = ImageDraw.Draw(img) + font = ImageFont.load_default() + draw.text((10, 10), error_message, fill=(255, 255, 255), font=font) + + buffered = BytesIO() + img.save(buffered, format="JPEG") + return base64.b64encode(buffered.getvalue()).decode("utf-8") + # finally: + # await page.close() + + async def export_storage_state(self, path: str = None) -> dict: + """ + Exports the current storage state (cookies, localStorage, sessionStorage) + to a JSON file at the specified path. + + Args: + path (str): The path to save the storage state JSON file + + Returns: + dict: The exported storage state + """ + if self.default_context: + state = await self.default_context.storage_state(path=path) + self.logger.info( + message="Exported storage state to {path}", + tag="INFO", + params={"path": path}, + ) + return state + else: + self.logger.warning( + message="No default_context available to export storage state.", + tag="WARNING", + ) + + async def robust_execute_user_script( + self, page: Page, js_code: Union[str, List[str]] + ) -> Dict[str, Any]: + """ + Executes user-provided JavaScript code with proper error handling and context, + supporting both synchronous and async user code, plus navigations. + + How it works: + 1. Wait for load state 'domcontentloaded' + 2. If js_code is a string, execute it directly + 3. If js_code is a list, execute each element in sequence + 4. Wait for load state 'networkidle' + 5. 
Return results + + Args: + page (Page): The Playwright page instance + js_code (Union[str, List[str]]): The JavaScript code to execute + + Returns: + Dict[str, Any]: The results of the execution + """ + try: + await page.wait_for_load_state("domcontentloaded") + + if isinstance(js_code, str): + scripts = [js_code] + else: + scripts = js_code + + results = [] + for script in scripts: + try: + # Attempt the evaluate + # If the user code triggers navigation, we catch the "context destroyed" error + # then wait for the new page to load before continuing + result = None + try: + # OLD VERSION: + # result = await page.evaluate( + # f""" + # (async () => {{ + # try {{ + # const script_result = {script}; + # return {{ success: true, result: script_result }}; + # }} catch (err) {{ + # return {{ success: false, error: err.toString(), stack: err.stack }}; + # }} + # }})(); + # """ + # ) + + # """ NEW VERSION: + # When {script} contains statements (e.g., const link = …; link.click();), + # this forms invalid JavaScript, causing Playwright execution error: SyntaxError: Unexpected token 'const'. + # """ + result = await page.evaluate( + f""" + (async () => {{ + try {{ + return await (async () => {{ + {script} + }})(); + }} catch (err) {{ + return {{ success: false, error: err.toString(), stack: err.stack }}; + }} + }})(); + """ + ) + except Error as e: + # If it's due to navigation destroying the context, handle gracefully + if "Execution context was destroyed" in str(e): + self.logger.info( + "Navigation triggered by script, waiting for load state", + tag="JS_EXEC", + ) + try: + await page.wait_for_load_state("load", timeout=30000) + except Error as nav_err: + self.logger.warning( + message="Navigation wait failed: {error}", + tag="JS_EXEC", + params={"error": str(nav_err)}, + ) + try: + await page.wait_for_load_state( + "networkidle", timeout=30000 + ) + except Error as nav_err: + self.logger.warning( + message="Network idle wait failed: {error}", + tag="JS_EXEC", + params={"error": str(nav_err)}, + ) + # Return partial success, or adapt as you see fit + result = { + "success": True, + "info": "Navigation triggered, ignoring context destroyed error", + } + else: + # It's some other error, log and continue + self.logger.error( + message="Playwright execution error: {error}", + tag="JS_EXEC", + params={"error": str(e)}, + ) + result = {"success": False, "error": str(e)} + + # If we made it this far with no repeated error, do post-load waits + t1 = time.time() + try: + await page.wait_for_load_state("domcontentloaded", timeout=5000) + except Error as e: + self.logger.warning( + message="DOM content load timeout: {error}", + tag="JS_EXEC", + params={"error": str(e)}, + ) + + # t1 = time.time() + # try: + # await page.wait_for_load_state('networkidle', timeout=5000) + # print("Network idle after script execution in", time.time() - t1) + # except Error as e: + # self.logger.warning( + # message="Network idle timeout: {error}", + # tag="JS_EXEC", + # params={"error": str(e)} + # ) + + results.append(result if result else {"success": True}) + + except Exception as e: + # Catch anything else + self.logger.error( + message="Script chunk failed: {error}", + tag="JS_EXEC", + params={"error": str(e)}, + ) + results.append({"success": False, "error": str(e)}) + + return {"success": True, "results": results} + + except Exception as e: + self.logger.error( + message="Script execution failed: {error}", + tag="JS_EXEC", + params={"error": str(e)}, + ) + return {"success": False, "error": str(e)} + + async def 
execute_user_script( + self, page: Page, js_code: Union[str, List[str]] + ) -> Dict[str, Any]: + """ + Executes user-provided JavaScript code with proper error handling and context. + + Args: + page: Playwright page object + js_code: Single JavaScript string or list of JavaScript code strings + + Returns: + Dict containing execution status and results/errors + """ + try: + # Ensure the page is ready for script execution + await page.wait_for_load_state("domcontentloaded") + + # Handle single script or multiple scripts + if isinstance(js_code, str): + scripts = [js_code] + else: + scripts = js_code + + results = [] + for script in scripts: + try: + # Execute the script and wait for network idle + result = await page.evaluate( + f""" + (() => {{ + return new Promise((resolve) => {{ + try {{ + const result = (function() {{ + {script} + }})(); + + // If result is a promise, wait for it + if (result instanceof Promise) {{ + result.then(() => {{ + // Wait a bit for any triggered effects + setTimeout(() => resolve({{ success: true }}), 100); + }}).catch(error => {{ + resolve({{ + success: false, + error: error.toString(), + stack: error.stack + }}); + }}); + }} else {{ + // For non-promise results, still wait a bit for effects + setTimeout(() => resolve({{ success: true }}), 100); + }} + }} catch (error) {{ + resolve({{ + success: false, + error: error.toString(), + stack: error.stack + }}); + }} + }}); + }})() + """ + ) + + # Wait for network idle after script execution + t1 = time.time() + await page.wait_for_load_state("domcontentloaded", timeout=5000) + + + t1 = time.time() + await page.wait_for_load_state("networkidle", timeout=5000) + + results.append(result if result else {"success": True}) + + except Error as e: + # Handle Playwright-specific errors + self.logger.error( + message="Playwright execution error: {error}", + tag="JS_EXEC", + params={"error": str(e)}, + ) + results.append({"success": False, "error": str(e)}) + + return {"success": True, "results": results} + + except Exception as e: + self.logger.error( + message="Script execution failed: {error}", + tag="JS_EXEC", + params={"error": str(e)}, + ) + return {"success": False, "error": str(e)} + + except Exception as e: + self.logger.error( + message="Script execution failed: {error}", + tag="JS_EXEC", + params={"error": str(e)}, + ) + return {"success": False, "error": str(e)} + + async def check_visibility(self, page): + """ + Checks if an element is visible on the page. + + Args: + page: Playwright page object + + Returns: + Boolean indicating visibility + """ + return await page.evaluate( + """ + () => { + const element = document.body; + if (!element) return false; + const style = window.getComputedStyle(element); + const isVisible = style.display !== 'none' && + style.visibility !== 'hidden' && + style.opacity !== '0'; + return isVisible; + } + """ + ) + + async def safe_scroll(self, page: Page, x: int, y: int, delay: float = 0.1): + """ + Safely scroll the page with rendering time. + + Args: + page: Playwright page object + x: Horizontal scroll position + y: Vertical scroll position + """ + result = await self.csp_scroll_to(page, x, y) + if result["success"]: + await page.wait_for_timeout(delay * 1000) + return result + + async def csp_scroll_to(self, page: Page, x: int, y: int) -> Dict[str, Any]: + """ + Performs a CSP-compliant scroll operation and returns the result status. 
+ + Args: + page: Playwright page object + x: Horizontal scroll position + y: Vertical scroll position + + Returns: + Dict containing scroll status and position information + """ + try: + result = await page.evaluate( + f"""() => {{ + try {{ + const startX = window.scrollX; + const startY = window.scrollY; + window.scrollTo({x}, {y}); + + // Get final position after scroll + const endX = window.scrollX; + const endY = window.scrollY; + + return {{ + success: true, + startPosition: {{ x: startX, y: startY }}, + endPosition: {{ x: endX, y: endY }}, + targetPosition: {{ x: {x}, y: {y} }}, + delta: {{ + x: Math.abs(endX - {x}), + y: Math.abs(endY - {y}) + }} + }}; + }} catch (e) {{ + return {{ + success: false, + error: e.toString() + }}; + }} + }}""" + ) + + if not result["success"]: + self.logger.warning( + message="Scroll operation failed: {error}", + tag="SCROLL", + params={"error": result.get("error")}, + ) + + return result + + except Exception as e: + self.logger.error( + message="Failed to execute scroll: {error}", + tag="SCROLL", + params={"error": str(e)}, + ) + return {"success": False, "error": str(e)} + + async def get_page_dimensions(self, page: Page): + """ + Get the dimensions of the page. + + Args: + page: Playwright page object + + Returns: + Dict containing width and height of the page + """ + return await page.evaluate( + """ + () => { + const {scrollWidth, scrollHeight} = document.documentElement; + return {width: scrollWidth, height: scrollHeight}; + } + """ + ) + + async def page_need_scroll(self, page: Page) -> bool: + """ + Determine whether the page need to scroll + + Args: + page: Playwright page object + + Returns: + bool: True if page needs scrolling + """ + try: + need_scroll = await page.evaluate( + """ + () => { + const scrollHeight = document.documentElement.scrollHeight; + const viewportHeight = window.innerHeight; + return scrollHeight > viewportHeight; + } + """ + ) + return need_scroll + except Exception as e: + self.logger.warning( + message="Failed to check scroll need: {error}. Defaulting to True for safety.", + tag="SCROLL", + params={"error": str(e)}, + ) + return True # Default to scrolling if check fails + + +#################################################################################################### +# HTTP Crawler Strategy +#################################################################################################### + +class HTTPCrawlerError(Exception): + """Base error class for HTTP crawler specific exceptions""" + pass + + +class ConnectionTimeoutError(HTTPCrawlerError): + """Raised when connection timeout occurs""" + pass + + +class HTTPStatusError(HTTPCrawlerError): + """Raised for unexpected status codes""" + def __init__(self, status_code: int, message: str): + self.status_code = status_code + super().__init__(f"HTTP {status_code}: {message}") + + +class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): + """ + Fast, lightweight HTTP-only crawler strategy optimized for memory efficiency. 
+ """ + + __slots__ = ('logger', 'max_connections', 'dns_cache_ttl', 'chunk_size', '_session', 'hooks', 'browser_config') + + DEFAULT_TIMEOUT: Final[int] = 30 + DEFAULT_CHUNK_SIZE: Final[int] = 64 * 1024 + DEFAULT_MAX_CONNECTIONS: Final[int] = min(32, (os.cpu_count() or 1) * 4) + DEFAULT_DNS_CACHE_TTL: Final[int] = 300 + VALID_SCHEMES: Final = frozenset({'http', 'https', 'file', 'raw'}) + + _BASE_HEADERS: Final = MappingProxyType({ + 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate, br', + 'Connection': 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' + }) + + def __init__( + self, + browser_config: Optional[HTTPCrawlerConfig] = None, + logger: Optional[AsyncLogger] = None, + max_connections: int = DEFAULT_MAX_CONNECTIONS, + dns_cache_ttl: int = DEFAULT_DNS_CACHE_TTL, + chunk_size: int = DEFAULT_CHUNK_SIZE + ): + """Initialize the HTTP crawler with config""" + self.browser_config = browser_config or HTTPCrawlerConfig() + self.logger = logger + self.max_connections = max_connections + self.dns_cache_ttl = dns_cache_ttl + self.chunk_size = chunk_size + self._session: Optional[aiohttp.ClientSession] = None + + self.hooks = { + k: partial(self._execute_hook, k) + for k in ('before_request', 'after_request', 'on_error') + } + + # Set default hooks + self.set_hook('before_request', lambda *args, **kwargs: None) + self.set_hook('after_request', lambda *args, **kwargs: None) + self.set_hook('on_error', lambda *args, **kwargs: None) + + + async def __aenter__(self) -> AsyncHTTPCrawlerStrategy: + await self.start() + return self + + async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: + await self.close() + + @contextlib.asynccontextmanager + async def _session_context(self): + try: + if not self._session: + await self.start() + yield self._session + finally: + pass + + def set_hook(self, hook_type: str, hook_func: Callable) -> None: + if hook_type in self.hooks: + self.hooks[hook_type] = partial(self._execute_hook, hook_type, hook_func) + else: + raise ValueError(f"Invalid hook type: {hook_type}") + + async def _execute_hook( + self, + hook_type: str, + hook_func: Callable, + *args: Any, + **kwargs: Any + ) -> Any: + if asyncio.iscoroutinefunction(hook_func): + return await hook_func(*args, **kwargs) + return hook_func(*args, **kwargs) + + async def start(self) -> None: + if not self._session: + connector = aiohttp.TCPConnector( + limit=self.max_connections, + ttl_dns_cache=self.dns_cache_ttl, + use_dns_cache=True, + force_close=False + ) + self._session = aiohttp.ClientSession( + headers=dict(self._BASE_HEADERS), + connector=connector, + timeout=ClientTimeout(total=self.DEFAULT_TIMEOUT) + ) + + async def close(self) -> None: + if self._session and not self._session.closed: + try: + await asyncio.wait_for(self._session.close(), timeout=5.0) + except asyncio.TimeoutError: + if self.logger: + self.logger.warning( + message="Session cleanup timed out", + tag="CLEANUP" + ) + finally: + self._session = None + + async def _stream_file(self, path: str) -> AsyncGenerator[memoryview, None]: + async with aiofiles.open(path, mode='rb') as f: + while chunk := await f.read(self.chunk_size): + yield memoryview(chunk) + + async def _handle_file(self, path: str) -> AsyncCrawlResponse: + if not os.path.exists(path): + raise FileNotFoundError(f"Local file not found: {path}") + + chunks = [] + async for chunk in 
self._stream_file(path): + chunks.append(chunk.tobytes().decode('utf-8', errors='replace')) + + return AsyncCrawlResponse( + html=''.join(chunks), + response_headers={}, + status_code=200 + ) + + async def _handle_raw(self, content: str) -> AsyncCrawlResponse: + return AsyncCrawlResponse( + html=content, + response_headers={}, + status_code=200 + ) + + + async def _handle_http( + self, + url: str, + config: CrawlerRunConfig + ) -> AsyncCrawlResponse: + async with self._session_context() as session: + timeout = ClientTimeout( + total=config.page_timeout or self.DEFAULT_TIMEOUT, + connect=10, + sock_read=30 + ) + + headers = dict(self._BASE_HEADERS) + if self.browser_config.headers: + headers.update(self.browser_config.headers) + + request_kwargs = { + 'timeout': timeout, + 'allow_redirects': self.browser_config.follow_redirects, + 'ssl': self.browser_config.verify_ssl, + 'headers': headers + } + + if self.browser_config.method == "POST": + if self.browser_config.data: + request_kwargs['data'] = self.browser_config.data + if self.browser_config.json: + request_kwargs['json'] = self.browser_config.json + + await self.hooks['before_request'](url, request_kwargs) + + try: + async with session.request(self.browser_config.method, url, **request_kwargs) as response: + content = memoryview(await response.read()) + + if not (200 <= response.status < 300): + raise HTTPStatusError( + response.status, + f"Unexpected status code for {url}" + ) + + encoding = response.charset + if not encoding: + encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8' + + result = AsyncCrawlResponse( + html=content.tobytes().decode(encoding, errors='replace'), + response_headers=dict(response.headers), + status_code=response.status, + redirected_url=str(response.url) + ) + + await self.hooks['after_request'](result) + return result + + except aiohttp.ServerTimeoutError as e: + await self.hooks['on_error'](e) + raise ConnectionTimeoutError(f"Request timed out: {str(e)}") + + except aiohttp.ClientConnectorError as e: + await self.hooks['on_error'](e) + raise ConnectionError(f"Connection failed: {str(e)}") + + except aiohttp.ClientError as e: + await self.hooks['on_error'](e) + raise HTTPCrawlerError(f"HTTP client error: {str(e)}") + + except asyncio.exceptions.TimeoutError as e: + await self.hooks['on_error'](e) + raise ConnectionTimeoutError(f"Request timed out: {str(e)}") + + except Exception as e: + await self.hooks['on_error'](e) + raise HTTPCrawlerError(f"HTTP request failed: {str(e)}") + + async def crawl( + self, + url: str, + config: Optional[CrawlerRunConfig] = None, + **kwargs + ) -> AsyncCrawlResponse: + config = config or CrawlerRunConfig.from_kwargs(kwargs) + + parsed = urlparse(url) + scheme = parsed.scheme.rstrip('/') + + if scheme not in self.VALID_SCHEMES: + raise ValueError(f"Unsupported URL scheme: {scheme}") + + try: + if scheme == 'file': + return await self._handle_file(parsed.path) + elif scheme == 'raw': + return await self._handle_raw(parsed.path) + else: # http or https + return await self._handle_http(url, config) + + except Exception as e: + if self.logger: + self.logger.error( + message="Crawl failed: {error}", + tag="CRAWL", + params={"error": str(e), "url": url} + ) + raise \ No newline at end of file diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 9f1ed38d..8cb83ed4 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -21,6 +21,7 @@ from .async_logger import AsyncLogger from .ssl_certificate import 
SSLCertificate from .user_agent_generator import ValidUAGenerator from .browser_manager import BrowserManager +from .browser_adapter import BrowserAdapter, PlaywrightAdapter, UndetectedAdapter import aiofiles import aiohttp @@ -71,7 +72,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ def __init__( - self, browser_config: BrowserConfig = None, logger: AsyncLogger = None, **kwargs + self, browser_config: BrowserConfig = None, logger: AsyncLogger = None, browser_adapter: BrowserAdapter = None, **kwargs ): """ Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration. @@ -80,11 +81,16 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): browser_config (BrowserConfig): Configuration object containing browser settings. If None, will be created from kwargs for backwards compatibility. logger: Logger instance for recording events and errors. + browser_adapter (BrowserAdapter): Browser adapter for handling browser-specific operations. + If None, defaults to PlaywrightAdapter. **kwargs: Additional arguments for backwards compatibility and extending functionality. """ # Initialize browser config, either from provided object or kwargs self.browser_config = browser_config or BrowserConfig.from_kwargs(kwargs) self.logger = logger + + # Initialize browser adapter + self.adapter = browser_adapter or PlaywrightAdapter() # Initialize session management self._downloaded_files = [] @@ -104,7 +110,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Initialize browser manager with config self.browser_manager = BrowserManager( - browser_config=self.browser_config, logger=self.logger + browser_config=self.browser_config, + logger=self.logger, + use_undetected=isinstance(self.adapter, UndetectedAdapter) ) async def __aenter__(self): @@ -322,7 +330,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ try: - result = await page.evaluate(wrapper_js) + result = await self.adapter.evaluate(page, wrapper_js) return result except Exception as e: if "Error evaluating condition" in str(e): @@ -367,7 +375,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # Replace the iframe with a div containing the extracted content _iframe = iframe_content.replace("`", "\\`") - await page.evaluate( + await self.adapter.evaluate(page, f""" () => {{ const iframe = document.getElementById('iframe-{i}'); @@ -628,91 +636,16 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): page.on("requestfailed", handle_request_failed_capture) # Console Message Capturing + handle_console = None + handle_error = None if config.capture_console_messages: - def handle_console_capture(msg): - try: - message_type = "unknown" - try: - message_type = msg.type - except: - pass - - message_text = "unknown" - try: - message_text = msg.text - except: - pass - - # Basic console message with minimal content - entry = { - "type": message_type, - "text": message_text, - "timestamp": time.time() - } - - captured_console.append(entry) - - except Exception as e: - if self.logger: - self.logger.warning(f"Error capturing console message: {e}", tag="CAPTURE") - # Still add something to the list even on error - captured_console.append({ - "type": "console_capture_error", - "error": str(e), - "timestamp": time.time() - }) - - def handle_pageerror_capture(err): - try: - error_message = "Unknown error" - try: - error_message = err.message - except: - pass - - error_stack = "" - try: - error_stack = err.stack - except: - pass - - captured_console.append({ - "type": "error", - 
"text": error_message, - "stack": error_stack, - "timestamp": time.time() - }) - except Exception as e: - if self.logger: - self.logger.warning(f"Error capturing page error: {e}", tag="CAPTURE") - captured_console.append({ - "type": "pageerror_capture_error", - "error": str(e), - "timestamp": time.time() - }) - - # Add event listeners directly - page.on("console", handle_console_capture) - page.on("pageerror", handle_pageerror_capture) + # Set up console capture using adapter + handle_console = await self.adapter.setup_console_capture(page, captured_console) + handle_error = await self.adapter.setup_error_capture(page, captured_console) # Set up console logging if requested - if config.log_console: - def log_consol( - msg, console_log_type="debug" - ): # Corrected the parameter syntax - if console_log_type == "error": - self.logger.error( - message=f"Console error: {msg}", # Use f-string for variable interpolation - tag="CONSOLE" - ) - elif console_log_type == "debug": - self.logger.debug( - message=f"Console: {msg}", # Use f-string for variable interpolation - tag="CONSOLE" - ) - - page.on("console", log_consol) - page.on("pageerror", lambda e: log_consol(e, "error")) + # Note: For undetected browsers, console logging won't work directly + # but captured messages can still be logged after retrieval try: # Get SSL certificate information if requested and URL is HTTPS @@ -998,7 +931,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await page.wait_for_load_state("domcontentloaded", timeout=5) except PlaywrightTimeoutError: pass - await page.evaluate(update_image_dimensions_js) + await self.adapter.evaluate(page, update_image_dimensions_js) except Exception as e: self.logger.error( message="Error updating image dimensions: {error}", @@ -1027,7 +960,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): for selector in selectors: try: - content = await page.evaluate( + content = await self.adapter.evaluate(page, f"""Array.from(document.querySelectorAll("{selector}")) .map(el => el.outerHTML) .join('')""" @@ -1085,6 +1018,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): await asyncio.sleep(delay) return await page.content() + # For undetected browsers, retrieve console messages before returning + if config.capture_console_messages and hasattr(self.adapter, 'retrieve_console_messages'): + final_messages = await self.adapter.retrieve_console_messages(page) + captured_console.extend(final_messages) + # Return complete response return AsyncCrawlResponse( html=html, @@ -1123,8 +1061,13 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): page.remove_listener("response", handle_response_capture) page.remove_listener("requestfailed", handle_request_failed_capture) if config.capture_console_messages: - page.remove_listener("console", handle_console_capture) - page.remove_listener("pageerror", handle_pageerror_capture) + # Retrieve any final console messages for undetected browsers + if hasattr(self.adapter, 'retrieve_console_messages'): + final_messages = await self.adapter.retrieve_console_messages(page) + captured_console.extend(final_messages) + + # Clean up console capture + await self.adapter.cleanup_console_capture(page, handle_console, handle_error) # Close the page await page.close() @@ -1354,7 +1297,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): """ # Execute virtual scroll capture - result = await page.evaluate(virtual_scroll_js, config.to_dict()) + result = await self.adapter.evaluate(page, virtual_scroll_js, 
config.to_dict()) if result.get("replaced", False): self.logger.success( @@ -1438,7 +1381,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): remove_overlays_js = load_js_script("remove_overlay_elements") try: - await page.evaluate( + await self.adapter.evaluate(page, f""" (() => {{ try {{ @@ -1843,7 +1786,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): # When {script} contains statements (e.g., const link = …; link.click();), # this forms invalid JavaScript, causing Playwright execution error: SyntaxError: Unexpected token 'const'. # """ - result = await page.evaluate( + result = await self.adapter.evaluate(page, f""" (async () => {{ try {{ @@ -1965,7 +1908,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): for script in scripts: try: # Execute the script and wait for network idle - result = await page.evaluate( + result = await self.adapter.evaluate(page, f""" (() => {{ return new Promise((resolve) => {{ @@ -2049,7 +1992,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): Returns: Boolean indicating visibility """ - return await page.evaluate( + return await self.adapter.evaluate(page, """ () => { const element = document.body; @@ -2090,7 +2033,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): Dict containing scroll status and position information """ try: - result = await page.evaluate( + result = await self.adapter.evaluate(page, f"""() => {{ try {{ const startX = window.scrollX; @@ -2147,7 +2090,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): Returns: Dict containing width and height of the page """ - return await page.evaluate( + return await self.adapter.evaluate(page, """ () => { const {scrollWidth, scrollHeight} = document.documentElement; @@ -2167,7 +2110,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): bool: True if page needs scrolling """ try: - need_scroll = await page.evaluate( + need_scroll = await self.adapter.evaluate(page, """ () => { const scrollHeight = document.documentElement.scrollHeight; @@ -2186,265 +2129,3 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): return True # Default to scrolling if check fails -#################################################################################################### -# HTTP Crawler Strategy -#################################################################################################### - -class HTTPCrawlerError(Exception): - """Base error class for HTTP crawler specific exceptions""" - pass - - -class ConnectionTimeoutError(HTTPCrawlerError): - """Raised when connection timeout occurs""" - pass - - -class HTTPStatusError(HTTPCrawlerError): - """Raised for unexpected status codes""" - def __init__(self, status_code: int, message: str): - self.status_code = status_code - super().__init__(f"HTTP {status_code}: {message}") - - -class AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy): - """ - Fast, lightweight HTTP-only crawler strategy optimized for memory efficiency. 
- """ - - __slots__ = ('logger', 'max_connections', 'dns_cache_ttl', 'chunk_size', '_session', 'hooks', 'browser_config') - - DEFAULT_TIMEOUT: Final[int] = 30 - DEFAULT_CHUNK_SIZE: Final[int] = 64 * 1024 - DEFAULT_MAX_CONNECTIONS: Final[int] = min(32, (os.cpu_count() or 1) * 4) - DEFAULT_DNS_CACHE_TTL: Final[int] = 300 - VALID_SCHEMES: Final = frozenset({'http', 'https', 'file', 'raw'}) - - _BASE_HEADERS: Final = MappingProxyType({ - 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - 'Accept-Encoding': 'gzip, deflate, br', - 'Connection': 'keep-alive', - 'Upgrade-Insecure-Requests': '1', - 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36' - }) - - def __init__( - self, - browser_config: Optional[HTTPCrawlerConfig] = None, - logger: Optional[AsyncLogger] = None, - max_connections: int = DEFAULT_MAX_CONNECTIONS, - dns_cache_ttl: int = DEFAULT_DNS_CACHE_TTL, - chunk_size: int = DEFAULT_CHUNK_SIZE - ): - """Initialize the HTTP crawler with config""" - self.browser_config = browser_config or HTTPCrawlerConfig() - self.logger = logger - self.max_connections = max_connections - self.dns_cache_ttl = dns_cache_ttl - self.chunk_size = chunk_size - self._session: Optional[aiohttp.ClientSession] = None - - self.hooks = { - k: partial(self._execute_hook, k) - for k in ('before_request', 'after_request', 'on_error') - } - - # Set default hooks - self.set_hook('before_request', lambda *args, **kwargs: None) - self.set_hook('after_request', lambda *args, **kwargs: None) - self.set_hook('on_error', lambda *args, **kwargs: None) - - - async def __aenter__(self) -> AsyncHTTPCrawlerStrategy: - await self.start() - return self - - async def __aexit__(self, exc_type, exc_val, exc_tb) -> None: - await self.close() - - @contextlib.asynccontextmanager - async def _session_context(self): - try: - if not self._session: - await self.start() - yield self._session - finally: - pass - - def set_hook(self, hook_type: str, hook_func: Callable) -> None: - if hook_type in self.hooks: - self.hooks[hook_type] = partial(self._execute_hook, hook_type, hook_func) - else: - raise ValueError(f"Invalid hook type: {hook_type}") - - async def _execute_hook( - self, - hook_type: str, - hook_func: Callable, - *args: Any, - **kwargs: Any - ) -> Any: - if asyncio.iscoroutinefunction(hook_func): - return await hook_func(*args, **kwargs) - return hook_func(*args, **kwargs) - - async def start(self) -> None: - if not self._session: - connector = aiohttp.TCPConnector( - limit=self.max_connections, - ttl_dns_cache=self.dns_cache_ttl, - use_dns_cache=True, - force_close=False - ) - self._session = aiohttp.ClientSession( - headers=dict(self._BASE_HEADERS), - connector=connector, - timeout=ClientTimeout(total=self.DEFAULT_TIMEOUT) - ) - - async def close(self) -> None: - if self._session and not self._session.closed: - try: - await asyncio.wait_for(self._session.close(), timeout=5.0) - except asyncio.TimeoutError: - if self.logger: - self.logger.warning( - message="Session cleanup timed out", - tag="CLEANUP" - ) - finally: - self._session = None - - async def _stream_file(self, path: str) -> AsyncGenerator[memoryview, None]: - async with aiofiles.open(path, mode='rb') as f: - while chunk := await f.read(self.chunk_size): - yield memoryview(chunk) - - async def _handle_file(self, path: str) -> AsyncCrawlResponse: - if not os.path.exists(path): - raise FileNotFoundError(f"Local file not found: {path}") - - chunks = [] - async for chunk in 
self._stream_file(path): - chunks.append(chunk.tobytes().decode('utf-8', errors='replace')) - - return AsyncCrawlResponse( - html=''.join(chunks), - response_headers={}, - status_code=200 - ) - - async def _handle_raw(self, content: str) -> AsyncCrawlResponse: - return AsyncCrawlResponse( - html=content, - response_headers={}, - status_code=200 - ) - - - async def _handle_http( - self, - url: str, - config: CrawlerRunConfig - ) -> AsyncCrawlResponse: - async with self._session_context() as session: - timeout = ClientTimeout( - total=config.page_timeout or self.DEFAULT_TIMEOUT, - connect=10, - sock_read=30 - ) - - headers = dict(self._BASE_HEADERS) - if self.browser_config.headers: - headers.update(self.browser_config.headers) - - request_kwargs = { - 'timeout': timeout, - 'allow_redirects': self.browser_config.follow_redirects, - 'ssl': self.browser_config.verify_ssl, - 'headers': headers - } - - if self.browser_config.method == "POST": - if self.browser_config.data: - request_kwargs['data'] = self.browser_config.data - if self.browser_config.json: - request_kwargs['json'] = self.browser_config.json - - await self.hooks['before_request'](url, request_kwargs) - - try: - async with session.request(self.browser_config.method, url, **request_kwargs) as response: - content = memoryview(await response.read()) - - if not (200 <= response.status < 300): - raise HTTPStatusError( - response.status, - f"Unexpected status code for {url}" - ) - - encoding = response.charset - if not encoding: - encoding = chardet.detect(content.tobytes())['encoding'] or 'utf-8' - - result = AsyncCrawlResponse( - html=content.tobytes().decode(encoding, errors='replace'), - response_headers=dict(response.headers), - status_code=response.status, - redirected_url=str(response.url) - ) - - await self.hooks['after_request'](result) - return result - - except aiohttp.ServerTimeoutError as e: - await self.hooks['on_error'](e) - raise ConnectionTimeoutError(f"Request timed out: {str(e)}") - - except aiohttp.ClientConnectorError as e: - await self.hooks['on_error'](e) - raise ConnectionError(f"Connection failed: {str(e)}") - - except aiohttp.ClientError as e: - await self.hooks['on_error'](e) - raise HTTPCrawlerError(f"HTTP client error: {str(e)}") - - except asyncio.exceptions.TimeoutError as e: - await self.hooks['on_error'](e) - raise ConnectionTimeoutError(f"Request timed out: {str(e)}") - - except Exception as e: - await self.hooks['on_error'](e) - raise HTTPCrawlerError(f"HTTP request failed: {str(e)}") - - async def crawl( - self, - url: str, - config: Optional[CrawlerRunConfig] = None, - **kwargs - ) -> AsyncCrawlResponse: - config = config or CrawlerRunConfig.from_kwargs(kwargs) - - parsed = urlparse(url) - scheme = parsed.scheme.rstrip('/') - - if scheme not in self.VALID_SCHEMES: - raise ValueError(f"Unsupported URL scheme: {scheme}") - - try: - if scheme == 'file': - return await self._handle_file(parsed.path) - elif scheme == 'raw': - return await self._handle_raw(parsed.path) - else: # http or https - return await self._handle_http(url, config) - - except Exception as e: - if self.logger: - self.logger.error( - message="Crawl failed: {error}", - tag="CRAWL", - params={"error": str(e), "url": url} - ) - raise \ No newline at end of file diff --git a/crawl4ai/browser_adapter.py b/crawl4ai/browser_adapter.py new file mode 100644 index 00000000..85fef16e --- /dev/null +++ b/crawl4ai/browser_adapter.py @@ -0,0 +1,293 @@ +# browser_adapter.py +""" +Browser adapter for Crawl4AI to support both Playwright and undetected 
browsers +with minimal changes to existing codebase. +""" + +from abc import ABC, abstractmethod +from typing import List, Dict, Any, Optional, Callable +import time +import json + +# Import both, but use conditionally +try: + from playwright.async_api import Page +except ImportError: + Page = Any + +try: + from patchright.async_api import Page as UndetectedPage +except ImportError: + UndetectedPage = Any + + +class BrowserAdapter(ABC): + """Abstract adapter for browser-specific operations""" + + @abstractmethod + async def evaluate(self, page: Page, expression: str, arg: Any = None) -> Any: + """Execute JavaScript in the page""" + pass + + @abstractmethod + async def setup_console_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]: + """Setup console message capturing, returns handler function if needed""" + pass + + @abstractmethod + async def setup_error_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]: + """Setup error capturing, returns handler function if needed""" + pass + + @abstractmethod + async def retrieve_console_messages(self, page: Page) -> List[Dict]: + """Retrieve captured console messages (for undetected browsers)""" + pass + + @abstractmethod + async def cleanup_console_capture(self, page: Page, handle_console: Optional[Callable], handle_error: Optional[Callable]): + """Clean up console event listeners""" + pass + + @abstractmethod + def get_imports(self) -> tuple: + """Get the appropriate imports for this adapter""" + pass + + +class PlaywrightAdapter(BrowserAdapter): + """Adapter for standard Playwright""" + + async def evaluate(self, page: Page, expression: str, arg: Any = None) -> Any: + """Standard Playwright evaluate""" + if arg is not None: + return await page.evaluate(expression, arg) + return await page.evaluate(expression) + + async def setup_console_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]: + """Setup console capture using Playwright's event system""" + def handle_console_capture(msg): + try: + message_type = "unknown" + try: + message_type = msg.type + except: + pass + + message_text = "unknown" + try: + message_text = msg.text + except: + pass + + entry = { + "type": message_type, + "text": message_text, + "timestamp": time.time() + } + + captured_console.append(entry) + + except Exception as e: + captured_console.append({ + "type": "console_capture_error", + "error": str(e), + "timestamp": time.time() + }) + + page.on("console", handle_console_capture) + return handle_console_capture + + async def setup_error_capture(self, page: Page, captured_console: List[Dict]) -> Optional[Callable]: + """Setup error capture using Playwright's event system""" + def handle_pageerror_capture(err): + try: + error_message = "Unknown error" + try: + error_message = err.message + except: + pass + + error_stack = "" + try: + error_stack = err.stack + except: + pass + + captured_console.append({ + "type": "error", + "text": error_message, + "stack": error_stack, + "timestamp": time.time() + }) + except Exception as e: + captured_console.append({ + "type": "pageerror_capture_error", + "error": str(e), + "timestamp": time.time() + }) + + page.on("pageerror", handle_pageerror_capture) + return handle_pageerror_capture + + async def retrieve_console_messages(self, page: Page) -> List[Dict]: + """Not needed for Playwright - messages are captured via events""" + return [] + + async def cleanup_console_capture(self, page: Page, handle_console: Optional[Callable], handle_error: 
Optional[Callable]): + """Remove event listeners""" + if handle_console: + page.remove_listener("console", handle_console) + if handle_error: + page.remove_listener("pageerror", handle_error) + + def get_imports(self) -> tuple: + """Return Playwright imports""" + from playwright.async_api import Page, Error + from playwright.async_api import TimeoutError as PlaywrightTimeoutError + return Page, Error, PlaywrightTimeoutError + + +class UndetectedAdapter(BrowserAdapter): + """Adapter for undetected browser automation with stealth features""" + + def __init__(self): + self._console_script_injected = {} + + async def evaluate(self, page: UndetectedPage, expression: str, arg: Any = None) -> Any: + """Undetected browser evaluate with isolated context""" + # For most evaluations, use isolated context for stealth + # Only use non-isolated when we need to access our injected console capture + isolated = not ( + "__console" in expression or + "__captured" in expression or + "__error" in expression or + "window.__" in expression + ) + + if arg is not None: + return await page.evaluate(expression, arg, isolated_context=isolated) + return await page.evaluate(expression, isolated_context=isolated) + + async def setup_console_capture(self, page: UndetectedPage, captured_console: List[Dict]) -> Optional[Callable]: + """Setup console capture using JavaScript injection for undetected browsers""" + if not self._console_script_injected.get(page, False): + await page.add_init_script(""" + // Initialize console capture + window.__capturedConsole = []; + window.__capturedErrors = []; + + // Store original console methods + const originalConsole = {}; + ['log', 'info', 'warn', 'error', 'debug'].forEach(method => { + originalConsole[method] = console[method]; + console[method] = function(...args) { + try { + window.__capturedConsole.push({ + type: method, + text: args.map(arg => { + try { + if (typeof arg === 'object') { + return JSON.stringify(arg); + } + return String(arg); + } catch (e) { + return '[Object]'; + } + }).join(' '), + timestamp: Date.now() + }); + } catch (e) { + // Fail silently to avoid detection + } + + // Call original method + originalConsole[method].apply(console, args); + }; + }); + """) + self._console_script_injected[page] = True + + return None # No handler function needed for undetected browser + + async def setup_error_capture(self, page: UndetectedPage, captured_console: List[Dict]) -> Optional[Callable]: + """Setup error capture using JavaScript injection for undetected browsers""" + if not self._console_script_injected.get(page, False): + await page.add_init_script(""" + // Capture errors + window.addEventListener('error', (event) => { + try { + window.__capturedErrors.push({ + type: 'error', + text: event.message, + stack: event.error ? event.error.stack : '', + filename: event.filename, + lineno: event.lineno, + colno: event.colno, + timestamp: Date.now() + }); + } catch (e) { + // Fail silently + } + }); + + // Capture unhandled promise rejections + window.addEventListener('unhandledrejection', (event) => { + try { + window.__capturedErrors.push({ + type: 'unhandledrejection', + text: event.reason ? String(event.reason) : 'Unhandled Promise Rejection', + stack: event.reason && event.reason.stack ? 
event.reason.stack : '', + timestamp: Date.now() + }); + } catch (e) { + // Fail silently + } + }); + """) + self._console_script_injected[page] = True + + return None # No handler function needed for undetected browser + + async def retrieve_console_messages(self, page: UndetectedPage) -> List[Dict]: + """Retrieve captured console messages and errors from the page""" + messages = [] + + try: + # Get console messages + console_messages = await page.evaluate( + "() => { const msgs = window.__capturedConsole || []; window.__capturedConsole = []; return msgs; }", + isolated_context=False + ) + messages.extend(console_messages) + + # Get errors + errors = await page.evaluate( + "() => { const errs = window.__capturedErrors || []; window.__capturedErrors = []; return errs; }", + isolated_context=False + ) + messages.extend(errors) + + # Convert timestamps from JS to Python format + for msg in messages: + if 'timestamp' in msg and isinstance(msg['timestamp'], (int, float)): + msg['timestamp'] = msg['timestamp'] / 1000.0 # Convert from ms to seconds + + except Exception: + # If retrieval fails, return empty list + pass + + return messages + + async def cleanup_console_capture(self, page: UndetectedPage, handle_console: Optional[Callable], handle_error: Optional[Callable]): + """Clean up for undetected browser - retrieve final messages""" + # For undetected browser, we don't have event listeners to remove + # but we should retrieve any final messages + final_messages = await self.retrieve_console_messages(page) + return final_messages + + def get_imports(self) -> tuple: + """Return undetected browser imports""" + from patchright.async_api import Page, Error + from patchright.async_api import TimeoutError as PlaywrightTimeoutError + return Page, Error, PlaywrightTimeoutError \ No newline at end of file diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 08c1f52f..70ed20e4 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -573,21 +573,26 @@ class BrowserManager: _playwright_instance = None @classmethod - async def get_playwright(cls): - from playwright.async_api import async_playwright + async def get_playwright(cls, use_undetected: bool = False): + if use_undetected: + from patchright.async_api import async_playwright + else: + from playwright.async_api import async_playwright cls._playwright_instance = await async_playwright().start() return cls._playwright_instance - def __init__(self, browser_config: BrowserConfig, logger=None): + def __init__(self, browser_config: BrowserConfig, logger=None, use_undetected: bool = False): """ Initialize the BrowserManager with a browser configuration. 
Args: browser_config (BrowserConfig): Configuration object containing all browser settings logger: Logger instance for recording events and errors + use_undetected (bool): Whether to use undetected browser (Patchright) """ self.config: BrowserConfig = browser_config self.logger = logger + self.use_undetected = use_undetected # Browser state self.browser = None @@ -601,7 +606,11 @@ class BrowserManager: # Keep track of contexts by a "config signature," so each unique config reuses a single context self.contexts_by_config = {} - self._contexts_lock = asyncio.Lock() + self._contexts_lock = asyncio.Lock() + + # Stealth-related attributes + self._stealth_instance = None + self._stealth_cm = None # Initialize ManagedBrowser if needed if self.config.use_managed_browser: @@ -630,9 +639,21 @@ class BrowserManager: if self.playwright is not None: await self.close() - from playwright.async_api import async_playwright + if self.use_undetected: + from patchright.async_api import async_playwright + else: + from playwright.async_api import async_playwright - self.playwright = await async_playwright().start() + # Initialize playwright with or without stealth + if self.config.enable_stealth and not self.use_undetected: + # Import stealth only when needed + from playwright_stealth import Stealth + # Use the recommended stealth wrapper approach + self._stealth_instance = Stealth() + self._stealth_cm = self._stealth_instance.use_async(async_playwright()) + self.playwright = await self._stealth_cm.__aenter__() + else: + self.playwright = await async_playwright().start() if self.config.cdp_url or self.config.use_managed_browser: self.config.use_managed_browser = True @@ -1094,5 +1115,19 @@ class BrowserManager: self.managed_browser = None if self.playwright: - await self.playwright.stop() + # Handle stealth context manager cleanup if it exists + if hasattr(self, '_stealth_cm') and self._stealth_cm is not None: + try: + await self._stealth_cm.__aexit__(None, None, None) + except Exception as e: + if self.logger: + self.logger.error( + message="Error closing stealth context: {error}", + tag="ERROR", + params={"error": str(e)} + ) + self._stealth_cm = None + self._stealth_instance = None + else: + await self.playwright.stop() self.playwright = None diff --git a/crawl4ai/install.py b/crawl4ai/install.py index b2fcca78..68726ed8 100644 --- a/crawl4ai/install.py +++ b/crawl4ai/install.py @@ -119,6 +119,32 @@ def install_playwright(): logger.warning( f"Please run '{sys.executable} -m playwright install --with-deps' manually after the installation." ) + + # Install Patchright browsers for undetected browser support + logger.info("Installing Patchright browsers for undetected mode...", tag="INIT") + try: + subprocess.check_call( + [ + sys.executable, + "-m", + "patchright", + "install", + "--with-deps", + "--force", + "chromium", + ] + ) + logger.success( + "Patchright installation completed successfully.", tag="COMPLETE" + ) + except subprocess.CalledProcessError: + logger.warning( + f"Please run '{sys.executable} -m patchright install --with-deps' manually after the installation." + ) + except Exception: + logger.warning( + f"Please run '{sys.executable} -m patchright install --with-deps' manually after the installation." + ) def run_migration(): diff --git a/crawl4ai/prompts.py b/crawl4ai/prompts.py index 73aa00ac..c306308e 100644 --- a/crawl4ai/prompts.py +++ b/crawl4ai/prompts.py @@ -1056,7 +1056,7 @@ Your output must: """ -GENERATE_SCRIPT_PROMPT = """You are a world-class browser automation specialist. 
Your sole purpose is to convert a natural language objective and a snippet of HTML into the most **efficient, robust, and simple** script possible to prepare a web page for data extraction. +GENERATE_SCRIPT_PROMPT = r"""You are a world-class browser automation specialist. Your sole purpose is to convert a natural language objective and a snippet of HTML into the most **efficient, robust, and simple** script possible to prepare a web page for data extraction. Your scripts run **before the crawl** to handle dynamic content, user interactions, and other obstacles. You are a master of two tools: raw **JavaScript** and the high-level **Crawl4ai Script (c4a)**. diff --git a/docs/examples/c4a_script/api_usage_examples.py b/docs/examples/c4a_script/api_usage_examples.py index c34f5ddd..4c76cd94 100644 --- a/docs/examples/c4a_script/api_usage_examples.py +++ b/docs/examples/c4a_script/api_usage_examples.py @@ -3,8 +3,8 @@ C4A-Script API Usage Examples Shows how to use the new Result-based API in various scenarios """ -from c4a_compile import compile, validate, compile_file -from c4a_result import CompilationResult, ValidationResult +from crawl4ai.script.c4a_compile import compile, validate, compile_file +from crawl4ai.script.c4a_result import CompilationResult, ValidationResult import json diff --git a/docs/examples/c4a_script/c4a_script_hello_world.py b/docs/examples/c4a_script/c4a_script_hello_world.py index 9c71d2e0..9959c4aa 100644 --- a/docs/examples/c4a_script/c4a_script_hello_world.py +++ b/docs/examples/c4a_script/c4a_script_hello_world.py @@ -3,7 +3,7 @@ C4A-Script Hello World A concise example showing how to use the C4A-Script compiler """ -from c4a_compile import compile +from crawl4ai.script.c4a_compile import compile # Define your C4A-Script script = """ diff --git a/docs/examples/c4a_script/c4a_script_hello_world_error.py b/docs/examples/c4a_script/c4a_script_hello_world_error.py index 895d7fe8..fc3dbfb2 100644 --- a/docs/examples/c4a_script/c4a_script_hello_world_error.py +++ b/docs/examples/c4a_script/c4a_script_hello_world_error.py @@ -3,7 +3,7 @@ C4A-Script Hello World - Error Example Shows how error handling works """ -from c4a_compile import compile +from crawl4ai.script.c4a_compile import compile # Define a script with an error (missing THEN) script = """ diff --git a/docs/examples/hello_world_undetected.py b/docs/examples/hello_world_undetected.py new file mode 100644 index 00000000..83ce51ef --- /dev/null +++ b/docs/examples/hello_world_undetected.py @@ -0,0 +1,57 @@ +import asyncio +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + DefaultMarkdownGenerator, + PruningContentFilter, + CrawlResult, + UndetectedAdapter +) +from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy + + +async def main(): + # Create browser config + browser_config = BrowserConfig( + headless=False, + verbose=True, + ) + + # Create the undetected adapter + undetected_adapter = UndetectedAdapter() + + # Create the crawler strategy with the undetected adapter + crawler_strategy = AsyncPlaywrightCrawlerStrategy( + browser_config=browser_config, + browser_adapter=undetected_adapter + ) + + # Create the crawler with our custom strategy + async with AsyncWebCrawler( + crawler_strategy=crawler_strategy, + config=browser_config + ) as crawler: + # Configure the crawl + crawler_config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter() + ), + capture_console_messages=True, # Enable console capture to test 
adapter + ) + + # Test on a site that typically detects bots + print("Testing undetected adapter...") + result: CrawlResult = await crawler.arun( + url="https://www.helloworld.org", + config=crawler_config + ) + + print(f"Status: {result.status_code}") + print(f"Success: {result.success}") + print(f"Console messages captured: {len(result.console_messages or [])}") + print(f"Markdown content (first 500 chars):\n{result.markdown.raw_markdown[:500]}") + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/examples/simple_anti_bot_examples.py b/docs/examples/simple_anti_bot_examples.py new file mode 100644 index 00000000..075ee9a2 --- /dev/null +++ b/docs/examples/simple_anti_bot_examples.py @@ -0,0 +1,59 @@ +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, UndetectedAdapter +from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy + +# Example 1: Stealth Mode +async def stealth_mode_example(): + browser_config = BrowserConfig( + enable_stealth=True, + headless=False + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://example.com") + return result.html[:500] + +# Example 2: Undetected Browser +async def undetected_browser_example(): + browser_config = BrowserConfig( + headless=False + ) + + adapter = UndetectedAdapter() + strategy = AsyncPlaywrightCrawlerStrategy( + browser_config=browser_config, + browser_adapter=adapter + ) + + async with AsyncWebCrawler( + crawler_strategy=strategy, + config=browser_config + ) as crawler: + result = await crawler.arun("https://example.com") + return result.html[:500] + +# Example 3: Both Combined +async def combined_example(): + browser_config = BrowserConfig( + enable_stealth=True, + headless=False + ) + + adapter = UndetectedAdapter() + strategy = AsyncPlaywrightCrawlerStrategy( + browser_config=browser_config, + browser_adapter=adapter + ) + + async with AsyncWebCrawler( + crawler_strategy=strategy, + config=browser_config + ) as crawler: + result = await crawler.arun("https://example.com") + return result.html[:500] + +# Run examples +if __name__ == "__main__": + asyncio.run(stealth_mode_example()) + asyncio.run(undetected_browser_example()) + asyncio.run(combined_example()) \ No newline at end of file diff --git a/docs/examples/stealth_mode_example.py b/docs/examples/stealth_mode_example.py new file mode 100644 index 00000000..baf735db --- /dev/null +++ b/docs/examples/stealth_mode_example.py @@ -0,0 +1,522 @@ +""" +Stealth Mode Example with Crawl4AI + +This example demonstrates how to use the stealth mode feature to bypass basic bot detection. +The stealth mode uses playwright-stealth to modify browser fingerprints and behaviors +that are commonly used to detect automated browsers. + +Key features demonstrated: +1. Comparing crawling with and without stealth mode +2. Testing against bot detection sites +3. Accessing sites that block automated browsers +4. 
Best practices for stealth crawling +""" + +import asyncio +import json +from typing import Dict, Any +from colorama import Fore, Style, init + +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from crawl4ai.async_logger import AsyncLogger + +# Initialize colorama for colored output +init() + +# Create a logger for better output +logger = AsyncLogger(verbose=True) + + +async def test_bot_detection(use_stealth: bool = False) -> Dict[str, Any]: + """Test against a bot detection service""" + + logger.info( + f"Testing bot detection with stealth={'ON' if use_stealth else 'OFF'}", + tag="STEALTH" + ) + + # Configure browser with or without stealth + browser_config = BrowserConfig( + headless=False, # Use False to see the browser in action + enable_stealth=use_stealth, + viewport_width=1280, + viewport_height=800 + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + # JavaScript to extract bot detection results + detection_script = """ + // Comprehensive bot detection checks + (() => { + const detectionResults = { + // Basic WebDriver detection + webdriver: navigator.webdriver, + + // Chrome specific + chrome: !!window.chrome, + chromeRuntime: !!window.chrome?.runtime, + + // Automation indicators + automationControlled: navigator.webdriver, + + // Permissions API + permissionsPresent: !!navigator.permissions?.query, + + // Plugins + pluginsLength: navigator.plugins.length, + pluginsArray: Array.from(navigator.plugins).map(p => p.name), + + // Languages + languages: navigator.languages, + language: navigator.language, + + // User agent + userAgent: navigator.userAgent, + + // Screen and window properties + screen: { + width: screen.width, + height: screen.height, + availWidth: screen.availWidth, + availHeight: screen.availHeight, + colorDepth: screen.colorDepth, + pixelDepth: screen.pixelDepth + }, + + // WebGL vendor + webglVendor: (() => { + try { + const canvas = document.createElement('canvas'); + const gl = canvas.getContext('webgl') || canvas.getContext('experimental-webgl'); + const ext = gl.getExtension('WEBGL_debug_renderer_info'); + return gl.getParameter(ext.UNMASKED_VENDOR_WEBGL); + } catch (e) { + return 'Error'; + } + })(), + + // Platform + platform: navigator.platform, + + // Hardware concurrency + hardwareConcurrency: navigator.hardwareConcurrency, + + // Device memory + deviceMemory: navigator.deviceMemory, + + // Connection + connection: navigator.connection?.effectiveType + }; + + // Log results for console capture + console.log('DETECTION_RESULTS:', JSON.stringify(detectionResults, null, 2)); + + // Return results + return detectionResults; + })(); + """ + + # Crawl bot detection test page + config = CrawlerRunConfig( + js_code=detection_script, + capture_console_messages=True, + wait_until="networkidle", + delay_before_return_html=2.0 # Give time for all checks to complete + ) + + result = await crawler.arun( + url="https://bot.sannysoft.com", + config=config + ) + + if result.success: + # Extract detection results from console + detection_data = None + for msg in result.console_messages or []: + if "DETECTION_RESULTS:" in msg.get("text", ""): + try: + json_str = msg["text"].replace("DETECTION_RESULTS:", "").strip() + detection_data = json.loads(json_str) + except: + pass + + # Also try to get from JavaScript execution result + if not detection_data and result.js_execution_result: + detection_data = result.js_execution_result + + return { + "success": True, + "url": result.url, + "detection_data": detection_data, + "page_title": 
result.metadata.get("title", ""), + "stealth_enabled": use_stealth + } + else: + return { + "success": False, + "error": result.error_message, + "stealth_enabled": use_stealth + } + + +async def test_cloudflare_site(use_stealth: bool = False) -> Dict[str, Any]: + """Test accessing a Cloudflare-protected site""" + + logger.info( + f"Testing Cloudflare site with stealth={'ON' if use_stealth else 'OFF'}", + tag="STEALTH" + ) + + browser_config = BrowserConfig( + headless=True, # Cloudflare detection works better in headless mode with stealth + enable_stealth=use_stealth, + viewport_width=1920, + viewport_height=1080 + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + config = CrawlerRunConfig( + wait_until="networkidle", + page_timeout=30000, # 30 seconds + delay_before_return_html=3.0 + ) + + # Test on a site that often shows Cloudflare challenges + result = await crawler.arun( + url="https://nowsecure.nl", + config=config + ) + + # Check if we hit Cloudflare challenge + cloudflare_detected = False + if result.html: + cloudflare_indicators = [ + "Checking your browser", + "Just a moment", + "cf-browser-verification", + "cf-challenge", + "ray ID" + ] + cloudflare_detected = any(indicator in result.html for indicator in cloudflare_indicators) + + return { + "success": result.success, + "url": result.url, + "cloudflare_challenge": cloudflare_detected, + "status_code": result.status_code, + "page_title": result.metadata.get("title", "") if result.metadata else "", + "stealth_enabled": use_stealth, + "html_snippet": result.html[:500] if result.html else "" + } + + +async def test_anti_bot_site(use_stealth: bool = False) -> Dict[str, Any]: + """Test against sites with anti-bot measures""" + + logger.info( + f"Testing anti-bot site with stealth={'ON' if use_stealth else 'OFF'}", + tag="STEALTH" + ) + + browser_config = BrowserConfig( + headless=False, + enable_stealth=use_stealth, + # Additional browser arguments that help with stealth + extra_args=[ + "--disable-blink-features=AutomationControlled", + "--disable-features=site-per-process" + ] if not use_stealth else [] # These are automatically applied with stealth + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + # Some sites check for specific behaviors + behavior_script = """ + (async () => { + // Simulate human-like behavior + const sleep = ms => new Promise(resolve => setTimeout(resolve, ms)); + + // Random mouse movement + const moveX = Math.random() * 100; + const moveY = Math.random() * 100; + + // Simulate reading time + await sleep(1000 + Math.random() * 2000); + + // Scroll slightly + window.scrollBy(0, 100 + Math.random() * 200); + + console.log('Human behavior simulation complete'); + return true; + })() + """ + + config = CrawlerRunConfig( + js_code=behavior_script, + wait_until="networkidle", + delay_before_return_html=5.0, # Longer delay to appear more human + capture_console_messages=True + ) + + # Test on a site that implements anti-bot measures + result = await crawler.arun( + url="https://www.g2.com/", + config=config + ) + + # Check for common anti-bot blocks + blocked_indicators = [ + "Access Denied", + "403 Forbidden", + "Security Check", + "Verify you are human", + "captcha", + "challenge" + ] + + blocked = False + if result.html: + blocked = any(indicator.lower() in result.html.lower() for indicator in blocked_indicators) + + return { + "success": result.success and not blocked, + "url": result.url, + "blocked": blocked, + "status_code": result.status_code, + "page_title": 
result.metadata.get("title", "") if result.metadata else "", + "stealth_enabled": use_stealth + } + + +async def compare_results(): + """Run all tests with and without stealth mode and compare results""" + + print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}") + print(f"{Fore.CYAN}Crawl4AI Stealth Mode Comparison{Style.RESET_ALL}") + print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}\n") + + # Test 1: Bot Detection + print(f"{Fore.YELLOW}1. Bot Detection Test (bot.sannysoft.com){Style.RESET_ALL}") + print("-" * 40) + + # Without stealth + regular_detection = await test_bot_detection(use_stealth=False) + if regular_detection["success"] and regular_detection["detection_data"]: + print(f"{Fore.RED}Without Stealth:{Style.RESET_ALL}") + data = regular_detection["detection_data"] + print(f" • WebDriver detected: {data.get('webdriver', 'Unknown')}") + print(f" • Chrome: {data.get('chrome', 'Unknown')}") + print(f" • Languages: {data.get('languages', 'Unknown')}") + print(f" • Plugins: {data.get('pluginsLength', 'Unknown')}") + print(f" • User Agent: {data.get('userAgent', 'Unknown')[:60]}...") + + # With stealth + stealth_detection = await test_bot_detection(use_stealth=True) + if stealth_detection["success"] and stealth_detection["detection_data"]: + print(f"\n{Fore.GREEN}With Stealth:{Style.RESET_ALL}") + data = stealth_detection["detection_data"] + print(f" • WebDriver detected: {data.get('webdriver', 'Unknown')}") + print(f" • Chrome: {data.get('chrome', 'Unknown')}") + print(f" • Languages: {data.get('languages', 'Unknown')}") + print(f" • Plugins: {data.get('pluginsLength', 'Unknown')}") + print(f" • User Agent: {data.get('userAgent', 'Unknown')[:60]}...") + + # Test 2: Cloudflare Site + print(f"\n\n{Fore.YELLOW}2. Cloudflare Protected Site Test{Style.RESET_ALL}") + print("-" * 40) + + # Without stealth + regular_cf = await test_cloudflare_site(use_stealth=False) + print(f"{Fore.RED}Without Stealth:{Style.RESET_ALL}") + print(f" • Success: {regular_cf['success']}") + print(f" • Cloudflare Challenge: {regular_cf['cloudflare_challenge']}") + print(f" • Status Code: {regular_cf['status_code']}") + print(f" • Page Title: {regular_cf['page_title']}") + + # With stealth + stealth_cf = await test_cloudflare_site(use_stealth=True) + print(f"\n{Fore.GREEN}With Stealth:{Style.RESET_ALL}") + print(f" • Success: {stealth_cf['success']}") + print(f" • Cloudflare Challenge: {stealth_cf['cloudflare_challenge']}") + print(f" • Status Code: {stealth_cf['status_code']}") + print(f" • Page Title: {stealth_cf['page_title']}") + + # Test 3: Anti-bot Site + print(f"\n\n{Fore.YELLOW}3. 
Anti-Bot Site Test{Style.RESET_ALL}") + print("-" * 40) + + # Without stealth + regular_antibot = await test_anti_bot_site(use_stealth=False) + print(f"{Fore.RED}Without Stealth:{Style.RESET_ALL}") + print(f" • Success: {regular_antibot['success']}") + print(f" • Blocked: {regular_antibot['blocked']}") + print(f" • Status Code: {regular_antibot['status_code']}") + print(f" • Page Title: {regular_antibot['page_title']}") + + # With stealth + stealth_antibot = await test_anti_bot_site(use_stealth=True) + print(f"\n{Fore.GREEN}With Stealth:{Style.RESET_ALL}") + print(f" • Success: {stealth_antibot['success']}") + print(f" • Blocked: {stealth_antibot['blocked']}") + print(f" • Status Code: {stealth_antibot['status_code']}") + print(f" • Page Title: {stealth_antibot['page_title']}") + + # Summary + print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}") + print(f"{Fore.CYAN}Summary:{Style.RESET_ALL}") + print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}") + print(f"\nStealth mode helps bypass basic bot detection by:") + print(f" • Hiding webdriver property") + print(f" • Modifying browser fingerprints") + print(f" • Adjusting navigator properties") + print(f" • Emulating real browser plugin behavior") + print(f"\n{Fore.YELLOW}Note:{Style.RESET_ALL} Stealth mode is not a silver bullet.") + print(f"Advanced anti-bot systems may still detect automation.") + print(f"Always respect robots.txt and website terms of service.") + + +async def stealth_best_practices(): + """Demonstrate best practices for using stealth mode""" + + print(f"\n\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}") + print(f"{Fore.CYAN}Stealth Mode Best Practices{Style.RESET_ALL}") + print(f"{Fore.CYAN}{'='*60}{Style.RESET_ALL}\n") + + # Best Practice 1: Combine with realistic behavior + print(f"{Fore.YELLOW}1. Combine with Realistic Behavior:{Style.RESET_ALL}") + + browser_config = BrowserConfig( + headless=False, + enable_stealth=True, + viewport_width=1920, + viewport_height=1080 + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + # Simulate human-like behavior + human_behavior_script = """ + (async () => { + // Wait random time between actions + const randomWait = () => Math.random() * 2000 + 1000; + + // Simulate reading + await new Promise(resolve => setTimeout(resolve, randomWait())); + + // Smooth scroll + const smoothScroll = async () => { + const totalHeight = document.body.scrollHeight; + const viewHeight = window.innerHeight; + let currentPosition = 0; + + while (currentPosition < totalHeight - viewHeight) { + const scrollAmount = Math.random() * 300 + 100; + window.scrollBy({ + top: scrollAmount, + behavior: 'smooth' + }); + currentPosition += scrollAmount; + await new Promise(resolve => setTimeout(resolve, randomWait())); + } + }; + + await smoothScroll(); + console.log('Human-like behavior simulation completed'); + return true; + })() + """ + + config = CrawlerRunConfig( + js_code=human_behavior_script, + wait_until="networkidle", + delay_before_return_html=3.0, + capture_console_messages=True + ) + + result = await crawler.arun( + url="https://example.com", + config=config + ) + + print(f" ✓ Simulated human-like scrolling and reading patterns") + print(f" ✓ Added random delays between actions") + print(f" ✓ Result: {result.success}") + + # Best Practice 2: Use appropriate viewport and user agent + print(f"\n{Fore.YELLOW}2. 
Use Realistic Viewport and User Agent:{Style.RESET_ALL}") + + # Get a realistic user agent + from crawl4ai.user_agent_generator import UserAgentGenerator + ua_generator = UserAgentGenerator() + + browser_config = BrowserConfig( + headless=True, + enable_stealth=True, + viewport_width=1920, + viewport_height=1080, + user_agent=ua_generator.generate(device_type="desktop", browser_type="chrome") + ) + + print(f" ✓ Using realistic viewport: 1920x1080") + print(f" ✓ Using current Chrome user agent") + print(f" ✓ Stealth mode will ensure consistency") + + # Best Practice 3: Manage request rate + print(f"\n{Fore.YELLOW}3. Manage Request Rate:{Style.RESET_ALL}") + print(f" ✓ Add delays between requests") + print(f" ✓ Randomize timing patterns") + print(f" ✓ Respect robots.txt") + + # Best Practice 4: Session management + print(f"\n{Fore.YELLOW}4. Use Session Management:{Style.RESET_ALL}") + + browser_config = BrowserConfig( + headless=False, + enable_stealth=True + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + # Create a session for multiple requests + session_id = "stealth_session_1" + + config = CrawlerRunConfig( + session_id=session_id, + wait_until="domcontentloaded" + ) + + # First request + result1 = await crawler.arun( + url="https://example.com", + config=config + ) + + # Subsequent request reuses the same browser context + result2 = await crawler.arun( + url="https://example.com/about", + config=config + ) + + print(f" ✓ Reused browser session for multiple requests") + print(f" ✓ Maintains cookies and state between requests") + print(f" ✓ More efficient and realistic browsing pattern") + + print(f"\n{Fore.CYAN}{'='*60}{Style.RESET_ALL}") + + +async def main(): + """Run all examples""" + + # Run comparison tests + await compare_results() + + # Show best practices + await stealth_best_practices() + + print(f"\n{Fore.GREEN}Examples completed!{Style.RESET_ALL}") + print(f"\n{Fore.YELLOW}Remember:{Style.RESET_ALL}") + print(f"• Stealth mode helps with basic bot detection") + print(f"• Always respect website terms of service") + print(f"• Consider rate limiting and ethical scraping practices") + print(f"• For advanced protection, consider additional measures") + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/examples/stealth_mode_quick_start.py b/docs/examples/stealth_mode_quick_start.py new file mode 100644 index 00000000..ee189131 --- /dev/null +++ b/docs/examples/stealth_mode_quick_start.py @@ -0,0 +1,215 @@ +""" +Quick Start: Using Stealth Mode in Crawl4AI + +This example shows practical use cases for the stealth mode feature. +Stealth mode helps bypass basic bot detection mechanisms. 
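+
+All five examples below build on the same one-flag change to BrowserConfig.
+At a glance (a minimal sketch; the URL is just a placeholder):
+
+    import asyncio
+    from crawl4ai import AsyncWebCrawler, BrowserConfig
+
+    async def at_a_glance():
+        config = BrowserConfig(enable_stealth=True, headless=True)
+        async with AsyncWebCrawler(config=config) as crawler:
+            result = await crawler.arun(url="https://example.com")
+            print(result.success)
+
+    asyncio.run(at_a_glance())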
+""" + +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + + +async def example_1_basic_stealth(): + """Example 1: Basic stealth mode usage""" + print("\n=== Example 1: Basic Stealth Mode ===") + + # Enable stealth mode in browser config + browser_config = BrowserConfig( + enable_stealth=True, # This is the key parameter + headless=True + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com") + print(f"✓ Crawled {result.url} successfully") + print(f"✓ Title: {result.metadata.get('title', 'N/A')}") + + +async def example_2_stealth_with_screenshot(): + """Example 2: Stealth mode with screenshot to show detection results""" + print("\n=== Example 2: Stealth Mode Visual Verification ===") + + browser_config = BrowserConfig( + enable_stealth=True, + headless=False # Set to False to see the browser + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + config = CrawlerRunConfig( + screenshot=True, + wait_until="networkidle" + ) + + result = await crawler.arun( + url="https://bot.sannysoft.com", + config=config + ) + + if result.success: + print(f"✓ Successfully crawled bot detection site") + print(f"✓ With stealth enabled, many detection tests should show as passed") + + if result.screenshot: + # Save screenshot for verification + import base64 + with open("stealth_detection_results.png", "wb") as f: + f.write(base64.b64decode(result.screenshot)) + print(f"✓ Screenshot saved as 'stealth_detection_results.png'") + print(f" Check the screenshot to see detection results!") + + +async def example_3_stealth_for_protected_sites(): + """Example 3: Using stealth for sites with bot protection""" + print("\n=== Example 3: Stealth for Protected Sites ===") + + browser_config = BrowserConfig( + enable_stealth=True, + headless=True, + viewport_width=1920, + viewport_height=1080 + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + # Add human-like behavior + config = CrawlerRunConfig( + wait_until="networkidle", + delay_before_return_html=2.0, # Wait 2 seconds + js_code=""" + // Simulate human-like scrolling + window.scrollTo({ + top: document.body.scrollHeight / 2, + behavior: 'smooth' + }); + """ + ) + + # Try accessing a site that might have bot protection + result = await crawler.arun( + url="https://www.g2.com/products/slack/reviews", + config=config + ) + + if result.success: + print(f"✓ Successfully accessed protected site") + print(f"✓ Retrieved {len(result.html)} characters of HTML") + else: + print(f"✗ Failed to access site: {result.error_message}") + + +async def example_4_stealth_with_sessions(): + """Example 4: Stealth mode with session management""" + print("\n=== Example 4: Stealth + Session Management ===") + + browser_config = BrowserConfig( + enable_stealth=True, + headless=False + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + session_id = "my_stealth_session" + + # First request - establish session + config = CrawlerRunConfig( + session_id=session_id, + wait_until="domcontentloaded" + ) + + result1 = await crawler.arun( + url="https://news.ycombinator.com", + config=config + ) + print(f"✓ First request completed: {result1.url}") + + # Second request - reuse session + await asyncio.sleep(2) # Brief delay between requests + + result2 = await crawler.arun( + url="https://news.ycombinator.com/best", + config=config + ) + print(f"✓ Second request completed: {result2.url}") + print(f"✓ Session reused, maintaining cookies and state") 
+ + +async def example_5_stealth_comparison(): + """Example 5: Compare results with and without stealth using screenshots""" + print("\n=== Example 5: Stealth Mode Comparison ===") + + test_url = "https://bot.sannysoft.com" + + # First test WITHOUT stealth + print("\nWithout stealth:") + regular_config = BrowserConfig( + enable_stealth=False, + headless=True + ) + + async with AsyncWebCrawler(config=regular_config) as crawler: + config = CrawlerRunConfig( + screenshot=True, + wait_until="networkidle" + ) + result = await crawler.arun(url=test_url, config=config) + + if result.success and result.screenshot: + import base64 + with open("comparison_without_stealth.png", "wb") as f: + f.write(base64.b64decode(result.screenshot)) + print(f" ✓ Screenshot saved: comparison_without_stealth.png") + print(f" Many tests will show as FAILED (red)") + + # Then test WITH stealth + print("\nWith stealth:") + stealth_config = BrowserConfig( + enable_stealth=True, + headless=True + ) + + async with AsyncWebCrawler(config=stealth_config) as crawler: + config = CrawlerRunConfig( + screenshot=True, + wait_until="networkidle" + ) + result = await crawler.arun(url=test_url, config=config) + + if result.success and result.screenshot: + import base64 + with open("comparison_with_stealth.png", "wb") as f: + f.write(base64.b64decode(result.screenshot)) + print(f" ✓ Screenshot saved: comparison_with_stealth.png") + print(f" More tests should show as PASSED (green)") + + print("\nCompare the two screenshots to see the difference!") + + +async def main(): + """Run all examples""" + print("Crawl4AI Stealth Mode Examples") + print("==============================") + + # Run basic example + await example_1_basic_stealth() + + # Run screenshot verification example + await example_2_stealth_with_screenshot() + + # Run protected site example + await example_3_stealth_for_protected_sites() + + # Run session example + await example_4_stealth_with_sessions() + + # Run comparison example + await example_5_stealth_comparison() + + print("\n" + "="*50) + print("Tips for using stealth mode effectively:") + print("- Use realistic viewport sizes (1920x1080, 1366x768)") + print("- Add delays between requests to appear more human") + print("- Combine with session management for better results") + print("- Remember: stealth mode is for legitimate scraping only") + print("="*50) + + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/examples/stealth_test_simple.py b/docs/examples/stealth_test_simple.py new file mode 100644 index 00000000..8bf9c2d9 --- /dev/null +++ b/docs/examples/stealth_test_simple.py @@ -0,0 +1,62 @@ +""" +Simple test to verify stealth mode is working +""" + +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + + +async def test_stealth(): + """Test stealth mode effectiveness""" + + # Test WITHOUT stealth + print("=== WITHOUT Stealth ===") + config1 = BrowserConfig( + headless=False, + enable_stealth=False + ) + + async with AsyncWebCrawler(config=config1) as crawler: + result = await crawler.arun( + url="https://bot.sannysoft.com", + config=CrawlerRunConfig( + wait_until="networkidle", + screenshot=True + ) + ) + print(f"Success: {result.success}") + # Take screenshot + if result.screenshot: + with open("without_stealth.png", "wb") as f: + import base64 + f.write(base64.b64decode(result.screenshot)) + print("Screenshot saved: without_stealth.png") + + # Test WITH stealth + print("\n=== WITH Stealth ===") + config2 = BrowserConfig( + 
headless=False, + enable_stealth=True + ) + + async with AsyncWebCrawler(config=config2) as crawler: + result = await crawler.arun( + url="https://bot.sannysoft.com", + config=CrawlerRunConfig( + wait_until="networkidle", + screenshot=True + ) + ) + print(f"Success: {result.success}") + # Take screenshot + if result.screenshot: + with open("with_stealth.png", "wb") as f: + import base64 + f.write(base64.b64decode(result.screenshot)) + print("Screenshot saved: with_stealth.png") + + print("\nCheck the screenshots to see the difference in bot detection results!") + + +if __name__ == "__main__": + asyncio.run(test_stealth()) \ No newline at end of file diff --git a/docs/examples/undetectability/undetected_basic_test.py b/docs/examples/undetectability/undetected_basic_test.py new file mode 100644 index 00000000..f28231f0 --- /dev/null +++ b/docs/examples/undetectability/undetected_basic_test.py @@ -0,0 +1,74 @@ +""" +Basic Undetected Browser Test +Simple example to test if undetected mode works +""" + +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig + +async def test_regular_mode(): + """Test with regular browser""" + print("Testing Regular Browser Mode...") + browser_config = BrowserConfig( + headless=False, + verbose=True + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://www.example.com") + print(f"Regular Mode - Success: {result.success}") + print(f"Regular Mode - Status: {result.status_code}") + print(f"Regular Mode - Content length: {len(result.markdown.raw_markdown)}") + print(f"Regular Mode - First 100 chars: {result.markdown.raw_markdown[:100]}...") + return result.success + +async def test_undetected_mode(): + """Test with undetected browser""" + print("\nTesting Undetected Browser Mode...") + from crawl4ai import UndetectedAdapter + from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy + + browser_config = BrowserConfig( + headless=False, + verbose=True + ) + + # Create undetected adapter + undetected_adapter = UndetectedAdapter() + + # Create strategy with undetected adapter + crawler_strategy = AsyncPlaywrightCrawlerStrategy( + browser_config=browser_config, + browser_adapter=undetected_adapter + ) + + async with AsyncWebCrawler( + crawler_strategy=crawler_strategy, + config=browser_config + ) as crawler: + result = await crawler.arun(url="https://www.example.com") + print(f"Undetected Mode - Success: {result.success}") + print(f"Undetected Mode - Status: {result.status_code}") + print(f"Undetected Mode - Content length: {len(result.markdown.raw_markdown)}") + print(f"Undetected Mode - First 100 chars: {result.markdown.raw_markdown[:100]}...") + return result.success + +async def main(): + """Run both tests""" + print("🤖 Crawl4AI Basic Adapter Test\n") + + # Test regular mode + regular_success = await test_regular_mode() + + # Test undetected mode + undetected_success = await test_undetected_mode() + + # Summary + print("\n" + "="*50) + print("Summary:") + print(f"Regular Mode: {'✅ Success' if regular_success else '❌ Failed'}") + print(f"Undetected Mode: {'✅ Success' if undetected_success else '❌ Failed'}") + print("="*50) + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/examples/undetectability/undetected_bot_test.py b/docs/examples/undetectability/undetected_bot_test.py new file mode 100644 index 00000000..ba9a6ec7 --- /dev/null +++ b/docs/examples/undetectability/undetected_bot_test.py @@ -0,0 +1,155 @@ +""" +Bot Detection Test 
- Compare Regular vs Undetected +Tests browser fingerprinting differences at bot.sannysoft.com +""" + +import asyncio +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + UndetectedAdapter, + CrawlResult +) +from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy + +# Bot detection test site +TEST_URL = "https://bot.sannysoft.com" + +def analyze_bot_detection(result: CrawlResult) -> dict: + """Analyze bot detection results from the page""" + detections = { + "webdriver": False, + "headless": False, + "automation": False, + "user_agent": False, + "total_tests": 0, + "failed_tests": 0 + } + + if not result.success or not result.html: + return detections + + # Look for specific test results in the HTML + html_lower = result.html.lower() + + # Check for common bot indicators + if "webdriver" in html_lower and ("fail" in html_lower or "true" in html_lower): + detections["webdriver"] = True + detections["failed_tests"] += 1 + + if "headless" in html_lower and ("fail" in html_lower or "true" in html_lower): + detections["headless"] = True + detections["failed_tests"] += 1 + + if "automation" in html_lower and "detected" in html_lower: + detections["automation"] = True + detections["failed_tests"] += 1 + + # Count total tests (approximate) + detections["total_tests"] = html_lower.count("test") + html_lower.count("check") + + return detections + +async def test_browser_mode(adapter_name: str, adapter=None): + """Test a browser mode and return results""" + print(f"\n{'='*60}") + print(f"Testing: {adapter_name}") + print(f"{'='*60}") + + browser_config = BrowserConfig( + headless=False, # Run in headed mode for better results + verbose=True, + viewport_width=1920, + viewport_height=1080, + ) + + if adapter: + # Use undetected mode + crawler_strategy = AsyncPlaywrightCrawlerStrategy( + browser_config=browser_config, + browser_adapter=adapter + ) + crawler = AsyncWebCrawler( + crawler_strategy=crawler_strategy, + config=browser_config + ) + else: + # Use regular mode + crawler = AsyncWebCrawler(config=browser_config) + + async with crawler: + config = CrawlerRunConfig( + delay_before_return_html=3.0, # Let detection scripts run + wait_for_images=True, + screenshot=True, + simulate_user=False, # Don't simulate for accurate detection + ) + + result = await crawler.arun(url=TEST_URL, config=config) + + print(f"\n✓ Success: {result.success}") + print(f"✓ Status Code: {result.status_code}") + + if result.success: + # Analyze detection results + detections = analyze_bot_detection(result) + + print(f"\n🔍 Bot Detection Analysis:") + print(f" - WebDriver Detected: {'❌ Yes' if detections['webdriver'] else '✅ No'}") + print(f" - Headless Detected: {'❌ Yes' if detections['headless'] else '✅ No'}") + print(f" - Automation Detected: {'❌ Yes' if detections['automation'] else '✅ No'}") + print(f" - Failed Tests: {detections['failed_tests']}") + + # Show some content + if result.markdown.raw_markdown: + print(f"\nContent preview:") + lines = result.markdown.raw_markdown.split('\n') + for line in lines[:20]: # Show first 20 lines + if any(keyword in line.lower() for keyword in ['test', 'pass', 'fail', 'yes', 'no']): + print(f" {line.strip()}") + + return result, detections if result.success else {} + +async def main(): + """Run the comparison""" + print("🤖 Crawl4AI - Bot Detection Test") + print(f"Testing at: {TEST_URL}") + print("This site runs various browser fingerprinting tests\n") + + # Test regular browser + regular_result, regular_detections = await 
test_browser_mode("Regular Browser") + + # Small delay + await asyncio.sleep(2) + + # Test undetected browser + undetected_adapter = UndetectedAdapter() + undetected_result, undetected_detections = await test_browser_mode( + "Undetected Browser", + undetected_adapter + ) + + # Summary comparison + print(f"\n{'='*60}") + print("COMPARISON SUMMARY") + print(f"{'='*60}") + + print(f"\n{'Test':<25} {'Regular':<15} {'Undetected':<15}") + print(f"{'-'*55}") + + if regular_detections and undetected_detections: + print(f"{'WebDriver Detection':<25} {'❌ Detected' if regular_detections['webdriver'] else '✅ Passed':<15} {'❌ Detected' if undetected_detections['webdriver'] else '✅ Passed':<15}") + print(f"{'Headless Detection':<25} {'❌ Detected' if regular_detections['headless'] else '✅ Passed':<15} {'❌ Detected' if undetected_detections['headless'] else '✅ Passed':<15}") + print(f"{'Automation Detection':<25} {'❌ Detected' if regular_detections['automation'] else '✅ Passed':<15} {'❌ Detected' if undetected_detections['automation'] else '✅ Passed':<15}") + print(f"{'Failed Tests':<25} {regular_detections['failed_tests']:<15} {undetected_detections['failed_tests']:<15}") + + print(f"\n{'='*60}") + + if undetected_detections.get('failed_tests', 0) < regular_detections.get('failed_tests', 1): + print("✅ Undetected browser performed better at evading detection!") + else: + print("ℹ️ Both browsers had similar detection results") + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/examples/undetectability/undetected_cloudflare_test.py b/docs/examples/undetectability/undetected_cloudflare_test.py new file mode 100644 index 00000000..2fc2ce09 --- /dev/null +++ b/docs/examples/undetectability/undetected_cloudflare_test.py @@ -0,0 +1,164 @@ +""" +Undetected Browser Test - Cloudflare Protected Site +Tests the difference between regular and undetected modes on a Cloudflare-protected site +""" + +import asyncio +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + UndetectedAdapter +) +from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy + +# Test URL with Cloudflare protection +TEST_URL = "https://nowsecure.nl" + +async def test_regular_browser(): + """Test with regular browser - likely to be blocked""" + print("=" * 60) + print("Testing with Regular Browser") + print("=" * 60) + + browser_config = BrowserConfig( + headless=False, + verbose=True, + viewport_width=1920, + viewport_height=1080, + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + config = CrawlerRunConfig( + delay_before_return_html=2.0, + simulate_user=True, + magic=True, # Try with magic mode too + ) + + result = await crawler.arun(url=TEST_URL, config=config) + + print(f"\n✓ Success: {result.success}") + print(f"✓ Status Code: {result.status_code}") + print(f"✓ HTML Length: {len(result.html)}") + + # Check for Cloudflare challenge + if result.html: + cf_indicators = [ + "Checking your browser", + "Please stand by", + "cloudflare", + "cf-browser-verification", + "Access denied", + "Ray ID" + ] + + detected = False + for indicator in cf_indicators: + if indicator.lower() in result.html.lower(): + print(f"⚠️ Cloudflare Challenge Detected: '{indicator}' found") + detected = True + break + + if not detected and len(result.markdown.raw_markdown) > 100: + print("✅ Successfully bypassed Cloudflare!") + print(f"Content preview: {result.markdown.raw_markdown[:200]}...") + elif not detected: + print("⚠️ Page loaded but content seems minimal") + 
+ return result + +async def test_undetected_browser(): + """Test with undetected browser - should bypass Cloudflare""" + print("\n" + "=" * 60) + print("Testing with Undetected Browser") + print("=" * 60) + + browser_config = BrowserConfig( + headless=False, # Headless is easier to detect + verbose=True, + viewport_width=1920, + viewport_height=1080, + ) + + # Create undetected adapter + undetected_adapter = UndetectedAdapter() + + # Create strategy with undetected adapter + crawler_strategy = AsyncPlaywrightCrawlerStrategy( + browser_config=browser_config, + browser_adapter=undetected_adapter + ) + + async with AsyncWebCrawler( + crawler_strategy=crawler_strategy, + config=browser_config + ) as crawler: + config = CrawlerRunConfig( + delay_before_return_html=2.0, + simulate_user=True, + ) + + result = await crawler.arun(url=TEST_URL, config=config) + + print(f"\n✓ Success: {result.success}") + print(f"✓ Status Code: {result.status_code}") + print(f"✓ HTML Length: {len(result.html)}") + + # Check for Cloudflare challenge + if result.html: + cf_indicators = [ + "Checking your browser", + "Please stand by", + "cloudflare", + "cf-browser-verification", + "Access denied", + "Ray ID" + ] + + detected = False + for indicator in cf_indicators: + if indicator.lower() in result.html.lower(): + print(f"⚠️ Cloudflare Challenge Detected: '{indicator}' found") + detected = True + break + + if not detected and len(result.markdown.raw_markdown) > 100: + print("✅ Successfully bypassed Cloudflare!") + print(f"Content preview: {result.markdown.raw_markdown[:200]}...") + elif not detected: + print("⚠️ Page loaded but content seems minimal") + + return result + +async def main(): + """Compare regular vs undetected browser""" + print("🤖 Crawl4AI - Cloudflare Bypass Test") + print(f"Testing URL: {TEST_URL}\n") + + # Test regular browser + regular_result = await test_regular_browser() + + # Small delay + await asyncio.sleep(2) + + # Test undetected browser + undetected_result = await test_undetected_browser() + + # Summary + print("\n" + "=" * 60) + print("SUMMARY") + print("=" * 60) + print(f"Regular Browser:") + print(f" - Success: {regular_result.success}") + print(f" - Content Length: {len(regular_result.markdown.raw_markdown) if regular_result.markdown else 0}") + + print(f"\nUndetected Browser:") + print(f" - Success: {undetected_result.success}") + print(f" - Content Length: {len(undetected_result.markdown.raw_markdown) if undetected_result.markdown else 0}") + + if undetected_result.success and len(undetected_result.markdown.raw_markdown) > len(regular_result.markdown.raw_markdown): + print("\n✅ Undetected browser successfully bypassed protection!") + print("=" * 60) + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/examples/undetectability/undetected_vs_regular_comparison.py b/docs/examples/undetectability/undetected_vs_regular_comparison.py new file mode 100644 index 00000000..80972c0f --- /dev/null +++ b/docs/examples/undetectability/undetected_vs_regular_comparison.py @@ -0,0 +1,184 @@ +""" +Undetected vs Regular Browser Comparison +This example demonstrates the difference between regular and undetected browser modes +when accessing sites with bot detection services. 
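+
+Switching between the two modes is just a matter of which adapter is handed to the
+crawler strategy. A rough sketch of the pattern used throughout this file (the
+browser_config is assumed to be a BrowserConfig built as in the functions below):
+
+    from crawl4ai import PlaywrightAdapter, UndetectedAdapter
+    from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy
+
+    adapter = UndetectedAdapter()  # or PlaywrightAdapter() for the default behavior
+    strategy = AsyncPlaywrightCrawlerStrategy(
+        browser_config=browser_config,
+        browser_adapter=adapter,
+    )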
+ +Based on tested anti-bot services: +- Cloudflare +- Kasada +- Akamai +- DataDome +- Bet365 +- And others +""" + +import asyncio +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + PlaywrightAdapter, + UndetectedAdapter, + CrawlResult +) +from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy + + +# Test URLs for various bot detection services +TEST_SITES = { + "Cloudflare Protected": "https://nowsecure.nl", + # "Bot Detection Test": "https://bot.sannysoft.com", + # "Fingerprint Test": "https://fingerprint.com/products/bot-detection", + # "Browser Scan": "https://browserscan.net", + # "CreepJS": "https://abrahamjuliot.github.io/creepjs", +} + + +async def test_with_adapter(url: str, adapter_name: str, adapter): + """Test a URL with a specific adapter""" + browser_config = BrowserConfig( + headless=False, # Better for avoiding detection + viewport_width=1920, + viewport_height=1080, + verbose=True, + ) + + # Create the crawler strategy with the adapter + crawler_strategy = AsyncPlaywrightCrawlerStrategy( + browser_config=browser_config, + browser_adapter=adapter + ) + + print(f"\n{'='*60}") + print(f"Testing with {adapter_name} adapter") + print(f"URL: {url}") + print(f"{'='*60}") + + try: + async with AsyncWebCrawler( + crawler_strategy=crawler_strategy, + config=browser_config + ) as crawler: + crawler_config = CrawlerRunConfig( + delay_before_return_html=3.0, # Give page time to load + wait_for_images=True, + screenshot=True, + simulate_user=True, # Add user simulation + ) + + result: CrawlResult = await crawler.arun( + url=url, + config=crawler_config + ) + + # Check results + print(f"✓ Status Code: {result.status_code}") + print(f"✓ Success: {result.success}") + print(f"✓ HTML Length: {len(result.html)}") + print(f"✓ Markdown Length: {len(result.markdown.raw_markdown)}") + + # Check for common bot detection indicators + detection_indicators = [ + "Access denied", + "Please verify you are human", + "Checking your browser", + "Enable JavaScript", + "captcha", + "403 Forbidden", + "Bot detection", + "Security check" + ] + + content_lower = result.markdown.raw_markdown.lower() + detected = False + for indicator in detection_indicators: + if indicator.lower() in content_lower: + print(f"⚠️ Possible detection: Found '{indicator}'") + detected = True + break + + if not detected: + print("✅ No obvious bot detection triggered!") + # Show first 200 chars of content + print(f"Content preview: {result.markdown.raw_markdown[:200]}...") + + return result.success and not detected + + except Exception as e: + print(f"❌ Error: {str(e)}") + return False + + +async def compare_adapters(url: str, site_name: str): + """Compare regular and undetected adapters on the same URL""" + print(f"\n{'#'*60}") + print(f"# Testing: {site_name}") + print(f"{'#'*60}") + + # Test with regular adapter + regular_adapter = PlaywrightAdapter() + regular_success = await test_with_adapter(url, "Regular", regular_adapter) + + # Small delay between tests + await asyncio.sleep(2) + + # Test with undetected adapter + undetected_adapter = UndetectedAdapter() + undetected_success = await test_with_adapter(url, "Undetected", undetected_adapter) + + # Summary + print(f"\n{'='*60}") + print(f"Summary for {site_name}:") + print(f"Regular Adapter: {'✅ Passed' if regular_success else '❌ Blocked/Detected'}") + print(f"Undetected Adapter: {'✅ Passed' if undetected_success else '❌ Blocked/Detected'}") + print(f"{'='*60}") + + return regular_success, undetected_success + + +async def main(): 
+ """Run comparison tests on multiple sites""" + print("🤖 Crawl4AI Browser Adapter Comparison") + print("Testing regular vs undetected browser modes\n") + + results = {} + + # Test each site + for site_name, url in TEST_SITES.items(): + regular, undetected = await compare_adapters(url, site_name) + results[site_name] = { + "regular": regular, + "undetected": undetected + } + + # Delay between different sites + await asyncio.sleep(3) + + # Final summary + print(f"\n{'#'*60}") + print("# FINAL RESULTS") + print(f"{'#'*60}") + print(f"{'Site':<30} {'Regular':<15} {'Undetected':<15}") + print(f"{'-'*60}") + + for site, result in results.items(): + regular_status = "✅ Passed" if result["regular"] else "❌ Blocked" + undetected_status = "✅ Passed" if result["undetected"] else "❌ Blocked" + print(f"{site:<30} {regular_status:<15} {undetected_status:<15}") + + # Calculate success rates + regular_success = sum(1 for r in results.values() if r["regular"]) + undetected_success = sum(1 for r in results.values() if r["undetected"]) + total = len(results) + + print(f"\n{'='*60}") + print(f"Success Rates:") + print(f"Regular Adapter: {regular_success}/{total} ({regular_success/total*100:.1f}%)") + print(f"Undetected Adapter: {undetected_success}/{total} ({undetected_success/total*100:.1f}%)") + print(f"{'='*60}") + + +if __name__ == "__main__": + # Note: This example may take a while to run as it tests multiple sites + # You can comment out sites in TEST_SITES to run faster tests + asyncio.run(main()) \ No newline at end of file diff --git a/docs/examples/undetected_simple_demo.py b/docs/examples/undetected_simple_demo.py new file mode 100644 index 00000000..93954c9f --- /dev/null +++ b/docs/examples/undetected_simple_demo.py @@ -0,0 +1,118 @@ +""" +Simple Undetected Browser Demo +Demonstrates the basic usage of undetected browser mode +""" + +import asyncio +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + UndetectedAdapter +) +from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy + +async def crawl_with_regular_browser(url: str): + """Crawl with regular browser""" + print("\n[Regular Browser Mode]") + browser_config = BrowserConfig( + headless=False, + verbose=True, + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url=url, + config=CrawlerRunConfig( + delay_before_return_html=2.0 + ) + ) + + print(f"Success: {result.success}") + print(f"Status: {result.status_code}") + print(f"Content length: {len(result.markdown.raw_markdown)}") + + # Check for bot detection keywords + content = result.markdown.raw_markdown.lower() + if any(word in content for word in ["cloudflare", "checking your browser", "please wait"]): + print("⚠️ Bot detection triggered!") + else: + print("✅ Page loaded successfully") + + return result + +async def crawl_with_undetected_browser(url: str): + """Crawl with undetected browser""" + print("\n[Undetected Browser Mode]") + browser_config = BrowserConfig( + headless=False, + verbose=True, + ) + + # Create undetected adapter and strategy + undetected_adapter = UndetectedAdapter() + crawler_strategy = AsyncPlaywrightCrawlerStrategy( + browser_config=browser_config, + browser_adapter=undetected_adapter + ) + + async with AsyncWebCrawler( + crawler_strategy=crawler_strategy, + config=browser_config + ) as crawler: + result = await crawler.arun( + url=url, + config=CrawlerRunConfig( + delay_before_return_html=2.0 + ) + ) + + print(f"Success: {result.success}") + print(f"Status: 
{result.status_code}") + print(f"Content length: {len(result.markdown.raw_markdown)}") + + # Check for bot detection keywords + content = result.markdown.raw_markdown.lower() + if any(word in content for word in ["cloudflare", "checking your browser", "please wait"]): + print("⚠️ Bot detection triggered!") + else: + print("✅ Page loaded successfully") + + return result + +async def main(): + """Demo comparing regular vs undetected modes""" + print("🤖 Crawl4AI Undetected Browser Demo") + print("="*50) + + # Test URLs - you can change these + test_urls = [ + "https://www.example.com", # Simple site + "https://httpbin.org/headers", # Shows request headers + ] + + for url in test_urls: + print(f"\n📍 Testing URL: {url}") + + # Test with regular browser + regular_result = await crawl_with_regular_browser(url) + + # Small delay + await asyncio.sleep(2) + + # Test with undetected browser + undetected_result = await crawl_with_undetected_browser(url) + + # Compare results + print(f"\n📊 Comparison for {url}:") + print(f"Regular browser content: {len(regular_result.markdown.raw_markdown)} chars") + print(f"Undetected browser content: {len(undetected_result.markdown.raw_markdown)} chars") + + if url == "https://httpbin.org/headers": + # Show headers for comparison + print("\nHeaders seen by server:") + print("Regular:", regular_result.markdown.raw_markdown[:500]) + print("\nUndetected:", undetected_result.markdown.raw_markdown[:500]) + +if __name__ == "__main__": + asyncio.run(main()) \ No newline at end of file diff --git a/docs/md_v2/advanced/advanced-features.md b/docs/md_v2/advanced/advanced-features.md index 3563fd40..211869c3 100644 --- a/docs/md_v2/advanced/advanced-features.md +++ b/docs/md_v2/advanced/advanced-features.md @@ -358,9 +358,77 @@ if __name__ == "__main__": --- +--- + +## 7. Anti-Bot Features (Stealth Mode & Undetected Browser) + +Crawl4AI provides two powerful features to bypass bot detection: + +### 7.1 Stealth Mode + +Stealth mode uses playwright-stealth to modify browser fingerprints and behaviors. Enable it with a simple flag: + +```python +browser_config = BrowserConfig( + enable_stealth=True, # Activates stealth mode + headless=False +) +``` + +**When to use**: Sites with basic bot detection (checking navigator.webdriver, plugins, etc.) + +### 7.2 Undetected Browser + +For advanced bot detection, use the undetected browser adapter: + +```python +from crawl4ai import UndetectedAdapter +from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy + +# Create undetected adapter +adapter = UndetectedAdapter() +strategy = AsyncPlaywrightCrawlerStrategy( + browser_config=browser_config, + browser_adapter=adapter +) + +async with AsyncWebCrawler(crawler_strategy=strategy, config=browser_config) as crawler: + # Your crawling code +``` + +**When to use**: Sites with sophisticated bot detection (Cloudflare, DataDome, etc.) + +### 7.3 Combining Both + +For maximum evasion, combine stealth mode with undetected browser: + +```python +browser_config = BrowserConfig( + enable_stealth=True, # Enable stealth + headless=False +) + +adapter = UndetectedAdapter() # Use undetected browser +``` + +### Choosing the Right Approach + +| Detection Level | Recommended Approach | +|----------------|---------------------| +| No protection | Regular browser | +| Basic checks | Regular + Stealth mode | +| Advanced protection | Undetected browser | +| Maximum evasion | Undetected + Stealth mode | + +**Best Practice**: Start with regular browser + stealth mode. 
Only use undetected browser if needed, as it may be slightly slower. + +See [Undetected Browser Mode](undetected-browser.md) for detailed examples. + +--- + ## Conclusion & Next Steps -You’ve now explored several **advanced** features: +You've now explored several **advanced** features: - **Proxy Usage** - **PDF & Screenshot** capturing for large or critical pages @@ -368,7 +436,10 @@ You’ve now explored several **advanced** features: - **Custom Headers** for language or specialized requests - **Session Persistence** via storage state - **Robots.txt Compliance** +- **Anti-Bot Features** (Stealth Mode & Undetected Browser) -With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, and manage sessions across multiple runs—streamlining your entire data collection pipeline. +With these power tools, you can build robust scraping workflows that mimic real user behavior, handle secure sites, capture detailed snapshots, manage sessions across multiple runs, and bypass bot detection—streamlining your entire data collection pipeline. -**Last Updated**: 2025-01-01 \ No newline at end of file +**Note**: In future versions, we may enable stealth mode and undetected browser by default. For now, users should explicitly enable these features when needed. + +**Last Updated**: 2025-01-17 \ No newline at end of file diff --git a/docs/md_v2/advanced/undetected-browser.md b/docs/md_v2/advanced/undetected-browser.md new file mode 100644 index 00000000..310701e6 --- /dev/null +++ b/docs/md_v2/advanced/undetected-browser.md @@ -0,0 +1,394 @@ +# Undetected Browser Mode + +## Overview + +Crawl4AI offers two powerful anti-bot features to help you access websites with bot detection: + +1. **Stealth Mode** - Uses playwright-stealth to modify browser fingerprints and behaviors +2. **Undetected Browser Mode** - Advanced browser adapter with deep-level patches for sophisticated bot detection + +This guide covers both features and helps you choose the right approach for your needs. + +## Anti-Bot Features Comparison + +| Feature | Regular Browser | Stealth Mode | Undetected Browser | +|---------|----------------|--------------|-------------------| +| WebDriver Detection | ❌ | ✅ | ✅ | +| Navigator Properties | ❌ | ✅ | ✅ | +| Plugin Emulation | ❌ | ✅ | ✅ | +| CDP Detection | ❌ | Partial | ✅ | +| Deep Browser Patches | ❌ | ❌ | ✅ | +| Performance Impact | None | Minimal | Moderate | +| Setup Complexity | None | None | Minimal | + +## When to Use Each Approach + +### Use Regular Browser + Stealth Mode When: +- Sites have basic bot detection (checking navigator.webdriver, plugins, etc.) +- You need good performance with basic protection +- Sites check for common automation indicators + +### Use Undetected Browser When: +- Sites employ sophisticated bot detection services (Cloudflare, DataDome, etc.) +- Stealth mode alone isn't sufficient +- You're willing to trade some performance for better evasion + +### Best Practice: Progressive Enhancement +1. **Start with**: Regular browser + Stealth mode +2. **If blocked**: Switch to Undetected browser +3. 
**If still blocked**: Combine Undetected browser + Stealth mode + +## Stealth Mode + +Stealth mode is the simpler anti-bot solution that works with both regular and undetected browsers: + +```python +from crawl4ai import AsyncWebCrawler, BrowserConfig + +# Enable stealth mode with regular browser +browser_config = BrowserConfig( + enable_stealth=True, # Simple flag to enable + headless=False # Better for avoiding detection +) + +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://example.com") +``` + +### What Stealth Mode Does: +- Removes `navigator.webdriver` flag +- Modifies browser fingerprints +- Emulates realistic plugin behavior +- Adjusts navigator properties +- Fixes common automation leaks + +## Undetected Browser Mode + +For sites with sophisticated bot detection that stealth mode can't bypass, use the undetected browser adapter: + +### Key Features + +- **Drop-in Replacement**: Uses the same API as regular browser mode +- **Enhanced Stealth**: Built-in patches to evade common detection methods +- **Browser Adapter Pattern**: Seamlessly switch between regular and undetected modes +- **Automatic Installation**: `crawl4ai-setup` installs all necessary browser dependencies + +### Quick Start + +```python +import asyncio +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + UndetectedAdapter +) +from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy + +async def main(): + # Create the undetected adapter + undetected_adapter = UndetectedAdapter() + + # Create browser config + browser_config = BrowserConfig( + headless=False, # Headless mode can be detected easier + verbose=True, + ) + + # Create the crawler strategy with undetected adapter + crawler_strategy = AsyncPlaywrightCrawlerStrategy( + browser_config=browser_config, + browser_adapter=undetected_adapter + ) + + # Create the crawler with our custom strategy + async with AsyncWebCrawler( + crawler_strategy=crawler_strategy, + config=browser_config + ) as crawler: + # Your crawling code here + result = await crawler.arun( + url="https://example.com", + config=CrawlerRunConfig() + ) + print(result.markdown[:500]) + +asyncio.run(main()) +``` + +## Combining Both Features + +For maximum evasion, combine stealth mode with undetected browser: + +```python +from crawl4ai import AsyncWebCrawler, BrowserConfig, UndetectedAdapter +from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy + +# Create browser config with stealth enabled +browser_config = BrowserConfig( + enable_stealth=True, # Enable stealth mode + headless=False +) + +# Create undetected adapter +adapter = UndetectedAdapter() + +# Create strategy with both features +strategy = AsyncPlaywrightCrawlerStrategy( + browser_config=browser_config, + browser_adapter=adapter +) + +async with AsyncWebCrawler( + crawler_strategy=strategy, + config=browser_config +) as crawler: + result = await crawler.arun("https://protected-site.com") +``` + +## Examples + +### Example 1: Basic Stealth Mode + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def test_stealth_mode(): + # Simple stealth mode configuration + browser_config = BrowserConfig( + enable_stealth=True, + headless=False + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://bot.sannysoft.com", + config=CrawlerRunConfig(screenshot=True) + ) + + if result.success: + print("✓ Successfully accessed bot 
detection test site") + # Save screenshot to verify detection results + if result.screenshot: + import base64 + with open("stealth_test.png", "wb") as f: + f.write(base64.b64decode(result.screenshot)) + print("✓ Screenshot saved - check for green (passed) tests") + +asyncio.run(test_stealth_mode()) +``` + +### Example 2: Undetected Browser Mode + +```python +import asyncio +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + UndetectedAdapter +) +from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy + + +async def main(): + # Create browser config + browser_config = BrowserConfig( + headless=False, + verbose=True, + ) + + # Create the undetected adapter + undetected_adapter = UndetectedAdapter() + + # Create the crawler strategy with the undetected adapter + crawler_strategy = AsyncPlaywrightCrawlerStrategy( + browser_config=browser_config, + browser_adapter=undetected_adapter + ) + + # Create the crawler with our custom strategy + async with AsyncWebCrawler( + crawler_strategy=crawler_strategy, + config=browser_config + ) as crawler: + # Configure the crawl + crawler_config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter() + ), + capture_console_messages=True, # Test adapter console capture + ) + + # Test on a site that typically detects bots + print("Testing undetected adapter...") + result: CrawlResult = await crawler.arun( + url="https://www.helloworld.org", + config=crawler_config + ) + + print(f"Status: {result.status_code}") + print(f"Success: {result.success}") + print(f"Console messages captured: {len(result.console_messages or [])}") + print(f"Markdown content (first 500 chars):\n{result.markdown.raw_markdown[:500]}") + + +if __name__ == "__main__": + asyncio.run(main()) +``` + +## Browser Adapter Pattern + +The undetected browser support is implemented using an adapter pattern, allowing seamless switching between different browser implementations: + +```python +# Regular browser adapter (default) +from crawl4ai import PlaywrightAdapter +regular_adapter = PlaywrightAdapter() + +# Undetected browser adapter +from crawl4ai import UndetectedAdapter +undetected_adapter = UndetectedAdapter() +``` + +The adapter handles: +- JavaScript execution +- Console message capture +- Error handling +- Browser-specific optimizations + +## Best Practices + +1. **Avoid Headless Mode**: Detection is easier in headless mode + ```python + browser_config = BrowserConfig(headless=False) + ``` + +2. **Use Reasonable Delays**: Don't rush through pages + ```python + crawler_config = CrawlerRunConfig( + wait_time=3.0, # Wait 3 seconds after page load + delay_before_return_html=2.0 # Additional delay + ) + ``` + +3. **Rotate User Agents**: You can customize user agents + ```python + browser_config = BrowserConfig( + headers={"User-Agent": "your-user-agent"} + ) + ``` + +4. 
**Handle Failures Gracefully**: Some sites may still detect and block + ```python + if not result.success: + print(f"Crawl failed: {result.error_message}") + ``` + +## Advanced Usage Tips + +### Progressive Detection Handling + +```python +async def crawl_with_progressive_evasion(url): + # Step 1: Try regular browser with stealth + browser_config = BrowserConfig( + enable_stealth=True, + headless=False + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url) + if result.success and "Access Denied" not in result.html: + return result + + # Step 2: If blocked, try undetected browser + print("Regular + stealth blocked, trying undetected browser...") + + adapter = UndetectedAdapter() + strategy = AsyncPlaywrightCrawlerStrategy( + browser_config=browser_config, + browser_adapter=adapter + ) + + async with AsyncWebCrawler( + crawler_strategy=strategy, + config=browser_config + ) as crawler: + result = await crawler.arun(url) + return result +``` + +## Installation + +The undetected browser dependencies are automatically installed when you run: + +```bash +crawl4ai-setup +``` + +This command installs all necessary browser dependencies for both regular and undetected modes. + +## Limitations + +- **Performance**: Slightly slower than regular mode due to additional patches +- **Headless Detection**: Some sites can still detect headless mode +- **Resource Usage**: May use more resources than regular mode +- **Not 100% Guaranteed**: Advanced anti-bot services are constantly evolving + +## Troubleshooting + +### Browser Not Found + +Run the setup command: +```bash +crawl4ai-setup +``` + +### Detection Still Occurring + +Try combining with other features: +```python +crawler_config = CrawlerRunConfig( + simulate_user=True, # Add user simulation + magic=True, # Enable magic mode + wait_time=5.0, # Longer waits +) +``` + +### Performance Issues + +If experiencing slow performance: +```python +# Use selective undetected mode only for protected sites +if is_protected_site(url): + adapter = UndetectedAdapter() +else: + adapter = PlaywrightAdapter() # Default adapter +``` + +## Future Plans + +**Note**: In future versions of Crawl4AI, we may enable stealth mode and undetected browser by default to provide better out-of-the-box success rates. For now, users should explicitly enable these features when needed. + +## Conclusion + +Crawl4AI provides flexible anti-bot solutions: + +1. **Start Simple**: Use regular browser + stealth mode for most sites +2. **Escalate if Needed**: Switch to undetected browser for sophisticated protection +3. 
**Combine for Maximum Effect**: Use both features together when facing the toughest challenges + +Remember: +- Always respect robots.txt and website terms of service +- Use appropriate delays to avoid overwhelming servers +- Consider the performance trade-offs of each approach +- Test progressively to find the minimum necessary evasion level + +## See Also + +- [Advanced Features](advanced-features.md) - Overview of all advanced features +- [Proxy & Security](proxy-security.md) - Using proxies with anti-bot features +- [Session Management](session-management.md) - Maintaining sessions across requests +- [Identity Based Crawling](identity-based-crawling.md) - Additional anti-detection strategies \ No newline at end of file diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md index 34a69d7b..e86bed4c 100644 --- a/docs/md_v2/core/browser-crawler-config.md +++ b/docs/md_v2/core/browser-crawler-config.md @@ -29,6 +29,7 @@ class BrowserConfig: text_mode=False, light_mode=False, extra_args=None, + enable_stealth=False, # ... other advanced parameters omitted here ): ... @@ -84,6 +85,11 @@ class BrowserConfig: - Additional flags for the underlying browser. - E.g. `["--disable-extensions"]`. +11. **`enable_stealth`**: + - If `True`, enables stealth mode using playwright-stealth. + - Modifies browser fingerprints to avoid basic bot detection. + - Default is `False`. Recommended for sites with bot protection. + ### Helper Methods Both configuration classes provide a `clone()` method to create modified copies: diff --git a/docs/md_v2/core/examples.md b/docs/md_v2/core/examples.md index 25186de6..b1c52013 100644 --- a/docs/md_v2/core/examples.md +++ b/docs/md_v2/core/examples.md @@ -54,6 +54,16 @@ This page provides a comprehensive list of example scripts that demonstrate vari | Crypto Analysis | Demonstrates how to crawl and analyze cryptocurrency data. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/crypto_analysis_example.py) | | SERP API | Demonstrates using Crawl4AI with search engine result pages. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/serp_api_project_11_feb.py) | +## Anti-Bot & Stealth Features + +| Example | Description | Link | +|---------|-------------|------| +| Stealth Mode Quick Start | Five practical examples showing how to use stealth mode for bypassing basic bot detection. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/stealth_mode_quick_start.py) | +| Stealth Mode Comprehensive | Comprehensive demonstration of stealth mode features with bot detection testing and comparisons. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/stealth_mode_example.py) | +| Undetected Browser | Simple example showing how to use the undetected browser adapter. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/hello_world_undetected.py) | +| Undetected Browser Demo | Basic demo comparing regular and undetected browser modes. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/undetected_simple_demo.py) | +| Undetected Tests | Advanced tests comparing regular vs undetected browsers on various bot detection services. 
| [View Folder](https://github.com/unclecode/crawl4ai/tree/main/docs/examples/undetectability/) | + ## Customization & Security | Example | Description | Link | diff --git a/docs/md_v2/core/installation.md b/docs/md_v2/core/installation.md index 2e1fd431..6cd44068 100644 --- a/docs/md_v2/core/installation.md +++ b/docs/md_v2/core/installation.md @@ -18,7 +18,7 @@ crawl4ai-setup ``` **What does it do?** -- Installs or updates required Playwright browsers (Chromium, Firefox, etc.) +- Installs or updates required browser dependencies for both regular and undetected modes - Performs OS-level checks (e.g., missing libs on Linux) - Confirms your environment is ready to crawl diff --git a/mkdocs.yml b/mkdocs.yml index 1cc65101..ff148547 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -45,6 +45,7 @@ nav: - "Lazy Loading": "advanced/lazy-loading.md" - "Hooks & Auth": "advanced/hooks-auth.md" - "Proxy & Security": "advanced/proxy-security.md" + - "Undetected Browser": "advanced/undetected-browser.md" - "Session Management": "advanced/session-management.md" - "Multi-URL Crawling": "advanced/multi-url-crawling.md" - "Crawl Dispatcher": "advanced/crawl-dispatcher.md" diff --git a/pyproject.toml b/pyproject.toml index 3d70a68d..9b00bd28 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -13,34 +13,34 @@ authors = [ {name = "Unclecode", email = "unclecode@kidocode.com"} ] dependencies = [ + "aiofiles>=24.1.0", + "aiohttp>=3.11.11", "aiosqlite~=0.20", + "anyio>=4.0.0", "lxml~=5.3", "litellm>=1.53.1", "numpy>=1.26.0,<3", "pillow>=10.4", "playwright>=1.49.0", + "patchright>=1.49.0", "python-dotenv~=1.0", "requests~=2.26", "beautifulsoup4~=4.12", "tf-playwright-stealth>=1.1.0", "xxhash~=3.4", "rank-bm25~=0.2", - "aiofiles>=24.1.0", "snowballstemmer~=2.2", "pydantic>=2.10", "pyOpenSSL>=24.3.0", "psutil>=6.1.1", + "PyYAML>=6.0", "nltk>=3.9.1", - "playwright", "rich>=13.9.4", - "cssselect>=1.2.0", "httpx>=0.27.2", "httpx[http2]>=0.27.2", "fake-useragent>=2.0.3", "click>=8.1.7", - "pyperclip>=1.8.2", "chardet>=5.2.0", - "aiohttp>=3.11.11", "brotli>=1.1.0", "humanize>=4.10.0", "lark>=1.2.2", diff --git a/requirements.txt b/requirements.txt index 001d090d..20f4df4f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,26 +1,29 @@ # Note: These requirements are also specified in pyproject.toml # This file is kept for development environment setup and compatibility +aiofiles>=24.1.0 +aiohttp>=3.11.11 aiosqlite~=0.20 +anyio>=4.0.0 lxml~=5.3 litellm>=1.53.1 numpy>=1.26.0,<3 pillow>=10.4 playwright>=1.49.0 +patchright>=1.49.0 python-dotenv~=1.0 requests~=2.26 beautifulsoup4~=4.12 tf-playwright-stealth>=1.1.0 xxhash~=3.4 rank-bm25~=0.2 -aiofiles>=24.1.0 colorama~=0.4 snowballstemmer~=2.2 pydantic>=2.10 pyOpenSSL>=24.3.0 psutil>=6.1.1 +PyYAML>=6.0 nltk>=3.9.1 rich>=13.9.4 -cssselect>=1.2.0 chardet>=5.2.0 brotli>=1.1.0 httpx[http2]>=0.27.2 diff --git a/tests/check_dependencies.py b/tests/check_dependencies.py new file mode 100755 index 00000000..e47ec372 --- /dev/null +++ b/tests/check_dependencies.py @@ -0,0 +1,344 @@ +#!/usr/bin/env python3 +""" +Dependency checker for Crawl4AI +Analyzes imports in the codebase and shows which files use them +""" + +import ast +import os +import sys +from pathlib import Path +from typing import Set, Dict, List, Tuple +from collections import defaultdict +import re +import toml + +# Standard library modules to ignore +STDLIB_MODULES = { + 'abc', 'argparse', 'asyncio', 'base64', 'collections', 'concurrent', 'contextlib', + 'copy', 'datetime', 'decimal', 'email', 'enum', 'functools', 
'glob', 'hashlib', + 'http', 'importlib', 'io', 'itertools', 'json', 'logging', 'math', 'mimetypes', + 'multiprocessing', 'os', 'pathlib', 'pickle', 'platform', 'pprint', 'random', + 're', 'shutil', 'signal', 'socket', 'sqlite3', 'string', 'subprocess', 'sys', + 'tempfile', 'threading', 'time', 'traceback', 'typing', 'unittest', 'urllib', + 'uuid', 'warnings', 'weakref', 'xml', 'zipfile', 'dataclasses', 'secrets', + 'statistics', 'textwrap', 'queue', 'csv', 'gzip', 'tarfile', 'configparser', + 'inspect', 'operator', 'struct', 'binascii', 'codecs', 'locale', 'gc', + 'atexit', 'builtins', 'html', 'errno', 'fcntl', 'pwd', 'grp', 'resource', + 'termios', 'tty', 'pty', 'select', 'selectors', 'ssl', 'zlib', 'bz2', + 'lzma', 'types', 'copy', 'pydoc', 'profile', 'cProfile', 'timeit', + 'trace', 'doctest', 'pdb', 'contextvars', 'dataclasses', 'graphlib', + 'zoneinfo', 'tomllib', 'cgi', 'wsgiref', 'fileinput', 'linecache', + 'tokenize', 'tabnanny', 'compileall', 'dis', 'pickletools', 'formatter', + '__future__', 'array', 'ctypes', 'heapq', 'bisect', 'array', 'weakref', + 'types', 'copy', 'pprint', 'repr', 'numbers', 'cmath', 'fractions', + 'statistics', 'itertools', 'functools', 'operator', 'pathlib', 'fileinput', + 'stat', 'filecmp', 'tempfile', 'glob', 'fnmatch', 'linecache', 'shutil', + 'pickle', 'copyreg', 'shelve', 'marshal', 'dbm', 'sqlite3', 'zlib', 'gzip', + 'bz2', 'lzma', 'zipfile', 'tarfile', 'configparser', 'netrc', 'xdrlib', + 'plistlib', 'hashlib', 'hmac', 'secrets', 'os', 'io', 'time', 'argparse', + 'getopt', 'logging', 'getpass', 'curses', 'platform', 'errno', 'ctypes', + 'threading', 'multiprocessing', 'concurrent', 'subprocess', 'sched', 'queue', + 'contextvars', 'asyncio', 'socket', 'ssl', 'email', 'json', 'mailcap', + 'mailbox', 'mimetypes', 'base64', 'binhex', 'binascii', 'quopri', 'uu', + 'html', 'xml', 'webbrowser', 'cgi', 'cgitb', 'wsgiref', 'urllib', 'http', + 'ftplib', 'poplib', 'imaplib', 'nntplib', 'smtplib', 'smtpd', 'telnetlib', + 'uuid', 'socketserver', 'xmlrpc', 'ipaddress', 'audioop', 'aifc', 'sunau', + 'wave', 'chunk', 'colorsys', 'imghdr', 'sndhdr', 'ossaudiodev', 'gettext', + 'locale', 'turtle', 'cmd', 'shlex', 'tkinter', 'typing', 'pydoc', 'doctest', + 'unittest', 'test', '2to3', 'distutils', 'venv', 'ensurepip', 'zipapp', + 'py_compile', 'compileall', 'dis', 'pickletools', 'pdb', 'timeit', 'trace', + 'tracemalloc', 'warnings', 'faulthandler', 'pdb', 'dataclasses', 'cgi', + 'cgitb', 'chunk', 'crypt', 'imghdr', 'mailcap', 'nis', 'nntplib', 'optparse', + 'ossaudiodev', 'pipes', 'smtpd', 'sndhdr', 'spwd', 'sunau', 'telnetlib', + 'uu', 'xdrlib', 'msilib', 'pstats', 'rlcompleter', 'tkinter', 'ast' +} + +# Known package name mappings (import name -> package name) +PACKAGE_MAPPINGS = { + 'bs4': 'beautifulsoup4', + 'PIL': 'pillow', + 'cv2': 'opencv-python', + 'sklearn': 'scikit-learn', + 'yaml': 'PyYAML', + 'OpenSSL': 'pyOpenSSL', + 'sqlalchemy': 'SQLAlchemy', + 'playwright': 'playwright', + 'patchright': 'patchright', + 'dotenv': 'python-dotenv', + 'fake_useragent': 'fake-useragent', + 'playwright_stealth': 'tf-playwright-stealth', + 'sentence_transformers': 'sentence-transformers', + 'rank_bm25': 'rank-bm25', + 'snowballstemmer': 'snowballstemmer', + 'PyPDF2': 'PyPDF2', + 'pdf2image': 'pdf2image', +} + + +class ImportVisitor(ast.NodeVisitor): + """AST visitor to extract imports from Python files""" + + def __init__(self): + self.imports = {} # Changed to dict to store line numbers + self.from_imports = {} + + def visit_Import(self, node): + for alias in node.names: + 
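            # Record the top-level module name and the line number where it is imported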
module_name = alias.name.split('.')[0] + if module_name not in self.imports: + self.imports[module_name] = [] + self.imports[module_name].append(node.lineno) + + def visit_ImportFrom(self, node): + if node.module and node.level == 0: # absolute imports only + module_name = node.module.split('.')[0] + if module_name not in self.from_imports: + self.from_imports[module_name] = [] + self.from_imports[module_name].append(node.lineno) + + +def extract_imports_from_file(filepath: Path) -> Dict[str, List[int]]: + """Extract all imports from a Python file with line numbers""" + all_imports = {} + + try: + with open(filepath, 'r', encoding='utf-8') as f: + content = f.read() + + tree = ast.parse(content) + visitor = ImportVisitor() + visitor.visit(tree) + + # Merge imports and from_imports + for module, lines in visitor.imports.items(): + if module not in all_imports: + all_imports[module] = [] + all_imports[module].extend(lines) + + for module, lines in visitor.from_imports.items(): + if module not in all_imports: + all_imports[module] = [] + all_imports[module].extend(lines) + + except Exception as e: + # Silently skip files that can't be parsed + pass + + return all_imports + + +def get_codebase_imports_with_files(root_dir: Path) -> Dict[str, List[Tuple[str, List[int]]]]: + """Get all imports from the crawl4ai library and docs folders with file locations and line numbers""" + import_to_files = defaultdict(list) + + # Only scan crawl4ai library folder and docs folder + target_dirs = [ + root_dir / 'crawl4ai', + root_dir / 'docs' + ] + + for target_dir in target_dirs: + if not target_dir.exists(): + continue + + for py_file in target_dir.rglob('*.py'): + # Skip __pycache__ directories + if '__pycache__' in py_file.parts: + continue + + # Skip setup.py and similar files + if py_file.name in ['setup.py', 'setup.cfg', 'conf.py']: + continue + + imports = extract_imports_from_file(py_file) + + # Map each import to the file and line numbers + for imp, line_numbers in imports.items(): + relative_path = py_file.relative_to(root_dir) + import_to_files[imp].append((str(relative_path), sorted(line_numbers))) + + return dict(import_to_files) + + +def get_declared_dependencies() -> Set[str]: + """Get declared dependencies from pyproject.toml and requirements.txt""" + declared = set() + + # Read from pyproject.toml + if Path('pyproject.toml').exists(): + with open('pyproject.toml', 'r') as f: + data = toml.load(f) + + # Get main dependencies + deps = data.get('project', {}).get('dependencies', []) + for dep in deps: + # Parse dependency string (e.g., "numpy>=1.26.0,<3") + match = re.match(r'^([a-zA-Z0-9_-]+)', dep) + if match: + pkg_name = match.group(1).lower() + declared.add(pkg_name) + + # Get optional dependencies + optional = data.get('project', {}).get('optional-dependencies', {}) + for group, deps in optional.items(): + for dep in deps: + match = re.match(r'^([a-zA-Z0-9_-]+)', dep) + if match: + pkg_name = match.group(1).lower() + declared.add(pkg_name) + + # Also check requirements.txt as backup + if Path('requirements.txt').exists(): + with open('requirements.txt', 'r') as f: + for line in f: + line = line.strip() + if line and not line.startswith('#'): + match = re.match(r'^([a-zA-Z0-9_-]+)', line) + if match: + pkg_name = match.group(1).lower() + declared.add(pkg_name) + + return declared + + +def normalize_package_name(name: str) -> str: + """Normalize package name for comparison""" + # Handle known mappings first + if name in PACKAGE_MAPPINGS: + return PACKAGE_MAPPINGS[name].lower() + + # Basic 
normalization + return name.lower().replace('_', '-') + + +def check_missing_dependencies(): + """Main function to check for missing dependencies""" + print("🔍 Analyzing crawl4ai library and docs folders...\n") + + # Get all imports with their file locations + root_dir = Path('.') + import_to_files = get_codebase_imports_with_files(root_dir) + + # Get declared dependencies + declared_deps = get_declared_dependencies() + + # Normalize declared dependencies + normalized_declared = {normalize_package_name(dep) for dep in declared_deps} + + # Categorize imports + external_imports = {} + local_imports = {} + + # Known local packages + local_packages = {'crawl4ai'} + + for imp, file_info in import_to_files.items(): + # Skip standard library + if imp in STDLIB_MODULES: + continue + + # Check if it's a local import + if any(imp.startswith(local) for local in local_packages): + local_imports[imp] = file_info + else: + external_imports[imp] = file_info + + # Check which external imports are not declared + not_declared = {} + declared_imports = {} + + for imp, file_info in external_imports.items(): + normalized_imp = normalize_package_name(imp) + + # Check if import is covered by declared dependencies + found = False + for declared in normalized_declared: + if normalized_imp == declared or normalized_imp.startswith(declared + '.') or declared.startswith(normalized_imp): + found = True + break + + if found: + declared_imports[imp] = file_info + else: + not_declared[imp] = file_info + + # Print results + print(f"📊 Summary:") + print(f" - Total unique imports: {len(import_to_files)}") + print(f" - External imports: {len(external_imports)}") + print(f" - Declared dependencies: {len(declared_deps)}") + print(f" - External imports NOT in dependencies: {len(not_declared)}\n") + + if not_declared: + print("❌ External imports NOT declared in pyproject.toml or requirements.txt:\n") + + # Sort by import name + for imp in sorted(not_declared.keys()): + file_info = not_declared[imp] + print(f" 📦 {imp}") + if imp in PACKAGE_MAPPINGS: + print(f" → Package name: {PACKAGE_MAPPINGS[imp]}") + + # Show up to 3 files that use this import + for i, (file_path, line_numbers) in enumerate(file_info[:3]): + # Format line numbers for clickable output + if len(line_numbers) == 1: + print(f" - {file_path}:{line_numbers[0]}") + else: + # Show first few line numbers + line_str = ','.join(str(ln) for ln in line_numbers[:3]) + if len(line_numbers) > 3: + line_str += f"... ({len(line_numbers)} imports)" + print(f" - {file_path}: lines {line_str}") + + if len(file_info) > 3: + print(f" ... 
and {len(file_info) - 3} more files") + print() + + # Check for potentially unused dependencies + print("\n🔎 Checking declared dependencies usage...\n") + + # Get all used external packages + used_packages = set() + for imp in external_imports.keys(): + normalized = normalize_package_name(imp) + used_packages.add(normalized) + + # Find unused + unused = [] + for dep in declared_deps: + normalized_dep = normalize_package_name(dep) + + # Check if any import uses this dependency + found_usage = False + for used in used_packages: + if used == normalized_dep or used.startswith(normalized_dep) or normalized_dep.startswith(used): + found_usage = True + break + + if not found_usage: + # Some packages are commonly unused directly + indirect_deps = {'wheel', 'setuptools', 'pip', 'colorama', 'certifi', 'packaging', 'urllib3'} + if normalized_dep not in indirect_deps: + unused.append(dep) + + if unused: + print("⚠️ Declared dependencies with NO imports found:") + for dep in sorted(unused): + print(f" - {dep}") + print("\n Note: These might be used indirectly or by other dependencies") + else: + print("✅ All declared dependencies have corresponding imports") + + print("\n" + "="*60) + print("💡 How to use this report:") + print(" 1. Check each ❌ import to see if it's legitimate") + print(" 2. If legitimate, add the package to pyproject.toml") + print(" 3. If it's an internal module or typo, fix the import") + print(" 4. Review unused dependencies - remove if truly not needed") + print("="*60) + + +if __name__ == '__main__': + check_missing_dependencies() \ No newline at end of file
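For reference, a minimal way to try the checker above — assuming it lands as `tests/check_dependencies.py` as in this diff and that the third-party `toml` package is available — is to run it from the repository root, since it resolves `pyproject.toml` and `requirements.txt` relative to the current directory:

```bash
# Run from the repository root so pyproject.toml and requirements.txt are found
pip install toml                      # the checker's only extra dependency, if not already installed
python tests/check_dependencies.py
```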