diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index cc7f3993..3d24bd84 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -16,6 +16,7 @@ import json import uuid from .models import AsyncCrawlResponse from .utils import create_box_message +from .user_agent_generator import UserAgentGenerator from playwright_stealth import StealthConfig, stealth_async stealth_config = StealthConfig( @@ -222,14 +223,20 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self.use_cached_html = use_cached_html self.user_agent = kwargs.get( "user_agent", - "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" + "Mozilla/5.0 (Linux; Android 11; SM-G973F) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.120 Mobile Safari/537.36" ) + user_agenr_generator = UserAgentGenerator() + if kwargs.get("user_agent_mode") == "random": + self.user_agent = user_agenr_generator.generate( + **kwargs.get("user_agent_generator_config", {}) + ) self.proxy = kwargs.get("proxy") self.proxy_config = kwargs.get("proxy_config") self.headless = kwargs.get("headless", True) self.browser_type = kwargs.get("browser_type", "chromium") self.headers = kwargs.get("headers", {}) + self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) + self.headers.setdefault("sec-ch-ua", self.browser_hint) self.cookies = kwargs.get("cookies", []) self.sessions = {} self.session_ttl = 1800 @@ -307,7 +314,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): if self.user_agent: await self.default_context.set_extra_http_headers({ - "User-Agent": self.user_agent + "User-Agent": self.user_agent, + "sec-ch-ua": self.browser_hint, + # **self.headers }) else: # Base browser arguments @@ -321,7 +330,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "--disable-infobars", "--window-position=0,0", "--ignore-certificate-errors", - 
"""Random User-Agent strings and matching ``Sec-CH-UA`` client hints."""

import random
import re
from typing import Dict, List, Literal, Optional, Tuple


class UserAgentGenerator:
    """Build randomized User-Agent strings from platform and browser tables.

    A user agent is assembled as
    ``Mozilla/5.0 <platform> <engine tokens> <browser tokens>``.
    The tables are plain instance attributes so callers can extend or
    replace them after construction.
    """

    # Order in which a combination is attributed to a single browser. It
    # mirrors generate_client_hints(): an ``Edg/`` token wins over
    # ``Chrome/``, which wins over a bare ``Safari/`` compatibility token.
    _BROWSER_PRECEDENCE = ("edge", "firefox", "chrome", "safari")

    # Prefixes of rendering-engine tokens (as opposed to browser tokens).
    _ENGINE_PREFIXES = ("Gecko/", "AppleWebKit/")

    def __init__(self):
        # Platform fragments, keyed by OS and then by variant/brand.
        self.desktop_platforms = {
            "windows": {
                "10_64": "(Windows NT 10.0; Win64; x64)",
                "10_32": "(Windows NT 10.0; WOW64)",
            },
            "macos": {
                "intel": "(Macintosh; Intel Mac OS X 10_15_7)",
                "newer": "(Macintosh; Intel Mac OS X 10.15; rv:109.0)",
            },
            "linux": {
                "generic": "(X11; Linux x86_64)",
                "ubuntu": "(X11; Ubuntu; Linux x86_64)",
                "chrome_os": "(X11; CrOS x86_64 14541.0.0)",
            },
        }

        self.mobile_platforms = {
            "android": {
                "samsung": "(Linux; Android 13; SM-S901B)",
                "pixel": "(Linux; Android 12; Pixel 6)",
                "oneplus": "(Linux; Android 13; OnePlus 9 Pro)",
                "xiaomi": "(Linux; Android 12; M2102J20SG)",
            },
            "ios": {
                "iphone": "(iPhone; CPU iPhone OS 16_5 like Mac OS X)",
                "ipad": "(iPad; CPU OS 16_5 like Mac OS X)",
            },
        }

        # Valid token combinations, keyed by the number of entries.
        self.browser_combinations = {
            1: [
                ["chrome"],
                ["firefox"],
                ["safari"],
                ["edge"],
            ],
            2: [
                ["gecko", "firefox"],
                ["chrome", "safari"],
                ["webkit", "safari"],
            ],
            3: [
                ["chrome", "safari", "edge"],
                ["webkit", "chrome", "safari"],
            ],
        }

        # Rendering-engine tokens.
        self.rendering_engines = {
            "chrome_webkit": "AppleWebKit/537.36",
            "safari_webkit": "AppleWebKit/605.1.15",
            # Kept as a list for compatibility with code that samples it.
            # The previous table also held the truncated "Gecko/2010010".
            "gecko": ["Gecko/20100101"],
        }

        # Browser version tokens.
        self.chrome_versions = [
            "Chrome/119.0.6045.199",
            "Chrome/118.0.5993.117",
            "Chrome/117.0.5938.149",
            "Chrome/116.0.5845.187",
            "Chrome/115.0.5790.171",
        ]

        self.edge_versions = [
            "Edg/119.0.2151.97",
            "Edg/118.0.2088.76",
            "Edg/117.0.2045.47",
            "Edg/116.0.1938.81",
            "Edg/115.0.1901.203",
        ]

        self.safari_versions = [
            "Safari/537.36",  # For Chrome-based
            "Safari/605.1.15",
            "Safari/604.1",
            "Safari/602.1",
            "Safari/601.5.17",
        ]

        self.firefox_versions = [
            "Firefox/119.0",
            "Firefox/118.0.2",
            "Firefox/117.0.1",
            "Firefox/116.0",
            "Firefox/115.0.3",
            "Firefox/114.0.2",
            "Firefox/113.0.1",
            "Firefox/112.0",
            "Firefox/111.0.1",
            "Firefox/110.0",
        ]

    @classmethod
    def _primary_browser(cls, combination: List[str]) -> Optional[str]:
        """Return the browser a combination identifies as, or None."""
        for name in cls._BROWSER_PRECEDENCE:
            if name in combination:
                return name
        return None

    def _combinations_for(self, browser_type: str,
                          num_browsers: int) -> List[List[str]]:
        """Return the combinations identifying as ``browser_type``.

        Sizes are tried in order of distance from ``num_browsers`` (smaller
        size first on ties), because not every browser has a combination of
        every size.
        """
        sizes = sorted(
            self.browser_combinations,
            key=lambda size: (abs(size - num_browsers), size),
        )
        for size in sizes:
            matching = [
                combination
                for combination in self.browser_combinations[size]
                if self._primary_browser(combination) == browser_type
            ]
            if matching:
                return matching
        raise ValueError(f"Unsupported browser type: {browser_type}")

    def get_browser_stack(self, num_browsers: int = 1,
                          browser_type: Optional[str] = None) -> List[str]:
        """Get a valid combination of engine and browser version tokens.

        Args:
            num_browsers: Number of entries in the combination (a key of
                ``browser_combinations``).
            browser_type: Optional browser the combination must identify
                as. When no combination of ``num_browsers`` entries does,
                the nearest size that has one is used instead.

        Returns:
            Tokens in combination order; ``gecko``/``webkit`` entries
            become engine tokens, the others become version tokens.

        Raises:
            ValueError: If ``num_browsers`` or ``browser_type`` is not
                supported.
        """
        if num_browsers not in self.browser_combinations:
            raise ValueError(f"Unsupported number of browsers: {num_browsers}")

        candidates = self.browser_combinations[num_browsers]
        if browser_type is not None:
            if browser_type not in self._BROWSER_PRECEDENCE:
                raise ValueError(f"Unsupported browser type: {browser_type}")
            candidates = self._combinations_for(browser_type, num_browsers)

        token_pools = {
            "chrome": self.chrome_versions,
            "firefox": self.firefox_versions,
            "safari": self.safari_versions,
            "edge": self.edge_versions,
            "gecko": self.rendering_engines["gecko"],
            "webkit": [self.rendering_engines["chrome_webkit"]],
        }

        browser_stack = []
        for name in random.choice(candidates):
            pool = token_pools.get(name)
            if pool:  # unknown names are skipped
                browser_stack.append(random.choice(pool))
        return browser_stack

    def generate(self,
                 device_type: Optional[Literal['desktop', 'mobile']] = None,
                 os_type: Optional[str] = None,
                 device_brand: Optional[str] = None,
                 browser_type: Optional[Literal['chrome', 'edge', 'safari', 'firefox']] = None,
                 num_browsers: int = 3) -> str:
        """Generate a random user agent with specified constraints.

        Args:
            device_type: 'desktop' or 'mobile'
            os_type: 'windows', 'macos', 'linux', 'android', 'ios'
            device_brand: Specific device brand
            browser_type: 'chrome', 'edge', 'safari', or 'firefox'
            num_browsers: Number of browser specifications (1-3). With
                ``browser_type`` set, the nearest size that can express
                that browser is used.

        Returns:
            The user agent string.

        Raises:
            ValueError: If ``num_browsers`` or ``browser_type`` is not
                supported.
        """
        platform = self.get_random_platform(device_type, os_type, device_brand)
        components = ["Mozilla/5.0", platform]

        browser_stack = self.get_browser_stack(num_browsers, browser_type)

        # The stack may already carry an engine token (gecko/webkit
        # entries). Split it out so each engine is emitted exactly once.
        products = [
            token for token in browser_stack
            if not token.startswith(self._ENGINE_PREFIXES)
        ]

        if any(token.startswith("Firefox/") for token in products):
            gecko = next(
                (token for token in browser_stack if token.startswith("Gecko/")),
                None,
            )
            components.append(
                gecko or random.choice(self.rendering_engines["gecko"])
            )
        elif products:
            # Chrome, Safari and Edge all present themselves as WebKit.
            components.append(self.rendering_engines["chrome_webkit"])
            components.append("(KHTML, like Gecko)")

        components.extend(products)
        return " ".join(components)

    def generate_with_client_hints(self, **kwargs) -> Tuple[str, str]:
        """Generate both user agent and matching client hints.

        Args:
            **kwargs: Forwarded unchanged to :meth:`generate`.

        Returns:
            ``(user_agent, sec_ch_ua)``.
        """
        user_agent = self.generate(**kwargs)
        return user_agent, self.generate_client_hints(user_agent)

    def get_random_platform(self, device_type, os_type, device_brand):
        """Pick a platform fragment matching the given constraints.

        Args:
            device_type: 'desktop', 'mobile', or anything else for both.
            os_type: OS key. A known key overrides ``device_type``; an
                unknown key is ignored.
            device_brand: Variant key within the chosen OS; ignored when
                that OS has no such variant.

        Returns:
            The parenthesised platform fragment.
        """
        if device_type == 'desktop':
            platforms = self.desktop_platforms
        elif device_type == 'mobile':
            platforms = self.mobile_platforms
        else:
            platforms = {**self.desktop_platforms, **self.mobile_platforms}

        if os_type:
            for platform_group in (self.desktop_platforms, self.mobile_platforms):
                if os_type in platform_group:
                    platforms = {os_type: platform_group[os_type]}
                    break

        variants = platforms[random.choice(list(platforms))]
        if device_brand and device_brand in variants:
            return variants[device_brand]
        return random.choice(list(variants.values()))

    def parse_user_agent(self, user_agent: str) -> Dict[str, str]:
        """Extract browser major versions from a user agent string.

        Args:
            user_agent: The user agent string to inspect.

        Returns:
            Mapping of browser name ('chrome', 'edge', 'safari', 'firefox')
            to major version, for each browser whose token is present.
            Safari is detected through its ``Version/`` token.
        """
        patterns = {
            'chrome': r'Chrome/(\d+)',
            'edge': r'Edg/(\d+)',
            'safari': r'Version/(\d+)',
            'firefox': r'Firefox/(\d+)',
        }

        result = {}
        for browser, pattern in patterns.items():
            match = re.search(pattern, user_agent)
            if match:
                result[browser] = match.group(1)
        return result

    def generate_client_hints(self, user_agent: str) -> str:
        """Generate Sec-CH-UA header value based on user agent string.

        Args:
            user_agent: The user agent string to derive hints from.

        Returns:
            The header value: a brand list for Chromium-based browsers and
            Safari, ``'""'`` for Firefox, and an empty string when no known
            browser token is found.
        """
        browsers = self.parse_user_agent(user_agent)

        # An Edge UA without a Chrome/ token is still Chromium-based, so
        # fall back to the Edge major version for the Chromium brand.
        chromium_version = browsers.get('chrome', browsers.get('edge'))
        if chromium_version is not None:
            hints = [
                f'"Chromium";v="{chromium_version}"',
                '"Not_A Brand";v="8"',
            ]
            if 'edge' in browsers:
                hints.append(f'"Microsoft Edge";v="{browsers["edge"]}"')
            else:
                hints.append(f'"Google Chrome";v="{browsers["chrome"]}"')
            return ', '.join(hints)

        if 'firefox' in browsers:
            # Firefox doesn't typically send Sec-CH-UA
            return '""'

        if 'safari' in browsers:
            return ', '.join([
                f'"Safari";v="{browsers["safari"]}"',
                '"Not_A Brand";v="8"',
            ])

        return ''


# Example usage:
if __name__ == "__main__":
    generator = UserAgentGenerator()

    print("\nSingle browser (Chrome):")
    print(generator.generate(num_browsers=1, browser_type='chrome'))

    print("\nTwo browsers (Gecko/Firefox):")
    print(generator.generate(num_browsers=2, browser_type='firefox'))

    print("\nThree browsers (Chrome/Safari/Edge):")
    print(generator.generate(num_browsers=3, browser_type='edge'))

    print("\nFirefox on Linux:")
    print(generator.generate(
        device_type='desktop',
        os_type='linux',
        browser_type='firefox',
        num_browsers=2,
    ))

    print("\nChrome/Safari/Edge on Windows:")
    print(generator.generate(
        device_type='desktop',
        os_type='windows',
        num_browsers=3,
    ))