diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index d0a9b9e1..44c83262 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -7,7 +7,7 @@ from .config import ( SOCIAL_MEDIA_DOMAINS, ) -from .user_agent_generator import UserAgentGenerator +from .user_agent_generator import UserAgentGenerator, UAGen, ValidUAGenerator, OnlineUAGenerator from .extraction_strategy import ExtractionStrategy from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import MarkdownGenerationStrategy @@ -100,11 +100,13 @@ class BrowserConfig: cookies: list = None, headers: dict = None, user_agent: str = ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" + # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 " + # "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + # "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36" ), - user_agent_mode: str = None, - user_agent_generator_config: dict = None, + user_agent_mode: str = "", + user_agent_generator_config: dict = {}, text_mode: bool = False, light_mode: bool = False, extra_args: list = None, @@ -143,17 +145,15 @@ class BrowserConfig: self.verbose = verbose self.debugging_port = debugging_port - user_agenr_generator = UserAgentGenerator() - if self.user_agent_mode != "random" and self.user_agent_generator_config: - self.user_agent = user_agenr_generator.generate( + fa_user_agenr_generator = ValidUAGenerator() + if self.user_agent_mode == "random": + self.user_agent = fa_user_agenr_generator.generate( **(self.user_agent_generator_config or {}) ) - elif self.user_agent_mode == "random": - self.user_agent = user_agenr_generator.generate() else: pass - - self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) + + self.browser_hint = UAGen.generate_client_hints(self.user_agent) self.headers.setdefault("sec-ch-ua", self.browser_hint) # If persistent context is requested, ensure managed browser is enabled @@ -382,6 +382,11 @@ class CrawlerRunConfig: stream (bool): If True, stream the page content as it is being loaded. url: str = None # This is not a compulsory parameter check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False + user_agent (str): Custom User-Agent string to use. Default: None + user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided + user_agent as-is. Default: None. + user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set. + Default: None. """ def __init__( @@ -453,6 +458,9 @@ class CrawlerRunConfig: stream: bool = False, url: str = None, check_robots_txt: bool = False, + user_agent: str = None, + user_agent_mode: str = None, + user_agent_generator_config: dict = {}, ): self.url = url @@ -535,6 +543,11 @@ class CrawlerRunConfig: # Robots.txt Handling Parameters self.check_robots_txt = check_robots_txt + # User Agent Parameters + self.user_agent = user_agent + self.user_agent_mode = user_agent_mode + self.user_agent_generator_config = user_agent_generator_config + # Validate type of extraction strategy and chunking strategy if they are provided if self.extraction_strategy is not None and not isinstance( self.extraction_strategy, ExtractionStrategy @@ -632,6 +645,9 @@ class CrawlerRunConfig: stream=kwargs.get("stream", False), url=kwargs.get("url"), check_robots_txt=kwargs.get("check_robots_txt", False), + user_agent=kwargs.get("user_agent"), + user_agent_mode=kwargs.get("user_agent_mode"), + user_agent_generator_config=kwargs.get("user_agent_generator_config", {}), ) # Create a funciton returns dict of the object @@ -695,6 +711,9 @@ class CrawlerRunConfig: "stream": self.stream, "url": self.url, "check_robots_txt": self.check_robots_txt, + "user_agent": self.user_agent, + "user_agent_mode": self.user_agent_mode, + "user_agent_generator_config": self.user_agent_generator_config, } def clone(self, **kwargs): diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index b11796e0..62ee4c65 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -23,6 +23,7 @@ from .async_logger import AsyncLogger from playwright_stealth import StealthConfig from .ssl_certificate import SSLCertificate from .utils import get_home_folder, get_chromium_path +from .user_agent_generator import ValidUAGenerator, OnlineUAGenerator stealth_config = StealthConfig( webdriver=True, @@ -128,6 +129,7 @@ class ManagedBrowser: self.host = host self.logger = logger self.shutting_down = False + self.cdp_url = cdp_url async def start(self) -> str: """ @@ -563,7 +565,7 @@ class BrowserManager: Context: Browser context object with the specified configurations """ # Base settings - user_agent = self.config.headers.get("User-Agent", self.config.user_agent) + user_agent = self.config.headers.get("User-Agent", self.config.user_agent) viewport_settings = { "width": self.config.viewport_width, "height": self.config.viewport_height, @@ -1269,10 +1271,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self._downloaded_files = [] # Handle user agent with magic mode - user_agent = self.browser_config.user_agent - if config.magic and self.browser_config.user_agent_mode != "random": - self.browser_config.user_agent = UserAgentGenerator().generate( - **(self.browser_config.user_agent_generator_config or {}) + user_agent_to_override = config.user_agent + if user_agent_to_override: + self.browser_config.user_agent = user_agent_to_override + elif config.magic or config.user_agent_mode == "random": + self.browser_config.user_agent = ValidUAGenerator().generate( + **(config.user_agent_generator_config or {}) ) # Get page for session diff --git a/crawl4ai/user_agent_generator.py b/crawl4ai/user_agent_generator.py index 4f0f42cb..91e7a31d 100644 --- a/crawl4ai/user_agent_generator.py +++ b/crawl4ai/user_agent_generator.py @@ -2,8 +2,146 @@ import random from typing import Optional, Literal, List, Dict, Tuple import re +from abc import ABC, abstractmethod +import random +from fake_useragent import UserAgent +import requests +from lxml import html +import json +from typing import Optional, List, Union, Dict -class UserAgentGenerator: +class UAGen(ABC): + @abstractmethod + def generate(self, + browsers: Optional[List[str]] = None, + os: Optional[Union[str, List[str]]] = None, + min_version: float = 0.0, + platforms: Optional[Union[str, List[str]]] = None, + pct_threshold: Optional[float] = None, + fallback: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36") -> Union[str, Dict]: + pass + + @staticmethod + def generate_client_hints( user_agent: str) -> str: + """Generate Sec-CH-UA header value based on user agent string""" + def _parse_user_agent(user_agent: str) -> Dict[str, str]: + """Parse a user agent string to extract browser and version information""" + browsers = { + "chrome": r"Chrome/(\d+)", + "edge": r"Edg/(\d+)", + "safari": r"Version/(\d+)", + "firefox": r"Firefox/(\d+)", + } + + result = {} + for browser, pattern in browsers.items(): + match = re.search(pattern, user_agent) + if match: + result[browser] = match.group(1) + + return result + browsers = _parse_user_agent(user_agent) + + # Client hints components + hints = [] + + # Handle different browser combinations + if "chrome" in browsers: + hints.append(f'"Chromium";v="{browsers["chrome"]}"') + hints.append('"Not_A Brand";v="8"') + + if "edge" in browsers: + hints.append(f'"Microsoft Edge";v="{browsers["edge"]}"') + else: + hints.append(f'"Google Chrome";v="{browsers["chrome"]}"') + + elif "firefox" in browsers: + # Firefox doesn't typically send Sec-CH-UA + return '""' + + elif "safari" in browsers: + # Safari's format for client hints + hints.append(f'"Safari";v="{browsers["safari"]}"') + hints.append('"Not_A Brand";v="8"') + + return ", ".join(hints) + +class ValidUAGenerator(UAGen): + def __init__(self): + self.ua = UserAgent() + + def generate(self, + browsers: Optional[List[str]] = None, + os: Optional[Union[str, List[str]]] = None, + min_version: float = 0.0, + platforms: Optional[Union[str, List[str]]] = None, + pct_threshold: Optional[float] = None, + fallback: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36") -> str: + + self.ua = UserAgent( + browsers=browsers or ['Chrome', 'Firefox', 'Edge'], + os=os or ['Windows', 'Mac OS X'], + min_version=min_version, + platforms=platforms or ['desktop'], + fallback=fallback + ) + return self.ua.random + +class OnlineUAGenerator(UAGen): + def __init__(self): + self.agents = [] + self._fetch_agents() + + def _fetch_agents(self): + try: + response = requests.get( + 'https://www.useragents.me/', + timeout=5, + headers={'Accept': 'text/html,application/xhtml+xml'} + ) + response.raise_for_status() + + tree = html.fromstring(response.content) + json_text = tree.cssselect('#most-common-desktop-useragents-json-csv > div:nth-child(1) > textarea')[0].text + self.agents = json.loads(json_text) + except Exception as e: + print(f"Error fetching agents: {e}") + + def generate(self, + browsers: Optional[List[str]] = None, + os: Optional[Union[str, List[str]]] = None, + min_version: float = 0.0, + platforms: Optional[Union[str, List[str]]] = None, + pct_threshold: Optional[float] = None, + fallback: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36") -> Dict: + + if not self.agents: + self._fetch_agents() + + filtered_agents = self.agents + + if pct_threshold: + filtered_agents = [a for a in filtered_agents if a['pct'] >= pct_threshold] + + if browsers: + filtered_agents = [a for a in filtered_agents + if any(b.lower() in a['ua'].lower() for b in browsers)] + + if os: + os_list = [os] if isinstance(os, str) else os + filtered_agents = [a for a in filtered_agents + if any(o.lower() in a['ua'].lower() for o in os_list)] + + if platforms: + platform_list = [platforms] if isinstance(platforms, str) else platforms + filtered_agents = [a for a in filtered_agents + if any(p.lower() in a['ua'].lower() for p in platform_list)] + + return filtered_agents[0] if filtered_agents else {'ua': fallback, 'pct': 0} + + + +class UserAgentGenerator(): """ Generate random user agents with specified constraints. @@ -187,9 +325,15 @@ class UserAgentGenerator: browser_stack = self.get_browser_stack(num_browsers) # Add appropriate legacy token based on browser stack - if "Firefox" in str(browser_stack): + if "Firefox" in str(browser_stack) or browser_type == "firefox": components.append(random.choice(self.rendering_engines["gecko"])) - elif "Chrome" in str(browser_stack) or "Safari" in str(browser_stack): + elif "Chrome" in str(browser_stack) or "Safari" in str(browser_stack) or browser_type == "chrome": + components.append(self.rendering_engines["chrome_webkit"]) + components.append("(KHTML, like Gecko)") + elif "Edge" in str(browser_stack) or browser_type == "edge": + components.append(self.rendering_engines["safari_webkit"]) + components.append("(KHTML, like Gecko)") + elif "Safari" in str(browser_stack) or browser_type == "safari": components.append(self.rendering_engines["chrome_webkit"]) components.append("(KHTML, like Gecko)") @@ -273,27 +417,13 @@ class UserAgentGenerator: # Example usage: if __name__ == "__main__": - generator = UserAgentGenerator() - print(generator.generate()) + + # Usage example: + generator = ValidUAGenerator() + ua = generator.generate() + print(ua) + + generator = OnlineUAGenerator() + ua = generator.generate() + print(ua) - print("\nSingle browser (Chrome):") - print(generator.generate(num_browsers=1, browser_type="chrome")) - - print("\nTwo browsers (Gecko/Firefox):") - print(generator.generate(num_browsers=2)) - - print("\nThree browsers (Chrome/Safari/Edge):") - print(generator.generate(num_browsers=3)) - - print("\nFirefox on Linux:") - print( - generator.generate( - device_type="desktop", - os_type="linux", - browser_type="firefox", - num_browsers=2, - ) - ) - - print("\nChrome/Safari/Edge on Windows:") - print(generator.generate(device_type="desktop", os_type="windows", num_browsers=3))