diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 37dd8366..9dff4453 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -2,7 +2,7 @@ import warnings from .async_webcrawler import AsyncWebCrawler, CacheMode -from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig +from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig from .content_scraping_strategy import ( ContentScrapingStrategy, @@ -71,6 +71,7 @@ __all__ = [ "AsyncWebCrawler", "BrowserProfiler", "LLMConfig", + "GeolocationConfig", "DeepCrawlStrategy", "BFSDeepCrawlStrategy", "BestFirstCrawlingStrategy", diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index faa29024..dd5c584a 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -159,6 +159,55 @@ def is_empty_value(value: Any) -> bool: return True return False +class GeolocationConfig: + def __init__( + self, + latitude: float, + longitude: float, + accuracy: Optional[float] = 0.0 + ): + """Configuration class for geolocation settings. + + Args: + latitude: Latitude coordinate (e.g., 37.7749) + longitude: Longitude coordinate (e.g., -122.4194) + accuracy: Accuracy in meters. Default: 0.0 + """ + self.latitude = latitude + self.longitude = longitude + self.accuracy = accuracy + + @staticmethod + def from_dict(geo_dict: Dict) -> "GeolocationConfig": + """Create a GeolocationConfig from a dictionary.""" + return GeolocationConfig( + latitude=geo_dict.get("latitude"), + longitude=geo_dict.get("longitude"), + accuracy=geo_dict.get("accuracy", 0.0) + ) + + def to_dict(self) -> Dict: + """Convert to dictionary representation.""" + return { + "latitude": self.latitude, + "longitude": self.longitude, + "accuracy": self.accuracy + } + + def clone(self, **kwargs) -> "GeolocationConfig": + """Create a copy of this configuration with updated values. 
+ + Args: + **kwargs: Key-value pairs of configuration options to update + + Returns: + GeolocationConfig: A new instance with the specified updates + """ + config_dict = self.to_dict() + config_dict.update(kwargs) + return GeolocationConfig.from_dict(config_dict) + + class ProxyConfig: def __init__( self, @@ -680,6 +729,14 @@ class CrawlerRunConfig(): proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}. If None, no additional proxy config. Default: None. + # Browser Location and Identity Parameters + locale (str or None): Locale to use for the browser context (e.g., "en-US"). + Default: None. + timezone_id (str or None): Timezone identifier to use for the browser context (e.g., "America/New_York"). + Default: None. + geolocation (GeolocationConfig or None): Geolocation configuration for the browser. + Default: None. + # SSL Parameters fetch_ssl_certificate: bool = False, # Caching Parameters @@ -829,6 +886,10 @@ class CrawlerRunConfig(): scraping_strategy: ContentScrapingStrategy = None, proxy_config: Union[ProxyConfig, dict, None] = None, proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None, + # Browser Location and Identity Parameters + locale: Optional[str] = None, + timezone_id: Optional[str] = None, + geolocation: Optional[GeolocationConfig] = None, # SSL Parameters fetch_ssl_certificate: bool = False, # Caching Parameters @@ -917,6 +978,11 @@ class CrawlerRunConfig(): self.scraping_strategy = scraping_strategy or WebScrapingStrategy() self.proxy_config = proxy_config self.proxy_rotation_strategy = proxy_rotation_strategy + + # Browser Location and Identity Parameters + self.locale = locale + self.timezone_id = timezone_id + self.geolocation = geolocation # SSL Parameters self.fetch_ssl_certificate = fetch_ssl_certificate @@ -1057,6 +1123,10 @@ class CrawlerRunConfig(): scraping_strategy=kwargs.get("scraping_strategy"), proxy_config=kwargs.get("proxy_config"), 
proxy_rotation_strategy=kwargs.get("proxy_rotation_strategy"), + # Browser Location and Identity Parameters + locale=kwargs.get("locale", None), + timezone_id=kwargs.get("timezone_id", None), + geolocation=kwargs.get("geolocation", None), # SSL Parameters fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False), # Caching Parameters @@ -1166,6 +1236,9 @@ class CrawlerRunConfig(): "scraping_strategy": self.scraping_strategy, "proxy_config": self.proxy_config, "proxy_rotation_strategy": self.proxy_rotation_strategy, + "locale": self.locale, + "timezone_id": self.timezone_id, + "geolocation": self.geolocation, "fetch_ssl_certificate": self.fetch_ssl_certificate, "cache_mode": self.cache_mode, "session_id": self.session_id, diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 642fd6c2..4be5f938 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -76,6 +76,51 @@ class ManagedBrowser: _cleanup(): Terminates the browser process and removes the temporary directory. create_profile(): Static method to create a user profile by launching a browser for user interaction. 
""" + + @staticmethod + def build_browser_flags(config: BrowserConfig) -> List[str]: + """Common CLI flags for launching Chromium""" + flags = [ + "--disable-gpu", + "--disable-gpu-compositing", + "--disable-software-rasterizer", + "--no-sandbox", + "--disable-dev-shm-usage", + "--no-first-run", + "--no-default-browser-check", + "--disable-infobars", + "--window-position=0,0", + "--ignore-certificate-errors", + "--ignore-certificate-errors-spki-list", + "--disable-blink-features=AutomationControlled", + "--window-position=400,0", + "--disable-renderer-backgrounding", + "--disable-ipc-flooding-protection", + "--force-color-profile=srgb", + "--mute-audio", + "--disable-background-timer-throttling", + ] + if config.light_mode: + flags.extend(BROWSER_DISABLE_OPTIONS) + if config.text_mode: + flags.extend([ + "--blink-settings=imagesEnabled=false", + "--disable-remote-fonts", + "--disable-images", + "--disable-javascript", + "--disable-software-rasterizer", + "--disable-dev-shm-usage", + ]) + # proxy support + if config.proxy: + flags.append(f"--proxy-server={config.proxy}") + elif config.proxy_config: + creds = "" + if config.proxy_config.username and config.proxy_config.password: + creds = f"{config.proxy_config.username}:{config.proxy_config.password}@" + flags.append(f"--proxy-server={creds}{config.proxy_config.server}") + # dedupe + return list(dict.fromkeys(flags)) browser_type: str user_data_dir: str @@ -280,29 +325,29 @@ class ManagedBrowser: return browser_path async def _get_browser_args(self) -> List[str]: - """Returns browser-specific command line arguments""" - base_args = [await self._get_browser_path()] - + """Returns full CLI args for launching the browser""" + base = [await self._get_browser_path()] if self.browser_type == "chromium": - args = [ + flags = [ f"--remote-debugging-port={self.debugging_port}", f"--user-data-dir={self.user_data_dir}", ] if self.headless: - args.append("--headless=new") + flags.append("--headless=new") + # merge common launch 
flags + flags.extend(self.build_browser_flags(self.browser_config)) elif self.browser_type == "firefox": - args = [ + flags = [ "--remote-debugging-port", str(self.debugging_port), "--profile", self.user_data_dir, ] if self.headless: - args.append("--headless") + flags.append("--headless") else: raise NotImplementedError(f"Browser type {self.browser_type} not supported") - - return base_args + args + return base + flags async def cleanup(self): """Cleanup browser process and temporary directory""" @@ -789,6 +834,23 @@ class BrowserManager: # Update context settings with text mode settings context_settings.update(text_mode_settings) + # inject locale / tz / geo if user provided them + if crawlerRunConfig: + if crawlerRunConfig.locale: + context_settings["locale"] = crawlerRunConfig.locale + if crawlerRunConfig.timezone_id: + context_settings["timezone_id"] = crawlerRunConfig.timezone_id + if crawlerRunConfig.geolocation: + context_settings["geolocation"] = { + "latitude": crawlerRunConfig.geolocation.latitude, + "longitude": crawlerRunConfig.geolocation.longitude, + "accuracy": crawlerRunConfig.geolocation.accuracy, + } + # ensure geolocation permission + perms = context_settings.get("permissions", []) + perms.append("geolocation") + context_settings["permissions"] = perms + # Create and return the context with all settings context = await self.browser.new_context(**context_settings) @@ -821,6 +883,10 @@ class BrowserManager: "semaphore_count", "url" ] + + # Do NOT exclude locale, timezone_id, or geolocation as these DO affect browser context + # and should cause a new context to be created if they change + + for key in ephemeral_keys: + if key in config_dict: + del config_dict[key] diff --git a/docs/examples/use_geo_location.py b/docs/examples/use_geo_location.py new file mode 100644 index 00000000..2cfc866f --- /dev/null +++ b/docs/examples/use_geo_location.py @@ -0,0 +1,70 @@ +# use_geo_location.py +""" +Example: override locale, timezone, and geolocation using
Crawl4ai patterns. + +This demo uses `AsyncWebCrawler.arun()` to fetch a page with +browser context primed for specific locale, timezone, and GPS, +and saves a screenshot for visual verification. +""" + +import asyncio +import base64 +from pathlib import Path +from typing import List +from crawl4ai import ( + AsyncWebCrawler, + CrawlerRunConfig, + BrowserConfig, + GeolocationConfig, + CrawlResult, +) + +async def demo_geo_override(): + """Demo: Crawl a geolocation-test page with overrides and screenshot.""" + print("\n=== Geo-Override Crawl ===") + + # 1) Browser setup: use Playwright-managed contexts + browser_cfg = BrowserConfig( + headless=False, + viewport_width=1280, + viewport_height=720, + use_managed_browser=False, + ) + + # 2) Run config: include locale, timezone_id, geolocation, and screenshot + run_cfg = CrawlerRunConfig( + url="https://browserleaks.com/geo", # test page that shows your location + locale="en-US", # Accept-Language & UI locale + timezone_id="America/Los_Angeles", # JS Date()/Intl timezone + geolocation=GeolocationConfig( # override GPS coords + latitude=34.0522, + longitude=-118.2437, + accuracy=10.0, + ), + screenshot=True, # capture screenshot after load + session_id="geo_test", # reuse context if rerunning + delay_before_return_html=5 + ) + + async with AsyncWebCrawler(config=browser_cfg) as crawler: + # 3) Run crawl (returns list even for single URL) + results: List[CrawlResult] = await crawler.arun( + url=run_cfg.url, + config=run_cfg, + ) + result = results[0] + + # 4) Save screenshot and report path + if result.screenshot: + __current_dir = Path(__file__).parent + out_dir = __current_dir / "tmp" + out_dir.mkdir(exist_ok=True) + shot_path = out_dir / "geo_test.png" + with open(shot_path, "wb") as f: + f.write(base64.b64decode(result.screenshot)) + print(f"Saved screenshot to {shot_path}") + else: + print("No screenshot captured, check configuration.") + +if __name__ == "__main__": + asyncio.run(demo_geo_override()) diff --git 
a/docs/md_v2/advanced/identity-based-crawling.md b/docs/md_v2/advanced/identity-based-crawling.md index 403acb9a..3864f840 100644 --- a/docs/md_v2/advanced/identity-based-crawling.md +++ b/docs/md_v2/advanced/identity-based-crawling.md @@ -263,7 +263,102 @@ See the full example in `docs/examples/identity_based_browsing.py` for a complet --- -## 7. Summary +## 7. Locale, Timezone, and Geolocation Control + +In addition to using persistent profiles, Crawl4AI supports customizing your browser's locale, timezone, and geolocation settings. These features enhance your identity-based browsing experience by allowing you to control how websites perceive your location and regional settings. + +### Setting Locale and Timezone + +You can set the browser's locale and timezone through `CrawlerRunConfig`: + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com", + config=CrawlerRunConfig( + # Set browser locale (language and region formatting) + locale="fr-FR", # French (France) + + # Set browser timezone + timezone_id="Europe/Paris", + + # Other normal options... + magic=True, + page_timeout=60000 + ) + ) +``` + +**How it works:** +- `locale` affects language preferences, date formats, number formats, etc. 
+- `timezone_id` affects JavaScript's Date object and time-related functionality +- These settings are applied when creating the browser context and maintained throughout the session + +### Configuring Geolocation + +Control the GPS coordinates reported by the browser's geolocation API: + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, GeolocationConfig + +async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://maps.google.com", # Or any location-aware site + config=CrawlerRunConfig( + # Configure precise GPS coordinates + geolocation=GeolocationConfig( + latitude=48.8566, # Paris coordinates + longitude=2.3522, + accuracy=100 # Accuracy in meters (optional) + ), + + # This site will see you as being in Paris + page_timeout=60000 + ) + ) +``` + +**Important notes:** +- When `geolocation` is specified, the browser is automatically granted permission to access location +- Websites using the Geolocation API will receive the exact coordinates you specify +- This affects map services, store locators, delivery services, etc. 
+- Combined with the appropriate `locale` and `timezone_id`, you can create a fully consistent location profile + +### Combining with Managed Browsers + +These settings work perfectly with managed browsers for a complete identity solution: + +```python +from crawl4ai import ( + AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, + GeolocationConfig +) + +browser_config = BrowserConfig( + use_managed_browser=True, + user_data_dir="/path/to/my-profile", + browser_type="chromium" +) + +crawl_config = CrawlerRunConfig( + # Location settings + locale="es-MX", # Spanish (Mexico) + timezone_id="America/Mexico_City", + geolocation=GeolocationConfig( + latitude=19.4326, # Mexico City + longitude=-99.1332 + ) +) + +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url="https://example.com", config=crawl_config) +``` + +Combining persistent profiles with precise geolocation and region settings gives you complete control over your digital identity. + +## 8. Summary - **Create** your user-data directory either: - By launching Chrome/Chromium externally with `--user-data-dir=/some/path` @@ -271,6 +366,7 @@ See the full example in `docs/examples/identity_based_browsing.py` for a complet - Or through the interactive interface with `profiler.interactive_manager()` - **Log in** or configure sites as needed, then close the browser - **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True` +- **Customize** identity aspects with `locale`, `timezone_id`, and `geolocation` - **List and reuse** profiles with `BrowserProfiler.list_profiles()` - **Manage** your profiles with the dedicated `BrowserProfiler` class - Enjoy **persistent** sessions that reflect your real identity diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md index 1f7e5ee2..0dc846a7 100644 --- a/docs/md_v2/core/browser-crawler-config.md +++ b/docs/md_v2/core/browser-crawler-config.md @@ -137,6 
+137,11 @@ class CrawlerRunConfig: screenshot=False, pdf=False, capture_mhtml=False, + # Location and Identity Parameters + locale=None, # e.g. "en-US", "fr-FR" + timezone_id=None, # e.g. "America/New_York" + geolocation=None, # GeolocationConfig object + # Resource Management enable_rate_limiting=False, rate_limit_config=None, memory_threshold_percent=70.0,