feat(browser): add geolocation, locale and timezone support

Add support for controlling browser geolocation, locale and timezone settings:
- New GeolocationConfig class for managing GPS coordinates
- Add locale and timezone_id parameters to CrawlerRunConfig
- Update browser context creation to handle location settings
- Add example script for geolocation usage
- Update documentation with location-based identity features

This enables more precise control over browser identity and location reporting.
This commit is contained in:
UncleCode
2025-04-21 23:20:59 +08:00
parent 5297e362f3
commit b5c25731e6
6 changed files with 322 additions and 11 deletions

View File

@@ -2,7 +2,7 @@
import warnings
from .async_webcrawler import AsyncWebCrawler, CacheMode
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig
from .content_scraping_strategy import (
ContentScrapingStrategy,
@@ -71,6 +71,7 @@ __all__ = [
"AsyncWebCrawler",
"BrowserProfiler",
"LLMConfig",
"GeolocationConfig",
"DeepCrawlStrategy",
"BFSDeepCrawlStrategy",
"BestFirstCrawlingStrategy",

View File

@@ -159,6 +159,55 @@ def is_empty_value(value: Any) -> bool:
return True
return False
class GeolocationConfig:
def __init__(
self,
latitude: float,
longitude: float,
accuracy: Optional[float] = 0.0
):
"""Configuration class for geolocation settings.
Args:
latitude: Latitude coordinate (e.g., 37.7749)
longitude: Longitude coordinate (e.g., -122.4194)
accuracy: Accuracy in meters. Default: 0.0
"""
self.latitude = latitude
self.longitude = longitude
self.accuracy = accuracy
@staticmethod
def from_dict(geo_dict: Dict) -> "GeolocationConfig":
"""Create a GeolocationConfig from a dictionary."""
return GeolocationConfig(
latitude=geo_dict.get("latitude"),
longitude=geo_dict.get("longitude"),
accuracy=geo_dict.get("accuracy", 0.0)
)
def to_dict(self) -> Dict:
"""Convert to dictionary representation."""
return {
"latitude": self.latitude,
"longitude": self.longitude,
"accuracy": self.accuracy
}
def clone(self, **kwargs) -> "GeolocationConfig":
"""Create a copy of this configuration with updated values.
Args:
**kwargs: Key-value pairs of configuration options to update
Returns:
GeolocationConfig: A new instance with the specified updates
"""
config_dict = self.to_dict()
config_dict.update(kwargs)
return GeolocationConfig.from_dict(config_dict)
class ProxyConfig:
def __init__(
self,
@@ -680,6 +729,14 @@ class CrawlerRunConfig():
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
If None, no additional proxy config. Default: None.
# Browser Location and Identity Parameters
locale (str or None): Locale to use for the browser context (e.g., "en-US").
Default: None.
timezone_id (str or None): Timezone identifier to use for the browser context (e.g., "America/New_York").
Default: None.
geolocation (GeolocationConfig or None): Geolocation configuration for the browser.
Default: None.
# SSL Parameters
fetch_ssl_certificate: bool = False,
# Caching Parameters
@@ -829,6 +886,10 @@ class CrawlerRunConfig():
scraping_strategy: ContentScrapingStrategy = None,
proxy_config: Union[ProxyConfig, dict, None] = None,
proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
# Browser Location and Identity Parameters
locale: Optional[str] = None,
timezone_id: Optional[str] = None,
geolocation: Optional[GeolocationConfig] = None,
# SSL Parameters
fetch_ssl_certificate: bool = False,
# Caching Parameters
@@ -917,6 +978,11 @@ class CrawlerRunConfig():
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
self.proxy_config = proxy_config
self.proxy_rotation_strategy = proxy_rotation_strategy
# Browser Location and Identity Parameters
self.locale = locale
self.timezone_id = timezone_id
self.geolocation = geolocation
# SSL Parameters
self.fetch_ssl_certificate = fetch_ssl_certificate
@@ -1057,6 +1123,10 @@ class CrawlerRunConfig():
scraping_strategy=kwargs.get("scraping_strategy"),
proxy_config=kwargs.get("proxy_config"),
proxy_rotation_strategy=kwargs.get("proxy_rotation_strategy"),
# Browser Location and Identity Parameters
locale=kwargs.get("locale", None),
timezone_id=kwargs.get("timezone_id", None),
geolocation=kwargs.get("geolocation", None),
# SSL Parameters
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
# Caching Parameters
@@ -1166,6 +1236,9 @@ class CrawlerRunConfig():
"scraping_strategy": self.scraping_strategy,
"proxy_config": self.proxy_config,
"proxy_rotation_strategy": self.proxy_rotation_strategy,
"locale": self.locale,
"timezone_id": self.timezone_id,
"geolocation": self.geolocation,
"fetch_ssl_certificate": self.fetch_ssl_certificate,
"cache_mode": self.cache_mode,
"session_id": self.session_id,

View File

@@ -76,6 +76,51 @@ class ManagedBrowser:
_cleanup(): Terminates the browser process and removes the temporary directory.
create_profile(): Static method to create a user profile by launching a browser for user interaction.
"""
@staticmethod
def build_browser_flags(config: BrowserConfig) -> List[str]:
"""Common CLI flags for launching Chromium"""
flags = [
"--disable-gpu",
"--disable-gpu-compositing",
"--disable-software-rasterizer",
"--no-sandbox",
"--disable-dev-shm-usage",
"--no-first-run",
"--no-default-browser-check",
"--disable-infobars",
"--window-position=0,0",
"--ignore-certificate-errors",
"--ignore-certificate-errors-spki-list",
"--disable-blink-features=AutomationControlled",
"--window-position=400,0",
"--disable-renderer-backgrounding",
"--disable-ipc-flooding-protection",
"--force-color-profile=srgb",
"--mute-audio",
"--disable-background-timer-throttling",
]
if config.light_mode:
flags.extend(BROWSER_DISABLE_OPTIONS)
if config.text_mode:
flags.extend([
"--blink-settings=imagesEnabled=false",
"--disable-remote-fonts",
"--disable-images",
"--disable-javascript",
"--disable-software-rasterizer",
"--disable-dev-shm-usage",
])
# proxy support
if config.proxy:
flags.append(f"--proxy-server={config.proxy}")
elif config.proxy_config:
creds = ""
if config.proxy_config.username and config.proxy_config.password:
creds = f"{config.proxy_config.username}:{config.proxy_config.password}@"
flags.append(f"--proxy-server={creds}{config.proxy_config.server}")
# dedupe
return list(dict.fromkeys(flags))
browser_type: str
user_data_dir: str
@@ -280,29 +325,29 @@ class ManagedBrowser:
return browser_path
async def _get_browser_args(self) -> List[str]:
"""Returns browser-specific command line arguments"""
base_args = [await self._get_browser_path()]
"""Returns full CLI args for launching the browser"""
base = [await self._get_browser_path()]
if self.browser_type == "chromium":
args = [
flags = [
f"--remote-debugging-port={self.debugging_port}",
f"--user-data-dir={self.user_data_dir}",
]
if self.headless:
args.append("--headless=new")
flags.append("--headless=new")
# merge common launch flags
flags.extend(self.build_browser_flags(self.browser_config))
elif self.browser_type == "firefox":
args = [
flags = [
"--remote-debugging-port",
str(self.debugging_port),
"--profile",
self.user_data_dir,
]
if self.headless:
args.append("--headless")
flags.append("--headless")
else:
raise NotImplementedError(f"Browser type {self.browser_type} not supported")
return base_args + args
return base + flags
async def cleanup(self):
"""Cleanup browser process and temporary directory"""
@@ -789,6 +834,23 @@ class BrowserManager:
# Update context settings with text mode settings
context_settings.update(text_mode_settings)
# inject locale / tz / geo if user provided them
if crawlerRunConfig:
if crawlerRunConfig.locale:
context_settings["locale"] = crawlerRunConfig.locale
if crawlerRunConfig.timezone_id:
context_settings["timezone_id"] = crawlerRunConfig.timezone_id
if crawlerRunConfig.geolocation:
context_settings["geolocation"] = {
"latitude": crawlerRunConfig.geolocation.latitude,
"longitude": crawlerRunConfig.geolocation.longitude,
"accuracy": crawlerRunConfig.geolocation.accuracy,
}
# ensure geolocation permission
perms = context_settings.get("permissions", [])
perms.append("geolocation")
context_settings["permissions"] = perms
# Create and return the context with all settings
context = await self.browser.new_context(**context_settings)
@@ -821,6 +883,10 @@ class BrowserManager:
"semaphore_count",
"url"
]
# Do NOT exclude locale, timezone_id, or geolocation as these DO affect browser context
# and should cause a new context to be created if they change
for key in ephemeral_keys:
if key in config_dict:
del config_dict[key]

View File

@@ -0,0 +1,70 @@
# use_geo_location.py
"""
Example: override locale, timezone, and geolocation using Crawl4ai patterns.
This demo uses `AsyncWebCrawler.arun()` to fetch a page with
browser context primed for specific locale, timezone, and GPS,
and saves a screenshot for visual verification.
"""
import asyncio
import base64
from pathlib import Path
from typing import List
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
BrowserConfig,
GeolocationConfig,
CrawlResult,
)
async def demo_geo_override():
"""Demo: Crawl a geolocation-test page with overrides and screenshot."""
print("\n=== Geo-Override Crawl ===")
# 1) Browser setup: use Playwright-managed contexts
browser_cfg = BrowserConfig(
headless=False,
viewport_width=1280,
viewport_height=720,
use_managed_browser=False,
)
# 2) Run config: include locale, timezone_id, geolocation, and screenshot
run_cfg = CrawlerRunConfig(
url="https://browserleaks.com/geo", # test page that shows your location
locale="en-US", # Accept-Language & UI locale
timezone_id="America/Los_Angeles", # JS Date()/Intl timezone
geolocation=GeolocationConfig( # override GPS coords
latitude=34.0522,
longitude=-118.2437,
accuracy=10.0,
),
screenshot=True, # capture screenshot after load
session_id="geo_test", # reuse context if rerunning
delay_before_return_html=5
)
async with AsyncWebCrawler(config=browser_cfg) as crawler:
# 3) Run crawl (returns list even for single URL)
results: List[CrawlResult] = await crawler.arun(
url=run_cfg.url,
config=run_cfg,
)
result = results[0]
# 4) Save screenshot and report path
if result.screenshot:
__current_dir = Path(__file__).parent
out_dir = __current_dir / "tmp"
out_dir.mkdir(exist_ok=True)
shot_path = out_dir / "geo_test.png"
with open(shot_path, "wb") as f:
f.write(base64.b64decode(result.screenshot))
print(f"Saved screenshot to {shot_path}")
else:
print("No screenshot captured, check configuration.")
if __name__ == "__main__":
asyncio.run(demo_geo_override())

View File

@@ -263,7 +263,102 @@ See the full example in `docs/examples/identity_based_browsing.py` for a complet
---
## 7. Summary
## 7. Locale, Timezone, and Geolocation Control
In addition to using persistent profiles, Crawl4AI supports customizing your browser's locale, timezone, and geolocation settings. These features enhance your identity-based browsing experience by allowing you to control how websites perceive your location and regional settings.
### Setting Locale and Timezone
You can set the browser's locale and timezone through `CrawlerRunConfig`:
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com",
config=CrawlerRunConfig(
# Set browser locale (language and region formatting)
locale="fr-FR", # French (France)
# Set browser timezone
timezone_id="Europe/Paris",
# Other normal options...
magic=True,
page_timeout=60000
)
)
```
**How it works:**
- `locale` affects language preferences, date formats, number formats, etc.
- `timezone_id` affects JavaScript's Date object and time-related functionality
- These settings are applied when creating the browser context and maintained throughout the session
### Configuring Geolocation
Control the GPS coordinates reported by the browser's geolocation API:
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, GeolocationConfig
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://maps.google.com", # Or any location-aware site
config=CrawlerRunConfig(
# Configure precise GPS coordinates
geolocation=GeolocationConfig(
latitude=48.8566, # Paris coordinates
longitude=2.3522,
accuracy=100 # Accuracy in meters (optional)
),
# This site will see you as being in Paris
page_timeout=60000
)
)
```
**Important notes:**
- When `geolocation` is specified, the browser is automatically granted permission to access location
- Websites using the Geolocation API will receive the exact coordinates you specify
- This affects map services, store locators, delivery services, etc.
- Combined with the appropriate `locale` and `timezone_id`, you can create a fully consistent location profile
### Combining with Managed Browsers
These settings work perfectly with managed browsers for a complete identity solution:
```python
from crawl4ai import (
AsyncWebCrawler, BrowserConfig, CrawlerRunConfig,
GeolocationConfig
)
browser_config = BrowserConfig(
use_managed_browser=True,
user_data_dir="/path/to/my-profile",
browser_type="chromium"
)
crawl_config = CrawlerRunConfig(
# Location settings
locale="es-MX", # Spanish (Mexico)
timezone_id="America/Mexico_City",
geolocation=GeolocationConfig(
latitude=19.4326, # Mexico City
longitude=-99.1332
)
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com", config=crawl_config)
```
Combining persistent profiles with precise geolocation and region settings gives you complete control over your digital identity.
## 8. Summary
- **Create** your user-data directory either:
- By launching Chrome/Chromium externally with `--user-data-dir=/some/path`
@@ -271,6 +366,7 @@ See the full example in `docs/examples/identity_based_browsing.py` for a complet
- Or through the interactive interface with `profiler.interactive_manager()`
- **Log in** or configure sites as needed, then close the browser
- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`
- **Customize** identity aspects with `locale`, `timezone_id`, and `geolocation`
- **List and reuse** profiles with `BrowserProfiler.list_profiles()`
- **Manage** your profiles with the dedicated `BrowserProfiler` class
- Enjoy **persistent** sessions that reflect your real identity

View File

@@ -137,6 +137,11 @@ class CrawlerRunConfig:
screenshot=False,
pdf=False,
capture_mhtml=False,
# Location and Identity Parameters
locale=None, # e.g. "en-US", "fr-FR"
timezone_id=None, # e.g. "America/New_York"
geolocation=None, # GeolocationConfig object
# Resource Management
enable_rate_limiting=False,
rate_limit_config=None,
memory_threshold_percent=70.0,