feat(browser): add geolocation, locale and timezone support
Add support for controlling browser geolocation, locale and timezone settings: - New GeolocationConfig class for managing GPS coordinates - Add locale and timezone_id parameters to CrawlerRunConfig - Update browser context creation to handle location settings - Add example script for geolocation usage - Update documentation with location-based identity features This enables more precise control over browser identity and location reporting.
This commit is contained in:
@@ -2,7 +2,7 @@
|
||||
import warnings
|
||||
|
||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig
|
||||
|
||||
from .content_scraping_strategy import (
|
||||
ContentScrapingStrategy,
|
||||
@@ -71,6 +71,7 @@ __all__ = [
|
||||
"AsyncWebCrawler",
|
||||
"BrowserProfiler",
|
||||
"LLMConfig",
|
||||
"GeolocationConfig",
|
||||
"DeepCrawlStrategy",
|
||||
"BFSDeepCrawlStrategy",
|
||||
"BestFirstCrawlingStrategy",
|
||||
|
||||
@@ -159,6 +159,55 @@ def is_empty_value(value: Any) -> bool:
|
||||
return True
|
||||
return False
|
||||
|
||||
class GeolocationConfig:
|
||||
def __init__(
|
||||
self,
|
||||
latitude: float,
|
||||
longitude: float,
|
||||
accuracy: Optional[float] = 0.0
|
||||
):
|
||||
"""Configuration class for geolocation settings.
|
||||
|
||||
Args:
|
||||
latitude: Latitude coordinate (e.g., 37.7749)
|
||||
longitude: Longitude coordinate (e.g., -122.4194)
|
||||
accuracy: Accuracy in meters. Default: 0.0
|
||||
"""
|
||||
self.latitude = latitude
|
||||
self.longitude = longitude
|
||||
self.accuracy = accuracy
|
||||
|
||||
@staticmethod
|
||||
def from_dict(geo_dict: Dict) -> "GeolocationConfig":
|
||||
"""Create a GeolocationConfig from a dictionary."""
|
||||
return GeolocationConfig(
|
||||
latitude=geo_dict.get("latitude"),
|
||||
longitude=geo_dict.get("longitude"),
|
||||
accuracy=geo_dict.get("accuracy", 0.0)
|
||||
)
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
"""Convert to dictionary representation."""
|
||||
return {
|
||||
"latitude": self.latitude,
|
||||
"longitude": self.longitude,
|
||||
"accuracy": self.accuracy
|
||||
}
|
||||
|
||||
def clone(self, **kwargs) -> "GeolocationConfig":
|
||||
"""Create a copy of this configuration with updated values.
|
||||
|
||||
Args:
|
||||
**kwargs: Key-value pairs of configuration options to update
|
||||
|
||||
Returns:
|
||||
GeolocationConfig: A new instance with the specified updates
|
||||
"""
|
||||
config_dict = self.to_dict()
|
||||
config_dict.update(kwargs)
|
||||
return GeolocationConfig.from_dict(config_dict)
|
||||
|
||||
|
||||
class ProxyConfig:
|
||||
def __init__(
|
||||
self,
|
||||
@@ -680,6 +729,14 @@ class CrawlerRunConfig():
|
||||
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||
If None, no additional proxy config. Default: None.
|
||||
|
||||
# Browser Location and Identity Parameters
|
||||
locale (str or None): Locale to use for the browser context (e.g., "en-US").
|
||||
Default: None.
|
||||
timezone_id (str or None): Timezone identifier to use for the browser context (e.g., "America/New_York").
|
||||
Default: None.
|
||||
geolocation (GeolocationConfig or None): Geolocation configuration for the browser.
|
||||
Default: None.
|
||||
|
||||
# SSL Parameters
|
||||
fetch_ssl_certificate: bool = False,
|
||||
# Caching Parameters
|
||||
@@ -829,6 +886,10 @@ class CrawlerRunConfig():
|
||||
scraping_strategy: ContentScrapingStrategy = None,
|
||||
proxy_config: Union[ProxyConfig, dict, None] = None,
|
||||
proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
|
||||
# Browser Location and Identity Parameters
|
||||
locale: Optional[str] = None,
|
||||
timezone_id: Optional[str] = None,
|
||||
geolocation: Optional[GeolocationConfig] = None,
|
||||
# SSL Parameters
|
||||
fetch_ssl_certificate: bool = False,
|
||||
# Caching Parameters
|
||||
@@ -917,6 +978,11 @@ class CrawlerRunConfig():
|
||||
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
|
||||
self.proxy_config = proxy_config
|
||||
self.proxy_rotation_strategy = proxy_rotation_strategy
|
||||
|
||||
# Browser Location and Identity Parameters
|
||||
self.locale = locale
|
||||
self.timezone_id = timezone_id
|
||||
self.geolocation = geolocation
|
||||
|
||||
# SSL Parameters
|
||||
self.fetch_ssl_certificate = fetch_ssl_certificate
|
||||
@@ -1057,6 +1123,10 @@ class CrawlerRunConfig():
|
||||
scraping_strategy=kwargs.get("scraping_strategy"),
|
||||
proxy_config=kwargs.get("proxy_config"),
|
||||
proxy_rotation_strategy=kwargs.get("proxy_rotation_strategy"),
|
||||
# Browser Location and Identity Parameters
|
||||
locale=kwargs.get("locale", None),
|
||||
timezone_id=kwargs.get("timezone_id", None),
|
||||
geolocation=kwargs.get("geolocation", None),
|
||||
# SSL Parameters
|
||||
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
|
||||
# Caching Parameters
|
||||
@@ -1166,6 +1236,9 @@ class CrawlerRunConfig():
|
||||
"scraping_strategy": self.scraping_strategy,
|
||||
"proxy_config": self.proxy_config,
|
||||
"proxy_rotation_strategy": self.proxy_rotation_strategy,
|
||||
"locale": self.locale,
|
||||
"timezone_id": self.timezone_id,
|
||||
"geolocation": self.geolocation,
|
||||
"fetch_ssl_certificate": self.fetch_ssl_certificate,
|
||||
"cache_mode": self.cache_mode,
|
||||
"session_id": self.session_id,
|
||||
|
||||
@@ -76,6 +76,51 @@ class ManagedBrowser:
|
||||
_cleanup(): Terminates the browser process and removes the temporary directory.
|
||||
create_profile(): Static method to create a user profile by launching a browser for user interaction.
|
||||
"""
|
||||
|
||||
@staticmethod
|
||||
def build_browser_flags(config: BrowserConfig) -> List[str]:
|
||||
"""Common CLI flags for launching Chromium"""
|
||||
flags = [
|
||||
"--disable-gpu",
|
||||
"--disable-gpu-compositing",
|
||||
"--disable-software-rasterizer",
|
||||
"--no-sandbox",
|
||||
"--disable-dev-shm-usage",
|
||||
"--no-first-run",
|
||||
"--no-default-browser-check",
|
||||
"--disable-infobars",
|
||||
"--window-position=0,0",
|
||||
"--ignore-certificate-errors",
|
||||
"--ignore-certificate-errors-spki-list",
|
||||
"--disable-blink-features=AutomationControlled",
|
||||
"--window-position=400,0",
|
||||
"--disable-renderer-backgrounding",
|
||||
"--disable-ipc-flooding-protection",
|
||||
"--force-color-profile=srgb",
|
||||
"--mute-audio",
|
||||
"--disable-background-timer-throttling",
|
||||
]
|
||||
if config.light_mode:
|
||||
flags.extend(BROWSER_DISABLE_OPTIONS)
|
||||
if config.text_mode:
|
||||
flags.extend([
|
||||
"--blink-settings=imagesEnabled=false",
|
||||
"--disable-remote-fonts",
|
||||
"--disable-images",
|
||||
"--disable-javascript",
|
||||
"--disable-software-rasterizer",
|
||||
"--disable-dev-shm-usage",
|
||||
])
|
||||
# proxy support
|
||||
if config.proxy:
|
||||
flags.append(f"--proxy-server={config.proxy}")
|
||||
elif config.proxy_config:
|
||||
creds = ""
|
||||
if config.proxy_config.username and config.proxy_config.password:
|
||||
creds = f"{config.proxy_config.username}:{config.proxy_config.password}@"
|
||||
flags.append(f"--proxy-server={creds}{config.proxy_config.server}")
|
||||
# dedupe
|
||||
return list(dict.fromkeys(flags))
|
||||
|
||||
browser_type: str
|
||||
user_data_dir: str
|
||||
@@ -280,29 +325,29 @@ class ManagedBrowser:
|
||||
return browser_path
|
||||
|
||||
async def _get_browser_args(self) -> List[str]:
|
||||
"""Returns browser-specific command line arguments"""
|
||||
base_args = [await self._get_browser_path()]
|
||||
|
||||
"""Returns full CLI args for launching the browser"""
|
||||
base = [await self._get_browser_path()]
|
||||
if self.browser_type == "chromium":
|
||||
args = [
|
||||
flags = [
|
||||
f"--remote-debugging-port={self.debugging_port}",
|
||||
f"--user-data-dir={self.user_data_dir}",
|
||||
]
|
||||
if self.headless:
|
||||
args.append("--headless=new")
|
||||
flags.append("--headless=new")
|
||||
# merge common launch flags
|
||||
flags.extend(self.build_browser_flags(self.browser_config))
|
||||
elif self.browser_type == "firefox":
|
||||
args = [
|
||||
flags = [
|
||||
"--remote-debugging-port",
|
||||
str(self.debugging_port),
|
||||
"--profile",
|
||||
self.user_data_dir,
|
||||
]
|
||||
if self.headless:
|
||||
args.append("--headless")
|
||||
flags.append("--headless")
|
||||
else:
|
||||
raise NotImplementedError(f"Browser type {self.browser_type} not supported")
|
||||
|
||||
return base_args + args
|
||||
return base + flags
|
||||
|
||||
async def cleanup(self):
|
||||
"""Cleanup browser process and temporary directory"""
|
||||
@@ -789,6 +834,23 @@ class BrowserManager:
|
||||
# Update context settings with text mode settings
|
||||
context_settings.update(text_mode_settings)
|
||||
|
||||
# inject locale / tz / geo if user provided them
|
||||
if crawlerRunConfig:
|
||||
if crawlerRunConfig.locale:
|
||||
context_settings["locale"] = crawlerRunConfig.locale
|
||||
if crawlerRunConfig.timezone_id:
|
||||
context_settings["timezone_id"] = crawlerRunConfig.timezone_id
|
||||
if crawlerRunConfig.geolocation:
|
||||
context_settings["geolocation"] = {
|
||||
"latitude": crawlerRunConfig.geolocation.latitude,
|
||||
"longitude": crawlerRunConfig.geolocation.longitude,
|
||||
"accuracy": crawlerRunConfig.geolocation.accuracy,
|
||||
}
|
||||
# ensure geolocation permission
|
||||
perms = context_settings.get("permissions", [])
|
||||
perms.append("geolocation")
|
||||
context_settings["permissions"] = perms
|
||||
|
||||
# Create and return the context with all settings
|
||||
context = await self.browser.new_context(**context_settings)
|
||||
|
||||
@@ -821,6 +883,10 @@ class BrowserManager:
|
||||
"semaphore_count",
|
||||
"url"
|
||||
]
|
||||
|
||||
# Do NOT exclude locale, timezone_id, or geolocation as these DO affect browser context
|
||||
# and should cause a new context to be created if they change
|
||||
|
||||
for key in ephemeral_keys:
|
||||
if key in config_dict:
|
||||
del config_dict[key]
|
||||
|
||||
70
docs/examples/use_geo_location.py
Normal file
70
docs/examples/use_geo_location.py
Normal file
@@ -0,0 +1,70 @@
|
||||
# use_geo_location.py
|
||||
"""
|
||||
Example: override locale, timezone, and geolocation using Crawl4ai patterns.
|
||||
|
||||
This demo uses `AsyncWebCrawler.arun()` to fetch a page with
|
||||
browser context primed for specific locale, timezone, and GPS,
|
||||
and saves a screenshot for visual verification.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
BrowserConfig,
|
||||
GeolocationConfig,
|
||||
CrawlResult,
|
||||
)
|
||||
|
||||
async def demo_geo_override():
|
||||
"""Demo: Crawl a geolocation-test page with overrides and screenshot."""
|
||||
print("\n=== Geo-Override Crawl ===")
|
||||
|
||||
# 1) Browser setup: use Playwright-managed contexts
|
||||
browser_cfg = BrowserConfig(
|
||||
headless=False,
|
||||
viewport_width=1280,
|
||||
viewport_height=720,
|
||||
use_managed_browser=False,
|
||||
)
|
||||
|
||||
# 2) Run config: include locale, timezone_id, geolocation, and screenshot
|
||||
run_cfg = CrawlerRunConfig(
|
||||
url="https://browserleaks.com/geo", # test page that shows your location
|
||||
locale="en-US", # Accept-Language & UI locale
|
||||
timezone_id="America/Los_Angeles", # JS Date()/Intl timezone
|
||||
geolocation=GeolocationConfig( # override GPS coords
|
||||
latitude=34.0522,
|
||||
longitude=-118.2437,
|
||||
accuracy=10.0,
|
||||
),
|
||||
screenshot=True, # capture screenshot after load
|
||||
session_id="geo_test", # reuse context if rerunning
|
||||
delay_before_return_html=5
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||
# 3) Run crawl (returns list even for single URL)
|
||||
results: List[CrawlResult] = await crawler.arun(
|
||||
url=run_cfg.url,
|
||||
config=run_cfg,
|
||||
)
|
||||
result = results[0]
|
||||
|
||||
# 4) Save screenshot and report path
|
||||
if result.screenshot:
|
||||
__current_dir = Path(__file__).parent
|
||||
out_dir = __current_dir / "tmp"
|
||||
out_dir.mkdir(exist_ok=True)
|
||||
shot_path = out_dir / "geo_test.png"
|
||||
with open(shot_path, "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print(f"Saved screenshot to {shot_path}")
|
||||
else:
|
||||
print("No screenshot captured, check configuration.")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(demo_geo_override())
|
||||
@@ -263,7 +263,102 @@ See the full example in `docs/examples/identity_based_browsing.py` for a complet
|
||||
|
||||
---
|
||||
|
||||
## 7. Summary
|
||||
## 7. Locale, Timezone, and Geolocation Control
|
||||
|
||||
In addition to using persistent profiles, Crawl4AI supports customizing your browser's locale, timezone, and geolocation settings. These features enhance your identity-based browsing experience by allowing you to control how websites perceive your location and regional settings.
|
||||
|
||||
### Setting Locale and Timezone
|
||||
|
||||
You can set the browser's locale and timezone through `CrawlerRunConfig`:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=CrawlerRunConfig(
|
||||
# Set browser locale (language and region formatting)
|
||||
locale="fr-FR", # French (France)
|
||||
|
||||
# Set browser timezone
|
||||
timezone_id="Europe/Paris",
|
||||
|
||||
# Other normal options...
|
||||
magic=True,
|
||||
page_timeout=60000
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
**How it works:**
|
||||
- `locale` affects language preferences, date formats, number formats, etc.
|
||||
- `timezone_id` affects JavaScript's Date object and time-related functionality
|
||||
- These settings are applied when creating the browser context and maintained throughout the session
|
||||
|
||||
### Configuring Geolocation
|
||||
|
||||
Control the GPS coordinates reported by the browser's geolocation API:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, GeolocationConfig
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://maps.google.com", # Or any location-aware site
|
||||
config=CrawlerRunConfig(
|
||||
# Configure precise GPS coordinates
|
||||
geolocation=GeolocationConfig(
|
||||
latitude=48.8566, # Paris coordinates
|
||||
longitude=2.3522,
|
||||
accuracy=100 # Accuracy in meters (optional)
|
||||
),
|
||||
|
||||
# This site will see you as being in Paris
|
||||
page_timeout=60000
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
**Important notes:**
|
||||
- When `geolocation` is specified, the browser is automatically granted permission to access location
|
||||
- Websites using the Geolocation API will receive the exact coordinates you specify
|
||||
- This affects map services, store locators, delivery services, etc.
|
||||
- Combined with the appropriate `locale` and `timezone_id`, you can create a fully consistent location profile
|
||||
|
||||
### Combining with Managed Browsers
|
||||
|
||||
These settings work perfectly with managed browsers for a complete identity solution:
|
||||
|
||||
```python
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler, BrowserConfig, CrawlerRunConfig,
|
||||
GeolocationConfig
|
||||
)
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
use_managed_browser=True,
|
||||
user_data_dir="/path/to/my-profile",
|
||||
browser_type="chromium"
|
||||
)
|
||||
|
||||
crawl_config = CrawlerRunConfig(
|
||||
# Location settings
|
||||
locale="es-MX", # Spanish (Mexico)
|
||||
timezone_id="America/Mexico_City",
|
||||
geolocation=GeolocationConfig(
|
||||
latitude=19.4326, # Mexico City
|
||||
longitude=-99.1332
|
||||
)
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com", config=crawl_config)
|
||||
```
|
||||
|
||||
Combining persistent profiles with precise geolocation and region settings gives you complete control over your digital identity.
|
||||
|
||||
## 8. Summary
|
||||
|
||||
- **Create** your user-data directory either:
|
||||
- By launching Chrome/Chromium externally with `--user-data-dir=/some/path`
|
||||
@@ -271,6 +366,7 @@ See the full example in `docs/examples/identity_based_browsing.py` for a complet
|
||||
- Or through the interactive interface with `profiler.interactive_manager()`
|
||||
- **Log in** or configure sites as needed, then close the browser
|
||||
- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`
|
||||
- **Customize** identity aspects with `locale`, `timezone_id`, and `geolocation`
|
||||
- **List and reuse** profiles with `BrowserProfiler.list_profiles()`
|
||||
- **Manage** your profiles with the dedicated `BrowserProfiler` class
|
||||
- Enjoy **persistent** sessions that reflect your real identity
|
||||
|
||||
@@ -137,6 +137,11 @@ class CrawlerRunConfig:
|
||||
screenshot=False,
|
||||
pdf=False,
|
||||
capture_mhtml=False,
|
||||
# Location and Identity Parameters
|
||||
locale=None, # e.g. "en-US", "fr-FR"
|
||||
timezone_id=None, # e.g. "America/New_York"
|
||||
geolocation=None, # GeolocationConfig object
|
||||
# Resource Management
|
||||
enable_rate_limiting=False,
|
||||
rate_limit_config=None,
|
||||
memory_threshold_percent=70.0,
|
||||
|
||||
Reference in New Issue
Block a user