feat(browser): add geolocation, locale and timezone support
Add support for controlling browser geolocation, locale and timezone settings:

- New GeolocationConfig class for managing GPS coordinates
- Add locale and timezone_id parameters to CrawlerRunConfig
- Update browser context creation to handle location settings
- Add example script for geolocation usage
- Update documentation with location-based identity features

This enables more precise control over browser identity and location reporting.
This commit is contained in:
@@ -2,7 +2,7 @@
|
|||||||
import warnings
|
import warnings
|
||||||
|
|
||||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig
|
||||||
|
|
||||||
from .content_scraping_strategy import (
|
from .content_scraping_strategy import (
|
||||||
ContentScrapingStrategy,
|
ContentScrapingStrategy,
|
||||||
@@ -71,6 +71,7 @@ __all__ = [
|
|||||||
"AsyncWebCrawler",
|
"AsyncWebCrawler",
|
||||||
"BrowserProfiler",
|
"BrowserProfiler",
|
||||||
"LLMConfig",
|
"LLMConfig",
|
||||||
|
"GeolocationConfig",
|
||||||
"DeepCrawlStrategy",
|
"DeepCrawlStrategy",
|
||||||
"BFSDeepCrawlStrategy",
|
"BFSDeepCrawlStrategy",
|
||||||
"BestFirstCrawlingStrategy",
|
"BestFirstCrawlingStrategy",
|
||||||
|
|||||||
@@ -159,6 +159,55 @@ def is_empty_value(value: Any) -> bool:
|
|||||||
return True
|
return True
|
||||||
return False
|
return False
|
||||||
|
|
||||||
|
class GeolocationConfig:
    """Geolocation override (latitude, longitude, accuracy) for a browser context.

    Values are validated on construction but stored exactly as given (no
    numeric coercion), so ``to_dict()`` returns the caller's own values.
    """

    def __init__(
        self,
        latitude: float,
        longitude: float,
        accuracy: Optional[float] = 0.0
    ):
        """Configuration class for geolocation settings.

        Args:
            latitude: Latitude coordinate in degrees, between -90 and 90 (e.g., 37.7749)
            longitude: Longitude coordinate in degrees, between -180 and 180 (e.g., -122.4194)
            accuracy: Non-negative accuracy in meters, or None. Default: 0.0

        Raises:
            TypeError: If a coordinate (or a non-None accuracy) is not a number.
            ValueError: If a value is outside its allowed range (NaN included).
        """
        self.latitude = self._check_range("latitude", latitude, -90.0, 90.0)
        self.longitude = self._check_range("longitude", longitude, -180.0, 180.0)
        # None is accepted because the parameter is declared Optional; only
        # concrete values are range-checked.
        if accuracy is not None:
            self._check_range("accuracy", accuracy, 0.0, float("inf"))
        self.accuracy = accuracy

    @staticmethod
    def _check_range(name: str, value: Any, low: float, high: float) -> Any:
        """Return ``value`` unchanged if ``low <= value <= high``, else raise."""
        try:
            # A NaN compares False on both sides, so it is rejected below.
            in_range = low <= value <= high
        except TypeError:
            raise TypeError(
                f"{name} must be a number, got {type(value).__name__}"
            ) from None
        if not in_range:
            raise ValueError(
                f"{name} must be between {low} and {high}, got {value!r}"
            )
        return value

    @classmethod
    def from_dict(cls, geo_dict: Dict) -> "GeolocationConfig":
        """Create a GeolocationConfig from a dictionary.

        Args:
            geo_dict: Mapping with required "latitude" and "longitude" keys and
                an optional "accuracy" key (default 0.0). Other keys are ignored.

        Raises:
            ValueError: If "latitude" or "longitude" is absent or None.
        """
        # Fail here with a clear message instead of silently building a
        # config whose coordinates are None.
        missing = [
            key for key in ("latitude", "longitude") if geo_dict.get(key) is None
        ]
        if missing:
            raise ValueError(
                f"geolocation dict is missing required key(s): {', '.join(missing)}"
            )
        return cls(
            latitude=geo_dict["latitude"],
            longitude=geo_dict["longitude"],
            accuracy=geo_dict.get("accuracy", 0.0),
        )

    def to_dict(self) -> Dict:
        """Convert to dictionary representation."""
        return {
            "latitude": self.latitude,
            "longitude": self.longitude,
            "accuracy": self.accuracy,
        }

    def clone(self, **kwargs) -> "GeolocationConfig":
        """Create a copy of this configuration with updated values.

        Args:
            **kwargs: Key-value pairs of configuration options to update.
                Keys other than latitude/longitude/accuracy are ignored.

        Returns:
            GeolocationConfig: A new instance (of the same class as ``self``)
            with the specified updates
        """
        config_dict = self.to_dict()
        config_dict.update(kwargs)
        # type(self) rather than the base class so subclasses round-trip.
        return type(self).from_dict(config_dict)

    def __repr__(self) -> str:
        return (
            f"{type(self).__name__}(latitude={self.latitude!r}, "
            f"longitude={self.longitude!r}, accuracy={self.accuracy!r})"
        )
|
||||||
|
|
||||||
|
|
||||||
class ProxyConfig:
|
class ProxyConfig:
|
||||||
def __init__(
|
def __init__(
|
||||||
self,
|
self,
|
||||||
@@ -680,6 +729,14 @@ class CrawlerRunConfig():
|
|||||||
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||||
If None, no additional proxy config. Default: None.
|
If None, no additional proxy config. Default: None.
|
||||||
|
|
||||||
|
# Browser Location and Identity Parameters
|
||||||
|
locale (str or None): Locale to use for the browser context (e.g., "en-US").
|
||||||
|
Default: None.
|
||||||
|
timezone_id (str or None): Timezone identifier to use for the browser context (e.g., "America/New_York").
|
||||||
|
Default: None.
|
||||||
|
geolocation (GeolocationConfig or None): Geolocation configuration for the browser.
|
||||||
|
Default: None.
|
||||||
|
|
||||||
# SSL Parameters
|
# SSL Parameters
|
||||||
fetch_ssl_certificate: bool = False,
|
fetch_ssl_certificate: bool = False,
|
||||||
# Caching Parameters
|
# Caching Parameters
|
||||||
@@ -829,6 +886,10 @@ class CrawlerRunConfig():
|
|||||||
scraping_strategy: ContentScrapingStrategy = None,
|
scraping_strategy: ContentScrapingStrategy = None,
|
||||||
proxy_config: Union[ProxyConfig, dict, None] = None,
|
proxy_config: Union[ProxyConfig, dict, None] = None,
|
||||||
proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
|
proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
|
||||||
|
# Browser Location and Identity Parameters
|
||||||
|
locale: Optional[str] = None,
|
||||||
|
timezone_id: Optional[str] = None,
|
||||||
|
geolocation: Optional[GeolocationConfig] = None,
|
||||||
# SSL Parameters
|
# SSL Parameters
|
||||||
fetch_ssl_certificate: bool = False,
|
fetch_ssl_certificate: bool = False,
|
||||||
# Caching Parameters
|
# Caching Parameters
|
||||||
@@ -917,6 +978,11 @@ class CrawlerRunConfig():
|
|||||||
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
|
self.scraping_strategy = scraping_strategy or WebScrapingStrategy()
|
||||||
self.proxy_config = proxy_config
|
self.proxy_config = proxy_config
|
||||||
self.proxy_rotation_strategy = proxy_rotation_strategy
|
self.proxy_rotation_strategy = proxy_rotation_strategy
|
||||||
|
|
||||||
|
# Browser Location and Identity Parameters
|
||||||
|
self.locale = locale
|
||||||
|
self.timezone_id = timezone_id
|
||||||
|
self.geolocation = geolocation
|
||||||
|
|
||||||
# SSL Parameters
|
# SSL Parameters
|
||||||
self.fetch_ssl_certificate = fetch_ssl_certificate
|
self.fetch_ssl_certificate = fetch_ssl_certificate
|
||||||
@@ -1057,6 +1123,10 @@ class CrawlerRunConfig():
|
|||||||
scraping_strategy=kwargs.get("scraping_strategy"),
|
scraping_strategy=kwargs.get("scraping_strategy"),
|
||||||
proxy_config=kwargs.get("proxy_config"),
|
proxy_config=kwargs.get("proxy_config"),
|
||||||
proxy_rotation_strategy=kwargs.get("proxy_rotation_strategy"),
|
proxy_rotation_strategy=kwargs.get("proxy_rotation_strategy"),
|
||||||
|
# Browser Location and Identity Parameters
|
||||||
|
locale=kwargs.get("locale", None),
|
||||||
|
timezone_id=kwargs.get("timezone_id", None),
|
||||||
|
geolocation=kwargs.get("geolocation", None),
|
||||||
# SSL Parameters
|
# SSL Parameters
|
||||||
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
|
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
|
||||||
# Caching Parameters
|
# Caching Parameters
|
||||||
@@ -1166,6 +1236,9 @@ class CrawlerRunConfig():
|
|||||||
"scraping_strategy": self.scraping_strategy,
|
"scraping_strategy": self.scraping_strategy,
|
||||||
"proxy_config": self.proxy_config,
|
"proxy_config": self.proxy_config,
|
||||||
"proxy_rotation_strategy": self.proxy_rotation_strategy,
|
"proxy_rotation_strategy": self.proxy_rotation_strategy,
|
||||||
|
"locale": self.locale,
|
||||||
|
"timezone_id": self.timezone_id,
|
||||||
|
"geolocation": self.geolocation,
|
||||||
"fetch_ssl_certificate": self.fetch_ssl_certificate,
|
"fetch_ssl_certificate": self.fetch_ssl_certificate,
|
||||||
"cache_mode": self.cache_mode,
|
"cache_mode": self.cache_mode,
|
||||||
"session_id": self.session_id,
|
"session_id": self.session_id,
|
||||||
|
|||||||
@@ -76,6 +76,51 @@ class ManagedBrowser:
|
|||||||
_cleanup(): Terminates the browser process and removes the temporary directory.
|
_cleanup(): Terminates the browser process and removes the temporary directory.
|
||||||
create_profile(): Static method to create a user profile by launching a browser for user interaction.
|
create_profile(): Static method to create a user profile by launching a browser for user interaction.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
@staticmethod
|
||||||
|
def build_browser_flags(config: BrowserConfig) -> List[str]:
|
||||||
|
"""Common CLI flags for launching Chromium"""
|
||||||
|
flags = [
|
||||||
|
"--disable-gpu",
|
||||||
|
"--disable-gpu-compositing",
|
||||||
|
"--disable-software-rasterizer",
|
||||||
|
"--no-sandbox",
|
||||||
|
"--disable-dev-shm-usage",
|
||||||
|
"--no-first-run",
|
||||||
|
"--no-default-browser-check",
|
||||||
|
"--disable-infobars",
|
||||||
|
"--window-position=0,0",
|
||||||
|
"--ignore-certificate-errors",
|
||||||
|
"--ignore-certificate-errors-spki-list",
|
||||||
|
"--disable-blink-features=AutomationControlled",
|
||||||
|
"--window-position=400,0",
|
||||||
|
"--disable-renderer-backgrounding",
|
||||||
|
"--disable-ipc-flooding-protection",
|
||||||
|
"--force-color-profile=srgb",
|
||||||
|
"--mute-audio",
|
||||||
|
"--disable-background-timer-throttling",
|
||||||
|
]
|
||||||
|
if config.light_mode:
|
||||||
|
flags.extend(BROWSER_DISABLE_OPTIONS)
|
||||||
|
if config.text_mode:
|
||||||
|
flags.extend([
|
||||||
|
"--blink-settings=imagesEnabled=false",
|
||||||
|
"--disable-remote-fonts",
|
||||||
|
"--disable-images",
|
||||||
|
"--disable-javascript",
|
||||||
|
"--disable-software-rasterizer",
|
||||||
|
"--disable-dev-shm-usage",
|
||||||
|
])
|
||||||
|
# proxy support
|
||||||
|
if config.proxy:
|
||||||
|
flags.append(f"--proxy-server={config.proxy}")
|
||||||
|
elif config.proxy_config:
|
||||||
|
creds = ""
|
||||||
|
if config.proxy_config.username and config.proxy_config.password:
|
||||||
|
creds = f"{config.proxy_config.username}:{config.proxy_config.password}@"
|
||||||
|
flags.append(f"--proxy-server={creds}{config.proxy_config.server}")
|
||||||
|
# dedupe
|
||||||
|
return list(dict.fromkeys(flags))
|
||||||
|
|
||||||
browser_type: str
|
browser_type: str
|
||||||
user_data_dir: str
|
user_data_dir: str
|
||||||
@@ -280,29 +325,29 @@ class ManagedBrowser:
|
|||||||
return browser_path
|
return browser_path
|
||||||
|
|
||||||
async def _get_browser_args(self) -> List[str]:
|
async def _get_browser_args(self) -> List[str]:
|
||||||
"""Returns browser-specific command line arguments"""
|
"""Returns full CLI args for launching the browser"""
|
||||||
base_args = [await self._get_browser_path()]
|
base = [await self._get_browser_path()]
|
||||||
|
|
||||||
if self.browser_type == "chromium":
|
if self.browser_type == "chromium":
|
||||||
args = [
|
flags = [
|
||||||
f"--remote-debugging-port={self.debugging_port}",
|
f"--remote-debugging-port={self.debugging_port}",
|
||||||
f"--user-data-dir={self.user_data_dir}",
|
f"--user-data-dir={self.user_data_dir}",
|
||||||
]
|
]
|
||||||
if self.headless:
|
if self.headless:
|
||||||
args.append("--headless=new")
|
flags.append("--headless=new")
|
||||||
|
# merge common launch flags
|
||||||
|
flags.extend(self.build_browser_flags(self.browser_config))
|
||||||
elif self.browser_type == "firefox":
|
elif self.browser_type == "firefox":
|
||||||
args = [
|
flags = [
|
||||||
"--remote-debugging-port",
|
"--remote-debugging-port",
|
||||||
str(self.debugging_port),
|
str(self.debugging_port),
|
||||||
"--profile",
|
"--profile",
|
||||||
self.user_data_dir,
|
self.user_data_dir,
|
||||||
]
|
]
|
||||||
if self.headless:
|
if self.headless:
|
||||||
args.append("--headless")
|
flags.append("--headless")
|
||||||
else:
|
else:
|
||||||
raise NotImplementedError(f"Browser type {self.browser_type} not supported")
|
raise NotImplementedError(f"Browser type {self.browser_type} not supported")
|
||||||
|
return base + flags
|
||||||
return base_args + args
|
|
||||||
|
|
||||||
async def cleanup(self):
|
async def cleanup(self):
|
||||||
"""Cleanup browser process and temporary directory"""
|
"""Cleanup browser process and temporary directory"""
|
||||||
@@ -789,6 +834,23 @@ class BrowserManager:
|
|||||||
# Update context settings with text mode settings
|
# Update context settings with text mode settings
|
||||||
context_settings.update(text_mode_settings)
|
context_settings.update(text_mode_settings)
|
||||||
|
|
||||||
|
# inject locale / tz / geo if user provided them
|
||||||
|
if crawlerRunConfig:
|
||||||
|
if crawlerRunConfig.locale:
|
||||||
|
context_settings["locale"] = crawlerRunConfig.locale
|
||||||
|
if crawlerRunConfig.timezone_id:
|
||||||
|
context_settings["timezone_id"] = crawlerRunConfig.timezone_id
|
||||||
|
if crawlerRunConfig.geolocation:
|
||||||
|
context_settings["geolocation"] = {
|
||||||
|
"latitude": crawlerRunConfig.geolocation.latitude,
|
||||||
|
"longitude": crawlerRunConfig.geolocation.longitude,
|
||||||
|
"accuracy": crawlerRunConfig.geolocation.accuracy,
|
||||||
|
}
|
||||||
|
# ensure geolocation permission
|
||||||
|
perms = context_settings.get("permissions", [])
|
||||||
|
perms.append("geolocation")
|
||||||
|
context_settings["permissions"] = perms
|
||||||
|
|
||||||
# Create and return the context with all settings
|
# Create and return the context with all settings
|
||||||
context = await self.browser.new_context(**context_settings)
|
context = await self.browser.new_context(**context_settings)
|
||||||
|
|
||||||
@@ -821,6 +883,10 @@ class BrowserManager:
|
|||||||
"semaphore_count",
|
"semaphore_count",
|
||||||
"url"
|
"url"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
# Do NOT exclude locale, timezone_id, or geolocation as these DO affect browser context
|
||||||
|
# and should cause a new context to be created if they change
|
||||||
|
|
||||||
for key in ephemeral_keys:
|
for key in ephemeral_keys:
|
||||||
if key in config_dict:
|
if key in config_dict:
|
||||||
del config_dict[key]
|
del config_dict[key]
|
||||||
|
|||||||
70
docs/examples/use_geo_location.py
Normal file
70
docs/examples/use_geo_location.py
Normal file
@@ -0,0 +1,70 @@
|
|||||||
|
# use_geo_location.py
|
||||||
|
"""
|
||||||
|
Example: override locale, timezone, and geolocation using Crawl4ai patterns.
|
||||||
|
|
||||||
|
This demo uses `AsyncWebCrawler.arun()` to fetch a page with
|
||||||
|
browser context primed for specific locale, timezone, and GPS,
|
||||||
|
and saves a screenshot for visual verification.
|
||||||
|
"""
|
||||||
|
|
||||||
|
import asyncio
|
||||||
|
import base64
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import List
|
||||||
|
from crawl4ai import (
|
||||||
|
AsyncWebCrawler,
|
||||||
|
CrawlerRunConfig,
|
||||||
|
BrowserConfig,
|
||||||
|
GeolocationConfig,
|
||||||
|
CrawlResult,
|
||||||
|
)
|
||||||
|
|
||||||
|
async def demo_geo_override():
    """Crawl a geolocation test page with overrides and save a screenshot.

    Opens https://browserleaks.com/geo with a fixed locale, timezone and
    GPS position, then writes the captured screenshot to ``tmp/geo_test.png``
    next to this script.
    """
    print("\n=== Geo-Override Crawl ===")

    # Browser: headed, fixed viewport, not a managed browser.
    browser_settings = BrowserConfig(
        headless=False,
        viewport_width=1280,
        viewport_height=720,
        use_managed_browser=False,
    )

    # Spoofed position, kept separate so the run config stays readable.
    los_angeles = GeolocationConfig(
        latitude=34.0522,
        longitude=-118.2437,
        accuracy=10.0,
    )

    # Per-run options: locale, timezone, geolocation and screenshot capture.
    crawl_settings = CrawlerRunConfig(
        url="https://browserleaks.com/geo",  # page that displays the reported location
        locale="en-US",
        timezone_id="America/Los_Angeles",
        geolocation=los_angeles,
        screenshot=True,
        session_id="geo_test",
        delay_before_return_html=5,
    )

    async with AsyncWebCrawler(config=browser_settings) as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url=crawl_settings.url,
            config=crawl_settings,
        )
        first = results[0]

        if not first.screenshot:
            print("No screenshot captured, check configuration.")
        else:
            # The screenshot is base64 text; decode it and write it as PNG bytes.
            target_dir = Path(__file__).parent / "tmp"
            target_dir.mkdir(exist_ok=True)
            shot_path = target_dir / "geo_test.png"
            shot_path.write_bytes(base64.b64decode(first.screenshot))
            print(f"Saved screenshot to {shot_path}")


if __name__ == "__main__":
    asyncio.run(demo_geo_override())
|
||||||
@@ -263,7 +263,102 @@ See the full example in `docs/examples/identity_based_browsing.py` for a complet
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 7. Summary
|
## 7. Locale, Timezone, and Geolocation Control
|
||||||
|
|
||||||
|
In addition to using persistent profiles, Crawl4AI supports customizing your browser's locale, timezone, and geolocation settings. These features enhance your identity-based browsing experience by allowing you to control how websites perceive your location and regional settings.
|
||||||
|
|
||||||
|
### Setting Locale and Timezone
|
||||||
|
|
||||||
|
You can set the browser's locale and timezone through `CrawlerRunConfig`:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
url="https://example.com",
|
||||||
|
config=CrawlerRunConfig(
|
||||||
|
# Set browser locale (language and region formatting)
|
||||||
|
locale="fr-FR", # French (France)
|
||||||
|
|
||||||
|
# Set browser timezone
|
||||||
|
timezone_id="Europe/Paris",
|
||||||
|
|
||||||
|
# Other normal options...
|
||||||
|
magic=True,
|
||||||
|
page_timeout=60000
|
||||||
|
)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**How it works:**
|
||||||
|
- `locale` affects language preferences, date formats, number formats, etc.
|
||||||
|
- `timezone_id` affects JavaScript's Date object and time-related functionality
|
||||||
|
- These settings are applied when creating the browser context and maintained throughout the session
|
||||||
|
|
||||||
|
### Configuring Geolocation
|
||||||
|
|
||||||
|
Control the GPS coordinates reported by the browser's geolocation API:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, GeolocationConfig
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
url="https://maps.google.com", # Or any location-aware site
|
||||||
|
config=CrawlerRunConfig(
|
||||||
|
# Configure precise GPS coordinates
|
||||||
|
geolocation=GeolocationConfig(
|
||||||
|
latitude=48.8566, # Paris coordinates
|
||||||
|
longitude=2.3522,
|
||||||
|
accuracy=100 # Accuracy in meters (optional)
|
||||||
|
),
|
||||||
|
|
||||||
|
# This site will see you as being in Paris
|
||||||
|
page_timeout=60000
|
||||||
|
)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
**Important notes:**
|
||||||
|
- When `geolocation` is specified, the browser is automatically granted permission to access location
|
||||||
|
- Websites using the Geolocation API will receive the exact coordinates you specify
|
||||||
|
- This affects map services, store locators, delivery services, etc.
|
||||||
|
- Combined with the appropriate `locale` and `timezone_id`, you can create a fully consistent location profile
|
||||||
|
|
||||||
|
### Combining with Managed Browsers
|
||||||
|
|
||||||
|
These settings work perfectly with managed browsers for a complete identity solution:
|
||||||
|
|
||||||
|
```python
|
||||||
|
from crawl4ai import (
|
||||||
|
AsyncWebCrawler, BrowserConfig, CrawlerRunConfig,
|
||||||
|
GeolocationConfig
|
||||||
|
)
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
use_managed_browser=True,
|
||||||
|
user_data_dir="/path/to/my-profile",
|
||||||
|
browser_type="chromium"
|
||||||
|
)
|
||||||
|
|
||||||
|
crawl_config = CrawlerRunConfig(
|
||||||
|
# Location settings
|
||||||
|
locale="es-MX", # Spanish (Mexico)
|
||||||
|
timezone_id="America/Mexico_City",
|
||||||
|
geolocation=GeolocationConfig(
|
||||||
|
latitude=19.4326, # Mexico City
|
||||||
|
longitude=-99.1332
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
result = await crawler.arun(url="https://example.com", config=crawl_config)
|
||||||
|
```
|
||||||
|
|
||||||
|
Combining persistent profiles with precise geolocation and region settings gives you complete control over your digital identity.
|
||||||
|
|
||||||
|
## 8. Summary
|
||||||
|
|
||||||
- **Create** your user-data directory either:
|
- **Create** your user-data directory either:
|
||||||
- By launching Chrome/Chromium externally with `--user-data-dir=/some/path`
|
- By launching Chrome/Chromium externally with `--user-data-dir=/some/path`
|
||||||
@@ -271,6 +366,7 @@ See the full example in `docs/examples/identity_based_browsing.py` for a complet
|
|||||||
- Or through the interactive interface with `profiler.interactive_manager()`
|
- Or through the interactive interface with `profiler.interactive_manager()`
|
||||||
- **Log in** or configure sites as needed, then close the browser
|
- **Log in** or configure sites as needed, then close the browser
|
||||||
- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`
|
- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`
|
||||||
|
- **Customize** identity aspects with `locale`, `timezone_id`, and `geolocation`
|
||||||
- **List and reuse** profiles with `BrowserProfiler.list_profiles()`
|
- **List and reuse** profiles with `BrowserProfiler.list_profiles()`
|
||||||
- **Manage** your profiles with the dedicated `BrowserProfiler` class
|
- **Manage** your profiles with the dedicated `BrowserProfiler` class
|
||||||
- Enjoy **persistent** sessions that reflect your real identity
|
- Enjoy **persistent** sessions that reflect your real identity
|
||||||
|
|||||||
@@ -137,6 +137,11 @@ class CrawlerRunConfig:
|
|||||||
screenshot=False,
|
screenshot=False,
|
||||||
pdf=False,
|
pdf=False,
|
||||||
capture_mhtml=False,
|
capture_mhtml=False,
|
||||||
|
# Location and Identity Parameters
|
||||||
|
locale=None, # e.g. "en-US", "fr-FR"
|
||||||
|
timezone_id=None, # e.g. "America/New_York"
|
||||||
|
geolocation=None, # GeolocationConfig object
|
||||||
|
# Resource Management
|
||||||
enable_rate_limiting=False,
|
enable_rate_limiting=False,
|
||||||
rate_limit_config=None,
|
rate_limit_config=None,
|
||||||
memory_threshold_percent=70.0,
|
memory_threshold_percent=70.0,
|
||||||
|
|||||||
Reference in New Issue
Block a user