feat(browser): add BrowserProfiler class for identity-based browsing
Adds a new BrowserProfiler class that provides comprehensive management of browser profiles for identity-based crawling.

Features include:
- Interactive profile creation and management
- Profile listing, retrieval, and deletion
- Guided console interface
- Migration of profile management from ManagedBrowser
- New example script for identity-based browsing

ALSO:
- Updates logging format in AsyncWebCrawler
- Removes content filter from hello_world example
- Relaxes httpx version constraint

BREAKING CHANGE: Profile management methods from ManagedBrowser are now deprecated and delegate to BrowserProfiler.
This commit is contained in:
@@ -16,9 +16,9 @@ async def main():
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(
|
||||
threshold=0.48, threshold_type="fixed", min_word_threshold=0
|
||||
)
|
||||
# content_filter=PruningContentFilter(
|
||||
# threshold=0.48, threshold_type="fixed", min_word_threshold=0
|
||||
# )
|
||||
),
|
||||
)
|
||||
result : CrawlResult = await crawler.arun(
|
||||
|
||||
108
docs/examples/identity_based_browsing.py
Normal file
108
docs/examples/identity_based_browsing.py
Normal file
@@ -0,0 +1,108 @@
|
||||
"""
|
||||
Identity-Based Browsing Example with Crawl4AI
|
||||
|
||||
This example demonstrates how to:
|
||||
1. Create a persistent browser profile interactively
|
||||
2. List available profiles
|
||||
3. Use a saved profile for crawling authenticated sites
|
||||
4. Delete profiles when no longer needed
|
||||
|
||||
Uses the new BrowserProfiler class for profile management.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
from crawl4ai.browser_profiler import BrowserProfiler
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
from colorama import Fore, Style, init
|
||||
|
||||
# Initialize colorama
|
||||
init()
|
||||
|
||||
# Create a shared logger instance
|
||||
logger = AsyncLogger(verbose=True)
|
||||
|
||||
# Create a shared BrowserProfiler instance
|
||||
profiler = BrowserProfiler(logger=logger)
|
||||
|
||||
|
||||
async def crawl_with_profile(profile_path, url):
    """Use a saved browser profile to crawl an authenticated page.

    Args:
        profile_path: Directory of the persistent browser profile; handed to
            ``BrowserConfig`` as ``user_data_dir``.
        url: Address of the page to crawl.

    Returns:
        The crawl result when ``result.success`` is truthy, otherwise ``None``
        (the failure is reported through the shared logger).
    """
    logger.info(f"\nCrawling {Fore.CYAN}{url}{Style.RESET_ALL} using profile at {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="CRAWL")

    # Create browser config with the profile path
    browser_config = BrowserConfig(
        headless=True,  # Set to False if you want to see the browser window
        use_managed_browser=True,  # Required for persistent profiles
        user_data_dir=profile_path,
    )

    # This coroutine always runs inside a loop, so ask for the running loop
    # explicitly instead of relying on get_event_loop()'s fallback behaviour.
    loop = asyncio.get_running_loop()
    start_time = loop.time()

    # Initialize crawler with the browser config
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Crawl the URL - You should have access to authenticated content now
        result = await crawler.arun(url)

        elapsed_time = loop.time() - start_time

        if not result.success:
            # Log error status
            logger.error_status(url, result.error_message, tag="CRAWL")
            return None

        # Use url_status method for consistent logging
        logger.url_status(url, True, elapsed_time, tag="CRAWL")

        # Print page title or some indication of success.
        # NOTE(review): metadata is assumed to be possibly None/empty on some
        # results; fall back to an empty dict so a missing title cannot raise.
        metadata = result.metadata or {}
        title = metadata.get("title", "")
        logger.success(f"Page title: {Fore.GREEN}{title}{Style.RESET_ALL}", tag="CRAWL")
        return result
|
||||
|
||||
|
||||
async def main():
    """Run the identity-based browsing demo.

    Asks the user to pick a mode. Interactive mode hands control to
    ``profiler.interactive_manager`` with ``crawl_with_profile`` as the crawl
    callback. Any other answer runs the automatic flow: reuse the first listed
    profile (or create one if none exist) and crawl the sample URLs with it.
    """
    logger.info(f"{Fore.CYAN}Identity-Based Browsing Example with Crawl4AI{Style.RESET_ALL}", tag="DEMO")
    logger.info("This example demonstrates using profiles for authenticated browsing", tag="DEMO")

    # Choose between interactive mode and automatic mode.
    # Strip surrounding whitespace so answers such as "i " or " I" are still
    # recognised instead of silently falling through to automatic mode.
    mode = input(f"{Fore.CYAN}Run in [i]nteractive mode or [a]utomatic mode? (i/a): {Style.RESET_ALL}").strip().lower()

    if mode == 'i':
        # Interactive profile management - use the interactive_manager method
        # Pass the crawl_with_profile function as the callback for the "crawl a website" option
        await profiler.interactive_manager(crawl_callback=crawl_with_profile)
        return

    # Automatic mode - simplified example
    profiles = profiler.list_profiles()

    if profiles:
        # Use the first listed profile.
        # NOTE(review): presumably list_profiles() returns the most recent
        # profile first - confirm against BrowserProfiler.
        profile_path = profiles[0]["path"]
        logger.info(f"Using existing profile: {Fore.CYAN}{profiles[0]['name']}{Style.RESET_ALL}", tag="DEMO")
    else:
        # Create a new profile if none exists
        logger.info("No profiles found. Creating a new one...", tag="DEMO")
        profile_path = await profiler.create_profile()
        if not profile_path:
            logger.error("Cannot proceed without a valid profile", tag="DEMO")
            return

    # Example: Crawl an authenticated page
    urls_to_crawl = [
        "https://github.com/settings/profile",  # GitHub requires login
        # "https://twitter.com/home",  # Twitter requires login
        # "https://www.linkedin.com/feed/",  # LinkedIn requires login
    ]

    for url in urls_to_crawl:
        await crawl_with_profile(profile_path, url)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Script entry point: drive the async demo to completion and report
    # interruptions or failures through the shared logger.
    try:
        asyncio.run(main())
    except KeyboardInterrupt:
        # Ctrl+C is an expected way to leave the demo.
        logger.warning("Example interrupted by user", tag="DEMO")
    except Exception as exc:
        # Top-level boundary: surface the error message instead of a traceback.
        logger.error(f"Error in example: {exc!s}", tag="DEMO")
|
||||
@@ -167,13 +167,114 @@ async with AsyncWebCrawler() as crawler:
|
||||
|
||||
---
|
||||
|
||||
## 6. Summary
|
||||
## 6. Using the BrowserProfiler Class
|
||||
|
||||
- **Create** your user-data directory by launching Chrome/Chromium externally with `--user-data-dir=/some/path`.
|
||||
- **Log in** or configure sites as needed, then close the browser.
|
||||
- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`.
|
||||
- Enjoy **persistent** sessions that reflect your real identity.
|
||||
- If you only need quick, ephemeral automation, **Magic Mode** might suffice.
|
||||
Crawl4AI provides a dedicated `BrowserProfiler` class for managing browser profiles, making it easy to create, list, and delete profiles for identity-based browsing.
|
||||
|
||||
### Creating and Managing Profiles with BrowserProfiler
|
||||
|
||||
The `BrowserProfiler` class offers a comprehensive API for browser profile management:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import BrowserProfiler
|
||||
|
||||
async def manage_profiles():
|
||||
# Create a profiler instance
|
||||
profiler = BrowserProfiler()
|
||||
|
||||
# Create a profile interactively - opens a browser window
|
||||
profile_path = await profiler.create_profile(
|
||||
profile_name="my-login-profile" # Optional: name your profile
|
||||
)
|
||||
|
||||
print(f"Profile saved at: {profile_path}")
|
||||
|
||||
# List all available profiles
|
||||
profiles = profiler.list_profiles()
|
||||
|
||||
for profile in profiles:
|
||||
print(f"Profile: {profile['name']}")
|
||||
print(f" Path: {profile['path']}")
|
||||
print(f" Created: {profile['created']}")
|
||||
print(f" Browser type: {profile['type']}")
|
||||
|
||||
# Get a specific profile path by name
|
||||
specific_profile = profiler.get_profile_path("my-login-profile")
|
||||
|
||||
# Delete a profile when no longer needed
|
||||
success = profiler.delete_profile("old-profile-name")
|
||||
|
||||
asyncio.run(manage_profiles())
|
||||
```
|
||||
|
||||
**How profile creation works:**
|
||||
1. A browser window opens for you to interact with
|
||||
2. You log in to websites, set preferences, etc.
|
||||
3. When you're done, press 'q' in the terminal to close the browser
|
||||
4. The profile is saved in the Crawl4AI profiles directory
|
||||
5. You can use the returned path with `BrowserConfig.user_data_dir`
|
||||
|
||||
### Interactive Profile Management
|
||||
|
||||
The `BrowserProfiler` also offers an interactive management console that guides you through profile creation, listing, and deletion:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import BrowserProfiler, AsyncWebCrawler, BrowserConfig
|
||||
|
||||
# Define a function to use a profile for crawling
|
||||
async def crawl_with_profile(profile_path, url):
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
use_managed_browser=True,
|
||||
user_data_dir=profile_path
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url)
|
||||
return result
|
||||
|
||||
async def main():
|
||||
# Create a profiler instance
|
||||
profiler = BrowserProfiler()
|
||||
|
||||
# Launch the interactive profile manager
|
||||
# Passing the crawl function as a callback adds a "crawl with profile" option
|
||||
await profiler.interactive_manager(crawl_callback=crawl_with_profile)
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### Legacy Methods
|
||||
|
||||
For backward compatibility, the previous methods on `ManagedBrowser` are still available, but they delegate to the new `BrowserProfiler` class:
|
||||
|
||||
```python
|
||||
from crawl4ai.browser_manager import ManagedBrowser
|
||||
|
||||
# These methods still work but use BrowserProfiler internally
|
||||
profiles = ManagedBrowser.list_profiles()
|
||||
```
|
||||
|
||||
### Complete Example
|
||||
|
||||
See the full example in `docs/examples/identity_based_browsing.py` for a complete demonstration of creating and using profiles for authenticated browsing using the new `BrowserProfiler` class.
|
||||
|
||||
---
|
||||
|
||||
## 7. Summary
|
||||
|
||||
- **Create** your user-data directory either:
|
||||
- By launching Chrome/Chromium externally with `--user-data-dir=/some/path`
|
||||
- Or by using the built-in `BrowserProfiler.create_profile()` method
|
||||
- Or through the interactive interface with `profiler.interactive_manager()`
|
||||
- **Log in** or configure sites as needed, then close the browser
|
||||
- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`
|
||||
- **List and reuse** profiles with `BrowserProfiler.list_profiles()`
|
||||
- **Manage** your profiles with the dedicated `BrowserProfiler` class
|
||||
- Enjoy **persistent** sessions that reflect your real identity
|
||||
- If you only need quick, ephemeral automation, **Magic Mode** might suffice
|
||||
|
||||
**Recommended**: Always prefer a **Managed Browser** for robust, identity-based crawling and simpler interactions with complex sites. Use **Magic Mode** for quick tasks or prototypes where persistent data is unnecessary.
|
||||
|
||||
|
||||
Reference in New Issue
Block a user