From cba4a466e50a5bd8031b494f26e9e868e59eeaa9 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sun, 2 Mar 2025 20:32:29 +0800 Subject: [PATCH] feat(browser): add BrowserProfiler class for identity-based browsing Adds a new BrowserProfiler class that provides comprehensive management of browser profiles for identity-based crawling. Features include: - Interactive profile creation and management - Profile listing, retrieval, and deletion - Guided console interface - Migration of profile management from ManagedBrowser - New example script for identity-based browsing ALSO: - Updates logging format in AsyncWebCrawler - Removes content filter from hello_world example - Relaxes httpx version constraint BREAKING CHANGE: Profile management methods from ManagedBrowser are now deprecated and delegate to BrowserProfiler --- crawl4ai/__init__.py | 2 + crawl4ai/async_webcrawler.py | 4 +- crawl4ai/browser_manager.py | 76 +++ crawl4ai/browser_profiler.py | 544 ++++++++++++++++++ crawl4ai/user_agent_generator.py | 3 +- docs/examples/hello_world.py | 6 +- docs/examples/identity_based_browsing.py | 108 ++++ .../md_v2/advanced/identity-based-crawling.md | 113 +++- pyproject.toml | 2 +- 9 files changed, 844 insertions(+), 14 deletions(-) create mode 100644 crawl4ai/browser_profiler.py create mode 100644 docs/examples/identity_based_browsing.py diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 12322540..31e4ca7a 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -42,6 +42,7 @@ from .async_dispatcher import ( ) from .docker_client import Crawl4aiDockerClient from .hub import CrawlerHub +from .browser_profiler import BrowserProfiler from .deep_crawling import ( DeepCrawlStrategy, BFSDeepCrawlStrategy, @@ -66,6 +67,7 @@ __all__ = [ "AsyncLoggerBase", "AsyncLogger", "AsyncWebCrawler", + "BrowserProfiler", "DeepCrawlStrategy", "BFSDeepCrawlStrategy", "BestFirstCrawlingStrategy", diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 
1060fdcf..b62a9797 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -584,9 +584,9 @@ class AsyncWebCrawler: # Log processing completion self.logger.info( - message="Processed {url:.50}... | Time: {timing}ms", + message="{url:.50}... | Time: {timing}s", tag="SCRAPE", - params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000)}, + params={"url": _url, "timing": int((time.perf_counter() - t1) * 1000) / 1000}, ) ################################ diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 29c2ba1b..acc45c4c 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -74,6 +74,7 @@ class ManagedBrowser: _get_browser_args(): Returns browser-specific command line arguments. _get_user_data_dir(): Returns the user data directory path. _cleanup(): Terminates the browser process and removes the temporary directory. + create_profile(): Static method to create a user profile by launching a browser for user interaction. """ browser_type: str @@ -288,6 +289,80 @@ class ManagedBrowser: tag="ERROR", params={"error": str(e)}, ) + + # These methods have been moved to BrowserProfiler class + @staticmethod + async def create_profile(browser_config=None, profile_name=None, logger=None): + """ + This method has been moved to the BrowserProfiler class. + + Creates a browser profile by launching a browser for interactive user setup + and waits until the user closes it. The profile is stored in a directory that + can be used later with BrowserConfig.user_data_dir. + + Please use BrowserProfiler.create_profile() instead. 
+ + Example: + ```python + from crawl4ai.browser_profiler import BrowserProfiler + + profiler = BrowserProfiler() + profile_path = await profiler.create_profile(profile_name="my-login-profile") + ``` + """ + from .browser_profiler import BrowserProfiler + + # Create a BrowserProfiler instance and delegate to it + profiler = BrowserProfiler(logger=logger) + return await profiler.create_profile(profile_name=profile_name, browser_config=browser_config) + + @staticmethod + def list_profiles(): + """ + This method has been moved to the BrowserProfiler class. + + Lists all available browser profiles in the Crawl4AI profiles directory. + + Please use BrowserProfiler.list_profiles() instead. + + Example: + ```python + from crawl4ai.browser_profiler import BrowserProfiler + + profiler = BrowserProfiler() + profiles = profiler.list_profiles() + ``` + """ + from .browser_profiler import BrowserProfiler + + # Create a BrowserProfiler instance and delegate to it + profiler = BrowserProfiler() + return profiler.list_profiles() + + @staticmethod + def delete_profile(profile_name_or_path): + """ + This method has been moved to the BrowserProfiler class. + + Delete a browser profile by name or path. + + Please use BrowserProfiler.delete_profile() instead. 
class BrowserProfiler:
    """
    Manage persistent browser profiles for identity-based crawling.

    A profile is a browser user-data directory stored under
    ``<crawl4ai home>/profiles/<name>``. The profiler can:

    - create a profile interactively (``create_profile``)
    - list the profiles it finds (``list_profiles``)
    - resolve a profile name to its directory (``get_profile_path``)
    - delete a profile (``delete_profile``)
    - drive all of the above from a console menu (``interactive_manager``)

    A directory counts as a profile when it contains a Chromium
    ``Preferences`` file (top level or under ``Default/``) or a Firefox
    ``prefs.js`` file.
    """

    def __init__(self, logger: "Optional[AsyncLoggerBase]" = None):
        """
        Initialize the BrowserProfiler.

        Args:
            logger (AsyncLoggerBase, optional): Logger for outputting messages.
                If it is None or not an AsyncLoggerBase instance, a verbose
                AsyncLogger is created instead.
        """
        # Initialize colorama for colorful terminal output
        init()

        if isinstance(logger, AsyncLoggerBase):
            self.logger = logger
        else:
            self.logger = AsyncLogger(verbose=True)

        # Ensure profiles directory exists
        self.profiles_dir = os.path.join(get_home_folder(), "profiles")
        os.makedirs(self.profiles_dir, exist_ok=True)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------

    @staticmethod
    def _detect_profile_type(profile_path: str) -> Optional[str]:
        """Return "chromium" or "firefox" if the directory looks like a profile, else None."""
        if not os.path.isdir(profile_path):
            return None

        chromium_markers = (
            os.path.join(profile_path, "Preferences"),
            os.path.join(profile_path, "Default", "Preferences"),
        )
        if any(os.path.exists(marker) for marker in chromium_markers):
            return "chromium"
        if os.path.exists(os.path.join(profile_path, "prefs.js")):
            return "firefox"
        return None

    def _is_profiles_root(self, path: str) -> bool:
        """Return True when *path* resolves to the profiles directory itself."""
        return os.path.realpath(path) == os.path.realpath(self.profiles_dir)

    def _log_profile_instructions(self, profile_name: str, profile_path: str) -> None:
        """Print the banner and step-by-step instructions shown before the browser opens."""
        border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}"
        self.logger.info(f"\n{border}", tag="PROFILE")
        self.logger.info(f"Creating browser profile: {Fore.GREEN}{profile_name}{Style.RESET_ALL}", tag="PROFILE")
        self.logger.info(f"Profile directory: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="PROFILE")

        self.logger.info("\nInstructions:", tag="PROFILE")
        self.logger.info("1. A browser window will open for you to set up your profile.", tag="PROFILE")
        self.logger.info(f"2. {Fore.CYAN}Log in to websites{Style.RESET_ALL}, configure settings, etc. as needed.", tag="PROFILE")
        self.logger.info(f"3. When you're done, {Fore.YELLOW}press 'q' in this terminal{Style.RESET_ALL} to close the browser.", tag="PROFILE")
        self.logger.info("4. The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE")
        self.logger.info(f"{border}\n", tag="PROFILE")

    def _select_profile(self, profiles: List[Dict[str, Any]], prompt: str) -> Optional[Dict[str, Any]]:
        """
        Show a numbered list of profiles and ask the user to pick one.

        Returns the chosen profile dict, or None when the user cancels with
        'c' or enters something that is not a valid list number (an error is
        logged in that case).
        """
        self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES")
        for number, profile in enumerate(profiles, start=1):
            self.logger.info(f"[{number}] {profile['name']}", tag="PROFILES")

        answer = input(prompt)
        if answer.lower() == 'c':
            return None

        # Keep the try body minimal so only the number parsing is covered.
        try:
            index = int(answer) - 1
        except ValueError:
            self.logger.error("Please enter a valid number", tag="PROFILES")
            return None

        if not 0 <= index < len(profiles):
            self.logger.error("Invalid profile number", tag="PROFILES")
            return None
        return profiles[index]

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------

    async def create_profile(
        self,
        profile_name: Optional[str] = None,
        browser_config: "Optional[BrowserConfig]" = None,
    ) -> Optional[str]:
        """
        Create a browser profile by launching a visible browser for manual setup.

        The call returns once the user presses 'q' in the terminal or closes
        the browser. The resulting directory can be used later with
        BrowserConfig.user_data_dir.

        Args:
            profile_name (str, optional): Name for the profile directory.
                If empty, a timestamp-based name is generated. Characters other
                than letters, digits, '-' and '_' are replaced by '_'.
            browser_config (BrowserConfig, optional): Only ``browser_type`` and
                ``debugging_port`` are read from it. The browser is always
                launched headed and the passed object is not modified.

        Returns:
            str: Path to the created profile directory, or None if creation failed

        Example:
            ```python
            profiler = BrowserProfiler()
            profile_path = await profiler.create_profile(profile_name="my-login-profile")

            browser_config = BrowserConfig(
                headless=True,
                use_managed_browser=True,
                user_data_dir=profile_path
            )
            async with AsyncWebCrawler(config=browser_config) as crawler:
                result = await crawler.arun("https://example.com/dashboard")
            ```
        """
        import threading

        # Create default browser config if none provided
        if browser_config is None:
            browser_config = BrowserConfig(
                browser_type="chromium",
                headless=False,  # Must be visible for user interaction
                verbose=True,
            )

        # Generate profile name if not provided
        if not profile_name:
            timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
            profile_name = f"profile_{timestamp}_{uuid.uuid4().hex[:6]}"

        # Sanitize profile name (replace spaces and special chars)
        profile_name = "".join(c if c.isalnum() or c in "-_" else "_" for c in profile_name)

        profile_path = os.path.join(self.profiles_dir, profile_name)
        os.makedirs(profile_path, exist_ok=True)

        self._log_profile_instructions(profile_name, profile_path)

        managed_browser = ManagedBrowser(
            browser_type=browser_config.browser_type,
            user_data_dir=profile_path,
            headless=False,  # Must be visible
            logger=self.logger,
            debugging_port=browser_config.debugging_port,
        )

        # signal.signal() may only be called from the main thread; elsewhere we
        # simply run without custom interrupt handling.
        install_signal_handlers = threading.current_thread() is threading.main_thread()
        previous_handlers = {}
        # Strong references keep the cleanup task alive until it has finished.
        signal_tasks = set()

        def restore_signal_handlers():
            for signum, handler in previous_handlers.items():
                # getsignal() returns None for handlers not installed from
                # Python; those cannot be passed back to signal.signal().
                if handler is not None:
                    signal.signal(signum, handler)

        async def cleanup_handler(sig, frame):
            self.logger.warning("\nCleaning up browser process...", tag="PROFILE")
            await managed_browser.cleanup()
            restore_signal_handlers()
            if sig == signal.SIGINT:
                self.logger.error("Profile creation interrupted. Profile may be incomplete.", tag="PROFILE")
                sys.exit(1)

        def sigint_handler(sig, frame):
            task = asyncio.create_task(cleanup_handler(sig, frame))
            signal_tasks.add(task)
            task.add_done_callback(signal_tasks.discard)

        if install_signal_handlers:
            for signum in (signal.SIGINT, signal.SIGTERM):
                previous_handlers[signum] = signal.getsignal(signum)
                signal.signal(signum, sigint_handler)

        # Event to signal when user is done with the browser
        user_done_event = asyncio.Event()

        async def listen_for_quit_command():
            """Watch stdin for 'q' without blocking the event loop."""
            try:
                import termios
                import tty
                import select
            except ImportError:
                # No POSIX terminal control on this platform.
                self.logger.warning(
                    "Keyboard shortcut is not available on this platform. Close the browser window when you are done.",
                    tag="PROFILE",
                )
                return

            self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' when you've finished using the browser...{Style.RESET_ALL}", tag="PROFILE")

            # Save original terminal settings; this fails when stdin is not a terminal.
            try:
                fd = sys.stdin.fileno()
                old_settings = termios.tcgetattr(fd)
            except (termios.error, OSError, ValueError):
                self.logger.warning(
                    "Standard input is not a terminal. Close the browser window when you are done.",
                    tag="PROFILE",
                )
                return

            try:
                # Switch to non-canonical mode (no line buffering)
                tty.setcbreak(fd)

                while True:
                    # Zero timeout: poll only. Pacing comes from the sleep
                    # below, so other tasks keep running in the meantime.
                    readable, _, _ = select.select([sys.stdin], [], [], 0)
                    if readable:
                        key = sys.stdin.read(1)
                        if key.lower() == 'q':
                            self.logger.info(f"{Fore.GREEN}Closing browser and saving profile...{Style.RESET_ALL}", tag="PROFILE")
                            user_done_event.set()
                            return

                    # Check if the browser process has already exited
                    if managed_browser.browser_process and managed_browser.browser_process.poll() is not None:
                        self.logger.info("Browser already closed. Ending input listener.", tag="PROFILE")
                        user_done_event.set()
                        return

                    await asyncio.sleep(0.1)
            finally:
                # Restore terminal settings
                termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)

        try:
            await managed_browser.start()

            browser_process = managed_browser.browser_process
            if not browser_process:
                self.logger.error("Failed to start browser process.", tag="PROFILE")
                return None

            self.logger.info(f"Browser launched. {Fore.CYAN}Waiting for you to finish...{Style.RESET_ALL}", tag="PROFILE")

            listener_task = asyncio.create_task(listen_for_quit_command())

            # Wait for either the user to press 'q' or for the browser process to exit naturally
            while not user_done_event.is_set() and browser_process.poll() is None:
                await asyncio.sleep(0.5)

            if listener_task.done():
                # Retrieve a possible listener failure so it is reported here
                # instead of surfacing later as an unretrieved task exception.
                if not listener_task.cancelled():
                    listener_error = listener_task.exception()
                    if listener_error is not None:
                        self.logger.warning(f"Keyboard listener stopped: {listener_error}", tag="PROFILE")
            else:
                listener_task.cancel()
                try:
                    await listener_task
                except asyncio.CancelledError:
                    pass

            # If the browser is still running and the user pressed 'q', terminate it
            if browser_process.poll() is None and user_done_event.is_set():
                self.logger.info("Terminating browser process...", tag="PROFILE")
                await managed_browser.cleanup()

            self.logger.success(f"Browser closed. Profile saved at: {Fore.GREEN}{profile_path}{Style.RESET_ALL}", tag="PROFILE")

        except Exception as e:
            # Cleanup happens in the finally clause below.
            self.logger.error(f"Error creating profile: {str(e)}", tag="PROFILE")
            return None
        finally:
            restore_signal_handlers()
            # Make sure browser is fully cleaned up
            await managed_browser.cleanup()

        return profile_path

    def list_profiles(self) -> List[Dict[str, Any]]:
        """
        List the browser profiles found in the profiles directory.

        Returns:
            list: One dict per profile, newest first:
                {"name": str, "path": str, "created": datetime, "type": "chromium" | "firefox"}
                "created" is taken from os.path.getctime(), which is the
                creation time on Windows and the last metadata change on Unix.

        Example:
            ```python
            profiler = BrowserProfiler()
            for profile in profiler.list_profiles():
                print(profile["name"], profile["path"], profile["created"], profile["type"])
            ```
        """
        if not os.path.isdir(self.profiles_dir):
            return []

        profiles = []
        for name in os.listdir(self.profiles_dir):
            profile_path = os.path.join(self.profiles_dir, name)

            browser_type = self._detect_profile_type(profile_path)
            if browser_type is None:
                continue

            try:
                created = datetime.datetime.fromtimestamp(os.path.getctime(profile_path))
            except OSError:
                # The directory vanished between listing and inspection.
                continue

            profiles.append({
                "name": name,
                "path": profile_path,
                "created": created,
                "type": browser_type,
            })

        # Sort by creation time, newest first
        profiles.sort(key=lambda entry: entry["created"], reverse=True)
        return profiles

    def get_profile_path(self, profile_name: str) -> Optional[str]:
        """
        Get the full path to a profile by name.

        Args:
            profile_name (str): Name of the profile (not the full path)

        Returns:
            str: Full path to the profile directory, or None if the name is
                empty, resolves to the profiles directory itself, or does not
                point at a valid browser profile.

        Example:
            ```python
            profiler = BrowserProfiler()
            path = profiler.get_profile_path("my-profile")
            if path:
                print(f"Profile path: {path}")
            ```
        """
        if not profile_name:
            return None

        profile_path = os.path.join(self.profiles_dir, profile_name)
        if self._is_profiles_root(profile_path):
            return None
        if self._detect_profile_type(profile_path) is None:
            return None
        return profile_path

    def delete_profile(self, profile_name_or_path: str) -> bool:
        """
        Delete a browser profile by name or path.

        Args:
            profile_name_or_path (str): Name of the profile or full path to profile directory

        Returns:
            bool: True if the profile was deleted successfully, False otherwise.
                False is also returned for an empty argument, for the profiles
                directory itself, and for directories that do not look like a
                browser profile.

        Example:
            ```python
            profiler = BrowserProfiler()
            profiler.delete_profile("my-profile")
            profiler.delete_profile("/path/to/.crawl4ai/profiles/my-profile")
            ```
        """
        if not profile_name_or_path:
            return False

        if os.path.isabs(profile_name_or_path):
            profile_path = profile_name_or_path
        else:
            profile_path = os.path.join(self.profiles_dir, profile_name_or_path)

        # Never remove the directory that holds every profile.
        if self._is_profiles_root(profile_path):
            return False

        if self._detect_profile_type(profile_path) is None:
            return False  # Not a valid browser profile

        try:
            shutil.rmtree(profile_path)
        except OSError as exc:
            self.logger.error(f"Could not remove profile directory {profile_path}: {exc}", tag="PROFILES")
            return False
        return True

    async def interactive_manager(self, crawl_callback=None):
        """
        Launch an interactive profile management console.

        Args:
            crawl_callback (callable, optional): Called as
                ``crawl_callback(profile_path, url)`` when the user picks the
                crawl option. May be a coroutine function or a plain function;
                an awaitable result is awaited. Without a callback the crawl
                option is not offered.

        Example:
            ```python
            profiler = BrowserProfiler()

            async def my_crawl_function(profile_path, url):
                print(f"Crawling {url} with profile {profile_path}")

            await profiler.interactive_manager(crawl_callback=my_crawl_function)
            ```
        """
        import inspect

        exit_option = "5" if crawl_callback else "4"

        while True:
            self.logger.info(f"\n{Fore.CYAN}Profile Management Options:{Style.RESET_ALL}", tag="MENU")
            self.logger.info(f"1. {Fore.GREEN}Create a new profile{Style.RESET_ALL}", tag="MENU")
            self.logger.info(f"2. {Fore.YELLOW}List available profiles{Style.RESET_ALL}", tag="MENU")
            self.logger.info(f"3. {Fore.RED}Delete a profile{Style.RESET_ALL}", tag="MENU")

            # Only show crawl option if callback provided
            if crawl_callback:
                self.logger.info(f"4. {Fore.CYAN}Use a profile to crawl a website{Style.RESET_ALL}", tag="MENU")
                self.logger.info(f"5. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU")
            else:
                self.logger.info(f"4. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU")

            choice = input(f"\n{Fore.CYAN}Enter your choice (1-{exit_option}): {Style.RESET_ALL}").strip()

            if choice == "1":
                name = input(f"{Fore.GREEN}Enter a name for the new profile (or press Enter for auto-generated name): {Style.RESET_ALL}")
                await self.create_profile(name or None)

            elif choice == "2":
                profiles = self.list_profiles()
                if not profiles:
                    self.logger.warning(" No profiles found. Create one first with option 1.", tag="PROFILES")
                    continue

                self.logger.info("\nAvailable profiles:", tag="PROFILES")
                for number, profile in enumerate(profiles, start=1):
                    self.logger.info(f"[{number}] {Fore.CYAN}{profile['name']}{Style.RESET_ALL}", tag="PROFILES")
                    self.logger.info(f" Path: {Fore.YELLOW}{profile['path']}{Style.RESET_ALL}", tag="PROFILES")
                    self.logger.info(f" Created: {profile['created'].strftime('%Y-%m-%d %H:%M:%S')}", tag="PROFILES")
                    self.logger.info(f" Browser type: {profile['type']}", tag="PROFILES")
                    self.logger.info("", tag="PROFILES")  # Empty line for spacing

            elif choice == "3":
                profiles = self.list_profiles()
                if not profiles:
                    self.logger.warning("No profiles found to delete", tag="PROFILES")
                    continue

                profile = self._select_profile(
                    profiles,
                    f"{Fore.RED}Enter the number of the profile to delete (or 'c' to cancel): {Style.RESET_ALL}",
                )
                if profile is None:
                    continue

                profile_name = profile["name"]
                self.logger.info(f"Deleting profile: {Fore.YELLOW}{profile_name}{Style.RESET_ALL}", tag="PROFILES")

                confirm = input(f"{Fore.RED}Are you sure you want to delete this profile? (y/n): {Style.RESET_ALL}")
                if confirm.lower() == 'y':
                    if self.delete_profile(profile["path"]):
                        self.logger.success(f"Profile {Fore.GREEN}{profile_name}{Style.RESET_ALL} deleted successfully", tag="PROFILES")
                    else:
                        self.logger.error(f"Failed to delete profile {Fore.RED}{profile_name}{Style.RESET_ALL}", tag="PROFILES")

            elif choice == "4" and crawl_callback:
                profiles = self.list_profiles()
                if not profiles:
                    self.logger.warning("No profiles found. Create one first.", tag="PROFILES")
                    continue

                profile = self._select_profile(
                    profiles,
                    f"{Fore.CYAN}Enter the number of the profile to use (or 'c' to cancel): {Style.RESET_ALL}",
                )
                if profile is None:
                    continue

                url = input(f"{Fore.CYAN}Enter the URL to crawl: {Style.RESET_ALL}")
                if not url:
                    self.logger.error("No URL provided", tag="CRAWL")
                    continue

                # Errors raised by the callback propagate unchanged; they are
                # not reinterpreted as menu input errors.
                outcome = crawl_callback(profile["path"], url)
                if inspect.isawaitable(outcome):
                    await outcome

            elif choice == exit_option:
                self.logger.info("Exiting profile management", tag="MENU")
                break

            else:
                self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU")
async def crawl_with_profile(profile_path, url):
    """
    Crawl *url* with the saved browser profile at *profile_path*.

    Logs the outcome through the module-level ``logger`` and returns the crawl
    result on success, or None when the crawl reports failure.
    """
    logger.info(f"\nCrawling {Fore.CYAN}{url}{Style.RESET_ALL} using profile at {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="CRAWL")

    # Create browser config with the profile path
    browser_config = BrowserConfig(
        headless=True,  # Set to False if you want to see the browser window
        use_managed_browser=True,  # Required for persistent profiles
        user_data_dir=profile_path,
    )

    # Inside a coroutine the running loop is the one whose clock we want.
    loop = asyncio.get_running_loop()
    start_time = loop.time()

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Crawl the URL - You should have access to authenticated content now
        result = await crawler.arun(url)

    elapsed_time = loop.time() - start_time

    if not result.success:
        logger.error_status(url, result.error_message, tag="CRAWL")
        return None

    # Use url_status method for consistent logging
    logger.url_status(url, True, elapsed_time, tag="CRAWL")

    # Print page title or some indication of success; metadata may be absent.
    title = (result.metadata or {}).get("title", "")
    logger.success(f"Page title: {Fore.GREEN}{title}{Style.RESET_ALL}", tag="CRAWL")
    return result


async def main():
    """
    Run the demo in interactive mode or in automatic mode.

    Interactive mode hands control to ``profiler.interactive_manager`` with
    ``crawl_with_profile`` as the crawl callback. Any other answer selects
    automatic mode, which reuses the first listed profile (creating one if
    none exists) and crawls the sample URLs with it.
    """
    logger.info(f"{Fore.CYAN}Identity-Based Browsing Example with Crawl4AI{Style.RESET_ALL}", tag="DEMO")
    logger.info("This example demonstrates using profiles for authenticated browsing", tag="DEMO")

    # Strip the answer so stray whitespace does not silently select automatic mode.
    mode = input(f"{Fore.CYAN}Run in [i]nteractive mode or [a]utomatic mode? (i/a): {Style.RESET_ALL}").strip().lower()

    if mode == 'i':
        # Pass crawl_with_profile as the callback for the "crawl a website" option
        await profiler.interactive_manager(crawl_callback=crawl_with_profile)
        return

    # Automatic mode - simplified example
    profiles = profiler.list_profiles()

    if profiles:
        # Use the first (most recent) profile
        profile_path = profiles[0]["path"]
        logger.info(f"Using existing profile: {Fore.CYAN}{profiles[0]['name']}{Style.RESET_ALL}", tag="DEMO")
    else:
        # Create a new profile if none exists
        logger.info("No profiles found. Creating a new one...", tag="DEMO")
        profile_path = await profiler.create_profile()
        if not profile_path:
            logger.error("Cannot proceed without a valid profile", tag="DEMO")
            return

    # Example: Crawl an authenticated page
    urls_to_crawl = [
        "https://github.com/settings/profile",  # GitHub requires login
        # "https://twitter.com/home",  # Twitter requires login
        # "https://www.linkedin.com/feed/",  # LinkedIn requires login
    ]

    for url in urls_to_crawl:
        await crawl_with_profile(profile_path, url)
-- **Log in** or configure sites as needed, then close the browser. -- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`. -- Enjoy **persistent** sessions that reflect your real identity. -- If you only need quick, ephemeral automation, **Magic Mode** might suffice. +Crawl4AI provides a dedicated `BrowserProfiler` class for managing browser profiles, making it easy to create, list, and delete profiles for identity-based browsing. + +### Creating and Managing Profiles with BrowserProfiler + +The `BrowserProfiler` class offers a comprehensive API for browser profile management: + +```python +import asyncio +from crawl4ai import BrowserProfiler + +async def manage_profiles(): + # Create a profiler instance + profiler = BrowserProfiler() + + # Create a profile interactively - opens a browser window + profile_path = await profiler.create_profile( + profile_name="my-login-profile" # Optional: name your profile + ) + + print(f"Profile saved at: {profile_path}") + + # List all available profiles + profiles = profiler.list_profiles() + + for profile in profiles: + print(f"Profile: {profile['name']}") + print(f" Path: {profile['path']}") + print(f" Created: {profile['created']}") + print(f" Browser type: {profile['type']}") + + # Get a specific profile path by name + specific_profile = profiler.get_profile_path("my-login-profile") + + # Delete a profile when no longer needed + success = profiler.delete_profile("old-profile-name") + +asyncio.run(manage_profiles()) +``` + +**How profile creation works:** +1. A browser window opens for you to interact with +2. You log in to websites, set preferences, etc. +3. When you're done, press 'q' in the terminal to close the browser +4. The profile is saved in the Crawl4AI profiles directory +5. 
You can use the returned path with `BrowserConfig.user_data_dir` + +### Interactive Profile Management + +The `BrowserProfiler` also offers an interactive management console that guides you through profile creation, listing, and deletion: + +```python +import asyncio +from crawl4ai import BrowserProfiler, AsyncWebCrawler, BrowserConfig + +# Define a function to use a profile for crawling +async def crawl_with_profile(profile_path, url): + browser_config = BrowserConfig( + headless=True, + use_managed_browser=True, + user_data_dir=profile_path + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun(url) + return result + +async def main(): + # Create a profiler instance + profiler = BrowserProfiler() + + # Launch the interactive profile manager + # Passing the crawl function as a callback adds a "crawl with profile" option + await profiler.interactive_manager(crawl_callback=crawl_with_profile) + +asyncio.run(main()) +``` + +### Legacy Methods + +For backward compatibility, the previous methods on `ManagedBrowser` are still available, but they delegate to the new `BrowserProfiler` class: + +```python +from crawl4ai.browser_manager import ManagedBrowser + +# These methods still work but use BrowserProfiler internally +profiles = ManagedBrowser.list_profiles() +``` + +### Complete Example + +See the full example in `docs/examples/identity_based_browsing.py` for a complete demonstration of creating and using profiles for authenticated browsing using the new `BrowserProfiler` class. + +--- + +## 7. 
Summary + +- **Create** your user-data directory either: + - By launching Chrome/Chromium externally with `--user-data-dir=/some/path` + - Or by using the built-in `BrowserProfiler.create_profile()` method + - Or through the interactive interface with `profiler.interactive_manager()` +- **Log in** or configure sites as needed, then close the browser +- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True` +- **List and reuse** profiles with `BrowserProfiler.list_profiles()` +- **Manage** your profiles with the dedicated `BrowserProfiler` class +- Enjoy **persistent** sessions that reflect your real identity +- If you only need quick, ephemeral automation, **Magic Mode** might suffice **Recommended**: Always prefer a **Managed Browser** for robust, identity-based crawling and simpler interactions with complex sites. Use **Magic Mode** for quick tasks or prototypes where persistent data is unnecessary. diff --git a/pyproject.toml b/pyproject.toml index 2f73655b..af5eca74 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,7 +36,7 @@ dependencies = [ "aiofiles", "rich>=13.9.4", "cssselect>=1.2.0", - "httpx==0.27.2", + "httpx>=0.27.2", "fake-useragent>=2.0.3", "click>=8.1.7", "pyperclip>=1.8.2",