diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index b62a9797..c294eebd 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -222,7 +222,7 @@ class AsyncWebCrawler:
     async def arun(
         self,
         url: str,
-        config: Optional[CrawlerRunConfig] = None,
+        config: CrawlerRunConfig = None,
         # Legacy parameters maintained for backwards compatibility
         word_count_threshold=MIN_WORD_THRESHOLD,
         extraction_strategy: ExtractionStrategy = None,
@@ -270,7 +270,7 @@ class AsyncWebCrawler:
         Returns:
             CrawlResult: The result of crawling and processing
         """
-        crawler_config = config
+        crawler_config = config or CrawlerRunConfig()
         if not isinstance(url, str) or not url:
             raise ValueError("Invalid URL, make sure the URL is a non-empty string")
 
diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py
index 32fb67f7..b32ab4cc 100644
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -431,6 +431,88 @@ def delete_profile_interactive(profiler: BrowserProfiler):
     except (ValueError, IndexError):
         console.print("[red]Invalid selection.[/red]")
 
+async def crawl_with_profile_cli(profile_path, url):
+    """Crawl *url* in a visible managed browser using the profile at *profile_path*.
+
+    Prompts for an output format, prints the result in that format, and
+    returns the crawl result, or None if crawling or printing raised.
+    """
+    console.print(f"[cyan]Crawling [bold]{url}[/bold] using profile at [bold]{profile_path}[/bold][/cyan]")
+
+    # Create browser config with the profile
+    browser_cfg = BrowserConfig(
+        headless=False,  # Set to False to see the browser in action
+        use_managed_browser=True,
+        user_data_dir=profile_path
+    )
+
+    # Default crawler config
+    crawler_cfg = CrawlerRunConfig()
+
+    # Ask for output format
+    output_format = Prompt.ask(
+        "[cyan]Output format[/cyan]",
+        choices=["all", "json", "markdown", "md", "title"],
+        default="markdown"
+    )
+
+    try:
+        # Run the crawler
+        result = await run_crawler(url, browser_cfg, crawler_cfg, True)
+
+        # Handle output; markup=False stops Rich from parsing "[...]" inside crawled text
+        if output_format == "all":
+            console.print(json.dumps(result.model_dump(), indent=2), markup=False)
+        elif output_format == "json":
+            console.print(json.dumps(json.loads(result.extracted_content), indent=2), markup=False)
+        elif output_format in ["markdown", "md"]:
+            console.print(result.markdown.raw_markdown, markup=False)
+        elif output_format == "title":
+            console.print(result.metadata.get("title", "No title found"), markup=False)
+
+        console.print(f"[green]Successfully crawled[/green] {url}")
+        return result
+    except Exception as e:
+        console.print(f"[red]Error crawling:[/red] {str(e)}")
+        return None
+
+async def use_profile_to_crawl():
+    """Interactively pick a saved profile and a URL, then crawl it.
+
+    Prints a hint and does nothing else when no profiles exist or no URL is entered.
+    """
+    profiler = BrowserProfiler()
+    profiles = profiler.list_profiles()
+
+    if not profiles:
+        console.print("[yellow]No profiles found. Create one first.[/yellow]")
+        return
+
+    # Display profiles
+    display_profiles_table(profiles)
+
+    # Get profile selection
+    idx = Prompt.ask(
+        "[cyan]Enter number of profile to use[/cyan]",
+        console=console,
+        choices=[str(i+1) for i in range(len(profiles))],
+        show_choices=False
+    )
+
+    try:
+        idx = int(idx) - 1
+        profile = profiles[idx]
+
+        # Get URL
+        url = Prompt.ask("[cyan]Enter URL to crawl[/cyan]")
+        if url:
+            # Crawl with the selected profile
+            await crawl_with_profile_cli(profile["path"], url)
+        else:
+            console.print("[red]No URL provided[/red]")
+    except (ValueError, IndexError):
+        console.print("[red]Invalid selection[/red]")
+
 async def manage_profiles():
     """Interactive profile management menu"""
     profiler = BrowserProfiler()
@@ -439,14 +521,15 @@ async def manage_profiles():
         "1": "List profiles",
         "2": "Create new profile",
         "3": "Delete profile",
-        "4": "Exit",
+        "4": "Use a profile to crawl a website",
+        "5": "Exit",
     }
 
     while True:
         console.print(Panel("[bold cyan]Browser Profile Manager[/bold cyan]", border_style="cyan"))
 
         for key, value in options.items():
-            color = "green" if key == "1" else "yellow" if key == "2" else "red" if key == "3" else "cyan"
+            color = {"1": "green", "2": "yellow", "3": "red", "4": "blue"}.get(key, "cyan")
             console.print(f"[{color}]{key}[/{color}]. {value}")
 
         choice = Prompt.ask("Enter choice", choices=list(options.keys()), default="1")
@@ -463,8 +546,12 @@ async def manage_profiles():
         elif choice == "3":
             # Delete profile
             delete_profile_interactive(profiler)
-            
+
         elif choice == "4":
+            # Use profile to crawl
+            await use_profile_to_crawl()
+
+        elif choice == "5":
             # Exit
             console.print("[cyan]Exiting profile manager.[/cyan]")
             break
diff --git a/crawl4ai/deep_crawling/base_strategy.py b/crawl4ai/deep_crawling/base_strategy.py
index 222338a3..e1b3fe6b 100644
--- a/crawl4ai/deep_crawling/base_strategy.py
+++ b/crawl4ai/deep_crawling/base_strategy.py
@@ -16,7 +16,7 @@ class DeepCrawlDecorator:
 
     def __call__(self, original_arun):
         @wraps(original_arun)
-        async def wrapped_arun(url: str, config: Optional[CrawlerRunConfig] = None, **kwargs):
+        async def wrapped_arun(url: str, config: CrawlerRunConfig = None, **kwargs):
             # If deep crawling is already active, call the original method to avoid recursion.
             if config and config.deep_crawl_strategy and not self.deep_crawl_active.get():
                 token = self.deep_crawl_active.set(True)
diff --git a/docs/examples/identity_based_browsing.py b/docs/examples/identity_based_browsing.py
index b6315e46..01596948 100644
--- a/docs/examples/identity_based_browsing.py
+++ b/docs/examples/identity_based_browsing.py
@@ -32,7 +32,7 @@ async def crawl_with_profile(profile_path, url):
 
     # Create browser config with the profile path
     browser_config = BrowserConfig(
-        headless=True,  # Set to False if you want to see the browser window
+        headless=False,  # Visible browser window; set to True to run headless
         use_managed_browser=True,  # Required for persistent profiles
         user_data_dir=profile_path
     )