feat(profiles): add CLI command for crawling with browser profiles
Adds new functionality to crawl websites using saved browser profiles directly from the CLI. This includes:
- New CLI option to use profiles for crawling
- Helper functions for profile-based crawling
- Fixed type hints for config parameters
- Updated example to show browser window by default

This makes it easier for users to leverage saved browser profiles for crawling without writing code.
This commit is contained in:
@@ -222,7 +222,7 @@ class AsyncWebCrawler:
|
|||||||
async def arun(
|
async def arun(
|
||||||
self,
|
self,
|
||||||
url: str,
|
url: str,
|
||||||
config: Optional[CrawlerRunConfig] = None,
|
config: CrawlerRunConfig = None,
|
||||||
# Legacy parameters maintained for backwards compatibility
|
# Legacy parameters maintained for backwards compatibility
|
||||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||||
extraction_strategy: ExtractionStrategy = None,
|
extraction_strategy: ExtractionStrategy = None,
|
||||||
@@ -270,7 +270,7 @@ class AsyncWebCrawler:
|
|||||||
Returns:
|
Returns:
|
||||||
CrawlResult: The result of crawling and processing
|
CrawlResult: The result of crawling and processing
|
||||||
"""
|
"""
|
||||||
crawler_config = config
|
crawler_config = config or CrawlerRunConfig()
|
||||||
if not isinstance(url, str) or not url:
|
if not isinstance(url, str) or not url:
|
||||||
raise ValueError("Invalid URL, make sure the URL is a non-empty string")
|
raise ValueError("Invalid URL, make sure the URL is a non-empty string")
|
||||||
|
|
||||||
|
|||||||
@@ -431,6 +431,81 @@ def delete_profile_interactive(profiler: BrowserProfiler):
|
|||||||
except (ValueError, IndexError):
|
except (ValueError, IndexError):
|
||||||
console.print("[red]Invalid selection.[/red]")
|
console.print("[red]Invalid selection.[/red]")
|
||||||
|
|
||||||
|
async def crawl_with_profile_cli(profile_path, url):
    """Crawl ``url`` with a saved browser profile and print the result.

    Prompts for an output format, runs the crawl in a visible (non-headless)
    managed browser whose user data directory is ``profile_path``, then
    prints the view of the result that was requested.

    Args:
        profile_path: Passed to ``BrowserConfig`` as ``user_data_dir``.
        url: Address to crawl.

    Returns:
        The object returned by ``run_crawler`` on success, or ``None`` if
        any exception was raised while crawling or printing the output.
    """
    console.print(f"[cyan]Crawling [bold]{url}[/bold] using profile at [bold]{profile_path}[/bold][/cyan]")

    # Create browser config with the profile
    browser_cfg = BrowserConfig(
        headless=False,  # Set to False to see the browser in action
        use_managed_browser=True,
        user_data_dir=profile_path,
    )

    # Default crawler config
    crawler_cfg = CrawlerRunConfig()

    # Ask for output format on the shared console, like the other prompts
    # in this module that pass console=console.
    output_format = Prompt.ask(
        "[cyan]Output format[/cyan]",
        console=console,
        choices=["all", "json", "markdown", "md", "title"],
        default="markdown",
    )

    try:
        # NOTE(review): the positional True is presumably a verbosity flag —
        # confirm against run_crawler's signature.
        result = await run_crawler(url, browser_cfg, crawler_cfg, True)

        if output_format == "all":
            console.print(json.dumps(result.model_dump(), indent=2))
        elif output_format == "json":
            extracted = result.extracted_content
            if extracted:
                console.print(json.dumps(json.loads(extracted), indent=2))
            else:
                # json.loads(None) raises TypeError, which would otherwise be
                # reported below as a crawl failure. The crawl here runs with
                # a default CrawlerRunConfig, so extracted content may well
                # be absent — presumably it is only set when an extraction
                # strategy is configured; verify.
                console.print("[yellow]No extracted content available for JSON output.[/yellow]")
        elif output_format in ("markdown", "md"):
            console.print(result.markdown.raw_markdown)
        elif output_format == "title":
            # Guard against a missing metadata mapping.
            metadata = result.metadata or {}
            console.print(metadata.get("title", "No title found"))

        console.print(f"[green]Successfully crawled[/green] {url}")
        return result
    except Exception as e:  # CLI boundary: report the failure instead of crashing the menu
        console.print(f"[red]Error crawling:[/red] {str(e)}")
        return None
|
||||||
|
|
||||||
|
async def use_profile_to_crawl():
    """Interactively pick a saved browser profile and crawl a URL with it.

    Lists the profiles reported by ``BrowserProfiler.list_profiles()``,
    asks for a 1-based profile number and a URL, then passes the selected
    profile's ``"path"`` entry to ``crawl_with_profile_cli``.

    Returns:
        None. Returns early, after printing a message, when there are no
        profiles, the selection is invalid, or no URL is entered.
    """
    profiler = BrowserProfiler()
    profiles = profiler.list_profiles()

    if not profiles:
        console.print("[yellow]No profiles found. Create one first.[/yellow]")
        return

    # Display profiles
    display_profiles_table(profiles)

    # Get profile selection (1-based numbers shown to the user)
    selection = Prompt.ask(
        "[cyan]Enter number of profile to use[/cyan]",
        console=console,
        choices=[str(n) for n in range(1, len(profiles) + 1)],
        show_choices=False,
    )

    # Keep the try limited to the lookup so that a ValueError/IndexError
    # raised later by the crawl is not misreported as an invalid selection.
    try:
        profile = profiles[int(selection) - 1]
    except (ValueError, IndexError):
        console.print("[red]Invalid selection[/red]")
        return

    # Get URL
    url = Prompt.ask("[cyan]Enter URL to crawl[/cyan]", console=console)
    if not url:
        console.print("[red]No URL provided[/red]")
        return

    # Crawl with the selected profile
    await crawl_with_profile_cli(profile["path"], url)
|
||||||
|
|
||||||
async def manage_profiles():
|
async def manage_profiles():
|
||||||
"""Interactive profile management menu"""
|
"""Interactive profile management menu"""
|
||||||
profiler = BrowserProfiler()
|
profiler = BrowserProfiler()
|
||||||
@@ -439,14 +514,15 @@ async def manage_profiles():
|
|||||||
"1": "List profiles",
|
"1": "List profiles",
|
||||||
"2": "Create new profile",
|
"2": "Create new profile",
|
||||||
"3": "Delete profile",
|
"3": "Delete profile",
|
||||||
"4": "Exit",
|
"4": "Use a profile to crawl a website",
|
||||||
|
"5": "Exit",
|
||||||
}
|
}
|
||||||
|
|
||||||
while True:
|
while True:
|
||||||
console.print(Panel("[bold cyan]Browser Profile Manager[/bold cyan]", border_style="cyan"))
|
console.print(Panel("[bold cyan]Browser Profile Manager[/bold cyan]", border_style="cyan"))
|
||||||
|
|
||||||
for key, value in options.items():
|
for key, value in options.items():
|
||||||
color = "green" if key == "1" else "yellow" if key == "2" else "red" if key == "3" else "cyan"
|
color = "green" if key == "1" else "yellow" if key == "2" else "red" if key == "3" else "blue" if key == "4" else "cyan"
|
||||||
console.print(f"[{color}]{key}[/{color}]. {value}")
|
console.print(f"[{color}]{key}[/{color}]. {value}")
|
||||||
|
|
||||||
choice = Prompt.ask("Enter choice", choices=list(options.keys()), default="1")
|
choice = Prompt.ask("Enter choice", choices=list(options.keys()), default="1")
|
||||||
@@ -463,8 +539,12 @@ async def manage_profiles():
|
|||||||
elif choice == "3":
|
elif choice == "3":
|
||||||
# Delete profile
|
# Delete profile
|
||||||
delete_profile_interactive(profiler)
|
delete_profile_interactive(profiler)
|
||||||
|
|
||||||
elif choice == "4":
|
elif choice == "4":
|
||||||
|
# Use profile to crawl
|
||||||
|
await use_profile_to_crawl()
|
||||||
|
|
||||||
|
elif choice == "5":
|
||||||
# Exit
|
# Exit
|
||||||
console.print("[cyan]Exiting profile manager.[/cyan]")
|
console.print("[cyan]Exiting profile manager.[/cyan]")
|
||||||
break
|
break
|
||||||
|
|||||||
@@ -16,7 +16,7 @@ class DeepCrawlDecorator:
|
|||||||
|
|
||||||
def __call__(self, original_arun):
|
def __call__(self, original_arun):
|
||||||
@wraps(original_arun)
|
@wraps(original_arun)
|
||||||
async def wrapped_arun(url: str, config: Optional[CrawlerRunConfig] = None, **kwargs):
|
async def wrapped_arun(url: str, config: CrawlerRunConfig = None, **kwargs):
|
||||||
# If deep crawling is already active, call the original method to avoid recursion.
|
# If deep crawling is already active, call the original method to avoid recursion.
|
||||||
if config and config.deep_crawl_strategy and not self.deep_crawl_active.get():
|
if config and config.deep_crawl_strategy and not self.deep_crawl_active.get():
|
||||||
token = self.deep_crawl_active.set(True)
|
token = self.deep_crawl_active.set(True)
|
||||||
|
|||||||
@@ -32,7 +32,7 @@ async def crawl_with_profile(profile_path, url):
|
|||||||
|
|
||||||
# Create browser config with the profile path
|
# Create browser config with the profile path
|
||||||
browser_config = BrowserConfig(
|
browser_config = BrowserConfig(
|
||||||
headless=True, # Set to False if you want to see the browser window
|
headless=False, # Set to False if you want to see the browser window
|
||||||
use_managed_browser=True, # Required for persistent profiles
|
use_managed_browser=True, # Required for persistent profiles
|
||||||
user_data_dir=profile_path
|
user_data_dir=profile_path
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user