From ee25c771d8d8512e1359ebd929448c68e414171c Mon Sep 17 00:00:00 2001
From: ntohidi
Date: Wed, 2 Jul 2025 14:07:23 +0200
Subject: [PATCH] feat(cli): add deep crawling options with configurable
 strategies and max pages. ref #874

---
 crawl4ai/async_webcrawler.py |  9 ++--
 crawl4ai/cli.py              | 81 +++++++++++++++++++++++++++++-------
 2 files changed, 72 insertions(+), 18 deletions(-)

diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index ce9a0633..3303bd22 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -502,9 +502,12 @@ class AsyncWebCrawler:
             metadata = result.get("metadata", {})
         else:
             cleaned_html = sanitize_input_encode(result.cleaned_html)
-            media = result.media.model_dump()
-            tables = media.pop("tables", [])
-            links = result.links.model_dump()
+            # media = result.media.model_dump()
+            # tables = media.pop("tables", [])
+            # links = result.links.model_dump()
+            media = result.media.model_dump() if hasattr(result.media, 'model_dump') else result.media
+            tables = media.pop("tables", []) if isinstance(media, dict) else []
+            links = result.links.model_dump() if hasattr(result.links, 'model_dump') else result.links
             metadata = result.metadata
 
             fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py
index 33b313bc..51b53500 100644
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -27,7 +27,10 @@ from crawl4ai import (
     PruningContentFilter,
     BrowserProfiler,
     DefaultMarkdownGenerator,
-    LLMConfig
+    LLMConfig,
+    BFSDeepCrawlStrategy,
+    DFSDeepCrawlStrategy,
+    BestFirstCrawlingStrategy,
 )
 from crawl4ai.config import USER_SETTINGS
 from litellm import completion
@@ -1010,13 +1013,15 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
 @click.option("--crawler", "-c", type=str, callback=parse_key_values, help="Crawler parameters as key1=value1,key2=value2")
 @click.option("--output", "-o", type=click.Choice(["all", "json", "markdown", "md", "markdown-fit", "md-fit"]), default="all")
 @click.option("--output-file", "-O", type=click.Path(), help="Output file path (default: stdout)")
-@click.option("--bypass-cache", "-b", is_flag=True, default=True, help="Bypass cache when crawling")
+@click.option("--bypass-cache", "-bc", is_flag=True, default=True, help="Bypass cache when crawling")
 @click.option("--question", "-q", help="Ask a question about the crawled content")
 @click.option("--verbose", "-v", is_flag=True)
 @click.option("--profile", "-p", help="Use a specific browser profile (by name)")
+@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy (bfs, dfs, or best-first)")
+@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
 def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str, extraction_config: str,
               json_extract: str, schema: str, browser: Dict, crawler: Dict,
-              output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
+              output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
     """Crawl a website and extract content
 
 Simple Usage:
@@ -1156,6 +1161,27 @@ Always return valid, properly formatted JSON."""
 
         crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy()
 
+        # Handle deep crawling configuration
+        if deep_crawl:
+            if deep_crawl == "bfs":
+                crawler_cfg.deep_crawl_strategy = BFSDeepCrawlStrategy(
+                    max_depth=3,
+                    max_pages=max_pages
+                )
+            elif deep_crawl == "dfs":
+                crawler_cfg.deep_crawl_strategy = DFSDeepCrawlStrategy(
+                    max_depth=3,
+                    max_pages=max_pages
+                )
+            elif deep_crawl == "best-first":
+                crawler_cfg.deep_crawl_strategy = BestFirstCrawlingStrategy(
+                    max_depth=3,
+                    max_pages=max_pages
+                )
+
+            if verbose:
+                console.print(f"[green]Deep crawling enabled:[/green] {deep_crawl} strategy, max {max_pages} pages")
+
         config = get_global_config()
 
         browser_cfg.verbose = config.get("VERBOSE", False)
@@ -1170,39 +1196,60 @@ Always return valid, properly formatted JSON."""
             verbose
         )
 
+        # Handle deep crawl results (list) vs single result
+        if isinstance(result, list):
+            if len(result) == 0:
+                click.echo("No results found during deep crawling")
+                return
+            # Use the first result for question answering and output
+            main_result = result[0]
+            all_results = result
+        else:
+            # Single result from regular crawling
+            main_result = result
+            all_results = [result]
+
         # Handle question
         if question:
             provider, token = setup_llm_config()
-            markdown = result.markdown.raw_markdown
+            markdown = main_result.markdown.raw_markdown
             anyio.run(stream_llm_response, url, markdown, question, provider, token)
             return
 
 
         # Handle output
         if not output_file:
             if output == "all":
-                click.echo(json.dumps(result.model_dump(), indent=2))
+                if isinstance(result, list):
+                    output_data = [r.model_dump() for r in all_results]
+                    click.echo(json.dumps(output_data, indent=2))
+                else:
+                    click.echo(json.dumps(main_result.model_dump(), indent=2))
             elif output == "json":
-                print(result.extracted_content)
-                extracted_items = json.loads(result.extracted_content)
+                print(main_result.extracted_content)
+                extracted_items = json.loads(main_result.extracted_content)
                 click.echo(json.dumps(extracted_items, indent=2))
             elif output in ["markdown", "md"]:
-                click.echo(result.markdown.raw_markdown)
+                click.echo(main_result.markdown.raw_markdown)
             elif output in ["markdown-fit", "md-fit"]:
-                click.echo(result.markdown.fit_markdown)
+                click.echo(main_result.markdown.fit_markdown)
         else:
             if output == "all":
                 with open(output_file, "w") as f:
-                    f.write(json.dumps(result.model_dump(), indent=2))
+                    if isinstance(result, list):
+                        output_data = [r.model_dump() for r in all_results]
+                        f.write(json.dumps(output_data, indent=2))
+                    else:
+                        f.write(json.dumps(main_result.model_dump(), indent=2))
             elif output == "json":
                 with open(output_file, "w") as f:
-                    f.write(result.extracted_content)
+                    f.write(main_result.extracted_content)
             elif output in ["markdown", "md"]:
                 with open(output_file, "w") as f:
-                    f.write(result.markdown.raw_markdown)
+                    f.write(main_result.markdown.raw_markdown)
             elif output in ["markdown-fit", "md-fit"]:
                 with open(output_file, "w") as f:
-                    f.write(result.markdown.fit_markdown)
+                    f.write(main_result.markdown.fit_markdown)
 
     except Exception as e:
         raise click.ClickException(str(e))
@@ -1354,9 +1401,11 @@ def profiles_cmd():
 @click.option("--question", "-q", help="Ask a question about the crawled content")
 @click.option("--verbose", "-v", is_flag=True)
 @click.option("--profile", "-p", help="Use a specific browser profile (by name)")
+@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy")
+@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
 def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str, extraction_config: str,
             json_extract: str, schema: str, browser: Dict, crawler: Dict,
-            output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
+            output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
     """Crawl4AI CLI - Web content extraction tool
 
 Simple Usage:
@@ -1406,7 +1455,9 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
         bypass_cache=bypass_cache,
         question=question,
         verbose=verbose,
-        profile=profile
+        profile=profile,
+        deep_crawl=deep_crawl,
+        max_pages=max_pages
     )
 
 def main():