diff --git a/README.md b/README.md
index 97a907a1..f70eb264 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
 [✨ Check out latest update v0.7.0](#-recent-updates)
 
-🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://docs.crawl4ai.com/blog/release-v0.7.0)
+🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.0.md)
 🤓 My Personal Story
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 9fdb0fe2..9f1ed38d 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -824,7 +824,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         except Error:
             visibility_info = await self.check_visibility(page)
 
-            if self.browser_config.config.verbose:
+            if self.browser_config.verbose:
                 self.logger.debug(
                     message="Body visibility info: {info}",
                     tag="DEBUG",
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index a2c6cf9f..ed34ea50 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -502,9 +502,12 @@ class AsyncWebCrawler:
             metadata = result.get("metadata", {})
         else:
             cleaned_html = sanitize_input_encode(result.cleaned_html)
-            media = result.media.model_dump()
-            tables = media.pop("tables", [])
-            links = result.links.model_dump()
+            # media = result.media.model_dump()
+            # tables = media.pop("tables", [])
+            # links = result.links.model_dump()
+            media = result.media.model_dump() if hasattr(result.media, 'model_dump') else result.media
+            tables = media.pop("tables", []) if isinstance(media, dict) else []
+            links = result.links.model_dump() if hasattr(result.links, 'model_dump') else result.links
             metadata = result.metadata
 
         fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
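The `async_webcrawler.py` hunk above swaps unconditional `model_dump()` calls for duck-typed guards, because `result.media` and `result.links` are not guaranteed to be Pydantic models at this point. A minimal sketch of that normalization pattern, isolated from the crawler; `to_plain_dict` is a hypothetical helper name, not part of the crawl4ai API:

```python
# Sketch of the duck-typed normalization used in the hunk above.
# `to_plain_dict` is a hypothetical helper, not a crawl4ai API.
from typing import Any, Dict


def to_plain_dict(value: Any) -> Dict:
    """Return a plain dict whether `value` is a Pydantic v2 model or already a dict."""
    if hasattr(value, "model_dump"):   # Pydantic v2 models expose model_dump()
        return value.model_dump()
    return value if isinstance(value, dict) else {}


# Behaves the same for both shapes a result's `media` field might take:
media = to_plain_dict({"images": [], "tables": [{"caption": "prices"}]})
tables = media.pop("tables", [])   # -> [{'caption': 'prices'}]
```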
diff --git a/crawl4ai/cli.py b/crawl4ai/cli.py
index a5fb7dbb..51b53500 100644
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -27,7 +27,10 @@ from crawl4ai import (
     PruningContentFilter,
     BrowserProfiler,
     DefaultMarkdownGenerator,
-    LLMConfig
+    LLMConfig,
+    BFSDeepCrawlStrategy,
+    DFSDeepCrawlStrategy,
+    BestFirstCrawlingStrategy,
 )
 from crawl4ai.config import USER_SETTINGS
 from litellm import completion
@@ -1014,9 +1017,11 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
 @click.option("--question", "-q", help="Ask a question about the crawled content")
 @click.option("--verbose", "-v", is_flag=True)
 @click.option("--profile", "-p", help="Use a specific browser profile (by name)")
+@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy (bfs, dfs, or best-first)")
+@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
 def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
               extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
-              output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
+              output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
     """Crawl a website and extract content
 
     Simple Usage:
@@ -1156,6 +1161,27 @@ Always return valid, properly formatted JSON."""
 
         crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy()
 
+        # Handle deep crawling configuration
+        if deep_crawl:
+            if deep_crawl == "bfs":
+                crawler_cfg.deep_crawl_strategy = BFSDeepCrawlStrategy(
+                    max_depth=3,
+                    max_pages=max_pages
+                )
+            elif deep_crawl == "dfs":
+                crawler_cfg.deep_crawl_strategy = DFSDeepCrawlStrategy(
+                    max_depth=3,
+                    max_pages=max_pages
+                )
+            elif deep_crawl == "best-first":
+                crawler_cfg.deep_crawl_strategy = BestFirstCrawlingStrategy(
+                    max_depth=3,
+                    max_pages=max_pages
+                )
+
+            if verbose:
+                console.print(f"[green]Deep crawling enabled:[/green] {deep_crawl} strategy, max {max_pages} pages")
+
         config = get_global_config()
         browser_cfg.verbose = config.get("VERBOSE", False)
@@ -1170,39 +1196,60 @@ Always return valid, properly formatted JSON."""
             verbose
         )
 
+        # Handle deep crawl results (list) vs single result
+        if isinstance(result, list):
+            if len(result) == 0:
+                click.echo("No results found during deep crawling")
+                return
+            # Use the first result for question answering and output
+            main_result = result[0]
+            all_results = result
+        else:
+            # Single result from regular crawling
+            main_result = result
+            all_results = [result]
+
         # Handle question
         if question:
             provider, token = setup_llm_config()
-            markdown = result.markdown.raw_markdown
+            markdown = main_result.markdown.raw_markdown
             anyio.run(stream_llm_response, url, markdown, question, provider, token)
             return
 
         # Handle output
         if not output_file:
             if output == "all":
-                click.echo(json.dumps(result.model_dump(), indent=2))
+                if isinstance(result, list):
+                    output_data = [r.model_dump() for r in all_results]
+                    click.echo(json.dumps(output_data, indent=2))
+                else:
+                    click.echo(json.dumps(main_result.model_dump(), indent=2))
             elif output == "json":
-                print(result.extracted_content)
-                extracted_items = json.loads(result.extracted_content)
+                print(main_result.extracted_content)
+                extracted_items = json.loads(main_result.extracted_content)
                 click.echo(json.dumps(extracted_items, indent=2))
             elif output in ["markdown", "md"]:
-                click.echo(result.markdown.raw_markdown)
+                click.echo(main_result.markdown.raw_markdown)
             elif output in ["markdown-fit", "md-fit"]:
-                click.echo(result.markdown.fit_markdown)
+                click.echo(main_result.markdown.fit_markdown)
         else:
             if output == "all":
                 with open(output_file, "w") as f:
-                    f.write(json.dumps(result.model_dump(), indent=2))
+                    if isinstance(result, list):
+                        output_data = [r.model_dump() for r in all_results]
+                        f.write(json.dumps(output_data, indent=2))
+                    else:
+                        f.write(json.dumps(main_result.model_dump(), indent=2))
             elif output == "json":
                 with open(output_file, "w") as f:
-                    f.write(result.extracted_content)
+                    f.write(main_result.extracted_content)
             elif output in ["markdown", "md"]:
                 with open(output_file, "w") as f:
-                    f.write(result.markdown.raw_markdown)
+                    f.write(main_result.markdown.raw_markdown)
             elif output in ["markdown-fit", "md-fit"]:
                 with open(output_file, "w") as f:
-                    f.write(result.markdown.fit_markdown)
+                    f.write(main_result.markdown.fit_markdown)
 
     except Exception as e:
         raise click.ClickException(str(e))
@@ -1354,9 +1401,11 @@ def profiles_cmd():
 @click.option("--question", "-q", help="Ask a question about the crawled content")
 @click.option("--verbose", "-v", is_flag=True)
 @click.option("--profile", "-p", help="Use a specific browser profile (by name)")
+@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy")
+@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
 def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
             extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
-            output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
+            output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
     """Crawl4AI CLI - Web content extraction tool
 
     Simple Usage:
@@ -1406,7 +1455,9 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
         bypass_cache=bypass_cache,
         question=question,
         verbose=verbose,
-        profile=profile
+        profile=profile,
+        deep_crawl=deep_crawl,
+        max_pages=max_pages
     )
 
 def main():
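For context on what the new `--deep-crawl` and `--max-pages` flags wire up: the CLI simply attaches one of the imported strategies to the run config, so a rough Python-API equivalent of `--deep-crawl bfs --max-pages 10` might look like the sketch below. The hard-coded `max_depth=3` mirrors the hunk above; the `CrawlerRunConfig` keyword and the list-shaped return are assumptions inferred from how the CLI code branches on `isinstance(result, list)`.

```python
# Rough Python-API equivalent of `--deep-crawl bfs --max-pages 10` (a sketch,
# not the CLI's exact code path).
import asyncio

from crawl4ai import AsyncWebCrawler, BFSDeepCrawlStrategy, CrawlerRunConfig


async def main() -> None:
    config = CrawlerRunConfig(
        # Same defaults the CLI uses: depth capped at 3, pages capped by --max-pages
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=3, max_pages=10),
    )
    async with AsyncWebCrawler() as crawler:
        # With a deep-crawl strategy attached, arun() returns a list of results,
        # which is why the CLI code above branches on isinstance(result, list).
        results = await crawler.arun("https://example.com", config=config)
        for r in results:
            print(r.url, len(r.markdown.raw_markdown or ""))


if __name__ == "__main__":
    asyncio.run(main())
```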
diff --git a/crawl4ai/utils.py b/crawl4ai/utils.py
index 8735dee0..b2001cdd 100644
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -3342,7 +3342,13 @@ async def get_text_embeddings(
     # Default: use sentence-transformers
     else:
         # Lazy load to avoid importing heavy libraries unless needed
-        from sentence_transformers import SentenceTransformer
+        try:
+            from sentence_transformers import SentenceTransformer
+        except ImportError:
+            raise ImportError(
+                "sentence-transformers is required for local embeddings. "
+                "Install it with: pip install 'crawl4ai[transformer]' or pip install sentence-transformers"
+            )
 
         # Cache the model in function attribute to avoid reloading
         if not hasattr(get_text_embeddings, '_models'):
diff --git a/deploy/docker/api.py b/deploy/docker/api.py
index b728acd1..0fcb96b7 100644
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -5,6 +5,7 @@ from typing import List, Tuple, Dict
 from functools import partial
 from uuid import uuid4
 from datetime import datetime
+from base64 import b64encode
 import logging
 from typing import Optional, AsyncGenerator
@@ -371,6 +372,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
             server_memory_mb = _get_memory_mb()
             result_dict = result.model_dump()
             result_dict['server_memory_mb'] = server_memory_mb
+            # If PDF exists, encode it to base64
+            if result_dict.get('pdf') is not None:
+                result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
             logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
             data = json.dumps(result_dict, default=datetime_handler) + "\n"
             yield data.encode('utf-8')
@@ -443,10 +447,19 @@ async def handle_crawl_request(
         mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta
         peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory
         logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB")
-        
+
+        # Process results to handle PDF bytes
+        processed_results = []
+        for result in results:
+            result_dict = result.model_dump()
+            # If PDF exists, encode it to base64
+            if result_dict.get('pdf') is not None:
+                result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
+            processed_results.append(result_dict)
+
         return {
             "success": True,
-            "results": [result.model_dump() for result in results],
+            "results": processed_results,
             "server_processing_time_s": end_time - start_time,
             "server_memory_delta_mb": mem_delta_mb,
             "server_peak_memory_mb": peak_mem_mb
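Since the `pdf` field is now base64-encoded in both the streaming and batch responses above, Docker API clients have to decode it before writing a file. A minimal client-side sketch follows; the `save_pdfs` helper and output naming are illustrative, not part of the server API, and only the `pdf` key is taken from the diff.

```python
# Client-side counterpart to the base64 encoding added above.
# `save_pdfs` and the output naming are illustrative only.
from base64 import b64decode
from pathlib import Path
from typing import Dict, List


def save_pdfs(results: List[Dict], out_dir: str = "pdfs") -> None:
    """Decode any base64 `pdf` fields in a crawl response and write them to disk."""
    out = Path(out_dir)
    out.mkdir(parents=True, exist_ok=True)
    for i, result in enumerate(results):
        pdf_b64 = result.get("pdf")
        if pdf_b64:
            (out / f"result_{i}.pdf").write_bytes(b64decode(pdf_b64))


# Example with a payload shaped like the server's "results" list:
save_pdfs([{"url": "https://example.com", "pdf": None}])
```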
diff --git a/docs/md_v2/core/c4a-script.md b/docs/md_v2/core/c4a-script.md
index 7f5006bf..d92e426e 100644
--- a/docs/md_v2/core/c4a-script.md
+++ b/docs/md_v2/core/c4a-script.md
@@ -52,11 +52,9 @@ That's it! In just a few lines, you've automated a complete search workflow.
 Want to learn by doing? We've got you covered:
 
-**🚀 [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)** - Try C4A-Script in your browser right now!
+**🚀 [Live Demo](https://docs.crawl4ai.com/apps/c4a-script/)** - Try C4A-Script in your browser right now!
 
-**📁 [Tutorial Examples](/examples/c4a_script/)** - Complete examples with source code
-
-**🛠️ [Local Tutorial](/examples/c4a_script/tutorial/)** - Run the interactive tutorial on your machine
+**📁 [Tutorial Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/c4a_script/)** - Complete examples with source code
 
 ### Running the Tutorial Locally
diff --git a/pyproject.toml b/pyproject.toml
index a582d430..3d70a68d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,7 +44,6 @@ dependencies = [
     "brotli>=1.1.0",
     "humanize>=4.10.0",
     "lark>=1.2.2",
-    "sentence-transformers>=2.2.0",
     "alphashape>=1.3.1",
     "shapely>=2.0.0"
 ]
@@ -62,8 +61,8 @@ classifiers = [
 [project.optional-dependencies]
 pdf = ["PyPDF2"]
 torch = ["torch", "nltk", "scikit-learn"]
-transformer = ["transformers", "tokenizers"]
-cosine = ["torch", "transformers", "nltk"]
+transformer = ["transformers", "tokenizers", "sentence-transformers"]
+cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
 sync = ["selenium"]
 all = [
     "PyPDF2",
@@ -72,8 +71,8 @@ all = [
     "scikit-learn",
     "transformers",
     "tokenizers",
-    "selenium",
-    "PyPDF2"
+    "sentence-transformers",
+    "selenium"
 ]
 
 [project.scripts]
diff --git a/requirements.txt b/requirements.txt
index 37fc7959..001d090d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,7 +24,6 @@ cssselect>=1.2.0
 chardet>=5.2.0
 brotli>=1.1.0
 httpx[http2]>=0.27.2
-sentence-transformers>=2.2.0
 alphashape>=1.3.1
 shapely>=2.0.0
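One practical consequence of dropping `sentence-transformers` from the base dependencies: local embeddings in `get_text_embeddings` now require opting in to one of the extras updated above, or installing the package directly. A small availability check, illustrative only; the extra name comes from the `pyproject.toml` hunk.

```python
# Illustrative check for the now-optional dependency. Install with either
#   pip install "crawl4ai[transformer]"
# or
#   pip install sentence-transformers
import importlib.util

if importlib.util.find_spec("sentence_transformers") is None:
    print("sentence-transformers is not installed; crawl4ai's local embedding "
          "path (get_text_embeddings) will raise the ImportError added above.")
else:
    print("sentence-transformers is available; local embeddings will work.")
```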