Merge remote-tracking branch 'origin/develop'

2025-07-25 15:35:53 +08:00
parent 66a979ad11 9546773a07
commit 84e462d9f8
9 changed files with 101 additions and 32 deletions
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant

 [✨ Check out latest update v0.7.0](#-recent-updates)

-🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://docs.crawl4ai.com/blog/release-v0.7.0)
+🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.0.md)

 <details>
 <summary>🤓 <strong>My Personal Story</strong></summary>
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -824,7 +824,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
            except Error:
                visibility_info = await self.check_visibility(page)

-                if self.browser_config.config.verbose:
+                if self.browser_config.verbose:
                    self.logger.debug(
                        message="Body visibility info: {info}",
                        tag="DEBUG",
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -502,9 +502,12 @@ class AsyncWebCrawler:
            metadata = result.get("metadata", {})
        else:
            cleaned_html = sanitize_input_encode(result.cleaned_html)
-            media = result.media.model_dump()
-            tables = media.pop("tables", [])
-            links = result.links.model_dump()
+            # media = result.media.model_dump()
+            # tables = media.pop("tables", [])
+            # links = result.links.model_dump()
+            media = result.media.model_dump() if hasattr(result.media, 'model_dump') else result.media
+            tables = media.pop("tables", []) if isinstance(media, dict) else []
+            links = result.links.model_dump() if hasattr(result.links, 'model_dump') else result.links
            metadata = result.metadata

        fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
--- a/crawl4ai/cli.py
+++ b/crawl4ai/cli.py
@@ -27,7 +27,10 @@ from crawl4ai import (
    PruningContentFilter,
    BrowserProfiler,
    DefaultMarkdownGenerator,
-    LLMConfig
+    LLMConfig,
+    BFSDeepCrawlStrategy,
+    DFSDeepCrawlStrategy,
+    BestFirstCrawlingStrategy,
 )
 from crawl4ai.config import USER_SETTINGS
 from litellm import completion
@@ -1014,9 +1017,11 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
+@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy (bfs, dfs, or best-first)")
+@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
 def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str, 
           extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
-           output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
+           output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
    """Crawl a website and extract content
    
    Simple Usage:
@@ -1156,6 +1161,27 @@ Always return valid, properly formatted JSON."""

        crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy()    

+        # Handle deep crawling configuration
+        if deep_crawl:
+            if deep_crawl == "bfs":
+                crawler_cfg.deep_crawl_strategy = BFSDeepCrawlStrategy(
+                    max_depth=3,
+                    max_pages=max_pages
+                )
+            elif deep_crawl == "dfs":
+                crawler_cfg.deep_crawl_strategy = DFSDeepCrawlStrategy(
+                    max_depth=3,
+                    max_pages=max_pages
+                )
+            elif deep_crawl == "best-first":
+                crawler_cfg.deep_crawl_strategy = BestFirstCrawlingStrategy(
+                    max_depth=3,
+                    max_pages=max_pages
+                )
+            
+            if verbose:
+                console.print(f"[green]Deep crawling enabled:[/green] {deep_crawl} strategy, max {max_pages} pages")
+
        config = get_global_config()
        
        browser_cfg.verbose = config.get("VERBOSE", False)
@@ -1170,39 +1196,60 @@ Always return valid, properly formatted JSON."""
            verbose
        )

+        # Handle deep crawl results (list) vs single result
+        if isinstance(result, list):
+            if len(result) == 0:
+                click.echo("No results found during deep crawling")
+                return
+            # Use the first result for question answering and output
+            main_result = result[0]
+            all_results = result
+        else:
+            # Single result from regular crawling
+            main_result = result
+            all_results = [result]
+
        # Handle question
        if question:
            provider, token = setup_llm_config()
-            markdown = result.markdown.raw_markdown
+            markdown = main_result.markdown.raw_markdown
            anyio.run(stream_llm_response, url, markdown, question, provider, token)
            return
        
        # Handle output
        if not output_file:
            if output == "all":
-                click.echo(json.dumps(result.model_dump(), indent=2))
+                if isinstance(result, list):
+                    output_data = [r.model_dump() for r in all_results]
+                    click.echo(json.dumps(output_data, indent=2))
+                else:
+                    click.echo(json.dumps(main_result.model_dump(), indent=2))
            elif output == "json":
-                print(result.extracted_content)
-                extracted_items = json.loads(result.extracted_content)
+                print(main_result.extracted_content)
+                extracted_items = json.loads(main_result.extracted_content)
                click.echo(json.dumps(extracted_items, indent=2))
                
            elif output in ["markdown", "md"]:
-                click.echo(result.markdown.raw_markdown)
+                click.echo(main_result.markdown.raw_markdown)
            elif output in ["markdown-fit", "md-fit"]:
-                click.echo(result.markdown.fit_markdown)
+                click.echo(main_result.markdown.fit_markdown)
        else:
            if output == "all":
                with open(output_file, "w") as f:
-                    f.write(json.dumps(result.model_dump(), indent=2))
+                    if isinstance(result, list):
+                        output_data = [r.model_dump() for r in all_results]
+                        f.write(json.dumps(output_data, indent=2))
+                    else:
+                        f.write(json.dumps(main_result.model_dump(), indent=2))
            elif output == "json":
                with open(output_file, "w") as f:
-                    f.write(result.extracted_content)
+                    f.write(main_result.extracted_content)
            elif output in ["markdown", "md"]:
                with open(output_file, "w") as f:
-                    f.write(result.markdown.raw_markdown)
+                    f.write(main_result.markdown.raw_markdown)
            elif output in ["markdown-fit", "md-fit"]:
                with open(output_file, "w") as f:
-                    f.write(result.markdown.fit_markdown)
+                    f.write(main_result.markdown.fit_markdown)
            
    except Exception as e:
        raise click.ClickException(str(e))
@@ -1354,9 +1401,11 @@ def profiles_cmd():
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
+@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy")
+@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
 def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str, 
        extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
-        output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
+        output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
    """Crawl4AI CLI - Web content extraction tool

    Simple Usage:
@@ -1406,7 +1455,9 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
        bypass_cache=bypass_cache,
        question=question,
        verbose=verbose,
-        profile=profile
+        profile=profile,
+        deep_crawl=deep_crawl,
+        max_pages=max_pages
    )

 def main():
--- a/crawl4ai/utils.py
+++ b/crawl4ai/utils.py
@@ -3342,7 +3342,13 @@ async def get_text_embeddings(
    # Default: use sentence-transformers
    else:
        # Lazy load to avoid importing heavy libraries unless needed
-        from sentence_transformers import SentenceTransformer
+        try:
+            from sentence_transformers import SentenceTransformer
+        except ImportError:
+            raise ImportError(
+                "sentence-transformers is required for local embeddings. "
+                "Install it with: pip install 'crawl4ai[transformer]' or pip install sentence-transformers"
+            )
        
        # Cache the model in function attribute to avoid reloading
        if not hasattr(get_text_embeddings, '_models'):
--- a/deploy/docker/api.py
+++ b/deploy/docker/api.py
@@ -5,6 +5,7 @@ from typing import List, Tuple, Dict
 from functools import partial
 from uuid import uuid4
 from datetime import datetime
+from base64 import b64encode

 import logging
 from typing import Optional, AsyncGenerator
@@ -371,6 +372,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
                server_memory_mb = _get_memory_mb()
                result_dict = result.model_dump()
                result_dict['server_memory_mb'] = server_memory_mb
+                # If PDF exists, encode it to base64
+                if result_dict.get('pdf') is not None:
+                    result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
                logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
                data = json.dumps(result_dict, default=datetime_handler) + "\n"
                yield data.encode('utf-8')
@@ -443,10 +447,19 @@ async def handle_crawl_request(
            mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta
            peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory
        logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB")
-                              
+
+        # Process results to handle PDF bytes
+        processed_results = []
+        for result in results:
+            result_dict = result.model_dump()
+            # If PDF exists, encode it to base64
+            if result_dict.get('pdf') is not None:
+                result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
+            processed_results.append(result_dict)
+            
        return {
            "success": True,
-            "results": [result.model_dump() for result in results],
+            "results": processed_results,
            "server_processing_time_s": end_time - start_time,
            "server_memory_delta_mb": mem_delta_mb,
            "server_peak_memory_mb": peak_mem_mb
--- a/docs/md_v2/core/c4a-script.md
+++ b/docs/md_v2/core/c4a-script.md
@@ -52,11 +52,9 @@ That's it! In just a few lines, you've automated a complete search workflow.

 Want to learn by doing? We've got you covered:

-**🚀 [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)** - Try C4A-Script in your browser right now!
+**🚀 [Live Demo](https://docs.crawl4ai.com/apps/c4a-script/)** - Try C4A-Script in your browser right now!

-**📁 [Tutorial Examples](/examples/c4a_script/)** - Complete examples with source code
-
-**🛠️ [Local Tutorial](/examples/c4a_script/tutorial/)** - Run the interactive tutorial on your machine
+**📁 [Tutorial Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/c4a_script/)** - Complete examples with source code

 ### Running the Tutorial Locally

--- a/pyproject.toml
+++ b/pyproject.toml
@@ -44,7 +44,6 @@ dependencies = [
    "brotli>=1.1.0",
    "humanize>=4.10.0",
    "lark>=1.2.2",
-    "sentence-transformers>=2.2.0",
    "alphashape>=1.3.1",
    "shapely>=2.0.0"
 ]
@@ -62,8 +61,8 @@ classifiers = [
 [project.optional-dependencies]
 pdf = ["PyPDF2"]  
 torch = ["torch", "nltk", "scikit-learn"]
-transformer = ["transformers", "tokenizers"]
-cosine = ["torch", "transformers", "nltk"]
+transformer = ["transformers", "tokenizers", "sentence-transformers"]
+cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
 sync = ["selenium"]
 all = [
    "PyPDF2",
@@ -72,8 +71,8 @@ all = [
    "scikit-learn",
    "transformers",
    "tokenizers",
-    "selenium",
-    "PyPDF2"  
+    "sentence-transformers",
+    "selenium"
 ]

 [project.scripts]
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,7 +24,6 @@ cssselect>=1.2.0
 chardet>=5.2.0
 brotli>=1.1.0
 httpx[http2]>=0.27.2
-sentence-transformers>=2.2.0
 alphashape>=1.3.1
 shapely>=2.0.0