Merge remote-tracking branch 'origin/develop'

Author: UncleCode
Date:   2025-07-25 15:35:53 +08:00
9 changed files with 101 additions and 32 deletions

View File

@@ -28,7 +28,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
 [✨ Check out latest update v0.7.0](#-recent-updates)
-🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://docs.crawl4ai.com/blog/release-v0.7.0)
+🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.0.md)
 <details>
 <summary>🤓 <strong>My Personal Story</strong></summary>

View File

@@ -824,7 +824,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 except Error:
                     visibility_info = await self.check_visibility(page)
-                    if self.browser_config.config.verbose:
+                    if self.browser_config.verbose:
                         self.logger.debug(
                             message="Body visibility info: {info}",
                             tag="DEBUG",

View File

@@ -502,9 +502,12 @@ class AsyncWebCrawler:
             metadata = result.get("metadata", {})
         else:
             cleaned_html = sanitize_input_encode(result.cleaned_html)
-            media = result.media.model_dump()
-            tables = media.pop("tables", [])
-            links = result.links.model_dump()
+            # media = result.media.model_dump()
+            # tables = media.pop("tables", [])
+            # links = result.links.model_dump()
+            media = result.media.model_dump() if hasattr(result.media, 'model_dump') else result.media
+            tables = media.pop("tables", []) if isinstance(media, dict) else []
+            links = result.links.model_dump() if hasattr(result.links, 'model_dump') else result.links
             metadata = result.metadata
             fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
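The guarded calls above let serialization handle `result.media` and `result.links` whether they arrive as pydantic models or as plain dicts. A minimal, self-contained sketch of the same pattern (the names below are illustrative, not taken from this diff):

# Sketch: normalize attributes that may be pydantic models or plain dicts,
# mirroring the hasattr/isinstance guards introduced above.
def dump(obj):
    return obj.model_dump() if hasattr(obj, "model_dump") else obj

class FakeResult:  # stand-in for a crawl result
    media = {"tables": [{"rows": 3}], "images": []}
    links = {"internal": [], "external": []}

media = dump(FakeResult.media)  # already a dict -> returned unchanged
tables = media.pop("tables", []) if isinstance(media, dict) else []
links = dump(FakeResult.links)
print(tables, links)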

View File

@@ -27,7 +27,10 @@ from crawl4ai import (
     PruningContentFilter,
     BrowserProfiler,
     DefaultMarkdownGenerator,
-    LLMConfig
+    LLMConfig,
+    BFSDeepCrawlStrategy,
+    DFSDeepCrawlStrategy,
+    BestFirstCrawlingStrategy,
 )
 from crawl4ai.config import USER_SETTINGS
 from litellm import completion
@@ -1014,9 +1017,11 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
@click.option("--question", "-q", help="Ask a question about the crawled content") @click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True) @click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)") @click.option("--profile", "-p", help="Use a specific browser profile (by name)")
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy (bfs, dfs, or best-first)")
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str, def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict, extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str): output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
"""Crawl a website and extract content """Crawl a website and extract content
Simple Usage: Simple Usage:
@@ -1156,6 +1161,27 @@ Always return valid, properly formatted JSON."""
         crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy()
+        # Handle deep crawling configuration
+        if deep_crawl:
+            if deep_crawl == "bfs":
+                crawler_cfg.deep_crawl_strategy = BFSDeepCrawlStrategy(
+                    max_depth=3,
+                    max_pages=max_pages
+                )
+            elif deep_crawl == "dfs":
+                crawler_cfg.deep_crawl_strategy = DFSDeepCrawlStrategy(
+                    max_depth=3,
+                    max_pages=max_pages
+                )
+            elif deep_crawl == "best-first":
+                crawler_cfg.deep_crawl_strategy = BestFirstCrawlingStrategy(
+                    max_depth=3,
+                    max_pages=max_pages
+                )
+            if verbose:
+                console.print(f"[green]Deep crawling enabled:[/green] {deep_crawl} strategy, max {max_pages} pages")
         config = get_global_config()
         browser_cfg.verbose = config.get("VERBOSE", False)
@@ -1170,39 +1196,60 @@ Always return valid, properly formatted JSON."""
             verbose
         )
+        # Handle deep crawl results (list) vs single result
+        if isinstance(result, list):
+            if len(result) == 0:
+                click.echo("No results found during deep crawling")
+                return
+            # Use the first result for question answering and output
+            main_result = result[0]
+            all_results = result
+        else:
+            # Single result from regular crawling
+            main_result = result
+            all_results = [result]
         # Handle question
         if question:
             provider, token = setup_llm_config()
-            markdown = result.markdown.raw_markdown
+            markdown = main_result.markdown.raw_markdown
             anyio.run(stream_llm_response, url, markdown, question, provider, token)
             return
         # Handle output
         if not output_file:
             if output == "all":
-                click.echo(json.dumps(result.model_dump(), indent=2))
+                if isinstance(result, list):
+                    output_data = [r.model_dump() for r in all_results]
+                    click.echo(json.dumps(output_data, indent=2))
+                else:
+                    click.echo(json.dumps(main_result.model_dump(), indent=2))
             elif output == "json":
-                print(result.extracted_content)
-                extracted_items = json.loads(result.extracted_content)
+                print(main_result.extracted_content)
+                extracted_items = json.loads(main_result.extracted_content)
                 click.echo(json.dumps(extracted_items, indent=2))
             elif output in ["markdown", "md"]:
-                click.echo(result.markdown.raw_markdown)
+                click.echo(main_result.markdown.raw_markdown)
             elif output in ["markdown-fit", "md-fit"]:
-                click.echo(result.markdown.fit_markdown)
+                click.echo(main_result.markdown.fit_markdown)
         else:
             if output == "all":
                 with open(output_file, "w") as f:
-                    f.write(json.dumps(result.model_dump(), indent=2))
+                    if isinstance(result, list):
+                        output_data = [r.model_dump() for r in all_results]
+                        f.write(json.dumps(output_data, indent=2))
+                    else:
+                        f.write(json.dumps(main_result.model_dump(), indent=2))
             elif output == "json":
                 with open(output_file, "w") as f:
-                    f.write(result.extracted_content)
+                    f.write(main_result.extracted_content)
             elif output in ["markdown", "md"]:
                 with open(output_file, "w") as f:
-                    f.write(result.markdown.raw_markdown)
+                    f.write(main_result.markdown.raw_markdown)
             elif output in ["markdown-fit", "md-fit"]:
                 with open(output_file, "w") as f:
-                    f.write(result.markdown.fit_markdown)
+                    f.write(main_result.markdown.fit_markdown)
     except Exception as e:
         raise click.ClickException(str(e))
@@ -1354,9 +1401,11 @@ def profiles_cmd():
@click.option("--question", "-q", help="Ask a question about the crawled content") @click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True) @click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)") @click.option("--profile", "-p", help="Use a specific browser profile (by name)")
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy")
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str, def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict, extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str): output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
"""Crawl4AI CLI - Web content extraction tool """Crawl4AI CLI - Web content extraction tool
Simple Usage: Simple Usage:
@@ -1406,7 +1455,9 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
         bypass_cache=bypass_cache,
         question=question,
         verbose=verbose,
-        profile=profile
+        profile=profile,
+        deep_crawl=deep_crawl,
+        max_pages=max_pages
     )
 def main():
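For readers who prefer the Python API over the new CLI flags, here is a hedged sketch of the equivalent programmatic deep crawl, mirroring the `BFSDeepCrawlStrategy(max_depth=3, max_pages=...)` wiring shown above. The URL is illustrative, and the list-vs-single handling follows the CLI's own isinstance check rather than a documented guarantee:

# Sketch: programmatic counterpart of `--deep-crawl bfs --max-pages 10`.
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, BFSDeepCrawlStrategy

async def main():
    cfg = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=3, max_pages=10),
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun("https://example.com", config=cfg)
        # Deep crawling may yield a list of results; a plain crawl yields one.
        results = result if isinstance(result, list) else [result]
        for r in results:
            print(r.url, len(r.markdown.raw_markdown))

asyncio.run(main())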

View File

@@ -3342,7 +3342,13 @@ async def get_text_embeddings(
         # Default: use sentence-transformers
         else:
             # Lazy load to avoid importing heavy libraries unless needed
-            from sentence_transformers import SentenceTransformer
+            try:
+                from sentence_transformers import SentenceTransformer
+            except ImportError:
+                raise ImportError(
+                    "sentence-transformers is required for local embeddings. "
+                    "Install it with: pip install 'crawl4ai[transformer]' or pip install sentence-transformers"
+                )
             # Cache the model in function attribute to avoid reloading
             if not hasattr(get_text_embeddings, '_models'):
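A small sketch of the lazy-import-plus-cache pattern this hunk hardens: import sentence-transformers only when needed, raise an actionable error if it is missing, and cache loaded models on the function object so repeated calls do not reload them. The helper name and model name below are illustrative:

# Sketch: lazy import with an actionable error, plus a per-function model cache.
def get_model(name="all-MiniLM-L6-v2"):
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError:
        raise ImportError(
            "sentence-transformers is required for local embeddings. "
            "Install it with: pip install 'crawl4ai[transformer]'"
        )
    if not hasattr(get_model, "_models"):
        get_model._models = {}
    if name not in get_model._models:
        get_model._models[name] = SentenceTransformer(name)
    return get_model._models[name]

embeddings = get_model().encode(["hello world"])
print(embeddings.shape)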

View File

@@ -5,6 +5,7 @@ from typing import List, Tuple, Dict
 from functools import partial
 from uuid import uuid4
 from datetime import datetime
+from base64 import b64encode
 import logging
 from typing import Optional, AsyncGenerator
@@ -371,6 +372,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
         server_memory_mb = _get_memory_mb()
         result_dict = result.model_dump()
         result_dict['server_memory_mb'] = server_memory_mb
+        # If PDF exists, encode it to base64
+        if result_dict.get('pdf') is not None:
+            result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
         logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
         data = json.dumps(result_dict, default=datetime_handler) + "\n"
         yield data.encode('utf-8')
@@ -443,10 +447,19 @@ async def handle_crawl_request(
         mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta
         peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory
         logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB")
+        # Process results to handle PDF bytes
+        processed_results = []
+        for result in results:
+            result_dict = result.model_dump()
+            # If PDF exists, encode it to base64
+            if result_dict.get('pdf') is not None:
+                result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
+            processed_results.append(result_dict)
         return {
             "success": True,
-            "results": [result.model_dump() for result in results],
+            "results": processed_results,
             "server_processing_time_s": end_time - start_time,
             "server_memory_delta_mb": mem_delta_mb,
             "server_peak_memory_mb": peak_mem_mb

View File

@@ -52,11 +52,9 @@ That's it! In just a few lines, you've automated a complete search workflow.
 Want to learn by doing? We've got you covered:
-**🚀 [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)** - Try C4A-Script in your browser right now!
-**📁 [Tutorial Examples](/examples/c4a_script/)** - Complete examples with source code
-**🛠️ [Local Tutorial](/examples/c4a_script/tutorial/)** - Run the interactive tutorial on your machine
+**🚀 [Live Demo](https://docs.crawl4ai.com/apps/c4a-script/)** - Try C4A-Script in your browser right now!
+**📁 [Tutorial Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/c4a_script/)** - Complete examples with source code
 ### Running the Tutorial Locally

View File

@@ -44,7 +44,6 @@ dependencies = [
"brotli>=1.1.0", "brotli>=1.1.0",
"humanize>=4.10.0", "humanize>=4.10.0",
"lark>=1.2.2", "lark>=1.2.2",
"sentence-transformers>=2.2.0",
"alphashape>=1.3.1", "alphashape>=1.3.1",
"shapely>=2.0.0" "shapely>=2.0.0"
] ]
@@ -62,8 +61,8 @@ classifiers = [
 [project.optional-dependencies]
 pdf = ["PyPDF2"]
 torch = ["torch", "nltk", "scikit-learn"]
-transformer = ["transformers", "tokenizers"]
-cosine = ["torch", "transformers", "nltk"]
+transformer = ["transformers", "tokenizers", "sentence-transformers"]
+cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
 sync = ["selenium"]
 all = [
     "PyPDF2",
@@ -72,8 +71,8 @@ all = [
"scikit-learn", "scikit-learn",
"transformers", "transformers",
"tokenizers", "tokenizers",
"selenium", "sentence-transformers",
"PyPDF2" "selenium"
] ]
[project.scripts] [project.scripts]
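With sentence-transformers moved out of the core dependencies and into the `transformer` and `cosine` extras, local embeddings now require an explicit `pip install 'crawl4ai[transformer]'`. A quick sketch to check whether the optional dependency is present before relying on it:

# Sketch: detect whether the optional sentence-transformers package is installed.
import importlib.util

if importlib.util.find_spec("sentence_transformers") is None:
    print("Local embeddings unavailable; install with: pip install 'crawl4ai[transformer]'")
else:
    print("sentence-transformers is available")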

View File

@@ -24,7 +24,6 @@ cssselect>=1.2.0
 chardet>=5.2.0
 brotli>=1.1.0
 httpx[http2]>=0.27.2
-sentence-transformers>=2.2.0
 alphashape>=1.3.1
 shapely>=2.0.0