Merge remote-tracking branch 'origin/develop'

UncleCode
2025-07-25 15:35:53 +08:00
9 changed files with 101 additions and 32 deletions

View File

@@ -28,7 +28,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
[✨ Check out latest update v0.7.0](#-recent-updates)
🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://docs.crawl4ai.com/blog/release-v0.7.0)
🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.0.md)
<details>
<summary>🤓 <strong>My Personal Story</strong></summary>

View File

@@ -824,7 +824,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
except Error:
visibility_info = await self.check_visibility(page)
if self.browser_config.config.verbose:
if self.browser_config.verbose:
self.logger.debug(
message="Body visibility info: {info}",
tag="DEBUG",

View File

@@ -502,9 +502,12 @@ class AsyncWebCrawler:
metadata = result.get("metadata", {})
else:
cleaned_html = sanitize_input_encode(result.cleaned_html)
media = result.media.model_dump()
tables = media.pop("tables", [])
links = result.links.model_dump()
# media = result.media.model_dump()
# tables = media.pop("tables", [])
# links = result.links.model_dump()
media = result.media.model_dump() if hasattr(result.media, 'model_dump') else result.media
tables = media.pop("tables", []) if isinstance(media, dict) else []
links = result.links.model_dump() if hasattr(result.links, 'model_dump') else result.links
metadata = result.metadata
fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
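The replacement lines above guard against `result.media` and `result.links` already being plain dicts (for example, a result deserialized from cache) rather than Pydantic models. A minimal standalone sketch of the same duck-typed pattern, using an illustrative `Media` model rather than the real crawl4ai types:

```python
from pydantic import BaseModel

class Media(BaseModel):        # stand-in for the real media model, for illustration only
    images: list = []
    tables: list = []

def dump(value):
    # Call .model_dump() only while the value is still a Pydantic model;
    # pass plain dicts (e.g. already-deserialized cached results) through unchanged.
    return value.model_dump() if hasattr(value, "model_dump") else value

for raw in (Media(tables=[{"rows": 3}]), {"images": [], "tables": []}):
    media = dump(raw)                       # a dict in both cases after this line
    tables = media.pop("tables", []) if isinstance(media, dict) else []
    print(type(raw).__name__, "->", tables)
```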

View File

@@ -27,7 +27,10 @@ from crawl4ai import (
PruningContentFilter,
BrowserProfiler,
DefaultMarkdownGenerator,
LLMConfig
LLMConfig,
BFSDeepCrawlStrategy,
DFSDeepCrawlStrategy,
BestFirstCrawlingStrategy,
)
from crawl4ai.config import USER_SETTINGS
from litellm import completion
@@ -1014,9 +1017,11 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy (bfs, dfs, or best-first)")
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
"""Crawl a website and extract content
Simple Usage:
@@ -1156,6 +1161,27 @@ Always return valid, properly formatted JSON."""
crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy()
# Handle deep crawling configuration
if deep_crawl:
if deep_crawl == "bfs":
crawler_cfg.deep_crawl_strategy = BFSDeepCrawlStrategy(
max_depth=3,
max_pages=max_pages
)
elif deep_crawl == "dfs":
crawler_cfg.deep_crawl_strategy = DFSDeepCrawlStrategy(
max_depth=3,
max_pages=max_pages
)
elif deep_crawl == "best-first":
crawler_cfg.deep_crawl_strategy = BestFirstCrawlingStrategy(
max_depth=3,
max_pages=max_pages
)
if verbose:
console.print(f"[green]Deep crawling enabled:[/green] {deep_crawl} strategy, max {max_pages} pages")
config = get_global_config()
browser_cfg.verbose = config.get("VERBOSE", False)
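For comparison with the new `--deep-crawl` / `--max-pages` flags, configuring the same strategies directly in code looks roughly like the sketch below. `CrawlerRunConfig` is assumed here to be the run-config object the CLI populates as `crawler_cfg`; the strategy classes and their `max_depth` / `max_pages` arguments are the ones imported and used above.

```python
from crawl4ai import (
    CrawlerRunConfig,           # assumed name of the run config built by the CLI
    BFSDeepCrawlStrategy,
    DFSDeepCrawlStrategy,
    BestFirstCrawlingStrategy,
)

# Equivalent of `--deep-crawl bfs --max-pages 10` (swap the key for dfs / best-first).
strategies = {
    "bfs": BFSDeepCrawlStrategy,
    "dfs": DFSDeepCrawlStrategy,
    "best-first": BestFirstCrawlingStrategy,
}
config = CrawlerRunConfig(
    deep_crawl_strategy=strategies["bfs"](max_depth=3, max_pages=10),
)
```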
@@ -1170,39 +1196,60 @@ Always return valid, properly formatted JSON."""
verbose
)
# Handle deep crawl results (list) vs single result
if isinstance(result, list):
if len(result) == 0:
click.echo("No results found during deep crawling")
return
# Use the first result for question answering and output
main_result = result[0]
all_results = result
else:
# Single result from regular crawling
main_result = result
all_results = [result]
# Handle question
if question:
provider, token = setup_llm_config()
markdown = result.markdown.raw_markdown
markdown = main_result.markdown.raw_markdown
anyio.run(stream_llm_response, url, markdown, question, provider, token)
return
# Handle output
if not output_file:
if output == "all":
click.echo(json.dumps(result.model_dump(), indent=2))
if isinstance(result, list):
output_data = [r.model_dump() for r in all_results]
click.echo(json.dumps(output_data, indent=2))
else:
click.echo(json.dumps(main_result.model_dump(), indent=2))
elif output == "json":
print(result.extracted_content)
extracted_items = json.loads(result.extracted_content)
print(main_result.extracted_content)
extracted_items = json.loads(main_result.extracted_content)
click.echo(json.dumps(extracted_items, indent=2))
elif output in ["markdown", "md"]:
click.echo(result.markdown.raw_markdown)
click.echo(main_result.markdown.raw_markdown)
elif output in ["markdown-fit", "md-fit"]:
click.echo(result.markdown.fit_markdown)
click.echo(main_result.markdown.fit_markdown)
else:
if output == "all":
with open(output_file, "w") as f:
f.write(json.dumps(result.model_dump(), indent=2))
if isinstance(result, list):
output_data = [r.model_dump() for r in all_results]
f.write(json.dumps(output_data, indent=2))
else:
f.write(json.dumps(main_result.model_dump(), indent=2))
elif output == "json":
with open(output_file, "w") as f:
f.write(result.extracted_content)
f.write(main_result.extracted_content)
elif output in ["markdown", "md"]:
with open(output_file, "w") as f:
f.write(result.markdown.raw_markdown)
f.write(main_result.markdown.raw_markdown)
elif output in ["markdown-fit", "md-fit"]:
with open(output_file, "w") as f:
f.write(result.markdown.fit_markdown)
f.write(main_result.markdown.fit_markdown)
except Exception as e:
raise click.ClickException(str(e))
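When a deep crawl strategy is set, the crawl returns a list of results instead of a single object, which is what the branching above normalizes. A minimal sketch of the same handling against the library (assuming `arun` accepts the run config and returns a list in deep-crawl mode, as the CLI code implies):

```python
from crawl4ai import AsyncWebCrawler

async def run(url, config):
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url, config=config)
    # Deep crawls return a list of results; normalize to one shape, as the CLI does.
    all_results = result if isinstance(result, list) else [result]
    if not all_results:
        print("No results found during deep crawling")
        return
    main_result = all_results[0]
    print(main_result.markdown.raw_markdown)
    for extra in all_results[1:]:
        print("# also crawled:", extra.url)

# Run with: asyncio.run(run("https://example.com", config))  # config as in the sketch above
```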
@@ -1354,9 +1401,11 @@ def profiles_cmd():
@click.option("--question", "-q", help="Ask a question about the crawled content")
@click.option("--verbose", "-v", is_flag=True)
@click.option("--profile", "-p", help="Use a specific browser profile (by name)")
@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy")
@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
"""Crawl4AI CLI - Web content extraction tool
Simple Usage:
@@ -1406,7 +1455,9 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
bypass_cache=bypass_cache,
question=question,
verbose=verbose,
profile=profile
profile=profile,
deep_crawl=deep_crawl,
max_pages=max_pages
)
def main():

View File

@@ -3342,7 +3342,13 @@ async def get_text_embeddings(
# Default: use sentence-transformers
else:
# Lazy load to avoid importing heavy libraries unless needed
from sentence_transformers import SentenceTransformer
try:
from sentence_transformers import SentenceTransformer
except ImportError:
raise ImportError(
"sentence-transformers is required for local embeddings. "
"Install it with: pip install 'crawl4ai[transformer]' or pip install sentence-transformers"
)
# Cache the model in function attribute to avoid reloading
if not hasattr(get_text_embeddings, '_models'):
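The lazy-import-plus-cache pattern above, spelled out as a small standalone sketch (the function name, model id, and error message here are illustrative, not the actual crawl4ai internals):

```python
def embed_texts(texts, model_name="sentence-transformers/all-MiniLM-L6-v2"):
    # Lazy import so the heavy dependency is only pulled in when local embeddings are used.
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError:
        raise ImportError(
            "sentence-transformers is required for local embeddings. "
            "Install it with: pip install sentence-transformers"
        )
    # Cache loaded models on the function object to avoid reloading on every call.
    if not hasattr(embed_texts, "_models"):
        embed_texts._models = {}
    if model_name not in embed_texts._models:
        embed_texts._models[model_name] = SentenceTransformer(model_name)
    return embed_texts._models[model_name].encode(texts)
```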

View File

@@ -5,6 +5,7 @@ from typing import List, Tuple, Dict
from functools import partial
from uuid import uuid4
from datetime import datetime
from base64 import b64encode
import logging
from typing import Optional, AsyncGenerator
@@ -371,6 +372,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
server_memory_mb = _get_memory_mb()
result_dict = result.model_dump()
result_dict['server_memory_mb'] = server_memory_mb
# If PDF exists, encode it to base64
if result_dict.get('pdf') is not None:
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
data = json.dumps(result_dict, default=datetime_handler) + "\n"
yield data.encode('utf-8')
@@ -443,10 +447,19 @@ async def handle_crawl_request(
mem_delta_mb = end_mem_mb - start_mem_mb # <--- Calculate delta
peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb) # <--- Get peak memory
logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB")
# Process results to handle PDF bytes
processed_results = []
for result in results:
result_dict = result.model_dump()
# If PDF exists, encode it to base64
if result_dict.get('pdf') is not None:
result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
processed_results.append(result_dict)
return {
"success": True,
"results": [result.model_dump() for result in results],
"results": processed_results,
"server_processing_time_s": end_time - start_time,
"server_memory_delta_mb": mem_delta_mb,
"server_peak_memory_mb": peak_mem_mb

View File

@@ -52,11 +52,9 @@ That's it! In just a few lines, you've automated a complete search workflow.
Want to learn by doing? We've got you covered:
**🚀 [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)** - Try C4A-Script in your browser right now!
**🚀 [Live Demo](https://docs.crawl4ai.com/apps/c4a-script/)** - Try C4A-Script in your browser right now!
**📁 [Tutorial Examples](/examples/c4a_script/)** - Complete examples with source code
**🛠️ [Local Tutorial](/examples/c4a_script/tutorial/)** - Run the interactive tutorial on your machine
**📁 [Tutorial Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/c4a_script/)** - Complete examples with source code
### Running the Tutorial Locally

View File

@@ -44,7 +44,6 @@ dependencies = [
"brotli>=1.1.0",
"humanize>=4.10.0",
"lark>=1.2.2",
"sentence-transformers>=2.2.0",
"alphashape>=1.3.1",
"shapely>=2.0.0"
]
@@ -62,8 +61,8 @@ classifiers = [
[project.optional-dependencies]
pdf = ["PyPDF2"]
torch = ["torch", "nltk", "scikit-learn"]
transformer = ["transformers", "tokenizers"]
cosine = ["torch", "transformers", "nltk"]
transformer = ["transformers", "tokenizers", "sentence-transformers"]
cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
sync = ["selenium"]
all = [
"PyPDF2",
@@ -72,8 +71,8 @@ all = [
"scikit-learn",
"transformers",
"tokenizers",
"selenium",
"PyPDF2"
"sentence-transformers",
"selenium"
]
[project.scripts]

View File

@@ -24,7 +24,6 @@ cssselect>=1.2.0
chardet>=5.2.0
brotli>=1.1.0
httpx[http2]>=0.27.2
sentence-transformers>=2.2.0
alphashape>=1.3.1
shapely>=2.0.0