Merge remote-tracking branch 'origin/develop'
@@ -28,7 +28,7 @@ Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant
 [✨ Check out latest update v0.7.0](#-recent-updates)
 
-🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://docs.crawl4ai.com/blog/release-v0.7.0)
+🎉 **Version 0.7.0 is now available!** The Adaptive Intelligence Update introduces groundbreaking features: Adaptive Crawling that learns website patterns, Virtual Scroll support for infinite pages, intelligent Link Preview with 3-layer scoring, Async URL Seeder for massive discovery, and significant performance improvements. [Read the release notes →](https://github.com/unclecode/crawl4ai/blob/main/docs/blog/release-v0.7.0.md)
 
 <details>
 <summary>🤓 <strong>My Personal Story</strong></summary>
 
@@ -824,7 +824,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         except Error:
             visibility_info = await self.check_visibility(page)
 
-            if self.browser_config.config.verbose:
+            if self.browser_config.verbose:
                 self.logger.debug(
                     message="Body visibility info: {info}",
                     tag="DEBUG",
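This fix reads the verbosity flag straight off the browser configuration object instead of a nested `.config` attribute. A minimal usage sketch, assuming the public `BrowserConfig`/`AsyncWebCrawler` API (the URL is illustrative):

```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig

# `verbose` lives directly on BrowserConfig, matching the corrected attribute access above.
browser_cfg = BrowserConfig(headless=True, verbose=True)

async def demo():
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun("https://example.com")
        print(result.success)

# asyncio.run(demo())
```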
@@ -502,9 +502,12 @@ class AsyncWebCrawler:
                 metadata = result.get("metadata", {})
             else:
                 cleaned_html = sanitize_input_encode(result.cleaned_html)
-                media = result.media.model_dump()
-                tables = media.pop("tables", [])
-                links = result.links.model_dump()
+                # media = result.media.model_dump()
+                # tables = media.pop("tables", [])
+                # links = result.links.model_dump()
+                media = result.media.model_dump() if hasattr(result.media, 'model_dump') else result.media
+                tables = media.pop("tables", []) if isinstance(media, dict) else []
+                links = result.links.model_dump() if hasattr(result.links, 'model_dump') else result.links
                 metadata = result.metadata
 
                 fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
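The replacement lines make the dump defensive: `media` and `links` may already be plain dicts rather than Pydantic models, and tables are only popped when a dict is actually present. A standalone sketch of the same guard pattern (the sample data is illustrative, not from the codebase):

```python
from typing import Any

def to_plain_dict(value: Any) -> Any:
    """Return a plain dict for Pydantic-style objects, pass anything else through."""
    return value.model_dump() if hasattr(value, "model_dump") else value

# Works whether media arrives as a model or as a dict already.
for media in ({"images": [], "tables": [{"rows": 2}]}, {"images": []}):
    media = to_plain_dict(media)
    tables = media.pop("tables", []) if isinstance(media, dict) else []
    print(tables)
```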
@@ -27,7 +27,10 @@ from crawl4ai import (
     PruningContentFilter,
     BrowserProfiler,
     DefaultMarkdownGenerator,
-    LLMConfig
+    LLMConfig,
+    BFSDeepCrawlStrategy,
+    DFSDeepCrawlStrategy,
+    BestFirstCrawlingStrategy,
 )
 from crawl4ai.config import USER_SETTINGS
 from litellm import completion
@@ -1014,9 +1017,11 @@ def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless
 @click.option("--question", "-q", help="Ask a question about the crawled content")
 @click.option("--verbose", "-v", is_flag=True)
 @click.option("--profile", "-p", help="Use a specific browser profile (by name)")
+@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy (bfs, dfs, or best-first)")
+@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
 def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config: str,
               extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
-              output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
+              output: str, output_file: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
     """Crawl a website and extract content
 
     Simple Usage:
@@ -1156,6 +1161,27 @@ Always return valid, properly formatted JSON."""
         crawler_cfg.scraping_strategy = LXMLWebScrapingStrategy()
 
+        # Handle deep crawling configuration
+        if deep_crawl:
+            if deep_crawl == "bfs":
+                crawler_cfg.deep_crawl_strategy = BFSDeepCrawlStrategy(
+                    max_depth=3,
+                    max_pages=max_pages
+                )
+            elif deep_crawl == "dfs":
+                crawler_cfg.deep_crawl_strategy = DFSDeepCrawlStrategy(
+                    max_depth=3,
+                    max_pages=max_pages
+                )
+            elif deep_crawl == "best-first":
+                crawler_cfg.deep_crawl_strategy = BestFirstCrawlingStrategy(
+                    max_depth=3,
+                    max_pages=max_pages
+                )
+
+            if verbose:
+                console.print(f"[green]Deep crawling enabled:[/green] {deep_crawl} strategy, max {max_pages} pages")
+
         config = get_global_config()
 
         browser_cfg.verbose = config.get("VERBOSE", False)
 
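The three branches above differ only in the strategy class, so the same wiring can also be expressed as a lookup table. The sketch below is an equivalent formulation rather than the committed code; the `crwl` invocation in the comment assumes the console script the package installs:

```python
from crawl4ai import (
    BFSDeepCrawlStrategy,
    DFSDeepCrawlStrategy,
    BestFirstCrawlingStrategy,
)

# CLI usage (assumed entry point): crwl https://example.com --deep-crawl bfs --max-pages 20
DEEP_CRAWL_STRATEGIES = {
    "bfs": BFSDeepCrawlStrategy,
    "dfs": DFSDeepCrawlStrategy,
    "best-first": BestFirstCrawlingStrategy,
}

def build_deep_crawl_strategy(name: str, max_pages: int, max_depth: int = 3):
    """Map the --deep-crawl choice to a configured strategy instance."""
    return DEEP_CRAWL_STRATEGIES[name](max_depth=max_depth, max_pages=max_pages)
```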
@@ -1170,39 +1196,60 @@ Always return valid, properly formatted JSON."""
             verbose
         )
 
+        # Handle deep crawl results (list) vs single result
+        if isinstance(result, list):
+            if len(result) == 0:
+                click.echo("No results found during deep crawling")
+                return
+            # Use the first result for question answering and output
+            main_result = result[0]
+            all_results = result
+        else:
+            # Single result from regular crawling
+            main_result = result
+            all_results = [result]
+
         # Handle question
         if question:
             provider, token = setup_llm_config()
-            markdown = result.markdown.raw_markdown
+            markdown = main_result.markdown.raw_markdown
             anyio.run(stream_llm_response, url, markdown, question, provider, token)
             return
 
         # Handle output
         if not output_file:
             if output == "all":
-                click.echo(json.dumps(result.model_dump(), indent=2))
+                if isinstance(result, list):
+                    output_data = [r.model_dump() for r in all_results]
+                    click.echo(json.dumps(output_data, indent=2))
+                else:
+                    click.echo(json.dumps(main_result.model_dump(), indent=2))
             elif output == "json":
-                print(result.extracted_content)
-                extracted_items = json.loads(result.extracted_content)
+                print(main_result.extracted_content)
+                extracted_items = json.loads(main_result.extracted_content)
                 click.echo(json.dumps(extracted_items, indent=2))
 
             elif output in ["markdown", "md"]:
-                click.echo(result.markdown.raw_markdown)
+                click.echo(main_result.markdown.raw_markdown)
             elif output in ["markdown-fit", "md-fit"]:
-                click.echo(result.markdown.fit_markdown)
+                click.echo(main_result.markdown.fit_markdown)
         else:
             if output == "all":
                 with open(output_file, "w") as f:
-                    f.write(json.dumps(result.model_dump(), indent=2))
+                    if isinstance(result, list):
+                        output_data = [r.model_dump() for r in all_results]
+                        f.write(json.dumps(output_data, indent=2))
+                    else:
+                        f.write(json.dumps(main_result.model_dump(), indent=2))
            elif output == "json":
                 with open(output_file, "w") as f:
-                    f.write(result.extracted_content)
+                    f.write(main_result.extracted_content)
            elif output in ["markdown", "md"]:
                 with open(output_file, "w") as f:
-                    f.write(result.markdown.raw_markdown)
+                    f.write(main_result.markdown.raw_markdown)
            elif output in ["markdown-fit", "md-fit"]:
                 with open(output_file, "w") as f:
-                    f.write(result.markdown.fit_markdown)
+                    f.write(main_result.markdown.fit_markdown)
 
     except Exception as e:
         raise click.ClickException(str(e))
@@ -1354,9 +1401,11 @@ def profiles_cmd():
 @click.option("--question", "-q", help="Ask a question about the crawled content")
 @click.option("--verbose", "-v", is_flag=True)
 @click.option("--profile", "-p", help="Use a specific browser profile (by name)")
+@click.option("--deep-crawl", type=click.Choice(["bfs", "dfs", "best-first"]), help="Enable deep crawling with specified strategy")
+@click.option("--max-pages", type=int, default=10, help="Maximum number of pages to crawl in deep crawl mode")
 def default(url: str, example: bool, browser_config: str, crawler_config: str, filter_config: str,
             extraction_config: str, json_extract: str, schema: str, browser: Dict, crawler: Dict,
-            output: str, bypass_cache: bool, question: str, verbose: bool, profile: str):
+            output: str, bypass_cache: bool, question: str, verbose: bool, profile: str, deep_crawl: str, max_pages: int):
     """Crawl4AI CLI - Web content extraction tool
 
     Simple Usage:
@@ -1406,7 +1455,9 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
         bypass_cache=bypass_cache,
         question=question,
         verbose=verbose,
-        profile=profile
+        profile=profile,
+        deep_crawl=deep_crawl,
+        max_pages=max_pages
     )
 
 def main():
@@ -3342,7 +3342,13 @@ async def get_text_embeddings(
     # Default: use sentence-transformers
     else:
         # Lazy load to avoid importing heavy libraries unless needed
-        from sentence_transformers import SentenceTransformer
+        try:
+            from sentence_transformers import SentenceTransformer
+        except ImportError:
+            raise ImportError(
+                "sentence-transformers is required for local embeddings. "
+                "Install it with: pip install 'crawl4ai[transformer]' or pip install sentence-transformers"
+            )
 
         # Cache the model in function attribute to avoid reloading
         if not hasattr(get_text_embeddings, '_models'):
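With this change a base install no longer hits an unguarded import; local embeddings now fail with an actionable message instead. A minimal sketch of the same optional-dependency pattern outside crawl4ai (the default model name here is illustrative):

```python
def load_embedder(model_name: str = "all-MiniLM-L6-v2"):
    """Import sentence-transformers lazily so a lightweight install stays lightweight."""
    try:
        from sentence_transformers import SentenceTransformer
    except ImportError as exc:
        # Mirrors the diff: point users at the optional extra instead of failing obscurely.
        raise ImportError(
            "sentence-transformers is required for local embeddings. "
            "Install it with: pip install 'crawl4ai[transformer]'"
        ) from exc
    return SentenceTransformer(model_name)
```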
@@ -5,6 +5,7 @@ from typing import List, Tuple, Dict
 from functools import partial
 from uuid import uuid4
 from datetime import datetime
+from base64 import b64encode
 
 import logging
 from typing import Optional, AsyncGenerator
@@ -371,6 +372,9 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
             server_memory_mb = _get_memory_mb()
             result_dict = result.model_dump()
             result_dict['server_memory_mb'] = server_memory_mb
+            # If PDF exists, encode it to base64
+            if result_dict.get('pdf') is not None:
+                result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
             logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
             data = json.dumps(result_dict, default=datetime_handler) + "\n"
             yield data.encode('utf-8')
@@ -444,9 +448,18 @@ async def handle_crawl_request(
         peak_mem_mb = max(peak_mem_mb if peak_mem_mb else 0, end_mem_mb)  # <--- Get peak memory
         logger.info(f"Memory usage: Start: {start_mem_mb} MB, End: {end_mem_mb} MB, Delta: {mem_delta_mb} MB, Peak: {peak_mem_mb} MB")
 
+        # Process results to handle PDF bytes
+        processed_results = []
+        for result in results:
+            result_dict = result.model_dump()
+            # If PDF exists, encode it to base64
+            if result_dict.get('pdf') is not None:
+                result_dict['pdf'] = b64encode(result_dict['pdf']).decode('utf-8')
+            processed_results.append(result_dict)
+
         return {
             "success": True,
-            "results": [result.model_dump() for result in results],
+            "results": processed_results,
             "server_processing_time_s": end_time - start_time,
             "server_memory_delta_mb": mem_delta_mb,
             "server_peak_memory_mb": peak_mem_mb
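On the client side, the `pdf` field of each result in the crawl response is now a base64 string rather than raw bytes. A minimal decoding sketch, assuming a response shaped like the `return` payload above (function and file names are illustrative):

```python
import json
from base64 import b64decode
from pathlib import Path

def save_pdfs(response_json: str, out_dir: str = ".") -> None:
    """Decode base64-encoded PDFs from a crawl response and write them to disk."""
    payload = json.loads(response_json)
    for i, result in enumerate(payload.get("results", [])):
        pdf_b64 = result.get("pdf")
        if pdf_b64:
            Path(out_dir, f"result_{i}.pdf").write_bytes(b64decode(pdf_b64))
```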
@@ -52,11 +52,9 @@ That's it! In just a few lines, you've automated a complete search workflow.
 
 Want to learn by doing? We've got you covered:
 
-**🚀 [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)** - Try C4A-Script in your browser right now!
+**🚀 [Live Demo](https://docs.crawl4ai.com/apps/c4a-script/)** - Try C4A-Script in your browser right now!
 
-**📁 [Tutorial Examples](/examples/c4a_script/)** - Complete examples with source code
+**📁 [Tutorial Examples](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/c4a_script/)** - Complete examples with source code
 
-**🛠️ [Local Tutorial](/examples/c4a_script/tutorial/)** - Run the interactive tutorial on your machine
-
 ### Running the Tutorial Locally
 
@@ -44,7 +44,6 @@ dependencies = [
     "brotli>=1.1.0",
     "humanize>=4.10.0",
     "lark>=1.2.2",
-    "sentence-transformers>=2.2.0",
     "alphashape>=1.3.1",
     "shapely>=2.0.0"
 ]
@@ -62,8 +61,8 @@ classifiers = [
 [project.optional-dependencies]
 pdf = ["PyPDF2"]
 torch = ["torch", "nltk", "scikit-learn"]
-transformer = ["transformers", "tokenizers"]
-cosine = ["torch", "transformers", "nltk"]
+transformer = ["transformers", "tokenizers", "sentence-transformers"]
+cosine = ["torch", "transformers", "nltk", "sentence-transformers"]
 sync = ["selenium"]
 all = [
     "PyPDF2",
@@ -72,8 +71,8 @@ all = [
     "scikit-learn",
     "transformers",
     "tokenizers",
-    "selenium",
-    "PyPDF2"
+    "sentence-transformers",
+    "selenium"
 ]
 
 [project.scripts]
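With sentence-transformers out of the core dependency list, it is only pulled in by the `transformer`, `cosine`, or `all` extras (for example `pip install "crawl4ai[transformer]"`). A quick runtime check for which path is available:

```python
import importlib.util

# True only when an extra (or a manual install) has provided the package.
has_local_embeddings = importlib.util.find_spec("sentence_transformers") is not None
print("sentence-transformers available:", has_local_embeddings)
```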
@@ -24,7 +24,6 @@ cssselect>=1.2.0
 chardet>=5.2.0
 brotli>=1.1.0
 httpx[http2]>=0.27.2
-sentence-transformers>=2.2.0
 alphashape>=1.3.1
 shapely>=2.0.0
 