.*?)(?=\r?\n```)', # code block
+ re.DOTALL
+ )
+ chunks: List[str] = []
+ for m in pattern.finditer(code_md):
+ file_path = m.group("path").strip()
+ code_blk = m.group("code")
+ tree = ast.parse(code_blk)
+ lines = code_blk.splitlines()
+ for node in tree.body:
+ if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
+ start = node.lineno - 1
+ end = getattr(node, "end_lineno", start + 1)
+ snippet = "\n".join(lines[start:end])
+ chunks.append(f"# File: {file_path}\n{snippet}")
+ return chunks
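For reference, here is a minimal standalone sketch of the chunking idea used above (slicing source text into top-level function and class chunks via `ast`); it assumes Python 3.8+, where `end_lineno` is populated:

```python
import ast

source = '''
def greet(name):
    return f"hello {name}"

class Greeter:
    def run(self):
        print(greet("world"))
'''

lines = source.splitlines()
for node in ast.parse(source).body:
    if isinstance(node, (ast.FunctionDef, ast.AsyncFunctionDef, ast.ClassDef)):
        # lineno is 1-based; end_lineno is inclusive, so the end index needs no -1
        print("\n".join(lines[node.lineno - 1 : node.end_lineno]))
        print("---")
```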
+
+def chunk_doc_sections(doc: str) -> List[str]:
+ lines = doc.splitlines(keepends=True)
+ sections = []
+ current: List[str] = []
+ for line in lines:
+ if re.match(r"^#{1,6}\s", line):
+ if current:
+ sections.append("".join(current))
+ current = [line]
+ else:
+ current.append(line)
+ if current:
+ sections.append("".join(current))
+ return sections
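A quick sanity check of the splitter (assuming `chunk_doc_sections` above is in scope): because `splitlines(keepends=True)` preserves the newlines, the sections re-join losslessly.

```python
doc = "# Intro\nsome text\n## Usage\nmore text\n"
sections = chunk_doc_sections(doc)

assert "".join(sections) == doc          # lossless round-trip
assert sections[0].startswith("# Intro")
assert sections[1].startswith("## Usage")
print(len(sections), "sections")         # -> 2 sections
```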
+
+@app.get("/ask")
+@limiter.limit(config["rate_limiting"]["default_limit"])
+@mcp_tool("ask")
+async def get_context(
+ request: Request,
+ _td: Dict = Depends(token_dep),
+ context_type: str = Query("all", regex="^(code|doc|all)$"),
+ query: Optional[str] = Query(None, description="search query to filter chunks"),
+ score_ratio: float = Query(0.5, ge=0.0, le=1.0, description="min score as fraction of max_score"),
+ max_results: int = Query(20, ge=1, description="absolute cap on returned chunks"),
+):
+ """
+ This endpoint is designed for any questions about the Crawl4ai library. It returns plain-text markdown with extensive information about Crawl4ai.
+ You can use it as context for any AI assistant. Use this endpoint when an AI assistant needs library context for decision-making or code-generation tasks.
+ It is ALWAYS best practice to provide a query to filter the context; otherwise the response will be very long.
+
+ Parameters:
+ - context_type: Specify "code" for code context, "doc" for documentation context, or "all" for both.
+ - query: RECOMMENDED search query to filter paragraphs using BM25. You can leave this empty to get all the context.
+ - score_ratio: Minimum score as a fraction of the maximum score for filtering results.
+ - max_results: Maximum number of results to return. Default is 20.
+
+ Returns:
+ - JSON response with the requested context.
+ - If "code" is specified, returns the code context.
+ - If "doc" is specified, returns the documentation context.
+ - If "all" is specified, returns both code and documentation contexts.
+ """
+ # load contexts
+ base = os.path.dirname(__file__)
+ code_path = os.path.join(base, "c4ai-code-context.md")
+ doc_path = os.path.join(base, "c4ai-doc-context.md")
+ if not os.path.exists(code_path) or not os.path.exists(doc_path):
+ raise HTTPException(404, "Context files not found")
+
+ with open(code_path, "r") as f:
+ code_content = f.read()
+ with open(doc_path, "r") as f:
+ doc_content = f.read()
+
+ # if no query, just return raw contexts
+ if not query:
+ if context_type == "code":
+ return JSONResponse({"code_context": code_content})
+ if context_type == "doc":
+ return JSONResponse({"doc_context": doc_content})
+ return JSONResponse({
+ "code_context": code_content,
+ "doc_context": doc_content,
+ })
+
+ tokens = query.split()
+ results: Dict[str, List[Dict[str, float]]] = {}
+
+ # code BM25 over functions/classes
+ if context_type in ("code", "all"):
+ code_chunks = chunk_code_functions(code_content)
+ bm25 = BM25Okapi([c.split() for c in code_chunks])
+ scores = bm25.get_scores(tokens)
+ max_sc = float(scores.max()) if scores.size > 0 else 0.0
+ cutoff = max_sc * score_ratio
+ picked = [(c, s) for c, s in zip(code_chunks, scores) if s >= cutoff]
+ picked = sorted(picked, key=lambda x: x[1], reverse=True)[:max_results]
+ results["code_results"] = [{"text": c, "score": s} for c, s in picked]
+
+ # doc BM25 over markdown sections
+ if context_type in ("doc", "all"):
+ sections = chunk_doc_sections(doc_content)
+ bm25d = BM25Okapi([sec.split() for sec in sections])
+ scores_d = bm25d.get_scores(tokens)
+ max_sd = float(scores_d.max()) if scores_d.size > 0 else 0.0
+ cutoff_d = max_sd * score_ratio
+ idxs = [i for i, s in enumerate(scores_d) if s >= cutoff_d]
+ neighbors = set(i for idx in idxs for i in (idx-1, idx, idx+1))
+ valid = [i for i in sorted(neighbors) if 0 <= i < len(sections)]
+ valid = valid[:max_results]
+ results["doc_results"] = [
+ {"text": sections[i], "score": scores_d[i]} for i in valid
+ ]
+
+ return JSONResponse(results)
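To make the `score_ratio` semantics concrete, a small sketch with the same `rank_bm25` package used above: with `score_ratio=0.5`, only chunks scoring at least half of the best score survive the cutoff. (The doc branch additionally pulls in each hit's immediate neighbors, idx-1 and idx+1, so a matching section keeps its surrounding context.)

```python
from rank_bm25 import BM25Okapi

chunks = [
    "async def arun crawls a url and returns markdown",
    "configure the browser viewport and headless mode",
    "deep crawl strategy with bfs scoring and filters",
]
bm25 = BM25Okapi([c.split() for c in chunks])

scores = bm25.get_scores("crawl markdown".split())
cutoff = (scores.max() if scores.size else 0.0) * 0.5   # score_ratio = 0.5
picked = sorted(
    ((c, s) for c, s in zip(chunks, scores) if s >= cutoff),
    key=lambda pair: pair[1],
    reverse=True,
)
for text, score in picked:
    print(f"{score:6.3f}  {text}")
```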
+
+
+# attach MCP layer (adds /mcp/ws, /mcp/sse, /mcp/schema)
+print(f"MCP server running on {config['app']['host']}:{config['app']['port']}")
+attach_mcp(
+ app,
+ base_url=f"http://{config['app']['host']}:{config['app']['port']}"
+)
+
+# ────────────────────────── cli ──────────────────────────────
if __name__ == "__main__":
import uvicorn
uvicorn.run(
@@ -177,5 +643,6 @@ if __name__ == "__main__":
host=config["app"]["host"],
port=config["app"]["port"],
reload=config["app"]["reload"],
- timeout_keep_alive=config["app"]["timeout_keep_alive"]
- )
\ No newline at end of file
+ timeout_keep_alive=config["app"]["timeout_keep_alive"],
+ )
+# ─────────────────────────────────────────────────────────────
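A hedged client-side sketch for the new `/ask` endpoint (the port assumes the default Gunicorn bind of 11235 from supervisord.conf below; bearer-token handling, enforced by `token_dep`, is omitted here):

```python
import httpx

resp = httpx.get(
    "http://localhost:11235/ask",
    params={
        "context_type": "doc",        # "code" | "doc" | "all"
        "query": "deep crawl bfs",    # recommended: keeps the response small
        "score_ratio": 0.5,           # keep chunks scoring >= 50% of the best
        "max_results": 5,
    },
    timeout=30.0,
)
resp.raise_for_status()
for hit in resp.json().get("doc_results", []):
    print(f"{hit['score']:.2f}  {hit['text'][:80]!r}")
```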
diff --git a/deploy/docker/static/playground/index.html b/deploy/docker/static/playground/index.html
new file mode 100644
index 00000000..8f0e2bdd
--- /dev/null
+++ b/deploy/docker/static/playground/index.html
@@ -0,0 +1,817 @@
+<!-- [markup lost in extraction] Single-page playground UI. Recoverable text: title "Crawl4AI Playground"; a "Request Builder" panel; an "Advanced Config (Python → auto‑JSON)" editor; a status bar with "Ready", "Time: -", "Memory: -"; and a "🔥 Stress Test" panel reporting "Completed: 0/0", "Avg. Time: 0ms", "Peak Memory: 0MB". -->
\ No newline at end of file
diff --git a/deploy/docker/supervisord.conf b/deploy/docker/supervisord.conf
index 1274f2c3..a1b994aa 100644
--- a/deploy/docker/supervisord.conf
+++ b/deploy/docker/supervisord.conf
@@ -1,12 +1,28 @@
[supervisord]
-nodaemon=true
+nodaemon=true ; Run supervisord in the foreground
+logfile=/dev/null ; Log supervisord output to stdout/stderr
+logfile_maxbytes=0
[program:redis]
-command=redis-server
+command=/usr/bin/redis-server --loglevel notice ; Path to redis-server on Alpine
+user=appuser ; Run redis as our non-root user
autorestart=true
priority=10
+stdout_logfile=/dev/stdout ; Redirect redis stdout to container stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr ; Redirect redis stderr to container stderr
+stderr_logfile_maxbytes=0
[program:gunicorn]
-command=gunicorn --bind 0.0.0.0:8000 --workers 4 --threads 2 --timeout 300 --graceful-timeout 60 --keep-alive 65 --log-level debug --worker-class uvicorn.workers.UvicornWorker --max-requests 1000 --max-requests-jitter 50 server:app
+command=/usr/local/bin/gunicorn --bind 0.0.0.0:11235 --workers 1 --threads 4 --timeout 1800 --graceful-timeout 30 --keep-alive 300 --log-level info --worker-class uvicorn.workers.UvicornWorker server:app
+directory=/app ; Working directory for the app
+user=appuser ; Run gunicorn as our non-root user
autorestart=true
-priority=20
\ No newline at end of file
+priority=20
+environment=PYTHONUNBUFFERED=1 ; Ensure Python output is sent straight to logs
+stdout_logfile=/dev/stdout ; Redirect gunicorn stdout to container stdout
+stdout_logfile_maxbytes=0
+stderr_logfile=/dev/stderr ; Redirect gunicorn stderr to container stderr
+stderr_logfile_maxbytes=0
+
+# Optional: Add filebeat or other logging agents here if needed
\ No newline at end of file
diff --git a/docker-compose.yml b/docker-compose.yml
index 6a7bf7cb..10ff3269 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -1,16 +1,21 @@
-# Base configuration (not a service, just a reusable config block)
+version: '3.8'
+
+# Shared configuration for all environments
x-base-config: &base-config
ports:
- - "11235:11235"
- - "8000:8000"
- - "9222:9222"
- - "8080:8080"
+ - "11235:11235" # Gunicorn port
+ env_file:
+ - .llm.env # API keys (create from .llm.env.example)
environment:
- - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
- - CLAUDE_API_KEY=${CLAUDE_API_KEY:-}
+ - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
+ - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
+ - GROQ_API_KEY=${GROQ_API_KEY:-}
+ - TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
+ - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
+ - GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
volumes:
- - /dev/shm:/dev/shm
+ - /dev/shm:/dev/shm # Chromium performance
deploy:
resources:
limits:
@@ -24,42 +29,21 @@ x-base-config: &base-config
timeout: 10s
retries: 3
start_period: 40s
+ user: "appuser"
services:
- # Local build services for different platforms
- crawl4ai-amd64:
+ crawl4ai:
+ # 1. Default: Pull multi-platform test image from Docker Hub
+ # 2. Override with local image via: IMAGE=local-test docker compose up
+ image: ${IMAGE:-unclecode/crawl4ai:${TAG:-latest}}
+
+ # Local build config (used with --build)
build:
context: .
dockerfile: Dockerfile
args:
- PYTHON_VERSION: "3.10"
- INSTALL_TYPE: ${INSTALL_TYPE:-basic}
- ENABLE_GPU: false
- platforms:
- - linux/amd64
- profiles: ["local-amd64"]
- <<: *base-config # include the shared config directly instead of using 'extends'
-
- crawl4ai-arm64:
- build:
- context: .
- dockerfile: Dockerfile
- args:
- PYTHON_VERSION: "3.10"
- INSTALL_TYPE: ${INSTALL_TYPE:-basic}
- ENABLE_GPU: false
- platforms:
- - linux/arm64
- profiles: ["local-arm64"]
- <<: *base-config
-
- # Hub services for different platforms and versions
- crawl4ai-hub-amd64:
- image: unclecode/crawl4ai:${VERSION:-basic}-amd64
- profiles: ["hub-amd64"]
- <<: *base-config
-
- crawl4ai-hub-arm64:
- image: unclecode/crawl4ai:${VERSION:-basic}-arm64
- profiles: ["hub-arm64"]
+ INSTALL_TYPE: ${INSTALL_TYPE:-default}
+ ENABLE_GPU: ${ENABLE_GPU:-false}
+
+ # Inherit shared config
<<: *base-config
\ No newline at end of file
diff --git a/docs/examples/crypto_analysis_example.py b/docs/examples/crypto_analysis_example.py
index 3cdba2c4..10b9e7ab 100644
--- a/docs/examples/crypto_analysis_example.py
+++ b/docs/examples/crypto_analysis_example.py
@@ -383,29 +383,29 @@ async def main():
scroll_delay=0.2,
)
- # # Execute market data extraction
- # results: List[CrawlResult] = await crawler.arun(
- # url="https://coinmarketcap.com/?page=1", config=crawl_config
- # )
+ # Execute market data extraction
+ results: List[CrawlResult] = await crawler.arun(
+ url="https://coinmarketcap.com/?page=1", config=crawl_config
+ )
- # # Process results
- # raw_df = pd.DataFrame()
- # for result in results:
- # if result.success and result.media["tables"]:
- # # Extract primary market table
- # # DataFrame
- # raw_df = pd.DataFrame(
- # result.media["tables"][0]["rows"],
- # columns=result.media["tables"][0]["headers"],
- # )
- # break
+ # Process results
+ raw_df = pd.DataFrame()
+ for result in results:
+ if result.success and result.media["tables"]:
+ # Extract primary market table
+ # DataFrame
+ raw_df = pd.DataFrame(
+ result.media["tables"][0]["rows"],
+ columns=result.media["tables"][0]["headers"],
+ )
+ break
# This is for debugging only
# ////// Remove this in production from here..
# Save raw data for debugging
- # raw_df.to_csv(f"{__current_dir__}/tmp/raw_crypto_data.csv", index=False)
- # print("🔍 Raw data saved to 'raw_crypto_data.csv'")
+ raw_df.to_csv(f"{__current_dir__}/tmp/raw_crypto_data.csv", index=False)
+ print("🔍 Raw data saved to 'raw_crypto_data.csv'")
# Read from file for debugging
raw_df = pd.read_csv(f"{__current_dir__}/tmp/raw_crypto_data.csv")
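One caveat with the now-active debugging block: if the crawl fails or the page exposes no tables, `raw_df` stays empty, an empty CSV is written, and the `read_csv` round-trip yields nothing. A defensive guard, sketched with the same names as above:

```python
if raw_df.empty:
    raise RuntimeError("No market table extracted; check the crawl result and selectors")
raw_df.to_csv(f"{__current_dir__}/tmp/raw_crypto_data.csv", index=False)
```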
diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py
new file mode 100644
index 00000000..09625248
--- /dev/null
+++ b/docs/examples/docker/demo_docker_api.py
@@ -0,0 +1,1016 @@
+import asyncio
+import httpx
+import json
+import os
+import time
+from typing import List, Dict, Any, AsyncGenerator, Optional
+from dotenv import load_dotenv
+from rich.console import Console
+from rich.syntax import Syntax
+from rich.panel import Panel
+from rich.table import Table
+
+# --- Setup & Configuration ---
+load_dotenv() # Load environment variables from .env file
+
+console = Console()
+
+# --- Configuration ---
+BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235")
+# Target URLs
+SIMPLE_URL = "https://httpbin.org/html"
+LINKS_URL = "https://httpbin.org/links/10/0"
+FORMS_URL = "https://httpbin.org/forms/post" # For JS demo
+BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction
+PYTHON_URL = "https://python.org" # For deeper crawl
+# Use the same sample site as deep crawl tests for consistency
+DEEP_CRAWL_BASE_URL = os.getenv("DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/")
+DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com"
+
+# --- Helper Functions ---
+
+async def check_server_health(client: httpx.AsyncClient):
+ """Check if the server is healthy before running tests."""
+ console.print("[bold cyan]Checking server health...[/]", end="")
+ try:
+ response = await client.get("/health", timeout=10.0)
+ response.raise_for_status()
+ health_data = response.json()
+ console.print(f"[bold green] Server OK! Version: {health_data.get('version', 'N/A')}[/]")
+ return True
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
+ console.print(f"\n[bold red]Server health check FAILED:[/]")
+ console.print(f"Error: {e}")
+ console.print(f"Is the server running at {BASE_URL}?")
+ return False
+ except Exception as e:
+ console.print(f"\n[bold red]An unexpected error occurred during health check:[/]")
+ console.print(e)
+ return False
+
+def print_payload(payload: Dict[str, Any]):
+ """Prints the JSON payload nicely with a dark theme."""
+ syntax = Syntax(
+ json.dumps(payload, indent=2),
+ "json",
+ theme="monokai", # <--- Changed theme here
+ line_numbers=False,
+ word_wrap=True # Added word wrap for potentially long payloads
+ )
+ console.print(Panel(syntax, title="Request Payload", border_style="blue", expand=False))
+
+def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Results Summary", max_items: int = 3):
+ """Prints a concise summary of crawl results."""
+ if not results:
+ console.print(f"[yellow]{title}: No results received.[/]")
+ return
+
+ console.print(Panel(f"[bold]{title}[/]", border_style="green", expand=False))
+ count = 0
+ for result in results:
+ if count >= max_items:
+ console.print(f"... (showing first {max_items} of {len(results)} results)")
+ break
+ count += 1
+ success_icon = "[green]✔[/]" if result.get('success') else "[red]✘[/]"
+ url = result.get('url', 'N/A')
+ status = result.get('status_code', 'N/A')
+ content_info = ""
+ if result.get('extracted_content'):
+ content_str = json.dumps(result['extracted_content'])
+ snippet = (content_str[:70] + '...') if len(content_str) > 70 else content_str
+ content_info = f" | Extracted: [cyan]{snippet}[/]"
+ elif result.get('markdown'):
+ content_info = f" | Markdown: [cyan]Present[/]"
+ elif result.get('html'):
+ content_info = f" | HTML Size: [cyan]{len(result['html'])}[/]"
+
+ console.print(f"{success_icon} URL: [link={url}]{url}[/link] (Status: {status}){content_info}")
+ if "metadata" in result and "depth" in result["metadata"]:
+ console.print(f" Depth: {result['metadata']['depth']}")
+ if not result.get('success') and result.get('error_message'):
+ console.print(f" [red]Error: {result['error_message']}[/]")
+
+
+async def make_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[str, Any], title: str) -> Optional[List[Dict[str, Any]]]:
+ """Handles non-streaming POST requests."""
+ console.rule(f"[bold blue]{title}[/]", style="blue")
+ print_payload(payload)
+ console.print(f"Sending POST request to {client.base_url}{endpoint}...")
+ try:
+ start_time = time.time()
+ response = await client.post(endpoint, json=payload)
+ duration = time.time() - start_time
+ console.print(f"Response Status: [bold {'green' if response.is_success else 'red'}]{response.status_code}[/] (took {duration:.2f}s)")
+ response.raise_for_status()
+ data = response.json()
+ if data.get("success"):
+ results = data.get("results", [])
+ print_result_summary(results, title=f"{title} Results")
+ return results
+ else:
+ console.print("[bold red]Request reported failure:[/]")
+ console.print(data)
+ return None
+ except httpx.HTTPStatusError as e:
+ console.print(f"[bold red]HTTP Error:[/]")
+ console.print(f"Status: {e.response.status_code}")
+ try:
+ console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="default"), title="Error Response"))
+ except json.JSONDecodeError:
+ console.print(f"Response Body: {e.response.text}")
+ except httpx.RequestError as e:
+ console.print(f"[bold red]Request Error: {e}[/]")
+ except Exception as e:
+ console.print(f"[bold red]Unexpected Error: {e}[/]")
+ return None
+
+async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[str, Any], title: str):
+ """Handles streaming POST requests."""
+ console.rule(f"[bold magenta]{title}[/]", style="magenta")
+ print_payload(payload)
+ console.print(f"Sending POST stream request to {client.base_url}{endpoint}...")
+ all_results = []
+ initial_status_code = None # Store initial status code
+
+ try:
+ start_time = time.time()
+ async with client.stream("POST", endpoint, json=payload) as response:
+ initial_status_code = response.status_code # Capture initial status
+ duration = time.time() - start_time # Time to first byte potentially
+ console.print(f"Initial Response Status: [bold {'green' if response.is_success else 'red'}]{initial_status_code}[/] (first byte ~{duration:.2f}s)")
+ response.raise_for_status() # Raise exception for bad *initial* status codes
+
+ console.print("[magenta]--- Streaming Results ---[/]")
+ completed = False
+ async for line in response.aiter_lines():
+ if line:
+ try:
+ data = json.loads(line)
+ if data.get("status") == "completed":
+ completed = True
+ console.print("[bold green]--- Stream Completed ---[/]")
+ break
+ elif data.get("url"): # Looks like a result dictionary
+ all_results.append(data)
+ # Display summary info as it arrives
+ success_icon = "[green]✔[/]" if data.get('success') else "[red]✘[/]"
+ url = data.get('url', 'N/A')
+ # Display status code FROM THE RESULT DATA if available
+ result_status = data.get('status_code', 'N/A')
+ console.print(f" {success_icon} Received: [link={url}]{url}[/link] (Status: {result_status})")
+ if not data.get('success') and data.get('error_message'):
+ console.print(f" [red]Error: {data['error_message']}[/]")
+ else:
+ console.print(f" [yellow]Stream meta-data:[/yellow] {data}")
+ except json.JSONDecodeError:
+ console.print(f" [red]Stream decode error for line:[/red] {line}")
+ if not completed:
+ console.print("[bold yellow]Warning: Stream ended without 'completed' marker.[/]")
+
+ except httpx.HTTPStatusError as e:
+ # Use the captured initial status code if available, otherwise from the exception
+ status = initial_status_code if initial_status_code is not None else e.response.status_code
+ console.print(f"[bold red]HTTP Error (Initial Request):[/]")
+ console.print(f"Status: {status}")
+ try:
+ console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="default"), title="Error Response"))
+ except json.JSONDecodeError:
+ console.print(f"Response Body: {e.response.text}")
+ except httpx.RequestError as e:
+ console.print(f"[bold red]Request Error: {e}[/]")
+ except Exception as e:
+ console.print(f"[bold red]Unexpected Error during streaming: {e}[/]")
+ console.print_exception(show_locals=False) # Print stack trace for unexpected errors
+
+ # Call print_result_summary with the *collected* results AFTER the stream is done
+ print_result_summary(all_results, title=f"{title} Collected Results")
+
+def load_proxies_from_env() -> List[Dict]:
+ """
+ Load proxies from the PROXIES environment variable.
+ Expected format: IP:PORT:USER:PASS,IP:PORT,IP2:PORT2:USER2:PASS2,...
+ Returns a list of dictionaries suitable for the 'params' of ProxyConfig.
+ """
+ proxies_params_list = []
+ proxies_str = os.getenv("PROXIES", "")
+ if not proxies_str:
+ # console.print("[yellow]PROXIES environment variable not set or empty.[/]")
+ return proxies_params_list # Return empty list if not set
+
+ try:
+ proxy_entries = proxies_str.split(",")
+ for entry in proxy_entries:
+ entry = entry.strip()
+ if not entry:
+ continue
+
+ parts = entry.split(":")
+ proxy_dict = {}
+
+ if len(parts) == 4: # Format: IP:PORT:USER:PASS
+ ip, port, username, password = parts
+ proxy_dict = {
+ "server": f"http://{ip}:{port}", # Assuming http protocol
+ "username": username,
+ "password": password,
+ # "ip": ip # 'ip' is not a standard ProxyConfig param, 'server' contains it
+ }
+ elif len(parts) == 2: # Format: IP:PORT
+ ip, port = parts
+ proxy_dict = {
+ "server": f"http://{ip}:{port}",
+ # "ip": ip
+ }
+ else:
+ console.print(f"[yellow]Skipping invalid proxy string format:[/yellow] {entry}")
+ continue
+
+ proxies_params_list.append(proxy_dict)
+
+ except Exception as e:
+ console.print(f"[red]Error loading proxies from environment:[/red] {e}")
+
+ if proxies_params_list:
+ console.print(f"[cyan]Loaded {len(proxies_params_list)} proxies from environment.[/]")
+ # else:
+ # console.print("[yellow]No valid proxies loaded from environment.[/]")
+
+ return proxies_params_list
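As a concrete illustration of the accepted `PROXIES` format, here is what the parser above yields for a made-up value (addresses are placeholders):

```python
os.environ["PROXIES"] = "10.0.0.1:8080:user:pass,10.0.0.2:3128"
print(load_proxies_from_env())
# [{'server': 'http://10.0.0.1:8080', 'username': 'user', 'password': 'pass'},
#  {'server': 'http://10.0.0.2:3128'}]
```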
+
+
+
+# --- Demo Functions ---
+
+# 1. Basic Crawling
+async def demo_basic_single_url(client: httpx.AsyncClient):
+ payload = {
+ "urls": [SIMPLE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS"}}
+ }
+ result = await make_request(client, "/crawl", payload, "Demo 1a: Basic Single URL Crawl")
+ return result
+
+async def demo_basic_multi_url(client: httpx.AsyncClient):
+ payload = {
+ "urls": [SIMPLE_URL, LINKS_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS"}}
+ }
+ result = await make_request(client, "/crawl", payload, "Demo 1b: Basic Multi URL Crawl")
+ return result
+
+async def demo_streaming_multi_url(client: httpx.AsyncClient):
+ payload = {
+ "urls": [SIMPLE_URL, LINKS_URL, FORMS_URL], # Add another URL
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {"type": "CrawlerRunConfig", "params": {"stream": True, "cache_mode": "BYPASS"}}
+ }
+ result = await stream_request(client, "/crawl/stream", payload, "Demo 1c: Streaming Multi URL Crawl")
+ return result
+
+# 2. Markdown Generation & Content Filtering
+async def demo_markdown_default(client: httpx.AsyncClient):
+ payload = {
+ "urls": [SIMPLE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": "BYPASS",
+ "markdown_generator": {"type": "DefaultMarkdownGenerator", "params": {}} # Explicitly default
+ }
+ }
+ }
+ result = await make_request(client, "/crawl", payload, "Demo 2a: Default Markdown Generation")
+ return result
+
+async def demo_markdown_pruning(client: httpx.AsyncClient):
+ payload = {
+ "urls": [PYTHON_URL], # Use a more complex page
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": "BYPASS",
+ "markdown_generator": {
+ "type": "DefaultMarkdownGenerator",
+ "params": {
+ "content_filter": {
+ "type": "PruningContentFilter",
+ "params": {"threshold": 0.6, "threshold_type": "relative"}
+ }
+ }
+ }
+ }
+ }
+ }
+ result = await make_request(client, "/crawl", payload, "Demo 2b: Markdown with Pruning Filter")
+ return result
+
+async def demo_markdown_bm25(client: httpx.AsyncClient):
+ payload = {
+ "urls": [PYTHON_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": "BYPASS",
+ "markdown_generator": {
+ "type": "DefaultMarkdownGenerator",
+ "params": {
+ "content_filter": {
+ "type": "BM25ContentFilter",
+ "params": {"user_query": "Python documentation language reference"}
+ }
+ }
+ }
+ }
+ }
+ }
+ result = await make_request(client, "/crawl", payload, "Demo 2c: Markdown with BM25 Filter")
+ return result
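All of these payloads follow the same `{"type": <ClassName>, "params": {...}}` serialization envelope. A small hypothetical helper (not part of the demo) can cut the nesting noise:

```python
def spec(type_name: str, **params) -> dict:
    # Build a {"type": ..., "params": {...}} envelope for the Docker API.
    return {"type": type_name, "params": params}

crawler_config = spec(
    "CrawlerRunConfig",
    cache_mode="BYPASS",
    markdown_generator=spec(
        "DefaultMarkdownGenerator",
        content_filter=spec("BM25ContentFilter", user_query="Python documentation"),
    ),
)
```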
+
+# 3. Specific Parameters
+# Corrected Demo Function: demo_param_css_selector
+async def demo_param_css_selector(client: httpx.AsyncClient):
+ target_selector = ".main-content" # Using the suggested correct selector
+ payload = {
+ "urls": [PYTHON_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": "BYPASS",
+ "css_selector": target_selector # Target specific div
+ # No extraction strategy is needed to demo this parameter's effect on input HTML
+ }
+ }
+ }
+ results = await make_request(client, "/crawl", payload, f"Demo 3a: Using css_selector ('{target_selector}')")
+
+ if results:
+ result = results[0]
+ if result['success'] and result.get('html'):
+ # Check if the returned HTML is likely constrained
+ # A simple check: does it contain expected content from within the selector,
+ # and does it LACK content known to be outside (like footer links)?
+ html_content = result['html']
+ content_present = 'Python Software Foundation' in html_content # Text likely within .main-content somewhere
+ footer_absent = 'Legal Statements' not in html_content # Text likely in the footer, outside .main-content
+
+ console.print(f" Content Check: Text inside '{target_selector}' likely present? {'[green]Yes[/]' if content_present else '[red]No[/]'}")
+ console.print(f" Content Check: Text outside '{target_selector}' (footer) likely absent? {'[green]Yes[/]' if footer_absent else '[red]No[/]'}")
+
+ if not content_present or not footer_absent:
+ console.print(f" [yellow]Note:[/yellow] HTML filtering might not be precise or page structure changed. Result HTML length: {len(html_content)}")
+ else:
+ console.print(f" [green]Verified:[/green] Returned HTML appears limited by css_selector. Result HTML length: {len(html_content)}")
+
+ elif result['success']:
+ console.print("[yellow]HTML content was empty in the successful result.[/]")
+ # Error message is handled by print_result_summary called by make_request
+
+async def demo_param_js_execution(client: httpx.AsyncClient):
+ payload = {
+ "urls": [FORMS_URL], # Use a page with a form
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": "BYPASS",
+ # Simple JS to fill and maybe click (won't submit without more complex setup)
+ "js_code": """
+ () => {
+ document.querySelector('[name="custname"]').value = 'Crawl4AI Demo';
+ return { filled_name: document.querySelector('[name="custname"]').value };
+ }
+ """,
+ "delay_before_return_html": 0.5 # Give JS time to potentially run
+ }
+ }
+ }
+ results = await make_request(client, "/crawl", payload, "Demo 3b: Using js_code Parameter")
+ if results and results[0].get("js_execution_result"):
+ console.print("[cyan]JS Execution Result:[/]", results[0]["js_execution_result"])
+ elif results:
+ console.print("[yellow]JS Execution Result not found in response.[/]")
+
+async def demo_param_screenshot(client: httpx.AsyncClient):
+ payload = {
+ "urls": [SIMPLE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {"cache_mode": "BYPASS", "screenshot": True}
+ }
+ }
+ results = await make_request(client, "/crawl", payload, "Demo 3c: Taking a Screenshot")
+ if results and results[0].get("screenshot"):
+ console.print(f"[cyan]Screenshot data received (length):[/] {len(results[0]['screenshot'])}")
+ elif results:
+ console.print("[yellow]Screenshot data not found in response.[/]")
+
+async def demo_param_ssl_fetch(client: httpx.AsyncClient):
+ payload = {
+ "urls": [PYTHON_URL], # Needs HTTPS
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {"cache_mode": "BYPASS", "fetch_ssl_certificate": True}
+ }
+ }
+ results = await make_request(client, "/crawl", payload, "Demo 3d: Fetching SSL Certificate")
+ if results and results[0].get("ssl_certificate"):
+ console.print("[cyan]SSL Certificate Info:[/]")
+ console.print(results[0]["ssl_certificate"])
+ elif results:
+ console.print("[yellow]SSL Certificate data not found in response.[/]")
+
+async def demo_param_proxy(client: httpx.AsyncClient):
+ proxy_params_list = load_proxies_from_env() # Get the list of parameter dicts
+ if not proxy_params_list:
+ console.rule("[bold yellow]Demo 3e: Using Proxies (SKIPPED)[/]", style="yellow")
+ console.print("Set the PROXIES environment variable to run this demo.")
+ console.print("Format: IP:PORT:USR:PWD,IP:PORT,...")
+ return
+
+ payload = {
+ "urls": ["https://httpbin.org/ip"], # URL that shows originating IP
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": "BYPASS",
+ "proxy_rotation_strategy": {
+ "type": "RoundRobinProxyStrategy",
+ "params": {
+ "proxies": [
+ # Filter out the 'ip' key when sending to server, as it's not part of ProxyConfig
+ {"type": "ProxyConfig", "params": {k: v for k, v in p.items() if k != 'ip'}}
+ for p in proxy_params_list
+ ]
+ }
+ }
+ }
+ }
+ }
+ results = await make_request(client, "/crawl", payload, "Demo 3e: Using Proxies")
+
+ # --- Verification Logic ---
+ if results and results[0].get("success"):
+ result = results[0]
+ try:
+ # httpbin.org/ip returns JSON inside the HTML body's <pre> tag
+ html_content = result.get('html', '')
+ # Basic extraction - find the JSON within <pre> tags or just the JSON itself
+ json_str = None
+ if '
[diff truncated during extraction: the remainder of demo_docker_api.py and the header of the next example file (a network/console capture demo) are missing; the text resumes inside that file's triple-quoted test-page HTML, of which only the title "Console Test" and the heading "Console Message Test" survive.]
+ """)
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ capture_console_messages=True,
+ wait_until="networkidle" # Wait to make sure all scripts execute
+ )
+
+ result = await crawler.arun(
+ url=f"file://{html_file}",
+ config=config
+ )
+
+ if result.success and result.console_messages:
+ print(f"Captured {len(result.console_messages)} console messages")
+
+ # Count by message type
+ message_types = {}
+ for msg in result.console_messages:
+ msg_type = msg.get("type", "unknown")
+ message_types[msg_type] = message_types.get(msg_type, 0) + 1
+
+ print("Message types:")
+ for msg_type, count in message_types.items():
+ print(f" - {msg_type}: {count}")
+
+ # Show all messages
+ print("\nAll console messages:")
+ for i, msg in enumerate(result.console_messages, 1):
+ print(f" {i}. [{msg.get('type', 'unknown')}] {msg.get('text', '')}")
+
+async def demo_combined_capture():
+ """Capturing both network requests and console messages"""
+ print("\n=== 3. Combined Network and Console Capture ===")
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ capture_network_requests=True,
+ capture_console_messages=True,
+ wait_until="networkidle"
+ )
+
+ result = await crawler.arun(
+ url="https://httpbin.org/html",
+ config=config
+ )
+
+ if result.success:
+ network_count = len(result.network_requests) if result.network_requests else 0
+ console_count = len(result.console_messages) if result.console_messages else 0
+
+ print(f"Captured {network_count} network events and {console_count} console messages")
+
+ # Save the captured data to a JSON file for analysis
+ output_file = os.path.join(__cur_dir__, "tmp", "capture_data.json")
+ with open(output_file, "w") as f:
+ json.dump({
+ "url": result.url,
+ "timestamp": datetime.now().isoformat(),
+ "network_requests": result.network_requests,
+ "console_messages": result.console_messages
+ }, f, indent=2)
+
+ print(f"Full capture data saved to {output_file}")
+
+async def analyze_spa_network_traffic():
+ """Analyze network traffic of a Single-Page Application"""
+ print("\n=== 4. Analyzing SPA Network Traffic ===")
+
+ async with AsyncWebCrawler(config=BrowserConfig(
+ headless=True,
+ viewport_width=1280,
+ viewport_height=800
+ )) as crawler:
+ config = CrawlerRunConfig(
+ capture_network_requests=True,
+ capture_console_messages=True,
+ # Wait longer to ensure all resources are loaded
+ wait_until="networkidle",
+ page_timeout=60000, # 60 seconds
+ )
+
+ result = await crawler.arun(
+ url="https://weather.com",
+ config=config
+ )
+
+ if result.success and result.network_requests:
+ # Extract different types of requests
+ requests = []
+ responses = []
+ failures = []
+
+ for event in result.network_requests:
+ event_type = event.get("event_type")
+ if event_type == "request":
+ requests.append(event)
+ elif event_type == "response":
+ responses.append(event)
+ elif event_type == "request_failed":
+ failures.append(event)
+
+ print(f"Captured {len(requests)} requests, {len(responses)} responses, and {len(failures)} failures")
+
+ # Analyze request types
+ resource_types = {}
+ for req in requests:
+ resource_type = req.get("resource_type", "unknown")
+ resource_types[resource_type] = resource_types.get(resource_type, 0) + 1
+
+ print("\nResource types:")
+ for resource_type, count in sorted(resource_types.items(), key=lambda x: x[1], reverse=True):
+ print(f" - {resource_type}: {count}")
+
+ # Analyze API calls
+ api_calls = [r for r in requests if "api" in r.get("url", "").lower()]
+ if api_calls:
+ print(f"\nDetected {len(api_calls)} API calls:")
+ for i, call in enumerate(api_calls[:5], 1): # Show first 5
+ print(f" {i}. {call.get('method')} {call.get('url')}")
+ if len(api_calls) > 5:
+ print(f" ... and {len(api_calls) - 5} more")
+
+ # Analyze response status codes
+ status_codes = {}
+ for resp in responses:
+ status = resp.get("status", 0)
+ status_codes[status] = status_codes.get(status, 0) + 1
+
+ print("\nResponse status codes:")
+ for status, count in sorted(status_codes.items()):
+ print(f" - {status}: {count}")
+
+ # Analyze failures
+ if failures:
+ print("\nFailed requests:")
+ for i, failure in enumerate(failures[:5], 1): # Show first 5
+ print(f" {i}. {failure.get('url')} - {failure.get('failure_text')}")
+ if len(failures) > 5:
+ print(f" ... and {len(failures) - 5} more")
+
+ # Check for console errors
+ if result.console_messages:
+ errors = [msg for msg in result.console_messages if msg.get("type") == "error"]
+ if errors:
+ print(f"\nDetected {len(errors)} console errors:")
+ for i, error in enumerate(errors[:3], 1): # Show first 3
+ print(f" {i}. {error.get('text', '')[:100]}...")
+ if len(errors) > 3:
+ print(f" ... and {len(errors) - 3} more")
+
+ # Save analysis to file
+ output_file = os.path.join(__cur_dir__, "tmp", "weather_network_analysis.json")
+ with open(output_file, "w") as f:
+ json.dump({
+ "url": result.url,
+ "timestamp": datetime.now().isoformat(),
+ "statistics": {
+ "request_count": len(requests),
+ "response_count": len(responses),
+ "failure_count": len(failures),
+ "resource_types": resource_types,
+ "status_codes": {str(k): v for k, v in status_codes.items()},
+ "api_call_count": len(api_calls),
+ "console_error_count": len(errors) if result.console_messages else 0
+ },
+ "network_requests": result.network_requests,
+ "console_messages": result.console_messages
+ }, f, indent=2)
+
+ print(f"\nFull analysis saved to {output_file}")
+
+async def demo_security_analysis():
+ """Using network capture for security analysis"""
+ print("\n=== 5. Security Analysis with Network Capture ===")
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ capture_network_requests=True,
+ capture_console_messages=True,
+ wait_until="networkidle"
+ )
+
+ # A site that makes multiple third-party requests
+ result = await crawler.arun(
+ url="https://www.nytimes.com/",
+ config=config
+ )
+
+ if result.success and result.network_requests:
+ print(f"Captured {len(result.network_requests)} network events")
+
+ # Extract all domains
+ from urllib.parse import urlparse # hoisted: also used after the loop (main_domain)
+ domains = set()
+ for req in result.network_requests:
+ if req.get("event_type") == "request":
+ url = req.get("url", "")
+ try:
+ domain = urlparse(url).netloc
+ if domain:
+ domains.add(domain)
+ except Exception:
+ pass # skip malformed URLs
+
+ print(f"\nDetected requests to {len(domains)} unique domains:")
+ main_domain = urlparse(result.url).netloc
+
+ # Separate first-party vs third-party domains
+ first_party = [d for d in domains if main_domain in d]
+ third_party = [d for d in domains if main_domain not in d]
+
+ print(f" - First-party domains: {len(first_party)}")
+ print(f" - Third-party domains: {len(third_party)}")
+
+ # Look for potential trackers/analytics
+ tracking_keywords = ["analytics", "tracker", "pixel", "tag", "stats", "metric", "collect", "beacon"]
+ potential_trackers = []
+
+ for domain in third_party:
+ if any(keyword in domain.lower() for keyword in tracking_keywords):
+ potential_trackers.append(domain)
+
+ if potential_trackers:
+ print(f"\nPotential tracking/analytics domains ({len(potential_trackers)}):")
+ for i, domain in enumerate(sorted(potential_trackers)[:10], 1):
+ print(f" {i}. {domain}")
+ if len(potential_trackers) > 10:
+ print(f" ... and {len(potential_trackers) - 10} more")
+
+ # Check for insecure (HTTP) requests
+ insecure_requests = [
+ req.get("url") for req in result.network_requests
+ if req.get("event_type") == "request" and req.get("url", "").startswith("http://")
+ ]
+
+ if insecure_requests:
+ print(f"\nWarning: Found {len(insecure_requests)} insecure (HTTP) requests:")
+ for i, url in enumerate(insecure_requests[:5], 1):
+ print(f" {i}. {url}")
+ if len(insecure_requests) > 5:
+ print(f" ... and {len(insecure_requests) - 5} more")
+
+ # Save security analysis to file
+ output_file = os.path.join(__cur_dir__, "tmp", "security_analysis.json")
+ with open(output_file, "w") as f:
+ json.dump({
+ "url": result.url,
+ "main_domain": main_domain,
+ "timestamp": datetime.now().isoformat(),
+ "analysis": {
+ "total_requests": len([r for r in result.network_requests if r.get("event_type") == "request"]),
+ "unique_domains": len(domains),
+ "first_party_domains": first_party,
+ "third_party_domains": third_party,
+ "potential_trackers": potential_trackers,
+ "insecure_requests": insecure_requests
+ }
+ }, f, indent=2)
+
+ print(f"\nFull security analysis saved to {output_file}")
+
+async def demo_performance_analysis():
+ """Using network capture for performance analysis"""
+ print("\n=== 6. Performance Analysis with Network Capture ===")
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ capture_network_requests=True,
+ page_timeout=60 * 2 * 1000 # 120 seconds
+ )
+
+ result = await crawler.arun(
+ url="https://www.cnn.com/",
+ config=config
+ )
+
+ if result.success and result.network_requests:
+ # Filter only response events with timing information
+ responses_with_timing = [
+ r for r in result.network_requests
+ if r.get("event_type") == "response" and r.get("request_timing")
+ ]
+
+ if responses_with_timing:
+ print(f"Analyzing timing for {len(responses_with_timing)} network responses")
+
+ # Group by resource type
+ resource_timings = {}
+ for resp in responses_with_timing:
+ url = resp.get("url", "")
+ timing = resp.get("request_timing", {})
+
+ # Determine resource type from URL extension
+ ext = url.split(".")[-1].lower() if "." in url.split("/")[-1] else "unknown"
+ if ext in ["jpg", "jpeg", "png", "gif", "webp", "svg", "ico"]:
+ resource_type = "image"
+ elif ext in ["js"]:
+ resource_type = "javascript"
+ elif ext in ["css"]:
+ resource_type = "css"
+ elif ext in ["woff", "woff2", "ttf", "otf", "eot"]:
+ resource_type = "font"
+ else:
+ resource_type = "other"
+
+ if resource_type not in resource_timings:
+ resource_timings[resource_type] = []
+
+ # Calculate request duration if timing information is available
+ if isinstance(timing, dict) and "requestTime" in timing and "receiveHeadersEnd" in timing:
+ # Convert to milliseconds
+ duration = (timing["receiveHeadersEnd"] - timing["requestTime"]) * 1000
+ resource_timings[resource_type].append({
+ "url": url,
+ "duration_ms": duration
+ })
+ if isinstance(timing, dict) and "requestStart" in timing and "responseStart" in timing and "startTime" in timing:
+ # Convert to milliseconds
+ duration = (timing["responseStart"] - timing["requestStart"]) * 1000
+ resource_timings[resource_type].append({
+ "url": url,
+ "duration_ms": duration
+ })
+
+ # Calculate statistics for each resource type
+ print("\nPerformance by resource type:")
+ for resource_type, timings in resource_timings.items():
+ if timings:
+ durations = [t["duration_ms"] for t in timings]
+ avg_duration = sum(durations) / len(durations)
+ max_duration = max(durations)
+ slowest_resource = next(t["url"] for t in timings if t["duration_ms"] == max_duration)
+
+ print(f" {resource_type.upper()}:")
+ print(f" - Count: {len(timings)}")
+ print(f" - Avg time: {avg_duration:.2f} ms")
+ print(f" - Max time: {max_duration:.2f} ms")
+ print(f" - Slowest: {slowest_resource}")
+
+ # Identify the slowest resources overall
+ all_timings = []
+ for resource_type, timings in resource_timings.items():
+ for timing in timings:
+ timing["type"] = resource_type
+ all_timings.append(timing)
+
+ all_timings.sort(key=lambda x: x["duration_ms"], reverse=True)
+
+ print("\nTop 5 slowest resources:")
+ for i, timing in enumerate(all_timings[:5], 1):
+ print(f" {i}. [{timing['type']}] {timing['url']} - {timing['duration_ms']:.2f} ms")
+
+ # Save performance analysis to file
+ output_file = os.path.join(__cur_dir__, "tmp", "performance_analysis.json")
+ with open(output_file, "w") as f:
+ json.dump({
+ "url": result.url,
+ "timestamp": datetime.now().isoformat(),
+ "resource_timings": resource_timings,
+ "slowest_resources": all_timings[:10] # Save top 10
+ }, f, indent=2)
+
+ print(f"\nFull performance analysis saved to {output_file}")
+
+async def main():
+ """Run all demo functions sequentially"""
+ print("=== Network and Console Capture Examples ===")
+
+ # Make sure tmp directory exists
+ os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True)
+
+ # Run basic examples
+ # await demo_basic_network_capture()
+ await demo_basic_console_capture()
+ # await demo_combined_capture()
+
+ # Run advanced examples
+ # await analyze_spa_network_traffic()
+ # await demo_security_analysis()
+ # await demo_performance_analysis()
+
+ print("\n=== Examples Complete ===")
+ print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}")
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/docs/examples/quickstart_async.config.py b/docs/examples/quickstart.py
similarity index 100%
rename from docs/examples/quickstart_async.config.py
rename to docs/examples/quickstart.py
diff --git a/docs/examples/quickstart_async.py b/docs/examples/quickstart_async.py
deleted file mode 100644
index aeb0d20a..00000000
--- a/docs/examples/quickstart_async.py
+++ /dev/null
@@ -1,675 +0,0 @@
-import os, sys
-
-from crawl4ai import LLMConfig
-
-# append parent directory to system path
-sys.path.append(
- os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-)
-os.environ["FIRECRAWL_API_KEY"] = "fc-84b370ccfad44beabc686b38f1769692"
-
-import asyncio
-# import nest_asyncio
-# nest_asyncio.apply()
-
-import time
-import json
-import os
-import re
-from typing import Dict, List
-from bs4 import BeautifulSoup
-from pydantic import BaseModel, Field
-from crawl4ai import AsyncWebCrawler, CacheMode
-from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
-from crawl4ai.content_filter_strategy import PruningContentFilter
-from crawl4ai.extraction_strategy import (
- JsonCssExtractionStrategy,
- LLMExtractionStrategy,
-)
-
-__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
-
-print("Crawl4AI: Advanced Web Crawling and Data Extraction")
-print("GitHub Repository: https://github.com/unclecode/crawl4ai")
-print("Twitter: @unclecode")
-print("Website: https://crawl4ai.com")
-
-
-async def simple_crawl():
- print("\n--- Basic Usage ---")
- async with AsyncWebCrawler(verbose=True) as crawler:
- result = await crawler.arun(
- url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS
- )
- print(result.markdown[:500]) # Print first 500 characters
-
-
-async def simple_example_with_running_js_code():
- print("\n--- Executing JavaScript and Using CSS Selectors ---")
- # New code to handle the wait_for parameter
- wait_for = """() => {
- return Array.from(document.querySelectorAll('article.tease-card')).length > 10;
- }"""
-
- # wait_for can be also just a css selector
- # wait_for = "article.tease-card:nth-child(10)"
-
- async with AsyncWebCrawler(verbose=True) as crawler:
- js_code = [
- "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
- ]
- result = await crawler.arun(
- url="https://www.nbcnews.com/business",
- js_code=js_code,
- # wait_for=wait_for,
- cache_mode=CacheMode.BYPASS,
- )
- print(result.markdown[:500]) # Print first 500 characters
-
-
-async def simple_example_with_css_selector():
- print("\n--- Using CSS Selectors ---")
- async with AsyncWebCrawler(verbose=True) as crawler:
- result = await crawler.arun(
- url="https://www.nbcnews.com/business",
- css_selector=".wide-tease-item__description",
- cache_mode=CacheMode.BYPASS,
- )
- print(result.markdown[:500]) # Print first 500 characters
-
-
-async def use_proxy():
- print("\n--- Using a Proxy ---")
- print(
- "Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example."
- )
- # Uncomment and modify the following lines to use a proxy
- async with AsyncWebCrawler(
- verbose=True, proxy="http://your-proxy-url:port"
- ) as crawler:
- result = await crawler.arun(
- url="https://www.nbcnews.com/business", cache_mode=CacheMode.BYPASS
- )
- if result.success:
- print(result.markdown[:500]) # Print first 500 characters
-
-
-async def capture_and_save_screenshot(url: str, output_path: str):
- async with AsyncWebCrawler(verbose=True) as crawler:
- result = await crawler.arun(
- url=url, screenshot=True, cache_mode=CacheMode.BYPASS
- )
-
- if result.success and result.screenshot:
- import base64
-
- # Decode the base64 screenshot data
- screenshot_data = base64.b64decode(result.screenshot)
-
- # Save the screenshot as a JPEG file
- with open(output_path, "wb") as f:
- f.write(screenshot_data)
-
- print(f"Screenshot saved successfully to {output_path}")
- else:
- print("Failed to capture screenshot")
-
-
-class OpenAIModelFee(BaseModel):
- model_name: str = Field(..., description="Name of the OpenAI model.")
- input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
- output_fee: str = Field(
- ..., description="Fee for output token for the OpenAI model."
- )
-
-
-async def extract_structured_data_using_llm(
- provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
-):
- print(f"\n--- Extracting Structured Data with {provider} ---")
-
- if api_token is None and provider != "ollama":
- print(f"API token is required for {provider}. Skipping this example.")
- return
-
- # extra_args = {}
- extra_args = {
- "temperature": 0,
- "top_p": 0.9,
- "max_tokens": 2000,
- # any other supported parameters for litellm
- }
- if extra_headers:
- extra_args["extra_headers"] = extra_headers
-
- async with AsyncWebCrawler(verbose=True) as crawler:
- result = await crawler.arun(
- url="https://openai.com/api/pricing/",
- word_count_threshold=1,
- extraction_strategy=LLMExtractionStrategy(
- llm_config=LLMConfig(provider=provider,api_token=api_token),
- schema=OpenAIModelFee.model_json_schema(),
- extraction_type="schema",
- instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
- Do not miss any models in the entire content. One extracted model JSON format should look like this:
- {"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
- extra_args=extra_args,
- ),
- cache_mode=CacheMode.BYPASS,
- )
- print(result.extracted_content)
-
-
-async def extract_structured_data_using_css_extractor():
- print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
- schema = {
- "name": "KidoCode Courses",
- "baseSelector": "section.charge-methodology .w-tab-content > div",
- "fields": [
- {
- "name": "section_title",
- "selector": "h3.heading-50",
- "type": "text",
- },
- {
- "name": "section_description",
- "selector": ".charge-content",
- "type": "text",
- },
- {
- "name": "course_name",
- "selector": ".text-block-93",
- "type": "text",
- },
- {
- "name": "course_description",
- "selector": ".course-content-text",
- "type": "text",
- },
- {
- "name": "course_icon",
- "selector": ".image-92",
- "type": "attribute",
- "attribute": "src",
- },
- ],
- }
-
- async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
- # Create the JavaScript that handles clicking multiple times
- js_click_tabs = """
- (async () => {
- const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
-
- for(let tab of tabs) {
- // scroll to the tab
- tab.scrollIntoView();
- tab.click();
- // Wait for content to load and animations to complete
- await new Promise(r => setTimeout(r, 500));
- }
- })();
- """
-
- result = await crawler.arun(
- url="https://www.kidocode.com/degrees/technology",
- extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
- js_code=[js_click_tabs],
- cache_mode=CacheMode.BYPASS,
- )
-
- companies = json.loads(result.extracted_content)
- print(f"Successfully extracted {len(companies)} companies")
- print(json.dumps(companies[0], indent=2))
-
-
-# Advanced Session-Based Crawling with Dynamic Content 🔄
-async def crawl_dynamic_content_pages_method_1():
- print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
- first_commit = ""
-
- async def on_execution_started(page):
- nonlocal first_commit
- try:
- while True:
- await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
- commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
- commit = await commit.evaluate("(element) => element.textContent")
- commit = re.sub(r"\s+", "", commit)
- if commit and commit != first_commit:
- first_commit = commit
- break
- await asyncio.sleep(0.5)
- except Exception as e:
- print(f"Warning: New content didn't appear after JavaScript execution: {e}")
-
- async with AsyncWebCrawler(verbose=True) as crawler:
- crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
-
- url = "https://github.com/microsoft/TypeScript/commits/main"
- session_id = "typescript_commits_session"
- all_commits = []
-
- js_next_page = """
- (() => {
- const button = document.querySelector('a[data-testid="pagination-next-button"]');
- if (button) button.click();
- })();
- """
-
- for page in range(3): # Crawl 3 pages
- result = await crawler.arun(
- url=url,
- session_id=session_id,
- css_selector="li.Box-sc-g0xbh4-0",
- js=js_next_page if page > 0 else None,
- cache_mode=CacheMode.BYPASS,
- js_only=page > 0,
- headless=False,
- )
-
- assert result.success, f"Failed to crawl page {page + 1}"
-
- soup = BeautifulSoup(result.cleaned_html, "html.parser")
- commits = soup.select("li")
- all_commits.extend(commits)
-
- print(f"Page {page + 1}: Found {len(commits)} commits")
-
- await crawler.crawler_strategy.kill_session(session_id)
- print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
-
-
-async def crawl_dynamic_content_pages_method_2():
- print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
-
- async with AsyncWebCrawler(verbose=True) as crawler:
- url = "https://github.com/microsoft/TypeScript/commits/main"
- session_id = "typescript_commits_session"
- all_commits = []
- last_commit = ""
-
- js_next_page_and_wait = """
- (async () => {
- const getCurrentCommit = () => {
- const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
- return commits.length > 0 ? commits[0].textContent.trim() : null;
- };
-
- const initialCommit = getCurrentCommit();
- const button = document.querySelector('a[data-testid="pagination-next-button"]');
- if (button) button.click();
-
- // Poll for changes
- while (true) {
- await new Promise(resolve => setTimeout(resolve, 100)); // Wait 100ms
- const newCommit = getCurrentCommit();
- if (newCommit && newCommit !== initialCommit) {
- break;
- }
- }
- })();
- """
-
- schema = {
- "name": "Commit Extractor",
- "baseSelector": "li.Box-sc-g0xbh4-0",
- "fields": [
- {
- "name": "title",
- "selector": "h4.markdown-title",
- "type": "text",
- "transform": "strip",
- },
- ],
- }
- extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
-
- for page in range(3): # Crawl 3 pages
- result = await crawler.arun(
- url=url,
- session_id=session_id,
- css_selector="li.Box-sc-g0xbh4-0",
- extraction_strategy=extraction_strategy,
- js_code=js_next_page_and_wait if page > 0 else None,
- js_only=page > 0,
- cache_mode=CacheMode.BYPASS,
- headless=False,
- )
-
- assert result.success, f"Failed to crawl page {page + 1}"
-
- commits = json.loads(result.extracted_content)
- all_commits.extend(commits)
-
- print(f"Page {page + 1}: Found {len(commits)} commits")
-
- await crawler.crawler_strategy.kill_session(session_id)
- print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
-
-
-async def crawl_dynamic_content_pages_method_3():
- print(
- "\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---"
- )
-
- async with AsyncWebCrawler(verbose=True) as crawler:
- url = "https://github.com/microsoft/TypeScript/commits/main"
- session_id = "typescript_commits_session"
- all_commits = []
-
- js_next_page = """
- const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
- if (commits.length > 0) {
- window.firstCommit = commits[0].textContent.trim();
- }
- const button = document.querySelector('a[data-testid="pagination-next-button"]');
- if (button) button.click();
- """
-
- wait_for = """() => {
- const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
- if (commits.length === 0) return false;
- const firstCommit = commits[0].textContent.trim();
- return firstCommit !== window.firstCommit;
- }"""
-
- schema = {
- "name": "Commit Extractor",
- "baseSelector": "li.Box-sc-g0xbh4-0",
- "fields": [
- {
- "name": "title",
- "selector": "h4.markdown-title",
- "type": "text",
- "transform": "strip",
- },
- ],
- }
- extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
-
- for page in range(3): # Crawl 3 pages
- result = await crawler.arun(
- url=url,
- session_id=session_id,
- css_selector="li.Box-sc-g0xbh4-0",
- extraction_strategy=extraction_strategy,
- js_code=js_next_page if page > 0 else None,
- wait_for=wait_for if page > 0 else None,
- js_only=page > 0,
- cache_mode=CacheMode.BYPASS,
- headless=False,
- )
-
- assert result.success, f"Failed to crawl page {page + 1}"
-
- commits = json.loads(result.extracted_content)
- all_commits.extend(commits)
-
- print(f"Page {page + 1}: Found {len(commits)} commits")
-
- await crawler.crawler_strategy.kill_session(session_id)
- print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
-
-
-async def crawl_custom_browser_type():
- # Use Firefox
- start = time.time()
- async with AsyncWebCrawler(
- browser_type="firefox", verbose=True, headless=True
- ) as crawler:
- result = await crawler.arun(
- url="https://www.example.com", cache_mode=CacheMode.BYPASS
- )
- print(result.markdown[:500])
- print("Time taken: ", time.time() - start)
-
- # Use WebKit
- start = time.time()
- async with AsyncWebCrawler(
- browser_type="webkit", verbose=True, headless=True
- ) as crawler:
- result = await crawler.arun(
- url="https://www.example.com", cache_mode=CacheMode.BYPASS
- )
- print(result.markdown[:500])
- print("Time taken: ", time.time() - start)
-
- # Use Chromium (default)
- start = time.time()
- async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
- result = await crawler.arun(
- url="https://www.example.com", cache_mode=CacheMode.BYPASS
- )
- print(result.markdown[:500])
- print("Time taken: ", time.time() - start)
-
-
-async def crawl_with_user_simultion():
- async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
- url = "YOUR-URL-HERE"
- result = await crawler.arun(
- url=url,
- cache_mode=CacheMode.BYPASS,
- magic=True, # Automatically detects and removes overlays, popups, and other elements that block content
- # simulate_user = True,# Causes a series of random mouse movements and clicks to simulate user interaction
- # override_navigator = True # Overrides the navigator object to make it look like a real user
- )
-
- print(result.markdown)
-
-
-async def speed_comparison():
- # print("\n--- Speed Comparison ---")
- # print("Firecrawl (simulated):")
- # print("Time taken: 7.02 seconds")
- # print("Content length: 42074 characters")
- # print("Images found: 49")
- # print()
- # Simulated Firecrawl performance
- from firecrawl import FirecrawlApp
-
- app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
- start = time.time()
- scrape_status = app.scrape_url(
- "https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
- )
- end = time.time()
- print("Firecrawl:")
- print(f"Time taken: {end - start:.2f} seconds")
- print(f"Content length: {len(scrape_status['markdown'])} characters")
- print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
- print()
-
- async with AsyncWebCrawler() as crawler:
- # Crawl4AI simple crawl
- start = time.time()
- result = await crawler.arun(
- url="https://www.nbcnews.com/business",
- word_count_threshold=0,
- cache_mode=CacheMode.BYPASS,
- verbose=False,
- )
- end = time.time()
- print("Crawl4AI (simple crawl):")
- print(f"Time taken: {end - start:.2f} seconds")
- print(f"Content length: {len(result.markdown)} characters")
- print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
- print()
-
- # Crawl4AI with advanced content filtering
- start = time.time()
- result = await crawler.arun(
- url="https://www.nbcnews.com/business",
- word_count_threshold=0,
- markdown_generator=DefaultMarkdownGenerator(
- content_filter=PruningContentFilter(
- threshold=0.48, threshold_type="fixed", min_word_threshold=0
- )
- # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
- ),
- cache_mode=CacheMode.BYPASS,
- verbose=False,
- )
- end = time.time()
- print("Crawl4AI (Markdown Plus):")
- print(f"Time taken: {end - start:.2f} seconds")
- print(f"Content length: {len(result.markdown.raw_markdown)} characters")
- print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
- print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
- print()
-
- # Crawl4AI with JavaScript execution
- start = time.time()
- result = await crawler.arun(
- url="https://www.nbcnews.com/business",
- js_code=[
- "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
- ],
- word_count_threshold=0,
- cache_mode=CacheMode.BYPASS,
- markdown_generator=DefaultMarkdownGenerator(
- content_filter=PruningContentFilter(
- threshold=0.48, threshold_type="fixed", min_word_threshold=0
- )
- # content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
- ),
- verbose=False,
- )
- end = time.time()
- print("Crawl4AI (with JavaScript execution):")
- print(f"Time taken: {end - start:.2f} seconds")
- print(f"Content length: {len(result.markdown.raw_markdown)} characters")
- print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
- print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
-
- print("\nNote on Speed Comparison:")
- print("The speed test conducted here may not reflect optimal conditions.")
- print("When we call Firecrawl's API, we're seeing its best performance,")
- print("while Crawl4AI's performance is limited by the local network speed.")
- print("For a more accurate comparison, it's recommended to run these tests")
- print("on servers with a stable and fast internet connection.")
- print("Despite these limitations, Crawl4AI still demonstrates faster performance.")
- print("If you run these tests in an environment with better network conditions,")
- print("you may observe an even more significant speed advantage for Crawl4AI.")
-
-
-async def generate_knowledge_graph():
- class Entity(BaseModel):
- name: str
- description: str
-
- class Relationship(BaseModel):
- entity1: Entity
- entity2: Entity
- description: str
- relation_type: str
-
- class KnowledgeGraph(BaseModel):
- entities: List[Entity]
- relationships: List[Relationship]
-
- extraction_strategy = LLMExtractionStrategy(
- llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token"
- schema=KnowledgeGraph.model_json_schema(),
- extraction_type="schema",
- instruction="""Extract entities and relationships from the given text.""",
- )
- async with AsyncWebCrawler() as crawler:
- url = "https://paulgraham.com/love.html"
- result = await crawler.arun(
- url=url,
- cache_mode=CacheMode.BYPASS,
- extraction_strategy=extraction_strategy,
- # magic=True
- )
- # print(result.extracted_content)
- with open(os.path.join(__location__, "kb.json"), "w") as f:
- f.write(result.extracted_content)
-
-
-async def fit_markdown_remove_overlay():
- async with AsyncWebCrawler(
- headless=True, # Set to False to see what is happening
- verbose=True,
- user_agent_mode="random",
- user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
- ) as crawler:
- result = await crawler.arun(
- url="https://www.kidocode.com/degrees/technology",
- cache_mode=CacheMode.BYPASS,
- markdown_generator=DefaultMarkdownGenerator(
- content_filter=PruningContentFilter(
- threshold=0.48, threshold_type="fixed", min_word_threshold=0
- ),
- options={"ignore_links": True},
- ),
- # markdown_generator=DefaultMarkdownGenerator(
- # content_filter=BM25ContentFilter(user_query="", bm25_threshold=1.0),
- # options={
- # "ignore_links": True
- # }
- # ),
- )
-
- if result.success:
- print(len(result.markdown.raw_markdown))
- print(len(result.markdown.markdown_with_citations))
- print(len(result.markdown.fit_markdown))
-
- # Save clean html
- with open(os.path.join(__location__, "output/cleaned_html.html"), "w") as f:
- f.write(result.cleaned_html)
-
- with open(
- os.path.join(__location__, "output/output_raw_markdown.md"), "w"
- ) as f:
- f.write(result.markdown.raw_markdown)
-
- with open(
- os.path.join(__location__, "output/output_markdown_with_citations.md"),
- "w",
- ) as f:
- f.write(result.markdown.markdown_with_citations)
-
- with open(
- os.path.join(__location__, "output/output_fit_markdown.md"), "w"
- ) as f:
- f.write(result.markdown.fit_markdown)
-
- print("Done")
-
-
-async def main():
- # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
-
- # await simple_crawl()
- # await simple_example_with_running_js_code()
- # await simple_example_with_css_selector()
- # # await use_proxy()
- # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
- # await extract_structured_data_using_css_extractor()
-
- # LLM extraction examples
- # await extract_structured_data_using_llm()
- # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
- # await extract_structured_data_using_llm("ollama/llama3.2")
-
- # You always can pass custom headers to the extraction strategy
- # custom_headers = {
- # "Authorization": "Bearer your-custom-token",
- # "X-Custom-Header": "Some-Value"
- # }
- # await extract_structured_data_using_llm(extra_headers=custom_headers)
-
- # await crawl_dynamic_content_pages_method_1()
- # await crawl_dynamic_content_pages_method_2()
- await crawl_dynamic_content_pages_method_3()
-
- # await crawl_custom_browser_type()
-
- # await speed_comparison()
-
-
-if __name__ == "__main__":
- asyncio.run(main())
diff --git a/docs/examples/quickstart_examples_set_1.py b/docs/examples/quickstart_examples_set_1.py
new file mode 100644
index 00000000..078d1c4a
--- /dev/null
+++ b/docs/examples/quickstart_examples_set_1.py
@@ -0,0 +1,412 @@
+import asyncio
+import os
+import json
+import base64
+from pathlib import Path
+from typing import List
+from crawl4ai import ProxyConfig
+
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
+from crawl4ai import RoundRobinProxyStrategy
+from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
+from crawl4ai import LLMConfig
+from crawl4ai import PruningContentFilter, BM25ContentFilter
+from crawl4ai import DefaultMarkdownGenerator
+from crawl4ai import BFSDeepCrawlStrategy, DomainFilter, FilterChain
+from crawl4ai import BrowserConfig
+
+__cur_dir__ = Path(__file__).parent
+
+async def demo_basic_crawl():
+ """Basic web crawling with markdown generation"""
+ print("\n=== 1. Basic Web Crawling ===")
+ async with AsyncWebCrawler(config=BrowserConfig(
+ viewport_height=800,
+ viewport_width=1200,
+ headless=True,
+ verbose=True,
+ )) as crawler:
+ results: List[CrawlResult] = await crawler.arun(
+ url="https://news.ycombinator.com/"
+ )
+
+ for i, result in enumerate(results):
+ print(f"Result {i + 1}:")
+ print(f"Success: {result.success}")
+ if result.success:
+ print(f"Markdown length: {len(result.markdown.raw_markdown)} chars")
+ print(f"First 100 chars: {result.markdown.raw_markdown[:100]}...")
+ else:
+ print("Failed to crawl the URL")
+
+async def demo_parallel_crawl():
+ """Crawl multiple URLs in parallel"""
+ print("\n=== 2. Parallel Crawling ===")
+
+ urls = [
+ "https://news.ycombinator.com/",
+ "https://example.com/",
+ "https://httpbin.org/html",
+ ]
+
+ async with AsyncWebCrawler() as crawler:
+ results: List[CrawlResult] = await crawler.arun_many(
+ urls=urls,
+ )
+
+ print(f"Crawled {len(results)} URLs in parallel:")
+ for i, result in enumerate(results):
+ print(
+ f" {i + 1}. {result.url} - {'Success' if result.success else 'Failed'}"
+ )
+
+async def demo_fit_markdown():
+ """Generate focused markdown with LLM content filter"""
+ print("\n=== 3. Fit Markdown with LLM Content Filter ===")
+
+ async with AsyncWebCrawler() as crawler:
+ result: CrawlResult = await crawler.arun(
+ url = "https://en.wikipedia.org/wiki/Python_(programming_language)",
+ config=CrawlerRunConfig(
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter()
+ )
+ ),
+ )
+
+ # Print stats and save the fit markdown
+ print(f"Raw: {len(result.markdown.raw_markdown)} chars")
+ print(f"Fit: {len(result.markdown.fit_markdown)} chars")
+
+async def demo_llm_structured_extraction_no_schema():
+ """Extract structured data with an LLM using a lightweight inline schema"""
+ print("\n=== 4. LLM Structured Extraction ===")
+ # Create a simple LLM extraction strategy (no Pydantic model required)
+ extraction_strategy = LLMExtractionStrategy(
+ llm_config=LLMConfig(
+ provider="groq/qwen-2.5-32b",
+ api_token="env:GROQ_API_KEY",
+ ),
+ instruction="This is news.ycombinator.com, extract all news, and for each, I want title, source url, number of comments.",
+ extract_type="schema",
+ schema="{title: string, url: string, comments: int}",
+ extra_args={
+ "temperature": 0.0,
+ "max_tokens": 4096,
+ },
+ verbose=True,
+ )
+
+ config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
+
+ async with AsyncWebCrawler() as crawler:
+ results: List[CrawlResult] = await crawler.arun(
+ "https://news.ycombinator.com/", config=config
+ )
+
+ for result in results:
+ print(f"URL: {result.url}")
+ print(f"Success: {result.success}")
+ if result.success:
+ data = json.loads(result.extracted_content)
+ print(json.dumps(data, indent=2))
+ else:
+ print("Failed to extract structured data")
+
+async def demo_css_structured_extraction_no_schema():
+ """Extract structured data using CSS selectors"""
+ print("\n=== 5. CSS-Based Structured Extraction ===")
+ # Sample HTML for schema generation (one-time cost)
+ sample_html = """
+
+ """
+
+ # Check if schema file exists
+ schema_file_path = f"{__cur_dir__}/tmp/schema.json"
+ if os.path.exists(schema_file_path):
+ with open(schema_file_path, "r") as f:
+ schema = json.load(f)
+ else:
+ # Generate schema using LLM (one-time setup)
+ schema = JsonCssExtractionStrategy.generate_schema(
+ html=sample_html,
+ llm_config=LLMConfig(
+ provider="groq/qwen-2.5-32b",
+ api_token="env:GROQ_API_KEY",
+ ),
+ query="From https://thehackernews.com/, I have shared a sample of one news div with a title, date, and description. Please generate a schema for this news div.",
+ )
+
+ print(f"Generated schema: {json.dumps(schema, indent=2)}")
+ # Save the schema to a file and reuse it for future extractions; the LLM
+ # is only called once, when the schema is first generated
+ os.makedirs(f"{__cur_dir__}/tmp", exist_ok=True)
+ with open(schema_file_path, "w") as f:
+ json.dump(schema, f, indent=2)
+
+ # Create no-LLM extraction strategy with the generated schema
+ extraction_strategy = JsonCssExtractionStrategy(schema)
+ config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
+
+ # Use the fast CSS extraction (no LLM calls during extraction)
+ async with AsyncWebCrawler() as crawler:
+ results: List[CrawlResult] = await crawler.arun(
+ "https://thehackernews.com", config=config
+ )
+
+ for result in results:
+ print(f"URL: {result.url}")
+ print(f"Success: {result.success}")
+ if result.success:
+ data = json.loads(result.extracted_content)
+ print(json.dumps(data, indent=2))
+ else:
+ print("Failed to extract structured data")
+
+async def demo_deep_crawl():
+ """Deep crawling with BFS strategy"""
+ print("\n=== 6. Deep Crawling ===")
+
+ filter_chain = FilterChain([DomainFilter(allowed_domains=["crawl4ai.com"])])
+
+ deep_crawl_strategy = BFSDeepCrawlStrategy(
+ max_depth=1, max_pages=5, filter_chain=filter_chain
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ results: List[CrawlResult] = await crawler.arun(
+ url="https://docs.crawl4ai.com",
+ config=CrawlerRunConfig(deep_crawl_strategy=deep_crawl_strategy),
+ )
+
+ print(f"Deep crawl returned {len(results)} pages:")
+ for i, result in enumerate(results):
+ depth = result.metadata.get("depth", "unknown")
+ print(f" {i + 1}. {result.url} (Depth: {depth})")
+
+async def demo_js_interaction():
+ """Execute JavaScript to load more content"""
+ print("\n=== 7. JavaScript Interaction ===")
+
+ # Hacker News: click the "More" link via JS to load the next page in the same session
+ async with AsyncWebCrawler(config=BrowserConfig(headless=False)) as crawler:
+ # Initial load
+
+ news_schema = {
+ "name": "news",
+ "baseSelector": "tr.athing",
+ "fields": [
+ {
+ "name": "title",
+ "selector": "span.titleline",
+ "type": "text",
+ }
+ ],
+ }
+ results: List[CrawlResult] = await crawler.arun(
+ url="https://news.ycombinator.com",
+ config=CrawlerRunConfig(
+ session_id="hn_session", # Keep session
+ extraction_strategy=JsonCssExtractionStrategy(schema=news_schema),
+ ),
+ )
+
+ news = []
+ for result in results:
+ if result.success:
+ data = json.loads(result.extracted_content)
+ news.extend(data)
+ print(json.dumps(data, indent=2))
+ else:
+ print("Failed to extract structured data")
+
+ print(f"Initial items: {len(news)}")
+
+ # Click "More" link
+ more_config = CrawlerRunConfig(
+ js_code="document.querySelector('a.morelink').click();",
+ js_only=True, # Continue in same page
+ session_id="hn_session", # Keep session
+ extraction_strategy=JsonCssExtractionStrategy(
+ schema=news_schema,
+ ),
+ )
+
+ results: List[CrawlResult] = await crawler.arun(
+ url="https://news.ycombinator.com", config=more_config
+ )
+
+ # Extract the newly loaded items
+ for result in results:
+ if result.success:
+ data = json.loads(result.extracted_content)
+ news.extend(data)
+ print(json.dumps(data, indent=2))
+ else:
+ print("Failed to extract structured data")
+ print(f"Total items: {len(news)}")
+
+async def demo_media_and_links():
+ """Extract media and links from a page"""
+ print("\n=== 8. Media and Links Extraction ===")
+
+ async with AsyncWebCrawler() as crawler:
+ results: List[CrawlResult] = await crawler.arun("https://en.wikipedia.org/wiki/Main_Page")
+
+ for result in results:
+ # Extract and save all images
+ images = result.media.get("images", [])
+ print(f"Found {len(images)} images")
+
+ # Extract and save all links (internal and external)
+ internal_links = result.links.get("internal", [])
+ external_links = result.links.get("external", [])
+ print(f"Found {len(internal_links)} internal links")
+ print(f"Found {len(external_links)} external links")
+
+ # Print some of the images and links
+ for image in images[:3]:
+ print(f"Image: {image['src']}")
+ for link in internal_links[:3]:
+ print(f"Internal link: {link['href']}")
+ for link in external_links[:3]:
+ print(f"External link: {link['href']}")
+
+ # Save everything to files (ensure tmp/ exists first)
+ os.makedirs(f"{__cur_dir__}/tmp", exist_ok=True)
+ with open(f"{__cur_dir__}/tmp/images.json", "w") as f:
+ json.dump(images, f, indent=2)
+
+ with open(f"{__cur_dir__}/tmp/links.json", "w") as f:
+ json.dump(
+ {"internal": internal_links, "external": external_links},
+ f,
+ indent=2,
+ )
+
+async def demo_screenshot_and_pdf():
+ """Capture screenshot and PDF of a page"""
+ print("\n=== 9. Screenshot and PDF Capture ===")
+
+ async with AsyncWebCrawler() as crawler:
+ results: List[CrawlResult] = await crawler.arun(
+ # url="https://example.com",
+ url="https://en.wikipedia.org/wiki/Giant_anteater",
+ config=CrawlerRunConfig(screenshot=True, pdf=True),
+ )
+
+ os.makedirs(f"{__cur_dir__}/tmp", exist_ok=True)
+ for result in results:
+ if result.screenshot:
+ # Save screenshot
+ screenshot_path = f"{__cur_dir__}/tmp/example_screenshot.png"
+ with open(screenshot_path, "wb") as f:
+ f.write(base64.b64decode(result.screenshot))
+ print(f"Screenshot saved to {screenshot_path}")
+
+ if result.pdf:
+ # Save PDF
+ pdf_path = f"{__cur_dir__}/tmp/example.pdf"
+ with open(pdf_path, "wb") as f:
+ f.write(result.pdf)
+ print(f"PDF saved to {pdf_path}")
+
+async def demo_proxy_rotation():
+ """Proxy rotation for multiple requests"""
+ print("\n=== 10. Proxy Rotation ===")
+
+ # Example proxies (replace with real ones)
+ proxies = [
+ ProxyConfig(server="http://proxy1.example.com:8080"),
+ ProxyConfig(server="http://proxy2.example.com:8080"),
+ ]
+
+ proxy_strategy = RoundRobinProxyStrategy(proxies)
+
+ print(f"Using {len(proxies)} proxies in rotation")
+ print(
+ "Note: This example uses placeholder proxies - replace with real ones to test"
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ config = CrawlerRunConfig(
+ proxy_rotation_strategy=proxy_strategy
+ )
+
+ # In a real scenario, these would be run and the proxies would rotate
+ print("In a real scenario, requests would rotate through the available proxies")
+
+async def demo_raw_html_and_file():
+ """Process raw HTML and local files"""
+ print("\n=== 11. Raw HTML and Local Files ===")
+
+ raw_html = """
+
+ Sample Article
+ This is sample content for testing Crawl4AI's raw HTML processing.
+
+ """
+
+ # Save to a local file (ensure the tmp/ directory exists first)
+ file_path = Path(f"{__cur_dir__}/tmp/sample.html").absolute()
+ file_path.parent.mkdir(parents=True, exist_ok=True)
+ with open(file_path, "w") as f:
+ f.write(raw_html)
+
+ async with AsyncWebCrawler() as crawler:
+ # Crawl raw HTML
+ raw_result = await crawler.arun(
+ url="raw:" + raw_html, config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+ )
+ print("Raw HTML processing:")
+ print(f" Markdown: {raw_result.markdown.raw_markdown[:50]}...")
+
+ # Crawl local file
+ file_result = await crawler.arun(
+ url=f"file://{file_path}",
+ config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+ )
+ print("\nLocal file processing:")
+ print(f" Markdown: {file_result.markdown.raw_markdown[:50]}...")
+
+ # Clean up
+ os.remove(file_path)
+ print(f"Processed both raw HTML and local file ({file_path})")
+
+async def main():
+ """Run all demo functions sequentially"""
+ print("=== Comprehensive Crawl4AI Demo ===")
+ print("Note: Some examples require API keys or other configurations")
+
+ # Run all demos
+ await demo_basic_crawl()
+ await demo_parallel_crawl()
+ await demo_fit_markdown()
+ await demo_llm_structured_extraction_no_schema()
+ await demo_css_structured_extraction_no_schema()
+ await demo_deep_crawl()
+ await demo_js_interaction()
+ await demo_media_and_links()
+ await demo_screenshot_and_pdf()
+ # await demo_proxy_rotation()  # skipped by default: requires real proxies
+ await demo_raw_html_and_file()
+
+ print("\n=== Demo Complete ===")
+ print("Check the tmp/ directory for generated files (screenshots, PDFs, JSON)")
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/examples/quickstart_examples_set_2.py b/docs/examples/quickstart_examples_set_2.py
new file mode 100644
index 00000000..3adbfc0d
--- /dev/null
+++ b/docs/examples/quickstart_examples_set_2.py
@@ -0,0 +1,562 @@
+import os, sys
+
+from crawl4ai.types import LLMConfig
+
+sys.path.append(
+ os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+)
+
+import asyncio
+import time
+import json
+import re
+from typing import Dict
+from bs4 import BeautifulSoup
+from pydantic import BaseModel, Field
+from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai.content_filter_strategy import PruningContentFilter
+from crawl4ai.extraction_strategy import (
+ JsonCssExtractionStrategy,
+ LLMExtractionStrategy,
+)
+
+__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
+
+print("Crawl4AI: Advanced Web Crawling and Data Extraction")
+print("GitHub Repository: https://github.com/unclecode/crawl4ai")
+print("Twitter: @unclecode")
+print("Website: https://crawl4ai.com")
+
+
+# Basic Example - Simple Crawl
+async def simple_crawl():
+ print("\n--- Basic Usage ---")
+ browser_config = BrowserConfig(headless=True)
+ crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business", config=crawler_config
+ )
+ print(result.markdown[:500])
+
+
+async def clean_content():
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ excluded_tags=["nav", "footer", "aside"],
+ remove_overlay_elements=True,
+ markdown_generator=DefaultMarkdownGenerator(
+ content_filter=PruningContentFilter(
+ threshold=0.48, threshold_type="fixed", min_word_threshold=0
+ ),
+ options={"ignore_links": True},
+ ),
+ )
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://en.wikipedia.org/wiki/Apple",
+ config=crawler_config,
+ )
+ full_markdown_length = len(result.markdown.raw_markdown)
+ fit_markdown_length = len(result.markdown.fit_markdown)
+ print(f"Full Markdown Length: {full_markdown_length}")
+ print(f"Fit Markdown Length: {fit_markdown_length}")
+
+
+async def link_analysis():
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.ENABLED,
+ exclude_external_links=True,
+ exclude_social_media_links=True,
+ )
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business",
+ config=crawler_config,
+ )
+ print(f"Found {len(result.links['internal'])} internal links")
+ print(f"Found {len(result.links['external'])} external links")
+
+ for link in result.links["internal"][:5]:
+ print(f"Href: {link['href']}\nText: {link['text']}\n")
+
+
+# JavaScript Execution Example
+async def simple_example_with_running_js_code():
+ print("\n--- Executing JavaScript and Using CSS Selectors ---")
+
+ browser_config = BrowserConfig(headless=True, java_script_enabled=True)
+
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ js_code="const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();",
+ # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }"
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business", config=crawler_config
+ )
+ print(result.markdown[:500])
+
+
+# CSS Selector Example
+async def simple_example_with_css_selector():
+ print("\n--- Using CSS Selectors ---")
+ browser_config = BrowserConfig(headless=True)
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS, css_selector=".wide-tease-item__description"
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business", config=crawler_config
+ )
+ print(result.markdown[:500])
+
+
+async def media_handling():
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS, exclude_external_images=True, screenshot=True
+ )
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business", config=crawler_config
+ )
+ for img in result.media["images"][:5]:
+ print(f"Image URL: {img['src']}, Alt: {img['alt']}, Score: {img['score']}")
+
+
+async def custom_hook_workflow(verbose=True):
+ async with AsyncWebCrawler() as crawler:
+ # Set a 'before_goto' hook to run custom code just before navigation
+ crawler.crawler_strategy.set_hook(
+ "before_goto",
+ lambda page, context: print("[Hook] Preparing to navigate..."),
+ )
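+
+ # A hedged sketch of an async hook that injects a header via Playwright's
+ # page API (assumes the hook receives the Playwright page object):
+ # async def before_goto(page, context, **kwargs):
+ #     await page.set_extra_http_headers({"X-Demo-Header": "1"})
+ #     return page
+ # crawler.crawler_strategy.set_hook("before_goto", before_goto)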
+
+ # Perform the crawl operation
+ result = await crawler.arun(url="https://crawl4ai.com")
+ print(result.markdown.raw_markdown[:500].replace("\n", " -- "))
+
+
+# Proxy Example
+async def use_proxy():
+ print("\n--- Using a Proxy ---")
+ browser_config = BrowserConfig(
+ headless=True,
+ proxy_config={
+ "server": "http://proxy.example.com:8080",
+ "username": "username",
+ "password": "password",
+ },
+ )
+ crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business", config=crawler_config
+ )
+ if result.success:
+ print(result.markdown[:500])
+
+
+# Screenshot Example
+async def capture_and_save_screenshot(url: str, output_path: str):
+ browser_config = BrowserConfig(headless=True)
+ crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True)
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url=url, config=crawler_config)
+
+ if result.success and result.screenshot:
+ import base64
+
+ screenshot_data = base64.b64decode(result.screenshot)
+ with open(output_path, "wb") as f:
+ f.write(screenshot_data)
+ print(f"Screenshot saved successfully to {output_path}")
+ else:
+ print("Failed to capture screenshot")
+
+
+# LLM Extraction Example
+class OpenAIModelFee(BaseModel):
+ model_name: str = Field(..., description="Name of the OpenAI model.")
+ input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
+ output_fee: str = Field(
+ ..., description="Fee for output token for the OpenAI model."
+ )
+
+
+async def extract_structured_data_using_llm(
+ provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
+):
+ print(f"\n--- Extracting Structured Data with {provider} ---")
+
+ if api_token is None and provider != "ollama":
+ print(f"API token is required for {provider}. Skipping this example.")
+ return
+
+ browser_config = BrowserConfig(headless=True)
+
+ extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
+ if extra_headers:
+ extra_args["extra_headers"] = extra_headers
+
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ word_count_threshold=1,
+ page_timeout=80000,
+ extraction_strategy=LLMExtractionStrategy(
+ llm_config=LLMConfig(provider=provider,api_token=api_token),
+ schema=OpenAIModelFee.model_json_schema(),
+ extraction_type="schema",
+ instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
+ Do not miss any models in the entire content.""",
+ extra_args=extra_args,
+ ),
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://openai.com/api/pricing/", config=crawler_config
+ )
+ print(result.extracted_content)
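+
+# Example invocations (provider strings taken from this repo's own examples):
+# asyncio.run(extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY")))
+# asyncio.run(extract_structured_data_using_llm("ollama/llama3.2"))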
+
+
+# CSS Extraction Example
+async def extract_structured_data_using_css_extractor():
+ print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
+ schema = {
+ "name": "KidoCode Courses",
+ "baseSelector": "section.charge-methodology .framework-collection-item.w-dyn-item",
+ "fields": [
+ {
+ "name": "section_title",
+ "selector": "h3.heading-50",
+ "type": "text",
+ },
+ {
+ "name": "section_description",
+ "selector": ".charge-content",
+ "type": "text",
+ },
+ {
+ "name": "course_name",
+ "selector": ".text-block-93",
+ "type": "text",
+ },
+ {
+ "name": "course_description",
+ "selector": ".course-content-text",
+ "type": "text",
+ },
+ {
+ "name": "course_icon",
+ "selector": ".image-92",
+ "type": "attribute",
+ "attribute": "src",
+ },
+ ],
+ }
+
+ browser_config = BrowserConfig(headless=True, java_script_enabled=True)
+
+ js_click_tabs = """
+ (async () => {
+ const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
+ for(let tab of tabs) {
+ tab.scrollIntoView();
+ tab.click();
+ await new Promise(r => setTimeout(r, 500));
+ }
+ })();
+ """
+
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ extraction_strategy=JsonCssExtractionStrategy(schema),
+ js_code=[js_click_tabs],
+ delay_before_return_html=1
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(
+ url="https://www.kidocode.com/degrees/technology", config=crawler_config
+ )
+
+ courses = json.loads(result.extracted_content)
+ print(f"Successfully extracted {len(courses)} course entries")
+ print(json.dumps(courses[0], indent=2))
+
+
+# Dynamic Content Examples - Method 1
+async def crawl_dynamic_content_pages_method_1():
+ print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
+ first_commit = ""
+
+ async def on_execution_started(page, **kwargs):
+ nonlocal first_commit
+ try:
+ while True:
+ await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
+ commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
+ commit = await commit.evaluate("(element) => element.textContent")
+ commit = re.sub(r"\s+", "", commit)
+ if commit and commit != first_commit:
+ first_commit = commit
+ break
+ await asyncio.sleep(0.5)
+ except Exception as e:
+ print(f"Warning: New content didn't appear after JavaScript execution: {e}")
+
+ browser_config = BrowserConfig(headless=False, java_script_enabled=True)
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
+
+ url = "https://github.com/microsoft/TypeScript/commits/main"
+ session_id = "typescript_commits_session"
+ all_commits = []
+
+ js_next_page = """
+ const button = document.querySelector('a[data-testid="pagination-next-button"]');
+ if (button) button.click();
+ """
+
+ for page in range(3):
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ css_selector="li.Box-sc-g0xbh4-0",
+ js_code=js_next_page if page > 0 else None,
+ js_only=page > 0,
+ session_id=session_id,
+ )
+
+ result = await crawler.arun(url=url, config=crawler_config)
+ assert result.success, f"Failed to crawl page {page + 1}"
+
+ soup = BeautifulSoup(result.cleaned_html, "html.parser")
+ commits = soup.select("li")
+ all_commits.extend(commits)
+
+ print(f"Page {page + 1}: Found {len(commits)} commits")
+
+ print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
+
+
+# Dynamic Content Examples - Method 2
+async def crawl_dynamic_content_pages_method_2():
+ print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
+
+ browser_config = BrowserConfig(headless=False, java_script_enabled=True)
+
+ js_next_page_and_wait = """
+ (async () => {
+ const getCurrentCommit = () => {
+ const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
+ return commits.length > 0 ? commits[0].textContent.trim() : null;
+ };
+
+ const initialCommit = getCurrentCommit();
+ const button = document.querySelector('a[data-testid="pagination-next-button"]');
+ if (button) button.click();
+
+ while (true) {
+ await new Promise(resolve => setTimeout(resolve, 100));
+ const newCommit = getCurrentCommit();
+ if (newCommit && newCommit !== initialCommit) {
+ break;
+ }
+ }
+ })();
+ """
+
+ schema = {
+ "name": "Commit Extractor",
+ "baseSelector": "li.Box-sc-g0xbh4-0",
+ "fields": [
+ {
+ "name": "title",
+ "selector": "h4.markdown-title",
+ "type": "text",
+ "transform": "strip",
+ },
+ ],
+ }
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ url = "https://github.com/microsoft/TypeScript/commits/main"
+ session_id = "typescript_commits_session"
+ all_commits = []
+
+ extraction_strategy = JsonCssExtractionStrategy(schema)
+
+ for page in range(3):
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ css_selector="li.Box-sc-g0xbh4-0",
+ extraction_strategy=extraction_strategy,
+ js_code=js_next_page_and_wait if page > 0 else None,
+ js_only=page > 0,
+ session_id=session_id,
+ )
+
+ result = await crawler.arun(url=url, config=crawler_config)
+ assert result.success, f"Failed to crawl page {page + 1}"
+
+ commits = json.loads(result.extracted_content)
+ all_commits.extend(commits)
+ print(f"Page {page + 1}: Found {len(commits)} commits")
+
+ print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
+
+
+async def cosine_similarity_extraction():
+ from crawl4ai.extraction_strategy import CosineStrategy
+ crawl_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ extraction_strategy=CosineStrategy(
+ word_count_threshold=10,
+ max_dist=0.2, # Maximum distance between two text chunks when clustering
+ linkage_method="ward", # Linkage method for hierarchical clustering (ward, complete, average, single)
+ top_k=3, # Number of top keywords to extract
+ sim_threshold=0.3, # Similarity threshold for clustering
+ semantic_filter="McDonald's economic impact, American consumer trends", # Keywords to filter the content semantically using embeddings
+ verbose=True,
+ ),
+ )
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156",
+ config=crawl_config,
+ )
+ print(json.loads(result.extracted_content)[:5])
+
+
+# Browser Comparison
+async def crawl_custom_browser_type():
+ print("\n--- Browser Comparison ---")
+
+ # Firefox
+ browser_config_firefox = BrowserConfig(browser_type="firefox", headless=True)
+ start = time.time()
+ async with AsyncWebCrawler(config=browser_config_firefox) as crawler:
+ result = await crawler.arun(
+ url="https://www.example.com",
+ config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+ )
+ print("Firefox:", time.time() - start)
+ print(result.markdown[:500])
+
+ # WebKit
+ browser_config_webkit = BrowserConfig(browser_type="webkit", headless=True)
+ start = time.time()
+ async with AsyncWebCrawler(config=browser_config_webkit) as crawler:
+ result = await crawler.arun(
+ url="https://www.example.com",
+ config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+ )
+ print("WebKit:", time.time() - start)
+ print(result.markdown[:500])
+
+ # Chromium (default)
+ browser_config_chromium = BrowserConfig(browser_type="chromium", headless=True)
+ start = time.time()
+ async with AsyncWebCrawler(config=browser_config_chromium) as crawler:
+ result = await crawler.arun(
+ url="https://www.example.com",
+ config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
+ )
+ print("Chromium:", time.time() - start)
+ print(result.markdown[:500])
+
+
+# Anti-Bot and User Simulation
+async def crawl_with_user_simulation():
+ browser_config = BrowserConfig(
+ headless=True,
+ user_agent_mode="random",
+ user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
+ )
+
+ crawler_config = CrawlerRunConfig(
+ cache_mode=CacheMode.BYPASS,
+ magic=True,
+ simulate_user=True,
+ override_navigator=True,
+ )
+
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="YOUR-URL-HERE", config=crawler_config)
+ print(result.markdown)
+
+
+async def ssl_certification():
+ # Configure crawler to fetch SSL certificate
+ config = CrawlerRunConfig(
+ fetch_ssl_certificate=True,
+ cache_mode=CacheMode.BYPASS, # Bypass cache to always get fresh certificates
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(url="https://example.com", config=config)
+
+ if result.success and result.ssl_certificate:
+ cert = result.ssl_certificate
+
+ tmp_dir = os.path.join(__location__, "tmp")
+ os.makedirs(tmp_dir, exist_ok=True)
+
+ # 1. Access certificate properties directly
+ print("\nCertificate Information:")
+ print(f"Issuer: {cert.issuer.get('CN', '')}")
+ print(f"Valid until: {cert.valid_until}")
+ print(f"Fingerprint: {cert.fingerprint}")
+
+ # 2. Export certificate in different formats
+ cert.to_json(os.path.join(tmp_dir, "certificate.json")) # For analysis
+ print("\nCertificate exported to:")
+ print(f"- JSON: {os.path.join(tmp_dir, 'certificate.json')}")
+
+ pem_data = cert.to_pem(
+ os.path.join(tmp_dir, "certificate.pem")
+ ) # For web servers
+ print(f"- PEM: {os.path.join(tmp_dir, 'certificate.pem')}")
+
+ der_data = cert.to_der(
+ os.path.join(tmp_dir, "certificate.der")
+ ) # For Java apps
+ print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
+
+
+# Main execution
+async def main():
+ # Basic examples
+ await simple_crawl()
+ await simple_example_with_running_js_code()
+ await simple_example_with_css_selector()
+
+ # Advanced examples
+ await extract_structured_data_using_css_extractor()
+ await extract_structured_data_using_llm(
+ "openai/gpt-4o", os.getenv("OPENAI_API_KEY")
+ )
+ await crawl_dynamic_content_pages_method_1()
+ await crawl_dynamic_content_pages_method_2()
+
+ # Browser comparisons
+ await crawl_custom_browser_type()
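+
+ # Additional examples defined above (uncomment to run):
+ # await clean_content()
+ # await link_analysis()
+ # await media_handling()
+ # await custom_hook_workflow()
+ # await use_proxy()  # requires a working proxy
+ # await cosine_similarity_extraction()
+ # await crawl_with_user_simulation()  # set a real URL first
+ # await ssl_certification()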
+
+ # Screenshot example
+ await capture_and_save_screenshot(
+ "https://www.example.com",
+ os.path.join(__location__, "tmp/example_screenshot.jpg")
+ )
+
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/docs/examples/quickstart_sync.py b/docs/examples/quickstart_sync.py
deleted file mode 100644
index 78f3e56c..00000000
--- a/docs/examples/quickstart_sync.py
+++ /dev/null
@@ -1,405 +0,0 @@
-import os
-import time
-from crawl4ai import LLMConfig
-from crawl4ai.web_crawler import WebCrawler
-from crawl4ai.chunking_strategy import *
-from crawl4ai.extraction_strategy import *
-from crawl4ai.crawler_strategy import *
-from rich import print
-from rich.console import Console
-from functools import lru_cache
-
-console = Console()
-
-
-@lru_cache()
-def create_crawler():
- crawler = WebCrawler(verbose=True)
- crawler.warmup()
- return crawler
-
-
-def print_result(result):
- # Print each key in one line and just the first 10 characters of each one's value and three dots
- console.print("\t[bold]Result:[/bold]")
- for key, value in result.model_dump().items():
- if isinstance(value, str) and value:
- console.print(f"\t{key}: [green]{value[:20]}...[/green]")
- if result.extracted_content:
- items = json.loads(result.extracted_content)
- print(f"\t[bold]{len(items)} blocks is extracted![/bold]")
-
-
-def cprint(message, press_any_key=False):
- console.print(message)
- if press_any_key:
- console.print("Press any key to continue...", style="")
- input()
-
-
-def basic_usage(crawler):
- cprint(
- "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
- )
- result = crawler.run(url="https://www.nbcnews.com/business", only_text=True)
- cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
- print_result(result)
-
-
-def basic_usage_some_params(crawler):
- cprint(
- "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business", word_count_threshold=1, only_text=True
- )
- cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
- print_result(result)
-
-
-def screenshot_usage(crawler):
- cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
- result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
- cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
- # Save the screenshot to a file
- with open("screenshot.png", "wb") as f:
- f.write(base64.b64decode(result.screenshot))
- cprint("Screenshot saved to 'screenshot.png'!")
- print_result(result)
-
-
-def understanding_parameters(crawler):
- cprint(
- "\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]"
- )
- cprint(
- "By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action."
- )
-
- # First crawl (reads from cache)
- cprint("1️⃣ First crawl (caches the result):", True)
- start_time = time.time()
- result = crawler.run(url="https://www.nbcnews.com/business")
- end_time = time.time()
- cprint(
- f"[LOG] 📦 [bold yellow]First crawl took {end_time - start_time} seconds and result (from cache):[/bold yellow]"
- )
- print_result(result)
-
- # Force to crawl again
- cprint("2️⃣ Second crawl (Force to crawl again):", True)
- start_time = time.time()
- result = crawler.run(url="https://www.nbcnews.com/business", bypass_cache=True)
- end_time = time.time()
- cprint(
- f"[LOG] 📦 [bold yellow]Second crawl took {end_time - start_time} seconds and result (forced to crawl):[/bold yellow]"
- )
- print_result(result)
-
-
-def add_chunking_strategy(crawler):
- # Adding a chunking strategy: RegexChunking
- cprint(
- "\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]",
- True,
- )
- cprint(
- "RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!"
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- chunking_strategy=RegexChunking(patterns=["\n\n"]),
- )
- cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]")
- print_result(result)
-
- # Adding another chunking strategy: NlpSentenceChunking
- cprint(
- "\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]",
- True,
- )
- cprint(
- "NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!"
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business", chunking_strategy=NlpSentenceChunking()
- )
- cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
- print_result(result)
-
-
-def add_extraction_strategy(crawler):
- # Adding an extraction strategy: CosineStrategy
- cprint(
- "\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]",
- True,
- )
- cprint(
- "CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!"
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- extraction_strategy=CosineStrategy(
- word_count_threshold=10,
- max_dist=0.2,
- linkage_method="ward",
- top_k=3,
- sim_threshold=0.3,
- verbose=True,
- ),
- )
- cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
- print_result(result)
-
- # Using semantic_filter with CosineStrategy
- cprint(
- "You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!"
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- extraction_strategy=CosineStrategy(
- semantic_filter="inflation rent prices",
- ),
- )
- cprint(
- "[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]"
- )
- print_result(result)
-
-
-def add_llm_extraction_strategy(crawler):
- # Adding an LLM extraction strategy without instructions
- cprint(
- "\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]",
- True,
- )
- cprint(
- "LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!"
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- extraction_strategy=LLMExtractionStrategy(
- llm_config = LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
- ),
- )
- cprint(
- "[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]"
- )
- print_result(result)
-
- # Adding an LLM extraction strategy with instructions
- cprint(
- "\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]",
- True,
- )
- cprint(
- "Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!"
- )
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- extraction_strategy=LLMExtractionStrategy(
- llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
- instruction="I am interested in only financial news",
- ),
- )
- cprint(
- "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]"
- )
- print_result(result)
-
- result = crawler.run(
- url="https://www.nbcnews.com/business",
- extraction_strategy=LLMExtractionStrategy(
- llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
- instruction="Extract only content related to technology",
- ),
- )
- cprint(
- "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]"
- )
- print_result(result)
-
-
-def targeted_extraction(crawler):
- # Using a CSS selector to extract only H2 tags
- cprint(
- "\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]",
- True,
- )
- result = crawler.run(url="https://www.nbcnews.com/business", css_selector="h2")
- cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
- print_result(result)
-
-
-def interactive_extraction(crawler):
- # Passing JavaScript code to interact with the page
- cprint(
- "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]",
- True,
- )
- cprint(
- "In this example we try to click the 'Load More' button on the page using JavaScript code."
- )
- js_code = """
- const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
- loadMoreButton && loadMoreButton.click();
- """
- # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
- # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
- result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
- cprint(
- "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
- )
- print_result(result)
-
-
-def multiple_script(crawler):
- # Passing JavaScript code to interact with the page
- cprint(
- "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]",
- True,
- )
- cprint(
- "In this example we try to click the 'Load More' button on the page using JavaScript code."
- )
- js_code = [
- """
- const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));
- loadMoreButton && loadMoreButton.click();
- """
- ] * 2
- # crawler_strategy = LocalSeleniumCrawlerStrategy(js_code=js_code)
- # crawler = WebCrawler(crawler_strategy=crawler_strategy, always_by_pass_cache=True)
- result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
- cprint(
- "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
- )
- print_result(result)
-
-
-def using_crawler_hooks(crawler):
- # Example usage of the hooks for authentication and setting a cookie
- def on_driver_created(driver):
- print("[HOOK] on_driver_created")
- # Example customization: maximize the window
- driver.maximize_window()
-
- # Example customization: logging in to a hypothetical website
- driver.get("https://example.com/login")
-
- from selenium.webdriver.support.ui import WebDriverWait
- from selenium.webdriver.common.by import By
- from selenium.webdriver.support import expected_conditions as EC
-
- WebDriverWait(driver, 10).until(
- EC.presence_of_element_located((By.NAME, "username"))
- )
- driver.find_element(By.NAME, "username").send_keys("testuser")
- driver.find_element(By.NAME, "password").send_keys("password123")
- driver.find_element(By.NAME, "login").click()
- WebDriverWait(driver, 10).until(
- EC.presence_of_element_located((By.ID, "welcome"))
- )
- # Add a custom cookie
- driver.add_cookie({"name": "test_cookie", "value": "cookie_value"})
- return driver
-
- def before_get_url(driver):
- print("[HOOK] before_get_url")
- # Example customization: add a custom header
- # Enable Network domain for sending headers
- driver.execute_cdp_cmd("Network.enable", {})
- # Add a custom header
- driver.execute_cdp_cmd(
- "Network.setExtraHTTPHeaders", {"headers": {"X-Test-Header": "test"}}
- )
- return driver
-
- def after_get_url(driver):
- print("[HOOK] after_get_url")
- # Example customization: log the URL
- print(driver.current_url)
- return driver
-
- def before_return_html(driver, html):
- print("[HOOK] before_return_html")
- # Example customization: log the HTML
- print(len(html))
- return driver
-
- cprint(
- "\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]",
- True,
- )
-
- crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
- crawler_strategy.set_hook("on_driver_created", on_driver_created)
- crawler_strategy.set_hook("before_get_url", before_get_url)
- crawler_strategy.set_hook("after_get_url", after_get_url)
- crawler_strategy.set_hook("before_return_html", before_return_html)
-
- crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
- crawler.warmup()
- result = crawler.run(url="https://example.com")
-
- cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
- print_result(result=result)
-
-
-def using_crawler_hooks_delay_example(crawler):
- def delay(driver):
- print("Delaying for 5 seconds...")
- time.sleep(5)
- print("Resuming...")
-
- def create_crawler():
- crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
- crawler_strategy.set_hook("after_get_url", delay)
- crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
- crawler.warmup()
- return crawler
-
- cprint(
- "\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]"
- )
- crawler = create_crawler()
- result = crawler.run(url="https://google.com", bypass_cache=True)
-
- cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
- print_result(result)
-
-
-def main():
- cprint(
- "🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]"
- )
- cprint(
- "⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]"
- )
- cprint(
- "If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files."
- )
-
- crawler = create_crawler()
-
- crawler.always_by_pass_cache = True
- basic_usage(crawler)
- # basic_usage_some_params(crawler)
- understanding_parameters(crawler)
-
- crawler.always_by_pass_cache = True
- screenshot_usage(crawler)
- add_chunking_strategy(crawler)
- add_extraction_strategy(crawler)
- add_llm_extraction_strategy(crawler)
- targeted_extraction(crawler)
- interactive_extraction(crawler)
- multiple_script(crawler)
-
- cprint(
- "\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]"
- )
-
-
-if __name__ == "__main__":
- main()
diff --git a/docs/examples/quickstart_v0.ipynb b/docs/examples/quickstart_v0.ipynb
deleted file mode 100644
index 0282aa12..00000000
--- a/docs/examples/quickstart_v0.ipynb
+++ /dev/null
@@ -1,735 +0,0 @@
-{
- "cells": [
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "6yLvrXn7yZQI"
- },
- "source": [
- "# Crawl4AI: Advanced Web Crawling and Data Extraction\n",
- "\n",
- "Welcome to this interactive notebook showcasing Crawl4AI, an advanced asynchronous web crawling and data extraction library.\n",
- "\n",
- "- GitHub Repository: [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)\n",
- "- Twitter: [@unclecode](https://twitter.com/unclecode)\n",
- "- Website: [https://crawl4ai.com](https://crawl4ai.com)\n",
- "\n",
- "Let's explore the powerful features of Crawl4AI!"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "KIn_9nxFyZQK"
- },
- "source": [
- "## Installation\n",
- "\n",
- "First, let's install Crawl4AI from GitHub:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "mSnaxLf3zMog"
- },
- "outputs": [],
- "source": [
- "!sudo apt-get update && sudo apt-get install -y libwoff1 libopus0 libwebp6 libwebpdemux2 libenchant1c2a libgudev-1.0-0 libsecret-1-0 libhyphen0 libgdk-pixbuf2.0-0 libegl1 libnotify4 libxslt1.1 libevent-2.1-7 libgles2 libvpx6 libxcomposite1 libatk1.0-0 libatk-bridge2.0-0 libepoxy0 libgtk-3-0 libharfbuzz-icu0"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "xlXqaRtayZQK"
- },
- "outputs": [],
- "source": [
- "!pip install crawl4ai\n",
- "!pip install nest-asyncio\n",
- "!playwright install"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "qKCE7TI7yZQL"
- },
- "source": [
- "Now, let's import the necessary libraries:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 1,
- "metadata": {
- "id": "I67tr7aAyZQL"
- },
- "outputs": [],
- "source": [
- "import asyncio\n",
- "import nest_asyncio\n",
- "from crawl4ai import AsyncWebCrawler\n",
- "from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy\n",
- "import json\n",
- "import time\n",
- "from pydantic import BaseModel, Field\n",
- "\n",
- "nest_asyncio.apply()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "h7yR_Rt_yZQM"
- },
- "source": [
- "## Basic Usage\n",
- "\n",
- "Let's start with a simple crawl example:"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 2,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "yBh6hf4WyZQM",
- "outputId": "0f83af5c-abba-4175-ed95-70b7512e6bcc"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
- "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
- "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.05 seconds\n",
- "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.05 seconds.\n",
- "18102\n"
- ]
- }
- ],
- "source": [
- "async def simple_crawl():\n",
- " async with AsyncWebCrawler(verbose=True) as crawler:\n",
- " result = await crawler.arun(url=\"https://www.nbcnews.com/business\")\n",
- " print(len(result.markdown))\n",
- "await simple_crawl()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "9rtkgHI28uI4"
- },
- "source": [
- "💡 By default, **Crawl4AI** caches the result of every URL, so the next time you call it, you’ll get an instant result. But if you want to bypass the cache, just set `bypass_cache=True`."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "MzZ0zlJ9yZQM"
- },
- "source": [
- "## Advanced Features\n",
- "\n",
- "### Executing JavaScript and Using CSS Selectors"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 3,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "gHStF86xyZQM",
- "outputId": "34d0fb6d-4dec-4677-f76e-85a1f082829b"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
- "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
- "[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n",
- "[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n",
- "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 6.06 seconds\n",
- "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.10 seconds\n",
- "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n",
- "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.11 seconds.\n",
- "41135\n"
- ]
- }
- ],
- "source": [
- "async def js_and_css():\n",
- " async with AsyncWebCrawler(verbose=True) as crawler:\n",
- " js_code = [\"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"]\n",
- " result = await crawler.arun(\n",
- " url=\"https://www.nbcnews.com/business\",\n",
- " js_code=js_code,\n",
- " # css_selector=\"YOUR_CSS_SELECTOR_HERE\",\n",
- " bypass_cache=True\n",
- " )\n",
- " print(len(result.markdown))\n",
- "\n",
- "await js_and_css()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "cqE_W4coyZQM"
- },
- "source": [
- "### Using a Proxy\n",
- "\n",
- "Note: You'll need to replace the proxy URL with a working proxy for this example to run successfully."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "QjAyiAGqyZQM"
- },
- "outputs": [],
- "source": [
- "async def use_proxy():\n",
- " async with AsyncWebCrawler(verbose=True, proxy=\"http://your-proxy-url:port\") as crawler:\n",
- " result = await crawler.arun(\n",
- " url=\"https://www.nbcnews.com/business\",\n",
- " bypass_cache=True\n",
- " )\n",
- " print(result.markdown[:500]) # Print first 500 characters\n",
- "\n",
- "# Uncomment the following line to run the proxy example\n",
- "# await use_proxy()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "XTZ88lbayZQN"
- },
- "source": [
- "### Extracting Structured Data with OpenAI\n",
- "\n",
- "Note: You'll need to set your OpenAI API key as an environment variable for this example to work."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 14,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "fIOlDayYyZQN",
- "outputId": "cb8359cc-dee0-4762-9698-5dfdcee055b8"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
- "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
- "[LOG] 🕸️ Crawling https://openai.com/api/pricing/ using AsyncPlaywrightCrawlerStrategy...\n",
- "[LOG] ✅ Crawled https://openai.com/api/pricing/ successfully!\n",
- "[LOG] 🚀 Crawling done for https://openai.com/api/pricing/, success: True, time taken: 3.77 seconds\n",
- "[LOG] 🚀 Content extracted for https://openai.com/api/pricing/, success: True, time taken: 0.21 seconds\n",
- "[LOG] 🔥 Extracting semantic blocks for https://openai.com/api/pricing/, Strategy: AsyncWebCrawler\n",
- "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 0\n",
- "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 1\n",
- "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 2\n",
- "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 3\n",
- "[LOG] Extracted 4 blocks from URL: https://openai.com/api/pricing/ block index: 3\n",
- "[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 4\n",
- "[LOG] Extracted 5 blocks from URL: https://openai.com/api/pricing/ block index: 0\n",
- "[LOG] Extracted 1 blocks from URL: https://openai.com/api/pricing/ block index: 4\n",
- "[LOG] Extracted 8 blocks from URL: https://openai.com/api/pricing/ block index: 1\n",
- "[LOG] Extracted 12 blocks from URL: https://openai.com/api/pricing/ block index: 2\n",
- "[LOG] 🚀 Extraction done for https://openai.com/api/pricing/, time taken: 8.55 seconds.\n",
- "5029\n"
- ]
- }
- ],
- "source": [
- "import os\n",
- "from google.colab import userdata\n",
- "os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n",
- "\n",
- "class OpenAIModelFee(BaseModel):\n",
- " model_name: str = Field(..., description=\"Name of the OpenAI model.\")\n",
- " input_fee: str = Field(..., description=\"Fee for input token for the OpenAI model.\")\n",
- " output_fee: str = Field(..., description=\"Fee for output token for the OpenAI model.\")\n",
- "\n",
- "async def extract_openai_fees():\n",
- " async with AsyncWebCrawler(verbose=True) as crawler:\n",
- " result = await crawler.arun(\n",
- " url='https://openai.com/api/pricing/',\n",
- " word_count_threshold=1,\n",
- " extraction_strategy=LLMExtractionStrategy(\n",
- " provider=\"openai/gpt-4o\", api_token=os.getenv('OPENAI_API_KEY'),\n",
- " schema=OpenAIModelFee.schema(),\n",
- " extraction_type=\"schema\",\n",
- " instruction=\"\"\"From the crawled content, extract all mentioned model names along with their fees for input and output tokens.\n",
- " Do not miss any models in the entire content. One extracted model JSON format should look like this:\n",
- " {\"model_name\": \"GPT-4\", \"input_fee\": \"US$10.00 / 1M tokens\", \"output_fee\": \"US$30.00 / 1M tokens\"}.\"\"\"\n",
- " ),\n",
- " bypass_cache=True,\n",
- " )\n",
- " print(len(result.extracted_content))\n",
- "\n",
- "# Uncomment the following line to run the OpenAI extraction example\n",
- "await extract_openai_fees()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "BypA5YxEyZQN"
- },
- "source": [
- "### Advanced Multi-Page Crawling with JavaScript Execution"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "tfkcVQ0b7mw-"
- },
- "source": [
- "## Advanced Multi-Page Crawling with JavaScript Execution\n",
- "\n",
- "This example demonstrates Crawl4AI's ability to handle complex crawling scenarios, specifically extracting commits from multiple pages of a GitHub repository. The challenge here is that clicking the \"Next\" button doesn't load a new page, but instead uses asynchronous JavaScript to update the content. This is a common hurdle in modern web crawling.\n",
- "\n",
- "To overcome this, we use Crawl4AI's custom JavaScript execution to simulate clicking the \"Next\" button, and implement a custom hook to detect when new data has loaded. Our strategy involves comparing the first commit's text before and after \"clicking\" Next, waiting until it changes to confirm new data has rendered. This showcases Crawl4AI's flexibility in handling dynamic content and its ability to implement custom logic for even the most challenging crawling tasks."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 11,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "qUBKGpn3yZQN",
- "outputId": "3e555b6a-ed33-42f4-cce9-499a923fbe17"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
- "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
- "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n",
- "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n",
- "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 5.16 seconds\n",
- "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.28 seconds\n",
- "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n",
- "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.28 seconds.\n",
- "Page 1: Found 35 commits\n",
- "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n",
- "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n",
- "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.78 seconds\n",
- "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.90 seconds\n",
- "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n",
- "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.90 seconds.\n",
- "Page 2: Found 35 commits\n",
- "[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n",
- "[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n",
- "[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 2.00 seconds\n",
- "[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.74 seconds\n",
- "[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n",
- "[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.75 seconds.\n",
- "Page 3: Found 35 commits\n",
- "Successfully crawled 105 commits across 3 pages\n"
- ]
- }
- ],
- "source": [
- "import re\n",
- "from bs4 import BeautifulSoup\n",
- "\n",
- "async def crawl_typescript_commits():\n",
- " first_commit = \"\"\n",
- " async def on_execution_started(page):\n",
- " nonlocal first_commit\n",
- " try:\n",
- " while True:\n",
- " await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')\n",
- " commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')\n",
- " commit = await commit.evaluate('(element) => element.textContent')\n",
- " commit = re.sub(r'\\s+', '', commit)\n",
- " if commit and commit != first_commit:\n",
- " first_commit = commit\n",
- " break\n",
- " await asyncio.sleep(0.5)\n",
- " except Exception as e:\n",
- " print(f\"Warning: New content didn't appear after JavaScript execution: {e}\")\n",
- "\n",
- " async with AsyncWebCrawler(verbose=True) as crawler:\n",
- " crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)\n",
- "\n",
- " url = \"https://github.com/microsoft/TypeScript/commits/main\"\n",
- " session_id = \"typescript_commits_session\"\n",
- " all_commits = []\n",
- "\n",
- " js_next_page = \"\"\"\n",
- " const button = document.querySelector('a[data-testid=\"pagination-next-button\"]');\n",
- " if (button) button.click();\n",
- " \"\"\"\n",
- "\n",
- " for page in range(3): # Crawl 3 pages\n",
- " result = await crawler.arun(\n",
- " url=url,\n",
- " session_id=session_id,\n",
- " css_selector=\"li.Box-sc-g0xbh4-0\",\n",
- " js=js_next_page if page > 0 else None,\n",
- " bypass_cache=True,\n",
- " js_only=page > 0\n",
- " )\n",
- "\n",
- " assert result.success, f\"Failed to crawl page {page + 1}\"\n",
- "\n",
- " soup = BeautifulSoup(result.cleaned_html, 'html.parser')\n",
- " commits = soup.select(\"li\")\n",
- " all_commits.extend(commits)\n",
- "\n",
- " print(f\"Page {page + 1}: Found {len(commits)} commits\")\n",
- "\n",
- " await crawler.crawler_strategy.kill_session(session_id)\n",
- " print(f\"Successfully crawled {len(all_commits)} commits across 3 pages\")\n",
- "\n",
- "await crawl_typescript_commits()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "EJRnYsp6yZQN"
- },
- "source": [
- "### Using JsonCssExtractionStrategy for Fast Structured Output"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "1ZMqIzB_8SYp"
- },
- "source": [
- "The JsonCssExtractionStrategy is a powerful feature of Crawl4AI that allows for precise, structured data extraction from web pages. Here's how it works:\n",
- "\n",
- "1. You define a schema that describes the pattern of data you're interested in extracting.\n",
- "2. The schema includes a base selector that identifies repeating elements on the page.\n",
- "3. Within the schema, you define fields, each with its own selector and type.\n",
- "4. These field selectors are applied within the context of each base selector element.\n",
- "5. The strategy supports nested structures, lists within lists, and various data types.\n",
- "6. You can even include computed fields for more complex data manipulation.\n",
- "\n",
- "This approach allows for highly flexible and precise data extraction, transforming semi-structured web content into clean, structured JSON data. It's particularly useful for extracting consistent data patterns from pages like product listings, news articles, or search results.\n",
- "\n",
- "For more details and advanced usage, check out the full documentation on the Crawl4AI website."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 12,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "trCMR2T9yZQN",
- "outputId": "718d36f4-cccf-40f4-8d8c-c3ba73524d16"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
- "[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
- "[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n",
- "[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n",
- "[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 7.00 seconds\n",
- "[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.32 seconds\n",
- "[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n",
- "[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.48 seconds.\n",
- "Successfully extracted 11 news teasers\n",
- "{\n",
- " \"category\": \"Business News\",\n",
- " \"headline\": \"NBC ripped up its Olympics playbook for 2024 \\u2014 so far, the new strategy paid off\",\n",
- " \"summary\": \"The Olympics have long been key to NBCUniversal. Paris marked the 18th Olympic Games broadcast by NBC in the U.S.\",\n",
- " \"time\": \"13h ago\",\n",
- " \"image\": {\n",
- " \"src\": \"https://media-cldnry.s-nbcnews.com/image/upload/t_focal-200x100,f_auto,q_auto:best/rockcms/2024-09/240903-nbc-olympics-ch-1344-c7a486.jpg\",\n",
- " \"alt\": \"Mike Tirico.\"\n",
- " },\n",
- " \"link\": \"https://www.nbcnews.com/business\"\n",
- "}\n"
- ]
- }
- ],
- "source": [
- "async def extract_news_teasers():\n",
- " schema = {\n",
- " \"name\": \"News Teaser Extractor\",\n",
- " \"baseSelector\": \".wide-tease-item__wrapper\",\n",
- " \"fields\": [\n",
- " {\n",
- " \"name\": \"category\",\n",
- " \"selector\": \".unibrow span[data-testid='unibrow-text']\",\n",
- " \"type\": \"text\",\n",
- " },\n",
- " {\n",
- " \"name\": \"headline\",\n",
- " \"selector\": \".wide-tease-item__headline\",\n",
- " \"type\": \"text\",\n",
- " },\n",
- " {\n",
- " \"name\": \"summary\",\n",
- " \"selector\": \".wide-tease-item__description\",\n",
- " \"type\": \"text\",\n",
- " },\n",
- " {\n",
- " \"name\": \"time\",\n",
- " \"selector\": \"[data-testid='wide-tease-date']\",\n",
- " \"type\": \"text\",\n",
- " },\n",
- " {\n",
- " \"name\": \"image\",\n",
- " \"type\": \"nested\",\n",
- " \"selector\": \"picture.teasePicture img\",\n",
- " \"fields\": [\n",
- " {\"name\": \"src\", \"type\": \"attribute\", \"attribute\": \"src\"},\n",
- " {\"name\": \"alt\", \"type\": \"attribute\", \"attribute\": \"alt\"},\n",
- " ],\n",
- " },\n",
- " {\n",
- " \"name\": \"link\",\n",
- " \"selector\": \"a[href]\",\n",
- " \"type\": \"attribute\",\n",
- " \"attribute\": \"href\",\n",
- " },\n",
- " ],\n",
- " }\n",
- "\n",
- " extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)\n",
- "\n",
- " async with AsyncWebCrawler(verbose=True) as crawler:\n",
- " result = await crawler.arun(\n",
- " url=\"https://www.nbcnews.com/business\",\n",
- " extraction_strategy=extraction_strategy,\n",
- " bypass_cache=True,\n",
- " )\n",
- "\n",
- " assert result.success, \"Failed to crawl the page\"\n",
- "\n",
- " news_teasers = json.loads(result.extracted_content)\n",
- " print(f\"Successfully extracted {len(news_teasers)} news teasers\")\n",
- " print(json.dumps(news_teasers[0], indent=2))\n",
- "\n",
- "await extract_news_teasers()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "FnyVhJaByZQN"
- },
- "source": [
- "## Speed Comparison\n",
- "\n",
- "Let's compare the speed of Crawl4AI with Firecrawl, a paid service. Note that we can't run Firecrawl in this Colab environment, so we'll simulate its performance based on previously recorded data."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "agDD186f3wig"
- },
- "source": [
- "💡 **Note on Speed Comparison:**\n",
- "\n",
- "The speed test conducted here is running on Google Colab, where the internet speed and performance can vary and may not reflect optimal conditions. When we call Firecrawl's API, we're seeing its best performance, while Crawl4AI's performance is limited by Colab's network speed.\n",
- "\n",
- "For a more accurate comparison, it's recommended to run these tests on your own servers or computers with a stable and fast internet connection. Despite these limitations, Crawl4AI still demonstrates faster performance in this environment.\n",
- "\n",
- "If you run these tests locally, you may observe an even more significant speed advantage for Crawl4AI compared to other services."
- ]
- },
- {
- "cell_type": "code",
- "execution_count": null,
- "metadata": {
- "id": "F7KwHv8G1LbY"
- },
- "outputs": [],
- "source": [
- "!pip install firecrawl"
- ]
- },
- {
- "cell_type": "code",
- "execution_count": 4,
- "metadata": {
- "colab": {
- "base_uri": "https://localhost:8080/"
- },
- "id": "91813zILyZQN",
- "outputId": "663223db-ab89-4976-b233-05ceca62b19b"
- },
- "outputs": [
- {
- "name": "stdout",
- "output_type": "stream",
- "text": [
- "Firecrawl (simulated):\n",
- "Time taken: 4.38 seconds\n",
- "Content length: 41967 characters\n",
- "Images found: 49\n",
- "\n",
- "Crawl4AI (simple crawl):\n",
- "Time taken: 4.22 seconds\n",
- "Content length: 18221 characters\n",
- "Images found: 49\n",
- "\n",
- "Crawl4AI (with JavaScript execution):\n",
- "Time taken: 9.13 seconds\n",
- "Content length: 34243 characters\n",
- "Images found: 89\n"
- ]
- }
- ],
- "source": [
- "import os\n",
- "from google.colab import userdata\n",
- "os.environ['FIRECRAWL_API_KEY'] = userdata.get('FIRECRAWL_API_KEY')\n",
- "import time\n",
- "from firecrawl import FirecrawlApp\n",
- "\n",
- "async def speed_comparison():\n",
- " # Simulated Firecrawl performance\n",
- " app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])\n",
- " start = time.time()\n",
- " scrape_status = app.scrape_url(\n",
- " 'https://www.nbcnews.com/business',\n",
- " params={'formats': ['markdown', 'html']}\n",
- " )\n",
- " end = time.time()\n",
- " print(\"Firecrawl (simulated):\")\n",
- " print(f\"Time taken: {end - start:.2f} seconds\")\n",
- " print(f\"Content length: {len(scrape_status['markdown'])} characters\")\n",
- " print(f\"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}\")\n",
- " print()\n",
- "\n",
- " async with AsyncWebCrawler() as crawler:\n",
- " # Crawl4AI simple crawl\n",
- " start = time.time()\n",
- " result = await crawler.arun(\n",
- " url=\"https://www.nbcnews.com/business\",\n",
- " word_count_threshold=0,\n",
- " bypass_cache=True,\n",
- " verbose=False\n",
- " )\n",
- " end = time.time()\n",
- " print(\"Crawl4AI (simple crawl):\")\n",
- " print(f\"Time taken: {end - start:.2f} seconds\")\n",
- " print(f\"Content length: {len(result.markdown)} characters\")\n",
- " print(f\"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}\")\n",
- " print()\n",
- "\n",
- " # Crawl4AI with JavaScript execution\n",
- " start = time.time()\n",
- " result = await crawler.arun(\n",
- " url=\"https://www.nbcnews.com/business\",\n",
- " js_code=[\"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"],\n",
- " word_count_threshold=0,\n",
- " bypass_cache=True,\n",
- " verbose=False\n",
- " )\n",
- " end = time.time()\n",
- " print(\"Crawl4AI (with JavaScript execution):\")\n",
- " print(f\"Time taken: {end - start:.2f} seconds\")\n",
- " print(f\"Content length: {len(result.markdown)} characters\")\n",
- " print(f\"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}\")\n",
- "\n",
- "await speed_comparison()"
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "OBFFYVJIyZQN"
- },
- "source": [
- "If you run on a local machine with a proper internet speed:\n",
- "- Simple crawl: Crawl4AI is typically over 3-4 times faster than Firecrawl.\n",
- "- With JavaScript execution: Even when executing JavaScript to load more content (potentially doubling the number of images found), Crawl4AI is still faster than Firecrawl's simple crawl.\n",
- "\n",
- "Please note that actual performance may vary depending on network conditions and the specific content being crawled."
- ]
- },
- {
- "cell_type": "markdown",
- "metadata": {
- "id": "A6_1RK1_yZQO"
- },
- "source": [
- "## Conclusion\n",
- "\n",
- "In this notebook, we've explored the powerful features of Crawl4AI, including:\n",
- "\n",
- "1. Basic crawling\n",
- "2. JavaScript execution and CSS selector usage\n",
- "3. Proxy support\n",
- "4. Structured data extraction with OpenAI\n",
- "5. Advanced multi-page crawling with JavaScript execution\n",
- "6. Fast structured output using JsonCssExtractionStrategy\n",
- "7. Speed comparison with other services\n",
- "\n",
- "Crawl4AI offers a fast, flexible, and powerful solution for web crawling and data extraction tasks. Its asynchronous architecture and advanced features make it suitable for a wide range of applications, from simple web scraping to complex, multi-page data extraction scenarios.\n",
- "\n",
- "For more information and advanced usage, please visit the [Crawl4AI documentation](https://docs.crawl4ai.com/).\n",
- "\n",
- "Happy crawling!"
- ]
- }
- ],
- "metadata": {
- "colab": {
- "provenance": []
- },
- "kernelspec": {
- "display_name": "venv",
- "language": "python",
- "name": "python3"
- },
- "language_info": {
- "codemirror_mode": {
- "name": "ipython",
- "version": 3
- },
- "file_extension": ".py",
- "mimetype": "text/x-python",
- "name": "python",
- "nbconvert_exporter": "python",
- "pygments_lexer": "ipython3",
- "version": "3.10.13"
- }
- },
- "nbformat": 4,
- "nbformat_minor": 0
-}
diff --git a/docs/examples/tutorial_v0.5.py b/docs/examples/tutorial_v0.5.py
index 3cbbdb7b..fe8e0a2b 100644
--- a/docs/examples/tutorial_v0.5.py
+++ b/docs/examples/tutorial_v0.5.py
@@ -13,7 +13,7 @@ from crawl4ai.deep_crawling import (
)
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
-from crawl4ai.proxy_strategy import ProxyConfig
+from crawl4ai import ProxyConfig
from crawl4ai import RoundRobinProxyStrategy
from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai import DefaultMarkdownGenerator
diff --git a/docs/examples/use_geo_location.py b/docs/examples/use_geo_location.py
new file mode 100644
index 00000000..2cfc866f
--- /dev/null
+++ b/docs/examples/use_geo_location.py
@@ -0,0 +1,70 @@
+# use_geo_location.py
+"""
+Example: override locale, timezone, and geolocation using Crawl4ai patterns.
+
+This demo uses `AsyncWebCrawler.arun()` to fetch a page with a
+browser context primed with a specific locale, timezone, and GPS
+position, then saves a screenshot for visual verification.
+"""
+
+import asyncio
+import base64
+from pathlib import Path
+from typing import List
+from crawl4ai import (
+ AsyncWebCrawler,
+ CrawlerRunConfig,
+ BrowserConfig,
+ GeolocationConfig,
+ CrawlResult,
+)
+
+async def demo_geo_override():
+ """Demo: Crawl a geolocation-test page with overrides and screenshot."""
+ print("\n=== Geo-Override Crawl ===")
+
+ # 1) Browser setup: use Playwright-managed contexts
+ browser_cfg = BrowserConfig(
+ headless=False,
+ viewport_width=1280,
+ viewport_height=720,
+ use_managed_browser=False,
+ )
+
+ # 2) Run config: include locale, timezone_id, geolocation, and screenshot
+ run_cfg = CrawlerRunConfig(
+ url="https://browserleaks.com/geo", # test page that shows your location
+ locale="en-US", # Accept-Language & UI locale
+ timezone_id="America/Los_Angeles", # JS Date()/Intl timezone
+ geolocation=GeolocationConfig( # override GPS coords
+ latitude=34.0522,
+ longitude=-118.2437,
+ accuracy=10.0,
+ ),
+ screenshot=True, # capture screenshot after load
+ session_id="geo_test", # reuse context if rerunning
+ delay_before_return_html=5
+ )
+
+ async with AsyncWebCrawler(config=browser_cfg) as crawler:
+ # 3) Run crawl (returns list even for single URL)
+ results: List[CrawlResult] = await crawler.arun(
+ url=run_cfg.url,
+ config=run_cfg,
+ )
+ result = results[0]
+
+ # 4) Save screenshot and report path
+ if result.screenshot:
+        out_dir = Path(__file__).parent / "tmp"
+        out_dir.mkdir(exist_ok=True)
+ shot_path = out_dir / "geo_test.png"
+ with open(shot_path, "wb") as f:
+ f.write(base64.b64decode(result.screenshot))
+ print(f"Saved screenshot to {shot_path}")
+ else:
+ print("No screenshot captured, check configuration.")
+
+if __name__ == "__main__":
+ asyncio.run(demo_geo_override())
diff --git a/docs/md_v2/advanced/identity-based-crawling.md b/docs/md_v2/advanced/identity-based-crawling.md
index 403acb9a..3864f840 100644
--- a/docs/md_v2/advanced/identity-based-crawling.md
+++ b/docs/md_v2/advanced/identity-based-crawling.md
@@ -263,7 +263,102 @@ See the full example in `docs/examples/identity_based_browsing.py` for a complet
---
-## 7. Summary
+## 7. Locale, Timezone, and Geolocation Control
+
+In addition to using persistent profiles, Crawl4AI supports customizing your browser's locale, timezone, and geolocation settings. These features enhance your identity-based browsing experience by allowing you to control how websites perceive your location and regional settings.
+
+### Setting Locale and Timezone
+
+You can set the browser's locale and timezone through `CrawlerRunConfig`:
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://example.com",
+ config=CrawlerRunConfig(
+ # Set browser locale (language and region formatting)
+ locale="fr-FR", # French (France)
+
+ # Set browser timezone
+ timezone_id="Europe/Paris",
+
+ # Other normal options...
+ magic=True,
+ page_timeout=60000
+ )
+ )
+```
+
+**How it works:**
+- `locale` affects language preferences, date formats, number formats, etc.
+- `timezone_id` affects JavaScript's Date object and time-related functionality
+- These settings are applied when creating the browser context and maintained throughout the session
+
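+You can sanity-check these overrides from inside the crawled page. The snippet below is a minimal sketch, not part of the library's test suite: the URL is a placeholder, and rewriting `document.body` via `js_code` is just a convenient way to surface what the page's JavaScript sees.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def check_locale_timezone():
+    config = CrawlerRunConfig(
+        locale="fr-FR",
+        timezone_id="Europe/Paris",
+        # Overwrite the page body with the values the page's JavaScript observes
+        js_code=[
+            "document.body.innerText = "
+            "Intl.DateTimeFormat().resolvedOptions().timeZone + ' | ' + navigator.language;"
+        ],
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://example.com", config=config)
+        print(result.markdown)  # expect something like "Europe/Paris | fr-FR"
+
+asyncio.run(check_locale_timezone())
+```
+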
+### Configuring Geolocation
+
+Control the GPS coordinates reported by the browser's geolocation API:
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, GeolocationConfig
+
+async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://maps.google.com", # Or any location-aware site
+ config=CrawlerRunConfig(
+ # Configure precise GPS coordinates
+ geolocation=GeolocationConfig(
+ latitude=48.8566, # Paris coordinates
+ longitude=2.3522,
+ accuracy=100 # Accuracy in meters (optional)
+ ),
+
+ # This site will see you as being in Paris
+ page_timeout=60000
+ )
+ )
+```
+
+**Important notes:**
+- When `geolocation` is specified, the browser is automatically granted permission to access location
+- Websites using the Geolocation API will receive the exact coordinates you specify
+- This affects map services, store locators, delivery services, etc.
+- Combined with the appropriate `locale` and `timezone_id`, you can create a fully consistent location profile
+
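+To verify the coordinates a site actually receives, you can query the Geolocation API from the page itself. This is a sketch under stated assumptions: `getCurrentPosition` is callback-based, so the result is written into the page body and `delay_before_return_html` gives the callback time to fire; the URL is a placeholder.
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, GeolocationConfig
+
+async def check_geolocation():
+    config = CrawlerRunConfig(
+        geolocation=GeolocationConfig(latitude=48.8566, longitude=2.3522, accuracy=100),
+        # Write the reported coordinates into the page body (permission is
+        # auto-granted when `geolocation` is set, so no prompt appears)
+        js_code=[
+            "navigator.geolocation.getCurrentPosition("
+            "p => document.body.innerText = p.coords.latitude + ',' + p.coords.longitude);"
+        ],
+        delay_before_return_html=2,  # let the async callback complete
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://example.com", config=config)
+        print(result.markdown)  # expect "48.8566,2.3522"
+
+asyncio.run(check_geolocation())
+```
+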
+### Combining with Managed Browsers
+
+These settings work perfectly with managed browsers for a complete identity solution:
+
+```python
+from crawl4ai import (
+ AsyncWebCrawler, BrowserConfig, CrawlerRunConfig,
+ GeolocationConfig
+)
+
+browser_config = BrowserConfig(
+ use_managed_browser=True,
+ user_data_dir="/path/to/my-profile",
+ browser_type="chromium"
+)
+
+crawl_config = CrawlerRunConfig(
+ # Location settings
+ locale="es-MX", # Spanish (Mexico)
+ timezone_id="America/Mexico_City",
+ geolocation=GeolocationConfig(
+ latitude=19.4326, # Mexico City
+ longitude=-99.1332
+ )
+)
+
+async with AsyncWebCrawler(config=browser_config) as crawler:
+ result = await crawler.arun(url="https://example.com", config=crawl_config)
+```
+
+Combining persistent profiles with precise geolocation and region settings gives you complete control over your digital identity.
+
+## 8. Summary
- **Create** your user-data directory either:
- By launching Chrome/Chromium externally with `--user-data-dir=/some/path`
@@ -271,6 +366,7 @@ See the full example in `docs/examples/identity_based_browsing.py` for a complet
- Or through the interactive interface with `profiler.interactive_manager()`
- **Log in** or configure sites as needed, then close the browser
- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`
+- **Customize** identity aspects with `locale`, `timezone_id`, and `geolocation`
- **List and reuse** profiles with `BrowserProfiler.list_profiles()`
- **Manage** your profiles with the dedicated `BrowserProfiler` class
- Enjoy **persistent** sessions that reflect your real identity
diff --git a/docs/md_v2/advanced/network-console-capture.md b/docs/md_v2/advanced/network-console-capture.md
new file mode 100644
index 00000000..4305a25f
--- /dev/null
+++ b/docs/md_v2/advanced/network-console-capture.md
@@ -0,0 +1,205 @@
+# Network Requests & Console Message Capturing
+
+Crawl4AI can capture all network requests and browser console messages during a crawl, which is invaluable for debugging, security analysis, or understanding page behavior.
+
+## Configuration
+
+To enable network and console capturing, use these configuration options:
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+# Enable both network request capture and console message capture
+config = CrawlerRunConfig(
+ capture_network_requests=True, # Capture all network requests and responses
+ capture_console_messages=True # Capture all browser console output
+)
+```
+
+## Example Usage
+
+```python
+import asyncio
+import json
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+ # Enable both network request capture and console message capture
+ config = CrawlerRunConfig(
+ capture_network_requests=True,
+ capture_console_messages=True
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun(
+ url="https://example.com",
+ config=config
+ )
+
+ if result.success:
+ # Analyze network requests
+ if result.network_requests:
+ print(f"Captured {len(result.network_requests)} network events")
+
+ # Count request types
+ request_count = len([r for r in result.network_requests if r.get("event_type") == "request"])
+ response_count = len([r for r in result.network_requests if r.get("event_type") == "response"])
+ failed_count = len([r for r in result.network_requests if r.get("event_type") == "request_failed"])
+
+ print(f"Requests: {request_count}, Responses: {response_count}, Failed: {failed_count}")
+
+ # Find API calls
+ api_calls = [r for r in result.network_requests
+ if r.get("event_type") == "request" and "api" in r.get("url", "")]
+ if api_calls:
+ print(f"Detected {len(api_calls)} API calls:")
+ for call in api_calls[:3]: # Show first 3
+ print(f" - {call.get('method')} {call.get('url')}")
+
+ # Analyze console messages
+ if result.console_messages:
+ print(f"Captured {len(result.console_messages)} console messages")
+
+ # Group by type
+ message_types = {}
+ for msg in result.console_messages:
+ msg_type = msg.get("type", "unknown")
+ message_types[msg_type] = message_types.get(msg_type, 0) + 1
+
+ print("Message types:", message_types)
+
+ # Show errors (often the most important)
+ errors = [msg for msg in result.console_messages if msg.get("type") == "error"]
+ if errors:
+ print(f"Found {len(errors)} console errors:")
+ for err in errors[:2]: # Show first 2
+ print(f" - {err.get('text', '')[:100]}")
+
+ # Export all captured data to a file for detailed analysis
+ with open("network_capture.json", "w") as f:
+ json.dump({
+ "url": result.url,
+ "network_requests": result.network_requests or [],
+ "console_messages": result.console_messages or []
+ }, f, indent=2)
+
+ print("Exported detailed capture data to network_capture.json")
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+## Captured Data Structure
+
+### Network Requests
+
+The `result.network_requests` field contains a list of dictionaries, each representing a network event with these common fields:
+
+| Field | Description |
+|-------|-------------|
+| `event_type` | Type of event: `"request"`, `"response"`, or `"request_failed"` |
+| `url` | The URL of the request |
+| `timestamp` | Unix timestamp when the event was captured |
+
+#### Request Event Fields
+
+```json
+{
+ "event_type": "request",
+ "url": "https://example.com/api/data.json",
+ "method": "GET",
+ "headers": {"User-Agent": "...", "Accept": "..."},
+ "post_data": "key=value&otherkey=value",
+ "resource_type": "fetch",
+ "is_navigation_request": false,
+ "timestamp": 1633456789.123
+}
+```
+
+#### Response Event Fields
+
+```json
+{
+ "event_type": "response",
+ "url": "https://example.com/api/data.json",
+ "status": 200,
+ "status_text": "OK",
+ "headers": {"Content-Type": "application/json", "Cache-Control": "..."},
+ "from_service_worker": false,
+ "request_timing": {"requestTime": 1234.56, "receiveHeadersEnd": 1234.78},
+ "timestamp": 1633456789.456
+}
+```
+
+#### Failed Request Event Fields
+
+```json
+{
+ "event_type": "request_failed",
+ "url": "https://example.com/missing.png",
+ "method": "GET",
+ "resource_type": "image",
+ "failure_text": "net::ERR_ABORTED 404",
+ "timestamp": 1633456789.789
+}
+```
+
+### Console Messages
+
+The `result.console_messages` field contains a list of dictionaries, each representing a console message with these common fields:
+
+| Field | Description |
+|-------|-------------|
+| `type` | Message type: `"log"`, `"error"`, `"warning"`, `"info"`, etc. |
+| `text` | The message text |
+| `timestamp` | Unix timestamp when the message was captured |
+
+#### Console Message Example
+
+```json
+{
+ "type": "error",
+ "text": "Uncaught TypeError: Cannot read property 'length' of undefined",
+ "location": "https://example.com/script.js:123:45",
+ "timestamp": 1633456790.123
+}
+```
+
+## Key Benefits
+
+- **Full Request Visibility**: Capture all network activity including:
+ - Requests (URLs, methods, headers, post data)
+ - Responses (status codes, headers, timing)
+ - Failed requests (with error messages)
+
+- **Console Message Access**: View all JavaScript console output:
+ - Log messages
+ - Warnings
+ - Errors with stack traces
+ - Developer debugging information
+
+- **Debugging Power**: Identify issues such as:
+ - Failed API calls or resource loading
+ - JavaScript errors affecting page functionality
+ - CORS or other security issues
+ - Hidden API endpoints and data flows
+
+- **Security Analysis**: Detect:
+ - Unexpected third-party requests
+ - Data leakage in request payloads
+ - Suspicious script behavior
+
+- **Performance Insights**: Analyze:
+ - Request timing data
+ - Resource loading patterns
+ - Potential bottlenecks
+
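+To make the performance point concrete, here is a minimal sketch that ranks captured responses by the `receiveHeadersEnd` offset from the `request_timing` structure shown earlier. It assumes `result` comes from a crawl with `capture_network_requests=True`; treat the values as relative timings, not wall-clock durations.
+
+```python
+# Rank responses by how long their headers took to arrive (a rough proxy)
+responses = [r for r in (result.network_requests or [])
+             if r.get("event_type") == "response" and r.get("request_timing")]
+
+def headers_end(r):
+    return r["request_timing"].get("receiveHeadersEnd") or 0.0
+
+for r in sorted(responses, key=headers_end, reverse=True)[:5]:
+    print(f"{headers_end(r):10.2f}  {r.get('url')}")
+```
+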
+## Use Cases
+
+1. **API Discovery**: Identify hidden endpoints and data flows in single-page applications
+2. **Debugging**: Track down JavaScript errors affecting page functionality
+3. **Security Auditing**: Detect unwanted third-party requests or data leakage
+4. **Performance Analysis**: Identify slow-loading resources
+5. **Ad/Tracker Analysis**: Detect and catalog advertising or tracking calls
+
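+Use cases 3 and 5, for example, both reduce to spotting hosts other than the page's own. A minimal sketch, assuming `result` comes from a crawl with `capture_network_requests=True`:
+
+```python
+from urllib.parse import urlparse
+
+page_host = urlparse(result.url).netloc
+third_party_hosts = sorted({
+    urlparse(r.get("url", "")).netloc
+    for r in (result.network_requests or [])
+    if r.get("event_type") == "request"
+    and urlparse(r.get("url", "")).netloc not in ("", page_host)
+})
+print(f"{len(third_party_hosts)} third-party hosts contacted:")
+for host in third_party_hosts:
+    print(" -", host)
+```
+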
+This capability is especially valuable for JavaScript-heavy sites, single-page applications, and any situation where you need to understand the exact communication between the browser and remote servers.
\ No newline at end of file
diff --git a/docs/md_v2/api/crawl-result.md b/docs/md_v2/api/crawl-result.md
index 4c42009b..52cf6ace 100644
--- a/docs/md_v2/api/crawl-result.md
+++ b/docs/md_v2/api/crawl-result.md
@@ -15,6 +15,7 @@ class CrawlResult(BaseModel):
downloaded_files: Optional[List[str]] = None
screenshot: Optional[str] = None
pdf : Optional[bytes] = None
+ mhtml: Optional[str] = None
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
extracted_content: Optional[str] = None
metadata: Optional[dict] = None
@@ -236,7 +237,16 @@ if result.pdf:
f.write(result.pdf)
```
-### 5.5 **`metadata`** *(Optional[dict])*
+### 5.5 **`mhtml`** *(Optional[str])*
+**What**: MHTML snapshot of the page if `capture_mhtml=True` in `CrawlerRunConfig`. The MHTML (MIME HTML) format preserves the entire web page with all its resources (CSS, images, scripts, etc.) in a single file.
+**Usage**:
+```python
+if result.mhtml:
+ with open("page.mhtml", "w", encoding="utf-8") as f:
+ f.write(result.mhtml)
+```
+
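+Capturing is opt-in. A minimal sketch of the producing side (the `capture_mhtml` flag is documented in `CrawlerRunConfig`; the URL is a placeholder):
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def snapshot(url: str) -> None:
+    config = CrawlerRunConfig(capture_mhtml=True)  # off by default
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url=url, config=config)
+        if result.mhtml:
+            with open("page.mhtml", "w", encoding="utf-8") as f:
+                f.write(result.mhtml)
+
+asyncio.run(snapshot("https://example.com"))
+```
+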
+### 5.6 **`metadata`** *(Optional[dict])*
**What**: Page-level metadata if discovered (title, description, OG data, etc.).
**Usage**:
```python
@@ -271,7 +281,69 @@ for result in results:
---
-## 7. Example: Accessing Everything
+## 7. Network Requests & Console Messages
+
+When you enable network and console message capturing in `CrawlerRunConfig` using `capture_network_requests=True` and `capture_console_messages=True`, the `CrawlResult` will include these fields:
+
+### 7.1 **`network_requests`** *(Optional[List[Dict[str, Any]]])*
+**What**: A list of dictionaries containing information about all network requests, responses, and failures captured during the crawl.
+**Structure**:
+- Each item has an `event_type` field that can be `"request"`, `"response"`, or `"request_failed"`.
+- Request events include `url`, `method`, `headers`, `post_data`, `resource_type`, and `is_navigation_request`.
+- Response events include `url`, `status`, `status_text`, `headers`, and `request_timing`.
+- Failed request events include `url`, `method`, `resource_type`, and `failure_text`.
+- All events include a `timestamp` field.
+
+**Usage**:
+```python
+if result.network_requests:
+ # Count different types of events
+ requests = [r for r in result.network_requests if r.get("event_type") == "request"]
+ responses = [r for r in result.network_requests if r.get("event_type") == "response"]
+ failures = [r for r in result.network_requests if r.get("event_type") == "request_failed"]
+
+ print(f"Captured {len(requests)} requests, {len(responses)} responses, and {len(failures)} failures")
+
+ # Analyze API calls
+ api_calls = [r for r in requests if "api" in r.get("url", "")]
+
+ # Identify failed resources
+ for failure in failures:
+ print(f"Failed to load: {failure.get('url')} - {failure.get('failure_text')}")
+```
+
+### 7.2 **`console_messages`** *(Optional[List[Dict[str, Any]]])*
+**What**: A list of dictionaries containing all browser console messages captured during the crawl.
+**Structure**:
+- Each item has a `type` field indicating the message type (e.g., `"log"`, `"error"`, `"warning"`, etc.).
+- The `text` field contains the actual message text.
+- Some messages include `location` information (URL, line, column).
+- All messages include a `timestamp` field.
+
+**Usage**:
+```python
+if result.console_messages:
+ # Count messages by type
+ message_types = {}
+ for msg in result.console_messages:
+ msg_type = msg.get("type", "unknown")
+ message_types[msg_type] = message_types.get(msg_type, 0) + 1
+
+ print(f"Message type counts: {message_types}")
+
+ # Display errors (which are usually most important)
+ for msg in result.console_messages:
+ if msg.get("type") == "error":
+ print(f"Error: {msg.get('text')}")
+```
+
+These fields provide deep visibility into the page's network activity and browser console, which is invaluable for debugging, security analysis, and understanding complex web applications.
+
+For more details on network and console capturing, see the [Network & Console Capture documentation](../advanced/network-console-capture.md).
+
+---
+
+## 8. Example: Accessing Everything
```python
async def handle_result(result: CrawlResult):
@@ -304,16 +376,36 @@ async def handle_result(result: CrawlResult):
if result.extracted_content:
print("Structured data:", result.extracted_content)
- # Screenshot/PDF
+ # Screenshot/PDF/MHTML
if result.screenshot:
print("Screenshot length:", len(result.screenshot))
if result.pdf:
print("PDF bytes length:", len(result.pdf))
+ if result.mhtml:
+ print("MHTML length:", len(result.mhtml))
+
+ # Network and console capturing
+ if result.network_requests:
+ print(f"Network requests captured: {len(result.network_requests)}")
+ # Analyze request types
+ req_types = {}
+ for req in result.network_requests:
+ if "resource_type" in req:
+ req_types[req["resource_type"]] = req_types.get(req["resource_type"], 0) + 1
+ print(f"Resource types: {req_types}")
+
+ if result.console_messages:
+ print(f"Console messages captured: {len(result.console_messages)}")
+ # Count by message type
+ msg_types = {}
+ for msg in result.console_messages:
+ msg_types[msg.get("type", "unknown")] = msg_types.get(msg.get("type", "unknown"), 0) + 1
+ print(f"Message types: {msg_types}")
```
---
-## 8. Key Points & Future
+## 9. Key Points & Future
1. **Deprecated legacy properties of CrawlResult**
- `markdown_v2` - Deprecated in v0.5. Just use `markdown`. It holds the `MarkdownGenerationResult` now!
diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md
index d352e162..c7ac21ae 100644
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -70,7 +70,7 @@ We group them by category.
|------------------------------|--------------------------------------|-------------------------------------------------------------------------------------------------|
| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
-| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). |
+| **`markdown_generator`**     | `MarkdownGenerationStrategy` (None)  | If you want specialized markdown output (citations, filtering, chunking, etc.). Can be customized with options such as the `content_source` parameter, which selects the HTML input source ('cleaned_html', 'raw_html', or 'fit_html'). |
| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. |
| **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. |
| **`excluded_tags`** | `list` (None) | Removes entire tags (e.g. `["script", "style"]`). |
@@ -140,6 +140,7 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
| **`screenshot_wait_for`** | `float or None` | Extra wait time before the screenshot. |
| **`screenshot_height_threshold`** | `int` (~20000) | If the page is taller than this, alternate screenshot strategies are used. |
| **`pdf`** | `bool` (False) | If `True`, returns a PDF in `result.pdf`. |
+| **`capture_mhtml`** | `bool` (False) | If `True`, captures an MHTML snapshot of the page in `result.mhtml`. MHTML includes all page resources (CSS, images, etc.) in a single file. |
| **`image_description_min_word_threshold`** | `int` (~50) | Minimum words for an image’s alt text or description to be considered valid. |
| **`image_score_threshold`** | `int` (~3) | Filter out low-scoring images. The crawler scores images by relevance (size, context, etc.). |
| **`exclude_external_images`** | `bool` (False) | Exclude images from other domains. |
@@ -231,6 +232,7 @@ async def main():
if __name__ == "__main__":
asyncio.run(main())
+```
## 2.4 Compliance & Ethics
diff --git a/docs/md_v2/ask_ai/ask-ai.css b/docs/md_v2/ask_ai/ask-ai.css
new file mode 100644
index 00000000..c464d43b
--- /dev/null
+++ b/docs/md_v2/ask_ai/ask-ai.css
@@ -0,0 +1,444 @@
+/* ==== File: docs/ask_ai/ask_ai.css ==== */
+
+/* --- Basic Reset & Font --- */
+body {
+ /* Attempt to inherit variables from parent window (iframe context) */
+ /* Fallback values if variables are not inherited */
+ --fallback-bg: #070708;
+ --fallback-font: #e8e9ed;
+ --fallback-secondary: #a3abba;
+ --fallback-primary: #50ffff;
+ --fallback-primary-dimmed: #09b5a5;
+ --fallback-border: #1d1d20;
+ --fallback-code-bg: #1e1e1e;
+ --fallback-invert-font: #222225;
+ --font-stack: dm, Monaco, Courier New, monospace, serif;
+
+ font-family: var(--font-stack, "Courier New", monospace); /* Use theme font stack */
+ background-color: var(--background-color, var(--fallback-bg));
+ color: var(--font-color, var(--fallback-font));
+ margin: 0;
+ padding: 0;
+ font-size: 14px; /* Match global font size */
+ line-height: 1.5em; /* Match global line height */
+ height: 100vh; /* Ensure body takes full height */
+ overflow: hidden; /* Prevent body scrollbars, panels handle scroll */
+ display: flex; /* Use flex for the main container */
+}
+
+a {
+ color: var(--secondary-color, var(--fallback-secondary));
+ text-decoration: none;
+ transition: color 0.2s;
+}
+a:hover {
+ color: var(--primary-color, var(--fallback-primary));
+}
+
+/* --- Main Container Layout --- */
+.ai-assistant-container {
+ display: flex;
+ width: 100%;
+ height: 100%;
+ background-color: var(--background-color, var(--fallback-bg));
+}
+
+/* --- Sidebar Styling --- */
+.sidebar {
+ flex-shrink: 0; /* Prevent sidebars from shrinking */
+ height: 100%;
+ display: flex;
+ flex-direction: column;
+ /* background-color: var(--code-bg-color, var(--fallback-code-bg)); */
+ overflow-y: hidden; /* Header fixed, list scrolls */
+}
+
+.left-sidebar {
+ flex-basis: 240px; /* Width of history panel */
+ border-right: 1px solid var(--progress-bar-background, var(--fallback-border));
+}
+
+.right-sidebar {
+ flex-basis: 280px; /* Width of citations panel */
+ border-left: 1px solid var(--progress-bar-background, var(--fallback-border));
+}
+
+.sidebar header {
+ padding: 0.6em 1em;
+ border-bottom: 1px solid var(--progress-bar-background, var(--fallback-border));
+ flex-shrink: 0;
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+}
+
+.sidebar header h3 {
+ margin: 0;
+ font-size: 1.1em;
+ color: var(--font-color, var(--fallback-font));
+}
+
+.sidebar ul {
+ list-style: none;
+ padding: 0;
+ margin: 0;
+ overflow-y: auto; /* Enable scrolling for the list */
+ flex-grow: 1; /* Allow list to take remaining space */
+ padding: 0.5em 0;
+}
+
+.sidebar ul li {
+ padding: 0.3em 1em;
+}
+.sidebar ul li.no-citations,
+.sidebar ul li.no-history {
+ color: var(--secondary-color, var(--fallback-secondary));
+ font-style: italic;
+ font-size: 0.9em;
+ padding-left: 1em;
+}
+
+.sidebar ul li a {
+ color: var(--secondary-color, var(--fallback-secondary));
+ text-decoration: none;
+ display: block;
+ padding: 0.2em 0.5em;
+ border-radius: 3px;
+ transition: background-color 0.2s, color 0.2s;
+}
+
+.sidebar ul li a:hover {
+ color: var(--primary-color, var(--fallback-primary));
+ background-color: rgba(80, 255, 255, 0.08); /* Use primary color with alpha */
+}
+/* Style for active history item */
+#history-list li.active a {
+ color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
+ font-weight: bold;
+ background-color: rgba(80, 255, 255, 0.12);
+}
+
+/* --- Chat Panel Styling --- */
+#chat-panel {
+ flex-grow: 1; /* Take remaining space */
+ display: flex;
+ flex-direction: column;
+ height: 100%;
+ overflow: hidden; /* Prevent overflow, internal elements handle scroll */
+}
+
+#chat-messages {
+ flex-grow: 1;
+ overflow-y: auto; /* Scrollable chat history */
+ padding: 1em 1.5em;
+ border-bottom: 1px solid var(--progress-bar-background, var(--fallback-border));
+}
+
+.message {
+ margin-bottom: 1em;
+ padding: 0.8em 1.2em;
+ border-radius: 8px;
+ max-width: 90%; /* Slightly wider */
+ line-height: 1.6;
+ /* Apply pre-wrap for better handling of spaces/newlines AND wrapping */
+ white-space: pre-wrap;
+ word-wrap: break-word; /* Ensure long words break */
+}
+
+.user-message {
+ background-color: var(--progress-bar-background, var(--fallback-border)); /* User message background */
+ color: var(--font-color, var(--fallback-font));
+ margin-left: auto; /* Align user messages to the right */
+ text-align: left;
+}
+
+.ai-message {
+ background-color: var(--code-bg-color, var(--fallback-code-bg)); /* AI message background */
+ color: var(--font-color, var(--fallback-font));
+ margin-right: auto; /* Align AI messages to the left */
+ border: 1px solid var(--progress-bar-background, var(--fallback-border));
+}
+.ai-message.welcome-message {
+ border: none;
+ background-color: transparent;
+ max-width: 100%;
+ text-align: center;
+ color: var(--secondary-color, var(--fallback-secondary));
+ white-space: normal;
+}
+
+/* Styles for code within messages */
+.ai-message code {
+ background-color: var(--invert-font-color, var(--fallback-invert-font)) !important; /* Use light bg for code */
+ /* color: var(--background-color, var(--fallback-bg)) !important; Dark text */
+ padding: 0.1em 0.4em;
+ border-radius: 4px;
+ font-size: 0.9em;
+}
+.ai-message pre {
+ background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
+ color: var(--background-color, var(--fallback-bg)) !important;
+ padding: 1em;
+ border-radius: 5px;
+ overflow-x: auto;
+ margin: 0.8em 0;
+ white-space: pre;
+}
+.ai-message pre code {
+ background-color: transparent !important;
+ padding: 0;
+ font-size: inherit;
+}
+
+/* Override white-space for specific elements generated by Markdown */
+.ai-message p,
+.ai-message ul,
+.ai-message ol,
+.ai-message blockquote {
+ white-space: normal; /* Allow standard wrapping for block elements */
+}
+
+/* --- Markdown Element Styling within Messages --- */
+.message p {
+ margin-top: 0;
+ margin-bottom: 0.5em;
+}
+.message p:last-child {
+ margin-bottom: 0;
+}
+.message ul,
+.message ol {
+ margin: 0.5em 0 0.5em 1.5em;
+ padding: 0;
+}
+.message li {
+ margin-bottom: 0.2em;
+}
+
+/* Code block styling (adjusts previous rules slightly) */
+.message code {
+ /* Inline code */
+ background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
+ color: var(--font-color);
+ padding: 0.1em 0.4em;
+ border-radius: 4px;
+ font-size: 0.9em;
+ /* Ensure inline code breaks nicely */
+ word-break: break-all;
+ white-space: normal; /* Allow inline code to wrap if needed */
+}
+.message pre {
+ /* Code block container */
+ background-color: var(--invert-font-color, var(--fallback-invert-font)) !important;
+ color: var(--background-color, var(--fallback-bg)) !important;
+ padding: 1em;
+ border-radius: 5px;
+ overflow-x: auto;
+ margin: 0.8em 0;
+ font-size: 0.9em; /* Slightly smaller code blocks */
+}
+.message pre code {
+ /* Code within code block */
+ background-color: transparent !important;
+ padding: 0;
+ font-size: inherit;
+ word-break: normal; /* Don't break words in code blocks */
+ white-space: pre; /* Preserve whitespace strictly in code blocks */
+}
+
+/* Thinking indicator */
+.message-thinking {
+ display: inline-block;
+ width: 5px;
+ height: 5px;
+ background-color: var(--primary-color, var(--fallback-primary));
+ border-radius: 50%;
+ margin-left: 8px;
+ vertical-align: middle;
+ animation: thinking 1s infinite ease-in-out;
+}
+@keyframes thinking {
+ 0%,
+ 100% {
+ opacity: 0.5;
+ transform: scale(0.8);
+ }
+ 50% {
+ opacity: 1;
+ transform: scale(1.2);
+ }
+}
+
+/* --- Thinking Indicator (Blinking Cursor Style) --- */
+.thinking-indicator-cursor {
+ display: inline-block;
+ width: 10px; /* Width of the cursor */
+ height: 1.1em; /* Match line height */
+ background-color: var(--primary-color, var(--fallback-primary));
+ margin-left: 5px;
+ vertical-align: text-bottom; /* Align with text baseline */
+ animation: blink-cursor 1s step-end infinite;
+}
+
+@keyframes blink-cursor {
+ from,
+ to {
+ background-color: transparent;
+ }
+ 50% {
+ background-color: var(--primary-color, var(--fallback-primary));
+ }
+}
+
+#chat-input-area {
+ flex-shrink: 0; /* Prevent input area from shrinking */
+ padding: 1em 1.5em;
+ display: flex;
+ align-items: flex-end; /* Align items to bottom */
+ gap: 10px;
+ background-color: var(--code-bg-color, var(--fallback-code-bg)); /* Match sidebars */
+}
+
+#chat-input-area textarea {
+ flex-grow: 1;
+ padding: 0.8em 1em;
+ border: 1px solid var(--progress-bar-background, var(--fallback-border));
+ background-color: var(--background-color, var(--fallback-bg));
+ color: var(--font-color, var(--fallback-font));
+ border-radius: 5px;
+ resize: none; /* Disable manual resize */
+ font-family: inherit;
+ font-size: 1em;
+ line-height: 1.4;
+ max-height: 150px; /* Limit excessive height */
+ overflow-y: auto;
+ /* rows: 2; */
+}
+
+#chat-input-area button {
+ /* Basic button styling - maybe inherit from main theme? */
+ padding: 0.6em 1.2em;
+ border: 1px solid var(--primary-dimmed-color, var(--fallback-primary-dimmed));
+ background-color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
+ color: var(--background-color, var(--fallback-bg));
+ border-radius: 5px;
+ cursor: pointer;
+ font-size: 0.9em;
+ transition: background-color 0.2s, border-color 0.2s;
+ height: min-content; /* Align with bottom of textarea */
+}
+
+#chat-input-area button:hover {
+ background-color: var(--primary-color, var(--fallback-primary));
+ border-color: var(--primary-color, var(--fallback-primary));
+}
+#chat-input-area button:disabled {
+ opacity: 0.6;
+ cursor: not-allowed;
+}
+
+.loading-indicator {
+ font-size: 0.9em;
+ color: var(--secondary-color, var(--fallback-secondary));
+ margin-right: 10px;
+ align-self: center;
+}
+
+/* --- Buttons --- */
+/* Inherit some button styles if possible */
+.btn.btn-sm {
+ color: var(--font-color, var(--fallback-font));
+ padding: 0.2em 0.5em;
+ font-size: 0.8em;
+ border: 1px solid var(--secondary-color, var(--fallback-secondary));
+ background: none;
+ border-radius: 3px;
+ cursor: pointer;
+}
+.btn.btn-sm:hover {
+ border-color: var(--font-color, var(--fallback-font));
+ background-color: var(--progress-bar-background, var(--fallback-border));
+}
+
+/* --- Basic Responsiveness --- */
+@media screen and (max-width: 900px) {
+ .left-sidebar {
+ flex-basis: 200px; /* Shrink history */
+ }
+ .right-sidebar {
+ flex-basis: 240px; /* Shrink citations */
+ }
+}
+
+@media screen and (max-width: 768px) {
+ /* Stack layout on mobile? Or hide sidebars? Hiding for now */
+ .sidebar {
+ display: none; /* Hide sidebars on small screens */
+ }
+ /* Could add toggle buttons later */
+}
+
+
+/* ==== File: docs/ask_ai/ask-ai.css (Updates V4 - Delete Button) ==== */
+
+
+.sidebar ul li {
+ /* Use flexbox to align link and delete button */
+ display: flex;
+ justify-content: space-between;
+ align-items: center;
+ padding: 0; /* Remove padding from li, add to link/button */
+ margin: 0.1em 0; /* Small vertical margin */
+}
+
+.sidebar ul li a {
+ /* Link takes most space */
+ flex-grow: 1;
+ padding: 0.3em 0.5em 0.3em 1em; /* Adjust padding */
+ /* Make ellipsis work for long titles */
+ white-space: nowrap;
+ overflow: hidden;
+ text-overflow: ellipsis;
+ /* Keep existing link styles */
+ color: var(--secondary-color, var(--fallback-secondary));
+ text-decoration: none;
+ display: block;
+ border-radius: 3px;
+ transition: background-color 0.2s, color 0.2s;
+}
+.sidebar ul li a:hover {
+ color: var(--primary-color, var(--fallback-primary));
+ background-color: rgba(80, 255, 255, 0.08);
+}
+
+/* Style for active history item's link */
+#history-list li.active a {
+ color: var(--primary-dimmed-color, var(--fallback-primary-dimmed));
+ font-weight: bold;
+ background-color: rgba(80, 255, 255, 0.12);
+}
+
+/* --- Delete Chat Button --- */
+.delete-chat-btn {
+ flex-shrink: 0; /* Don't shrink */
+ background: none;
+ border: none;
+ color: var(--secondary-color, var(--fallback-secondary));
+ cursor: pointer;
+ padding: 0.4em 0.8em; /* Padding around icon */
+ font-size: 0.9em;
+ opacity: 0.5; /* Dimmed by default */
+ transition: opacity 0.2s, color 0.2s;
+ margin-left: 5px; /* Space between link and button */
+ border-radius: 3px;
+}
+
+.sidebar ul li:hover .delete-chat-btn,
+.delete-chat-btn:hover {
+ opacity: 1; /* Show fully on hover */
+ color: var(--error-color, #ff3c74); /* Use error color on hover */
+}
+.delete-chat-btn:focus {
+ outline: 1px dashed var(--error-color, #ff3c74); /* Accessibility */
+ opacity: 1;
+}
diff --git a/docs/md_v2/ask_ai/ask-ai.js b/docs/md_v2/ask_ai/ask-ai.js
new file mode 100644
index 00000000..bb1b370c
--- /dev/null
+++ b/docs/md_v2/ask_ai/ask-ai.js
@@ -0,0 +1,607 @@
+// ==== File: docs/ask_ai/ask-ai.js (Marked, Streaming, History) ====
+
+document.addEventListener("DOMContentLoaded", () => {
+ console.log("AI Assistant JS V2 Loaded");
+
+ // --- DOM Element Selectors ---
+ const historyList = document.getElementById("history-list");
+ const newChatButton = document.getElementById("new-chat-button");
+ const chatMessages = document.getElementById("chat-messages");
+ const chatInput = document.getElementById("chat-input");
+ const sendButton = document.getElementById("send-button");
+ const citationsList = document.getElementById("citations-list");
+
+ // --- Constants ---
+ const CHAT_INDEX_KEY = "aiAssistantChatIndex_v1";
+ const CHAT_PREFIX = "aiAssistantChat_v1_";
+
+ // --- State ---
+ let currentChatId = null;
+ let conversationHistory = []; // Holds message objects { sender: 'user'/'ai', text: '...' }
+ let isThinking = false;
+ let streamInterval = null; // To control the streaming interval
+
+ // --- Event Listeners ---
+ sendButton.addEventListener("click", handleSendMessage);
+ chatInput.addEventListener("keydown", handleInputKeydown);
+ newChatButton.addEventListener("click", handleNewChat);
+ chatInput.addEventListener("input", autoGrowTextarea);
+
+ // --- Initialization ---
+ loadChatHistoryIndex(); // Load history list on startup
+  const initialQuery = checkForInitialQuery(window.parent.location); // Check for query param (assumes a same-origin parent; cross-origin access would throw)
+ if (!initialQuery) {
+ loadInitialChat(); // Load normally if no query
+ }
+
+ // --- Core Functions ---
+
+ function handleSendMessage() {
+ const userMessageText = chatInput.value.trim();
+ if (!userMessageText || isThinking) return;
+
+ setThinking(true); // Start thinking state
+
+ // Add user message to state and UI
+ const userMessage = { sender: "user", text: userMessageText };
+ conversationHistory.push(userMessage);
+    addMessageToChat(userMessage, false); // false = no thinking indicator (markdown is still parsed inside)
+
+ chatInput.value = "";
+ autoGrowTextarea(); // Reset textarea height
+
+ // Prepare for AI response (create empty div)
+ const aiMessageDiv = addMessageToChat({ sender: "ai", text: "" }, true); // Add empty div with thinking indicator
+
+ // TODO: Generate fingerprint/JWT here
+
+ // TODO: Send `conversationHistory` + JWT to backend API
+ // Replace placeholder below with actual API call
+ // The backend should ideally return a stream of text tokens
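+    // Wiring sketch (assumption; the endpoint and payload shape are not defined here):
+    //   const res = await fetch("/api/ask-ai", { method: "POST", body: JSON.stringify(conversationHistory) });
+    //   const reader = res.body.getReader(); // read chunks and append them to the
+    //   // message div, reusing the same render path streamSimulatedResponse uses.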
+
+ // --- Placeholder Streaming Simulation ---
+    const simulatedFullResponse = `Okay, here's a minimal Python script that creates an AsyncWebCrawler, fetches a webpage, and prints the first 300 characters of its Markdown output:
+
+\`\`\`python
+import asyncio
+from crawl4ai import AsyncWebCrawler
+
+async def main():
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun("https://example.com")
+ print(result.markdown[:300]) # Print first 300 chars
+
+if __name__ == "__main__":
+ asyncio.run(main())
+\`\`\`
+
+A code snippet: \`crawler.run()\`. Check the [quickstart](/core/quickstart).`;
+
+ // Simulate receiving the response stream
+ streamSimulatedResponse(aiMessageDiv, simulatedFullResponse);
+
+ // // Simulate receiving citations *after* stream starts (or with first chunk)
+ // setTimeout(() => {
+ // addCitations([
+ // { title: "Simulated Doc 1", url: "#sim1" },
+ // { title: "Another Concept", url: "#sim2" },
+ // ]);
+ // }, 500); // Citations appear shortly after thinking starts
+ }
+
+ function handleInputKeydown(event) {
+ if (event.key === "Enter" && !event.shiftKey) {
+ event.preventDefault();
+ handleSendMessage();
+ }
+ }
+
+ function addMessageToChat(message, addThinkingIndicator = false) {
+ const messageDiv = document.createElement("div");
+ messageDiv.classList.add("message", `${message.sender}-message`);
+
+ // Parse markdown and set HTML
+ messageDiv.innerHTML = message.text ? marked.parse(message.text) : "";
+
+ if (message.sender === "ai") {
+ // Apply Syntax Highlighting AFTER setting innerHTML
+ messageDiv.querySelectorAll("pre code:not(.hljs)").forEach((block) => {
+ if (typeof hljs !== "undefined") {
+ // Check if already highlighted to prevent double-highlighting issues
+ if (!block.classList.contains("hljs")) {
+ hljs.highlightElement(block);
+ }
+ } else {
+ console.warn("highlight.js (hljs) not found for syntax highlighting.");
+ }
+ });
+
+ // Add thinking indicator if needed (and not already present)
+ if (addThinkingIndicator && !message.text && !messageDiv.querySelector(".thinking-indicator-cursor")) {
+ const thinkingDiv = document.createElement("div");
+ thinkingDiv.className = "thinking-indicator-cursor";
+ messageDiv.appendChild(thinkingDiv);
+ }
+ } else {
+ // User messages remain plain text
+ // messageDiv.textContent = message.text;
+ }
+
+ // wrap each pre in a div.terminal
+ messageDiv.querySelectorAll("pre").forEach((block) => {
+ const wrapper = document.createElement("div");
+ wrapper.className = "terminal";
+ block.parentNode.insertBefore(wrapper, block);
+ wrapper.appendChild(block);
+ });
+
+ chatMessages.appendChild(messageDiv);
+ // Scroll only if user is near the bottom? (More advanced)
+ // Simple scroll for now:
+ scrollToBottom();
+ return messageDiv; // Return the created element
+ }
+
+ function streamSimulatedResponse(messageDiv, fullText) {
+ const thinkingIndicator = messageDiv.querySelector(".thinking-indicator-cursor");
+ if (thinkingIndicator) thinkingIndicator.remove();
+
+ const tokens = fullText.split(/(\s+)/);
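+    // The capture group in /(\s+)/ keeps each whitespace run as its own token,
+    // so concatenating the tokens back reproduces the original spacing exactly.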
+ let currentText = "";
+ let tokenIndex = 0;
+ // Clear previous interval just in case
+ if (streamInterval) clearInterval(streamInterval);
+
+ streamInterval = setInterval(() => {
+      const cursorSpan = '<span class="thinking-indicator-cursor"></span>'; // Cursor for streaming (markup assumed; tags were stripped from the source)
+ if (tokenIndex < tokens.length) {
+ currentText += tokens[tokenIndex];
+ // Render intermediate markdown + cursor
+ messageDiv.innerHTML = marked.parse(currentText + cursorSpan);
+ // Re-highlight code blocks on each stream update - might be slightly inefficient
+ // but ensures partial code blocks look okay. Highlight only final on completion.
+ // messageDiv.querySelectorAll('pre code:not(.hljs)').forEach((block) => {
+ // hljs.highlightElement(block);
+ // });
+ scrollToBottom(); // Keep scrolling as content streams
+ tokenIndex++;
+ } else {
+ // Streaming finished
+ clearInterval(streamInterval);
+ streamInterval = null;
+
+ // Final render without cursor
+ messageDiv.innerHTML = marked.parse(currentText);
+
+ // === Final Syntax Highlighting ===
+ messageDiv.querySelectorAll("pre code:not(.hljs)").forEach((block) => {
+ if (typeof hljs !== "undefined" && !block.classList.contains("hljs")) {
+ hljs.highlightElement(block);
+ }
+ });
+
+ // === Extract Citations ===
+ const citations = extractMarkdownLinks(currentText);
+
+ // Wrap each pre in a div.terminal
+ messageDiv.querySelectorAll("pre").forEach((block) => {
+ const wrapper = document.createElement("div");
+ wrapper.className = "terminal";
+ block.parentNode.insertBefore(wrapper, block);
+ wrapper.appendChild(block);
+ });
+
+ const aiMessage = { sender: "ai", text: currentText, citations: citations };
+ conversationHistory.push(aiMessage);
+ updateCitationsDisplay();
+ saveCurrentChat();
+ setThinking(false);
+ }
+ }, 50); // Adjust speed
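+    // At ~50 ms per token, a 200-token reply takes about 10 s to finish streaming;
+    // lower the interval for snappier playback.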
+ }
+
+ // === NEW Function to Extract Links ===
+ function extractMarkdownLinks(markdownText) {
+ const regex = /\[([^\]]+)\]\(([^)]+)\)/g; // [text](url)
+ const citations = [];
+ let match;
+ while ((match = regex.exec(markdownText)) !== null) {
+ // Avoid adding self-links from within the citations list if AI includes them
+ if (!match[2].startsWith("#citation-")) {
+ citations.push({
+ title: match[1].trim(),
+ url: match[2].trim(),
+ });
+ }
+ }
+ // Optional: Deduplicate links based on URL
+ const uniqueCitations = citations.filter(
+ (citation, index, self) => index === self.findIndex((c) => c.url === citation.url)
+ );
+ return uniqueCitations;
+ }
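+  // e.g. extractMarkdownLinks("See the [quickstart](/core/quickstart).")
+  //   -> [{ title: "quickstart", url: "/core/quickstart" }]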
+
+ // === REVISED Function to Display Citations ===
+ function updateCitationsDisplay() {
+ let lastCitations = null;
+ // Find the most recent AI message with citations
+ for (let i = conversationHistory.length - 1; i >= 0; i--) {
+ if (
+ conversationHistory[i].sender === "ai" &&
+ conversationHistory[i].citations &&
+ conversationHistory[i].citations.length > 0
+ ) {
+ lastCitations = conversationHistory[i].citations;
+ break; // Found the latest citations
+ }
+ }
+
+ citationsList.innerHTML = ""; // Clear previous
+ if (!lastCitations) {
+      citationsList.innerHTML = '<li>No citations available.</li>';
+ return;
+ }
+
+ lastCitations.forEach((citation, index) => {
+ const li = document.createElement("li");
+ const a = document.createElement("a");
+ // Generate a unique ID for potential internal linking if needed
+ // a.id = `citation-${index}`;
+ a.href = citation.url || "#";
+ a.textContent = citation.title;
+ a.target = "_top"; // Open in main window
+ li.appendChild(a);
+ citationsList.appendChild(li);
+ });
+ }
+
+ function addCitations(citations) {
+ citationsList.innerHTML = ""; // Clear
+ if (!citations || citations.length === 0) {
+      citationsList.innerHTML = '<li>No citations available.</li>';
+ return;
+ }
+ citations.forEach((citation) => {
+ const li = document.createElement("li");
+ const a = document.createElement("a");
+ a.href = citation.url || "#";
+ a.textContent = citation.title;
+ a.target = "_top"; // Open in main window
+ li.appendChild(a);
+ citationsList.appendChild(li);
+ });
+ }
+
+ function setThinking(thinking) {
+ isThinking = thinking;
+ sendButton.disabled = thinking;
+ chatInput.disabled = thinking;
+ chatInput.placeholder = thinking ? "AI is responding..." : "Ask about Crawl4AI...";
+ // Stop any existing stream if we start thinking again (e.g., rapid resend)
+ if (thinking && streamInterval) {
+ clearInterval(streamInterval);
+ streamInterval = null;
+ }
+ }
+
+ function autoGrowTextarea() {
+ chatInput.style.height = "auto";
+ chatInput.style.height = `${chatInput.scrollHeight}px`;
+ }
+
+ function scrollToBottom() {
+ chatMessages.scrollTop = chatMessages.scrollHeight;
+ }
+
+ // --- Query Parameter Handling ---
+ function checkForInitialQuery(locationToCheck) {
+ // <-- Receive location object
+ if (!locationToCheck) {
+ console.warn("Ask AI: Could not access parent window location.");
+ return false;
+ }
+ const urlParams = new URLSearchParams(locationToCheck.search); // <-- Use passed location's search string
+ const encodedQuery = urlParams.get("qq"); // <-- Use 'qq'
+
+ if (encodedQuery) {
+ console.log("Initial query found (qq):", encodedQuery);
+ try {
+ const decodedText = decodeURIComponent(escape(atob(encodedQuery)));
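+          // atob() yields a byte string; escape()/decodeURIComponent() rebuilds the UTF-8 text.
+          // escape/unescape are deprecated; a TextDecoder-based sketch (assumption, not in the original):
+          //   const bytes = Uint8Array.from(atob(encodedQuery), c => c.charCodeAt(0));
+          //   const decodedText = new TextDecoder().decode(bytes);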
+ console.log("Decoded query:", decodedText);
+
+ // Start new chat immediately
+ handleNewChat(true);
+
+ // Delay setting input and sending message slightly
+ setTimeout(() => {
+ chatInput.value = decodedText;
+ autoGrowTextarea();
+ handleSendMessage();
+
+ // Clean the PARENT window's URL
+ try {
+ const cleanUrl = locationToCheck.pathname;
+ // Use parent's history object
+ window.parent.history.replaceState({}, window.parent.document.title, cleanUrl);
+ } catch (e) {
+ console.warn("Ask AI: Could not clean parent URL using replaceState.", e);
+ // This might fail due to cross-origin restrictions if served differently,
+ // but should work fine with mkdocs serve on the same origin.
+ }
+ }, 100);
+
+ return true; // Query processed
+ } catch (e) {
+ console.error("Error decoding initial query (qq):", e);
+ // Clean the PARENT window's URL even on error
+ try {
+ const cleanUrl = locationToCheck.pathname;
+ window.parent.history.replaceState({}, window.parent.document.title, cleanUrl);
+ } catch (cleanError) {
+ console.warn("Ask AI: Could not clean parent URL after decode error.", cleanError);
+ }
+ return false;
+ }
+ }
+ return false; // No 'qq' query found
+ }
+
+ // --- History Management ---
+
+ function handleNewChat(isFromQuery = false) {
+ if (isThinking) return; // Don't allow new chat while responding
+
+ // Only save if NOT triggered immediately by a query parameter load
+ if (!isFromQuery) {
+ saveCurrentChat();
+ }
+
+ currentChatId = `chat_${Date.now()}`;
+ conversationHistory = []; // Clear message history state
+ chatMessages.innerHTML = ""; // Start with clean slate for query
+ if (!isFromQuery) {
+ // Show welcome only if manually started
+      // chatMessages.innerHTML = '<div class="message ai-message">Started a new chat! Ask me anything about Crawl4AI.</div>';
+      chatMessages.innerHTML = '<div class="message ai-message">We will launch this feature very soon.</div>';
+ }
+ addCitations([]); // Clear citations
+ updateCitationsDisplay(); // Clear UI
+
+ // Add to index and save
+ let index = loadChatIndex();
+ // Generate a generic title initially, update later
+ const newTitle = isFromQuery ? "Chat from Selection" : `Chat ${new Date().toLocaleString()}`;
+ // index.unshift({ id: currentChatId, title: `Chat ${new Date().toLocaleString()}` }); // Add to start
+ index.unshift({ id: currentChatId, title: newTitle });
+ saveChatIndex(index);
+
+ renderHistoryList(index); // Update UI
+ setActiveHistoryItem(currentChatId);
+ saveCurrentChat(); // Save the empty new chat state
+ }
+
+ function loadChat(chatId) {
+ if (isThinking || chatId === currentChatId) return;
+
+ // Check if chat data actually exists before proceeding
+ const storedChat = localStorage.getItem(CHAT_PREFIX + chatId);
+ if (storedChat === null) {
+ console.warn(`Attempted to load non-existent chat: ${chatId}. Removing from index.`);
+ deleteChatData(chatId); // Clean up index
+ loadChatHistoryIndex(); // Reload history list
+ loadInitialChat(); // Load next available chat
+ return;
+ }
+
+ console.log(`Loading chat: ${chatId}`);
+ saveCurrentChat(); // Save current before switching
+
+ try {
+ conversationHistory = JSON.parse(storedChat);
+ currentChatId = chatId;
+ renderChatMessages(conversationHistory);
+ updateCitationsDisplay();
+ setActiveHistoryItem(chatId);
+ } catch (e) {
+ console.error("Error loading chat:", chatId, e);
+ alert("Failed to load chat data.");
+ conversationHistory = [];
+ renderChatMessages(conversationHistory);
+ updateCitationsDisplay();
+ }
+ }
+
+ function saveCurrentChat() {
+ if (currentChatId && conversationHistory.length > 0) {
+ try {
+ localStorage.setItem(CHAT_PREFIX + currentChatId, JSON.stringify(conversationHistory));
+ console.log(`Chat ${currentChatId} saved.`);
+
+ // Update title in index (e.g., use first user message)
+ let index = loadChatIndex();
+ const currentItem = index.find((item) => item.id === currentChatId);
+ if (
+ currentItem &&
+ conversationHistory[0]?.sender === "user" &&
+ !currentItem.title.startsWith("Chat about:")
+ ) {
+ currentItem.title = `Chat about: ${conversationHistory[0].text.substring(0, 30)}...`;
+ saveChatIndex(index);
+          // Re-render the history list since the title changed (could be optimized to patch just this item)
+ renderHistoryList(index);
+ setActiveHistoryItem(currentChatId); // Re-set active after re-render
+ }
+ } catch (e) {
+ console.error("Error saving chat:", currentChatId, e);
+ // Handle potential storage full errors
+ if (e.name === "QuotaExceededError") {
+ alert("Local storage is full. Cannot save chat history.");
+ // Consider implementing history pruning logic here
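+          // Pruning sketch (assumption, not implemented): pop the oldest entry off
+          // loadChatIndex(), localStorage.removeItem(CHAT_PREFIX + oldest.id), then
+          // retry the setItem until the write fits.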
+ }
+ }
+ } else if (currentChatId) {
+      // Persist an empty array for a freshly created chat so loadChat() can find it later
+ localStorage.setItem(CHAT_PREFIX + currentChatId, JSON.stringify([]));
+ }
+ }
+
+ function loadChatIndex() {
+ try {
+ const storedIndex = localStorage.getItem(CHAT_INDEX_KEY);
+ return storedIndex ? JSON.parse(storedIndex) : [];
+ } catch (e) {
+ console.error("Error loading chat index:", e);
+ return []; // Return empty array on error
+ }
+ }
+
+ function saveChatIndex(indexArray) {
+ try {
+ localStorage.setItem(CHAT_INDEX_KEY, JSON.stringify(indexArray));
+ } catch (e) {
+ console.error("Error saving chat index:", e);
+ }
+ }
+
+ function renderHistoryList(indexArray) {
+ historyList.innerHTML = ""; // Clear existing
+ if (!indexArray || indexArray.length === 0) {
+      historyList.innerHTML = '<li>No past chats found.</li>';
+ return;
+ }
+ indexArray.forEach((item) => {
+ const li = document.createElement("li");
+ li.dataset.chatId = item.id; // Add ID to li for easier selection
+
+ const a = document.createElement("a");
+ a.href = "#";
+ a.dataset.chatId = item.id;
+ a.textContent = item.title || `Chat ${item.id.split("_")[1] || item.id}`;
+ a.title = a.textContent; // Tooltip for potentially long titles
+ a.addEventListener("click", (e) => {
+ e.preventDefault();
+ loadChat(item.id);
+ });
+
+ // === Add Delete Button ===
+ const deleteBtn = document.createElement("button");
+ deleteBtn.className = "delete-chat-btn";
+      deleteBtn.innerHTML = "✕"; // "✕" close glyph (swap for an SVG or FontAwesome icon if preferred)
+ deleteBtn.title = "Delete Chat";
+ deleteBtn.dataset.chatId = item.id; // Store ID on button too
+ deleteBtn.addEventListener("click", handleDeleteChat);
+
+ li.appendChild(a);
+ li.appendChild(deleteBtn); // Append button to the list item
+ historyList.appendChild(li);
+ });
+ }
+
+ function renderChatMessages(messages) {
+ chatMessages.innerHTML = ""; // Clear existing messages
+ messages.forEach((message) => {
+ // Ensure highlighting is applied when loading from history
+ addMessageToChat(message, false);
+ });
+ if (messages.length === 0) {
+      // chatMessages.innerHTML = '<div class="message ai-message">Chat history loaded. Ask a question!</div>';
+      chatMessages.innerHTML = '<div class="message ai-message">We will launch this feature very soon.</div>';
+ }
+ // Scroll to bottom after loading messages
+ scrollToBottom();
+ }
+
+ function setActiveHistoryItem(chatId) {
+ document.querySelectorAll("#history-list li").forEach((li) => li.classList.remove("active"));
+ // Select the LI element directly now
+ const activeLi = document.querySelector(`#history-list li[data-chat-id="${chatId}"]`);
+ if (activeLi) {
+ activeLi.classList.add("active");
+ }
+ }
+
+ function loadInitialChat() {
+ const index = loadChatIndex();
+ if (index.length > 0) {
+ loadChat(index[0].id);
+ } else {
+ // Check if handleNewChat wasn't already called by query handler
+ if (!currentChatId) {
+ handleNewChat();
+ }
+ }
+ }
+
+ function loadChatHistoryIndex() {
+ const index = loadChatIndex();
+ renderHistoryList(index);
+ if (currentChatId) setActiveHistoryItem(currentChatId);
+ }
+
+ // === NEW Function to Handle Delete Click ===
+ function handleDeleteChat(event) {
+ event.stopPropagation(); // Prevent triggering loadChat on the link behind it
+ const button = event.currentTarget;
+ const chatIdToDelete = button.dataset.chatId;
+
+ if (!chatIdToDelete) return;
+
+ // Confirmation dialog
+ if (
+ window.confirm(
+ `Are you sure you want to delete this chat session?\n"${
+ button.previousElementSibling?.textContent || "Chat " + chatIdToDelete
+ }"`
+ )
+ ) {
+ console.log(`Deleting chat: ${chatIdToDelete}`);
+
+ // Perform deletion
+ const updatedIndex = deleteChatData(chatIdToDelete);
+
+ // If the deleted chat was the currently active one, load another chat
+ if (currentChatId === chatIdToDelete) {
+ currentChatId = null; // Reset current ID
+ conversationHistory = []; // Clear state
+ if (updatedIndex.length > 0) {
+ // Load the new top chat (most recent remaining)
+ loadChat(updatedIndex[0].id);
+ } else {
+ // No chats left, start a new one
+ handleNewChat();
+ }
+ } else {
+ // If a different chat was deleted, just re-render the list
+ renderHistoryList(updatedIndex);
+ // Re-apply active state in case IDs shifted (though they shouldn't)
+ setActiveHistoryItem(currentChatId);
+ }
+ }
+ }
+
+ // === NEW Function to Delete Chat Data ===
+ function deleteChatData(chatId) {
+ // Remove chat data
+ localStorage.removeItem(CHAT_PREFIX + chatId);
+
+ // Update index
+ let index = loadChatIndex();
+ index = index.filter((item) => item.id !== chatId);
+ saveChatIndex(index);
+
+ console.log(`Chat ${chatId} data and index entry removed.`);
+ return index; // Return the updated index
+ }
+
+ // --- Virtual Scrolling Placeholder ---
+ // NOTE: Virtual scrolling is complex. For now, we do direct rendering.
+ // If performance becomes an issue with very long chats/history,
+ // investigate libraries like 'simple-virtual-scroll' or 'virtual-scroller'.
+ // You would replace parts of `renderChatMessages` and `renderHistoryList`
+ // to work with the chosen library's API (providing data and item renderers).
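+  // Windowing sketch (assumption, not implemented): render only the slice
+  // messages[first..first + visible] on scroll, padding the container with
+  // spacer elements sized to the estimated height of the hidden items.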
+ console.warn("Virtual scrolling not implemented. Performance may degrade with very long chat histories.");
+});
diff --git a/docs/md_v2/ask_ai/index.html b/docs/md_v2/ask_ai/index.html
new file mode 100644
index 00000000..ccb7faa4
--- /dev/null
+++ b/docs/md_v2/ask_ai/index.html
@@ -0,0 +1,64 @@
+<!-- Markup lost in extraction. Recoverable content: page title "Crawl4AI Assistant";
+     a history sidebar, chat pane, and citations sidebar; and the welcome message
+     "Welcome to the Crawl4AI Assistant! How can I help you today?".
+     The script above expects #history-list, #new-chat-button, #chat-messages,
+     #chat-input, #send-button and #citations-list to exist on this page. -->
\ No newline at end of file
diff --git a/docs/md_v2/assets/copy_code.js b/docs/md_v2/assets/copy_code.js
new file mode 100644
index 00000000..20e6be4f
--- /dev/null
+++ b/docs/md_v2/assets/copy_code.js
@@ -0,0 +1,62 @@
+// ==== File: docs/assets/copy_code.js ====
+
+document.addEventListener('DOMContentLoaded', () => {
+ // Target specifically code blocks within the main content area
+ const codeBlocks = document.querySelectorAll('#terminal-mkdocs-main-content pre > code');
+
+ codeBlocks.forEach((codeElement) => {
+    const preElement = codeElement.parentElement; // The parent <pre> tag
+
+    // Ensure the <pre> tag can contain an absolutely positioned button
+ if (window.getComputedStyle(preElement).position === 'static') {
+ preElement.style.position = 'relative';
+ }
+
+ // Create the button
+ const copyButton = document.createElement('button');
+ copyButton.className = 'copy-code-button';
+ copyButton.type = 'button';
+ copyButton.setAttribute('aria-label', 'Copy code to clipboard');
+ copyButton.title = 'Copy code to clipboard';
+ copyButton.innerHTML = 'Copy'; // Or use an icon like an SVG or FontAwesome class
+
+    // Append the button to the <pre> element
+ preElement.appendChild(copyButton);
+
+ // Add click event listener
+ copyButton.addEventListener('click', () => {
+ copyCodeToClipboard(codeElement, copyButton);
+ });
+ });
+
+ async function copyCodeToClipboard(codeElement, button) {
+ // Use innerText to get the rendered text content, preserving line breaks
+ const textToCopy = codeElement.innerText;
+
+ try {
+ await navigator.clipboard.writeText(textToCopy);
+
+ // Visual feedback
+ button.innerHTML = 'Copied!';
+ button.classList.add('copied');
+ button.disabled = true; // Temporarily disable
+
+ // Revert button state after a short delay
+ setTimeout(() => {
+ button.innerHTML = 'Copy';
+ button.classList.remove('copied');
+ button.disabled = false;
+ }, 2000); // Show "Copied!" for 2 seconds
+
+ } catch (err) {
+ console.error('Failed to copy code: ', err);
+ // Optional: Provide error feedback on the button
+ button.innerHTML = 'Error';
+ setTimeout(() => {
+ button.innerHTML = 'Copy';
+ }, 2000);
+ }
+ }
+
+ console.log("Copy Code Button script loaded.");
+});
\ No newline at end of file
diff --git a/docs/md_v2/assets/floating_ask_ai_button.js b/docs/md_v2/assets/floating_ask_ai_button.js
new file mode 100644
index 00000000..177c2356
--- /dev/null
+++ b/docs/md_v2/assets/floating_ask_ai_button.js
@@ -0,0 +1,39 @@
+// ==== File: docs/assets/floating_ask_ai_button.js ====
+
+document.addEventListener('DOMContentLoaded', () => {
+ const askAiPagePath = '/core/ask-ai/'; // IMPORTANT: Adjust this path if needed!
+ const currentPath = window.location.pathname;
+
+  // Determine the base URL for constructing the link correctly,
+  // especially if deployed in a sub-directory.
+  // NOTE: currently unused; fabLink.href below uses the absolute askAiPagePath.
+  const baseUrl = window.location.origin + (currentPath.startsWith('/core/') ? '../..' : '');
+
+
+ // Check if the current page IS the Ask AI page
+ // Use includes() for flexibility (handles trailing slash or .html)
+ if (currentPath.includes(askAiPagePath.replace(/\/$/, ''))) { // Remove trailing slash for includes check
+ console.log("Floating Ask AI Button: Not adding button on the Ask AI page itself.");
+ return; // Don't add the button on the target page
+ }
+
+ // --- Create the button ---
+ const fabLink = document.createElement('a');
+ fabLink.className = 'floating-ask-ai-button';
+  fabLink.href = askAiPagePath; // Absolute path to the Ask AI page
+ fabLink.title = 'Ask Crawl4AI Assistant';
+ fabLink.setAttribute('aria-label', 'Ask Crawl4AI Assistant');
+
+ // Add content (using SVG icon for better visuals)
+ fabLink.innerHTML = `
+    <!-- inline SVG icon (original markup lost in extraction) -->
+    <span>Ask AI</span>
+ `;
+
+ // Append to body
+ document.body.appendChild(fabLink);
+
+ console.log("Floating Ask AI Button added.");
+});
\ No newline at end of file
diff --git a/docs/md_v2/assets/github_stats.js b/docs/md_v2/assets/github_stats.js
new file mode 100644
index 00000000..a48b3de1
--- /dev/null
+++ b/docs/md_v2/assets/github_stats.js
@@ -0,0 +1,119 @@
+// ==== File: assets/github_stats.js ====
+
+document.addEventListener('DOMContentLoaded', async () => {
+ // --- Configuration ---
+ const targetHeaderSelector = '.terminal .container:first-child'; // Selector for your header container
+ const insertBeforeSelector = '.terminal-nav'; // Selector for the element to insert the badge BEFORE (e.g., the main nav)
+ // Or set to null to append at the end of the header.
+
+ // --- Find elements ---
+ const headerContainer = document.querySelector(targetHeaderSelector);
+ if (!headerContainer) {
+ console.warn('GitHub Stats: Header container not found with selector:', targetHeaderSelector);
+ return;
+ }
+
+ const repoLinkElement = headerContainer.querySelector('a[href*="github.com/"]'); // Find the existing GitHub link
+ let repoUrl = 'https://github.com/unclecode/crawl4ai';
+ // if (repoLinkElement) {
+ // repoUrl = repoLinkElement.href;
+ // } else {
+ // // Fallback: Try finding from config (requires template injection - harder)
+ // // Or hardcode if necessary, but reading from the link is better.
+ // console.warn('GitHub Stats: GitHub repo link not found in header.');
+ // // Try to get repo_url from mkdocs config if available globally (less likely)
+ // // repoUrl = window.mkdocs_config?.repo_url; // Requires setting this variable
+ // // if (!repoUrl) return; // Exit if still no URL
+ // return; // Exit for now if link isn't found
+ // }
+
+
+ // --- Extract Repo Owner/Name ---
+ let owner = '';
+ let repo = '';
+ try {
+ const url = new URL(repoUrl);
+ const pathParts = url.pathname.split('/').filter(part => part.length > 0);
+ if (pathParts.length >= 2) {
+ owner = pathParts[0];
+ repo = pathParts[1];
+ }
+ } catch (e) {
+ console.error('GitHub Stats: Could not parse repository URL:', repoUrl, e);
+ return;
+ }
+
+ if (!owner || !repo) {
+ console.warn('GitHub Stats: Could not extract owner/repo from URL:', repoUrl);
+ return;
+ }
+
+ // --- Get Version (Attempt to extract from site title) ---
+ let version = '';
+ const siteTitleElement = headerContainer.querySelector('.terminal-title, .site-title'); // Adjust selector based on theme's title element
+ // Example title: "Crawl4AI Documentation (v0.5.x)"
+ if (siteTitleElement) {
+ const match = siteTitleElement.textContent.match(/\((v?[^)]+)\)/); // Look for text in parentheses starting with 'v' (optional)
+ if (match && match[1]) {
+ version = match[1].trim();
+ }
+ }
+ if (!version) {
+ console.info('GitHub Stats: Could not extract version from title. You might need to adjust the selector or regex.');
+ // You could fallback to config.extra.version if injected into JS
+ // version = window.mkdocs_config?.extra?.version || 'N/A';
+ }
+
+
+ // --- Fetch GitHub API Data ---
+ let stars = '...';
+ let forks = '...';
+ try {
+ const apiUrl = `https://api.github.com/repos/${owner}/${repo}`;
+ const response = await fetch(apiUrl);
+
+ if (response.ok) {
+ const data = await response.json();
+ // Format large numbers (optional)
+ stars = data.stargazers_count > 1000 ? `${(data.stargazers_count / 1000).toFixed(1)}k` : data.stargazers_count;
+ forks = data.forks_count > 1000 ? `${(data.forks_count / 1000).toFixed(1)}k` : data.forks_count;
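+      // e.g. 45321 -> "45.3k"; exactly 1000 stays "1000" because the check is strictly >.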
+ } else {
+ console.warn(`GitHub Stats: API request failed with status ${response.status}. Rate limit exceeded?`);
+ stars = 'N/A';
+ forks = 'N/A';
+ }
+ } catch (error) {
+ console.error('GitHub Stats: Error fetching repository data:', error);
+ stars = 'N/A';
+ forks = 'N/A';
+ }
+
+ // --- Create Badge HTML ---
+ const badgeContainer = document.createElement('div');
+ badgeContainer.className = 'github-stats-badge';
+
+  // Use innerHTML for simplicity. The <i> icon classes assume FontAwesome is loaded
+  // by the theme (layout.css styles them via .github-stats-badge .stat i); drop the
+  // <i> elements if it is not.
+  badgeContainer.innerHTML = `
+    <a href="${repoUrl}" target="_blank" rel="noopener">
+      <span class="repo-name">${owner}/${repo}</span>
+      ${version ? `<span class="stat">${version}</span>` : ''}
+      <span class="stat"><i class="fa fa-star"></i>${stars}</span>
+      <span class="stat"><i class="fa fa-code-fork"></i>${forks}</span>
+    </a>
+  `;
+
+ // --- Inject Badge into Header ---
+ const insertBeforeElement = insertBeforeSelector ? headerContainer.querySelector(insertBeforeSelector) : null;
+ if (insertBeforeElement) {
+ // headerContainer.insertBefore(badgeContainer, insertBeforeElement);
+    insertBeforeElement.appendChild(badgeContainer);
+ } else {
+ headerContainer.appendChild(badgeContainer);
+ }
+
+ console.info('GitHub Stats: Badge added to header.');
+
+});
\ No newline at end of file
diff --git a/docs/md_v2/assets/layout.css b/docs/md_v2/assets/layout.css
new file mode 100644
index 00000000..044c272b
--- /dev/null
+++ b/docs/md_v2/assets/layout.css
@@ -0,0 +1,576 @@
+/* ==== File: assets/layout.css (Non-Fluid Centered Layout) ==== */
+
+:root {
+ --header-height: 55px; /* Adjust if needed */
+ --sidebar-width: 280px; /* Adjust if needed */
+ --toc-width: 340px; /* As specified */
+ --content-max-width: 90em; /* Max width for the centered content */
+ --layout-transition-speed: 0.2s;
+ --global-space: 10px;
+}
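+/* For scale: if the grid inherits the 14px global font size, --content-max-width: 90em
+   resolves to 1260px, so on a 1600px-wide viewport the centered block leaves
+   (1600 - 1260) / 2 = 170px of gutter per side before the sidebar offset applies. */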
+
+/* --- Basic Setup --- */
+html {
+ scroll-behavior: smooth;
+ scroll-padding-top: calc(var(--header-height) + 15px);
+ box-sizing: border-box;
+}
+*, *:before, *:after {
+ box-sizing: inherit;
+}
+
+body {
+ padding-top: 0;
+ padding-bottom: 0;
+ background-color: var(--background-color);
+ color: var(--font-color);
+ /* Prevents horizontal scrollbars during transitions */
+ overflow-x: hidden;
+}
+
+/* --- Fixed Header --- */
+/* Full width, fixed header */
+.terminal .container:first-child { /* Assuming this targets the header container */
+ position: fixed;
+ top: 0;
+ left: 0;
+ right: 0;
+ height: var(--header-height);
+ background-color: var(--background-color);
+ z-index: 1000;
+ border-bottom: 1px solid var(--progress-bar-background);
+ max-width: none; /* Override any container max-width */
+ padding: 0 calc(var(--global-space) * 2);
+}
+
+/* --- Main Layout Container (Below Header) --- */
+/* This container just provides space for the fixed header */
+.container:has(.terminal-mkdocs-main-grid) {
+ margin: 0 auto;
+ padding: 0;
+ padding-top: var(--header-height); /* Space for fixed header */
+}
+
+/* --- Flex Container: Grid holding content and toc (CENTERED) --- */
+/* THIS is the main centered block */
+.terminal-mkdocs-main-grid {
+ display: flex;
+ align-items: flex-start;
+ /* Enforce max-width and center */
+ max-width: var(--content-max-width);
+ margin-left: auto;
+ margin-right: auto;
+ position: relative;
+ /* Apply side padding within the centered block */
+ padding-left: calc(var(--global-space) * 2);
+ padding-right: calc(var(--global-space) * 2);
+ /* Add margin-left to clear the fixed sidebar - ONLY ON DESKTOP */
+ margin-left: var(--sidebar-width);
+}
+
+/* --- 1. Fixed Left Sidebar (Viewport Relative) --- */
+#terminal-mkdocs-side-panel {
+ position: fixed;
+ top: var(--header-height);
+ left: max(0px, calc((90vw - var(--content-max-width)) / 2));
+ bottom: 0;
+ width: var(--sidebar-width);
+ background-color: var(--background-color);
+ border-right: 1px solid var(--progress-bar-background);
+ overflow-y: auto;
+ z-index: 900;
+ padding: 1em calc(var(--global-space) * 2);
+ padding-bottom: 2em;
+ transition: left var(--layout-transition-speed) ease-in-out;
+}
+
+/* --- 2. Main Content Area (Within Centered Grid) --- */
+#terminal-mkdocs-main-content {
+ flex-grow: 1;
+ flex-shrink: 1;
+ min-width: 0; /* Flexbox shrink fix */
+
+ /* No left/right margins needed here - handled by parent grid */
+ margin-left: 0;
+ margin-right: 0;
+
+ /* Internal Padding */
+ padding: 1.5em 2em;
+
+ position: relative;
+ z-index: 1;
+}
+
+/* --- 3. Right Table of Contents (Sticky, Within Centered Grid) --- */
+#toc-sidebar {
+ flex-basis: var(--toc-width);
+ flex-shrink: 0;
+ width: var(--toc-width);
+
+ position: sticky; /* Sticks within the centered grid */
+ top: var(--header-height);
+ align-self: stretch;
+ height: calc(100vh - var(--header-height));
+ overflow-y: auto;
+
+ padding: 1.5em 1em;
+ font-size: 0.85em;
+ border-left: 1px solid var(--progress-bar-background);
+ z-index: 800;
+ /* display: none; /* JS handles */
+}
+
+/* (ToC link styles remain the same) */
+#toc-sidebar h4 { margin-top: 0; margin-bottom: 1em; font-size: 1.1em; color: var(--secondary-color); padding-left: 0.8em; }
+#toc-sidebar ul { list-style: none; padding: 0; margin: 0; }
+#toc-sidebar ul li a { display: block; padding: 0.3em 0; color: var(--secondary-color); text-decoration: none; border-left: 3px solid transparent; padding-left: 0.8em; transition: all 0.1s ease-in-out; line-height: 1.4; word-break: break-word; }
+#toc-sidebar ul li.toc-level-3 a { padding-left: 1.8em; }
+#toc-sidebar ul li.toc-level-4 a { padding-left: 2.8em; }
+#toc-sidebar ul li a:hover { color: var(--font-color); background-color: rgba(255, 255, 255, 0.05); }
+#toc-sidebar ul li a.active { color: var(--primary-color); border-left-color: var(--primary-color); background-color: rgba(80, 255, 255, 0.08); }
+
+
+/* --- Footer Styling (Respects Centered Layout) --- */
+footer {
+ background-color: var(--code-bg-color);
+ color: var(--secondary-color);
+ position: relative;
+ z-index: 10;
+ margin-top: 2em;
+
+ /* Apply margin-left to clear the fixed sidebar */
+ margin-left: var(--sidebar-width);
+
+ /* Constrain width relative to the centered grid it follows */
+ max-width: calc(var(--content-max-width) - var(--sidebar-width));
+ margin-right: auto; /* Keep it left-aligned within the space next to sidebar */
+
+ /* Use padding consistent with the grid */
+ padding: 2em calc(var(--global-space) * 2);
+}
+
+/* Adjust footer grid if needed */
+.terminal-mkdocs-footer-grid {
+ display: grid;
+ grid-template-columns: 1fr auto;
+ gap: 1em;
+ align-items: center;
+}
+
+/* ==========================================================================
+ RESPONSIVENESS (Adapting the Non-Fluid Layout)
+ ========================================================================== */
+
+/* --- Medium screens: Hide ToC --- */
+@media screen and (max-width: 1200px) {
+ #toc-sidebar {
+ display: none;
+ }
+
+ .terminal-mkdocs-main-grid {
+ /* Grid adjusts automatically as ToC is removed */
+ /* Ensure grid padding remains */
+ padding-left: calc(var(--global-space) * 2);
+ padding-right: calc(var(--global-space) * 2);
+ }
+
+ #terminal-mkdocs-main-content {
+ /* Content area naturally expands */
+ }
+
+ footer {
+ /* Footer still respects the left sidebar and overall max width */
+ margin-left: var(--sidebar-width);
+ max-width: calc(var(--content-max-width) - var(--sidebar-width));
+ /* Padding remains consistent */
+ padding-left: calc(var(--global-space) * 2);
+ padding-right: calc(var(--global-space) * 2);
+ }
+}
+
+/* --- Mobile Menu Styles --- */
+.mobile-menu-toggle {
+ display: none; /* Hidden by default, shown in mobile */
+ background: none;
+ border: none;
+ padding: 10px;
+ cursor: pointer;
+ z-index: 1200;
+ margin-right: 10px;
+ position: absolute;
+ left: 10px;
+ top: 50%;
+ transform: translateY(-50%);
+ /* Make sure it doesn't get moved */
+ min-width: 30px;
+ min-height: 30px;
+}
+
+.hamburger-line {
+ display: block;
+ width: 22px;
+ height: 2px;
+ margin: 5px 0;
+ background-color: var(--font-color);
+ transition: transform 0.3s, opacity 0.3s;
+}
+
+/* Hamburger animation */
+.mobile-menu-toggle.is-active .hamburger-line:nth-child(1) {
+ transform: translateY(7px) rotate(45deg);
+}
+
+.mobile-menu-toggle.is-active .hamburger-line:nth-child(2) {
+ opacity: 0;
+}
+
+.mobile-menu-toggle.is-active .hamburger-line:nth-child(3) {
+ transform: translateY(-7px) rotate(-45deg);
+}
+
+.mobile-menu-close {
+ display: none; /* Hidden by default, shown in mobile */
+ position: absolute;
+ top: 10px;
+ right: 10px;
+ background: none;
+ border: none;
+ color: var(--font-color);
+ font-size: 24px;
+ cursor: pointer;
+ z-index: 1200;
+ padding: 5px 10px;
+}
+
+.mobile-menu-backdrop {
+ position: fixed;
+ top: 0;
+ left: 0;
+ right: 0;
+ bottom: 0;
+ background-color: rgba(0, 0, 0, 0.7);
+ z-index: 1050;
+}
+
+/* --- Small screens: Hide left sidebar, full width content & footer --- */
+@media screen and (max-width: 768px) {
+ /* Hide the terminal-menu from theme */
+ .terminal-menu {
+ display: none !important;
+ }
+
+ /* Add padding to site name to prevent hamburger overlap */
+ .terminal-mkdocs-site-name,
+ .terminal-logo a,
+ .terminal-nav .logo {
+ padding-left: 40px !important;
+ white-space: nowrap;
+ overflow: hidden;
+ text-overflow: ellipsis;
+ }
+
+ /* Show mobile menu toggle button */
+ .mobile-menu-toggle {
+ display: block;
+ }
+
+ /* Show mobile menu close button */
+ .mobile-menu-close {
+ display: block;
+ }
+
+ #terminal-mkdocs-side-panel {
+ left: -100%; /* Hide completely off-screen */
+ z-index: 1100;
+ box-shadow: 2px 0 10px rgba(0,0,0,0.3);
+ top: 0; /* Start from top edge */
+ height: 100%; /* Full height */
+ transition: left 0.3s ease-in-out;
+ padding-top: 50px; /* Space for close button */
+ overflow-y: auto;
+ width: 85%; /* Wider on mobile */
+ max-width: 320px; /* Maximum width */
+ background-color: var(--background-color); /* Ensure solid background */
+ }
+
+ #terminal-mkdocs-side-panel.sidebar-visible {
+ left: 0;
+ }
+
+ /* Make navigation links more touch-friendly */
+ #terminal-mkdocs-side-panel a {
+ padding: 6px 15px;
+ display: block;
+ /* No border as requested */
+ }
+
+ #terminal-mkdocs-side-panel ul {
+ padding-left: 0;
+ }
+
+ #terminal-mkdocs-side-panel ul ul a {
+ padding-left: 10px;
+ }
+
+ .terminal-mkdocs-main-grid {
+ /* Grid now takes full width (minus body padding) */
+ margin-left: 0 !important; /* Override sidebar margin with !important */
+ margin-right: 0; /* Override auto margin */
+ max-width: 100%; /* Allow full width */
+ padding-left: var(--global-space); /* Reduce padding */
+ padding-right: var(--global-space);
+ }
+
+ #terminal-mkdocs-main-content {
+ padding: 1.5em 1em; /* Adjust internal padding */
+ }
+
+ footer {
+ margin-left: 0; /* Full width footer */
+ max-width: 100%; /* Allow full width */
+ padding: 2em 1em; /* Adjust internal padding */
+ }
+
+ .terminal-mkdocs-footer-grid {
+ grid-template-columns: 1fr; /* Stack footer items */
+ text-align: center;
+ gap: 0.5em;
+ }
+}
+
+
+/* ==== GitHub Stats Badge Styling ==== */
+
+.github-stats-badge {
+ display: inline-block; /* Or flex if needed */
+ margin-left: 2em; /* Adjust spacing */
+ vertical-align: middle; /* Align with other header items */
+ font-size: 0.9em; /* Slightly smaller font */
+}
+
+.github-stats-badge a {
+ color: var(--secondary-color); /* Use secondary color */
+ text-decoration: none;
+ display: flex; /* Use flex for alignment */
+ align-items: center;
+ gap: 0.8em; /* Space between items */
+ padding: 0.2em 0.5em;
+ border: 1px solid var(--progress-bar-background); /* Subtle border */
+ border-radius: 4px;
+ transition: color 0.2s, background-color 0.2s;
+}
+
+.github-stats-badge a:hover {
+ color: var(--font-color); /* Brighter color on hover */
+ background-color: var(--progress-bar-background); /* Subtle background on hover */
+}
+
+.github-stats-badge .repo-name {
+ color: var(--font-color); /* Make repo name stand out slightly */
+ font-weight: 500; /* Optional bolder weight */
+}
+
+.github-stats-badge .stat {
+ /* Styles for individual stats (version, stars, forks) */
+ white-space: nowrap; /* Prevent wrapping */
+}
+
+.github-stats-badge .stat i {
+ /* Optional: Style for FontAwesome icons */
+ margin-right: 0.3em;
+ color: var(--secondary-dimmed-color); /* Dimmer color for icons */
+}
+
+
+/* Adjust positioning relative to search/nav if needed */
+/* Example: If search is floated right */
+/* .terminal-nav { float: left; } */
+/* .github-stats-badge { float: left; } */
+/* #mkdocs-search-query { float: right; } */
+
+/* --- Responsive adjustments --- */
+@media screen and (max-width: 900px) { /* Example breakpoint */
+ .github-stats-badge .repo-name {
+ display: none; /* Hide full repo name on smaller screens */
+ }
+ .github-stats-badge {
+ margin-left: 1em;
+ }
+ .github-stats-badge a {
+ gap: 0.5em;
+ }
+}
+@media screen and (max-width: 768px) {
+ /* Further hide or simplify on mobile if needed */
+ .github-stats-badge {
+ display: none; /* Example: Hide completely on smallest screens */
+ }
+}
+
+/* --- Ask AI Selection Button --- */
+.ask-ai-selection-button {
+ background-color: var(--primary-dimmed-color, #09b5a5);
+ color: var(--background-color, #070708);
+ border: none;
+ padding: 6px 10px;
+ font-size: 0.8em;
+ border-radius: 4px;
+ cursor: pointer;
+ box-shadow: 0 3px 8px rgba(0, 0, 0, 0.3);
+ transition: background-color 0.2s ease, transform 0.15s ease;
+ white-space: nowrap;
+ display: flex;
+ align-items: center;
+ font-weight: 500;
+ animation: askAiButtonAppear 0.2s ease-out;
+}
+
+@keyframes askAiButtonAppear {
+ from {
+ opacity: 0;
+ transform: scale(0.9);
+ }
+ to {
+ opacity: 1;
+ transform: scale(1);
+ }
+}
+
+.ask-ai-selection-button:hover {
+ background-color: var(--primary-color, #50ffff);
+ transform: scale(1.05);
+}
+
+/* Mobile styles for Ask AI button */
+@media screen and (max-width: 768px) {
+ .ask-ai-selection-button {
+ padding: 8px 12px; /* Larger touch target on mobile */
+ font-size: 0.9em; /* Slightly larger text */
+ }
+}
+
+/* ==== File: docs/assets/layout.css (Additions) ==== */
+
+/* ... (keep all existing layout CSS) ... */
+
+/* --- Copy Code Button Styling --- */
+
+/* Ensure the parent can contain the absolutely positioned button */
+#terminal-mkdocs-main-content pre {
+ position: relative; /* Needed for absolute positioning of child */
+ /* Add a little padding top/right to make space for the button */
+ padding-top: 2.5em;
+ padding-right: 1em; /* Ensure padding is sufficient */
+}
+
+.copy-code-button {
+ position: absolute;
+ top: 0.5em; /* Adjust spacing from top */
+ left: 0.5em; /* Adjust spacing from left */
+ z-index: 1; /* Sit on top of code */
+
+ background-color: var(--progress-bar-background, #444); /* Use a background */
+ color: var(--font-color, #eaeaea);
+ border: 1px solid var(--secondary-color, #727578);
+ padding: 3px 8px;
+ font-size: 0.8em;
+ font-family: var(--font-stack, monospace);
+ border-radius: 4px;
+ cursor: pointer;
+ opacity: 0; /* Hidden by default */
+ transition: opacity 0.2s ease-in-out, background-color 0.2s ease, color 0.2s ease;
+ white-space: nowrap;
+}
+
+/* Show button on hover of the container */
+#terminal-mkdocs-main-content pre:hover .copy-code-button {
+ opacity: 0.8; /* Show partially */
+}
+
+.copy-code-button:hover {
+ opacity: 1; /* Fully visible on button hover */
+ background-color: var(--secondary-color, #727578);
+}
+
+.copy-code-button:focus {
+ opacity: 1; /* Ensure visible when focused */
+ outline: 1px dashed var(--primary-color);
+}
+
+
+/* Style for "Copied!" state */
+.copy-code-button.copied {
+ background-color: var(--primary-dimmed-color, #09b5a5);
+ color: var(--background-color, #070708);
+ border-color: var(--primary-dimmed-color, #09b5a5);
+ opacity: 1; /* Ensure visible */
+}
+.copy-code-button.copied:hover {
+ background-color: var(--primary-dimmed-color, #09b5a5); /* Prevent hover change */
+}
+
+/* ==== File: docs/assets/layout.css (Additions) ==== */
+
+/* ... (keep all existing layout CSS) ... */
+
+/* --- Floating Ask AI Button --- */
+.floating-ask-ai-button {
+ position: fixed;
+ bottom: 25px;
+ right: 25px;
+ z-index: 1050; /* Below modals, above most content */
+
+ background-color: var(--primary-dimmed-color, #09b5a5);
+ color: var(--background-color, #070708);
+ border: none;
+ border-radius: 50%; /* Make it circular */
+ width: 60px; /* Adjust size */
+ height: 60px; /* Adjust size */
+ padding: 10px; /* Adjust padding */
+ box-shadow: 0 4px 10px rgba(0, 0, 0, 0.4);
+ cursor: pointer;
+ transition: background-color 0.2s ease, transform 0.2s ease;
+
+ display: flex;
+ flex-direction: column; /* Stack icon and text */
+ align-items: center;
+ justify-content: center;
+ text-decoration: none;
+ text-align: center;
+}
+
+.floating-ask-ai-button svg {
+ width: 24px; /* Control icon size */
+ height: 24px;
+}
+
+.floating-ask-ai-button span {
+ font-size: 0.7em;
+ margin-top: 2px; /* Space between icon and text */
+ display: block; /* Ensure it takes space */
+ line-height: 1;
+}
+
+
+.floating-ask-ai-button:hover {
+ background-color: var(--primary-color, #50ffff);
+ transform: scale(1.05); /* Slight grow effect */
+}
+
+.floating-ask-ai-button:focus {
+ outline: 2px solid var(--primary-color);
+ outline-offset: 2px;
+}
+
+/* Optional: Hide text on smaller screens if needed */
+@media screen and (max-width: 768px) {
+ .floating-ask-ai-button span {
+ /* display: none; */ /* Uncomment to hide text */
+ }
+ .floating-ask-ai-button {
+ width: 55px;
+ height: 55px;
+ bottom: 20px;
+ right: 20px;
+ }
+}
\ No newline at end of file
diff --git a/docs/md_v2/assets/mobile_menu.js b/docs/md_v2/assets/mobile_menu.js
new file mode 100644
index 00000000..e529839e
--- /dev/null
+++ b/docs/md_v2/assets/mobile_menu.js
@@ -0,0 +1,106 @@
+// mobile_menu.js - Hamburger menu for mobile view
+document.addEventListener('DOMContentLoaded', () => {
+ // Get references to key elements
+ const sidePanel = document.getElementById('terminal-mkdocs-side-panel');
+ const mainHeader = document.querySelector('.terminal .container:first-child');
+
+ if (!sidePanel || !mainHeader) {
+ console.warn('Mobile menu: Required elements not found');
+ return;
+ }
+
+ // Force hide sidebar on mobile
+ const checkMobile = () => {
+ if (window.innerWidth <= 768) {
+ // Force with !important-like priority
+ sidePanel.style.setProperty('left', '-100%', 'important');
+ // Also hide terminal-menu from the theme
+ const terminalMenu = document.querySelector('.terminal-menu');
+ if (terminalMenu) {
+ terminalMenu.style.setProperty('display', 'none', 'important');
+ }
+ } else {
+ sidePanel.style.removeProperty('left');
+ // Restore terminal-menu if it exists
+ const terminalMenu = document.querySelector('.terminal-menu');
+ if (terminalMenu) {
+ terminalMenu.style.removeProperty('display');
+ }
+ }
+ };
+
+ // Run on initial load
+ checkMobile();
+
+ // Also run on resize
+ window.addEventListener('resize', checkMobile);
+
+ // Create hamburger button
+ const hamburgerBtn = document.createElement('button');
+ hamburgerBtn.className = 'mobile-menu-toggle';
+ hamburgerBtn.setAttribute('aria-label', 'Toggle navigation menu');
+ hamburgerBtn.innerHTML = `
+    <span class="hamburger-line"></span>
+    <span class="hamburger-line"></span>
+    <span class="hamburger-line"></span>
+ `;
+
+ // Create backdrop overlay
+ const menuBackdrop = document.createElement('div');
+ menuBackdrop.className = 'mobile-menu-backdrop';
+ menuBackdrop.style.display = 'none';
+ document.body.appendChild(menuBackdrop);
+
+ // Make sure it's properly hidden on page load
+ if (window.innerWidth <= 768) {
+ menuBackdrop.style.display = 'none';
+ }
+
+ // Insert hamburger button into header
+ mainHeader.insertBefore(hamburgerBtn, mainHeader.firstChild);
+
+ // Add menu close button to side panel
+ const closeBtn = document.createElement('button');
+ closeBtn.className = 'mobile-menu-close';
+ closeBtn.setAttribute('aria-label', 'Close navigation menu');
+ closeBtn.innerHTML = `×`;
+ sidePanel.insertBefore(closeBtn, sidePanel.firstChild);
+
+ // Toggle function
+ function toggleMobileMenu() {
+ const isOpen = sidePanel.classList.toggle('sidebar-visible');
+
+ // Toggle backdrop
+ menuBackdrop.style.display = isOpen ? 'block' : 'none';
+
+ // Toggle aria-expanded
+ hamburgerBtn.setAttribute('aria-expanded', isOpen ? 'true' : 'false');
+
+ // Toggle hamburger animation class
+ hamburgerBtn.classList.toggle('is-active');
+
+ // Force sidebar visibility setting
+ if (isOpen) {
+ sidePanel.style.setProperty('left', '0', 'important');
+ } else {
+ sidePanel.style.setProperty('left', '-100%', 'important');
+ }
+
+ // Prevent body scrolling when menu is open
+ document.body.style.overflow = isOpen ? 'hidden' : '';
+ }
+
+ // Event listeners
+ hamburgerBtn.addEventListener('click', toggleMobileMenu);
+ closeBtn.addEventListener('click', toggleMobileMenu);
+ menuBackdrop.addEventListener('click', toggleMobileMenu);
+
+ // Close menu on window resize to desktop
+ window.addEventListener('resize', () => {
+ if (window.innerWidth > 768 && sidePanel.classList.contains('sidebar-visible')) {
+ toggleMobileMenu();
+ }
+ });
+
+ console.log('Mobile menu initialized');
+});
\ No newline at end of file
diff --git a/docs/md_v2/assets/selection_ask_ai.js b/docs/md_v2/assets/selection_ask_ai.js
new file mode 100644
index 00000000..e88ad34e
--- /dev/null
+++ b/docs/md_v2/assets/selection_ask_ai.js
@@ -0,0 +1,186 @@
+// ==== File: docs/assets/selection_ask_ai.js ====
+
+document.addEventListener('DOMContentLoaded', () => {
+ let askAiButton = null;
+ const askAiPageUrl = '/core/ask-ai/'; // Adjust if your Ask AI page path is different
+
+ function createAskAiButton() {
+ const button = document.createElement('button');
+ button.id = 'ask-ai-selection-btn';
+ button.className = 'ask-ai-selection-button';
+
+ // Add icon and text for better visibility
+ button.innerHTML = `
+      <!-- inline icon (original markup lost in extraction) -->
+      <span>Ask AI</span>
+ `;
+
+ // Common styles
+ button.style.display = 'none'; // Initially hidden
+ button.style.position = 'absolute';
+ button.style.zIndex = '1500'; // Ensure it's on top
+ button.style.boxShadow = '0 3px 8px rgba(0, 0, 0, 0.4)'; // More pronounced shadow
+ button.style.transition = 'transform 0.15s ease, background-color 0.2s ease'; // Smooth hover effect
+
+ // Add transform on hover
+ button.addEventListener('mouseover', () => {
+ button.style.transform = 'scale(1.05)';
+ });
+
+ button.addEventListener('mouseout', () => {
+ button.style.transform = 'scale(1)';
+ });
+
+ document.body.appendChild(button);
+ button.addEventListener('click', handleAskAiClick);
+ return button;
+ }
+
+ function getSafeSelectedText() {
+ const selection = window.getSelection();
+ if (!selection || selection.rangeCount === 0) {
+ return null;
+ }
+ // Avoid selecting text within the button itself if it was somehow selected
+ const container = selection.getRangeAt(0).commonAncestorContainer;
+ if (askAiButton && askAiButton.contains(container)) {
+ return null;
+ }
+
+ const text = selection.toString().trim();
+ return text.length > 0 ? text : null;
+ }
+
+  function positionButton() { // position derives from the current selection; no event argument needed
+ const selection = window.getSelection();
+ if (!selection || selection.rangeCount === 0 || selection.isCollapsed) {
+ hideButton();
+ return;
+ }
+
+ const range = selection.getRangeAt(0);
+ const rect = range.getBoundingClientRect();
+
+ // Get viewport dimensions
+ const viewportWidth = window.innerWidth;
+ const viewportHeight = window.innerHeight;
+
+ // Calculate position based on selection
+ const scrollX = window.scrollX;
+ const scrollY = window.scrollY;
+
+ // Default position (top-right of selection)
+ let buttonTop = rect.top + scrollY - askAiButton.offsetHeight - 5; // 5px above
+ let buttonLeft = rect.right + scrollX + 5; // 5px to the right
+
+ // Check if we're on mobile (which we define as less than 768px)
+ const isMobile = viewportWidth <= 768;
+
+ if (isMobile) {
+ // On mobile, position centered above selection to avoid edge issues
+ buttonTop = rect.top + scrollY - askAiButton.offsetHeight - 10; // 10px above on mobile
+ buttonLeft = rect.left + scrollX + (rect.width / 2) - (askAiButton.offsetWidth / 2); // Centered
+ } else {
+ // For desktop, ensure the button doesn't go off screen
+ // Check right edge
+ if (buttonLeft + askAiButton.offsetWidth > scrollX + viewportWidth) {
+ buttonLeft = scrollX + viewportWidth - askAiButton.offsetWidth - 10; // 10px from right edge
+ }
+ }
+
+ // Check top edge (for all devices)
+ if (buttonTop < scrollY) {
+ // If would go above viewport, position below selection instead
+ buttonTop = rect.bottom + scrollY + 5; // 5px below
+ }
+
+ askAiButton.style.top = `${buttonTop}px`;
+ askAiButton.style.left = `${buttonLeft}px`;
+ askAiButton.style.display = 'block'; // Show the button
+ }
+
+ function hideButton() {
+ if (askAiButton) {
+ askAiButton.style.display = 'none';
+ }
+ }
+
+ function handleAskAiClick(event) {
+ event.stopPropagation(); // Prevent mousedown from hiding button immediately
+ const selectedText = getSafeSelectedText();
+ if (selectedText) {
+ console.log("Selected Text:", selectedText);
+ // Base64 encode for URL safety (handles special chars, line breaks)
+ // Use encodeURIComponent first for proper Unicode handling before btoa
+ const encodedText = btoa(unescape(encodeURIComponent(selectedText)));
+ const targetUrl = `${askAiPageUrl}?qq=${encodedText}`;
+ console.log("Navigating to:", targetUrl);
+ window.location.href = targetUrl; // Navigate to Ask AI page
+ }
+ hideButton(); // Hide after click
+ }
+
+ // --- Event Listeners ---
+
+ // Function to handle selection events (both mouse and touch)
+ function handleSelectionEvent(event) {
+ // Slight delay to ensure selection is registered
+ setTimeout(() => {
+ const selectedText = getSafeSelectedText();
+ if (selectedText) {
+ if (!askAiButton) {
+ askAiButton = createAskAiButton();
+ }
+ // Don't position if the event was ON the button itself
+ if (event.target !== askAiButton) {
+          positionButton();
+ }
+ } else {
+ hideButton();
+ }
+ }, 10); // Small delay
+ }
+
+ // Mouse selection events (desktop)
+ document.addEventListener('mouseup', handleSelectionEvent);
+
+ // Touch selection events (mobile)
+ document.addEventListener('touchend', handleSelectionEvent);
+ document.addEventListener('selectionchange', () => {
+ // This helps with mobile selection which can happen without mouseup/touchend
+ setTimeout(() => {
+ const selectedText = getSafeSelectedText();
+ if (selectedText && askAiButton) {
+ positionButton();
+ }
+ }, 300); // Longer delay for selection change
+ });
+
+ // Hide button on various events
+ document.addEventListener('mousedown', (event) => {
+ // Hide if clicking anywhere EXCEPT the button itself
+ if (askAiButton && event.target !== askAiButton) {
+ hideButton();
+ }
+ });
+
+ document.addEventListener('touchstart', (event) => {
+ // Same for touch events, but only hide if not on the button
+ if (askAiButton && event.target !== askAiButton) {
+ hideButton();
+ }
+ });
+
+ document.addEventListener('scroll', hideButton, true); // Capture scroll events
+
+ // Also hide when pressing Escape key
+ document.addEventListener('keydown', (event) => {
+ if (event.key === 'Escape') {
+ hideButton();
+ }
+ });
+
+ console.log("Selection Ask AI script loaded.");
+});
\ No newline at end of file
diff --git a/docs/md_v2/assets/styles.css b/docs/md_v2/assets/styles.css
index 8ee8cbb1..46b90ab0 100644
--- a/docs/md_v2/assets/styles.css
+++ b/docs/md_v2/assets/styles.css
@@ -6,8 +6,8 @@
}
:root {
- --global-font-size: 16px;
- --global-code-font-size: 16px;
+ --global-font-size: 14px;
+ --global-code-font-size: 13px;
--global-line-height: 1.5em;
--global-space: 10px;
--font-stack: Menlo, Monaco, Lucida Console, Liberation Mono, DejaVu Sans Mono, Bitstream Vera Sans Mono,
@@ -50,8 +50,17 @@
--display-h1-decoration: none;
--display-h1-decoration: none;
+
+ --header-height: 65px; /* Adjust based on your actual header height */
+ --sidebar-width: 280px; /* Adjust based on your desired sidebar width */
+ --toc-width: 240px; /* Adjust based on your desired ToC width */
+ --layout-transition-speed: 0.2s; /* For potential future animations */
+
+  --page-width: 100em; /* Adjust based on your design */
}
+
+
/* body {
background-color: var(--background-color);
color: var(--font-color);
@@ -256,4 +265,9 @@ div.badges a {
}
div.badges a > img {
width: auto;
+}
+
+
+table td, table th {
+ border: 1px solid var(--code-bg-color) !important;
}
\ No newline at end of file
diff --git a/docs/md_v2/assets/toc.js b/docs/md_v2/assets/toc.js
new file mode 100644
index 00000000..8dad06b2
--- /dev/null
+++ b/docs/md_v2/assets/toc.js
@@ -0,0 +1,144 @@
+// ==== File: assets/toc.js ====
+
+document.addEventListener('DOMContentLoaded', () => {
+ const mainContent = document.getElementById('terminal-mkdocs-main-content');
+ const tocContainer = document.getElementById('toc-sidebar');
+ const mainGrid = document.querySelector('.terminal-mkdocs-main-grid'); // Get the flex container
+
+ if (!mainContent) {
+ console.warn("TOC Generator: Main content area '#terminal-mkdocs-main-content' not found.");
+ return;
+ }
+
+ // --- Create ToC container if it doesn't exist ---
+ let tocElement = tocContainer;
+ if (!tocElement) {
+ if (!mainGrid) {
+ console.warn("TOC Generator: Flex container '.terminal-mkdocs-main-grid' not found to append ToC.");
+ return;
+ }
+ tocElement = document.createElement('aside');
+ tocElement.id = 'toc-sidebar';
+ tocElement.style.display = 'none'; // Keep hidden initially
+ // Append it as the last child of the flex grid
+ mainGrid.appendChild(tocElement);
+ console.info("TOC Generator: Created '#toc-sidebar' element.");
+ }
+
+ // --- Find Headings (h2, h3, h4 are common for ToC) ---
+ const headings = mainContent.querySelectorAll('h2, h3, h4');
+ if (headings.length === 0) {
+ console.info("TOC Generator: No headings found on this page. ToC not generated.");
+ tocElement.style.display = 'none'; // Ensure it's hidden
+ return;
+ }
+
+ // --- Generate ToC List ---
+ const tocList = document.createElement('ul');
+ const observerTargets = []; // Store headings for IntersectionObserver
+
+ headings.forEach((heading, index) => {
+ // Ensure heading has an ID for linking
+ if (!heading.id) {
+ // Create a simple slug-like ID
+ heading.id = `toc-heading-${index}-${heading.textContent.toLowerCase().replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, '')}`;
+ }
+
+ const listItem = document.createElement('li');
+ const link = document.createElement('a');
+
+ link.href = `#${heading.id}`;
+ link.textContent = heading.textContent;
+
+ // Add class for styling based on heading level
+ const level = parseInt(heading.tagName.substring(1), 10); // Get 2, 3, or 4
+ listItem.classList.add(`toc-level-${level}`);
+
+ listItem.appendChild(link);
+ tocList.appendChild(listItem);
+ observerTargets.push(heading); // Add to observer list
+ });
+
+ // --- Populate and Show ToC ---
+ // Optional: Add a title
+ const tocTitle = document.createElement('h4');
+ tocTitle.textContent = 'On this page'; // Customize title if needed
+
+ tocElement.innerHTML = ''; // Clear previous content if any
+ tocElement.appendChild(tocTitle);
+ tocElement.appendChild(tocList);
+ tocElement.style.display = ''; // Show the ToC container
+
+ console.info(`TOC Generator: Generated ToC with ${headings.length} items.`);
+
+ // --- Scroll Spy using Intersection Observer ---
+ const tocLinks = tocElement.querySelectorAll('a');
+ let activeLink = null; // Keep track of the current active link
+
+ const observerOptions = {
+ // Observe changes relative to the viewport, offset by the header height
+ // Negative top margin pushes the intersection trigger point down
+ // Negative bottom margin ensures elements low on the screen can trigger before they exit
+ rootMargin: `-${getComputedStyle(document.documentElement).getPropertyValue('--header-height').trim()} 0px -60% 0px`,
+ threshold: 0 // Trigger as soon as any part enters/exits the boundary
+ };
+
+ const observerCallback = (entries) => {
+ let topmostVisibleHeading = null;
+
+ entries.forEach(entry => {
+ const link = tocElement.querySelector(`a[href="#${entry.target.id}"]`);
+ if (!link) return;
+
+ // Check if the heading is intersecting (partially or fully visible within rootMargin)
+ if (entry.isIntersecting) {
+ // Among visible headings, find the one closest to the top edge (within the rootMargin)
+ if (!topmostVisibleHeading || entry.boundingClientRect.top < topmostVisibleHeading.boundingClientRect.top) {
+ topmostVisibleHeading = entry.target;
+ }
+ }
+ });
+
+ // If we found a topmost visible heading, activate its link
+ if (topmostVisibleHeading) {
+ const newActiveLink = tocElement.querySelector(`a[href="#${topmostVisibleHeading.id}"]`);
+ if (newActiveLink && newActiveLink !== activeLink) {
+ // Remove active class from previous link
+ if (activeLink) {
+ activeLink.classList.remove('active');
+ activeLink.parentElement.classList.remove('active-parent'); // Optional parent styling
+ }
+ // Add active class to the new link
+ newActiveLink.classList.add('active');
+ newActiveLink.parentElement.classList.add('active-parent'); // Optional parent styling
+ activeLink = newActiveLink;
+
+ // Optional: Scroll the ToC sidebar to keep the active link visible
+ // newActiveLink.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
+ }
+ }
+ // If no headings are intersecting (scrolled past the last one?), maybe deactivate all
+ // Or keep the last one active - depends on desired behavior. Current logic keeps last active.
+ };
+
+ const observer = new IntersectionObserver(observerCallback, observerOptions);
+
+ // Observe all target headings
+ observerTargets.forEach(heading => observer.observe(heading));
+
+ // Initial check in case a heading is already in view on load
+ // (Requires slight delay for accurate layout calculation)
+ setTimeout(() => {
+ observerCallback(observer.takeRecords()); // Process initial state
+ }, 100);
+
+    // Move the footer (and the <hr> before it) to the end of the main content.
+    // Guard against pages without a footer so previousElementSibling doesn't throw.
+    const footer = document.querySelector('footer');
+    if (footer) {
+        const hr = footer.previousElementSibling;
+        if (hr && hr.tagName === 'HR') {
+            mainContent.appendChild(hr);
+        }
+        mainContent.appendChild(footer);
+        console.info("TOC Generator: Footer moved to the end of the main content.");
+    }
+
+});
\ No newline at end of file
diff --git a/docs/md_v2/blog/index.md b/docs/md_v2/blog/index.md
index 1eed43d9..55532fce 100644
--- a/docs/md_v2/blog/index.md
+++ b/docs/md_v2/blog/index.md
@@ -4,6 +4,32 @@ Welcome to the Crawl4AI blog! Here you'll find detailed release notes, technical
## Latest Release
+
+### [Crawl4AI v0.6.0 – World-Aware Crawling, Pre-Warmed Browsers, and the MCP API](releases/0.6.0.md)
+*April 23, 2025*
+
+Crawl4AI v0.6.0 is our most powerful release yet. This update brings major architectural upgrades including world-aware crawling (set geolocation, locale, and timezone), real-time traffic capture, and a memory-efficient crawler pool with pre-warmed pages.
+
+The Docker server now exposes a full-featured MCP socket + SSE interface, supports streaming, and comes with a new Playground UI. Plus, table extraction is now native, and the new stress-test framework supports crawling 1,000+ URLs.
+
+Other key changes:
+
+* Native support for `result.media["tables"]` to export DataFrames
+* Full network + console logs and MHTML snapshot per crawl
+* Browser pooling and pre-warming for faster cold starts
+* New streaming endpoints via MCP API and Playground
+* Robots.txt support, proxy rotation, and improved session handling
+* Deprecated old markdown names, legacy modules cleaned up
+* Massive repo cleanup: ~36K insertions, ~5K deletions across 121 files
+
+[Read full release notes →](releases/0.6.0.md)
+
+---
+
### [Crawl4AI v0.5.0: Deep Crawling, Scalability, and a New CLI!](releases/0.5.0.md)
diff --git a/docs/md_v2/blog/releases/0.5.0.md b/docs/md_v2/blog/releases/0.5.0.md
index 24b0feda..30269a29 100644
--- a/docs/md_v2/blog/releases/0.5.0.md
+++ b/docs/md_v2/blog/releases/0.5.0.md
@@ -251,7 +251,7 @@ from crawl4ai import (
RoundRobinProxyStrategy,
)
import asyncio
-from crawl4ai.proxy_strategy import ProxyConfig
+from crawl4ai import ProxyConfig
async def main():
# Load proxies and create rotation strategy
proxies = ProxyConfig.from_env()
diff --git a/docs/md_v2/blog/releases/0.6.0.md b/docs/md_v2/blog/releases/0.6.0.md
new file mode 100644
index 00000000..a3a7c216
--- /dev/null
+++ b/docs/md_v2/blog/releases/0.6.0.md
@@ -0,0 +1,143 @@
+# Crawl4AI v0.6.0 Release Notes
+
+We're excited to announce the release of **Crawl4AI v0.6.0**, our biggest and most feature-rich update yet. This version introduces major architectural upgrades, brand-new capabilities for geo-aware crawling, high-efficiency scraping, and real-time streaming support for scalable deployments.
+
+---
+
+## Highlights
+
+### 1. **World-Aware Crawlers**
+Crawl as if you’re anywhere in the world. With v0.6.0, each crawl can simulate:
+- Specific GPS coordinates
+- Browser locale
+- Timezone
+
+Example:
+```python
+CrawlerRunConfig(
+ url="https://browserleaks.com/geo",
+ locale="en-US",
+ timezone_id="America/Los_Angeles",
+ geolocation=GeolocationConfig(
+ latitude=34.0522,
+ longitude=-118.2437,
+ accuracy=10.0
+ )
+)
+```
+Great for accessing region-specific content or testing global behavior.
+
+---
+
+### 2. **Native Table Extraction**
+Extract HTML tables directly into usable formats like Pandas DataFrames or CSV with zero parsing hassle. All table data is available under `result.media["tables"]`.
+
+Example:
+```python
+raw_df = pd.DataFrame(
+ result.media["tables"][0]["rows"],
+ columns=result.media["tables"][0]["headers"]
+)
+```
+This makes it ideal for scraping financial data, pricing pages, or anything tabular.
+
+---
+
+### 3. **Browser Pooling & Pre-Warming**
+We've overhauled browser management. Now, multiple browser instances can be pooled and pages pre-warmed for ultra-fast launches:
+- Reduces cold-start latency
+- Lowers memory spikes
+- Enhances parallel crawling stability
+
+This powers the new **Docker Playground** experience and streamlines heavy-load crawling.
+
+---
+
+### 4. **Traffic & Snapshot Capture**
+Need full visibility? You can now capture:
+- Full network traffic logs
+- Console output
+- MHTML page snapshots for post-crawl audits and debugging
+
+No more guesswork on what happened during your crawl.
+
+---
+
+### 5. **MCP API and Streaming Support**
+We’re exposing **MCP socket and SSE endpoints**, allowing:
+- Live streaming of crawl results
+- Real-time integration with agents or frontends
+- A new Playground UI for interactive crawling
+
+This is a major step towards making Crawl4AI real-time ready.
+
+---
+
+### 6. **Stress-Test Framework**
+Want to test performance under heavy load? v0.6.0 includes a new memory stress-test suite that supports 1,000+ URL workloads. Ideal for:
+- Load testing
+- Performance benchmarking
+- Validating memory efficiency
+
+---
+
+## Core Improvements
+- Robots.txt compliance
+- Proxy rotation support
+- Improved URL normalization and session reuse
+- Shared data across crawler hooks
+- New page routing logic
+
+---
+
+## Breaking Changes & Deprecations
+- Legacy `crawl4ai/browser/*` modules are removed. Update imports accordingly.
+- `AsyncPlaywrightCrawlerStrategy.get_page` now uses a new function signature.
+- Deprecated markdown generator aliases now point to `DefaultMarkdownGenerator` and emit a deprecation warning.
+
+---
+
+## Miscellaneous Updates
+- FastAPI validators replaced custom validation logic
+- Docker build now based on a Chromium layer
+- Repo-wide cleanup: ~36,000 insertions, ~5,000 deletions
+
+---
+
+## New Examples Included
+- Geo-location crawling
+- Network + console log capture
+- Docker MCP API usage
+- Markdown selector usage
+- Crypto project data extraction
+
+---
+
+## Watch the Release Video
+Want a visual walkthrough of all these updates? Watch the video:
+🔗 https://youtu.be/9x7nVcjOZks
+
+If you're new to Crawl4AI, start here:
+🔗 https://www.youtube.com/watch?v=xo3qK6Hg9AA&t=15s
+
+---
+
+## Join the Community
+We’ve just opened up our **Discord** for the public. Join us to:
+- Ask questions
+- Share your projects
+- Get help or contribute
+
+💬 https://discord.gg/wpYFACrHR4
+
+---
+
+## Install or Upgrade
+```bash
+pip install -U crawl4ai
+```
+
+---
+
+Live long and import crawl4ai. 🖖
+
diff --git a/docs/md_v2/core/ask-ai.md b/docs/md_v2/core/ask-ai.md
new file mode 100644
index 00000000..9122bd29
--- /dev/null
+++ b/docs/md_v2/core/ask-ai.md
@@ -0,0 +1,74 @@
+
+
+
+
+
+
+
diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md
index 5f66b3ea..c7c8c166 100644
--- a/docs/md_v2/core/browser-crawler-config.md
+++ b/docs/md_v2/core/browser-crawler-config.md
@@ -1,9 +1,9 @@
# Browser, Crawler & LLM Configuration (Quick Overview)
-Crawl4AI’s flexibility stems from two key classes:
+Crawl4AI's flexibility stems from two key classes:
-1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent).
-2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.).
+1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent).
+2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.).
3. **`LLMConfig`** - Dictates **how** LLM providers are configured. (model, api token, base url, temperature etc.)
In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md).
@@ -36,18 +36,16 @@ class BrowserConfig:
### Key Fields to Note
-
-
-1. **`browser_type`**
+1. **`browser_type`**
- Options: `"chromium"`, `"firefox"`, or `"webkit"`.
- Defaults to `"chromium"`.
- If you need a different engine, specify it here.
-2. **`headless`**
+2. **`headless`**
- `True`: Runs the browser in headless mode (invisible browser).
- `False`: Runs the browser in visible mode, which helps with debugging.
-3. **`proxy_config`**
+3. **`proxy_config`**
- A dictionary with fields like:
```json
{
@@ -58,31 +56,31 @@ class BrowserConfig:
```
- Leave as `None` if a proxy is not required.
-4. **`viewport_width` & `viewport_height`**:
+4. **`viewport_width` & `viewport_height`**:
- The initial window size.
- Some sites behave differently with smaller or bigger viewports.
-5. **`verbose`**:
+5. **`verbose`**:
- If `True`, prints extra logs.
- Handy for debugging.
-6. **`use_persistent_context`**:
+6. **`use_persistent_context`**:
- If `True`, uses a **persistent** browser profile, storing cookies/local storage across runs.
- Typically also set `user_data_dir` to point to a folder.
-7. **`cookies`** & **`headers`**:
+7. **`cookies`** & **`headers`**:
- If you want to start with specific cookies or add universal HTTP headers, set them here.
- E.g. `cookies=[{"name": "session", "value": "abc123", "domain": "example.com"}]`.
-8. **`user_agent`**:
+8. **`user_agent`**:
- Custom User-Agent string. If `None`, a default is used.
- You can also set `user_agent_mode="random"` for randomization (if you want to fight bot detection).
-9. **`text_mode`** & **`light_mode`**:
+9. **`text_mode`** & **`light_mode`**:
- `text_mode=True` disables images, possibly speeding up text-only crawls.
- `light_mode=True` turns off certain background features for performance.
-10. **`extra_args`**:
+10. **`extra_args`**:
- Additional flags for the underlying browser.
- E.g. `["--disable-extensions"]`.
@@ -136,6 +134,12 @@ class CrawlerRunConfig:
wait_for=None,
screenshot=False,
pdf=False,
+ capture_mhtml=False,
+ # Location and Identity Parameters
+ locale=None, # e.g. "en-US", "fr-FR"
+ timezone_id=None, # e.g. "America/New_York"
+ geolocation=None, # GeolocationConfig object
+ # Resource Management
enable_rate_limiting=False,
rate_limit_config=None,
memory_threshold_percent=70.0,
@@ -151,58 +155,65 @@ class CrawlerRunConfig:
### Key Fields to Note
-1. **`word_count_threshold`**:
+1. **`word_count_threshold`**:
- The minimum word count before a block is considered.
- If your site has lots of short paragraphs or items, you can lower it.
-2. **`extraction_strategy`**:
+2. **`extraction_strategy`**:
- Where you plug in JSON-based extraction (CSS, LLM, etc.).
- If `None`, no structured extraction is done (only raw/cleaned HTML + markdown).
-3. **`markdown_generator`**:
+3. **`markdown_generator`**:
- E.g., `DefaultMarkdownGenerator(...)`, controlling how HTML→Markdown conversion is done.
- If `None`, a default approach is used.
-4. **`cache_mode`**:
+4. **`cache_mode`**:
- Controls caching behavior (`ENABLED`, `BYPASS`, `DISABLED`, etc.).
- If `None`, defaults to some level of caching or you can specify `CacheMode.ENABLED`.
-5. **`js_code`**:
+5. **`js_code`**:
- A string or list of JS strings to execute.
- - Great for “Load More” buttons or user interactions.
+ - Great for "Load More" buttons or user interactions.
-6. **`wait_for`**:
+6. **`wait_for`**:
- A CSS or JS expression to wait for before extracting content.
- Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`.
-7. **`screenshot`** & **`pdf`**:
- - If `True`, captures a screenshot or PDF after the page is fully loaded.
- - The results go to `result.screenshot` (base64) or `result.pdf` (bytes).
+7. **`screenshot`**, **`pdf`**, & **`capture_mhtml`**:
+ - If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded.
+ - The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string).
-8. **`verbose`**:
+8. **Location Parameters**:
+ - **`locale`**: Browser's locale (e.g., `"en-US"`, `"fr-FR"`) for language preferences
+ - **`timezone_id`**: Browser's timezone (e.g., `"America/New_York"`, `"Europe/Paris"`)
+ - **`geolocation`**: GPS coordinates via `GeolocationConfig(latitude=48.8566, longitude=2.3522)`
+   - See [Identity Based Crawling](../advanced/identity-based-crawling.md#7-locale-timezone-and-geolocation-control); a combined sketch follows this list
+
+9. **`verbose`**:
- Logs additional runtime details.
- - Overlaps with the browser’s verbosity if also set to `True` in `BrowserConfig`.
+ - Overlaps with the browser's verbosity if also set to `True` in `BrowserConfig`.
-9. **`enable_rate_limiting`**:
+10. **`enable_rate_limiting`**:
- If `True`, enables rate limiting for batch processing.
- Requires `rate_limit_config` to be set.
-10. **`memory_threshold_percent`**:
+11. **`memory_threshold_percent`**:
- The memory threshold (as a percentage) to monitor.
- If exceeded, the crawler will pause or slow down.
-11. **`check_interval`**:
+12. **`check_interval`**:
- The interval (in seconds) to check system resources.
- Affects how often memory and CPU usage are monitored.
-12. **`max_session_permit`**:
+13. **`max_session_permit`**:
- The maximum number of concurrent crawl sessions.
- Helps prevent overwhelming the system.
-13. **`display_mode`**:
+14. **`display_mode`**:
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
- Affects how much information is printed during the crawl.
+
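+Putting fields 7 and 8 together, here is a short sketch (one assumption: `GeolocationConfig` is importable from the top-level `crawl4ai` package, as the v0.6.0 release notes suggest):
+
+```python
+from crawl4ai import CrawlerRunConfig, GeolocationConfig  # GeolocationConfig import path assumed
+
+config = CrawlerRunConfig(
+    capture_mhtml=True,                     # MHTML snapshot -> result.mhtml
+    locale="fr-FR",                         # browser language preference
+    timezone_id="Europe/Paris",             # reported timezone
+    geolocation=GeolocationConfig(          # spoofed GPS coordinates
+        latitude=48.8566,
+        longitude=2.3522,
+    ),
+)
+```
+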
### Helper Methods
The `clone()` method is particularly useful for creating variations of your crawler configuration:
@@ -236,23 +247,20 @@ The `clone()` method:
---
-
-
-
## 3. LLMConfig Essentials
### Key fields to note
-1. **`provider`**:
+1. **`provider`**:
- Which LLM provider to use.
- Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`
*(default: `"openai/gpt-4o-mini"`)*
-2. **`api_token`**:
+2. **`api_token`**:
- Optional. If not provided explicitly, `api_token` is read from environment variables based on the provider. For example, if a Gemini model is passed as the provider, `"GEMINI_API_KEY"` is read from the environment.
- API token of the LLM provider, e.g. `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"`
- Environment variable reference, used with the prefix `env:`, e.g. `api_token = "env: GROQ_API_KEY"`
-3. **`base_url`**:
+3. **`base_url`**:
- If your provider has a custom endpoint
```python
@@ -261,7 +269,7 @@ llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENA
## 4. Putting It All Together
-In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` depending on each call’s needs:
+In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` depending on each call's needs:
```python
import asyncio
diff --git a/docs/md_v2/core/crawler-result.md b/docs/md_v2/core/crawler-result.md
index 961b38f6..d7648ecb 100644
--- a/docs/md_v2/core/crawler-result.md
+++ b/docs/md_v2/core/crawler-result.md
@@ -26,6 +26,7 @@ class CrawlResult(BaseModel):
downloaded_files: Optional[List[str]] = None
screenshot: Optional[str] = None
pdf : Optional[bytes] = None
+ mhtml: Optional[str] = None
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
extracted_content: Optional[str] = None
metadata: Optional[dict] = None
@@ -51,6 +52,7 @@ class CrawlResult(BaseModel):
| **downloaded_files (`Optional[List[str]]`)** | If `accept_downloads=True` in `BrowserConfig`, this lists the filepaths of saved downloads. |
| **screenshot (`Optional[str]`)** | Screenshot of the page (base64-encoded) if `screenshot=True`. |
| **pdf (`Optional[bytes]`)** | PDF of the page if `pdf=True`. |
+| **mhtml (`Optional[str]`)** | MHTML snapshot of the page if `capture_mhtml=True`. Contains the full page with all resources. |
| **markdown (`Optional[str or MarkdownGenerationResult]`)** | It holds a `MarkdownGenerationResult`. Over time, this will be consolidated into `markdown`. The generator can provide raw markdown, citations, references, and optionally `fit_markdown`. |
| **extracted_content (`Optional[str]`)** | The output of a structured extraction (CSS/LLM-based) stored as JSON string or other text. |
| **metadata (`Optional[dict]`)** | Additional info about the crawl or extracted data. |
@@ -190,18 +192,27 @@ for img in images:
print("Image URL:", img["src"], "Alt:", img.get("alt"))
```
-### 5.3 `screenshot` and `pdf`
+### 5.3 `screenshot`, `pdf`, and `mhtml`
-If you set `screenshot=True` or `pdf=True` in **`CrawlerRunConfig`**, then:
+If you set `screenshot=True`, `pdf=True`, or `capture_mhtml=True` in **`CrawlerRunConfig`**, then:
-- `result.screenshot` contains a base64-encoded PNG string.
+- `result.screenshot` contains a base64-encoded PNG string.
- `result.pdf` contains raw PDF bytes (you can write them to a file).
+- `result.mhtml` contains the MHTML snapshot of the page as a string (you can write it to a .mhtml file).
```python
+# Save the PDF
with open("page.pdf", "wb") as f:
f.write(result.pdf)
+
+# Save the MHTML
+if result.mhtml:
+ with open("page.mhtml", "w", encoding="utf-8") as f:
+ f.write(result.mhtml)
```
+The MHTML (MIME HTML) format is particularly useful as it captures the entire web page including all of its resources (CSS, images, scripts, etc.) in a single file, making it perfect for archiving or offline viewing.
+
### 5.4 `ssl_certificate`
If `fetch_ssl_certificate=True`, `result.ssl_certificate` holds details about the site’s SSL cert, such as issuer, validity dates, etc.
diff --git a/docs/md_v2/core/docker-deployment.md b/docs/md_v2/core/docker-deployment.md
index a3d0def1..7e239d43 100644
--- a/docs/md_v2/core/docker-deployment.md
+++ b/docs/md_v2/core/docker-deployment.md
@@ -1,702 +1,821 @@
-# Docker Deployment
+# Crawl4AI Docker Guide 🐳
-Crawl4AI provides official Docker images for easy deployment and scalability. This guide covers installation, configuration, and usage of Crawl4AI in Docker environments.
+## Table of Contents
+- [Prerequisites](#prerequisites)
+- [Installation](#installation)
+ - [Option 1: Using Pre-built Docker Hub Images (Recommended)](#option-1-using-pre-built-docker-hub-images-recommended)
+ - [Option 2: Using Docker Compose](#option-2-using-docker-compose)
+ - [Option 3: Manual Local Build & Run](#option-3-manual-local-build--run)
+- [Dockerfile Parameters](#dockerfile-parameters)
+- [Using the API](#using-the-api)
+ - [Playground Interface](#playground-interface)
+ - [Python SDK](#python-sdk)
+ - [Understanding Request Schema](#understanding-request-schema)
+ - [REST API Examples](#rest-api-examples)
+- [Additional API Endpoints](#additional-api-endpoints)
+ - [HTML Extraction Endpoint](#html-extraction-endpoint)
+ - [Screenshot Endpoint](#screenshot-endpoint)
+ - [PDF Export Endpoint](#pdf-export-endpoint)
+ - [JavaScript Execution Endpoint](#javascript-execution-endpoint)
+ - [Library Context Endpoint](#library-context-endpoint)
+- [MCP (Model Context Protocol) Support](#mcp-model-context-protocol-support)
+ - [What is MCP?](#what-is-mcp)
+ - [Connecting via MCP](#connecting-via-mcp)
+ - [Using with Claude Code](#using-with-claude-code)
+ - [Available MCP Tools](#available-mcp-tools)
+ - [Testing MCP Connections](#testing-mcp-connections)
+ - [MCP Schemas](#mcp-schemas)
+- [Metrics & Monitoring](#metrics--monitoring)
+- [Deployment Scenarios](#deployment-scenarios)
+- [Complete Examples](#complete-examples)
+- [Server Configuration](#server-configuration)
+ - [Understanding config.yml](#understanding-configyml)
+ - [JWT Authentication](#jwt-authentication)
+ - [Configuration Tips and Best Practices](#configuration-tips-and-best-practices)
+ - [Customizing Your Configuration](#customizing-your-configuration)
+ - [Configuration Recommendations](#configuration-recommendations)
+- [Getting Help](#getting-help)
+- [Summary](#summary)
-## Quick Start 🚀
+## Prerequisites
-Pull and run the basic version:
+Before we dive in, make sure you have:
+- Docker installed and running (version 20.10.0 or higher), including `docker compose` (usually bundled with Docker Desktop).
+- `git` for cloning the repository.
+- At least 4GB of RAM available for the container (more recommended for heavy use).
+- Python 3.10+ (if using the Python SDK).
+- Node.js 16+ (if using the Node.js examples).
+
+> 💡 **Pro tip**: Run `docker info` to check your Docker installation and available resources.
+
+## Installation
+
+We offer several ways to get the Crawl4AI server running. The quickest way is to use our pre-built Docker Hub images.
+
+### Option 1: Using Pre-built Docker Hub Images (Recommended)
+
+Pull and run images directly from Docker Hub without building locally.
+
+#### 1. Pull the Image
+
+Our latest release candidate is `0.6.0-r2`. Images are built with multi-arch manifests, so Docker automatically pulls the correct version for your system.
```bash
-# Basic run without security
-docker pull unclecode/crawl4ai:basic
-docker run -p 11235:11235 unclecode/crawl4ai:basic
+# Pull the release candidate (recommended for latest features)
+docker pull unclecode/crawl4ai:0.6.0-r2
-# Run with API security enabled
-docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:basic
+# Or pull the latest stable version
+docker pull unclecode/crawl4ai:latest
```
-## Running with Docker Compose 🐳
+#### 2. Setup Environment (API Keys)
-### Use Docker Compose (From Local Dockerfile or Docker Hub)
-
-Crawl4AI provides flexibility to use Docker Compose for managing your containerized services. You can either build the image locally from the provided `Dockerfile` or use the pre-built image from Docker Hub.
-
-### **Option 1: Using Docker Compose to Build Locally**
-If you want to build the image locally, use the provided `docker-compose.local.yml` file.
+If you plan to use LLMs, create a `.llm.env` file in your working directory:
```bash
-docker-compose -f docker-compose.local.yml up -d
+# Create a .llm.env file with your API keys
+cat > .llm.env << EOL
+# OpenAI
+OPENAI_API_KEY=sk-your-key
+
+# Anthropic
+ANTHROPIC_API_KEY=your-anthropic-key
+
+# Other providers as needed
+# DEEPSEEK_API_KEY=your-deepseek-key
+# GROQ_API_KEY=your-groq-key
+# TOGETHER_API_KEY=your-together-key
+# MISTRAL_API_KEY=your-mistral-key
+# GEMINI_API_TOKEN=your-gemini-token
+EOL
```
+> 🔑 **Note**: Keep your API keys secure! Never commit `.llm.env` to version control.
-This will:
-1. Build the Docker image from the provided `Dockerfile`.
-2. Start the container and expose it on `http://localhost:11235`.
+#### 3. Run the Container
----
+* **Basic run:**
+ ```bash
+ docker run -d \
+ -p 11235:11235 \
+ --name crawl4ai \
+ --shm-size=1g \
+ unclecode/crawl4ai:latest
+ ```
-### **Option 2: Using Docker Compose with Pre-Built Image from Hub**
-If you prefer using the pre-built image on Docker Hub, use the `docker-compose.hub.yml` file.
+* **With LLM support:**
+ ```bash
+ # Make sure .llm.env is in the current directory
+ docker run -d \
+ -p 11235:11235 \
+ --name crawl4ai \
+ --env-file .llm.env \
+ --shm-size=1g \
+ unclecode/crawl4ai:latest
+ ```
+
+> The server will be available at `http://localhost:11235`. Visit `/playground` to access the interactive testing interface.
+
+#### 4. Stopping the Container
```bash
-docker-compose -f docker-compose.hub.yml up -d
+docker stop crawl4ai && docker rm crawl4ai
```
-This will:
-1. Pull the pre-built image `unclecode/crawl4ai:basic` (or `all`, depending on your configuration).
-2. Start the container and expose it on `http://localhost:11235`.
+#### Docker Hub Versioning Explained
----
+* **Image Name:** `unclecode/crawl4ai`
+* **Tag Format:** `LIBRARY_VERSION[-SUFFIX]` (e.g., `0.6.0-r2`)
+ * `LIBRARY_VERSION`: The semantic version of the core `crawl4ai` Python library
+  * `SUFFIX`: Optional suffix for release candidates and revisions (e.g., `r1`, `r2`)
+* **`latest` Tag:** Points to the most recent stable version
+* **Multi-Architecture Support:** All images support both `linux/amd64` and `linux/arm64` architectures through a single tag
-### **Stopping the Running Services**
+### Option 2: Using Docker Compose
-To stop the services started via Docker Compose, you can use:
+Docker Compose simplifies building and running the service, especially for local development and testing.
+
+#### 1. Clone Repository
```bash
-docker-compose -f docker-compose.local.yml down
-# OR
-docker-compose -f docker-compose.hub.yml down
+git clone https://github.com/unclecode/crawl4ai.git
+cd crawl4ai
```
-If the containers don’t stop and the application is still running, check the running containers:
+#### 2. Environment Setup (API Keys)
+
+If you plan to use LLMs, copy the example environment file and add your API keys. This file should be in the **project root directory**.
```bash
-docker ps
+# Make sure you are in the 'crawl4ai' root directory
+cp deploy/docker/.llm.env.example .llm.env
+
+# Now edit .llm.env and add your API keys
```
-Find the `CONTAINER ID` of the running service and stop it forcefully:
+#### 3. Build and Run with Compose
-```bash
-docker stop
-```
+The `docker-compose.yml` file in the project root provides a simplified approach that automatically handles architecture detection using buildx.
----
+* **Run Pre-built Image from Docker Hub:**
+ ```bash
+ # Pulls and runs the release candidate from Docker Hub
+ # Automatically selects the correct architecture
+ IMAGE=unclecode/crawl4ai:latest docker compose up -d
+ ```
-### **Debugging with Docker Compose**
+* **Build and Run Locally:**
+ ```bash
+ # Builds the image locally using Dockerfile and runs it
+ # Automatically uses the correct architecture for your machine
+ docker compose up --build -d
+ ```
-- **Check Logs**: To view the container logs:
- ```bash
- docker-compose -f docker-compose.local.yml logs -f
- ```
-
-- **Remove Orphaned Containers**: If the service is still running unexpectedly:
- ```bash
- docker-compose -f docker-compose.local.yml down --remove-orphans
- ```
-
-- **Manually Remove Network**: If the network is still in use:
- ```bash
- docker network ls
- docker network rm crawl4ai_default
- ```
-
----
-
-### Why Use Docker Compose?
-
-Docker Compose is the recommended way to deploy Crawl4AI because:
-1. It simplifies multi-container setups.
-2. Allows you to define environment variables, resources, and ports in a single file.
-3. Makes it easier to switch between local development and production-ready images.
-
-For example, your `docker-compose.yml` could include API keys, token settings, and memory limits, making deployment quick and consistent.
-
-
-
-
-## API Security 🔒
-
-### Understanding CRAWL4AI_API_TOKEN
-
-The `CRAWL4AI_API_TOKEN` provides optional security for your Crawl4AI instance:
-
-- If `CRAWL4AI_API_TOKEN` is set: All API endpoints (except `/health`) require authentication
-- If `CRAWL4AI_API_TOKEN` is not set: The API is publicly accessible
-
-```bash
-# Secured Instance
-docker run -p 11235:11235 -e CRAWL4AI_API_TOKEN=your_secret_token unclecode/crawl4ai:all
-
-# Unsecured Instance
-docker run -p 11235:11235 unclecode/crawl4ai:all
-```
-
-### Making API Calls
-
-For secured instances, include the token in all requests:
-
-```python
-import requests
-
-# Setup headers if token is being used
-api_token = "your_secret_token" # Same token set in CRAWL4AI_API_TOKEN
-headers = {"Authorization": f"Bearer {api_token}"} if api_token else {}
-
-# Making authenticated requests
-response = requests.post(
- "http://localhost:11235/crawl",
- headers=headers,
- json={
- "urls": "https://example.com",
- "priority": 10
- }
-)
-
-# Checking task status
-task_id = response.json()["task_id"]
-status = requests.get(
- f"http://localhost:11235/task/{task_id}",
- headers=headers
-)
-```
-
-### Using with Docker Compose
-
-In your `docker-compose.yml`:
-```yaml
-services:
- crawl4ai:
- image: unclecode/crawl4ai:all
- environment:
- - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional
- # ... other configuration
-```
-
-Then either:
-1. Set in `.env` file:
-```env
-CRAWL4AI_API_TOKEN=your_secret_token
-```
-
-2. Or set via command line:
-```bash
-CRAWL4AI_API_TOKEN=your_secret_token docker-compose up
-```
-
-> **Security Note**: If you enable the API token, make sure to keep it secure and never commit it to version control. The token will be required for all API endpoints except the health check endpoint (`/health`).
-
-## Configuration Options 🔧
-
-### Environment Variables
-
-You can configure the service using environment variables:
-
-```bash
-# Basic configuration
-docker run -p 11235:11235 \
- -e MAX_CONCURRENT_TASKS=5 \
- unclecode/crawl4ai:all
-
-# With security and LLM support
-docker run -p 11235:11235 \
- -e CRAWL4AI_API_TOKEN=your_secret_token \
- -e OPENAI_API_KEY=sk-... \
- -e ANTHROPIC_API_KEY=sk-ant-... \
- unclecode/crawl4ai:all
-```
-
-### Using Docker Compose (Recommended) 🐳
-
-Create a `docker-compose.yml`:
-
-```yaml
-version: '3.8'
-
-services:
- crawl4ai:
- image: unclecode/crawl4ai:all
- ports:
- - "11235:11235"
- environment:
- - CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-} # Optional API security
- - MAX_CONCURRENT_TASKS=5
- # LLM Provider Keys
- - OPENAI_API_KEY=${OPENAI_API_KEY:-}
- - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
- volumes:
- - /dev/shm:/dev/shm
- deploy:
- resources:
- limits:
- memory: 4G
- reservations:
- memory: 1G
-```
-
-You can run it in two ways:
-
-1. Using environment variables directly:
-```bash
-CRAWL4AI_API_TOKEN=secret123 OPENAI_API_KEY=sk-... docker-compose up
-```
-
-2. Using a `.env` file (recommended):
-Create a `.env` file in the same directory:
-```env
-# API Security (optional)
-CRAWL4AI_API_TOKEN=your_secret_token
-
-# LLM Provider Keys
-OPENAI_API_KEY=sk-...
-ANTHROPIC_API_KEY=sk-ant-...
-
-# Other Configuration
-MAX_CONCURRENT_TASKS=5
-```
-
-Then simply run:
-```bash
-docker-compose up
-```
-
-### Testing the Deployment 🧪
-
-```python
-import requests
-
-# For unsecured instances
-def test_unsecured():
- # Health check
- health = requests.get("http://localhost:11235/health")
- print("Health check:", health.json())
-
- # Basic crawl
- response = requests.post(
- "http://localhost:11235/crawl",
- json={
- "urls": "https://www.nbcnews.com/business",
- "priority": 10
- }
- )
- task_id = response.json()["task_id"]
- print("Task ID:", task_id)
-
-# For secured instances
-def test_secured(api_token):
- headers = {"Authorization": f"Bearer {api_token}"}
+* **Customize the Build:**
+ ```bash
+ # Build with all features (includes torch and transformers)
+ INSTALL_TYPE=all docker compose up --build -d
- # Basic crawl with authentication
- response = requests.post(
- "http://localhost:11235/crawl",
- headers=headers,
- json={
- "urls": "https://www.nbcnews.com/business",
- "priority": 10
- }
- )
- task_id = response.json()["task_id"]
- print("Task ID:", task_id)
-```
+ # Build with GPU support (for AMD64 platforms)
+ ENABLE_GPU=true docker compose up --build -d
+ ```
-### LLM Extraction Example 🤖
+> The server will be available at `http://localhost:11235`.
-When you've configured your LLM provider keys (via environment variables or `.env`), you can use LLM extraction:
+#### 4. Stopping the Service
-```python
-request = {
- "urls": "https://example.com",
- "extraction_config": {
- "type": "llm",
- "params": {
- "provider": "openai/gpt-4",
- "instruction": "Extract main topics from the page"
- }
- }
-}
-
-# Make the request (add headers if using API security)
-response = requests.post("http://localhost:11235/crawl", json=request)
-```
-
-> **Note**: Remember to add `.env` to your `.gitignore` to keep your API keys secure!
-
-
-## Usage Examples 📝
-
-### Basic Crawling
-
-```python
-request = {
- "urls": "https://www.nbcnews.com/business",
- "priority": 10
-}
-
-response = requests.post("http://localhost:11235/crawl", json=request)
-task_id = response.json()["task_id"]
-
-# Get results
-result = requests.get(f"http://localhost:11235/task/{task_id}")
-```
-
-### Structured Data Extraction
-
-```python
-schema = {
- "name": "Crypto Prices",
- "baseSelector": ".cds-tableRow-t45thuk",
- "fields": [
- {
- "name": "crypto",
- "selector": "td:nth-child(1) h2",
- "type": "text",
- },
- {
- "name": "price",
- "selector": "td:nth-child(2)",
- "type": "text",
- }
- ],
-}
-
-request = {
- "urls": "https://www.coinbase.com/explore",
- "extraction_config": {
- "type": "json_css",
- "params": {"schema": schema}
- }
-}
-```
-
-### Dynamic Content Handling
-
-```python
-request = {
- "urls": "https://www.nbcnews.com/business",
- "js_code": [
- "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
- ],
- "wait_for": "article.tease-card:nth-child(10)"
-}
-```
-
-### AI-Powered Extraction (Full Version)
-
-```python
-request = {
- "urls": "https://www.nbcnews.com/business",
- "extraction_config": {
- "type": "cosine",
- "params": {
- "semantic_filter": "business finance economy",
- "word_count_threshold": 10,
- "max_dist": 0.2,
- "top_k": 3
- }
- }
-}
-```
-
-## Platform-Specific Instructions 💻
-
-### macOS
```bash
-docker pull unclecode/crawl4ai:basic
-docker run -p 11235:11235 unclecode/crawl4ai:basic
+# Stop the service
+docker compose down
```
-### Ubuntu
+### Option 3: Manual Local Build & Run
+
+Use this option if you prefer not to use Docker Compose, or if you want direct control over the build and run process.
+
+#### 1. Clone Repository & Setup Environment
+
+Follow steps 1 and 2 from the Docker Compose section above (clone repo, `cd crawl4ai`, create `.llm.env` in the root).
+
+#### 2. Build the Image (Multi-Arch)
+
+Use `docker buildx` to build the image. Crawl4AI now uses buildx to handle multi-architecture builds automatically.
+
```bash
-# Basic version
-docker pull unclecode/crawl4ai:basic
-docker run -p 11235:11235 unclecode/crawl4ai:basic
+# Make sure you are in the 'crawl4ai' root directory
+# Build for the current architecture and load it into Docker
+docker buildx build -t crawl4ai-local:latest --load .
-# With GPU support
-docker pull unclecode/crawl4ai:gpu
-docker run --gpus all -p 11235:11235 unclecode/crawl4ai:gpu
+# Or build for multiple architectures (useful for publishing).
+# Note: multi-platform builds cannot be imported with --load; push them to a registry instead.
+docker buildx build --platform linux/amd64,linux/arm64 -t crawl4ai-local:latest --push .
+
+# Build with additional options
+docker buildx build \
+ --build-arg INSTALL_TYPE=all \
+ --build-arg ENABLE_GPU=false \
+ -t crawl4ai-local:latest --load .
```
-### Windows (PowerShell)
-```powershell
-docker pull unclecode/crawl4ai:basic
-docker run -p 11235:11235 unclecode/crawl4ai:basic
+#### 3. Run the Container
+
+* **Basic run (no LLM support):**
+ ```bash
+ docker run -d \
+ -p 11235:11235 \
+ --name crawl4ai-standalone \
+ --shm-size=1g \
+ crawl4ai-local:latest
+ ```
+
+* **With LLM support:**
+ ```bash
+ # Make sure .llm.env is in the current directory (project root)
+ docker run -d \
+ -p 11235:11235 \
+ --name crawl4ai-standalone \
+ --env-file .llm.env \
+ --shm-size=1g \
+ crawl4ai-local:latest
+ ```
+
+> The server will be available at `http://localhost:11235`.
+
+#### 4. Stopping the Manual Container
+
+```bash
+docker stop crawl4ai-standalone && docker rm crawl4ai-standalone
```
-## Testing 🧪
+---
-Save this as `test_docker.py`:
+## MCP (Model Context Protocol) Support
+
+Crawl4AI server includes support for the Model Context Protocol (MCP), allowing you to connect the server's capabilities directly to MCP-compatible clients like Claude Code.
+
+### What is MCP?
+
+MCP is an open protocol that standardizes how applications provide context to LLMs. It allows AI models to access external tools, data sources, and services through a standardized interface.
+
+### Connecting via MCP
+
+The Crawl4AI server exposes two MCP endpoints:
+
+- **Server-Sent Events (SSE)**: `http://localhost:11235/mcp/sse`
+- **WebSocket**: `ws://localhost:11235/mcp/ws`
+
+### Using with Claude Code
+
+You can add Crawl4AI as an MCP tool provider in Claude Code with a simple command:
+
+```bash
+# Add the Crawl4AI server as an MCP provider
+claude mcp add --transport sse c4ai-sse http://localhost:11235/mcp/sse
+
+# List all MCP providers to verify it was added
+claude mcp list
+```
+
+Once connected, Claude Code can directly use Crawl4AI's capabilities like screenshot capture, PDF generation, and HTML processing without having to make separate API calls.
+
+### Available MCP Tools
+
+When connected via MCP, the following tools are available:
+
+- `md` - Generate markdown from web content
+- `html` - Extract preprocessed HTML
+- `screenshot` - Capture webpage screenshots
+- `pdf` - Generate PDF documents
+- `execute_js` - Run JavaScript on web pages
+- `crawl` - Perform multi-URL crawling
+- `ask` - Query the Crawl4AI library context
+
+### Testing MCP Connections
+
+You can test the MCP WebSocket connection using the test file included in the repository:
+
+```bash
+# From the repository root
+python tests/mcp/test_mcp_socket.py
+```
+
+### MCP Schemas
+
+Access the MCP tool schemas at `http://localhost:11235/mcp/schema` for detailed information on each tool's parameters and capabilities.
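+
+For a quick look at the schemas from Python (a minimal sketch; assumes the server is running locally and the `requests` package is installed):
+
+```python
+import requests
+
+# Fetch the MCP tool schemas from a local Crawl4AI server
+resp = requests.get("http://localhost:11235/mcp/schema", timeout=10)
+resp.raise_for_status()
+print(resp.json())
+```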
+
+---
+
+## Additional API Endpoints
+
+In addition to the core `/crawl` and `/crawl/stream` endpoints, the server provides several specialized endpoints:
+
+### HTML Extraction Endpoint
+
+```
+POST /html
+```
+
+Crawls the URL and returns preprocessed HTML optimized for schema extraction.
+
+```json
+{
+ "url": "https://example.com"
+}
+```
+
+### Screenshot Endpoint
+
+```
+POST /screenshot
+```
+
+Captures a full-page PNG screenshot of the specified URL.
+
+```json
+{
+ "url": "https://example.com",
+ "screenshot_wait_for": 2,
+ "output_path": "/path/to/save/screenshot.png"
+}
+```
+
+- `screenshot_wait_for`: Optional delay in seconds before capture (default: 2)
+- `output_path`: Optional path to save the screenshot (recommended)
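+
+A minimal Python sketch for calling this endpoint (one assumption: when `output_path` is omitted, the response JSON carries the PNG as a base64-encoded `screenshot` field; adjust if your server version returns a different shape):
+
+```python
+import base64
+import requests
+
+# Request a screenshot; without output_path, the image is returned in the response
+resp = requests.post(
+    "http://localhost:11235/screenshot",
+    json={"url": "https://example.com", "screenshot_wait_for": 2},
+    timeout=60,
+)
+resp.raise_for_status()
+with open("example.png", "wb") as f:
+    f.write(base64.b64decode(resp.json()["screenshot"]))
+```
+
+The PDF endpoint below follows the same request pattern.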
+
+### PDF Export Endpoint
+
+```
+POST /pdf
+```
+
+Generates a PDF document of the specified URL.
+
+```json
+{
+ "url": "https://example.com",
+ "output_path": "/path/to/save/document.pdf"
+}
+```
+
+- `output_path`: Optional path to save the PDF (recommended)
+
+### JavaScript Execution Endpoint
+
+```
+POST /execute_js
+```
+
+Executes JavaScript snippets on the specified URL and returns the full crawl result.
+
+```json
+{
+ "url": "https://example.com",
+ "scripts": [
+ "return document.title",
+ "return Array.from(document.querySelectorAll('a')).map(a => a.href)"
+ ]
+}
+```
+
+- `scripts`: List of JavaScript snippets to execute sequentially
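+
+A short Python sketch for this endpoint (fields as documented above; the exact shape of the returned crawl result may vary by version):
+
+```python
+import requests
+
+payload = {
+    "url": "https://example.com",
+    "scripts": [
+        "return document.title",
+        "return Array.from(document.querySelectorAll('a')).map(a => a.href)",
+    ],
+}
+resp = requests.post("http://localhost:11235/execute_js", json=payload, timeout=60)
+resp.raise_for_status()
+print(resp.json())  # full crawl result, including the scripts' return values
+```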
+
+---
+
+## Dockerfile Parameters
+
+You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file.
+
+```bash
+# Example: Build with 'all' features using buildx and push to a registry
+# (multi-platform builds cannot be imported with --load; use --push instead)
+docker buildx build \
+  --platform linux/amd64,linux/arm64 \
+  --build-arg INSTALL_TYPE=all \
+  -t yourname/crawl4ai-all:latest \
+  --push \
+  . # Build from root context
+```
+
+### Build Arguments Explained
+
+| Argument | Description | Default | Options |
+| :----------- | :--------------------------------------- | :-------- | :--------------------------------- |
+| INSTALL_TYPE | Feature set | `default` | `default`, `all`, `torch`, `transformer` |
+| ENABLE_GPU | GPU support (CUDA for AMD64) | `false` | `true`, `false` |
+| APP_HOME | Install path inside container (advanced) | `/app` | any valid path |
+| USE_LOCAL | Install library from local source | `true` | `true`, `false` |
+| GITHUB_REPO | Git repo to clone if USE_LOCAL=false | *(see Dockerfile)* | any git URL |
+| GITHUB_BRANCH| Git branch to clone if USE_LOCAL=false | `main` | any branch name |
+
+*(Note: PYTHON_VERSION is fixed by the `FROM` instruction in the Dockerfile)*
+
+### Build Best Practices
+
+1. **Choose the Right Install Type**
+ * `default`: Basic installation, smallest image size. Suitable for most standard web scraping and markdown generation.
+ * `all`: Full features including `torch` and `transformers` for advanced extraction strategies (e.g., CosineStrategy, certain LLM filters). Significantly larger image. Ensure you need these extras.
+2. **Platform Considerations**
+ * Use `buildx` for building multi-architecture images, especially for pushing to registries.
+ * Use `docker compose` profiles (`local-amd64`, `local-arm64`) for easy platform-specific local builds.
+3. **Performance Optimization**
+ * The image automatically includes platform-specific optimizations (OpenMP for AMD64, OpenBLAS for ARM64).
+
+---
+
+## Using the API
+
+Communicate with the running Docker server via its REST API (defaulting to `http://localhost:11235`). You can use the Python SDK or make direct HTTP requests.
+
+### Playground Interface
+
+A built-in web playground is available at `http://localhost:11235/playground` for testing and generating API requests. The playground allows you to:
+
+1. Configure `CrawlerRunConfig` and `BrowserConfig` using the main library's Python syntax
+2. Test crawling operations directly from the interface
+3. Generate corresponding JSON for REST API requests based on your configuration
+
+This is the easiest way to translate Python configuration to JSON requests when building integrations.
+
+### Python SDK
+
+Install the SDK: `pip install crawl4ai`
```python
-import requests
-import json
-import time
-import sys
+import asyncio
+from crawl4ai.docker_client import Crawl4aiDockerClient
+from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode # Assuming you have crawl4ai installed
-class Crawl4AiTester:
- def __init__(self, base_url: str = "http://localhost:11235"):
- self.base_url = base_url
-
- def submit_and_wait(self, request_data: dict, timeout: int = 300) -> dict:
- # Submit crawl job
- response = requests.post(f"{self.base_url}/crawl", json=request_data)
- task_id = response.json()["task_id"]
- print(f"Task ID: {task_id}")
-
- # Poll for result
- start_time = time.time()
- while True:
- if time.time() - start_time > timeout:
- raise TimeoutError(f"Task {task_id} timeout")
-
- result = requests.get(f"{self.base_url}/task/{task_id}")
- status = result.json()
-
- if status["status"] == "completed":
- return status
-
- time.sleep(2)
+async def main():
+ # Point to the correct server port
+ async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
+ # If JWT is enabled on the server, authenticate first:
+ # await client.authenticate("user@example.com") # See Server Configuration section
-def test_deployment():
- tester = Crawl4AiTester()
-
- # Test basic crawl
- request = {
- "urls": "https://www.nbcnews.com/business",
- "priority": 10
- }
-
- result = tester.submit_and_wait(request)
- print("Basic crawl successful!")
- print(f"Content length: {len(result['result']['markdown'])}")
+ # Example Non-streaming crawl
+ print("--- Running Non-Streaming Crawl ---")
+ results = await client.crawl(
+ ["https://httpbin.org/html"],
+ browser_config=BrowserConfig(headless=True), # Use library classes for config aid
+ crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+ )
+ if results: # client.crawl returns None on failure
+ print(f"Non-streaming results success: {results.success}")
+ if results.success:
+ for result in results: # Iterate through the CrawlResultContainer
+ print(f"URL: {result.url}, Success: {result.success}")
+ else:
+ print("Non-streaming crawl failed.")
+
+
+ # Example Streaming crawl
+ print("\n--- Running Streaming Crawl ---")
+ stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
+ try:
+ async for result in await client.crawl( # client.crawl returns an async generator for streaming
+ ["https://httpbin.org/html", "https://httpbin.org/links/5/0"],
+ browser_config=BrowserConfig(headless=True),
+ crawler_config=stream_config
+ ):
+ print(f"Streamed result: URL: {result.url}, Success: {result.success}")
+ except Exception as e:
+ print(f"Streaming crawl failed: {e}")
+
+
+ # Example Get schema
+ print("\n--- Getting Schema ---")
+ schema = await client.get_schema()
+ print(f"Schema received: {bool(schema)}") # Print whether schema was received
if __name__ == "__main__":
- test_deployment()
+ asyncio.run(main())
```
-## Advanced Configuration ⚙️
-### Crawler Parameters
+### Understanding Request Schema
-The `crawler_params` field allows you to configure the browser instance and crawling behavior. Here are key parameters you can use:
+Crucially, when sending configurations directly via JSON, they **must** follow the `{"type": "ClassName", "params": {...}}` structure for any non-primitive value (like config objects or strategies). Dictionaries must be wrapped as `{"type": "dict", "value": {...}}`.
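+
+For instance, mirroring the streaming example later in this guide, a `BrowserConfig` whose `viewport` is a plain dictionary would be encoded like this:
+
+```python
+# Primitive values pass through as-is; plain dicts get the type/value wrapper
+browser_config_payload = {
+    "type": "BrowserConfig",
+    "params": {
+        "headless": True,  # primitive: passed as-is
+        "viewport": {"type": "dict", "value": {"width": 1200, "height": 800}},  # dict: wrapped
+    },
+}
+```
+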
-```python
-request = {
- "urls": "https://example.com",
- "crawler_params": {
- # Browser Configuration
- "headless": True, # Run in headless mode
- "browser_type": "chromium", # chromium/firefox/webkit
- "user_agent": "custom-agent", # Custom user agent
- "proxy": "http://proxy:8080", # Proxy configuration
-
- # Performance & Behavior
- "page_timeout": 30000, # Page load timeout (ms)
- "verbose": True, # Enable detailed logging
- "semaphore_count": 5, # Concurrent request limit
-
- # Anti-Detection Features
- "simulate_user": True, # Simulate human behavior
- "magic": True, # Advanced anti-detection
- "override_navigator": True, # Override navigator properties
-
- # Session Management
- "user_data_dir": "./browser-data", # Browser profile location
- "use_managed_browser": True, # Use persistent browser
- }
-}
-```
-### Extra Parameters
+#### More Examples
-The `extra` field allows passing additional parameters directly to the crawler's `arun` function:
-
-```python
-request = {
- "urls": "https://example.com",
- "extra": {
- "word_count_threshold": 10, # Min words per block
- "only_text": True, # Extract only text
- "bypass_cache": True, # Force fresh crawl
- "process_iframes": True, # Include iframe content
- }
-}
-```
-
-### Complete Examples
-
-1. **Advanced News Crawling**
-```python
-request = {
- "urls": "https://www.nbcnews.com/business",
- "crawler_params": {
- "headless": True,
- "page_timeout": 30000,
- "remove_overlay_elements": True # Remove popups
- },
- "extra": {
- "word_count_threshold": 50, # Longer content blocks
- "bypass_cache": True # Fresh content
- },
- "css_selector": ".article-body"
-}
-```
-
-2. **Anti-Detection Configuration**
-```python
-request = {
- "urls": "https://example.com",
- "crawler_params": {
- "simulate_user": True,
- "magic": True,
- "override_navigator": True,
- "user_agent": "Mozilla/5.0 ...",
- "headers": {
- "Accept-Language": "en-US,en;q=0.9"
- }
- }
-}
-```
-
-3. **LLM Extraction with Custom Parameters**
-```python
-request = {
- "urls": "https://openai.com/pricing",
- "extraction_config": {
- "type": "llm",
- "params": {
- "provider": "openai/gpt-4",
- "schema": pricing_schema
- }
- },
- "crawler_params": {
- "verbose": True,
- "page_timeout": 60000
- },
- "extra": {
- "word_count_threshold": 1,
- "only_text": True
- }
-}
-```
-
-4. **Session-Based Dynamic Content**
-```python
-request = {
- "urls": "https://example.com",
- "crawler_params": {
- "session_id": "dynamic_session",
- "headless": False,
- "page_timeout": 60000
- },
- "js_code": ["window.scrollTo(0, document.body.scrollHeight);"],
- "wait_for": "js:() => document.querySelectorAll('.item').length > 10",
- "extra": {
- "delay_before_return_html": 2.0
- }
-}
-```
-
-5. **Screenshot with Custom Timing**
-```python
-request = {
- "urls": "https://example.com",
- "screenshot": True,
- "crawler_params": {
- "headless": True,
- "screenshot_wait_for": ".main-content"
- },
- "extra": {
- "delay_before_return_html": 3.0
- }
-}
-```
-
-### Parameter Reference Table
-
-| Category | Parameter | Type | Description |
-|----------|-----------|------|-------------|
-| Browser | headless | bool | Run browser in headless mode |
-| Browser | browser_type | str | Browser engine selection |
-| Browser | user_agent | str | Custom user agent string |
-| Network | proxy | str | Proxy server URL |
-| Network | headers | dict | Custom HTTP headers |
-| Timing | page_timeout | int | Page load timeout (ms) |
-| Timing | delay_before_return_html | float | Wait before capture |
-| Anti-Detection | simulate_user | bool | Human behavior simulation |
-| Anti-Detection | magic | bool | Advanced protection |
-| Session | session_id | str | Browser session ID |
-| Session | user_data_dir | str | Profile directory |
-| Content | word_count_threshold | int | Minimum words per block |
-| Content | only_text | bool | Text-only extraction |
-| Content | process_iframes | bool | Include iframe content |
-| Debug | verbose | bool | Detailed logging |
-| Debug | log_console | bool | Browser console logs |
-
-## Troubleshooting 🔍
-
-### Common Issues
-
-1. **Connection Refused**
- ```
- Error: Connection refused at localhost:11235
- ```
- Solution: Ensure the container is running and ports are properly mapped.
-
-2. **Resource Limits**
- ```
- Error: No available slots
- ```
- Solution: Increase MAX_CONCURRENT_TASKS or container resources.
-
-3. **GPU Access**
- ```
- Error: GPU not found
- ```
- Solution: Ensure proper NVIDIA drivers and use `--gpus all` flag.
-
-### Debug Mode
-
-Access container for debugging:
-```bash
-docker run -it --entrypoint /bin/bash unclecode/crawl4ai:all
-```
-
-View container logs:
-```bash
-docker logs [container_id]
-```
-
-## Best Practices 🌟
-
-1. **Resource Management**
- - Set appropriate memory and CPU limits
- - Monitor resource usage via health endpoint
- - Use basic version for simple crawling tasks
-
-2. **Scaling**
- - Use multiple containers for high load
- - Implement proper load balancing
- - Monitor performance metrics
-
-3. **Security**
- - Use environment variables for sensitive data
- - Implement proper network isolation
- - Regular security updates
-
-## API Reference 📚
-
-### Health Check
-```http
-GET /health
-```
-
-### Submit Crawl Task
-```http
-POST /crawl
-Content-Type: application/json
+**Extraction Strategy**
+```json
{
- "urls": "string or array",
- "extraction_config": {
- "type": "basic|llm|cosine|json_css",
- "params": {}
- },
- "priority": 1-10,
- "ttl": 3600
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "extraction_strategy": {
+ "type": "JsonCssExtractionStrategy",
+ "params": {
+ "schema": {
+ "type": "dict",
+ "value": {
+ "baseSelector": "article.post",
+ "fields": [
+ {"name": "title", "selector": "h1", "type": "text"},
+ {"name": "content", "selector": ".content", "type": "html"}
+ ]
+ }
+ }
+ }
+ }
+ }
+ }
}
```
-### Get Task Status
-```http
-GET /task/{task_id}
+
+### REST API Examples
+
+These examples assume the server is running on the default port `11235`.
+
+#### Simple Crawl
+
+```python
+import requests
+
+# Configuration objects converted to the required JSON structure
+browser_config_payload = {
+ "type": "BrowserConfig",
+ "params": {"headless": True}
+}
+crawler_config_payload = {
+ "type": "CrawlerRunConfig",
+ "params": {"stream": False, "cache_mode": "bypass"} # Use string value of enum
+}
+
+crawl_payload = {
+ "urls": ["https://httpbin.org/html"],
+ "browser_config": browser_config_payload,
+ "crawler_config": crawler_config_payload
+}
+response = requests.post(
+ "http://localhost:11235/crawl", # Updated port
+ # headers={"Authorization": f"Bearer {token}"}, # If JWT is enabled
+ json=crawl_payload
+)
+print(f"Status Code: {response.status_code}")
+if response.ok:
+ print(response.json())
+else:
+ print(f"Error: {response.text}")
+
```
-For more details, visit the [official documentation](https://docs.crawl4ai.com/).
\ No newline at end of file
+#### Streaming Results
+
+```python
+import json
+from typing import Optional
+
+import httpx  # Use httpx for async streaming example
+
+async def test_stream_crawl(token: Optional[str] = None):  # Token is optional; only needed if JWT is enabled
+ """Test the /crawl/stream endpoint with multiple URLs."""
+ url = "http://localhost:11235/crawl/stream" # Updated port
+ payload = {
+ "urls": [
+ "https://httpbin.org/html",
+ "https://httpbin.org/links/5/0",
+ ],
+ "browser_config": {
+ "type": "BrowserConfig",
+ "params": {"headless": True, "viewport": {"type": "dict", "value": {"width": 1200, "height": 800}}} # Viewport needs type:dict
+ },
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {"stream": True, "cache_mode": "bypass"}
+ }
+ }
+
+ headers = {}
+ # if token:
+ # headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled
+
+ try:
+ async with httpx.AsyncClient() as client:
+ async with client.stream("POST", url, json=payload, headers=headers, timeout=120.0) as response:
+ print(f"Status: {response.status_code} (Expected: 200)")
+ response.raise_for_status() # Raise exception for bad status codes
+
+ # Read streaming response line-by-line (NDJSON)
+ async for line in response.aiter_lines():
+ if line:
+ try:
+ data = json.loads(line)
+ # Check for completion marker
+ if data.get("status") == "completed":
+ print("Stream completed.")
+ break
+ print(f"Streamed Result: {json.dumps(data, indent=2)}")
+ except json.JSONDecodeError:
+ print(f"Warning: Could not decode JSON line: {line}")
+
+ except httpx.HTTPStatusError as e:
+ print(f"HTTP error occurred: {e.response.status_code} - {e.response.text}")
+ except Exception as e:
+ print(f"Error in streaming crawl test: {str(e)}")
+
+# To run this example:
+# import asyncio
+# asyncio.run(test_stream_crawl())
+```
+
+---
+
+## Metrics & Monitoring
+
+Keep an eye on your crawler with these endpoints:
+
+- `/health` - Quick health check
+- `/metrics` - Detailed Prometheus metrics
+- `/schema` - Full API schema
+
+Example health check:
+```bash
+curl http://localhost:11235/health
+```
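+
+If you prefer probing from Python, here's a minimal sketch using `requests` (assuming, as the endpoints above suggest, that `/health` returns JSON and `/metrics` returns Prometheus plain text):
+
+```python
+import requests
+
+BASE = "http://localhost:11235"
+
+# Liveness probe against the /health endpoint listed above
+health = requests.get(f"{BASE}/health", timeout=10)
+print("Health:", health.status_code, health.text)
+
+# Prometheus metrics are plain text; show the first few lines
+metrics = requests.get(f"{BASE}/metrics", timeout=10)
+print("\n".join(metrics.text.splitlines()[:5]))
+```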
+
+---
+
+*(Deployment Scenarios and Complete Examples sections remain the same, maybe update links if examples moved)*
+
+---
+
+## Server Configuration
+
+The server's behavior can be customized through the `config.yml` file.
+
+### Understanding config.yml
+
+The configuration file is loaded from `/app/config.yml` inside the container. By default, the file from `deploy/docker/config.yml` in the repository is copied there during the build.
+
+Here's a detailed breakdown of the configuration options (using defaults from `deploy/docker/config.yml`):
+
+```yaml
+# Application Configuration
+app:
+ title: "Crawl4AI API"
+ version: "1.0.0" # Consider setting this to match library version, e.g., "0.5.1"
+ host: "0.0.0.0"
+ port: 8020 # NOTE: This port is used ONLY when running server.py directly. Gunicorn overrides this (see supervisord.conf).
+ reload: False # Default set to False - suitable for production
+ timeout_keep_alive: 300
+
+# Default LLM Configuration
+llm:
+ provider: "openai/gpt-4o-mini"
+ api_key_env: "OPENAI_API_KEY"
+ # api_key: sk-... # If you pass the API key directly then api_key_env will be ignored
+
+# Redis Configuration (Used by internal Redis server managed by supervisord)
+redis:
+ host: "localhost"
+ port: 6379
+ db: 0
+ password: ""
+ # ... other redis options ...
+
+# Rate Limiting Configuration
+rate_limiting:
+ enabled: True
+ default_limit: "1000/minute"
+ trusted_proxies: []
+ storage_uri: "memory://" # Use "redis://localhost:6379" if you need persistent/shared limits
+
+# Security Configuration
+security:
+ enabled: false # Master toggle for security features
+ jwt_enabled: false # Enable JWT authentication (requires security.enabled=true)
+ https_redirect: false # Force HTTPS (requires security.enabled=true)
+ trusted_hosts: ["*"] # Allowed hosts (use specific domains in production)
+ headers: # Security headers (applied if security.enabled=true)
+ x_content_type_options: "nosniff"
+ x_frame_options: "DENY"
+ content_security_policy: "default-src 'self'"
+ strict_transport_security: "max-age=63072000; includeSubDomains"
+
+# Crawler Configuration
+crawler:
+ memory_threshold_percent: 95.0
+ rate_limiter:
+ base_delay: [1.0, 2.0] # Min/max delay between requests in seconds for dispatcher
+ timeouts:
+ stream_init: 30.0 # Timeout for stream initialization
+ batch_process: 300.0 # Timeout for non-streaming /crawl processing
+
+# Logging Configuration
+logging:
+ level: "INFO"
+ format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
+
+# Observability Configuration
+observability:
+ prometheus:
+ enabled: True
+ endpoint: "/metrics"
+ health_check:
+ endpoint: "/health"
+```
+
+*(JWT Authentication section remains the same, just note the default port is now 11235 for requests)*
+
+*(Configuration Tips and Best Practices remain the same)*
+
+### Customizing Your Configuration
+
+You can override the default `config.yml`.
+
+#### Method 1: Modify Before Build
+
+1. Edit the `deploy/docker/config.yml` file in your local repository clone.
+2. Build the image using `docker buildx` or `docker compose --profile local-... up --build`. The modified file will be copied into the image.
+
+#### Method 2: Runtime Mount (Recommended for Custom Deploys)
+
+1. Create your custom configuration file, e.g., `my-custom-config.yml` locally. Ensure it contains all necessary sections.
+2. Mount it when running the container:
+
+ * **Using `docker run`:**
+ ```bash
+ # Assumes my-custom-config.yml is in the current directory
+ docker run -d -p 11235:11235 \
+ --name crawl4ai-custom-config \
+ --env-file .llm.env \
+ --shm-size=1g \
+ -v $(pwd)/my-custom-config.yml:/app/config.yml \
+ unclecode/crawl4ai:latest # Or your specific tag
+ ```
+
+ * **Using `docker-compose.yml`:** Add a `volumes` section to the service definition:
+ ```yaml
+ services:
+ crawl4ai-hub-amd64: # Or your chosen service
+ image: unclecode/crawl4ai:latest
+ profiles: ["hub-amd64"]
+ <<: *base-config
+ volumes:
+ # Mount local custom config over the default one in the container
+ - ./my-custom-config.yml:/app/config.yml
+ # Keep the shared memory volume from base-config
+ - /dev/shm:/dev/shm
+ ```
+ *(Note: Ensure `my-custom-config.yml` is in the same directory as `docker-compose.yml`)*
+
+> 💡 When mounting, your custom file *completely replaces* the default one. Ensure it's a valid and complete configuration.
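+
+To verify the mount took effect, you can inspect the file inside the running container (container name taken from the `docker run` example above):
+
+```bash
+docker exec crawl4ai-custom-config head -n 20 /app/config.yml
+```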
+
+### Configuration Recommendations
+
+1. **Security First** 🔒
+   - Always enable security in production (see the example config after this list)
+ - Use specific trusted_hosts instead of wildcards
+ - Set up proper rate limiting to protect your server
+ - Consider your environment before enabling HTTPS redirect
+
+2. **Resource Management** 💻
+ - Adjust memory_threshold_percent based on available RAM
+ - Set timeouts according to your content size and network conditions
+ - Use Redis for rate limiting in multi-container setups
+
+3. **Monitoring** 📊
+ - Enable Prometheus if you need metrics
+ - Set DEBUG logging in development, INFO in production
+ - Regular health check monitoring is crucial
+
+4. **Performance Tuning** ⚡
+ - Start with conservative rate limiter delays
+ - Increase batch_process timeout for large content
+ - Adjust stream_init timeout based on initial response times
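+
+As a rough illustration, here's what several of these recommendations might look like as `config.yml` overrides (a sketch only; keys come from the default file above, values are illustrative):
+
+```yaml
+security:
+  enabled: true
+  jwt_enabled: true
+  trusted_hosts: ["api.example.com"]     # specific domains instead of "*"
+
+rate_limiting:
+  enabled: true
+  default_limit: "120/minute"            # tighter than the 1000/minute default
+  storage_uri: "redis://localhost:6379"  # shared limits across containers
+
+crawler:
+  memory_threshold_percent: 85.0         # leave headroom on smaller hosts
+  timeouts:
+    batch_process: 600.0                 # allow longer non-streaming batches
+
+logging:
+  level: "INFO"                          # use DEBUG in development
+```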
+
+## Getting Help
+
+We're here to help you succeed with Crawl4AI! Here's how to get support:
+
+- 📖 Check our [full documentation](https://docs.crawl4ai.com)
+- 🐛 Found a bug? [Open an issue](https://github.com/unclecode/crawl4ai/issues)
+- 💬 Join our [Discord community](https://discord.gg/crawl4ai)
+- ⭐ Star us on GitHub to show support!
+
+## Summary
+
+In this guide, we've covered everything you need to get started with Crawl4AI's Docker deployment:
+- Building and running the Docker container
+- Configuring the environment
+- Using the interactive playground for testing
+- Making API requests with proper typing
+- Using the Python SDK
+- Leveraging specialized endpoints for screenshots, PDFs, and JavaScript execution
+- Connecting via the Model Context Protocol (MCP)
+- Monitoring your deployment
+
+The new playground interface at `http://localhost:11235/playground` makes it much easier to test configurations and generate the corresponding JSON for API requests.
+
+For AI application developers, the MCP integration allows tools like Claude Code to directly access Crawl4AI's capabilities without complex API handling.
+
+Remember, the examples in the `examples` folder are your friends - they show real-world usage patterns that you can adapt for your needs.
+
+Keep exploring, and don't hesitate to reach out if you need help! We're building something amazing together. 🚀
+
+Happy crawling! 🕷️
diff --git a/docs/md_v2/core/examples.md b/docs/md_v2/core/examples.md
new file mode 100644
index 00000000..93989552
--- /dev/null
+++ b/docs/md_v2/core/examples.md
@@ -0,0 +1,115 @@
+# Code Examples
+
+This page provides a comprehensive list of example scripts that demonstrate various features and capabilities of Crawl4AI. Each example is designed to showcase specific functionality, making it easier for you to understand how to implement these features in your own projects.
+
+## Getting Started Examples
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Hello World | A simple introductory example demonstrating basic usage of AsyncWebCrawler with JavaScript execution and content filtering. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/hello_world.py) |
+| Quickstart | A comprehensive collection of examples showcasing various features including basic crawling, content cleaning, link analysis, JavaScript execution, CSS selectors, media handling, custom hooks, proxy configuration, screenshots, and multiple extraction strategies. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart.py) |
+| Quickstart Set 1 | Basic examples for getting started with Crawl4AI. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_examples_set_1.py) |
+| Quickstart Set 2 | More advanced examples for working with Crawl4AI. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/quickstart_examples_set_2.py) |
+
+## Browser & Crawling Features
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Built-in Browser | Demonstrates how to use the built-in browser capabilities. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/builtin_browser_example.py) |
+| Browser Optimization | Focuses on browser performance optimization techniques. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/browser_optimization_example.py) |
+| arun vs arun_many | Compares the `arun` and `arun_many` methods for single vs. multiple URL crawling. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/arun_vs_arun_many.py) |
+| Multiple URLs | Shows how to crawl multiple URLs asynchronously. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/async_webcrawler_multiple_urls_example.py) |
+| Page Interaction | Guide on interacting with dynamic elements through clicks. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/tutorial_dynamic_clicks.md) |
+| Crawler Monitor | Shows how to monitor the crawler's activities and status. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/crawler_monitor_example.py) |
+| Full Page Screenshot & PDF | Guide on capturing full-page screenshots and PDFs from massive webpages. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/full_page_screenshot_and_pdf_export.md) |
+
+## Advanced Crawling & Deep Crawling
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Deep Crawling | An extensive tutorial on deep crawling capabilities, demonstrating BFS and BestFirst strategies, stream vs. non-stream execution, filters, scorers, and advanced configurations. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/deepcrawl_example.py) |
+| Dispatcher | Shows how to use the crawl dispatcher for advanced workload management. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/dispatcher_example.py) |
+| Storage State | Tutorial on managing browser storage state for persistence. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/storage_state_tutorial.md) |
+| Network Console Capture | Demonstrates how to capture and analyze network requests and console logs. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/network_console_capture_example.py) |
+
+## Extraction Strategies
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Extraction Strategies | Demonstrates different extraction strategies with various input formats (markdown, HTML, fit_markdown) and JSON-based extractors (CSS and XPath). | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/extraction_strategies_examples.py) |
+| Scraping Strategies | Compares the performance of different scraping strategies. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/scraping_strategies_performance.py) |
+| LLM Extraction | Demonstrates LLM-based extraction specifically for OpenAI pricing data. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/llm_extraction_openai_pricing.py) |
+| LLM Markdown | Shows how to use LLMs to generate markdown from crawled content. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/llm_markdown_generator.py) |
+| Summarize Page | Shows how to summarize web page content. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/summarize_page.py) |
+
+## E-commerce & Specialized Crawling
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Amazon Product Extraction | Demonstrates how to extract structured product data from Amazon search results using CSS selectors. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/amazon_product_extraction_direct_url.py) |
+| Amazon with Hooks | Shows how to use hooks with Amazon product extraction. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/amazon_product_extraction_using_hooks.py) |
+| Amazon with JavaScript | Demonstrates using custom JavaScript for Amazon product extraction. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/amazon_product_extraction_using_use_javascript.py) |
+| Crypto Analysis | Demonstrates how to crawl and analyze cryptocurrency data. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/crypto_analysis_example.py) |
+| SERP API | Demonstrates using Crawl4AI with search engine result pages. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/serp_api_project_11_feb.py) |
+
+## Customization & Security
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Hooks | Illustrates how to use hooks at different stages of the crawling process for advanced customization. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/hooks_example.py) |
+| Identity-Based Browsing | Illustrates identity-based browsing configurations for authentic browsing experiences. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/identity_based_browsing.py) |
+| Proxy Rotation | Shows how to use proxy rotation for web scraping and avoiding IP blocks. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/proxy_rotation_demo.py) |
+| SSL Certificate | Illustrates SSL certificate handling and verification. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/ssl_example.py) |
+| Language Support | Shows how to handle different languages during crawling. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/language_support_example.py) |
+| Geolocation | Demonstrates how to use geolocation features. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/use_geo_location.py) |
+
+## Docker & Deployment
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Docker Config | Demonstrates how to create and use Docker configuration objects. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_config_obj.py) |
+| Docker Basic | A test suite for Docker deployment, showcasing various functionalities through the Docker API. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_example.py) |
+| Docker REST API | Shows how to interact with Crawl4AI Docker using REST API calls. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_rest_api.py) |
+| Docker SDK | Demonstrates using the Python SDK for Crawl4AI Docker. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_sdk.py) |
+
+## Application Examples
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Research Assistant | Demonstrates how to build a research assistant using Crawl4AI. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/research_assistant.py) |
+| REST Call | Shows how to make REST API calls with Crawl4AI. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/rest_call.py) |
+| Chainlit Integration | Shows how to integrate Crawl4AI with Chainlit. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/chainlit.md) |
+| Crawl4AI vs FireCrawl | Compares Crawl4AI with the FireCrawl library. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/crawlai_vs_firecrawl.py) |
+
+## Content Generation & Markdown
+
+| Example | Description | Link |
+|---------|-------------|------|
+| Content Source | Demonstrates how to work with different content sources in markdown generation. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/markdown/content_source_example.py) |
+| Content Source (Short) | A simplified version of content source usage. | [View Code](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/markdown/content_source_short_example.py) |
+| Built-in Browser Guide | Guide for using the built-in browser capabilities. | [View Guide](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/README_BUILTIN_BROWSER.md) |
+
+## Running the Examples
+
+To run any of these examples, you'll need to have Crawl4AI installed:
+
+```bash
+pip install crawl4ai
+```
+
+Then, from a clone of the repository, you can run an example script like this:
+
+```bash
+python docs/examples/hello_world.py
+```
+
+For examples that require additional dependencies or environment variables, refer to the comments at the top of each file.
+
+Some examples may require:
+- API keys (for LLM-based examples)
+- Docker setup (for Docker-related examples)
+- Additional dependencies (specified in the example files)
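+
+For instance, the LLM-based examples typically expect an API key in the environment before you run them (variable name taken from the examples' use of OpenAI; key value elided):
+
+```bash
+export OPENAI_API_KEY="sk-..."   # use your own key
+python docs/examples/llm_extraction_openai_pricing.py
+```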
+
+## Contributing New Examples
+
+If you've created an interesting example that demonstrates a unique use case or feature of Crawl4AI, we encourage you to contribute it to our examples collection. Please see our [contribution guidelines](https://github.com/unclecode/crawl4ai/blob/main/CONTRIBUTORS.md) for more information.
\ No newline at end of file
diff --git a/docs/md_v2/core/link-media.md b/docs/md_v2/core/link-media.md
index cccc8df0..58bedcbc 100644
--- a/docs/md_v2/core/link-media.md
+++ b/docs/md_v2/core/link-media.md
@@ -4,7 +4,35 @@ In this tutorial, you’ll learn how to:
1. Extract links (internal, external) from crawled pages
2. Filter or exclude specific domains (e.g., social media or custom domains)
3. Access and manage media data (especially images) in the crawl result
4. Configure your crawler to exclude or prioritize certain images
+
+### 3.2 Excluding Images
+
+#### Excluding External Images
+
+If you're dealing with heavy pages or want to skip third-party images (advertisements, for example), you can turn on:
+
+```python
+crawler_cfg = CrawlerRunConfig(
+    exclude_external_images=True
+)
+```
+
+This setting attempts to discard images from outside the primary domain, keeping only those from the site you're crawling.
+
+#### Excluding All Images
+
+If you want to completely remove all images from the page to maximize performance and reduce memory usage, use:
+
+```python
+crawler_cfg = CrawlerRunConfig(
+    exclude_all_images=True
+)
+```
+
+This setting removes all images very early in the processing pipeline, which significantly improves memory efficiency and processing speed. This is particularly useful when:
+- You don't need image data in your results
+- You're crawling image-heavy pages that cause memory issues
+- You want to focus only on text content
+- You need to maximize crawling speed
> **Prerequisites**
@@ -271,8 +299,41 @@ Each extracted table contains:
- **`screenshot`**: Set to `True` if you want a full-page screenshot stored as `base64` in `result.screenshot`.
- **`pdf`**: Set to `True` if you want a PDF version of the page in `result.pdf`.
+- **`capture_mhtml`**: Set to `True` if you want an MHTML snapshot of the page in `result.mhtml`. This format preserves the entire web page with all its resources (CSS, images, scripts) in a single file, making it perfect for archiving or offline viewing.
- **`wait_for_images`**: If `True`, attempts to wait until images are fully loaded before final extraction.
+#### Example: Capturing Page as MHTML
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+ crawler_cfg = CrawlerRunConfig(
+ capture_mhtml=True # Enable MHTML capture
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun("https://example.com", config=crawler_cfg)
+
+ if result.success and result.mhtml:
+ # Save the MHTML snapshot to a file
+ with open("example.mhtml", "w", encoding="utf-8") as f:
+ f.write(result.mhtml)
+ print("MHTML snapshot saved to example.mhtml")
+ else:
+ print("Failed to capture MHTML:", result.error_message)
+
+if __name__ == "__main__":
+ asyncio.run(main())
+```
+
+The MHTML format is particularly useful because:
+- It captures the complete page state including all resources
+- It can be opened in most modern browsers for offline viewing
+- It preserves the page exactly as it appeared during crawling
+- It's a single file, making it easy to store and transfer
+
---
## 4. Putting It All Together: Link & Media Filtering
diff --git a/docs/md_v2/core/markdown-generation.md b/docs/md_v2/core/markdown-generation.md
index ac27e5b2..e6f5e12a 100644
--- a/docs/md_v2/core/markdown-generation.md
+++ b/docs/md_v2/core/markdown-generation.md
@@ -111,13 +111,71 @@ Some commonly used `options`:
- **`skip_internal_links`** (bool): If `True`, omit `#localAnchors` or internal links referencing the same page.
- **`include_sup_sub`** (bool): Attempt to handle `<sup>` / `<sub>` in a more readable way.
+## 4. Selecting the HTML Source for Markdown Generation
+
+The `content_source` parameter allows you to control which HTML content is used as input for markdown generation. This gives you flexibility in how the HTML is processed before conversion to markdown.
+
+```python
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+ # Option 1: Use the raw HTML directly from the webpage (before any processing)
+ raw_md_generator = DefaultMarkdownGenerator(
+ content_source="raw_html",
+ options={"ignore_links": True}
+ )
+
+ # Option 2: Use the cleaned HTML (after scraping strategy processing - default)
+ cleaned_md_generator = DefaultMarkdownGenerator(
+ content_source="cleaned_html", # This is the default
+ options={"ignore_links": True}
+ )
+
+ # Option 3: Use preprocessed HTML optimized for schema extraction
+ fit_md_generator = DefaultMarkdownGenerator(
+ content_source="fit_html",
+ options={"ignore_links": True}
+ )
+
+ # Use one of the generators in your crawler config
+ config = CrawlerRunConfig(
+ markdown_generator=raw_md_generator # Try each of the generators
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ result = await crawler.arun("https://example.com", config=config)
+ if result.success:
+ print("Markdown:\n", result.markdown.raw_markdown[:500])
+ else:
+ print("Crawl failed:", result.error_message)
+
+if __name__ == "__main__":
+ import asyncio
+ asyncio.run(main())
+```
+
+### HTML Source Options
+
+- **`"cleaned_html"`** (default): Uses the HTML after it has been processed by the scraping strategy. This HTML is typically cleaner and more focused on content, with some boilerplate removed.
+
+- **`"raw_html"`**: Uses the original HTML directly from the webpage, before any cleaning or processing. This preserves more of the original content, but may include navigation bars, ads, footers, and other elements that might not be relevant to the main content.
+
+- **`"fit_html"`**: Uses HTML preprocessed for schema extraction. This HTML is optimized for structured data extraction and may have certain elements simplified or removed.
+
+### When to Use Each Option
+
+- Use **`"cleaned_html"`** (default) for most cases where you want a balance of content preservation and noise removal.
+- Use **`"raw_html"`** when you need to preserve all original content, or when the cleaning process is removing content you actually want to keep.
+- Use **`"fit_html"`** when working with structured data or when you need HTML that's optimized for schema extraction.
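+
+To compare the three sources side by side, here's a small sketch (using only the APIs shown above; output lengths will vary by page):
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
+
+async def compare_sources(url: str):
+    async with AsyncWebCrawler() as crawler:
+        for source in ("raw_html", "cleaned_html", "fit_html"):
+            config = CrawlerRunConfig(
+                markdown_generator=DefaultMarkdownGenerator(content_source=source)
+            )
+            result = await crawler.arun(url, config=config)
+            if result.success:
+                print(f"{source}: {len(result.markdown.raw_markdown)} chars")
+
+asyncio.run(compare_sources("https://example.com"))
+```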
+
---
-## 4. Content Filters
+## 5. Content Filters
**Content filters** selectively remove or rank sections of text before turning them into Markdown. This is especially helpful if your page has ads, nav bars, or other clutter you don’t want.
-### 4.1 BM25ContentFilter
+### 5.1 BM25ContentFilter
If you have a **search query**, BM25 is a good choice:
@@ -146,7 +204,7 @@ config = CrawlerRunConfig(markdown_generator=md_generator)
**No query provided?** BM25 tries to glean a context from page metadata, or you can simply treat it as a scorched-earth approach that discards text with low generic score. Realistically, you want to supply a query for best results.
-### 4.2 PruningContentFilter
+### 5.2 PruningContentFilter
If you **don’t** have a specific query, or if you just want a robust “junk remover,” use `PruningContentFilter`. It analyzes text density, link density, HTML structure, and known patterns (like “nav,” “footer”) to systematically prune extraneous or repetitive sections.
@@ -170,7 +228,7 @@ prune_filter = PruningContentFilter(
- You want a broad cleanup without a user query.
- The page has lots of repeated sidebars, footers, or disclaimers that hamper text extraction.
-### 4.3 LLMContentFilter
+### 5.3 LLMContentFilter
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
@@ -247,7 +305,7 @@ filter = LLMContentFilter(
---
-## 5. Using Fit Markdown
+## 6. Using Fit Markdown
When a content filter is active, the library produces two forms of markdown inside `result.markdown`:
@@ -284,7 +342,7 @@ if __name__ == "__main__":
---
-## 6. The `MarkdownGenerationResult` Object
+## 7. The `MarkdownGenerationResult` Object
If your library stores detailed markdown output in an object like `MarkdownGenerationResult`, you’ll see fields such as:
@@ -315,7 +373,7 @@ Below is a **revised section** under “Combining Filters (BM25 + Pruning)” th
---
-## 7. Combining Filters (BM25 + Pruning) in Two Passes
+## 8. Combining Filters (BM25 + Pruning) in Two Passes
You might want to **prune out** noisy boilerplate first (with `PruningContentFilter`), and then **rank what’s left** against a user query (with `BM25ContentFilter`). You don’t have to crawl the page twice. Instead:
@@ -407,7 +465,7 @@ If your codebase or pipeline design allows applying multiple filters in one pass
---
-## 8. Common Pitfalls & Tips
+## 9. Common Pitfalls & Tips
1. **No Markdown Output?**
- Make sure the crawler actually retrieved HTML. If the site is heavily JS-based, you may need to enable dynamic rendering or wait for elements.
@@ -427,11 +485,12 @@ If your codebase or pipeline design allows applying multiple filters in one pass
---
-## 9. Summary & Next Steps
+## 10. Summary & Next Steps
In this **Markdown Generation Basics** tutorial, you learned to:
- Configure the **DefaultMarkdownGenerator** with HTML-to-text options.
+- Select different HTML sources using the `content_source` parameter.
- Use **BM25ContentFilter** for query-specific extraction or **PruningContentFilter** for general noise removal.
- Distinguish between raw and filtered markdown (`fit_markdown`).
- Leverage the `MarkdownGenerationResult` object to handle different forms of output (citations, references, etc.).
diff --git a/docs/md_v2/extraction/llm-strategies.md b/docs/md_v2/extraction/llm-strategies.md
index 785ff9b5..9f6a6b3e 100644
--- a/docs/md_v2/extraction/llm-strategies.md
+++ b/docs/md_v2/extraction/llm-strategies.md
@@ -2,7 +2,7 @@
In some cases, you need to extract **complex or unstructured** information from a webpage that a simple CSS/XPath schema cannot easily parse. Or you want **AI**-driven insights, classification, or summarization. For these scenarios, Crawl4AI provides an **LLM-based extraction strategy** that:
-1. Works with **any** large language model supported by [LightLLM](https://github.com/LightLLM) (Ollama, OpenAI, Claude, and more).
+1. Works with **any** large language model supported by [LiteLLM](https://github.com/BerriAI/litellm) (Ollama, OpenAI, Claude, and more).
2. Automatically splits content into chunks (if desired) to handle token limits, then combines results.
3. Lets you define a **schema** (like a Pydantic model) or a simpler “block” extraction approach.
@@ -18,13 +18,19 @@ In some cases, you need to extract **complex or unstructured** information from
---
-## 2. Provider-Agnostic via LightLLM
+## 2. Provider-Agnostic via LiteLLM
-Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LightLLM supports is fair game. You just provide:
+You can use `LlmConfig` to quickly configure multiple LLM variations and experiment with them to find the optimal one for your use case. You can read more about `LlmConfig` [here](/api/parameters).
+
+```python
+llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
+```
+
+Crawl4AI uses a “provider string” (e.g., `"openai/gpt-4o"`, `"ollama/llama2.0"`, `"aws/titan"`) to identify your LLM. **Any** model that LiteLLM supports is fair game. You just provide:
- **`provider`**: The `/` identifier (e.g., `"openai/gpt-4"`, `"ollama/llama2"`, `"huggingface/google-flan"`, etc.).
- **`api_token`**: If needed (for OpenAI, HuggingFace, etc.); local models or Ollama might not require it.
-- **`api_base`** (optional): If your provider has a custom endpoint.
+- **`base_url`** (optional): If your provider has a custom endpoint.
This means you **aren’t locked** into a single LLM vendor. Switch or experiment easily.
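+
+For example, pointing the same strategy at a local Ollama model is just a different `LlmConfig` (a sketch; Ollama's default local endpoint is assumed):
+
+```python
+llm_config = LlmConfig(
+    provider="ollama/llama2",            # provider string, as above
+    base_url="http://localhost:11434",   # assumed local Ollama endpoint
+    # api_token is not required for local models
+)
+```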
@@ -52,20 +58,19 @@ For structured data, `"schema"` is recommended. You provide `schema=YourPydantic
Below is an overview of important LLM extraction parameters. All are typically set inside `LLMExtractionStrategy(...)`. You then put that strategy in your `CrawlerRunConfig(..., extraction_strategy=...)`.
-1. **`provider`** (str): e.g., `"openai/gpt-4"`, `"ollama/llama2"`.
-2. **`api_token`** (str): The API key or token for that model. May not be needed for local models.
-3. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`.
-4. **`extraction_type`** (str): `"schema"` or `"block"`.
-5. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.”
-6. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM.
-7. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity.
-8. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`.
-9. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include:
+1. **`llmConfig`** (LlmConfig): The LLM configuration, e.g., `LlmConfig(provider="openai/gpt-4")` or `LlmConfig(provider="ollama/llama2")`.
+2. **`schema`** (dict): A JSON schema describing the fields you want. Usually generated by `YourModel.model_json_schema()`.
+3. **`extraction_type`** (str): `"schema"` or `"block"`.
+4. **`instruction`** (str): Prompt text telling the LLM what you want extracted. E.g., “Extract these fields as a JSON array.”
+5. **`chunk_token_threshold`** (int): Maximum tokens per chunk. If your content is huge, you can break it up for the LLM.
+6. **`overlap_rate`** (float): Overlap ratio between adjacent chunks. E.g., `0.1` means 10% of each chunk is repeated to preserve context continuity.
+7. **`apply_chunking`** (bool): Set `True` to chunk automatically. If you want a single pass, set `False`.
+8. **`input_format`** (str): Determines **which** crawler result is passed to the LLM. Options include:
- `"markdown"`: The raw markdown (default).
- `"fit_markdown"`: The filtered “fit” markdown if you used a content filter.
- `"html"`: The cleaned or raw HTML.
-10. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc.
-11. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known).
+9. **`extra_args`** (dict): Additional LLM parameters like `temperature`, `max_tokens`, `top_p`, etc.
+10. **`show_usage()`**: A method you can call to print out usage info (token usage per chunk, total cost if known).
**Example**:
@@ -233,8 +238,7 @@ class KnowledgeGraph(BaseModel):
async def main():
# LLM extraction strategy
llm_strat = LLMExtractionStrategy(
- provider="openai/gpt-4",
- api_token=os.getenv('OPENAI_API_KEY'),
+ llmConfig = LlmConfig(provider="openai/gpt-4", api_token=os.getenv('OPENAI_API_KEY')),
schema=KnowledgeGraph.schema_json(),
extraction_type="schema",
instruction="Extract entities and relationships from the content. Return valid JSON.",
@@ -286,7 +290,7 @@ if __name__ == "__main__":
## 11. Conclusion
-**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LightLLM. It’s perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it’s **slower** and potentially costlier than schema-based approaches. Keep these tips in mind:
+**LLM-based extraction** in Crawl4AI is **provider-agnostic**, letting you choose from hundreds of models via LiteLLM. It’s perfect for **semantically complex** tasks or generating advanced structures like knowledge graphs. However, it’s **slower** and potentially costlier than schema-based approaches. Keep these tips in mind:
- Put your LLM strategy **in `CrawlerRunConfig`**.
- Use **`input_format`** to pick which form (markdown, HTML, fit_markdown) the LLM sees.
@@ -317,4 +321,4 @@ If your site’s data is consistent or repetitive, consider [`JsonCssExtractionS
---
-That’s it for **Extracting JSON (LLM)**—now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling!
\ No newline at end of file
+That’s it for **Extracting JSON (LLM)**—now you can harness AI to parse, classify, or reorganize data on the web. Happy crawling!
diff --git a/docs/md_v2/index.md b/docs/md_v2/index.md
index 7a230d5d..4e54da7d 100644
--- a/docs/md_v2/index.md
+++ b/docs/md_v2/index.md
@@ -72,6 +72,14 @@ asyncio.run(main())
---
+## Video Tutorial
+
+*(Video embed)*
+
+---
+
## What Does Crawl4AI Do?
Crawl4AI is a feature-rich crawler and scraper that aims to:
diff --git a/docs/tutorials/coming_soon.md b/docs/tutorials/coming_soon.md
new file mode 100644
index 00000000..e69de29b
diff --git a/mkdocs.yml b/mkdocs.yml
index 3082d041..23f4ceda 100644
--- a/mkdocs.yml
+++ b/mkdocs.yml
@@ -1,4 +1,4 @@
-site_name: Crawl4AI Documentation (v0.5.x)
+site_name: Crawl4AI Documentation (v0.6.x)
site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
site_url: https://docs.crawl4ai.com
repo_url: https://github.com/unclecode/crawl4ai
@@ -7,10 +7,12 @@ docs_dir: docs/md_v2
nav:
- Home: 'index.md'
+ - "Ask AI": "core/ask-ai.md"
+ - "Quick Start": "core/quickstart.md"
+ - "Code Examples": "core/examples.md"
- Setup & Installation:
- "Installation": "core/installation.md"
- "Docker Deployment": "core/docker-deployment.md"
- - "Quick Start": "core/quickstart.md"
- "Blog & Changelog":
- "Blog Home": "blog/index.md"
- "Changelog": "https://github.com/unclecode/crawl4ai/blob/main/CHANGELOG.md"
@@ -38,6 +40,7 @@ nav:
- "Crawl Dispatcher": "advanced/crawl-dispatcher.md"
- "Identity Based Crawling": "advanced/identity-based-crawling.md"
- "SSL Certificate": "advanced/ssl-certificate.md"
+ - "Network & Console Capture": "advanced/network-console-capture.md"
- Extraction:
- "LLM-Free Strategies": "extraction/no-llm-strategies.md"
- "LLM Strategies": "extraction/llm-strategies.md"
@@ -75,6 +78,7 @@ extra:
version: !ENV [CRAWL4AI_VERSION, 'development']
extra_css:
+ - assets/layout.css
- assets/styles.css
- assets/highlight.css
- assets/dmvendor.css
@@ -82,4 +86,10 @@ extra_css:
extra_javascript:
- assets/highlight.min.js
- assets/highlight_init.js
- - https://buttons.github.io/buttons.js
\ No newline at end of file
+ - https://buttons.github.io/buttons.js
+ - assets/toc.js
+ - assets/github_stats.js
+ - assets/selection_ask_ai.js
+ - assets/copy_code.js
+ - assets/floating_ask_ai_button.js
+ - assets/mobile_menu.js
\ No newline at end of file
diff --git a/prompts/prompt_net_requests.md b/prompts/prompt_net_requests.md
new file mode 100644
index 00000000..d033591e
--- /dev/null
+++ b/prompts/prompt_net_requests.md
@@ -0,0 +1,489 @@
+I want to enhance the `AsyncPlaywrightCrawlerStrategy` to optionally capture network requests and console messages during a crawl, storing them in the final `CrawlResult`.
+
+Here's a breakdown of the proposed changes across the relevant files:
+
+**1. Configuration (`crawl4ai/async_configs.py`)**
+
+* **Goal:** Add flags to `CrawlerRunConfig` to enable/disable capturing.
+* **Changes:**
+ * Add two new boolean attributes to `CrawlerRunConfig`:
+ * `capture_network_requests: bool = False`
+ * `capture_console_messages: bool = False`
+ * Update `__init__`, `from_kwargs`, `to_dict`, and implicitly `clone`/`dump`/`load` to include these new attributes.
+
+```python
+# ==== File: crawl4ai/async_configs.py ====
+# ... (imports) ...
+
+class CrawlerRunConfig():
+ # ... (existing attributes) ...
+
+ # NEW: Network and Console Capturing Parameters
+ capture_network_requests: bool = False
+ capture_console_messages: bool = False
+
+ # Experimental Parameters
+    experimental: Dict[str, Any] = None
+
+ def __init__(
+ self,
+ # ... (existing parameters) ...
+
+ # NEW: Network and Console Capturing Parameters
+ capture_network_requests: bool = False,
+ capture_console_messages: bool = False,
+
+ # Experimental Parameters
+ experimental: Dict[str, Any] = None,
+ ):
+ # ... (existing assignments) ...
+
+ # NEW: Assign new parameters
+ self.capture_network_requests = capture_network_requests
+ self.capture_console_messages = capture_console_messages
+
+ # Experimental Parameters
+ self.experimental = experimental or {}
+
+ # ... (rest of __init__) ...
+
+ @staticmethod
+ def from_kwargs(kwargs: dict) -> "CrawlerRunConfig":
+ return CrawlerRunConfig(
+ # ... (existing kwargs gets) ...
+
+ # NEW: Get new parameters
+ capture_network_requests=kwargs.get("capture_network_requests", False),
+ capture_console_messages=kwargs.get("capture_console_messages", False),
+
+ # Experimental Parameters
+ experimental=kwargs.get("experimental"),
+ )
+
+ def to_dict(self):
+ return {
+ # ... (existing dict entries) ...
+
+ # NEW: Add new parameters to dict
+ "capture_network_requests": self.capture_network_requests,
+ "capture_console_messages": self.capture_console_messages,
+
+ "experimental": self.experimental,
+ }
+
+ # clone(), dump(), load() should work automatically if they rely on to_dict() and from_kwargs()
+ # or the serialization logic correctly handles all attributes.
+```
+
+**2. Data Models (`crawl4ai/models.py`)**
+
+* **Goal:** Add fields to store the captured data in the response/result objects.
+* **Changes:**
+ * Add `network_requests: Optional[List[Dict[str, Any]]] = None` and `console_messages: Optional[List[Dict[str, Any]]] = None` to `AsyncCrawlResponse`.
+ * Add the same fields to `CrawlResult`.
+
+```python
+# ==== File: crawl4ai/models.py ====
+# ... (imports) ...
+
+# ... (Existing dataclasses/models) ...
+
+class AsyncCrawlResponse(BaseModel):
+ html: str
+ response_headers: Dict[str, str]
+ js_execution_result: Optional[Dict[str, Any]] = None
+ status_code: int
+ screenshot: Optional[str] = None
+ pdf_data: Optional[bytes] = None
+ get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
+ downloaded_files: Optional[List[str]] = None
+ ssl_certificate: Optional[SSLCertificate] = None
+ redirected_url: Optional[str] = None
+ # NEW: Fields for captured data
+ network_requests: Optional[List[Dict[str, Any]]] = None
+ console_messages: Optional[List[Dict[str, Any]]] = None
+
+ class Config:
+ arbitrary_types_allowed = True
+
+# ... (Existing models like MediaItem, Link, etc.) ...
+
+class CrawlResult(BaseModel):
+ url: str
+ html: str
+ success: bool
+ cleaned_html: Optional[str] = None
+ media: Dict[str, List[Dict]] = {}
+ links: Dict[str, List[Dict]] = {}
+ downloaded_files: Optional[List[str]] = None
+ js_execution_result: Optional[Dict[str, Any]] = None
+ screenshot: Optional[str] = None
+ pdf: Optional[bytes] = None
+ mhtml: Optional[str] = None # Added mhtml based on the provided models.py
+ _markdown: Optional[MarkdownGenerationResult] = PrivateAttr(default=None)
+ extracted_content: Optional[str] = None
+ metadata: Optional[dict] = None
+ error_message: Optional[str] = None
+ session_id: Optional[str] = None
+ response_headers: Optional[dict] = None
+ status_code: Optional[int] = None
+ ssl_certificate: Optional[SSLCertificate] = None
+ dispatch_result: Optional[DispatchResult] = None
+ redirected_url: Optional[str] = None
+ # NEW: Fields for captured data
+ network_requests: Optional[List[Dict[str, Any]]] = None
+ console_messages: Optional[List[Dict[str, Any]]] = None
+
+ class Config:
+ arbitrary_types_allowed = True
+
+ # ... (Existing __init__, properties, model_dump for markdown compatibility) ...
+
+# ... (Rest of the models) ...
+```
+
+**3. Crawler Strategy (`crawl4ai/async_crawler_strategy.py`)**
+
+* **Goal:** Implement the actual capturing logic within `AsyncPlaywrightCrawlerStrategy._crawl_web`.
+* **Changes:**
+ * Inside `_crawl_web`, initialize empty lists `captured_requests = []` and `captured_console = []`.
+ * Conditionally attach Playwright event listeners (`page.on(...)`) based on the `config.capture_network_requests` and `config.capture_console_messages` flags.
+ * Define handler functions for these listeners to extract relevant data and append it to the respective lists. Include timestamps.
+ * Pass the captured lists to the `AsyncCrawlResponse` constructor at the end of the method.
+
+```python
+# ==== File: crawl4ai/async_crawler_strategy.py ====
+# ... (imports) ...
+import time # Make sure time is imported
+
+class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
+ # ... (existing methods like __init__, start, close, etc.) ...
+
+ async def _crawl_web(
+ self, url: str, config: CrawlerRunConfig
+ ) -> AsyncCrawlResponse:
+ """
+ Internal method to crawl web URLs with the specified configuration.
+ Includes optional network and console capturing. # MODIFIED DOCSTRING
+ """
+ config.url = url
+ response_headers = {}
+ execution_result = None
+ status_code = None
+ redirected_url = url
+
+ # Reset downloaded files list for new crawl
+ self._downloaded_files = []
+
+ # Initialize capture lists - IMPORTANT: Reset per crawl
+ captured_requests: List[Dict[str, Any]] = []
+ captured_console: List[Dict[str, Any]] = []
+
+ # Handle user agent ... (existing code) ...
+
+ # Get page for session
+ page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
+
+ # ... (existing code for cookies, navigator overrides, hooks) ...
+
+ # --- Setup Capturing Listeners ---
+ # NOTE: These listeners are attached *before* page.goto()
+
+ # Network Request Capturing
+ if config.capture_network_requests:
+ async def handle_request_capture(request):
+ try:
+ post_data_str = None
+ try:
+ # Be cautious with large post data
+ post_data = request.post_data_buffer
+ if post_data:
+ # Attempt to decode, fallback to base64 or size indication
+ try:
+ post_data_str = post_data.decode('utf-8', errors='replace')
+ except UnicodeDecodeError:
+ post_data_str = f"[Binary data: {len(post_data)} bytes]"
+ except Exception:
+ post_data_str = "[Error retrieving post data]"
+
+ captured_requests.append({
+ "event_type": "request",
+ "url": request.url,
+ "method": request.method,
+ "headers": dict(request.headers), # Convert Header dict
+ "post_data": post_data_str,
+ "resource_type": request.resource_type,
+ "is_navigation_request": request.is_navigation_request(),
+ "timestamp": time.time()
+ })
+ except Exception as e:
+ self.logger.warning(f"Error capturing request details for {request.url}: {e}", tag="CAPTURE")
+ captured_requests.append({"event_type": "request_capture_error", "url": request.url, "error": str(e), "timestamp": time.time()})
+
+ async def handle_response_capture(response):
+ try:
+ # Avoid capturing full response body by default due to size/security
+ # security_details = await response.security_details() # Optional: More SSL info
+ captured_requests.append({
+ "event_type": "response",
+ "url": response.url,
+ "status": response.status,
+ "status_text": response.status_text,
+ "headers": dict(response.headers), # Convert Header dict
+ "from_service_worker": response.from_service_worker,
+ # "security_details": security_details, # Uncomment if needed
+ "request_timing": response.request.timing, # Detailed timing info
+ "timestamp": time.time()
+ })
+ except Exception as e:
+ self.logger.warning(f"Error capturing response details for {response.url}: {e}", tag="CAPTURE")
+ captured_requests.append({"event_type": "response_capture_error", "url": response.url, "error": str(e), "timestamp": time.time()})
+
+ async def handle_request_failed_capture(request):
+ try:
+ captured_requests.append({
+ "event_type": "request_failed",
+ "url": request.url,
+ "method": request.method,
+ "resource_type": request.resource_type,
+ "failure_text": request.failure.error_text if request.failure else "Unknown failure",
+ "timestamp": time.time()
+ })
+ except Exception as e:
+ self.logger.warning(f"Error capturing request failed details for {request.url}: {e}", tag="CAPTURE")
+ captured_requests.append({"event_type": "request_failed_capture_error", "url": request.url, "error": str(e), "timestamp": time.time()})
+
+ page.on("request", handle_request_capture)
+ page.on("response", handle_response_capture)
+ page.on("requestfailed", handle_request_failed_capture)
+
+ # Console Message Capturing
+ if config.capture_console_messages:
+            async def handle_console_capture(msg):
+                try:
+                    location = msg.location  # Property (a dict) in Playwright's Python API, not a method
+                    # Attempt to resolve JSHandle args to primitive values.
+                    # JSHandle.json_value() is a coroutine in the async API, hence the async handler.
+                    resolved_args = []
+                    try:
+                        for arg in msg.args:
+                            resolved_args.append(await arg.json_value())  # May fail for complex objects
+                    except Exception:
+                        resolved_args.append("[Could not resolve JSHandle args]")
+
+                    captured_console.append({
+                        "type": msg.type,  # Property, e.g., 'log', 'error', 'warning'
+                        "text": msg.text,  # Property holding the rendered message text
+                        "args": resolved_args,  # Captured arguments
+                        "location": f"{location['url']}:{location['lineNumber']}:{location['columnNumber']}" if location else "N/A",
+                        "timestamp": time.time()
+                    })
+                except Exception as e:
+                    self.logger.warning(f"Error capturing console message: {e}", tag="CAPTURE")
+                    captured_console.append({"type": "console_capture_error", "error": str(e), "timestamp": time.time()})
+
+ def handle_pageerror_capture(err):
+ try:
+ captured_console.append({
+ "type": "error", # Consistent type for page errors
+ "text": err.message,
+ "stack": err.stack,
+ "timestamp": time.time()
+ })
+ except Exception as e:
+ self.logger.warning(f"Error capturing page error: {e}", tag="CAPTURE")
+ captured_console.append({"type": "pageerror_capture_error", "error": str(e), "timestamp": time.time()})
+
+ page.on("console", handle_console_capture)
+ page.on("pageerror", handle_pageerror_capture)
+ # --- End Setup Capturing Listeners ---
+
+
+ # Set up console logging if requested (Keep original logging logic separate or merge carefully)
+ if config.log_console:
+ # ... (original log_console setup using page.on(...) remains here) ...
+ # This allows logging to screen *and* capturing to the list if both flags are True
+ def log_consol(msg, console_log_type="debug"):
+ # ... existing implementation ...
+ pass # Placeholder for existing code
+
+ page.on("console", lambda msg: log_consol(msg, "debug"))
+ page.on("pageerror", lambda e: log_consol(e, "error"))
+
+
+ try:
+ # ... (existing code for SSL, downloads, goto, waits, JS execution, etc.) ...
+
+ # Get final HTML content
+ # ... (existing code for selector logic or page.content()) ...
+ if config.css_selector:
+ # ... existing selector logic ...
+                    html = "<div>\n" + "\n".join(html_parts) + "\n</div>"  # Wrap selected fragments in a container element
+ else:
+ html = await page.content()
+
+ await self.execute_hook(
+ "before_return_html", page=page, html=html, context=context, config=config
+ )
+
+ # Handle PDF and screenshot generation
+ # ... (existing code) ...
+
+ # Define delayed content getter
+ # ... (existing code) ...
+
+ # Return complete response - ADD CAPTURED DATA HERE
+ return AsyncCrawlResponse(
+ html=html,
+ response_headers=response_headers,
+ js_execution_result=execution_result,
+ status_code=status_code,
+ screenshot=screenshot_data,
+ pdf_data=pdf_data,
+ get_delayed_content=get_delayed_content,
+ ssl_certificate=ssl_cert,
+ downloaded_files=(
+ self._downloaded_files if self._downloaded_files else None
+ ),
+ redirected_url=redirected_url,
+ # NEW: Pass captured data conditionally
+ network_requests=captured_requests if config.capture_network_requests else None,
+ console_messages=captured_console if config.capture_console_messages else None,
+ )
+
+ except Exception as e:
+ raise e # Re-raise the original exception
+
+ finally:
+ # If no session_id is given we should close the page
+ if not config.session_id:
+ # Detach listeners before closing to prevent potential errors during close
+ if config.capture_network_requests:
+ page.remove_listener("request", handle_request_capture)
+ page.remove_listener("response", handle_response_capture)
+ page.remove_listener("requestfailed", handle_request_failed_capture)
+ if config.capture_console_messages:
+ page.remove_listener("console", handle_console_capture)
+ page.remove_listener("pageerror", handle_pageerror_capture)
+ # Also remove logging listeners if they were attached
+ if config.log_console:
+ # Need to figure out how to remove the lambdas if necessary,
+ # or ensure they don't cause issues on close. Often, it's fine.
+ pass
+
+ await page.close()
+
+ # ... (rest of AsyncPlaywrightCrawlerStrategy methods) ...
+
+```
+
+**4. Core Crawler (`crawl4ai/async_webcrawler.py`)**
+
+* **Goal:** Ensure the captured data from `AsyncCrawlResponse` is transferred to the final `CrawlResult`.
+* **Changes:**
+ * In `arun`, when processing a non-cached result (inside the `if not cached_result or not html:` block), after receiving `async_response` and calling `aprocess_html` to get `crawl_result`, copy the `network_requests` and `console_messages` from `async_response` to `crawl_result`.
+
+```python
+# ==== File: crawl4ai/async_webcrawler.py ====
+# ... (imports) ...
+
+class AsyncWebCrawler:
+ # ... (existing methods) ...
+
+ async def arun(
+ self,
+ url: str,
+ config: CrawlerRunConfig = None,
+ **kwargs,
+ ) -> RunManyReturn:
+ # ... (existing setup, cache check) ...
+
+ async with self._lock or self.nullcontext():
+ try:
+ # ... (existing logging, cache context setup) ...
+
+ if cached_result:
+ # ... (existing cache handling logic) ...
+ # Note: Captured network/console usually not useful from cache
+ # Ensure they are None or empty if read from cache, unless stored explicitly
+ cached_result.network_requests = cached_result.network_requests or None
+ cached_result.console_messages = cached_result.console_messages or None
+ # ... (rest of cache logic) ...
+
+ # Fetch fresh content if needed
+ if not cached_result or not html:
+ t1 = time.perf_counter()
+
+ # ... (existing user agent update, robots.txt check) ...
+
+ ##############################
+ # Call CrawlerStrategy.crawl #
+ ##############################
+ async_response = await self.crawler_strategy.crawl(
+ url,
+ config=config,
+ )
+
+ # ... (existing assignment of html, screenshot, pdf, js_result from async_response) ...
+
+ t2 = time.perf_counter()
+ # ... (existing logging) ...
+
+ ###############################################################
+ # Process the HTML content, Call CrawlerStrategy.process_html #
+ ###############################################################
+ crawl_result: CrawlResult = await self.aprocess_html(
+ # ... (existing args) ...
+ )
+
+ # --- Transfer data from AsyncCrawlResponse to CrawlResult ---
+ crawl_result.status_code = async_response.status_code
+ crawl_result.redirected_url = async_response.redirected_url or url
+ crawl_result.response_headers = async_response.response_headers
+ crawl_result.downloaded_files = async_response.downloaded_files
+ crawl_result.js_execution_result = js_execution_result
+ crawl_result.ssl_certificate = async_response.ssl_certificate
+ # NEW: Copy captured data
+ crawl_result.network_requests = async_response.network_requests
+ crawl_result.console_messages = async_response.console_messages
+ # ------------------------------------------------------------
+
+ crawl_result.success = bool(html)
+ crawl_result.session_id = getattr(config, "session_id", None)
+
+ # ... (existing logging) ...
+
+ # Update cache if appropriate
+ if cache_context.should_write() and not bool(cached_result):
+ # crawl_result now includes network/console data if captured
+ await async_db_manager.acache_url(crawl_result)
+
+ return CrawlResultContainer(crawl_result)
+
+ else: # Cached result was used
+ # ... (existing logging for cache hit) ...
+ cached_result.success = bool(html)
+ cached_result.session_id = getattr(config, "session_id", None)
+ cached_result.redirected_url = cached_result.redirected_url or url
+ return CrawlResultContainer(cached_result)
+
+ except Exception as e:
+ # ... (existing error handling) ...
+ return CrawlResultContainer(
+ CrawlResult(
+ url=url, html="", success=False, error_message=error_message
+ )
+ )
+
+ # ... (aprocess_html remains unchanged regarding capture) ...
+
+ # ... (arun_many remains unchanged regarding capture) ...
+```
+
+**Summary of Changes:**
+
+1. **Configuration:** Added `capture_network_requests` and `capture_console_messages` flags to `CrawlerRunConfig`.
+2. **Models:** Added corresponding `network_requests` and `console_messages` fields (List of Dicts) to `AsyncCrawlResponse` and `CrawlResult`.
+3. **Strategy:** Implemented conditional event listeners in `AsyncPlaywrightCrawlerStrategy._crawl_web` to capture data into lists when flags are true. Populated these fields in the returned `AsyncCrawlResponse`. Added basic error handling within capture handlers. Added timestamps.
+4. **Crawler:** Modified `AsyncWebCrawler.arun` to copy the captured data from `AsyncCrawlResponse` into the final `CrawlResult` for non-cached fetches.
+
+This approach keeps the capturing logic contained within the Playwright strategy, uses clear configuration flags, and integrates the results into the existing data flow. The data format (list of dictionaries) is flexible for storing varied information from requests/responses/console messages.
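+
+For reference, a quick usage sketch once these changes land (flag and field names exactly as proposed above):
+
+```python
+import asyncio
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+
+async def main():
+    config = CrawlerRunConfig(
+        capture_network_requests=True,
+        capture_console_messages=True,
+    )
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun("https://example.com", config=config)
+        if result.success:
+            print(f"network events: {len(result.network_requests or [])}")
+            print(f"console messages: {len(result.console_messages or [])}")
+
+asyncio.run(main())
+```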
\ No newline at end of file
diff --git a/pyproject.toml b/pyproject.toml
index ad07548d..be44397e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ dynamic = ["version"]
description = "🚀🤖 Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
readme = "README.md"
requires-python = ">=3.9"
-license = {text = "MIT"}
+license = "Apache-2.0"
authors = [
{name = "Unclecode", email = "unclecode@kidocode.com"}
]
@@ -40,14 +40,14 @@ dependencies = [
"fake-useragent>=2.0.3",
"click>=8.1.7",
"pyperclip>=1.8.2",
- "faust-cchardet>=2.1.19",
+ "chardet>=5.2.0",
"aiohttp>=3.11.11",
+ "brotli>=1.1.0",
"humanize>=4.10.0",
]
classifiers = [
"Development Status :: 4 - Beta",
"Intended Audience :: Developers",
- "License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
diff --git a/requirements.txt b/requirements.txt
index 8ad6bc41..4aa2dbff 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -21,5 +21,6 @@ psutil>=6.1.1
nltk>=3.9.1
rich>=13.9.4
cssselect>=1.2.0
-faust-cchardet>=2.1.19
+chardet>=5.2.0
+brotli>=1.1.0
fake-useragent>=2.2.0
\ No newline at end of file
diff --git a/setup.py b/setup.py
index 16b1b53c..a0b91041 100644
--- a/setup.py
+++ b/setup.py
@@ -49,13 +49,12 @@ setup(
url="https://github.com/unclecode/crawl4ai",
author="Unclecode",
author_email="unclecode@kidocode.com",
- license="MIT",
+ license="Apache-2.0",
packages=find_packages(),
package_data={"crawl4ai": ["js_snippet/*.js"]},
classifiers=[
"Development Status :: 3 - Alpha",
"Intended Audience :: Developers",
- "License :: OSI Approved :: Apache Software License",
"Programming Language :: Python :: 3",
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
diff --git a/tests/docker/test_rest_api_deep_crawl.py b/tests/docker/test_rest_api_deep_crawl.py
new file mode 100644
index 00000000..c535727f
--- /dev/null
+++ b/tests/docker/test_rest_api_deep_crawl.py
@@ -0,0 +1,596 @@
+# ==== File: test_rest_api_deep_crawl.py ====
+
+import pytest
+import pytest_asyncio
+import httpx
+import json
+import asyncio
+import os
+from typing import List, Dict, Any, AsyncGenerator
+
+from dotenv import load_dotenv
+load_dotenv() # Load environment variables from .env file if present
+
+# --- Test Configuration ---
+BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Server running in Docker (published port)
+# BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Uncomment if the server is running in dev debug mode
+DEEP_CRAWL_BASE_URL = "https://docs.crawl4ai.com/samples/deepcrawl/"
+DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com" # Used for domain filter
+
+# --- Helper Functions ---
+def load_proxies_from_env() -> List[Dict]:
+ """Load proxies from PROXIES environment variable"""
+ proxies = []
+ proxies_str = os.getenv("PROXIES", "")
+ if not proxies_str:
+ print("PROXIES environment variable not set or empty.")
+ return proxies
+ try:
+ proxy_list = proxies_str.split(",")
+ for proxy in proxy_list:
+ proxy = proxy.strip()
+ if not proxy:
+ continue
+ parts = proxy.split(":")
+ if len(parts) == 4:
+ ip, port, username, password = parts
+ proxies.append({
+ "server": f"http://{ip}:{port}", # Assuming http, adjust if needed
+ "username": username,
+ "password": password,
+ "ip": ip # Store original IP if available
+ })
+ elif len(parts) == 2: # ip:port only
+ ip, port = parts
+ proxies.append({
+ "server": f"http://{ip}:{port}",
+ "ip": ip
+ })
+ else:
+ print(f"Skipping invalid proxy string format: {proxy}")
+
+ except Exception as e:
+ print(f"Error loading proxies from environment: {e}")
+ return proxies
+
+
+async def check_server_health(client: httpx.AsyncClient):
+ """Check if the server is healthy before running tests."""
+ try:
+ response = await client.get("/health")
+ response.raise_for_status()
+ print(f"\nServer healthy: {response.json()}")
+ return True
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
+ pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
+
+async def assert_crawl_result_structure(result: Dict[str, Any], check_ssl=False):
+ """Asserts the basic structure of a single crawl result."""
+ assert isinstance(result, dict)
+ assert "url" in result
+ assert "success" in result
+ assert "html" in result # Basic crawls should return HTML
+ assert "metadata" in result
+ assert isinstance(result["metadata"], dict)
+ assert "depth" in result["metadata"] # Deep crawls add depth
+
+ if check_ssl:
+ assert "ssl_certificate" in result # Check if SSL info is present
+ assert isinstance(result["ssl_certificate"], dict) or result["ssl_certificate"] is None
+
+
+async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
+ """Processes an NDJSON streaming response."""
+ results = []
+ completed = False
+ async for line in response.aiter_lines():
+ if line:
+ try:
+ data = json.loads(line)
+ if data.get("status") == "completed":
+ completed = True
+ break # Stop processing after completion marker
+ elif data.get("url"): # Ensure it looks like a result object
+ results.append(data)
+ else:
+ print(f"Received non-result JSON line: {data}") # Log other status messages if needed
+ except json.JSONDecodeError:
+ pytest.fail(f"Failed to decode JSON line: {line}")
+ assert completed, "Streaming response did not end with a completion marker."
+ return results
+
+
+# --- Pytest Fixtures ---
+@pytest_asyncio.fixture(scope="function")
+async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
+ """Provides an async HTTP client"""
+ # Increased timeout for potentially longer deep crawls
+ async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
+ yield client
+ # No explicit close needed with 'async with'
+
+# --- Test Class ---
+@pytest.mark.asyncio
+class TestDeepCrawlEndpoints:
+
+ @pytest_asyncio.fixture(autouse=True)
+ async def check_health_before_tests(self, async_client: httpx.AsyncClient):
+ """Fixture to ensure server is healthy before each test in the class."""
+ await check_server_health(async_client)
+
+ # 1. Basic Deep Crawl
+ async def test_deep_crawl_basic_bfs(self, async_client: httpx.AsyncClient):
+ """Test BFS deep crawl with limited depth and pages."""
+ max_depth = 1
+ max_pages = 3 # start_url + 2 more
+ payload = {
+ "urls": [DEEP_CRAWL_BASE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False,
+ "cache_mode": "BYPASS", # Use string value for CacheMode
+ "deep_crawl_strategy": {
+ "type": "BFSDeepCrawlStrategy",
+ "params": {
+ "max_depth": max_depth,
+ "max_pages": max_pages,
+ # Minimal filters for basic test
+ "filter_chain": {
+ "type": "FilterChain",
+ "params": {
+ "filters": [
+ {
+ "type": "DomainFilter",
+ "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
+ }
+ ]
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ response = await async_client.post("/crawl", json=payload)
+ response.raise_for_status()
+ data = response.json()
+
+ assert data["success"] is True
+ assert isinstance(data["results"], list)
+ assert len(data["results"]) > 1 # Should be more than just the start URL
+ assert len(data["results"]) <= max_pages # Respect max_pages
+
+ found_depth_0 = False
+ found_depth_1 = False
+ for result in data["results"]:
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert DEEP_CRAWL_DOMAIN in result["url"]
+ depth = result["metadata"]["depth"]
+ assert depth <= max_depth
+ if depth == 0: found_depth_0 = True
+ if depth == 1: found_depth_1 = True
+
+ assert found_depth_0
+ assert found_depth_1
+
+ # 2. Deep Crawl with Filtering
+ async def test_deep_crawl_with_filters(self, async_client: httpx.AsyncClient):
+ """Test BFS deep crawl with content type and domain filters."""
+ max_depth = 1
+ max_pages = 5
+ payload = {
+ "urls": [DEEP_CRAWL_BASE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False,
+ "cache_mode": "BYPASS",
+ "deep_crawl_strategy": {
+ "type": "BFSDeepCrawlStrategy",
+ "params": {
+ "max_depth": max_depth,
+ "max_pages": max_pages,
+ "filter_chain": {
+ "type": "FilterChain",
+ "params": {
+ "filters": [
+ {
+ "type": "DomainFilter",
+ "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}
+ },
+ {
+ "type": "ContentTypeFilter",
+ "params": {"allowed_types": ["text/html"]}
+ },
+ # Example: Exclude specific paths using regex
+ {
+ "type": "URLPatternFilter",
+ "params": {
+ "patterns": ["*/category-3/*"], # Block category 3
+ "reverse": True # Block if match
+ }
+ }
+ ]
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ response = await async_client.post("/crawl", json=payload)
+ response.raise_for_status()
+ data = response.json()
+
+ assert data["success"] is True
+ assert len(data["results"]) > 0
+ assert len(data["results"]) <= max_pages
+
+ for result in data["results"]:
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert DEEP_CRAWL_DOMAIN in result["url"]
+ assert "category-3" not in result["url"] # Check if filter worked
+ assert result["metadata"]["depth"] <= max_depth
+
+ # 3. Deep Crawl with Scoring
+ async def test_deep_crawl_with_scoring(self, async_client: httpx.AsyncClient):
+ """Test BFS deep crawl with URL scoring."""
+ max_depth = 1
+ max_pages = 4
+ payload = {
+ "urls": [DEEP_CRAWL_BASE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False,
+ "cache_mode": "BYPASS",
+ "deep_crawl_strategy": {
+ "type": "BFSDeepCrawlStrategy",
+ "params": {
+ "max_depth": max_depth,
+ "max_pages": max_pages,
+ "filter_chain": { # Keep basic domain filter
+ "type": "FilterChain",
+ "params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
+ },
+ "url_scorer": { # Add scorer
+ "type": "CompositeScorer",
+ "params": {
+ "scorers": [
+ { # Favor pages with 'product' in the URL
+ "type": "KeywordRelevanceScorer",
+ "params": {"keywords": ["product"], "weight": 1.0}
+ },
+ { # Penalize deep paths slightly
+ "type": "PathDepthScorer",
+ "params": {"optimal_depth": 2, "weight": -0.2}
+ }
+ ]
+ }
+ },
+ # Set a threshold if needed: "score_threshold": 0.1
+ }
+ }
+ }
+ }
+ }
+ response = await async_client.post("/crawl", json=payload)
+ response.raise_for_status()
+ data = response.json()
+
+ assert data["success"] is True
+ assert len(data["results"]) > 0
+ assert len(data["results"]) <= max_pages
+
+ # Check if results seem biased towards products (harder to assert strictly without knowing exact scores)
+ product_urls_found = any("product_" in result["url"] for result in data["results"] if result["metadata"]["depth"] > 0)
+ print(f"Product URLs found among depth > 0 results: {product_urls_found}")
+ # We expect scoring to prioritize product pages if available within limits
+ # assert product_urls_found # This might be too strict depending on site structure and limits
+
+ for result in data["results"]:
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert result["metadata"]["depth"] <= max_depth
+
+ # 4. Deep Crawl with CSS Extraction
+ async def test_deep_crawl_with_css_extraction(self, async_client: httpx.AsyncClient):
+ """Test BFS deep crawl combined with JsonCssExtractionStrategy."""
+ max_depth = 6 # Go deep enough to reach product pages
+ max_pages = 20
+ # Schema to extract product details
+ product_schema = {
+ "name": "ProductDetails",
+ "baseSelector": "div.container", # Base for product page
+ "fields": [
+ {"name": "product_title", "selector": "h1", "type": "text"},
+ {"name": "price", "selector": ".product-price", "type": "text"},
+ {"name": "description", "selector": ".product-description p", "type": "text"},
+ {"name": "specs", "selector": ".product-specs li", "type": "list", "fields":[
+ {"name": "spec_name", "selector": ".spec-name", "type": "text"},
+ {"name": "spec_value", "selector": ".spec-value", "type": "text"}
+ ]}
+ ]
+ }
+ payload = {
+ "urls": [DEEP_CRAWL_BASE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False,
+ "cache_mode": "BYPASS",
+ "extraction_strategy": { # Apply extraction to ALL crawled pages
+ "type": "JsonCssExtractionStrategy",
+ "params": {"schema": {"type": "dict", "value": product_schema}}
+ },
+ "deep_crawl_strategy": {
+ "type": "BFSDeepCrawlStrategy",
+ "params": {
+ "max_depth": max_depth,
+ "max_pages": max_pages,
+ "filter_chain": { # Only crawl HTML on our domain
+ "type": "FilterChain",
+ "params": {
+ "filters": [
+ {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
+ {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
+ ]
+ }
+ }
+ # Optional: Add scoring to prioritize product pages for extraction
+ }
+ }
+ }
+ }
+ }
+ response = await async_client.post("/crawl", json=payload)
+ response.raise_for_status()
+ data = response.json()
+
+ assert data["success"] is True
+ assert len(data["results"]) > 0
+ # assert len(data["results"]) <= max_pages
+
+ found_extracted_product = False
+ for result in data["results"]:
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert "extracted_content" in result
+ if "product_" in result["url"]: # Check product pages specifically
+ assert result["extracted_content"] is not None
+ try:
+ extracted = json.loads(result["extracted_content"])
+ # Schema returns list even if one base match
+ assert isinstance(extracted, list)
+ if extracted:
+ item = extracted[0]
+ assert "product_title" in item and item["product_title"]
+ assert "price" in item and item["price"]
+ # Specs might be empty list if not found
+ assert "specs" in item and isinstance(item["specs"], list)
+ found_extracted_product = True
+ print(f"Extracted product: {item.get('product_title')}")
+ except (json.JSONDecodeError, AssertionError, IndexError) as e:
+ pytest.fail(f"Extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
+ # else:
+ # # Non-product pages might have None or empty list depending on schema match
+ # assert result["extracted_content"] is None or json.loads(result["extracted_content"]) == []
+
+ assert found_extracted_product, "Did not find any pages where product data was successfully extracted."
+
+ # 5. Deep Crawl with LLM Extraction (Requires Server LLM Setup)
+ async def test_deep_crawl_with_llm_extraction(self, async_client: httpx.AsyncClient):
+ """Test BFS deep crawl combined with LLMExtractionStrategy."""
+ max_depth = 1 # Limit depth to keep LLM calls manageable
+ max_pages = 3
+ payload = {
+ "urls": [DEEP_CRAWL_BASE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False,
+ "cache_mode": "BYPASS",
+ "extraction_strategy": { # Apply LLM extraction to crawled pages
+ "type": "LLMExtractionStrategy",
+ "params": {
+ "instruction": "Extract the main H1 title and the text content of the first paragraph.",
+ "llm_config": { # Example override, rely on server default if possible
+ "type": "LLMConfig",
+ "params": {"provider": "openai/gpt-4.1-mini"} # Use a cheaper model for testing
+ },
+ "schema": { # Expected JSON output
+ "type": "dict",
+ "value": {
+ "title": "PageContent", "type": "object",
+ "properties": {
+ "h1_title": {"type": "string"},
+ "first_paragraph": {"type": "string"}
+ }
+ }
+ }
+ }
+ },
+ "deep_crawl_strategy": {
+ "type": "BFSDeepCrawlStrategy",
+ "params": {
+ "max_depth": max_depth,
+ "max_pages": max_pages,
+ "filter_chain": {
+ "type": "FilterChain",
+ "params": {
+ "filters": [
+ {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
+ {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
+ ]
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ try:
+ response = await async_client.post("/crawl", json=payload)
+ response.raise_for_status()
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and LLM API key setup.")
+ except httpx.RequestError as e:
+ pytest.fail(f"Deep Crawl + LLM extraction request failed: {e}.")
+
+
+ assert data["success"] is True
+ assert len(data["results"]) > 0
+ assert len(data["results"]) <= max_pages
+
+ found_llm_extraction = False
+ for result in data["results"]:
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert "extracted_content" in result
+ assert result["extracted_content"] is not None
+ try:
+ extracted = json.loads(result["extracted_content"])
+ if isinstance(extracted, list): extracted = extracted[0] # Handle list output
+ assert isinstance(extracted, dict)
+ assert "h1_title" in extracted # Check keys based on schema
+ assert "first_paragraph" in extracted
+ found_llm_extraction = True
+ print(f"LLM extracted from {result['url']}: Title='{extracted.get('h1_title')}'")
+ except (json.JSONDecodeError, AssertionError, IndexError, TypeError) as e:
+ pytest.fail(f"LLM extraction validation failed for {result['url']}: {e}\nContent: {result['extracted_content']}")
+
+ assert found_llm_extraction, "LLM extraction did not yield expected data on any crawled page."
+
+
+ # 6. Deep Crawl with SSL Certificate Fetching
+ async def test_deep_crawl_with_ssl(self, async_client: httpx.AsyncClient):
+ """Test BFS deep crawl with fetch_ssl_certificate enabled."""
+ max_depth = 0 # Only fetch for start URL to keep test fast
+ max_pages = 1
+ payload = {
+ "urls": [DEEP_CRAWL_BASE_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False,
+ "cache_mode": "BYPASS",
+ "fetch_ssl_certificate": True, # <-- Enable SSL fetching
+ "deep_crawl_strategy": {
+ "type": "BFSDeepCrawlStrategy",
+ "params": {
+ "max_depth": max_depth,
+ "max_pages": max_pages,
+ }
+ }
+ }
+ }
+ }
+ response = await async_client.post("/crawl", json=payload)
+ response.raise_for_status()
+ data = response.json()
+
+ assert data["success"] is True
+ assert len(data["results"]) == 1
+ result = data["results"][0]
+
+ await assert_crawl_result_structure(result, check_ssl=True) # <-- Tell helper to check SSL field
+ assert result["success"] is True
+ # Check if SSL info was actually retrieved
+ if result["ssl_certificate"]:
+ # Assert directly using dictionary keys
+ assert isinstance(result["ssl_certificate"], dict) # Verify it's a dict
+ assert "issuer" in result["ssl_certificate"]
+ assert "subject" in result["ssl_certificate"]
+ # --- MODIFIED ASSERTIONS ---
+ assert "not_before" in result["ssl_certificate"] # Check for the actual key
+ assert "not_after" in result["ssl_certificate"] # Check for the actual key
+ # --- END MODIFICATIONS ---
+ assert "fingerprint" in result["ssl_certificate"] # Check another key
+
+ # This print statement using .get() already works correctly with dictionaries
+ print(f"SSL Issuer Org: {result['ssl_certificate'].get('issuer', {}).get('O', 'N/A')}")
+ print(f"SSL Valid From: {result['ssl_certificate'].get('not_before', 'N/A')}")
+ else:
+ # This part remains the same
+ print("SSL Certificate was null in the result.")
+
+
+ # 7. Deep Crawl with Proxy Rotation (Requires PROXIES env var)
+ async def test_deep_crawl_with_proxies(self, async_client: httpx.AsyncClient):
+ """Test BFS deep crawl using proxy rotation."""
+ proxies = load_proxies_from_env()
+ if not proxies:
+ pytest.skip("Skipping proxy test: PROXIES environment variable not set or empty.")
+
+ print(f"\nTesting with {len(proxies)} proxies loaded from environment.")
+
+ max_depth = 1
+ max_pages = 3
+ payload = {
+ "urls": [DEEP_CRAWL_BASE_URL], # Use the dummy site
+ # Use a BrowserConfig that *might* pick up proxy if set, but rely on CrawlerRunConfig
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False,
+ "cache_mode": "BYPASS",
+ "proxy_rotation_strategy": { # <-- Define the strategy
+ "type": "RoundRobinProxyStrategy",
+ "params": {
+ # Convert ProxyConfig dicts back to the serialized format expected by server
+ "proxies": [{"type": "ProxyConfig", "params": p} for p in proxies]
+ }
+ },
+ "deep_crawl_strategy": {
+ "type": "BFSDeepCrawlStrategy",
+ "params": {
+ "max_depth": max_depth,
+ "max_pages": max_pages,
+ "filter_chain": {
+ "type": "FilterChain",
+ "params": { "filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
+ }
+ }
+ }
+ }
+ }
+ }
+ try:
+ response = await async_client.post("/crawl", json=payload)
+ response.raise_for_status()
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ # Proxies often cause connection errors, catch them
+ pytest.fail(f"Proxy deep crawl failed: {e}. Response: {e.response.text}. Are proxies valid and accessible by the server?")
+ except httpx.RequestError as e:
+ pytest.fail(f"Proxy deep crawl request failed: {e}. Are proxies valid and accessible?")
+
+ assert data["success"] is True
+ assert len(data["results"]) > 0
+ assert len(data["results"]) <= max_pages
+ # Primary assertion is that the crawl succeeded *with* proxy config
+ print(f"Proxy deep crawl completed successfully for {len(data['results'])} pages.")
+
+ # Verifying specific proxy usage requires server logs or custom headers/responses
+
+
+# --- Main Execution Block (for running script directly) ---
+if __name__ == "__main__":
+ pytest_args = ["-v", "-s", __file__]
+ # Example: Run only proxy test
+ # pytest_args.append("-k test_deep_crawl_with_proxies")
+ print(f"Running pytest with args: {pytest_args}")
+ exit_code = pytest.main(pytest_args)
+ print(f"Pytest finished with exit code: {exit_code}")
\ No newline at end of file
diff --git a/tests/docker/test_server_requests.py b/tests/docker/test_server_requests.py
new file mode 100644
index 00000000..56d2ada4
--- /dev/null
+++ b/tests/docker/test_server_requests.py
@@ -0,0 +1,655 @@
+import pytest
+import pytest_asyncio
+import httpx
+import json
+import asyncio
+import os
+from typing import List, Dict, Any, AsyncGenerator
+
+from dotenv import load_dotenv
+load_dotenv()
+
+
+# Optional: import crawl4ai classes directly as a reference when building payloads.
+# You don't strictly NEED these imports for the tests to run against the server,
+# but they help in understanding the structure you are mimicking in JSON.
+from crawl4ai import (
+ BrowserConfig,
+ CrawlerRunConfig,
+ CacheMode,
+ DefaultMarkdownGenerator,
+ PruningContentFilter,
+ BM25ContentFilter,
+ BFSDeepCrawlStrategy,
+ FilterChain,
+ ContentTypeFilter,
+ DomainFilter,
+ CompositeScorer,
+ KeywordRelevanceScorer,
+ PathDepthScorer,
+ JsonCssExtractionStrategy,
+ LLMExtractionStrategy,
+ LLMConfig
+)
+
+# --- Test Configuration ---
+# BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020") # Make base URL configurable
+BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235") # Make base URL configurable
+# Use a known simple HTML page for basic tests
+SIMPLE_HTML_URL = "https://httpbin.org/html"
+# Use a site suitable for scraping tests
+SCRAPE_TARGET_URL = "http://books.toscrape.com/"
+# Use a site with internal links for deep crawl tests
+DEEP_CRAWL_URL = "https://python.org"
+
+# --- Pytest Fixtures ---
+
+# Use the built-in event_loop fixture from pytest_asyncio
+# The custom implementation was causing issues with closing the loop
+
+@pytest_asyncio.fixture(scope="function") # Changed to function scope to avoid event loop issues
+async def async_client() -> AsyncGenerator[httpx.AsyncClient, None]:
+ """Provides an async HTTP client"""
+ client = httpx.AsyncClient(base_url=BASE_URL, timeout=120.0)
+ yield client
+ await client.aclose()
+
+# --- Helper Functions ---
+
+async def check_server_health(client: httpx.AsyncClient):
+ """Check if the server is healthy before running tests."""
+ try:
+ response = await client.get("/health")
+ response.raise_for_status()
+ print(f"\nServer healthy: {response.json()}")
+ return True
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
+ pytest.fail(f"Server health check failed: {e}. Is the server running at {BASE_URL}?", pytrace=False)
+
+async def assert_crawl_result_structure(result: Dict[str, Any]):
+ """Asserts the basic structure of a single crawl result."""
+ assert isinstance(result, dict)
+ assert "url" in result
+ assert "success" in result
+ assert "html" in result
+ # Add more common checks if needed
+
+async def process_streaming_response(response: httpx.Response) -> List[Dict[str, Any]]:
+ """Processes an NDJSON streaming response."""
+ results = []
+ completed = False
+ async for line in response.aiter_lines():
+ if line:
+ try:
+ data = json.loads(line)
+ if data.get("status") == "completed":
+ completed = True
+ break # Stop processing after completion marker
+ else:
+ results.append(data)
+ except json.JSONDecodeError:
+ pytest.fail(f"Failed to decode JSON line: {line}")
+ assert completed, "Streaming response did not end with a completion marker."
+ return results
+
+
+# --- Test Class ---
+
+@pytest.mark.asyncio
+class TestCrawlEndpoints:
+
+ @pytest_asyncio.fixture(autouse=True)
+ async def check_health_before_tests(self, async_client: httpx.AsyncClient):
+ """Fixture to ensure server is healthy before each test in the class."""
+ await check_server_health(async_client)
+
+ # 1. Simple Requests (Primitives)
+ async def test_simple_crawl_single_url(self, async_client: httpx.AsyncClient):
+ """Test /crawl with a single URL and simple config values."""
+ payload = {
+ "urls": [SIMPLE_HTML_URL],
+ "browser_config": {
+ "type": "BrowserConfig",
+ "params": {
+ "headless": True,
+ }
+ },
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False, # Explicitly false for /crawl
+ "screenshot": False,
+ "cache_mode": CacheMode.BYPASS.value # Use enum value
+ }
+ }
+ }
+ try:
+ response = await async_client.post("/crawl", json=payload)
+ print(f"Response status: {response.status_code}")
+ response.raise_for_status()
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ print(f"Server error: {e}")
+ print(f"Response content: {e.response.text}")
+ raise
+
+ assert data["success"] is True
+ assert isinstance(data["results"], list)
+ assert len(data["results"]) == 1
+ result = data["results"][0]
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert result["url"] == SIMPLE_HTML_URL
+ assert "Herman Melville - Moby-Dick
" in result["html"]
+ # We don't specify a markdown generator in this test, so don't make assumptions about markdown field
+ # It might be null, missing, or populated depending on the server's default behavior
+
+ async def test_simple_crawl_single_url_streaming(self, async_client: httpx.AsyncClient):
+ """Test /crawl/stream with a single URL and simple config values."""
+ payload = {
+ "urls": [SIMPLE_HTML_URL],
+ "browser_config": {
+ "type": "BrowserConfig",
+ "params": {
+ "headless": True,
+ }
+ },
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": True, # Must be true for /crawl/stream
+ "screenshot": False,
+ "cache_mode": CacheMode.BYPASS.value
+ }
+ }
+ }
+ async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
+ response.raise_for_status()
+ results = await process_streaming_response(response)
+
+ assert len(results) == 1
+ result = results[0]
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert result["url"] == SIMPLE_HTML_URL
+ assert "Herman Melville - Moby-Dick
" in result["html"]
+
+
+ # 2. Multi-URL and Dispatcher
+ async def test_multi_url_crawl(self, async_client: httpx.AsyncClient):
+ """Test /crawl with multiple URLs, implicitly testing dispatcher."""
+ urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
+ payload = {
+ "urls": urls,
+ "browser_config": {
+ "type": "BrowserConfig",
+ "params": {"headless": True}
+ },
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {"stream": False, "cache_mode": CacheMode.BYPASS.value}
+ }
+ }
+ try:
+ print(f"Sending deep crawl request to server...")
+ response = await async_client.post("/crawl", json=payload)
+ print(f"Response status: {response.status_code}")
+
+ if response.status_code >= 400:
+ error_detail = response.json().get('detail', 'No detail provided')
+ print(f"Error detail: {error_detail}")
+ print(f"Full response: {response.text}")
+
+ response.raise_for_status()
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ print(f"Server error status: {e.response.status_code}")
+ print(f"Server error response: {e.response.text}")
+ try:
+ error_json = e.response.json()
+ print(f"Parsed error: {error_json}")
+            except Exception:
+ print("Could not parse error response as JSON")
+ raise
+
+ assert data["success"] is True
+ assert isinstance(data["results"], list)
+ assert len(data["results"]) == len(urls)
+ for result in data["results"]:
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert result["url"] in urls
+
+ async def test_multi_url_crawl_streaming(self, async_client: httpx.AsyncClient):
+ """Test /crawl/stream with multiple URLs."""
+ urls = [SIMPLE_HTML_URL, "https://httpbin.org/links/10/0"]
+ payload = {
+ "urls": urls,
+ "browser_config": {
+ "type": "BrowserConfig",
+ "params": {"headless": True}
+ },
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {"stream": True, "cache_mode": CacheMode.BYPASS.value}
+ }
+ }
+ async with async_client.stream("POST", "/crawl/stream", json=payload) as response:
+ response.raise_for_status()
+ results = await process_streaming_response(response)
+
+ assert len(results) == len(urls)
+ processed_urls = set()
+ for result in results:
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert result["url"] in urls
+ processed_urls.add(result["url"])
+ assert processed_urls == set(urls) # Ensure all URLs were processed
+
+
+ # 3. Class Values and Nested Classes (Markdown Generator)
+ async def test_crawl_with_markdown_pruning_filter(self, async_client: httpx.AsyncClient):
+ """Test /crawl with MarkdownGenerator using PruningContentFilter."""
+ payload = {
+ "urls": [SIMPLE_HTML_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": CacheMode.ENABLED.value, # Test different cache mode
+ "markdown_generator": {
+ "type": "DefaultMarkdownGenerator",
+ "params": {
+ "content_filter": {
+ "type": "PruningContentFilter",
+ "params": {
+ "threshold": 0.5, # Example param
+ "threshold_type": "relative"
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ try:
+ print(f"Sending deep crawl request to server...")
+ response = await async_client.post("/crawl", json=payload)
+ print(f"Response status: {response.status_code}")
+
+ if response.status_code >= 400:
+ error_detail = response.json().get('detail', 'No detail provided')
+ print(f"Error detail: {error_detail}")
+ print(f"Full response: {response.text}")
+
+ response.raise_for_status()
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ print(f"Server error status: {e.response.status_code}")
+ print(f"Server error response: {e.response.text}")
+ try:
+ error_json = e.response.json()
+ print(f"Parsed error: {error_json}")
+            except Exception:
+ print("Could not parse error response as JSON")
+ raise
+
+ assert data["success"] is True
+ assert len(data["results"]) == 1
+ result = data["results"][0]
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert "markdown" in result
+ assert isinstance(result["markdown"], dict)
+ assert "raw_markdown" in result["markdown"]
+ assert "fit_markdown" in result["markdown"] # Pruning creates fit_markdown
+ assert "Moby-Dick" in result["markdown"]["raw_markdown"]
+ # Fit markdown content might be different/shorter due to pruning
+ assert len(result["markdown"]["fit_markdown"]) <= len(result["markdown"]["raw_markdown"])
+
+ async def test_crawl_with_markdown_bm25_filter(self, async_client: httpx.AsyncClient):
+ """Test /crawl with MarkdownGenerator using BM25ContentFilter."""
+ payload = {
+ "urls": [SIMPLE_HTML_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "markdown_generator": {
+ "type": "DefaultMarkdownGenerator",
+ "params": {
+ "content_filter": {
+ "type": "BM25ContentFilter",
+ "params": {
+ "user_query": "Herman Melville", # Query for BM25
+ "bm25_threshold": 0.1, # Lower threshold to increase matches
+ "language": "english" # Valid parameters
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ try:
+ print(f"Payload for BM25 test: {json.dumps(payload)}")
+ response = await async_client.post("/crawl", json=payload)
+ print(f"Response status: {response.status_code}")
+
+ if response.status_code >= 400:
+ error_detail = response.json().get('detail', 'No detail provided')
+ print(f"Error detail: {error_detail}")
+ print(f"Full response: {response.text}")
+
+ response.raise_for_status()
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ print(f"Server error status: {e.response.status_code}")
+ print(f"Server error response: {e.response.text}")
+ try:
+ error_json = e.response.json()
+ print(f"Parsed error: {error_json}")
+            except Exception:
+ print("Could not parse error response as JSON")
+ raise
+
+ assert data["success"] is True
+ assert len(data["results"]) == 1
+ result = data["results"][0]
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert "markdown" in result
+ assert isinstance(result["markdown"], dict)
+ assert "raw_markdown" in result["markdown"]
+ assert "fit_markdown" in result["markdown"] # BM25 creates fit_markdown
+
+ # Print values for debug
+ print(f"Raw markdown length: {len(result['markdown']['raw_markdown'])}")
+ print(f"Fit markdown length: {len(result['markdown']['fit_markdown'])}")
+
+ # Either fit_markdown has content (possibly including our query terms)
+ # or it might be empty if no good BM25 matches were found
+ # Don't assert specific content since it can be environment-dependent
+
+
+ # 4. Deep Crawling
+ async def test_deep_crawl(self, async_client: httpx.AsyncClient):
+ """Test /crawl with a deep crawl strategy."""
+ payload = {
+ "urls": [DEEP_CRAWL_URL], # Start URL
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "stream": False,
+ "cache_mode": CacheMode.BYPASS.value,
+ "deep_crawl_strategy": {
+ "type": "BFSDeepCrawlStrategy",
+ "params": {
+ "max_depth": 1, # Limit depth for testing speed
+ "max_pages": 5, # Limit pages to crawl
+ "filter_chain": {
+ "type": "FilterChain",
+ "params": {
+ "filters": [
+ {
+ "type": "ContentTypeFilter",
+ "params": {"allowed_types": ["text/html"]}
+ },
+ {
+ "type": "DomainFilter",
+ "params": {"allowed_domains": ["python.org", "docs.python.org"]} # Include important subdomains
+ }
+ ]
+ }
+ },
+ "url_scorer": {
+ "type": "CompositeScorer",
+ "params": {
+ "scorers": [
+ {
+ "type": "KeywordRelevanceScorer",
+ "params": {"keywords": ["documentation", "tutorial"]}
+ },
+ {
+ "type": "PathDepthScorer",
+ "params": {"weight": 0.5, "optimal_depth": 2}
+ }
+ ]
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ try:
+ print(f"Sending deep crawl request to server...")
+ response = await async_client.post("/crawl", json=payload)
+ print(f"Response status: {response.status_code}")
+
+ if response.status_code >= 400:
+ error_detail = response.json().get('detail', 'No detail provided')
+ print(f"Error detail: {error_detail}")
+ print(f"Full response: {response.text}")
+
+ response.raise_for_status()
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ print(f"Server error status: {e.response.status_code}")
+ print(f"Server error response: {e.response.text}")
+ try:
+ error_json = e.response.json()
+ print(f"Parsed error: {error_json}")
+            except Exception:
+ print("Could not parse error response as JSON")
+ raise
+
+ assert data["success"] is True
+ assert isinstance(data["results"], list)
+ # Expect more than 1 result due to deep crawl (start URL + crawled links)
+ assert len(data["results"]) > 1
+ assert len(data["results"]) <= 6 # Start URL + max_links=5
+
+ start_url_found = False
+ crawled_urls_found = False
+ for result in data["results"]:
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+
+ # Print URL for debugging
+ print(f"Crawled URL: {result['url']}")
+
+ # Allow URLs that contain python.org (including subdomains like docs.python.org)
+ assert "python.org" in result["url"]
+ if result["url"] == DEEP_CRAWL_URL:
+ start_url_found = True
+ else:
+ crawled_urls_found = True
+
+ assert start_url_found
+ assert crawled_urls_found
+
+
+ # 5. Extraction without LLM (JSON/CSS)
+ async def test_json_css_extraction(self, async_client: httpx.AsyncClient):
+ """Test /crawl with JsonCssExtractionStrategy."""
+ payload = {
+ "urls": [SCRAPE_TARGET_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": CacheMode.BYPASS.value,
+ "extraction_strategy": {
+ "type": "JsonCssExtractionStrategy",
+ "params": {
+ "schema": {
+ "type": "dict", # IMPORTANT: Wrap schema dict with type/value structure
+ "value": {
+ "name": "BookList",
+ "baseSelector": "ol.row li.col-xs-6", # Select each book item
+ "fields": [
+ {"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"},
+ {"name": "price", "selector": "article.product_pod .price_color", "type": "text"},
+ {"name": "rating", "selector": "article.product_pod p.star-rating", "type": "attribute", "attribute": "class"}
+ ]
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ try:
+ print(f"Sending deep crawl request to server...")
+ response = await async_client.post("/crawl", json=payload)
+ print(f"Response status: {response.status_code}")
+
+ if response.status_code >= 400:
+ error_detail = response.json().get('detail', 'No detail provided')
+ print(f"Error detail: {error_detail}")
+ print(f"Full response: {response.text}")
+
+ response.raise_for_status()
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ print(f"Server error status: {e.response.status_code}")
+ print(f"Server error response: {e.response.text}")
+ try:
+ error_json = e.response.json()
+ print(f"Parsed error: {error_json}")
+            except Exception:
+ print("Could not parse error response as JSON")
+ raise
+
+ assert data["success"] is True
+ assert len(data["results"]) == 1
+ result = data["results"][0]
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert "extracted_content" in result
+ assert result["extracted_content"] is not None
+
+ # Extracted content should be a JSON string representing a list of dicts
+ try:
+ extracted_data = json.loads(result["extracted_content"])
+ assert isinstance(extracted_data, list)
+ assert len(extracted_data) > 0 # Should find some books
+ # Check structure of the first extracted item
+ first_item = extracted_data[0]
+ assert "title" in first_item
+ assert "price" in first_item
+ assert "rating" in first_item
+ assert "star-rating" in first_item["rating"] # e.g., "star-rating Three"
+ except (json.JSONDecodeError, AssertionError) as e:
+ pytest.fail(f"Extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
+
+
+ # 6. Extraction with LLM
+ async def test_llm_extraction(self, async_client: httpx.AsyncClient):
+ """
+ Test /crawl with LLMExtractionStrategy.
+ NOTE: Requires the server to have appropriate LLM API keys (e.g., OPENAI_API_KEY)
+ configured via .llm.env or environment variables.
+ This test uses the default provider configured in the server's config.yml.
+ """
+ payload = {
+ "urls": [SIMPLE_HTML_URL],
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {
+ "cache_mode": CacheMode.BYPASS.value,
+ "extraction_strategy": {
+ "type": "LLMExtractionStrategy",
+ "params": {
+ "instruction": "Extract the main title and the author mentioned in the text into JSON.",
+ # LLMConfig is implicitly defined by server's config.yml and .llm.env
+ # If you needed to override provider/token PER REQUEST:
+ "llm_config": {
+ "type": "LLMConfig",
+ "params": {
+ "provider": "openai/gpt-4o", # Example override
+ "api_token": os.getenv("OPENAI_API_KEY") # Example override
+ }
+ },
+ "schema": { # Optional: Provide a schema for structured output
+ "type": "dict", # IMPORTANT: Wrap schema dict
+ "value": {
+ "title": "Book Info",
+ "type": "object",
+ "properties": {
+ "title": {"type": "string", "description": "The main title of the work"},
+ "author": {"type": "string", "description": "The author of the work"}
+ },
+ "required": ["title", "author"]
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ try:
+ response = await async_client.post("/crawl", json=payload)
+ response.raise_for_status() # Will raise if server returns 500 (e.g., bad API key)
+ data = response.json()
+ except httpx.HTTPStatusError as e:
+ # Catch potential server errors (like 500 due to missing/invalid API keys)
+ pytest.fail(f"LLM extraction request failed: {e}. Response: {e.response.text}. Check server logs and ensure API keys are correctly configured for the server.")
+ except httpx.RequestError as e:
+ pytest.fail(f"LLM extraction request failed: {e}.")
+
+ assert data["success"] is True
+ assert len(data["results"]) == 1
+ result = data["results"][0]
+ await assert_crawl_result_structure(result)
+ assert result["success"] is True
+ assert "extracted_content" in result
+ assert result["extracted_content"] is not None
+
+ # Extracted content should be JSON (because we provided a schema)
+ try:
+ extracted_data = json.loads(result["extracted_content"])
+ print(f"\nLLM Extracted Data: {extracted_data}") # Print for verification
+
+ # Handle both dict and list formats (server returns a list)
+ if isinstance(extracted_data, list):
+ assert len(extracted_data) > 0
+ extracted_item = extracted_data[0] # Take first item
+ assert isinstance(extracted_item, dict)
+ assert "title" in extracted_item
+ assert "author" in extracted_item
+ assert "Moby-Dick" in extracted_item.get("title", "")
+ assert "Herman Melville" in extracted_item.get("author", "")
+ else:
+ assert isinstance(extracted_data, dict)
+ assert "title" in extracted_data
+ assert "author" in extracted_data
+ assert "Moby-Dick" in extracted_data.get("title", "")
+ assert "Herman Melville" in extracted_data.get("author", "")
+ except (json.JSONDecodeError, AssertionError) as e:
+ pytest.fail(f"LLM extracted content parsing or validation failed: {e}\nContent: {result['extracted_content']}")
+ except Exception as e: # Catch any other unexpected error
+ pytest.fail(f"An unexpected error occurred during LLM result processing: {e}\nContent: {result['extracted_content']}")
+
+if __name__ == "__main__":
+ # Define arguments for pytest programmatically
+ # -v: verbose output
+ # -s: show print statements immediately (useful for debugging)
+ # __file__: tells pytest to run tests in the current file
+ pytest_args = ["-v", "-s", __file__]
+
+ # You can add more pytest arguments here if needed, for example:
+ # '-k test_llm_extraction': Run only the LLM test function
+ # pytest_args.append("-k test_llm_extraction")
+
+ print(f"Running pytest with args: {pytest_args}")
+
+ # Execute pytest
+ exit_code = pytest.main(pytest_args)
+
+ print(f"Pytest finished with exit code: {exit_code}")
\ No newline at end of file
diff --git a/tests/general/generate_dummy_site.py b/tests/general/generate_dummy_site.py
new file mode 100644
index 00000000..d4218b6b
--- /dev/null
+++ b/tests/general/generate_dummy_site.py
@@ -0,0 +1,335 @@
+# ==== File: build_dummy_site.py ====
+
+import os
+import random
+import argparse
+from pathlib import Path
+from urllib.parse import quote
+
+# --- Configuration ---
+NUM_CATEGORIES = 3
+NUM_SUBCATEGORIES_PER_CAT = 2 # Results in NUM_CATEGORIES * NUM_SUBCATEGORIES_PER_CAT total L2 categories
+NUM_PRODUCTS_PER_SUBCAT = 5 # Products listed on L3 pages
+MAX_DEPTH_TARGET = 5 # Explicitly set target depth
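+# Resulting page hierarchy (crawl depth in parentheses):
+#   home (0) -> category (1) -> sub-category (2) -> product (3)
+#   -> product details (4) / product reviews (5), i.e. MAX_DEPTH_TARGET levels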
+
+# --- Helper Functions ---
+
+def generate_lorem(words=20):
+ """Generates simple placeholder text."""
+ lorem_words = ["lorem", "ipsum", "dolor", "sit", "amet", "consectetur",
+ "adipiscing", "elit", "sed", "do", "eiusmod", "tempor",
+ "incididunt", "ut", "labore", "et", "dolore", "magna", "aliqua"]
+ return " ".join(random.choice(lorem_words) for _ in range(words)).capitalize() + "."
+
+def create_html_page(filepath: Path, title: str, body_content: str, breadcrumbs: list = [], head_extras: str = ""):
+ """Creates an HTML file with basic structure and inline CSS."""
+ os.makedirs(filepath.parent, exist_ok=True)
+
+ # Generate breadcrumb HTML using the 'link' provided in the breadcrumbs list
+ breadcrumb_html = ""
+ if breadcrumbs:
+ links_html = " » ".join(f'<a href="{bc["link"]}">{bc["name"]}</a>' for bc in breadcrumbs)
+ breadcrumb_html = f'<div class="breadcrumbs">{links_html}</div>'
+
+ # Basic CSS for structure identification (kept the same)
+ css = """<style>
+ body { font-family: sans-serif; margin: 1em; }
+ .breadcrumbs { font-size: 0.9em; color: #555; }
+ .product-list li { margin-bottom: 1em; }
+ .product-price { font-weight: bold; }
+ </style>"""
+ html_content = f"""<!DOCTYPE html>
+<html lang="en">
+<head>
+ <meta charset="utf-8">
+ <title>{title} - FakeShop</title>
+ {head_extras}
+ {css}
+</head>
+<body>
+<div class="container">
+ {breadcrumb_html}
+ <h1>{title}</h1>
+ {body_content}
+</div>
+</body>
+</html>"""
+ with open(filepath, "w", encoding="utf-8") as f:
+ f.write(html_content)
+ # Per-file logging is noisy for large sites; uncomment to debug:
+ # print(f"Created: {filepath}")
+
+def generate_site(base_dir: Path, site_name: str = "FakeShop", base_path: str = ""):
+ """Generates the dummy website structure."""
+ base_dir.mkdir(parents=True, exist_ok=True)
+
+ # --- Clean and prepare the base path for URL construction ---
+ # Ensure it starts with '/' if not empty, and remove any trailing '/'
+ if base_path:
+ full_base_path = "/" + base_path.strip('/')
+ else:
+ full_base_path = "" # Represents the root
+
+ print(f"Using base path for links: '{full_base_path}'")
+
+ # --- Level 0: Homepage ---
+ home_body = "Welcome to FakeShop!
Your one-stop shop for imaginary items.
Categories:
\n"
+ # Define the *actual* link path for the homepage breadcrumb
+ home_link_path = f"{full_base_path}/index.html"
+ breadcrumbs_home = [{"name": "Home", "link": home_link_path}] # Base breadcrumb
+
+ # Links *within* the page content should remain relative
+ for i in range(NUM_CATEGORIES):
+ cat_name = f"Category-{i+1}"
+ cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
+ # This path is relative to the current directory (index.html)
+ cat_relative_page_path = f"{cat_folder_name}/index.html"
+ home_body += f'<li><a href="{cat_relative_page_path}">{cat_name}</a> - {generate_lorem(10)}</li>'
+ home_body += "</ul>"
+ create_html_page(base_dir / "index.html", "Homepage", home_body, []) # No breadcrumbs *on* the homepage itself
+
+ # --- Levels 1-5 ---
+ for i in range(NUM_CATEGORIES):
+ cat_name = f"Category-{i+1}"
+ cat_folder_name = quote(cat_name.lower().replace(" ", "-"))
+ cat_dir = base_dir / cat_folder_name
+ # This is the *absolute* path for the breadcrumb link
+ cat_link_path = f"{full_base_path}/{cat_folder_name}/index.html"
+ # Update breadcrumbs list for this level
+ breadcrumbs_cat = breadcrumbs_home + [{"name": cat_name, "link": cat_link_path}]
+
+ # --- Level 1: Category Page ---
+ cat_body = f"{generate_lorem(15)} for {cat_name}.
Sub-Categories:
\n"
+ for j in range(NUM_SUBCATEGORIES_PER_CAT):
+ subcat_name = f"{cat_name}-Sub-{j+1}"
+ subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
+ # Path relative to the category page
+ subcat_relative_page_path = f"{subcat_folder_name}/index.html"
+ cat_body += f'<li><a href="{subcat_relative_page_path}">{subcat_name}</a> - {generate_lorem(8)}</li>'
+ cat_body += "</ul>"
+ # Pass the updated breadcrumbs list
+ create_html_page(cat_dir / "index.html", cat_name, cat_body, breadcrumbs_home) # Parent breadcrumb needed here
+
+ for j in range(NUM_SUBCATEGORIES_PER_CAT):
+ subcat_name = f"{cat_name}-Sub-{j+1}"
+ subcat_folder_name = quote(subcat_name.lower().replace(" ", "-"))
+ subcat_dir = cat_dir / subcat_folder_name
+ # Absolute path for the breadcrumb link
+ subcat_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/index.html"
+ # Update breadcrumbs list for this level
+ breadcrumbs_subcat = breadcrumbs_cat + [{"name": subcat_name, "link": subcat_link_path}]
+
+ # --- Level 2: Sub-Category Page (Product List) ---
+ subcat_body = f"Explore products in {subcat_name}. {generate_lorem(12)}
Products:
\n"
+ for k in range(NUM_PRODUCTS_PER_SUBCAT):
+ prod_id = f"P{i+1}{j+1}{k+1:03d}" # e.g., P11001
+ prod_name = f"{subcat_name} Product {k+1} ({prod_id})"
+ # Filename relative to the subcategory page
+ prod_filename = f"product_{prod_id}.html"
+ # Absolute path for the breadcrumb link
+ prod_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{prod_filename}"
+
+ # Preview on list page (link remains relative)
+ subcat_body += f"""
+ -
+
+
{prod_name}
+
{generate_lorem(10)}
+
£{random.uniform(10, 500):.2f}
+
+ """
+
+ # --- Level 3: Product Page ---
+ prod_price = random.uniform(10, 500)
+ prod_desc = generate_lorem(40)
+ prod_specs = {f"Spec {s+1}": generate_lorem(3) for s in range(random.randint(3,6))}
+ prod_reviews_count = random.randint(0, 150)
+ # Relative filenames for links on this page
+ details_filename_relative = f"product_{prod_id}_details.html"
+ reviews_filename_relative = f"product_{prod_id}_reviews.html"
+
+ prod_body = f"""
+ Price: £{prod_price:.2f}
+
+
Description
+
{prod_desc}
+
+
+
Specifications
+
+ {''.join(f'- {name}: {value}
' for name, value in prod_specs.items())}
+
+
+
+
Reviews
+
Total Reviews: {prod_reviews_count}
+
+
+
+ View More Details |
+ See All Reviews
+
+ """
+ # Update breadcrumbs list for this level
+ breadcrumbs_prod = breadcrumbs_subcat + [{"name": prod_name, "link": prod_link_path}]
+ # Pass the updated breadcrumbs list
+ create_html_page(subcat_dir / prod_filename, prod_name, prod_body, breadcrumbs_subcat) # Parent breadcrumb needed here
+
+ # --- Level 4: Product Details Page ---
+ details_filename = f"product_{prod_id}_details.html" # Actual filename
+ # Absolute path for the breadcrumb link
+ details_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{details_filename}"
+ details_body = f"This page contains extremely detailed information about {prod_name}.
{generate_lorem(100)}"
+ # Update breadcrumbs list for this level
+ breadcrumbs_details = breadcrumbs_prod + [{"name": "Details", "link": details_link_path}]
+ # Pass the updated breadcrumbs list
+ create_html_page(subcat_dir / details_filename, f"{prod_name} - Details", details_body, breadcrumbs_prod) # Parent breadcrumb needed here
+
+ # --- Level 5: Product Reviews Page ---
+ reviews_filename = f"product_{prod_id}_reviews.html" # Actual filename
+ # Absolute path for the breadcrumb link
+ reviews_link_path = f"{full_base_path}/{cat_folder_name}/{subcat_folder_name}/{reviews_filename}"
+ reviews_body = f"All {prod_reviews_count} reviews for {prod_name} are listed here.
"
+ for r in range(prod_reviews_count):
+ reviews_body += f"- Review {r+1}: {generate_lorem(random.randint(15, 50))}
"
+ reviews_body += "
"
+ # Update breadcrumbs list for this level
+ breadcrumbs_reviews = breadcrumbs_prod + [{"name": "Reviews", "link": reviews_link_path}]
+ # Pass the updated breadcrumbs list
+ create_html_page(subcat_dir / reviews_filename, f"{prod_name} - Reviews", reviews_body, breadcrumbs_prod) # Parent breadcrumb needed here
+
+
+ subcat_body += "
" # Close product-list ul
+ # Pass the correct breadcrumbs list for the subcategory index page
+ create_html_page(subcat_dir / "index.html", subcat_name, subcat_body, breadcrumbs_cat) # Parent breadcrumb needed here
+
+
+# --- Main Execution ---
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser(description="Generate a dummy multi-level retail website.")
+ parser.add_argument(
+ "-o", "--output-dir",
+ type=str,
+ default="dummy_retail_site",
+ help="Directory to generate the website in."
+ )
+ parser.add_argument(
+ "-n", "--site-name",
+ type=str,
+ default="FakeShop",
+ help="Name of the fake shop."
+ )
+ parser.add_argument(
+ "-b", "--base-path",
+ type=str,
+ default="",
+ help="Base path for hosting the site (e.g., 'samples/deepcrawl'). Leave empty if hosted at the root."
+ )
+ # Optional: Add more args to configure counts if needed
+
+ args = parser.parse_args()
+
+ output_directory = Path(args.output_dir)
+ site_name = args.site_name
+ base_path = args.base_path
+
+ print(f"Generating dummy site '{site_name}' in '{output_directory}'...")
+ # Pass the base_path to the generation function
+ generate_site(output_directory, site_name, base_path)
+ print(f"\nCreated {sum(1 for _ in output_directory.rglob('*.html'))} HTML pages.")
+ print("Dummy site generation complete.")
+ print(f"To serve locally (example): python -m http.server --directory {output_directory} 8000")
+ if base_path:
+ print(f"Access the site at: http://localhost:8000/{base_path.strip('/')}/index.html")
+ else:
+ print(f"Access the site at: http://localhost:8000/index.html")
\ No newline at end of file
diff --git a/tests/20241401/test_acyn_crawl_wuth_http_crawler_strategy.py b/tests/general/test_acyn_crawl_wuth_http_crawler_strategy.py
similarity index 100%
rename from tests/20241401/test_acyn_crawl_wuth_http_crawler_strategy.py
rename to tests/general/test_acyn_crawl_wuth_http_crawler_strategy.py
diff --git a/tests/20241401/test_advanced_deep_crawl.py b/tests/general/test_advanced_deep_crawl.py
similarity index 100%
rename from tests/20241401/test_advanced_deep_crawl.py
rename to tests/general/test_advanced_deep_crawl.py
diff --git a/tests/20241401/test_async_crawler_strategy.py b/tests/general/test_async_crawler_strategy.py
similarity index 100%
rename from tests/20241401/test_async_crawler_strategy.py
rename to tests/general/test_async_crawler_strategy.py
diff --git a/tests/20241401/test_async_markdown_generator.py b/tests/general/test_async_markdown_generator.py
similarity index 100%
rename from tests/20241401/test_async_markdown_generator.py
rename to tests/general/test_async_markdown_generator.py
diff --git a/tests/20241401/test_async_webcrawler.py b/tests/general/test_async_webcrawler.py
similarity index 100%
rename from tests/20241401/test_async_webcrawler.py
rename to tests/general/test_async_webcrawler.py
diff --git a/tests/20241401/test_cache_context.py b/tests/general/test_cache_context.py
similarity index 100%
rename from tests/20241401/test_cache_context.py
rename to tests/general/test_cache_context.py
diff --git a/tests/general/test_content_source_parameter.py b/tests/general/test_content_source_parameter.py
new file mode 100644
index 00000000..e686eaf8
--- /dev/null
+++ b/tests/general/test_content_source_parameter.py
@@ -0,0 +1,106 @@
+"""
+Tests for the content_source parameter in markdown generation.
+"""
+import unittest
+import asyncio
+from unittest.mock import patch, MagicMock
+
+from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator, MarkdownGenerationStrategy
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig
+from crawl4ai.models import MarkdownGenerationResult
+
+HTML_SAMPLE = """
+
+Test Page
+
+ Test Content
+ This is a test paragraph.
+
+
This is content within a container.
+
+
+
+"""
+
+
+class TestContentSourceParameter(unittest.TestCase):
+ """Test cases for the content_source parameter in markdown generation."""
+
+ def setUp(self):
+ """Set up test fixtures."""
+ self.loop = asyncio.new_event_loop()
+ asyncio.set_event_loop(self.loop)
+
+ def tearDown(self):
+ """Tear down test fixtures."""
+ self.loop.close()
+
+ def test_default_content_source(self):
+ """Test that the default content_source is 'cleaned_html'."""
+ # Can't directly instantiate abstract class, so just test DefaultMarkdownGenerator
+ generator = DefaultMarkdownGenerator()
+ self.assertEqual(generator.content_source, "cleaned_html")
+
+ def test_custom_content_source(self):
+ """Test that content_source can be customized."""
+ generator = DefaultMarkdownGenerator(content_source="fit_html")
+ self.assertEqual(generator.content_source, "fit_html")
+
+ @patch('crawl4ai.markdown_generation_strategy.CustomHTML2Text')
+ def test_html_processing_using_input_html(self, mock_html2text):
+ """Test that generate_markdown uses input_html parameter."""
+ # Setup mock
+ mock_instance = MagicMock()
+ mock_instance.handle.return_value = "# Test Content\n\nThis is a test paragraph."
+ mock_html2text.return_value = mock_instance
+
+ # Create generator and call generate_markdown
+ generator = DefaultMarkdownGenerator()
+ result = generator.generate_markdown(input_html="<h1>Test Content</h1><p>This is a test paragraph.</p>")
+
+ # Verify input_html was passed to HTML2Text handler
+ mock_instance.handle.assert_called_once()
+ # Get the first positional argument
+ args, _ = mock_instance.handle.call_args
+ self.assertEqual(args[0], "<h1>Test Content</h1><p>This is a test paragraph.</p>")
+
+ # Check result
+ self.assertIsInstance(result, MarkdownGenerationResult)
+ self.assertEqual(result.raw_markdown, "# Test Content\n\nThis is a test paragraph.")
+
+ def test_html_source_selection_logic(self):
+ """Test that the HTML source selection logic works correctly."""
+ # We'll test the dispatch pattern directly to avoid async complexities
+
+ # Create test data
+ raw_html = "Raw HTML
"
+ cleaned_html = "Cleaned HTML
"
+ fit_html = "Preprocessed HTML
"
+
+ # Test the dispatch pattern
+ html_source_selector = {
+ "raw_html": lambda: raw_html,
+ "cleaned_html": lambda: cleaned_html,
+ "fit_html": lambda: fit_html,
+ }
+
+ # Test Case 1: content_source="cleaned_html"
+ source_lambda = html_source_selector.get("cleaned_html")
+ self.assertEqual(source_lambda(), cleaned_html)
+
+ # Test Case 2: content_source="raw_html"
+ source_lambda = html_source_selector.get("raw_html")
+ self.assertEqual(source_lambda(), raw_html)
+
+ # Test Case 3: content_source="fit_html"
+ source_lambda = html_source_selector.get("fit_html")
+ self.assertEqual(source_lambda(), fit_html)
+
+ # Test Case 4: Invalid content_source falls back to cleaned_html
+ source_lambda = html_source_selector.get("invalid_source", lambda: cleaned_html)
+ self.assertEqual(source_lambda(), cleaned_html)
+
+
+if __name__ == '__main__':
+ unittest.main()
\ No newline at end of file
diff --git a/tests/20241401/test_crawlers.py b/tests/general/test_crawlers.py
similarity index 100%
rename from tests/20241401/test_crawlers.py
rename to tests/general/test_crawlers.py
diff --git a/tests/20241401/test_deep_crawl.py b/tests/general/test_deep_crawl.py
similarity index 100%
rename from tests/20241401/test_deep_crawl.py
rename to tests/general/test_deep_crawl.py
diff --git a/tests/20241401/test_deep_crawl_filters.py b/tests/general/test_deep_crawl_filters.py
similarity index 100%
rename from tests/20241401/test_deep_crawl_filters.py
rename to tests/general/test_deep_crawl_filters.py
diff --git a/tests/20241401/test_deep_crawl_scorers.py b/tests/general/test_deep_crawl_scorers.py
similarity index 100%
rename from tests/20241401/test_deep_crawl_scorers.py
rename to tests/general/test_deep_crawl_scorers.py
diff --git a/tests/20241401/test_http_crawler_strategy.py b/tests/general/test_http_crawler_strategy.py
similarity index 100%
rename from tests/20241401/test_http_crawler_strategy.py
rename to tests/general/test_http_crawler_strategy.py
diff --git a/tests/20241401/test_llm_filter.py b/tests/general/test_llm_filter.py
similarity index 100%
rename from tests/20241401/test_llm_filter.py
rename to tests/general/test_llm_filter.py
diff --git a/tests/general/test_mhtml.py b/tests/general/test_mhtml.py
new file mode 100644
index 00000000..06e0e294
--- /dev/null
+++ b/tests/general/test_mhtml.py
@@ -0,0 +1,213 @@
+# test_mhtml_capture.py
+
+import pytest
+import asyncio
+import re # For more robust MHTML checks
+
+# Assuming these can be imported directly from the crawl4ai library
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CrawlResult
+
+# A reliable, simple static HTML page for testing
+# Using httpbin as it's designed for testing clients
+TEST_URL_SIMPLE = "https://httpbin.org/html"
+EXPECTED_CONTENT_SIMPLE = "Herman Melville - Moby-Dick"
+
+# A slightly more complex page that might involve JS (good secondary test)
+TEST_URL_JS = "https://quotes.toscrape.com/js/"
+EXPECTED_CONTENT_JS = "Quotes to Scrape" # Title of the page, which should be present in MHTML
+
+# Removed the custom event_loop fixture as pytest-asyncio provides a default one.
+
+@pytest.mark.asyncio
+async def test_mhtml_capture_when_enabled():
+ """
+ Verify that when CrawlerRunConfig has capture_mhtml=True,
+ the CrawlResult contains valid MHTML content.
+ """
+ # Create a fresh browser config and crawler instance for this test
+ browser_config = BrowserConfig(headless=True) # Use headless mode for CI/CD testing
+ # --- Key: Enable MHTML capture in the run config ---
+ run_config = CrawlerRunConfig(capture_mhtml=True)
+
+ # Create a fresh crawler instance
+ crawler = AsyncWebCrawler(config=browser_config)
+
+ try:
+ # Start the browser
+ await crawler.start()
+
+ # Perform the crawl with the MHTML-enabled config
+ result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)
+
+ # --- Assertions ---
+ assert result is not None, "Crawler should return a result object"
+ assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"
+
+ # 1. Check if the mhtml attribute exists (will fail if CrawlResult not updated)
+ assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
+
+ # 2. Check if mhtml is populated
+ assert result.mhtml is not None, "MHTML content should be captured when enabled"
+ assert isinstance(result.mhtml, str), "MHTML content should be a string"
+ assert len(result.mhtml) > 500, "MHTML content seems too short, likely invalid" # Basic sanity check
+
+ # 3. Check for MHTML structure indicators (more robust than simple string contains)
+ # MHTML files are multipart MIME messages
+ assert re.search(r"Content-Type: multipart/related;", result.mhtml, re.IGNORECASE), \
+ "MHTML should contain 'Content-Type: multipart/related;'"
+ # Should contain a boundary definition
+ assert re.search(r"boundary=\"----MultipartBoundary", result.mhtml), \
+ "MHTML should contain a multipart boundary"
+ # Should contain the main HTML part
+ assert re.search(r"Content-Type: text/html", result.mhtml, re.IGNORECASE), \
+ "MHTML should contain a 'Content-Type: text/html' part"
+
+ # 4. Check if the *actual page content* is within the MHTML string
+ # This confirms the snapshot captured the rendered page
+ assert EXPECTED_CONTENT_SIMPLE in result.mhtml, \
+ f"Expected content '{EXPECTED_CONTENT_SIMPLE}' not found within the captured MHTML"
+
+ # 5. Ensure standard HTML is still present and correct
+ assert result.html is not None, "Standard HTML should still be present"
+ assert isinstance(result.html, str), "Standard HTML should be a string"
+ assert EXPECTED_CONTENT_SIMPLE in result.html, \
+ f"Expected content '{EXPECTED_CONTENT_SIMPLE}' not found within the standard HTML"
+
+ finally:
+ # Important: Ensure browser is completely closed even if assertions fail
+ await crawler.close()
+ # Help the garbage collector clean up
+ crawler = None
+
+
+@pytest.mark.asyncio
+async def test_mhtml_capture_when_disabled_explicitly():
+ """
+ Verify that when CrawlerRunConfig explicitly has capture_mhtml=False,
+ the CrawlResult.mhtml attribute is None.
+ """
+ # Create a fresh browser config and crawler instance for this test
+ browser_config = BrowserConfig(headless=True)
+ # --- Key: Explicitly disable MHTML capture ---
+ run_config = CrawlerRunConfig(capture_mhtml=False)
+
+ # Create a fresh crawler instance
+ crawler = AsyncWebCrawler(config=browser_config)
+
+ try:
+ # Start the browser
+ await crawler.start()
+ result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)
+
+ assert result is not None
+ assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"
+
+ # 1. Check attribute existence (important for TDD start)
+ assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
+
+ # 2. Check mhtml is None
+ assert result.mhtml is None, "MHTML content should be None when explicitly disabled"
+
+ # 3. Ensure standard HTML is still present
+ assert result.html is not None
+ assert EXPECTED_CONTENT_SIMPLE in result.html
+
+ finally:
+ # Important: Ensure browser is completely closed even if assertions fail
+ await crawler.close()
+ # Help the garbage collector clean up
+ crawler = None
+
+
+@pytest.mark.asyncio
+async def test_mhtml_capture_when_disabled_by_default():
+ """
+ Verify that if capture_mhtml is not specified (using its default),
+ the CrawlResult.mhtml attribute is None.
+ (This assumes the default value for capture_mhtml in CrawlerRunConfig is False)
+ """
+ # Create a fresh browser config and crawler instance for this test
+ browser_config = BrowserConfig(headless=True)
+ # --- Key: Use default run config ---
+ run_config = CrawlerRunConfig() # Do not specify capture_mhtml
+
+ # Create a fresh crawler instance
+ crawler = AsyncWebCrawler(config=browser_config)
+
+ try:
+ # Start the browser
+ await crawler.start()
+ result: CrawlResult = await crawler.arun(TEST_URL_SIMPLE, config=run_config)
+
+ assert result is not None
+ assert result.success is True, f"Crawling {TEST_URL_SIMPLE} should succeed. Error: {result.error_message}"
+
+ # 1. Check attribute existence
+ assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
+
+ # 2. Check mhtml is None (assuming default is False)
+ assert result.mhtml is None, "MHTML content should be None when using default config (assuming default=False)"
+
+ # 3. Ensure standard HTML is still present
+ assert result.html is not None
+ assert EXPECTED_CONTENT_SIMPLE in result.html
+
+ finally:
+ # Important: Ensure browser is completely closed even if assertions fail
+ await crawler.close()
+ # Help the garbage collector clean up
+ crawler = None
+
+# Optional: Add a test for a JS-heavy page if needed
+@pytest.mark.asyncio
+async def test_mhtml_capture_on_js_page_when_enabled():
+ """
+ Verify MHTML capture works on a page requiring JavaScript execution.
+ """
+ # Create a fresh browser config and crawler instance for this test
+ browser_config = BrowserConfig(headless=True)
+ run_config = CrawlerRunConfig(
+ capture_mhtml=True,
+ # Add a small wait or JS execution if needed for the JS page to fully render
+ # For quotes.toscrape.com/js/, it renders quickly, but a wait might be safer
+ # wait_for_timeout=2000 # Example: wait up to 2 seconds
+ js_code="await new Promise(r => setTimeout(r, 500));" # Small delay after potential load
+ )
+
+ # Create a fresh crawler instance
+ crawler = AsyncWebCrawler(config=browser_config)
+
+ try:
+ # Start the browser
+ await crawler.start()
+ result: CrawlResult = await crawler.arun(TEST_URL_JS, config=run_config)
+
+ assert result is not None
+ assert result.success is True, f"Crawling {TEST_URL_JS} should succeed. Error: {result.error_message}"
+ assert hasattr(result, 'mhtml'), "CrawlResult object must have an 'mhtml' attribute"
+ assert result.mhtml is not None, "MHTML content should be captured on JS page when enabled"
+ assert isinstance(result.mhtml, str), "MHTML content should be a string"
+ assert len(result.mhtml) > 500, "MHTML content from JS page seems too short"
+
+ # Check for MHTML structure
+ assert re.search(r"Content-Type: multipart/related;", result.mhtml, re.IGNORECASE)
+ assert re.search(r"Content-Type: text/html", result.mhtml, re.IGNORECASE)
+
+ # Check for content rendered by JS within the MHTML
+ assert EXPECTED_CONTENT_JS in result.mhtml, \
+ f"Expected JS-rendered content '{EXPECTED_CONTENT_JS}' not found within the captured MHTML"
+
+ # Check standard HTML too
+ assert result.html is not None
+ assert EXPECTED_CONTENT_JS in result.html, \
+ f"Expected JS-rendered content '{EXPECTED_CONTENT_JS}' not found within the standard HTML"
+
+ finally:
+ # Important: Ensure browser is completely closed even if assertions fail
+ await crawler.close()
+ # Help the garbage collector clean up
+ crawler = None
+
+if __name__ == "__main__":
+ # Use pytest for async tests
+ pytest.main(["-xvs", __file__])
diff --git a/tests/general/test_network_console_capture.py b/tests/general/test_network_console_capture.py
new file mode 100644
index 00000000..da41ecec
--- /dev/null
+++ b/tests/general/test_network_console_capture.py
@@ -0,0 +1,185 @@
+from crawl4ai.async_webcrawler import AsyncWebCrawler
+from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig
+import asyncio
+import aiohttp
+from aiohttp import web
+import tempfile
+import shutil
+import os, sys, time, json
+
+
+async def start_test_server():
+ app = web.Application()
+
+ async def basic_page(request):
+ return web.Response(text="""
+ <!DOCTYPE html>
+ <html>
+ <head>
+ <title>Network Request Test</title>
+ </head>
+ <body>
+ <h1>Test Page for Network Capture</h1>
+ <p>This page performs network requests and console logging.</p>
+ <img src="/image.png" alt="test image">
+ <script>
+ console.log("Basic console log");
+ console.error("Error message");
+ console.warn("Warning message");
+ fetch("/api/data");
+ fetch("/api/json");
+ </script>
+ </body>
+ </html>
+ """, content_type="text/html")
+
+ async def image(request):
+ # Return a small 1x1 transparent PNG
+ return web.Response(body=bytes.fromhex('89504E470D0A1A0A0000000D49484452000000010000000108060000001F15C4890000000D4944415478DA63FAFFFF3F030079DB00018D959DE70000000049454E44AE426082'), content_type="image/png")
+
+ async def api_data(request):
+ return web.Response(text="sample data")
+
+ async def api_json(request):
+ return web.json_response({"status": "success", "message": "JSON data"})
+
+ # Register routes
+ app.router.add_get('/', basic_page)
+ app.router.add_get('/image.png', image)
+ app.router.add_get('/api/data', api_data)
+ app.router.add_get('/api/json', api_json)
+
+ runner = web.AppRunner(app)
+ await runner.setup()
+ site = web.TCPSite(runner, 'localhost', 8080)
+ await site.start()
+
+ return runner
+
+
+async def test_network_console_capture():
+ print("\n=== Testing Network and Console Capture ===\n")
+
+ # Start test server
+ runner = await start_test_server()
+ try:
+ browser_config = BrowserConfig(headless=True)
+
+ # Test with capture disabled (default)
+ print("\n1. Testing with capture disabled (default)...")
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ config = CrawlerRunConfig(
+ wait_until="networkidle", # Wait for network to be idle
+ )
+ result = await crawler.arun(url="http://localhost:8080/", config=config)
+
+ assert result.network_requests is None, "Network requests should be None when capture is disabled"
+ assert result.console_messages is None, "Console messages should be None when capture is disabled"
+ print("✓ Default config correctly returns None for network_requests and console_messages")
+
+ # Test with network capture enabled
+ print("\n2. Testing with network capture enabled...")
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ config = CrawlerRunConfig(
+ wait_until="networkidle", # Wait for network to be idle
+ capture_network_requests=True
+ )
+ result = await crawler.arun(url="http://localhost:8080/", config=config)
+
+ assert result.network_requests is not None, "Network requests should be captured"
+ print(f"✓ Captured {len(result.network_requests)} network requests")
+
+ # Check if we have both requests and responses
+ request_count = len([r for r in result.network_requests if r.get("event_type") == "request"])
+ response_count = len([r for r in result.network_requests if r.get("event_type") == "response"])
+ print(f" - {request_count} requests, {response_count} responses")
+
+ # Check if we captured specific resources
+ urls = [r.get("url") for r in result.network_requests]
+ has_image = any("/image.png" in url for url in urls)
+ has_api_data = any("/api/data" in url for url in urls)
+ has_api_json = any("/api/json" in url for url in urls)
+
+ assert has_image, "Should have captured image request"
+ assert has_api_data, "Should have captured API data request"
+ assert has_api_json, "Should have captured API JSON request"
+ print("✓ Captured expected network requests (image, API endpoints)")
+
+ # Test with console capture enabled
+ print("\n3. Testing with console capture enabled...")
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ config = CrawlerRunConfig(
+ wait_until="networkidle", # Wait for network to be idle
+ capture_console_messages=True
+ )
+ result = await crawler.arun(url="http://localhost:8080/", config=config)
+
+ assert result.console_messages is not None, "Console messages should be captured"
+ print(f"✓ Captured {len(result.console_messages)} console messages")
+
+ # Check if we have different types of console messages
+ message_types = set(msg.get("type") for msg in result.console_messages if "type" in msg)
+ print(f" - Message types: {', '.join(message_types)}")
+
+ # Print all captured messages for debugging
+ print(" - Captured messages:")
+ for msg in result.console_messages:
+ print(f" * Type: {msg.get('type', 'N/A')}, Text: {msg.get('text', 'N/A')}")
+
+ # Look for specific messages
+ messages = [msg.get("text") for msg in result.console_messages if "text" in msg]
+ has_basic_log = any("Basic console log" in msg for msg in messages)
+ has_error_msg = any("Error message" in msg for msg in messages)
+ has_warning_msg = any("Warning message" in msg for msg in messages)
+
+ assert has_basic_log, "Should have captured basic console.log message"
+ assert has_error_msg, "Should have captured console.error message"
+ assert has_warning_msg, "Should have captured console.warn message"
+ print("✓ Captured expected console messages (log, error, warning)")
+
+ # Test with both captures enabled
+ print("\n4. Testing with both network and console capture enabled...")
+ async with AsyncWebCrawler(config=browser_config) as crawler:
+ config = CrawlerRunConfig(
+ wait_until="networkidle", # Wait for network to be idle
+ capture_network_requests=True,
+ capture_console_messages=True
+ )
+ result = await crawler.arun(url="http://localhost:8080/", config=config)
+
+ assert result.network_requests is not None, "Network requests should be captured"
+ assert result.console_messages is not None, "Console messages should be captured"
+ print(f"✓ Successfully captured both {len(result.network_requests)} network requests and {len(result.console_messages)} console messages")
+
+ finally:
+ await runner.cleanup()
+ print("\nTest server shutdown")
+
+
+async def main():
+ try:
+ await test_network_console_capture()
+ print("\n✅ All tests passed successfully!")
+ except Exception as e:
+ print(f"\n❌ Test failed: {str(e)}")
+ raise
+
+if __name__ == "__main__":
+ asyncio.run(main())
\ No newline at end of file
diff --git a/tests/20241401/test_robot_parser.py b/tests/general/test_robot_parser.py
similarity index 100%
rename from tests/20241401/test_robot_parser.py
rename to tests/general/test_robot_parser.py
diff --git a/tests/20241401/test_schema_builder.py b/tests/general/test_schema_builder.py
similarity index 100%
rename from tests/20241401/test_schema_builder.py
rename to tests/general/test_schema_builder.py
diff --git a/tests/20241401/test_stream.py b/tests/general/test_stream.py
similarity index 100%
rename from tests/20241401/test_stream.py
rename to tests/general/test_stream.py
diff --git a/tests/20241401/test_stream_dispatch.py b/tests/general/test_stream_dispatch.py
similarity index 100%
rename from tests/20241401/test_stream_dispatch.py
rename to tests/general/test_stream_dispatch.py
diff --git a/tests/20241401/tets_robot.py b/tests/general/tets_robot.py
similarity index 100%
rename from tests/20241401/tets_robot.py
rename to tests/general/tets_robot.py
diff --git a/tests/mcp/test_mcp_socket.py b/tests/mcp/test_mcp_socket.py
new file mode 100644
index 00000000..32456b31
--- /dev/null
+++ b/tests/mcp/test_mcp_socket.py
@@ -0,0 +1,119 @@
+# pip install "mcp-sdk[ws]" anyio
+import anyio, json
+from mcp.client.websocket import websocket_client
+from mcp.client.session import ClientSession
+
+async def test_list():
+ async with websocket_client("ws://localhost:8020/mcp/ws") as (r, w):
+ async with ClientSession(r, w) as s:
+ await s.initialize()
+
+ print("tools :", [t.name for t in (await s.list_tools()).tools])
+ print("resources :", [r.name for r in (await s.list_resources()).resources])
+ print("templates :", [t.name for t in (await s.list_resource_templates()).resource_templates])
+
+
+async def test_crawl(s: ClientSession) -> None:
+ """Hit the @mcp_tool('crawl') endpoint."""
+ res = await s.call_tool(
+ "crawl",
+ {
+ "urls": ["https://example.com"],
+ "browser_config": {},
+ "crawler_config": {},
+ },
+ )
+ print("crawl →", json.loads(res.content[0].text))
+
+
+async def test_md(s: ClientSession) -> None:
+ """Hit the @mcp_tool('md') endpoint."""
+ res = await s.call_tool(
+ "md",
+ {
+ "url": "https://example.com",
+ "f": "fit", # or RAW, BM25, LLM
+ "q": None,
+ "c": "0",
+ },
+ )
+ result = json.loads(res.content[0].text)
+ print("md →", result['markdown'][:100], "...")
+
+async def test_screenshot(s: ClientSession):
+ res = await s.call_tool(
+ "screenshot",
+ {
+ "url": "https://example.com",
+ "screenshot_wait_for": 1.0,
+ },
+ )
+ png_b64 = json.loads(res.content[0].text)["screenshot"]
+ print("screenshot →", png_b64[:60], "… (base64)")
+
+
+async def test_pdf(s: ClientSession):
+ res = await s.call_tool(
+ "pdf",
+ {
+ "url": "https://example.com",
+ },
+ )
+ pdf_b64 = json.loads(res.content[0].text)["pdf"]
+ print("pdf →", pdf_b64[:60], "… (base64)")
+
+async def test_execute_js(s: ClientSession):
+ # click the “More” link on Hacker News front page and wait 1 s
+ res = await s.call_tool(
+ "execute_js",
+ {
+ "url": "https://news.ycombinator.com/news",
+ "js_code": [
+ "await page.click('a.morelink')",
+ "await page.waitForTimeout(1000)",
+ ],
+ },
+ )
+ crawl_result = json.loads(res.content[0].text)
+ print("execute_js → status", crawl_result["success"], "| html len:", len(crawl_result["html"]))
+
+async def test_html(s: ClientSession):
+ # fetch the raw HTML of the Hacker News front page
+ res = await s.call_tool(
+ "html",
+ {
+ "url": "https://news.ycombinator.com/news",
+ },
+ )
+ crawl_result = json.loads(res.content[0].text)
+ print("execute_js → status", crawl_result["success"], "| html len:", len(crawl_result["html"]))
+
+async def test_context(s: ClientSession):
+ # ask the context endpoint a question about the Crawl4ai library
+ res = await s.call_tool(
+ "ask",
+ {
+ "query": "I hv a question about Crawl4ai library, how to extract internal links when crawling a page?"
+ },
+ )
+ context_result = json.loads(res.content[0].text)
+ print("ask →", list(context_result.keys()))
+
+
+async def main() -> None:
+ async with websocket_client("ws://localhost:11235/mcp/ws") as (r, w):
+ async with ClientSession(r, w) as s:
+ await s.initialize() # handshake
+ tools = (await s.list_tools()).tools
+ print("tools:", [t.name for t in tools])
+
+ # await test_list()
+ await test_crawl(s)
+ await test_md(s)
+ await test_screenshot(s)
+ await test_pdf(s)
+ await test_execute_js(s)
+ await test_html(s)
+ await test_context(s)
+
+anyio.run(main)
diff --git a/tests/mcp/test_mcp_sse.py b/tests/mcp/test_mcp_sse.py
new file mode 100644
index 00000000..d9eee557
--- /dev/null
+++ b/tests/mcp/test_mcp_sse.py
@@ -0,0 +1,11 @@
+from mcp.client.sse import sse_client
+from mcp.client.session import ClientSession
+
+async def main():
+ async with sse_client("http://127.0.0.1:8020/mcp") as (r, w):
+ async with ClientSession(r, w) as sess:
+ print(await sess.list_tools()) # now works
+
+if __name__ == "__main__":
+ import asyncio
+ asyncio.run(main())
diff --git a/tests/memory/README.md b/tests/memory/README.md
new file mode 100644
index 00000000..164ef095
--- /dev/null
+++ b/tests/memory/README.md
@@ -0,0 +1,315 @@
+# Crawl4AI Stress Testing and Benchmarking
+
+This directory contains tools for stress testing Crawl4AI's `arun_many` method and dispatcher system with high volumes of URLs to evaluate performance, concurrency handling, and potentially detect memory issues. It also includes a benchmarking system to track performance over time.
+
+## Quick Start
+
+```bash
+# Run a default stress test (small config) and generate a report
+# (Assumes run_all.sh is updated to call run_benchmark.py)
+./run_all.sh
+```
+*Note: `run_all.sh` might need to be updated if it directly called the old script.*
+
+## Overview
+
+The stress testing system works by:
+
+1. Generating a local test site with heavy HTML pages (regenerated by default for each test).
+2. Starting a local HTTP server to serve these pages.
+3. Running Crawl4AI's `arun_many` method against this local site using the `MemoryAdaptiveDispatcher` with configurable concurrency (`max_sessions`); a minimal sketch follows this list.
+4. Monitoring performance metrics via the `CrawlerMonitor` and optionally logging memory usage.
+5. Optionally generating detailed benchmark reports with visualizations using `benchmark_report.py`.
+
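+A minimal sketch of step 3 in SDK terms. The `MemoryAdaptiveDispatcher` import path, the `max_session_permit` argument, and the page URL pattern are assumptions for illustration; use `test_stress_sdk.py` for the real thing:
+
+```python
+import asyncio
+
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
+from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher  # import path assumed
+
+async def main():
+    # URLs served by the local test site from step 2 (filename pattern hypothetical)
+    urls = [f"http://localhost:8000/page_{i}.html" for i in range(50)]
+    dispatcher = MemoryAdaptiveDispatcher(max_session_permit=4)  # parameter name assumed
+    async with AsyncWebCrawler() as crawler:
+        # Batch mode: collect all results, then summarize
+        results = await crawler.arun_many(
+            urls=urls,
+            config=CrawlerRunConfig(stream=False),
+            dispatcher=dispatcher,
+        )
+        print(f"{sum(r.success for r in results)}/{len(results)} succeeded")
+
+asyncio.run(main())
+```
+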
+## Available Tools
+
+- `test_stress_sdk.py` - Main stress testing script utilizing `arun_many` and dispatchers.
+- `benchmark_report.py` - Report generator for comparing test results (assumes compatibility with `test_stress_sdk.py` outputs).
+- `run_benchmark.py` - Python script with predefined test configurations that orchestrates tests using `test_stress_sdk.py`.
+- `run_all.sh` - Simple wrapper script (may need updating).
+
+## Usage Guide
+
+### Using Predefined Configurations (Recommended)
+
+The `run_benchmark.py` script offers the easiest way to run standardized tests:
+
+```bash
+# Quick test (50 URLs, 4 max sessions)
+python run_benchmark.py quick
+
+# Medium test (500 URLs, 16 max sessions)
+python run_benchmark.py medium
+
+# Large test (1000 URLs, 32 max sessions)
+python run_benchmark.py large
+
+# Extreme test (2000 URLs, 64 max sessions)
+python run_benchmark.py extreme
+
+# Custom configuration
+python run_benchmark.py custom --urls 300 --max-sessions 24 --chunk-size 50
+
+# Run 'small' test in streaming mode
+python run_benchmark.py small --stream
+
+# Override max_sessions for the 'medium' config
+python run_benchmark.py medium --max-sessions 20
+
+# Skip benchmark report generation after the test
+python run_benchmark.py small --no-report
+
+# Clean up reports and site files before running
+python run_benchmark.py medium --clean
+```
+
+#### `run_benchmark.py` Parameters
+
+| Parameter | Default | Description |
+| -------------------- | --------------- | --------------------------------------------------------------------------- |
+| `config`             | *required*      | Test configuration: `quick`, `small`, `medium`, `large`, `extreme`, or `custom` |
+| `--urls` | config-specific | Number of URLs (required for `custom`) |
+| `--max-sessions` | config-specific | Max concurrent sessions managed by dispatcher (required for `custom`) |
+| `--chunk-size` | config-specific | URLs per batch for non-stream logging (required for `custom`) |
+| `--stream` | False | Enable streaming results (disables batch logging) |
+| `--monitor-mode` | DETAILED | `DETAILED` or `AGGREGATED` display for the live monitor |
+| `--use-rate-limiter` | False | Enable basic rate limiter in the dispatcher |
+| `--port` | 8000 | HTTP server port |
+| `--no-report` | False | Skip generating comparison report via `benchmark_report.py` |
+| `--clean` | False | Clean up reports and site files before running |
+| `--keep-server-alive`| False | Keep local HTTP server running after test |
+| `--use-existing-site`| False | Use existing site on specified port (no local server start/site gen) |
+| `--skip-generation` | False | Use existing site files but start local server |
+| `--keep-site` | False | Keep generated site files after test |
+
+#### Predefined Configurations
+
+| Configuration | URLs | Max Sessions | Chunk Size | Description |
+| ------------- | ------ | ------------ | ---------- | -------------------------------- |
+| `quick` | 50 | 4 | 10 | Quick test for basic validation |
+| `small` | 100 | 8 | 20 | Small test for routine checks |
+| `medium` | 500 | 16 | 50 | Medium test for thorough checks |
+| `large` | 1000 | 32 | 100 | Large test for stress testing |
+| `extreme` | 2000 | 64 | 200 | Extreme test for limit testing |
+
+### Direct Usage of `test_stress_sdk.py`
+
+For fine-grained control or debugging, you can run the stress test script directly:
+
+```bash
+# Test with 200 URLs and 32 max concurrent sessions
+python test_stress_sdk.py --urls 200 --max-sessions 32 --chunk-size 40
+
+# Clean up previous test data first
+python test_stress_sdk.py --clean-reports --clean-site --urls 100 --max-sessions 16 --chunk-size 20
+
+# Change the HTTP server port and use aggregated monitor
+python test_stress_sdk.py --port 8088 --urls 100 --max-sessions 16 --monitor-mode AGGREGATED
+
+# Enable streaming mode and use rate limiting
+python test_stress_sdk.py --urls 50 --max-sessions 8 --stream --use-rate-limiter
+
+# Change report output location
+python test_stress_sdk.py --report-path custom_reports --urls 100 --max-sessions 16
+```
+
+#### `test_stress_sdk.py` Parameters
+
+| Parameter | Default | Description |
+| -------------------- | ---------- | -------------------------------------------------------------------- |
+| `--urls` | 100 | Number of URLs to test |
+| `--max-sessions` | 16 | Maximum concurrent crawling sessions managed by the dispatcher |
+| `--chunk-size` | 10 | Number of URLs per batch (relevant for non-stream logging) |
+| `--stream` | False | Enable streaming results (disables batch logging) |
+| `--monitor-mode` | DETAILED | `DETAILED` or `AGGREGATED` display for the live `CrawlerMonitor` |
+| `--use-rate-limiter` | False | Enable a basic `RateLimiter` within the dispatcher |
+| `--site-path` | "test_site"| Path to store/use the generated test site |
+| `--port` | 8000 | Port for the local HTTP server |
+| `--report-path` | "reports" | Path to save test result summary (JSON) and memory samples (CSV) |
+| `--skip-generation` | False | Use existing test site files but still start local server |
+| `--use-existing-site`| False | Use existing site on specified port (no local server/site gen) |
+| `--keep-server-alive`| False | Keep local HTTP server running after test completion |
+| `--keep-site` | False | Keep the generated test site files after test completion |
+| `--clean-reports` | False | Clean up report directory before running |
+| `--clean-site` | False | Clean up site directory before/after running (see script logic) |
+
+### Generating Reports Only
+
+If you only want to generate a benchmark report from existing test results (assuming `benchmark_report.py` is compatible):
+
+```bash
+# Generate a report from existing test results in ./reports/
+python benchmark_report.py
+
+# Limit to the most recent 5 test results
+python benchmark_report.py --limit 5
+
+# Specify a custom source directory for test results
+python benchmark_report.py --reports-dir alternate_results
+```
+
+#### `benchmark_report.py` Parameters (Assumed)
+
+| Parameter | Default | Description |
+| --------------- | -------------------- | ----------------------------------------------------------- |
+| `--reports-dir` | "reports" | Directory containing `test_stress_sdk.py` result files |
+| `--output-dir` | "benchmark_reports" | Directory to save generated HTML reports and charts |
+| `--limit` | None (all results) | Limit comparison to N most recent test results |
+| `--output-file` | Auto-generated | Custom output filename for the HTML report |
+
+## Understanding the Test Output
+
+### Real-time Progress Display (`CrawlerMonitor`)
+
+When running `test_stress_sdk.py`, the `CrawlerMonitor` provides a live view of the crawling process managed by the dispatcher.
+
+- **DETAILED Mode (Default):** Shows individual task status (Queued, Active, Completed, Failed), timings, memory usage per task (if `psutil` is available), overall queue statistics, and memory pressure status (if `psutil` available).
+- **AGGREGATED Mode:** Shows summary counts (Queued, Active, Completed, Failed), overall progress percentage, estimated time remaining, average URLs/sec, and memory pressure status.
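+
+To get the same live view in your own scripts, the monitor can be attached to the dispatcher. The `CrawlerMonitor`/`DisplayMode` constructor below is an assumption based on the library's documented API, so verify it against your installed version:
+
+```python
+from crawl4ai import CrawlerMonitor, DisplayMode  # import location assumed
+from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher  # import path assumed
+
+# Live progress display: DisplayMode.DETAILED or DisplayMode.AGGREGATED
+monitor = CrawlerMonitor(display_mode=DisplayMode.DETAILED)
+dispatcher = MemoryAdaptiveDispatcher(monitor=monitor)
+```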
+
+### Batch Log Output (Non-Streaming Mode Only)
+
+If running `test_stress_sdk.py` **without** the `--stream` flag, you will *also* see per-batch summary lines printed to the console *after* the monitor display, once each chunk of URLs finishes processing:
+
+```
+ Batch | Progress | Start Mem | End Mem | URLs/sec | Success/Fail | Time (s) | Status
+───────────────────────────────────────────────────────────────────────────────────────────
+ 1 | 10.0% | 50.1 MB | 55.3 MB | 23.8 | 10/0 | 0.42 | Success
+ 2 | 20.0% | 55.3 MB | 60.1 MB | 24.1 | 10/0 | 0.41 | Success
+ ...
+```
+
+This display provides chunk-specific metrics:
+- **Batch**: The batch number being reported.
+- **Progress**: Overall percentage of total URLs processed *after* this batch.
+- **Start Mem / End Mem**: Memory usage before and after processing this batch (if tracked).
+- **URLs/sec**: Processing speed *for this specific batch*.
+- **Success/Fail**: Number of successful and failed URLs *in this batch*.
+- **Time (s)**: Wall-clock time taken to process *this batch*.
+- **Status**: Color-coded status for the batch outcome.
+
+### Summary Output
+
+After test completion, a final summary is displayed:
+
+```
+================================================================================
+Test Completed
+================================================================================
+Test ID: 20250418_103015
+Configuration: 100 URLs, 16 max sessions, Chunk: 10, Stream: False, Monitor: DETAILED
+Results: 100 successful, 0 failed (100 processed, 100.0% success)
+Performance: 5.85 seconds total, 17.09 URLs/second avg
+Memory Usage: Start: 50.1 MB, End: 75.3 MB, Max: 78.1 MB, Growth: 25.2 MB
+Results summary saved to reports/test_summary_20250418_103015.json
+```
+
+### HTML Report Structure (Generated by `benchmark_report.py`)
+
+(This section remains the same, assuming `benchmark_report.py` generates these)
+The benchmark report contains several sections:
+1. **Summary**: Overview of the latest test results and trends
+2. **Performance Comparison**: Charts showing throughput across tests
+3. **Memory Usage**: Detailed memory usage graphs for each test
+4. **Detailed Results**: Tabular data of all test metrics
+5. **Conclusion**: Automated analysis of performance and memory patterns
+
+### Memory Metrics
+
+(This section remains conceptually the same)
+Memory growth is the key metric for detecting leaks...
+
+### Performance Metrics
+
+(This section remains conceptually the same, though "URLs per Worker" is less relevant - focus on overall URLs/sec)
+Key performance indicators include:
+- **URLs per Second**: Higher is better (throughput)
+- **Success Rate**: Should be 100% in normal conditions
+- **Total Processing Time**: Lower is better
+- **Dispatcher Efficiency**: Observe queue lengths and wait times in the monitor (Detailed mode)
+
+### Raw Data Files
+
+Raw data is saved in the `--report-path` directory (default `./reports/`):
+
+- **JSON files** (`test_summary_*.json`): Contains the final summary for each test run.
+- **CSV files** (`memory_samples_*.csv`): Contains time-series memory samples taken during the test run.
+
+Example of reading raw data:
+```python
+import json
+import pandas as pd
+
+# Load test summary
+test_id = "20250418_103015" # Example ID
+with open(f'reports/test_summary_{test_id}.json', 'r') as f:
+ results = json.load(f)
+
+# Load memory samples
+memory_df = pd.read_csv(f'reports/memory_samples_{test_id}.csv')
+
+# Analyze memory_df (e.g., calculate growth, plot)
+if not memory_df['memory_info_mb'].isnull().all():
+ growth = memory_df['memory_info_mb'].iloc[-1] - memory_df['memory_info_mb'].iloc[0]
+ print(f"Total Memory Growth: {growth:.1f} MB")
+else:
+ print("No valid memory samples found.")
+
+print(f"Avg URLs/sec: {results['urls_processed'] / results['total_time_seconds']:.2f}")
+```
+
+## Visualization Dependencies
+
+(This section remains the same)
+For full visualization capabilities in the HTML reports generated by `benchmark_report.py`, install additional dependencies...
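+
+`benchmark_report.py` prints the package list in its import warning when these are missing; based on that, a typical install is:
+
+```bash
+pip install pandas matplotlib seaborn
+```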
+
+## Directory Structure
+
+```
+benchmarking/ # Or your top-level directory name
+├── benchmark_reports/ # Generated HTML reports (by benchmark_report.py)
+├── reports/ # Raw test result data (from test_stress_sdk.py)
+├── test_site/ # Generated test content (temporary)
+├── benchmark_report.py# Report generator
+├── run_benchmark.py # Test runner with predefined configs
+├── test_stress_sdk.py # Main stress test implementation using arun_many
+└── run_all.sh # Simple wrapper script (may need updates)
+#└── requirements.txt # Optional: Visualization dependencies for benchmark_report.py
+```
+
+## Cleanup
+
+To clean up after testing:
+
+```bash
+# Remove the test site content (if not using --keep-site)
+rm -rf test_site
+
+# Remove all raw reports and generated benchmark reports
+rm -rf reports benchmark_reports
+
+# Or use the --clean flag with run_benchmark.py
+python run_benchmark.py medium --clean
+```
+
+## Use in CI/CD
+
+(This section remains conceptually the same, just update script names)
+These tests can be integrated into CI/CD pipelines:
+```bash
+# Example CI script
+python run_benchmark.py medium --no-report # Run test without interactive report gen
+# Check exit code
+if [ $? -ne 0 ]; then echo "Stress test failed!"; exit 1; fi
+# Optionally, run report generator and check its output/metrics
+# python benchmark_report.py
+# check_report_metrics.py reports/test_summary_*.json || exit 1
+exit 0
+```
+
+## Troubleshooting
+
+- **HTTP Server Port Conflict**: Use `--port` with `run_benchmark.py` or `test_stress_sdk.py`.
+- **Memory Tracking Issues**: The `SimpleMemoryTracker` uses platform commands (`ps`, `/proc`, `tasklist`). Ensure these are available and the script has permission. If it consistently fails, memory reporting will be limited.
+- **Visualization Missing**: Related to `benchmark_report.py` and its dependencies.
+- **Site Generation Issues**: Check permissions for creating `./test_site/`. Use `--skip-generation` if you want to manage the site manually.
+- **Testing Against External Site**: Ensure the external site is running and use `--use-existing-site --port <port>`.
diff --git a/tests/memory/benchmark_report.py b/tests/memory/benchmark_report.py
new file mode 100755
index 00000000..a634f997
--- /dev/null
+++ b/tests/memory/benchmark_report.py
@@ -0,0 +1,887 @@
+#!/usr/bin/env python3
+"""
+Benchmark reporting tool for Crawl4AI stress tests.
+Generates visual reports and comparisons between test runs.
+"""
+
+import os
+import json
+import glob
+import argparse
+import sys
+from datetime import datetime
+from pathlib import Path
+from rich.console import Console
+from rich.table import Table
+from rich.panel import Panel
+
+# Initialize rich console
+console = Console()
+
+# Try to import optional visualization dependencies
+VISUALIZATION_AVAILABLE = True
+try:
+ import pandas as pd
+ import matplotlib.pyplot as plt
+ import matplotlib as mpl
+ import numpy as np
+ import seaborn as sns
+except ImportError:
+ VISUALIZATION_AVAILABLE = False
+ console.print("[yellow]Warning: Visualization dependencies not found. Install with:[/yellow]")
+ console.print("[yellow]pip install pandas matplotlib seaborn[/yellow]")
+ console.print("[yellow]Only text-based reports will be generated.[/yellow]")
+
+# Configure plotting if available
+if VISUALIZATION_AVAILABLE:
+ # Set plot style for dark theme
+ plt.style.use('dark_background')
+ sns.set_theme(style="darkgrid")
+
+ # Custom color palette based on Nord theme
+ nord_palette = ["#88c0d0", "#81a1c1", "#a3be8c", "#ebcb8b", "#bf616a", "#b48ead", "#5e81ac"]
+ sns.set_palette(nord_palette)
+
+class BenchmarkReporter:
+ """Generates visual reports and comparisons for Crawl4AI stress tests."""
+
+ def __init__(self, reports_dir="reports", output_dir="benchmark_reports"):
+ """Initialize the benchmark reporter.
+
+ Args:
+ reports_dir: Directory containing test result files
+ output_dir: Directory to save generated reports
+ """
+ self.reports_dir = Path(reports_dir)
+ self.output_dir = Path(output_dir)
+ self.output_dir.mkdir(parents=True, exist_ok=True)
+
+ # Configure matplotlib if available
+ if VISUALIZATION_AVAILABLE:
+ # Ensure the matplotlib backend works in headless environments
+ mpl.use('Agg')
+
+ # Set up styling for plots with dark theme
+ mpl.rcParams['figure.figsize'] = (12, 8)
+ mpl.rcParams['font.size'] = 12
+ mpl.rcParams['axes.labelsize'] = 14
+ mpl.rcParams['axes.titlesize'] = 16
+ mpl.rcParams['xtick.labelsize'] = 12
+ mpl.rcParams['ytick.labelsize'] = 12
+ mpl.rcParams['legend.fontsize'] = 12
+ mpl.rcParams['figure.facecolor'] = '#1e1e1e'
+ mpl.rcParams['axes.facecolor'] = '#2e3440'
+ mpl.rcParams['savefig.facecolor'] = '#1e1e1e'
+ mpl.rcParams['text.color'] = '#e0e0e0'
+ mpl.rcParams['axes.labelcolor'] = '#e0e0e0'
+ mpl.rcParams['xtick.color'] = '#e0e0e0'
+ mpl.rcParams['ytick.color'] = '#e0e0e0'
+ mpl.rcParams['grid.color'] = '#444444'
+ mpl.rcParams['figure.edgecolor'] = '#444444'
+
+ def load_test_results(self, limit=None):
+ """Load all test results from the reports directory.
+
+ Args:
+ limit: Optional limit on number of most recent tests to load
+
+ Returns:
+ Dictionary mapping test IDs to result data
+ """
+ result_files = glob.glob(str(self.reports_dir / "test_results_*.json"))
+
+ # Sort files by modification time (newest first)
+ result_files.sort(key=os.path.getmtime, reverse=True)
+
+ if limit:
+ result_files = result_files[:limit]
+
+ results = {}
+ for file_path in result_files:
+ try:
+ with open(file_path, 'r') as f:
+ data = json.load(f)
+ test_id = data.get('test_id')
+ if test_id:
+ results[test_id] = data
+
+ # Try to load the corresponding memory samples
+ csv_path = self.reports_dir / f"memory_samples_{test_id}.csv"
+ if csv_path.exists():
+ try:
+ memory_df = pd.read_csv(csv_path)
+ results[test_id]['memory_samples'] = memory_df
+ except Exception as e:
+ console.print(f"[yellow]Warning: Could not load memory samples for {test_id}: {e}[/yellow]")
+ except Exception as e:
+ console.print(f"[red]Error loading {file_path}: {e}[/red]")
+
+ console.print(f"Loaded {len(results)} test results")
+ return results
+
+ def generate_summary_table(self, results):
+ """Generate a summary table of test results.
+
+ Args:
+ results: Dictionary mapping test IDs to result data
+
+ Returns:
+ Rich Table object
+ """
+ table = Table(title="Crawl4AI Stress Test Summary", show_header=True)
+
+ # Define columns
+ table.add_column("Test ID", style="cyan")
+ table.add_column("Date", style="bright_green")
+ table.add_column("URLs", justify="right")
+ table.add_column("Workers", justify="right")
+ table.add_column("Success %", justify="right")
+ table.add_column("Time (s)", justify="right")
+ table.add_column("Mem Growth", justify="right")
+ table.add_column("URLs/sec", justify="right")
+
+ # Add rows
+ for test_id, data in sorted(results.items(), key=lambda x: x[0], reverse=True):
+ # Parse timestamp from test_id
+ try:
+ date_str = datetime.strptime(test_id, "%Y%m%d_%H%M%S").strftime("%Y-%m-%d %H:%M")
+ except:
+ date_str = "Unknown"
+
+ # Calculate success percentage
+ total_urls = data.get('url_count', 0)
+ successful = data.get('successful_urls', 0)
+ success_pct = (successful / total_urls * 100) if total_urls > 0 else 0
+
+ # Calculate memory growth if available
+ mem_growth = "N/A"
+ if 'memory_samples' in data:
+ samples = data['memory_samples']
+ if len(samples) >= 2:
+ # Try to extract numeric values from memory_info strings
+ try:
+ first_mem = float(samples.iloc[0]['memory_info'].split()[0])
+ last_mem = float(samples.iloc[-1]['memory_info'].split()[0])
+ mem_growth = f"{last_mem - first_mem:.1f} MB"
+ except:
+ pass
+
+ # Calculate URLs per second
+ time_taken = data.get('total_time_seconds', 0)
+ urls_per_sec = total_urls / time_taken if time_taken > 0 else 0
+
+ table.add_row(
+ test_id,
+ date_str,
+ str(total_urls),
+ str(data.get('workers', 'N/A')),
+ f"{success_pct:.1f}%",
+ f"{data.get('total_time_seconds', 0):.2f}",
+ mem_growth,
+ f"{urls_per_sec:.1f}"
+ )
+
+ return table
+
+ def generate_performance_chart(self, results, output_file=None):
+ """Generate a performance comparison chart.
+
+ Args:
+ results: Dictionary mapping test IDs to result data
+ output_file: File path to save the chart
+
+ Returns:
+ Path to the saved chart file or None if visualization is not available
+ """
+ if not VISUALIZATION_AVAILABLE:
+ console.print("[yellow]Skipping performance chart - visualization dependencies not available[/yellow]")
+ return None
+
+ # Extract relevant data
+ data = []
+ for test_id, result in results.items():
+ urls = result.get('url_count', 0)
+ workers = result.get('workers', 0)
+ time_taken = result.get('total_time_seconds', 0)
+ urls_per_sec = urls / time_taken if time_taken > 0 else 0
+
+ # Parse timestamp from test_id for sorting
+ try:
+ timestamp = datetime.strptime(test_id, "%Y%m%d_%H%M%S")
+ data.append({
+ 'test_id': test_id,
+ 'timestamp': timestamp,
+ 'urls': urls,
+ 'workers': workers,
+ 'time_seconds': time_taken,
+ 'urls_per_sec': urls_per_sec
+ })
+ except:
+ console.print(f"[yellow]Warning: Could not parse timestamp from {test_id}[/yellow]")
+
+ if not data:
+ console.print("[yellow]No valid data for performance chart[/yellow]")
+ return None
+
+ # Convert to DataFrame and sort by timestamp
+ df = pd.DataFrame(data)
+ df = df.sort_values('timestamp')
+
+ # Create the plot
+ fig, ax1 = plt.subplots(figsize=(12, 6))
+
+ # Plot URLs per second as bars with properly set x-axis
+ x_pos = range(len(df['test_id']))
+ bars = ax1.bar(x_pos, df['urls_per_sec'], color='#88c0d0', alpha=0.8)
+ ax1.set_ylabel('URLs per Second', color='#88c0d0')
+ ax1.tick_params(axis='y', labelcolor='#88c0d0')
+
+ # Properly set x-axis labels
+ ax1.set_xticks(x_pos)
+ ax1.set_xticklabels(df['test_id'].tolist(), rotation=45, ha='right')
+
+ # Add worker count as text on each bar
+ for i, bar in enumerate(bars):
+ height = bar.get_height()
+ workers = df.iloc[i]['workers']
+ ax1.text(i, height + 0.1,
+ f'W: {workers}', ha='center', va='bottom', fontsize=9, color='#e0e0e0')
+
+ # Add a second y-axis for total URLs
+ ax2 = ax1.twinx()
+ ax2.plot(x_pos, df['urls'], '-', color='#bf616a', alpha=0.8, markersize=6, marker='o')
+ ax2.set_ylabel('Total URLs', color='#bf616a')
+ ax2.tick_params(axis='y', labelcolor='#bf616a')
+
+ # Set title and layout
+ plt.title('Crawl4AI Performance Benchmarks')
+ plt.tight_layout()
+
+ # Save the figure
+ if output_file is None:
+ output_file = self.output_dir / "performance_comparison.png"
+ plt.savefig(output_file, dpi=100, bbox_inches='tight')
+ plt.close()
+
+ return output_file
+
+ def generate_memory_charts(self, results, output_prefix=None):
+ """Generate memory usage charts for each test.
+
+ Args:
+ results: Dictionary mapping test IDs to result data
+ output_prefix: Prefix for output file names
+
+ Returns:
+ List of paths to the saved chart files
+ """
+ if not VISUALIZATION_AVAILABLE:
+ console.print("[yellow]Skipping memory charts - visualization dependencies not available[/yellow]")
+ return []
+
+ output_files = []
+
+ for test_id, result in results.items():
+ if 'memory_samples' not in result:
+ continue
+
+ memory_df = result['memory_samples']
+
+ # Check if we have enough data points
+ if len(memory_df) < 2:
+ continue
+
+ # Try to extract numeric values from memory_info strings
+ try:
+ memory_values = []
+ for mem_str in memory_df['memory_info']:
+ # Extract the number from strings like "142.8 MB"
+ value = float(mem_str.split()[0])
+ memory_values.append(value)
+
+ memory_df['memory_mb'] = memory_values
+ except Exception as e:
+ console.print(f"[yellow]Could not parse memory values for {test_id}: {e}[/yellow]")
+ continue
+
+ # Create the plot
+ plt.figure(figsize=(10, 6))
+
+ # Plot memory usage over time
+ plt.plot(memory_df['elapsed_seconds'], memory_df['memory_mb'],
+ color='#88c0d0', marker='o', linewidth=2, markersize=4)
+
+ # Add annotations for chunk processing
+ chunk_size = result.get('chunk_size', 0)
+ url_count = result.get('url_count', 0)
+ if chunk_size > 0 and url_count > 0:
+ # Estimate chunk processing times
+ num_chunks = (url_count + chunk_size - 1) // chunk_size # Ceiling division
+ total_time = result.get('total_time_seconds', memory_df['elapsed_seconds'].max())
+ chunk_times = np.linspace(0, total_time, num_chunks + 1)[1:]
+
+ for i, time_point in enumerate(chunk_times):
+ if time_point <= memory_df['elapsed_seconds'].max():
+ plt.axvline(x=time_point, color='#4c566a', linestyle='--', alpha=0.6)
+ plt.text(time_point, memory_df['memory_mb'].min(), f'Chunk {i+1}',
+ rotation=90, verticalalignment='bottom', fontsize=8, color='#e0e0e0')
+
+ # Set labels and title
+ plt.xlabel('Elapsed Time (seconds)', color='#e0e0e0')
+ plt.ylabel('Memory Usage (MB)', color='#e0e0e0')
+ plt.title(f'Memory Usage During Test {test_id}\n({url_count} URLs, {result.get("workers", "?")} Workers)',
+ color='#e0e0e0')
+
+ # Add grid and set y-axis to start from zero
+ plt.grid(True, alpha=0.3, color='#4c566a')
+
+ # Add test metadata as text
+ info_text = (
+ f"URLs: {url_count}\n"
+ f"Workers: {result.get('workers', 'N/A')}\n"
+ f"Chunk Size: {result.get('chunk_size', 'N/A')}\n"
+ f"Total Time: {result.get('total_time_seconds', 0):.2f}s\n"
+ )
+
+ # Calculate memory growth
+ if len(memory_df) >= 2:
+ first_mem = memory_df.iloc[0]['memory_mb']
+ last_mem = memory_df.iloc[-1]['memory_mb']
+ growth = last_mem - first_mem
+ growth_rate = growth / result.get('total_time_seconds', 1)
+
+ info_text += f"Memory Growth: {growth:.1f} MB\n"
+ info_text += f"Growth Rate: {growth_rate:.2f} MB/s"
+
+ plt.figtext(0.02, 0.02, info_text, fontsize=9, color='#e0e0e0',
+ bbox=dict(facecolor='#3b4252', alpha=0.8, edgecolor='#4c566a'))
+
+ # Save the figure
+ if output_prefix is None:
+ output_file = self.output_dir / f"memory_chart_{test_id}.png"
+ else:
+ output_file = Path(f"{output_prefix}_memory_{test_id}.png")
+
+ plt.tight_layout()
+ plt.savefig(output_file, dpi=100, bbox_inches='tight')
+ plt.close()
+
+ output_files.append(output_file)
+
+ return output_files
+
+ def generate_comparison_report(self, results, title=None, output_file=None):
+ """Generate a comprehensive comparison report of multiple test runs.
+
+ Args:
+ results: Dictionary mapping test IDs to result data
+ title: Optional title for the report
+ output_file: File path to save the report
+
+ Returns:
+ Path to the saved report file
+ """
+ if not results:
+ console.print("[yellow]No results to generate comparison report[/yellow]")
+ return None
+
+ if output_file is None:
+ timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
+ output_file = self.output_dir / f"comparison_report_{timestamp}.html"
+
+ # Create data for the report
+ rows = []
+ for test_id, data in results.items():
+ # Calculate metrics
+ urls = data.get('url_count', 0)
+ workers = data.get('workers', 0)
+ successful = data.get('successful_urls', 0)
+ failed = data.get('failed_urls', 0)
+ time_seconds = data.get('total_time_seconds', 0)
+
+ # Calculate additional metrics
+ success_rate = (successful / urls) * 100 if urls > 0 else 0
+ urls_per_second = urls / time_seconds if time_seconds > 0 else 0
+ urls_per_worker = urls / workers if workers > 0 else 0
+
+ # Calculate memory growth if available
+ mem_start = None
+ mem_end = None
+ mem_growth = None
+ if 'memory_samples' in data:
+ samples = data['memory_samples']
+ if len(samples) >= 2:
+ try:
+ first_mem = float(samples.iloc[0]['memory_info'].split()[0])
+ last_mem = float(samples.iloc[-1]['memory_info'].split()[0])
+ mem_start = first_mem
+ mem_end = last_mem
+ mem_growth = last_mem - first_mem
+ except:
+ pass
+
+ # Parse timestamp from test_id
+ try:
+ timestamp = datetime.strptime(test_id, "%Y%m%d_%H%M%S")
+ except:
+ timestamp = None
+
+ rows.append({
+ 'test_id': test_id,
+ 'timestamp': timestamp,
+ 'date': timestamp.strftime("%Y-%m-%d %H:%M:%S") if timestamp else "Unknown",
+ 'urls': urls,
+ 'workers': workers,
+ 'chunk_size': data.get('chunk_size', 0),
+ 'successful': successful,
+ 'failed': failed,
+ 'success_rate': success_rate,
+ 'time_seconds': time_seconds,
+ 'urls_per_second': urls_per_second,
+ 'urls_per_worker': urls_per_worker,
+ 'memory_start': mem_start,
+ 'memory_end': mem_end,
+ 'memory_growth': mem_growth
+ })
+
+ # Sort data by timestamp if possible
+ if VISUALIZATION_AVAILABLE:
+ # Convert to DataFrame and sort by timestamp
+ df = pd.DataFrame(rows)
+ if 'timestamp' in df.columns and not df['timestamp'].isna().all():
+ df = df.sort_values('timestamp', ascending=False)
+ else:
+ # Simple sorting without pandas
+ rows.sort(key=lambda x: x.get('timestamp') or datetime.min, reverse=True)
+ df = None
+
+ # Generate HTML report
+ html = []
+ html.append('<!DOCTYPE html>')
+ html.append('<html>')
+ html.append('<head>')
+ html.append('<meta charset="utf-8">')
+ html.append(f'<title>{title or "Crawl4AI Benchmark Comparison"}</title>')
+ html.append('<style>.status-good { color: #a3be8c; } .status-warning { color: #ebcb8b; } .status-bad { color: #bf616a; }</style>')
+ html.append('</head>')
+ html.append('<body>')
+
+ # Header
+ html.append(f'<h1>{title or "Crawl4AI Benchmark Comparison"}</h1>')
+ html.append(f'<p>Report generated on {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}</p>')
+
+ # Summary section
+ html.append('<div class="section">')
+ html.append('<h2>Summary</h2>')
+ html.append('<p>This report compares the performance of Crawl4AI across multiple test runs.</p>')
+
+ # Summary metrics
+ data_available = (VISUALIZATION_AVAILABLE and df is not None and not df.empty) or (not VISUALIZATION_AVAILABLE and len(rows) > 0)
+ if data_available:
+ # Get the latest test data
+ if VISUALIZATION_AVAILABLE and df is not None and not df.empty:
+ latest_test = df.iloc[0]
+ latest_id = latest_test['test_id']
+ else:
+ latest_test = rows[0] # First row (already sorted by timestamp)
+ latest_id = latest_test['test_id']
+
+ html.append('<h3>Latest Test Results</h3>')
+ html.append('<ul>')
+ html.append(f'<li>Test ID: {latest_id}</li>')
+ html.append(f'<li>Date: {latest_test["date"]}</li>')
+ html.append(f'<li>URLs: {latest_test["urls"]}</li>')
+ html.append(f'<li>Workers: {latest_test["workers"]}</li>')
+ html.append(f'<li>Success Rate: {latest_test["success_rate"]:.1f}%</li>')
+ html.append(f'<li>Time: {latest_test["time_seconds"]:.2f} seconds</li>')
+ html.append(f'<li>Performance: {latest_test["urls_per_second"]:.1f} URLs/second</li>')
+
+ # Check memory growth (handle both pandas and dict mode)
+ memory_growth_available = False
+ if VISUALIZATION_AVAILABLE and df is not None:
+ if pd.notna(latest_test["memory_growth"]):
+ html.append(f'<li>Memory Growth: {latest_test["memory_growth"]:.1f} MB</li>')
+ memory_growth_available = True
+ else:
+ if latest_test["memory_growth"] is not None:
+ html.append(f'<li>Memory Growth: {latest_test["memory_growth"]:.1f} MB</li>')
+ memory_growth_available = True
+
+ html.append('</ul>')
+
+ # If we have more than one test, show trend
+ if (VISUALIZATION_AVAILABLE and df is not None and len(df) > 1) or (not VISUALIZATION_AVAILABLE and len(rows) > 1):
+ if VISUALIZATION_AVAILABLE and df is not None:
+ prev_test = df.iloc[1]
+ else:
+ prev_test = rows[1]
+
+ # Calculate performance change
+ perf_change = ((latest_test["urls_per_second"] / prev_test["urls_per_second"]) - 1) * 100 if prev_test["urls_per_second"] > 0 else 0
+
+ status_class = ""
+ if perf_change > 5:
+ status_class = "status-good"
+ elif perf_change < -5:
+ status_class = "status-bad"
+
+ html.append('<h3>Performance Trend</h3>')
+ html.append('<ul>')
+ html.append(f'<li class="{status_class}">Performance Change: {perf_change:+.1f}% compared to previous test</li>')
+
+ # Memory trend if available
+ memory_trend_available = False
+ if VISUALIZATION_AVAILABLE and df is not None:
+ if pd.notna(latest_test["memory_growth"]) and pd.notna(prev_test["memory_growth"]):
+ mem_change = latest_test["memory_growth"] - prev_test["memory_growth"]
+ memory_trend_available = True
+ else:
+ if latest_test["memory_growth"] is not None and prev_test["memory_growth"] is not None:
+ mem_change = latest_test["memory_growth"] - prev_test["memory_growth"]
+ memory_trend_available = True
+
+ if memory_trend_available:
+ mem_status = ""
+ if mem_change < -1: # Improved (less growth)
+ mem_status = "status-good"
+ elif mem_change > 1: # Worse (more growth)
+ mem_status = "status-bad"
+
+ html.append(f'<li class="{mem_status}">Memory Trend: {mem_change:+.1f} MB change in memory growth</li>')
+
+ html.append('</ul>')
+
+ html.append('</div>')
+
+ # Generate performance chart if visualization is available
+ if VISUALIZATION_AVAILABLE:
+ perf_chart = self.generate_performance_chart(results)
+ if perf_chart:
+ html.append('<div class="section">')
+ html.append('<h2>Performance Comparison</h2>')
+ html.append(f'<img src="{os.path.basename(perf_chart)}" alt="Performance Comparison">')
+ html.append('</div>')
+ else:
+ html.append('<div class="section">')
+ html.append('<h2>Performance Comparison</h2>')
+ html.append('<p>Charts not available - install visualization dependencies (pandas, matplotlib, seaborn) to enable.</p>')
+ html.append('</div>')
+
+ # Generate memory charts if visualization is available
+ if VISUALIZATION_AVAILABLE:
+ memory_charts = self.generate_memory_charts(results)
+ if memory_charts:
+ html.append('<div class="section">')
+ html.append('<h2>Memory Usage</h2>')
+
+ for chart in memory_charts:
+ test_id = chart.stem.split('_')[-1]
+ html.append(f'<h3>Test {test_id}</h3>')
+ html.append(f'<img src="{os.path.basename(chart)}" alt="Memory usage for test {test_id}">')
+
+ html.append('</div>')
+ else:
+ html.append('<div class="section">')
+ html.append('<h2>Memory Usage</h2>')
+ html.append('<p>Charts not available - install visualization dependencies (pandas, matplotlib, seaborn) to enable.</p>')
+ html.append('</div>')
+
+ # Detailed results table
+ html.append('<h2>Detailed Results</h2>')
+
+ # Add the results as an HTML table
+ html.append('<table>')
+
+ # Table headers
+ html.append('<tr>')
+ for col in ['Test ID', 'Date', 'URLs', 'Workers', 'Success %', 'Time (s)', 'URLs/sec', 'Mem Growth (MB)']:
+ html.append(f'<th>{col}</th>')
+ html.append('</tr>')
+
+ # Table rows - handle both pandas DataFrame and list of dicts
+ if VISUALIZATION_AVAILABLE and df is not None:
+ # Using pandas DataFrame
+ for _, row in df.iterrows():
+ html.append('<tr>')
+ html.append(f'<td>{row["test_id"]}</td>')
+ html.append(f'<td>{row["date"]}</td>')
+ html.append(f'<td>{row["urls"]}</td>')
+ html.append(f'<td>{row["workers"]}</td>')
+ html.append(f'<td>{row["success_rate"]:.1f}%</td>')
+ html.append(f'<td>{row["time_seconds"]:.2f}</td>')
+ html.append(f'<td>{row["urls_per_second"]:.1f}</td>')
+
+ # Memory growth cell
+ if pd.notna(row["memory_growth"]):
+ html.append(f'<td>{row["memory_growth"]:.1f}</td>')
+ else:
+ html.append('<td>N/A</td>')
+
+ html.append('</tr>')
+ else:
+ # Using list of dicts (when pandas is not available)
+ for row in rows:
+ html.append('<tr>')
+ html.append(f'<td>{row["test_id"]}</td>')
+ html.append(f'<td>{row["date"]}</td>')
+ html.append(f'<td>{row["urls"]}</td>')
+ html.append(f'<td>{row["workers"]}</td>')
+ html.append(f'<td>{row["success_rate"]:.1f}%</td>')
+ html.append(f'<td>{row["time_seconds"]:.2f}</td>')
+ html.append(f'<td>{row["urls_per_second"]:.1f}</td>')
+
+ # Memory growth cell
+ if row["memory_growth"] is not None:
+ html.append(f'<td>{row["memory_growth"]:.1f}</td>')
+ else:
+ html.append('<td>N/A</td>')
+
+ html.append('</tr>')
+
+ html.append('</table>')
+
+ # Conclusion section
+ html.append('<div class="section">')
+ html.append('<h2>Conclusion</h2>')
+
+ if VISUALIZATION_AVAILABLE and df is not None and not df.empty:
+ # Using pandas for statistics (when available)
+ # Calculate some overall statistics
+ avg_urls_per_sec = df['urls_per_second'].mean()
+ max_urls_per_sec = df['urls_per_second'].max()
+
+ # Determine if we have a trend
+ if len(df) > 1:
+ trend_data = df.sort_values('timestamp')
+ first_perf = trend_data.iloc[0]['urls_per_second']
+ last_perf = trend_data.iloc[-1]['urls_per_second']
+
+ perf_change = ((last_perf / first_perf) - 1) * 100 if first_perf > 0 else 0
+
+ if perf_change > 10:
+ trend_desc = "significantly improved"
+ trend_class = "status-good"
+ elif perf_change > 5:
+ trend_desc = "improved"
+ trend_class = "status-good"
+ elif perf_change < -10:
+ trend_desc = "significantly decreased"
+ trend_class = "status-bad"
+ elif perf_change < -5:
+ trend_desc = "decreased"
+ trend_class = "status-bad"
+ else:
+ trend_desc = "remained stable"
+ trend_class = ""
+
+ html.append(f'<p class="{trend_class}">Overall performance has {trend_desc} over the test period.</p>')
+
+ html.append(f'<p>Average throughput: {avg_urls_per_sec:.1f} URLs/second</p>')
+ html.append(f'<p>Maximum throughput: {max_urls_per_sec:.1f} URLs/second</p>')
+
+ # Memory leak assessment
+ if 'memory_growth' in df.columns and not df['memory_growth'].isna().all():
+ avg_growth = df['memory_growth'].mean()
+ max_growth = df['memory_growth'].max()
+
+ if avg_growth < 5:
+ leak_assessment = "No significant memory leaks detected"
+ leak_class = "status-good"
+ elif avg_growth < 10:
+ leak_assessment = "Minor memory growth observed"
+ leak_class = "status-warning"
+ else:
+ leak_assessment = "Potential memory leak detected"
+ leak_class = "status-bad"
+
+ html.append(f'<p class="{leak_class}">{leak_assessment}. Average memory growth: {avg_growth:.1f} MB per test.</p>')
+ else:
+ # Manual calculations without pandas
+ if rows:
+ # Calculate average and max throughput
+ total_urls_per_sec = sum(row['urls_per_second'] for row in rows)
+ avg_urls_per_sec = total_urls_per_sec / len(rows)
+ max_urls_per_sec = max(row['urls_per_second'] for row in rows)
+
+ html.append(f'<p>Average throughput: {avg_urls_per_sec:.1f} URLs/second</p>')
+ html.append(f'<p>Maximum throughput: {max_urls_per_sec:.1f} URLs/second</p>')
+
+ # Memory assessment (simplified without pandas)
+ growth_values = [row['memory_growth'] for row in rows if row['memory_growth'] is not None]
+ if growth_values:
+ avg_growth = sum(growth_values) / len(growth_values)
+
+ if avg_growth < 5:
+ leak_assessment = "No significant memory leaks detected"
+ leak_class = "status-good"
+ elif avg_growth < 10:
+ leak_assessment = "Minor memory growth observed"
+ leak_class = "status-warning"
+ else:
+ leak_assessment = "Potential memory leak detected"
+ leak_class = "status-bad"
+
+ html.append(f'<p class="{leak_class}">{leak_assessment}. Average memory growth: {avg_growth:.1f} MB per test.</p>')
+ else:
+ html.append('<p>No test data available for analysis.</p>')
+
+ html.append('</div>')
+
+ # Footer
+ html.append('<div class="footer">')
+ html.append('<p>Generated by Crawl4AI Benchmark Reporter</p>')
+ html.append('</div>')
+
+ html.append('</body>')
+ html.append('</html>')
+
+ # Write the HTML file
+ with open(output_file, 'w') as f:
+ f.write('\n'.join(html))
+
+ # Print a clickable link for terminals that support it (iTerm, VS Code, etc.)
+ file_url = f"file://{os.path.abspath(output_file)}"
+ console.print(f"[green]Comparison report saved to: {output_file}[/green]")
+ console.print(f"[blue underline]Click to open report: {file_url}[/blue underline]")
+ return output_file
+
+ def run(self, limit=None, output_file=None):
+ """Generate a full benchmark report.
+
+ Args:
+ limit: Optional limit on number of most recent tests to include
+ output_file: Optional output file path
+
+ Returns:
+ Path to the generated report file
+ """
+ # Load test results
+ results = self.load_test_results(limit=limit)
+
+ if not results:
+ console.print("[yellow]No test results found. Run some tests first.[/yellow]")
+ return None
+
+ # Generate and display summary table
+ summary_table = self.generate_summary_table(results)
+ console.print(summary_table)
+
+ # Generate comparison report
+ title = f"Crawl4AI Benchmark Report ({len(results)} test runs)"
+ report_file = self.generate_comparison_report(results, title=title, output_file=output_file)
+
+ if report_file:
+ console.print(f"[bold green]Report generated successfully: {report_file}[/bold green]")
+ return report_file
+ else:
+ console.print("[bold red]Failed to generate report[/bold red]")
+ return None
+
+
+def main():
+ """Main entry point for the benchmark reporter."""
+ parser = argparse.ArgumentParser(description="Generate benchmark reports for Crawl4AI stress tests")
+
+ parser.add_argument("--reports-dir", type=str, default="reports",
+ help="Directory containing test result files")
+ parser.add_argument("--output-dir", type=str, default="benchmark_reports",
+ help="Directory to save generated reports")
+ parser.add_argument("--limit", type=int, default=None,
+ help="Limit to most recent N test results")
+ parser.add_argument("--output-file", type=str, default=None,
+ help="Custom output file path for the report")
+
+ args = parser.parse_args()
+
+ # Create the benchmark reporter
+ reporter = BenchmarkReporter(reports_dir=args.reports_dir, output_dir=args.output_dir)
+
+ # Generate the report
+ report_file = reporter.run(limit=args.limit, output_file=args.output_file)
+
+ if report_file:
+ print(f"Report generated at: {report_file}")
+ return 0
+ else:
+ print("Failed to generate report")
+ return 1
+
+
+if __name__ == "__main__":
+ import sys
+ sys.exit(main())
\ No newline at end of file
diff --git a/tests/memory/cap_test.py b/tests/memory/cap_test.py
new file mode 100644
index 00000000..56d7b261
--- /dev/null
+++ b/tests/memory/cap_test.py
@@ -0,0 +1,34 @@
+#!/usr/bin/env python3
+"""
+Hammer /crawl with many concurrent requests to prove GLOBAL_SEM works.
+"""
+
+import asyncio, httpx, uuid
+
+API = "http://localhost:8020/crawl"
+URLS_PER_CALL = 1 # keep it minimal so each arun() == 1 page
+CONCURRENT_CALLS = 20 # way above your cap
+
+payload_template = {
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {"cache_mode": "BYPASS", "verbose": False},
+ }
+}
+
+async def one_call(client):
+ payload = payload_template.copy()
+ payload["urls"] = [f"https://httpbin.org/anything/{uuid.uuid4()}"]
+ r = await client.post(API, json=payload)
+ r.raise_for_status()
+ return r.json()["server_peak_memory_mb"]
+
+async def main():
+ async with httpx.AsyncClient(timeout=60) as client:
+ tasks = [asyncio.create_task(one_call(client)) for _ in range(CONCURRENT_CALLS)]
+ mem_usages = await asyncio.gather(*tasks)
+ print("Calls finished OK, server peaks reported:", mem_usages)
+
+if __name__ == "__main__":
+ asyncio.run(main())
diff --git a/tests/memory/requirements.txt b/tests/memory/requirements.txt
new file mode 100644
index 00000000..230e0e1f
--- /dev/null
+++ b/tests/memory/requirements.txt
@@ -0,0 +1,4 @@
+pandas>=1.5.0
+matplotlib>=3.5.0
+seaborn>=0.12.0
+rich>=12.0.0
\ No newline at end of file
diff --git a/tests/memory/run_benchmark.py b/tests/memory/run_benchmark.py
new file mode 100755
index 00000000..1e110ddf
--- /dev/null
+++ b/tests/memory/run_benchmark.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""
+Run a complete Crawl4AI benchmark test using test_stress_sdk.py and generate a report.
+"""
+
+import sys
+import os
+import glob
+import pathlib
+import argparse
+import subprocess
+import time
+from datetime import datetime
+
+from rich.console import Console
+from rich.text import Text
+
+console = Console()
+
+# Updated TEST_CONFIGS to use max_sessions
+TEST_CONFIGS = {
+ "quick": {"urls": 50, "max_sessions": 4, "chunk_size": 10, "description": "Quick test (50 URLs, 4 sessions)"},
+ "small": {"urls": 100, "max_sessions": 8, "chunk_size": 20, "description": "Small test (100 URLs, 8 sessions)"},
+ "medium": {"urls": 500, "max_sessions": 16, "chunk_size": 50, "description": "Medium test (500 URLs, 16 sessions)"},
+ "large": {"urls": 1000, "max_sessions": 32, "chunk_size": 100,"description": "Large test (1000 URLs, 32 sessions)"},
+ "extreme": {"urls": 2000, "max_sessions": 64, "chunk_size": 200,"description": "Extreme test (2000 URLs, 64 sessions)"},
+}
+
+# Arguments to forward directly if present in custom_args
+FORWARD_ARGS = {
+ "urls": "--urls",
+ "max_sessions": "--max-sessions",
+ "chunk_size": "--chunk-size",
+ "port": "--port",
+ "monitor_mode": "--monitor-mode",
+}
+# Boolean flags to forward if True
+FORWARD_FLAGS = {
+ "stream": "--stream",
+ "use_rate_limiter": "--use-rate-limiter",
+ "keep_server_alive": "--keep-server-alive",
+ "use_existing_site": "--use-existing-site",
+ "skip_generation": "--skip-generation",
+ "keep_site": "--keep-site",
+ "clean_reports": "--clean-reports", # Note: clean behavior is handled here, but pass flag if needed
+ "clean_site": "--clean-site", # Note: clean behavior is handled here, but pass flag if needed
+}
+
+def run_benchmark(config_name, custom_args=None, compare=True, clean=False):
+ """Runs the stress test and optionally the report generator."""
+ if config_name not in TEST_CONFIGS and config_name != "custom":
+ console.print(f"[bold red]Unknown configuration: {config_name}[/bold red]")
+ return False
+
+ # Print header
+ title = "Crawl4AI SDK Benchmark Test"
+ if config_name != "custom":
+ title += f" - {TEST_CONFIGS[config_name]['description']}"
+ else:
+ # Safely get custom args for title
+ urls = custom_args.get('urls', '?') if custom_args else '?'
+ sessions = custom_args.get('max_sessions', '?') if custom_args else '?'
+ title += f" - Custom ({urls} URLs, {sessions} sessions)"
+
+ console.print(f"\n[bold blue]{title}[/bold blue]")
+ console.print("=" * (len(title) + 4)) # Adjust underline length
+
+ console.print("\n[bold white]Preparing test...[/bold white]")
+
+ # --- Command Construction ---
+ # Use the new script name
+ cmd = ["python", "test_stress_sdk.py"]
+
+ # Apply config or custom args
+ args_to_use = {}
+ if config_name != "custom":
+ args_to_use = TEST_CONFIGS[config_name].copy()
+ # If custom args are provided (e.g., boolean flags), overlay them
+ if custom_args:
+ args_to_use.update(custom_args)
+ elif custom_args: # Custom config
+ args_to_use = custom_args.copy()
+
+ # Add arguments with values
+ for key, arg_name in FORWARD_ARGS.items():
+ if key in args_to_use:
+ cmd.extend([arg_name, str(args_to_use[key])])
+
+ # Add boolean flags
+ for key, flag_name in FORWARD_FLAGS.items():
+ if args_to_use.get(key, False): # Check if key exists and is True
+ # Special handling for clean flags - apply locally, don't forward?
+ # Decide if test_stress_sdk.py also needs --clean flags or if run_benchmark handles it.
+ # For now, let's assume run_benchmark handles cleaning based on its own --clean flag.
+ # We'll forward other flags.
+ if key not in ["clean_reports", "clean_site"]:
+ cmd.append(flag_name)
+
+ # Handle the top-level --clean flag for run_benchmark
+ if clean:
+ # Pass clean flags to the stress test script as well, if needed
+ # This assumes test_stress_sdk.py also uses --clean-reports and --clean-site
+ cmd.append("--clean-reports")
+ cmd.append("--clean-site")
+ console.print("[yellow]Applying --clean: Cleaning reports and site before test.[/yellow]")
+ # Actual cleaning logic might reside here or be delegated entirely
+
+ console.print(f"\n[bold white]Running stress test:[/bold white] {' '.join(cmd)}")
+ start = time.time()
+
+ # Execute the stress test script
+ # Use Popen to stream output
+ try:
+ proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True, encoding='utf-8', errors='replace')
+ while True:
+ line = proc.stdout.readline()
+ if not line:
+ break
+ console.print(line.rstrip()) # Print line by line
+ proc.wait() # Wait for the process to complete
+ except FileNotFoundError:
+ console.print(f"[bold red]Error: Script 'test_stress_sdk.py' not found. Make sure it's in the correct directory.[/bold red]")
+ return False
+ except Exception as e:
+ console.print(f"[bold red]Error running stress test subprocess: {e}[/bold red]")
+ return False
+
+
+ if proc.returncode != 0:
+ console.print(f"[bold red]Stress test failed with exit code {proc.returncode}[/bold red]")
+ return False
+
+ duration = time.time() - start
+ console.print(f"[bold green]Stress test completed in {duration:.1f} seconds[/bold green]")
+
+ # --- Report Generation (Optional) ---
+ if compare:
+ # Assuming benchmark_report.py exists and works with the generated reports
+ report_script = "benchmark_report.py" # Keep configurable if needed
+ report_cmd = ["python", report_script]
+ console.print(f"\n[bold white]Generating benchmark report: {' '.join(report_cmd)}[/bold white]")
+
+ # Run the report command and capture output
+ try:
+ report_proc = subprocess.run(report_cmd, capture_output=True, text=True, check=False, encoding='utf-8', errors='replace') # Use check=False to handle potential errors
+
+ # Print the captured output from benchmark_report.py
+ if report_proc.stdout:
+ console.print("\n" + report_proc.stdout)
+ if report_proc.stderr:
+ console.print("[yellow]Report generator stderr:[/yellow]\n" + report_proc.stderr)
+
+ if report_proc.returncode != 0:
+ console.print(f"[bold yellow]Benchmark report generation script '{report_script}' failed with exit code {report_proc.returncode}[/bold yellow]")
+ # Don't return False here, test itself succeeded
+ else:
+ console.print(f"[bold green]Benchmark report script '{report_script}' completed.[/bold green]")
+
+ # Find and print clickable links to the reports
+ # Assuming reports are saved in 'benchmark_reports' by benchmark_report.py
+ report_dir = "benchmark_reports"
+ if os.path.isdir(report_dir):
+ report_files = glob.glob(os.path.join(report_dir, "comparison_report_*.html"))
+ if report_files:
+ try:
+ latest_report = max(report_files, key=os.path.getctime)
+ report_path = os.path.abspath(latest_report)
+ report_url = pathlib.Path(report_path).as_uri() # Better way to create file URI
+ console.print(f"[bold cyan]Click to open report: [link={report_url}]{report_url}[/link][/bold cyan]")
+ except Exception as e:
+ console.print(f"[yellow]Could not determine latest report: {e}[/yellow]")
+
+ chart_files = glob.glob(os.path.join(report_dir, "memory_chart_*.png"))
+ if chart_files:
+ try:
+ latest_chart = max(chart_files, key=os.path.getctime)
+ chart_path = os.path.abspath(latest_chart)
+ chart_url = pathlib.Path(chart_path).as_uri()
+ console.print(f"[cyan]Memory chart: [link={chart_url}]{chart_url}[/link][/cyan]")
+ except Exception as e:
+ console.print(f"[yellow]Could not determine latest chart: {e}[/yellow]")
+ else:
+ console.print(f"[yellow]Benchmark report directory '{report_dir}' not found. Cannot link reports.[/yellow]")
+
+ except FileNotFoundError:
+ console.print(f"[bold red]Error: Report script '{report_script}' not found.[/bold red]")
+ except Exception as e:
+ console.print(f"[bold red]Error running report generation subprocess: {e}[/bold red]")
+
+
+ # Prompt to exit
+ console.print("\n[bold green]Benchmark run finished. Press Enter to exit.[/bold green]")
+ try:
+ input() # Wait for user input
+ except EOFError:
+ pass # Handle case where input is piped or unavailable
+
+ return True
+
+def main():
+ parser = argparse.ArgumentParser(description="Run a Crawl4AI SDK benchmark test and generate a report")
+
+ # --- Arguments ---
+ parser.add_argument("config", choices=list(TEST_CONFIGS) + ["custom"],
+ help="Test configuration: quick, small, medium, large, extreme, or custom")
+
+ # Arguments for 'custom' config or to override presets
+ parser.add_argument("--urls", type=int, help="Number of URLs")
+ parser.add_argument("--max-sessions", type=int, help="Max concurrent sessions (replaces --workers)")
+ parser.add_argument("--chunk-size", type=int, help="URLs per batch (for non-stream logging)")
+ parser.add_argument("--port", type=int, help="HTTP server port")
+ parser.add_argument("--monitor-mode", type=str, choices=["DETAILED", "AGGREGATED"], help="Monitor display mode")
+
+ # Boolean flags / options
+ parser.add_argument("--stream", action="store_true", help="Enable streaming results (disables batch logging)")
+ parser.add_argument("--use-rate-limiter", action="store_true", help="Enable basic rate limiter")
+ parser.add_argument("--no-report", action="store_true", help="Skip generating comparison report")
+ parser.add_argument("--clean", action="store_true", help="Clean up reports and site before running")
+ parser.add_argument("--keep-server-alive", action="store_true", help="Keep HTTP server running after test")
+ parser.add_argument("--use-existing-site", action="store_true", help="Use existing site on specified port")
+ parser.add_argument("--skip-generation", action="store_true", help="Use existing site files without regenerating")
+ parser.add_argument("--keep-site", action="store_true", help="Keep generated site files after test")
+ # Removed url_level_logging as it's implicitly handled by stream/batch mode now
+
+ args = parser.parse_args()
+
+ custom_args = {}
+
+ # Populate custom_args from explicit command-line args
+ if args.urls is not None: custom_args["urls"] = args.urls
+ if args.max_sessions is not None: custom_args["max_sessions"] = args.max_sessions
+ if args.chunk_size is not None: custom_args["chunk_size"] = args.chunk_size
+ if args.port is not None: custom_args["port"] = args.port
+ if args.monitor_mode is not None: custom_args["monitor_mode"] = args.monitor_mode
+ if args.stream: custom_args["stream"] = True
+ if args.use_rate_limiter: custom_args["use_rate_limiter"] = True
+ if args.keep_server_alive: custom_args["keep_server_alive"] = True
+ if args.use_existing_site: custom_args["use_existing_site"] = True
+ if args.skip_generation: custom_args["skip_generation"] = True
+ if args.keep_site: custom_args["keep_site"] = True
+ # Clean flags are handled by the 'clean' argument passed to run_benchmark
+
+ # Validate custom config requirements
+ if args.config == "custom":
+ required_custom = ["urls", "max_sessions", "chunk_size"]
+ missing = [f"--{arg}" for arg in required_custom if arg not in custom_args]
+ if missing:
+ console.print(f"[bold red]Error: 'custom' config requires: {', '.join(missing)}[/bold red]")
+ return 1
+
+ success = run_benchmark(
+ config_name=args.config,
+ custom_args=custom_args, # Pass all collected custom args
+ compare=not args.no_report,
+ clean=args.clean
+ )
+ return 0 if success else 1
+
+if __name__ == "__main__":
+ sys.exit(main())
\ No newline at end of file
diff --git a/tests/memory/test_docker_config_gen.py b/tests/memory/test_docker_config_gen.py
new file mode 100644
index 00000000..ae6e533c
--- /dev/null
+++ b/tests/memory/test_docker_config_gen.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python3
+"""
+Quick sanity‑check for /config/dump endpoint.
+
+Usage:
+ python test_docker_config_gen.py [http://localhost:11235]
+
+If the server isn’t running, start it first:
+ uvicorn deploy.docker.server:app --port 8020
+"""
+
+import sys, json, requests
+
+# BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:8020"
+BASE = sys.argv[1] if len(sys.argv) > 1 else "http://localhost:11235"
+URL = f"{BASE.rstrip('/')}/config/dump"
+
+CASES = [
+ # --- CrawlRunConfig variants ---
+ "CrawlerRunConfig()",
+ "CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)",
+ "CrawlerRunConfig(js_only=True, wait_until='networkidle')",
+
+ # --- BrowserConfig variants ---
+ "BrowserConfig()",
+ "BrowserConfig(headless=False, extra_args=['--disable-gpu'])",
+ "BrowserConfig(browser_mode='builtin', proxy='http://1.2.3.4:8080')",
+]
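+# Each case is POSTed as {"code": ...}; the endpoint is expected to evaluate the
+# config expression server-side and return the dumped JSON representation.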
+
+for code in CASES:
+ print("\n=== POST:", code)
+ resp = requests.post(URL, json={"code": code}, timeout=15)
+ if resp.ok:
+ print(json.dumps(resp.json(), indent=2)[:400] + "...")
+ else:
+ print("ERROR", resp.status_code, resp.text[:200])
diff --git a/tests/memory/test_stress_api.py b/tests/memory/test_stress_api.py
new file mode 100644
index 00000000..1b4f1a9c
--- /dev/null
+++ b/tests/memory/test_stress_api.py
@@ -0,0 +1,520 @@
+#!/usr/bin/env python3
+"""
+Stress test for Crawl4AI's Docker API server (/crawl and /crawl/stream endpoints).
+
+This version targets a running Crawl4AI API server, sending concurrent requests
+to test its ability to handle multiple crawl jobs simultaneously.
+It uses httpx for async HTTP requests and logs results per batch of requests,
+including server-side memory usage reported by the API.
+"""
+
+import asyncio
+import time
+import uuid
+import argparse
+import json
+import sys
+import os
+import shutil
+from typing import List, Dict, Optional, Union, AsyncGenerator, Tuple
+import httpx
+import pathlib # Import pathlib explicitly
+from rich.console import Console
+from rich.panel import Panel
+from rich.syntax import Syntax
+
+# --- Constants ---
+DEFAULT_API_URL = "http://localhost:11235" # Default port
+DEFAULT_API_URL = "http://localhost:8020" # Default port
+DEFAULT_URL_COUNT = 100
+DEFAULT_MAX_CONCURRENT_REQUESTS = 1
+DEFAULT_CHUNK_SIZE = 10
+DEFAULT_REPORT_PATH = "reports_api"
+DEFAULT_STREAM_MODE = True
+REQUEST_TIMEOUT = 180.0
+
+# Initialize Rich console
+console = Console()
+
+# --- API Health Check (Unchanged) ---
+async def check_server_health(client: httpx.AsyncClient, health_endpoint: str = "/health"):
+ """Check if the API server is healthy."""
+ console.print(f"[bold cyan]Checking API server health at {client.base_url}{health_endpoint}...[/]", end="")
+ try:
+ response = await client.get(health_endpoint, timeout=10.0)
+ response.raise_for_status()
+ health_data = response.json()
+ version = health_data.get('version', 'N/A')
+ console.print(f"[bold green] Server OK! Version: {version}[/]")
+ return True
+ except (httpx.RequestError, httpx.HTTPStatusError) as e:
+ console.print(f"\n[bold red]Server health check FAILED:[/]")
+ console.print(f"Error: {e}")
+ console.print(f"Is the server running and accessible at {client.base_url}?")
+ return False
+ except Exception as e:
+ console.print(f"\n[bold red]An unexpected error occurred during health check:[/]")
+ console.print(e)
+ return False
+
+# --- API Stress Test Class ---
+class ApiStressTest:
+ """Orchestrates the stress test by sending concurrent requests to the API."""
+
+ def __init__(
+ self,
+ api_url: str,
+ url_count: int,
+ max_concurrent_requests: int,
+ chunk_size: int,
+ report_path: str,
+ stream_mode: bool,
+ ):
+ self.api_base_url = api_url.rstrip('/')
+ self.url_count = url_count
+ self.max_concurrent_requests = max_concurrent_requests
+ self.chunk_size = chunk_size
+ self.report_path = pathlib.Path(report_path)
+ self.report_path.mkdir(parents=True, exist_ok=True)
+ self.stream_mode = stream_mode
+
+ # Ignore repo path and set it to current file path
+ self.repo_path = pathlib.Path(__file__).parent.resolve()
+
+
+ self.test_id = time.strftime("%Y%m%d_%H%M%S")
+ self.results_summary = {
+ "test_id": self.test_id, "api_url": api_url, "url_count": url_count,
+ "max_concurrent_requests": max_concurrent_requests, "chunk_size": chunk_size,
+ "stream_mode": stream_mode, "start_time": "", "end_time": "",
+ "total_time_seconds": 0, "successful_requests": 0, "failed_requests": 0,
+ "successful_urls": 0, "failed_urls": 0, "total_urls_processed": 0,
+ "total_api_calls": 0,
+ "server_memory_metrics": { # To store aggregated server memory info
+ "batch_mode_avg_delta_mb": None,
+ "batch_mode_max_delta_mb": None,
+ "stream_mode_avg_max_snapshot_mb": None,
+ "stream_mode_max_max_snapshot_mb": None,
+ "samples": [] # Store individual request memory results
+ }
+ }
+ self.http_client = httpx.AsyncClient(base_url=self.api_base_url, timeout=REQUEST_TIMEOUT, limits=httpx.Limits(max_connections=max_concurrent_requests + 5, max_keepalive_connections=max_concurrent_requests))
+
+ async def close_client(self):
+ """Close the httpx client."""
+ await self.http_client.aclose()
+
+ async def run(self) -> Dict:
+ """Run the API stress test."""
+ # No client memory tracker needed
+ urls_to_process = [f"https://httpbin.org/anything/{uuid.uuid4()}" for _ in range(self.url_count)]
+ url_chunks = [urls_to_process[i:i+self.chunk_size] for i in range(0, len(urls_to_process), self.chunk_size)]
+
+ self.results_summary["start_time"] = time.strftime("%Y-%m-%d %H:%M:%S")
+ start_time = time.time()
+
+ console.print(f"\n[bold cyan]Crawl4AI API Stress Test - {self.url_count} URLs, {self.max_concurrent_requests} concurrent requests[/bold cyan]")
+ console.print(f"[bold cyan]Target API:[/bold cyan] {self.api_base_url}, [bold cyan]Mode:[/bold cyan] {'Streaming' if self.stream_mode else 'Batch'}, [bold cyan]URLs per Request:[/bold cyan] {self.chunk_size}")
+ # Removed client memory log
+
+ semaphore = asyncio.Semaphore(self.max_concurrent_requests)
+
+ # Updated Batch logging header
+ console.print("\n[bold]API Request Batch Progress:[/bold]")
+ # Adjusted spacing and added Peak
+ console.print("[bold] Batch | Progress | SrvMem Peak / Δ|Max (MB) | Reqs/sec | S/F URLs | Time (s) | Status [/bold]")
+ # Adjust separator length if needed, looks okay for now
+ console.print("─" * 95)
+
+ # No client memory monitor task needed
+
+ tasks = []
+ total_api_calls = len(url_chunks)
+ self.results_summary["total_api_calls"] = total_api_calls
+
+ try:
+ for i, chunk in enumerate(url_chunks):
+ task = asyncio.create_task(self._make_api_request(
+ chunk=chunk,
+ batch_idx=i + 1,
+ total_batches=total_api_calls,
+ semaphore=semaphore
+ # No memory tracker passed
+ ))
+ tasks.append(task)
+
+ api_results = await asyncio.gather(*tasks)
+
+ # Process aggregated results including server memory
+ total_successful_requests = sum(1 for r in api_results if r['request_success'])
+ total_failed_requests = total_api_calls - total_successful_requests
+ total_successful_urls = sum(r['success_urls'] for r in api_results)
+ total_failed_urls = sum(r['failed_urls'] for r in api_results)
+ total_urls_processed = total_successful_urls + total_failed_urls
+
+ # Aggregate server memory metrics
+ valid_samples = [r for r in api_results if r.get('server_delta_or_max_mb') is not None] # Filter results with valid mem data
+ self.results_summary["server_memory_metrics"]["samples"] = valid_samples # Store raw samples with both peak and delta/max
+
+ if valid_samples:
+ delta_or_max_values = [r['server_delta_or_max_mb'] for r in valid_samples]
+ if self.stream_mode:
+ # Stream mode: delta_or_max holds max snapshot
+ self.results_summary["server_memory_metrics"]["stream_mode_avg_max_snapshot_mb"] = sum(delta_or_max_values) / len(delta_or_max_values)
+ self.results_summary["server_memory_metrics"]["stream_mode_max_max_snapshot_mb"] = max(delta_or_max_values)
+ else: # Batch mode
+ # delta_or_max holds delta
+ self.results_summary["server_memory_metrics"]["batch_mode_avg_delta_mb"] = sum(delta_or_max_values) / len(delta_or_max_values)
+ self.results_summary["server_memory_metrics"]["batch_mode_max_delta_mb"] = max(delta_or_max_values)
+
+ # Aggregate peak values for batch mode
+ peak_values = [r['server_peak_memory_mb'] for r in valid_samples if r.get('server_peak_memory_mb') is not None]
+ if peak_values:
+ self.results_summary["server_memory_metrics"]["batch_mode_avg_peak_mb"] = sum(peak_values) / len(peak_values)
+ self.results_summary["server_memory_metrics"]["batch_mode_max_peak_mb"] = max(peak_values)
+
+
+ self.results_summary.update({
+ "successful_requests": total_successful_requests,
+ "failed_requests": total_failed_requests,
+ "successful_urls": total_successful_urls,
+ "failed_urls": total_failed_urls,
+ "total_urls_processed": total_urls_processed,
+ })
+
+ except Exception as e:
+ console.print(f"[bold red]An error occurred during task execution: {e}[/bold red]")
+ import traceback
+ traceback.print_exc()
+ # No finally block needed for monitor task
+
+ end_time = time.time()
+ self.results_summary.update({
+ "end_time": time.strftime("%Y-%m-%d %H:%M:%S"),
+ "total_time_seconds": end_time - start_time,
+ # No client memory report
+ })
+ self._save_results()
+ return self.results_summary
+
+ async def _make_api_request(
+ self,
+ chunk: List[str],
+ batch_idx: int,
+ total_batches: int,
+ semaphore: asyncio.Semaphore
+ # No memory tracker
+ ) -> Dict:
+ """Makes a single API request for a chunk of URLs, handling concurrency and logging server memory."""
+ request_success = False
+ success_urls = 0
+ failed_urls = 0
+ status = "Pending"
+ status_color = "grey"
+ server_memory_metric = None # Store delta (batch) or max snapshot (stream)
+ server_peak_mem_mb = None # Peak server memory (batch mode); stays None if never reported
+ api_call_start_time = time.time()
+
+ async with semaphore:
+ try:
+ # No client memory sampling
+
+ endpoint = "/crawl/stream" if self.stream_mode else "/crawl"
+ payload = {
+ "urls": chunk,
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {
+ "type": "CrawlerRunConfig",
+ "params": {"cache_mode": "BYPASS", "stream": self.stream_mode}
+ }
+ }
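+ # Keep the crawler "stream" param in sync with the endpoint choice above:
+ # /crawl/stream returns ND-JSON lines, /crawl returns a single JSON body.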
+
+ if self.stream_mode:
+ max_server_mem_snapshot = 0.0 # Track max memory seen in this stream
+ async with self.http_client.stream("POST", endpoint, json=payload) as response:
+ initial_status_code = response.status_code
+ response.raise_for_status()
+
+ completed_marker_received = False
+ async for line in response.aiter_lines():
+ if line:
+ try:
+ data = json.loads(line)
+ if data.get("status") == "completed":
+ completed_marker_received = True
+ break
+ elif data.get("url"):
+ if data.get("success"): success_urls += 1
+ else: failed_urls += 1
+ # Extract server memory snapshot per result
+ mem_snapshot = data.get('server_memory_mb')
+ if mem_snapshot is not None:
+ max_server_mem_snapshot = max(max_server_mem_snapshot, float(mem_snapshot))
+ except json.JSONDecodeError:
+ console.print(f"[Batch {batch_idx}] [red]Stream decode error for line:[/red] {line}")
+ failed_urls = len(chunk)
+ break
+ request_success = completed_marker_received
+ if not request_success:
+ failed_urls = len(chunk) - success_urls
+ server_memory_metric = max_server_mem_snapshot # Use max snapshot for stream logging
+
+ else: # Batch mode
+ response = await self.http_client.post(endpoint, json=payload)
+ response.raise_for_status()
+ data = response.json()
+
+ # Extract server memory delta from the response
+ server_memory_metric = data.get('server_memory_delta_mb')
+ server_peak_mem_mb = data.get('server_peak_memory_mb')
+
+ if data.get("success") and "results" in data:
+ request_success = True
+ results_list = data.get("results", [])
+ for result_item in results_list:
+ if result_item.get("success"): success_urls += 1
+ else: failed_urls += 1
+ if len(results_list) != len(chunk):
+ console.print(f"[Batch {batch_idx}] [yellow]Warning: Result count ({len(results_list)}) doesn't match URL count ({len(chunk)})[/yellow]")
+ failed_urls = len(chunk) - success_urls
+ else:
+ request_success = False
+ failed_urls = len(chunk)
+ # Try to get memory from error detail if available
+ detail = data.get('detail')
+ if isinstance(detail, str):
+ try: detail_json = json.loads(detail)
+ except: detail_json = {}
+ elif isinstance(detail, dict):
+ detail_json = detail
+ else: detail_json = {}
+ server_peak_mem_mb = detail_json.get('server_peak_memory_mb', None)
+ server_memory_metric = detail_json.get('server_memory_delta_mb', None)
+ console.print(f"[Batch {batch_idx}] [red]API request failed:[/red] {detail_json.get('error', 'No details')}")
+
+
+ except httpx.HTTPStatusError as e:
+ request_success = False
+ failed_urls = len(chunk)
+ console.print(f"[Batch {batch_idx}] [bold red]HTTP Error {e.response.status_code}:[/] {e.request.url}")
+ try:
+ error_detail = e.response.json()
+ # Attempt to extract memory info even from error responses
+ detail_content = error_detail.get('detail', {})
+ if isinstance(detail_content, str): # Handle if detail is stringified JSON
+ try: detail_content = json.loads(detail_content)
+ except: detail_content = {}
+ server_memory_metric = detail_content.get('server_memory_delta_mb', None)
+ server_peak_mem_mb = detail_content.get('server_peak_memory_mb', None)
+ console.print(f"Response: {error_detail}")
+ except Exception:
+ console.print(f"Response Text: {e.response.text[:200]}...")
+ except httpx.RequestError as e:
+ request_success = False
+ failed_urls = len(chunk)
+ console.print(f"[Batch {batch_idx}] [bold red]Request Error:[/bold] {e.request.url} - {e}")
+ except Exception as e:
+ request_success = False
+ failed_urls = len(chunk)
+ console.print(f"[Batch {batch_idx}] [bold red]Unexpected Error:[/bold] {e}")
+ import traceback
+ traceback.print_exc()
+
+ finally:
+ api_call_time = time.time() - api_call_start_time
+ total_processed_urls = success_urls + failed_urls
+
+ if request_success and failed_urls == 0: status_color, status = "green", "Success"
+ elif request_success and success_urls > 0: status_color, status = "yellow", "Partial"
+ else: status_color, status = "red", "Failed"
+
+ current_total_urls = batch_idx * self.chunk_size
+ progress_pct = min(100.0, (current_total_urls / self.url_count) * 100)
+ reqs_per_sec = 1.0 / api_call_time if api_call_time > 0 else float('inf')
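+ # This is the inverse of this single request's latency, not an aggregate throughput figure.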
+
+ # --- New Memory Formatting ---
+ mem_display = " N/A " # Default
+ peak_mem_value = None
+ delta_or_max_value = None
+
+ if self.stream_mode:
+ # server_memory_metric holds max snapshot for stream
+ if server_memory_metric is not None:
+ mem_display = f"{server_memory_metric:.1f} (Max)"
+ delta_or_max_value = server_memory_metric # Store for aggregation
+ else: # Batch mode - expect peak and delta
+ # We need to get peak and delta from the API response
+ peak_mem_value = server_peak_mem_mb # Set from the response in batch mode; None otherwise
+ delta_value = server_memory_metric # server_memory_metric holds delta for batch
+
+ if peak_mem_value is not None and delta_value is not None:
+ mem_display = f"{peak_mem_value:.1f} / {delta_value:+.1f}"
+ delta_or_max_value = delta_value # Store delta for aggregation
+ elif peak_mem_value is not None:
+ mem_display = f"{peak_mem_value:.1f} / N/A"
+ elif delta_value is not None:
+ mem_display = f"N/A / {delta_value:+.1f}"
+ delta_or_max_value = delta_value # Store delta for aggregation
+
+ # --- Updated Print Statement with Adjusted Padding ---
+ console.print(
+ f" {batch_idx:<5} | {progress_pct:6.1f}% | {mem_display:>24} | {reqs_per_sec:8.1f} | " # Increased width for memory column
+ f"{success_urls:^7}/{failed_urls:<6} | {api_call_time:8.2f} | [{status_color}]{status:<7}[/{status_color}] " # Added trailing space
+ )
+
+ # --- Updated Return Dictionary ---
+ return_data = {
+ "batch_idx": batch_idx,
+ "request_success": request_success,
+ "success_urls": success_urls,
+ "failed_urls": failed_urls,
+ "time": api_call_time,
+ # Return both peak (if available) and delta/max
+ "server_peak_memory_mb": peak_mem_value, # Will be None for stream mode
+ "server_delta_or_max_mb": delta_or_max_value # Delta for batch, Max for stream
+ }
+ # Add back the specific batch mode delta if needed elsewhere, but delta_or_max covers it
+ # if not self.stream_mode:
+ # return_data["server_memory_delta_mb"] = delta_value
+ return return_data
+
+ # No _periodic_memory_sample needed
+
+ def _save_results(self) -> None:
+ """Saves the results summary to a JSON file."""
+ results_path = self.report_path / f"api_test_summary_{self.test_id}.json"
+ try:
+ # No client memory path to convert
+ with open(results_path, 'w', encoding='utf-8') as f:
+ json.dump(self.results_summary, f, indent=2, default=str)
+ except Exception as e:
+ console.print(f"[bold red]Failed to save results summary: {e}[/bold red]")
+
+
+# --- run_full_test Function ---
+async def run_full_test(args):
+ """Runs the full API stress test process."""
+ client = httpx.AsyncClient(base_url=args.api_url, timeout=REQUEST_TIMEOUT)
+
+ if not await check_server_health(client):
+ console.print("[bold red]Aborting test due to server health check failure.[/]")
+ await client.aclose()
+ return
+ await client.aclose()
+
+ test = ApiStressTest(
+ api_url=args.api_url,
+ url_count=args.urls,
+ max_concurrent_requests=args.max_concurrent_requests,
+ chunk_size=args.chunk_size,
+ report_path=args.report_path,
+ stream_mode=args.stream,
+ )
+ results = {}
+ try:
+ results = await test.run()
+ finally:
+ await test.close_client()
+
+ if not results:
+ console.print("[bold red]Test did not produce results.[/bold red]")
+ return
+
+ console.print("\n" + "=" * 80)
+ console.print("[bold green]API Stress Test Completed[/bold green]")
+ console.print("=" * 80)
+
+ success_rate_reqs = results["successful_requests"] / results["total_api_calls"] * 100 if results["total_api_calls"] > 0 else 0
+ success_rate_urls = results["successful_urls"] / results["url_count"] * 100 if results["url_count"] > 0 else 0
+ urls_per_second = results["total_urls_processed"] / results["total_time_seconds"] if results["total_time_seconds"] > 0 else 0
+ reqs_per_second = results["total_api_calls"] / results["total_time_seconds"] if results["total_time_seconds"] > 0 else 0
+
+
+ console.print(f"[bold cyan]Test ID:[/bold cyan] {results['test_id']}")
+ console.print(f"[bold cyan]Target API:[/bold cyan] {results['api_url']}")
+ console.print(f"[bold cyan]Configuration:[/bold cyan] {results['url_count']} URLs, {results['max_concurrent_requests']} concurrent client requests, URLs/Req: {results['chunk_size']}, Stream: {results['stream_mode']}")
+ console.print(f"[bold cyan]API Requests:[/bold cyan] {results['successful_requests']} successful, {results['failed_requests']} failed ({results['total_api_calls']} total, {success_rate_reqs:.1f}% success)")
+ console.print(f"[bold cyan]URL Processing:[/bold cyan] {results['successful_urls']} successful, {results['failed_urls']} failed ({results['total_urls_processed']} processed, {success_rate_urls:.1f}% success)")
+ console.print(f"[bold cyan]Performance:[/bold cyan] {results['total_time_seconds']:.2f}s total | Avg Reqs/sec: {reqs_per_second:.2f} | Avg URLs/sec: {urls_per_second:.2f}")
+
+ # Report Server Memory
+ mem_metrics = results.get("server_memory_metrics", {})
+ mem_samples = mem_metrics.get("samples", [])
+ if mem_samples:
+ num_samples = len(mem_samples)
+ if results['stream_mode']:
+ avg_mem = mem_metrics.get("stream_mode_avg_max_snapshot_mb")
+ max_mem = mem_metrics.get("stream_mode_max_max_snapshot_mb")
+ avg_str = f"{avg_mem:.1f}" if avg_mem is not None else "N/A"
+ max_str = f"{max_mem:.1f}" if max_mem is not None else "N/A"
+ console.print(f"[bold cyan]Server Memory (Stream):[/bold cyan] Avg Max Snapshot: {avg_str} MB | Max Max Snapshot: {max_str} MB (across {num_samples} requests)")
+ else: # Batch mode
+ avg_delta = mem_metrics.get("batch_mode_avg_delta_mb")
+ max_delta = mem_metrics.get("batch_mode_max_delta_mb")
+ avg_peak = mem_metrics.get("batch_mode_avg_peak_mb")
+ max_peak = mem_metrics.get("batch_mode_max_peak_mb")
+
+ avg_delta_str = f"{avg_delta:.1f}" if avg_delta is not None else "N/A"
+ max_delta_str = f"{max_delta:.1f}" if max_delta is not None else "N/A"
+ avg_peak_str = f"{avg_peak:.1f}" if avg_peak is not None else "N/A"
+ max_peak_str = f"{max_peak:.1f}" if max_peak is not None else "N/A"
+
+ console.print(f"[bold cyan]Server Memory (Batch):[/bold cyan] Avg Peak: {avg_peak_str} MB | Max Peak: {max_peak_str} MB | Avg Delta: {avg_delta_str} MB | Max Delta: {max_delta_str} MB (across {num_samples} requests)")
+ else:
+ console.print("[bold cyan]Server Memory:[/bold cyan] No memory data reported by server.")
+
+
+ # No client memory report
+ summary_path = pathlib.Path(args.report_path) / f"api_test_summary_{results['test_id']}.json"
+ console.print(f"[bold green]Results summary saved to {summary_path}[/bold green]")
+
+ if results["failed_requests"] > 0:
+ console.print(f"\n[bold yellow]Warning: {results['failed_requests']} API requests failed ({100-success_rate_reqs:.1f}% failure rate)[/bold yellow]")
+ if results["failed_urls"] > 0:
+ console.print(f"[bold yellow]Warning: {results['failed_urls']} URLs failed to process ({100-success_rate_urls:.1f}% URL failure rate)[/bold yellow]")
+ if results["total_urls_processed"] < results["url_count"]:
+ console.print(f"\n[bold red]Error: Only {results['total_urls_processed']} out of {results['url_count']} target URLs were processed![/bold red]")
+
+
+# --- main Function (Argument parsing mostly unchanged) ---
+def main():
+ """Main entry point for the script."""
+ parser = argparse.ArgumentParser(description="Crawl4AI API Server Stress Test")
+
+ parser.add_argument("--api-url", type=str, default=DEFAULT_API_URL, help=f"Base URL of the Crawl4AI API server (default: {DEFAULT_API_URL})")
+ parser.add_argument("--urls", type=int, default=DEFAULT_URL_COUNT, help=f"Total number of unique URLs to process via API calls (default: {DEFAULT_URL_COUNT})")
+ parser.add_argument("--max-concurrent-requests", type=int, default=DEFAULT_MAX_CONCURRENT_REQUESTS, help=f"Maximum concurrent API requests from this client (default: {DEFAULT_MAX_CONCURRENT_REQUESTS})")
+ parser.add_argument("--chunk-size", type=int, default=DEFAULT_CHUNK_SIZE, help=f"Number of URLs per API request payload (default: {DEFAULT_CHUNK_SIZE})")
+ parser.add_argument("--stream", action="store_true", default=DEFAULT_STREAM_MODE, help=f"Use the /crawl/stream endpoint instead of /crawl (default: {DEFAULT_STREAM_MODE})")
+ parser.add_argument("--report-path", type=str, default=DEFAULT_REPORT_PATH, help=f"Path to save reports and logs (default: {DEFAULT_REPORT_PATH})")
+ parser.add_argument("--clean-reports", action="store_true", help="Clean up report directory before running")
+
+ args = parser.parse_args()
+
+ console.print("[bold underline]Crawl4AI API Stress Test Configuration[/bold underline]")
+ console.print(f"API URL: {args.api_url}")
+ console.print(f"Total URLs: {args.urls}, Concurrent Client Requests: {args.max_concurrent_requests}, URLs per Request: {args.chunk_size}")
+ console.print(f"Mode: {'Streaming' if args.stream else 'Batch'}")
+ console.print(f"Report Path: {args.report_path}")
+ console.print("-" * 40)
+ if args.clean_reports: console.print("[cyan]Option: Clean reports before test[/cyan]")
+ console.print("-" * 40)
+
+ if args.clean_reports:
+ report_dir = pathlib.Path(args.report_path)
+ if report_dir.exists():
+ console.print(f"[yellow]Cleaning up reports directory: {args.report_path}[/yellow]")
+ shutil.rmtree(args.report_path)
+ report_dir.mkdir(parents=True, exist_ok=True)
+
+ try:
+ asyncio.run(run_full_test(args))
+ except KeyboardInterrupt:
+ console.print("\n[bold yellow]Test interrupted by user.[/bold yellow]")
+ except Exception as e:
+ console.print(f"\n[bold red]An unexpected error occurred:[/bold red] {e}")
+ import traceback
+ traceback.print_exc()
+
+if __name__ == "__main__":
+ # No need to modify sys.path for SimpleMemoryTracker as it's removed
+ main()
\ No newline at end of file
diff --git a/tests/memory/test_stress_api_xs.py b/tests/memory/test_stress_api_xs.py
new file mode 100644
index 00000000..27248883
--- /dev/null
+++ b/tests/memory/test_stress_api_xs.py
@@ -0,0 +1,203 @@
+"""Lite Crawl4AI API stress‑tester.
+
+✔ batch or stream mode (single unified path)
+✔ global stats + JSON summary
+✔ rich table progress
+✔ Typer CLI with presets (quick / soak)
+
+Usage examples:
+ python test_stress_api_xs.py # uses quick preset
+ python test_stress_api_xs.py soak # 5K URLs stress run
+ python test_stress_api_xs.py --urls 200 --concurrent 10 --chunk 20
+"""
+
+from __future__ import annotations
+
+import asyncio, json, time, uuid, pathlib, statistics
+from typing import List, Dict, Optional
+
+import httpx, typer
+from rich.console import Console
+from rich.table import Table
+
+# ───────────────────────── defaults / presets ──────────────────────────
+PRESETS = {
+ "quick": dict(urls=1, concurrent=1, chunk=1, stream=False),
+ "debug": dict(urls=10, concurrent=2, chunk=5, stream=False),
+ "soak": dict(urls=5000, concurrent=20, chunk=50, stream=True),
+}
+
+API_HEALTH_ENDPOINT = "/health"
+REQUEST_TIMEOUT = 180.0
+
+console = Console()
+app = typer.Typer(add_completion=False, rich_markup_mode="rich")
+
+# ───────────────────────── helpers ─────────────────────────────────────
+async def _check_health(client: httpx.AsyncClient) -> None:
+ resp = await client.get(API_HEALTH_ENDPOINT, timeout=10)
+ resp.raise_for_status()
+ console.print(f"[green]Server healthy — version {resp.json().get('version','?')}[/]")
+
+async def _iter_results(resp: httpx.Response, stream: bool):
+ """Yield result dicts from batch JSON or ND‑JSON stream."""
+ if stream:
+ async for line in resp.aiter_lines():
+ if not line:
+ continue
+ rec = json.loads(line)
+ if rec.get("status") == "completed":
+ break
+ yield rec
+ else:
+ data = resp.json()
+ for rec in data.get("results", []):
+ yield rec, data # rec + whole payload for memory delta/peak
+
+async def _consume_stream(resp: httpx.Response) -> Dict:
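+ """Tally per-URL results from an ND-JSON stream, tracking the max server memory snapshot seen."""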
+ stats = {"success_urls": 0, "failed_urls": 0, "mem_metric": 0.0}
+ async for line in resp.aiter_lines():
+ if not line:
+ continue
+ rec = json.loads(line)
+ if rec.get("status") == "completed":
+ break
+ if rec.get("success"):
+ stats["success_urls"] += 1
+ else:
+ stats["failed_urls"] += 1
+ mem = rec.get("server_memory_mb")
+ if mem is not None:
+ stats["mem_metric"] = max(stats["mem_metric"], float(mem))
+ return stats
+
+def _consume_batch(body: Dict) -> Dict:
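+ """Tally per-URL results from a batch JSON body; pull out the server memory delta and peak."""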
+ stats = {"success_urls": 0, "failed_urls": 0}
+ for rec in body.get("results", []):
+ if rec.get("success"):
+ stats["success_urls"] += 1
+ else:
+ stats["failed_urls"] += 1
+ stats["mem_metric"] = body.get("server_memory_delta_mb")
+ stats["peak"] = body.get("server_peak_memory_mb")
+ return stats
+
+async def _fetch_chunk(
+ client: httpx.AsyncClient,
+ urls: List[str],
+ stream: bool,
+ semaphore: asyncio.Semaphore,
+) -> Dict:
+ endpoint = "/crawl/stream" if stream else "/crawl"
+ payload = {
+ "urls": urls,
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+ "crawler_config": {"type": "CrawlerRunConfig",
+ "params": {"cache_mode": "BYPASS", "stream": stream}},
+ }
+
+ async with semaphore:
+ start = time.perf_counter()
+
+ if stream:
+ # ---- streaming request ----
+ async with client.stream("POST", endpoint, json=payload) as resp:
+ resp.raise_for_status()
+ stats = await _consume_stream(resp)
+ else:
+ # ---- batch request ----
+ resp = await client.post(endpoint, json=payload)
+ resp.raise_for_status()
+ stats = _consume_batch(resp.json())
+
+ stats["elapsed"] = time.perf_counter() - start
+ return stats
+
+
+# ───────────────────────── core runner ─────────────────────────────────
+async def _run(api: str, urls: int, concurrent: int, chunk: int, stream: bool, report: pathlib.Path):
+ client = httpx.AsyncClient(base_url=api, timeout=REQUEST_TIMEOUT, limits=httpx.Limits(max_connections=concurrent+5))
+ await _check_health(client)
+
+ url_list = [f"https://httpbin.org/anything/{uuid.uuid4()}" for _ in range(urls)]
+ chunks = [url_list[i:i+chunk] for i in range(0, len(url_list), chunk)]
+ sem = asyncio.Semaphore(concurrent)
+
+ table = Table(show_header=True, header_style="bold magenta")
+ table.add_column("Batch", style="dim", width=6)
+ table.add_column("Success/Fail", width=12)
+ table.add_column("Mem", width=14)
+ table.add_column("Time (s)")
+
+ agg_success = agg_fail = 0
+ deltas, peaks = [], []
+
+ start = time.perf_counter()
+ tasks = [asyncio.create_task(_fetch_chunk(client, c, stream, sem)) for c in chunks]
+ for idx, coro in enumerate(asyncio.as_completed(tasks), 1):
+ res = await coro
+ agg_success += res["success_urls"]
+ agg_fail += res["failed_urls"]
+ if res["mem_metric"] is not None:
+ deltas.append(res["mem_metric"])
+ if res["peak"] is not None:
+ peaks.append(res["peak"])
+
+ mem_txt = f"{res['mem_metric']:.1f}" if res["mem_metric"] is not None else "‑"
+ if res["peak"] is not None:
+ mem_txt = f"{res['peak']:.1f}/{mem_txt}"
+
+ table.add_row(str(idx), f"{res['success_urls']}/{res['failed_urls']}", mem_txt, f"{res['elapsed']:.2f}")
+
+ console.print(table)
+ total_time = time.perf_counter() - start
+
+ summary = {
+ "urls": urls,
+ "concurrent": concurrent,
+ "chunk": chunk,
+ "stream": stream,
+ "success_urls": agg_success,
+ "failed_urls": agg_fail,
+ "elapsed_sec": round(total_time, 2),
+ "avg_mem": round(statistics.mean(deltas), 2) if deltas else None,
+ "max_mem": max(deltas) if deltas else None,
+ "avg_peak": round(statistics.mean(peaks), 2) if peaks else None,
+ "max_peak": max(peaks) if peaks else None,
+ }
+ console.print("\n[bold green]Done:[/]" , summary)
+
+ report.mkdir(parents=True, exist_ok=True)
+ path = report / f"api_test_{int(time.time())}.json"
+ path.write_text(json.dumps(summary, indent=2))
+ console.print(f"[green]Summary → {path}")
+
+ await client.aclose()
+
+# ───────────────────────── Typer CLI ──────────────────────────────────
+@app.command()
+def main(
+ preset: str = typer.Argument("quick", help="quick / debug / soak or custom"),
+ api_url: str = typer.Option("http://localhost:8020", show_default=True),
+ urls: int = typer.Option(None, help="Total URLs to crawl"),
+ concurrent: int = typer.Option(None, help="Concurrent API requests"),
+ chunk: int = typer.Option(None, help="URLs per request"),
+ stream: bool = typer.Option(None, help="Use /crawl/stream"),
+ report: pathlib.Path = typer.Option("reports_api", help="Where to save JSON summary"),
+):
+ """Run a stress test against a running Crawl4AI API server."""
+ if preset not in PRESETS and any(v is None for v in (urls, concurrent, chunk, stream)):
+ console.print(f"[red]Unknown preset '{preset}' and custom params missing[/]")
+ raise typer.Exit(1)
+
+ cfg = PRESETS.get(preset, {})
+ urls = urls or cfg.get("urls")
+ concurrent = concurrent or cfg.get("concurrent")
+ chunk = chunk or cfg.get("chunk")
+ stream = stream if stream is not None else cfg.get("stream", False)
+
+ console.print(f"[cyan]API:[/] {api_url} | URLs: {urls} | Concurrency: {concurrent} | Chunk: {chunk} | Stream: {stream}")
+ asyncio.run(_run(api_url, urls, concurrent, chunk, stream, report))
+
+if __name__ == "__main__":
+ app()
diff --git a/tests/memory/test_stress_docker_api.py b/tests/memory/test_stress_docker_api.py
new file mode 100644
index 00000000..05b3bea8
--- /dev/null
+++ b/tests/memory/test_stress_docker_api.py
@@ -0,0 +1,129 @@
+"""
+Crawl4AI Docker API stress tester.
+
+Examples
+--------
+python test_stress_docker_api.py --urls 1000 --concurrency 32
+python test_stress_docker_api.py --urls 1000 --concurrency 32 --stream
+python test_stress_docker_api.py --base-url http://10.0.0.42:11235 --http2
+"""
+
+import argparse, asyncio, json, secrets, statistics, time
+from typing import List, Tuple
+import httpx
+from rich.console import Console
+from rich.progress import Progress, BarColumn, TimeElapsedColumn, TimeRemainingColumn
+from rich.table import Table
+
+console = Console()
+
+
+# ───────────────────────── helpers ─────────────────────────
+def make_fake_urls(n: int) -> List[str]:
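+ """Build n unique httpbin.org URLs so no two requests hit the same path (defeats caching)."""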
+ base = "https://httpbin.org/anything/"
+ return [f"{base}{secrets.token_hex(8)}" for _ in range(n)]
+
+
+async def fire(
+ client: httpx.AsyncClient, endpoint: str, payload: dict, sem: asyncio.Semaphore
+) -> Tuple[bool, float]:
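+ """POST one batch and return (success, latency_seconds); stream responses are drained line by line."""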
+ async with sem:
+ print(f"POST {endpoint} with {len(payload['urls'])} URLs")
+ t0 = time.perf_counter()
+ try:
+ if endpoint.endswith("/stream"):
+ async with client.stream("POST", endpoint, json=payload) as r:
+ r.raise_for_status()
+ async for _ in r.aiter_lines():
+ pass
+ else:
+ r = await client.post(endpoint, json=payload)
+ r.raise_for_status()
+ return True, time.perf_counter() - t0
+ except Exception:
+ return False, time.perf_counter() - t0
+
+
+def pct(lat: List[float], p: float) -> str:
+ """Return percentile string even for tiny samples."""
+ if not lat:
+ return "-"
+ if len(lat) == 1:
+ return f"{lat[0]:.2f}s"
+ lat_sorted = sorted(lat)
+ k = (p / 100) * (len(lat_sorted) - 1)
+ lo = int(k)
+ hi = min(lo + 1, len(lat_sorted) - 1)
+ frac = k - lo
+ val = lat_sorted[lo] * (1 - frac) + lat_sorted[hi] * frac
+ return f"{val:.2f}s"
+
+
+# ───────────────────────── main ─────────────────────────
+def parse_args() -> argparse.Namespace:
+ p = argparse.ArgumentParser(description="Stress test Crawl4AI Docker API")
+ p.add_argument("--urls", type=int, default=100, help="number of URLs")
+ p.add_argument("--concurrency", type=int, default=1, help="max POSTs in flight")
+ p.add_argument("--chunk-size", type=int, default=50, help="URLs per request")
+ p.add_argument("--base-url", default="http://localhost:11235", help="API root")
+ # p.add_argument("--base-url", default="http://localhost:8020", help="API root")
+ p.add_argument("--stream", action="store_true", help="use /crawl/stream")
+ p.add_argument("--http2", action="store_true", help="enable HTTP/2")
+ p.add_argument("--headless", action="store_true", default=True)
+ return p.parse_args()
+
+
+async def main() -> None:
+ args = parse_args()
+
+ urls = make_fake_urls(args.urls)
+ batches = [urls[i : i + args.chunk_size] for i in range(0, len(urls), args.chunk_size)]
+ endpoint = "/crawl/stream" if args.stream else "/crawl"
+ sem = asyncio.Semaphore(args.concurrency)
+
+ async with httpx.AsyncClient(base_url=args.base_url, http2=args.http2, timeout=None) as client:
+ with Progress(
+ "[progress.description]{task.description}",
+ BarColumn(),
+ "[progress.percentage]{task.percentage:>3.0f}%",
+ TimeElapsedColumn(),
+ TimeRemainingColumn(),
+ ) as progress:
+ task_id = progress.add_task("[cyan]bombarding…", total=len(batches))
+ tasks = []
+ for chunk in batches:
+ payload = {
+ "urls": chunk,
+ "browser_config": {"type": "BrowserConfig", "params": {"headless": args.headless}},
+ "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS", "stream": args.stream}},
+ }
+ tasks.append(asyncio.create_task(fire(client, endpoint, payload, sem)))
+
+ # Advance the bar as requests actually finish, not as they are queued
+ results = []
+ for coro in asyncio.as_completed(tasks):
+ results.append(await coro)
+ progress.advance(task_id)
+
+ ok_latencies = [dt for ok, dt in results if ok]
+ err_count = sum(1 for ok, _ in results if not ok)
+
+ table = Table(title="Docker API Stress‑Test Summary")
+ table.add_column("total", justify="right")
+ table.add_column("errors", justify="right")
+ table.add_column("p50", justify="right")
+ table.add_column("p95", justify="right")
+ table.add_column("max", justify="right")
+
+ table.add_row(
+ str(len(results)),
+ str(err_count),
+ pct(ok_latencies, 50),
+ pct(ok_latencies, 95),
+ f"{max(ok_latencies):.2f}s" if ok_latencies else "-",
+ )
+ console.print(table)
+
+
+if __name__ == "__main__":
+ try:
+ asyncio.run(main())
+ except KeyboardInterrupt:
+ console.print("\n[yellow]aborted by user[/]")
diff --git a/tests/memory/test_stress_sdk.py b/tests/memory/test_stress_sdk.py
new file mode 100644
index 00000000..14da94a4
--- /dev/null
+++ b/tests/memory/test_stress_sdk.py
@@ -0,0 +1,500 @@
+#!/usr/bin/env python3
+"""
+Stress test for Crawl4AI's arun_many and dispatcher system.
+This version uses a local HTTP server and focuses on testing
+the SDK's ability to handle multiple URLs concurrently, with per-batch logging.
+"""
+
+import asyncio
+import os
+import time
+import pathlib
+import random
+import secrets
+import argparse
+import json
+import sys
+import subprocess
+import signal
+from typing import List, Dict, Optional, Union, AsyncGenerator
+import shutil
+from rich.console import Console
+
+# Crawl4AI components
+from crawl4ai import (
+ AsyncWebCrawler,
+ CrawlerRunConfig,
+ BrowserConfig,
+ MemoryAdaptiveDispatcher,
+ CrawlerMonitor,
+ DisplayMode,
+ CrawlResult,
+ RateLimiter,
+ CacheMode,
+)
+
+# Constants
+DEFAULT_SITE_PATH = "test_site"
+DEFAULT_PORT = 8000
+DEFAULT_MAX_SESSIONS = 16
+DEFAULT_URL_COUNT = 1
+DEFAULT_CHUNK_SIZE = 1 # Define chunk size for batch logging
+DEFAULT_REPORT_PATH = "reports"
+DEFAULT_STREAM_MODE = False
+DEFAULT_MONITOR_MODE = "DETAILED"
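+
+# Example invocation (illustrative; all flags are defined in main() below):
+#   python tests/memory/test_stress_sdk.py --urls 500 --max-sessions 16 --chunk-size 50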
+
+# Initialize Rich console
+console = Console()
+
+# --- SiteGenerator Class (Unchanged) ---
+class SiteGenerator:
+ """Generates a local test site with heavy pages for stress testing."""
+
+ def __init__(self, site_path: str = DEFAULT_SITE_PATH, page_count: int = DEFAULT_URL_COUNT):
+ self.site_path = pathlib.Path(site_path)
+ self.page_count = page_count
+ self.images_dir = self.site_path / "images"
+ self.lorem_words = " ".join("lorem ipsum dolor sit amet " * 100).split()
+
+ self.html_template = """
+
+
+ Test Page {page_num}
+
+
+
+ Test Page {page_num}
+ {paragraphs}
+ {images}
+
+
+"""
+
+ def generate_site(self) -> None:
+ self.site_path.mkdir(parents=True, exist_ok=True)
+ self.images_dir.mkdir(exist_ok=True)
+ console.print(f"Generating {self.page_count} test pages...")
+ for i in range(self.page_count):
+ paragraphs = "\n".join(f"{' '.join(random.choices(self.lorem_words, k=200))}
" for _ in range(5))
+ images = "\n".join(f'
' for j in range(3))
+ page_path = self.site_path / f"page_{i}.html"
+ page_path.write_text(self.html_template.format(page_num=i, paragraphs=paragraphs, images=images), encoding="utf-8")
+ if (i + 1) % (self.page_count // 10 or 1) == 0 or i == self.page_count - 1:
+ console.print(f"Generated {i+1}/{self.page_count} pages")
+ self._create_index_page()
+ console.print(f"[bold green]Successfully generated {self.page_count} test pages in [cyan]{self.site_path}[/cyan][/bold green]")
+
+ def _create_index_page(self) -> None:
+ index_content = """Test Site IndexTest Site Index
This is an automatically generated site for testing Crawl4AI.
\n"""
+ for i in range(self.page_count):
+ index_content += f'
Test Page {i}\n'
+ index_content += """
"""
+ (self.site_path / "index.html").write_text(index_content, encoding="utf-8")
+
+# --- LocalHttpServer Class (Unchanged) ---
+class LocalHttpServer:
+ """Manages a local HTTP server for serving test pages."""
+ def __init__(self, site_path: str = DEFAULT_SITE_PATH, port: int = DEFAULT_PORT):
+ self.site_path = pathlib.Path(site_path)
+ self.port = port
+ self.process = None
+
+ def start(self) -> None:
+ if not self.site_path.exists(): raise FileNotFoundError(f"Site directory {self.site_path} does not exist")
+ console.print(f"Attempting to start HTTP server in [cyan]{self.site_path}[/cyan] on port {self.port}...")
+ try:
+ cmd = ["python", "-m", "http.server", str(self.port)]
+            # CREATE_NEW_PROCESS_GROUP lets stop() send CTRL_BREAK_EVENT on Windows.
+            creationflags = subprocess.CREATE_NEW_PROCESS_GROUP if sys.platform == 'win32' else 0
+ self.process = subprocess.Popen(cmd, cwd=str(self.site_path), stdout=subprocess.PIPE, stderr=subprocess.PIPE, creationflags=creationflags)
+ time.sleep(1.5)
+            if self.is_running():
+                console.print(f"[bold green]HTTP server started successfully (PID: {self.process.pid})[/bold green]")
+            else:
+                console.print("[bold red]Failed to start HTTP server. Checking logs...[/bold red]")
+                stdout, stderr = self.process.communicate()
+                print(stdout.decode(errors='ignore')); print(stderr.decode(errors='ignore'))
+                self.stop()
+                raise RuntimeError("HTTP server failed to start.")
+        except Exception as e:
+            console.print(f"[bold red]Error starting HTTP server: {e}[/bold red]")
+            self.stop()
+            raise
+
+    def stop(self) -> None:
+        if self.process and self.is_running():
+            console.print(f"Stopping HTTP server (PID: {self.process.pid})...")
+            try:
+                if sys.platform == 'win32':
+                    self.process.send_signal(signal.CTRL_BREAK_EVENT); time.sleep(0.5)
+                self.process.terminate()
+                try:
+                    self.process.communicate(timeout=5)
+                    console.print("[bold yellow]HTTP server stopped[/bold yellow]")
+                except subprocess.TimeoutExpired:
+                    console.print("[bold red]Server did not terminate gracefully, killing...[/bold red]")
+                    self.process.kill(); self.process.communicate()
+                    console.print("[bold yellow]HTTP server killed[/bold yellow]")
+            except Exception as e:
+                console.print(f"[bold red]Error stopping HTTP server: {e}[/bold red]")
+                self.process.kill()
+            finally:
+                self.process = None
+        elif self.process:
+            console.print("[dim]HTTP server process already stopped.[/dim]")
+            self.process = None
+
+ def is_running(self) -> bool:
+ if not self.process: return False
+ return self.process.poll() is None
+
+# --- SimpleMemoryTracker Class (Unchanged) ---
+class SimpleMemoryTracker:
+ """Basic memory tracker that doesn't rely on psutil."""
+ def __init__(self, report_path: str = DEFAULT_REPORT_PATH, test_id: Optional[str] = None):
+ self.report_path = pathlib.Path(report_path); self.report_path.mkdir(parents=True, exist_ok=True)
+ self.test_id = test_id or time.strftime("%Y%m%d_%H%M%S")
+ self.start_time = time.time(); self.memory_samples = []; self.pid = os.getpid()
+ self.csv_path = self.report_path / f"memory_samples_{self.test_id}.csv"
+ with open(self.csv_path, 'w', encoding='utf-8') as f: f.write("timestamp,elapsed_seconds,memory_info_mb\n")
+
+ def sample(self) -> Dict:
+ try:
+ memory_mb = self._get_memory_info_mb()
+ memory_str = f"{memory_mb:.1f} MB" if memory_mb is not None else "Unknown"
+ timestamp = time.time(); elapsed = timestamp - self.start_time
+ sample = {"timestamp": timestamp, "elapsed_seconds": elapsed, "memory_mb": memory_mb, "memory_str": memory_str}
+ self.memory_samples.append(sample)
+ with open(self.csv_path, 'a', encoding='utf-8') as f: f.write(f"{timestamp},{elapsed:.2f},{memory_mb if memory_mb is not None else ''}\n")
+ return sample
+        except Exception:
+            return {"memory_mb": None, "memory_str": "Error"}
+
+    def _get_memory_info_mb(self) -> Optional[float]:
+        """Best-effort RSS of this process in MB via platform tools (no psutil)."""
+        pid_str = str(self.pid)
+        try:
+            if sys.platform == 'darwin':
+                result = subprocess.run(["ps", "-o", "rss=", "-p", pid_str], capture_output=True, text=True, check=True, encoding='utf-8')
+                return int(result.stdout.strip()) / 1024.0
+            elif sys.platform == 'linux':
+                with open(f"/proc/{pid_str}/status", encoding='utf-8') as f:
+                    for line in f:
+                        if line.startswith("VmRSS:"):
+                            return int(line.split()[1]) / 1024.0
+                return None
+            elif sys.platform == 'win32':
+                result = subprocess.run(["tasklist", "/fi", f"PID eq {pid_str}", "/fo", "csv", "/nh"], capture_output=True, text=True, check=True, encoding='cp850', errors='ignore')
+                parts = result.stdout.strip().split('","')
+                if len(parts) < 5:
+                    return None
+                return int(parts[4].strip().replace('"', '').replace(' K', '').replace(',', '')) / 1024.0
+            return None
+        except Exception:
+            return None  # any failure simply means "memory unknown"
+
+    def get_report(self) -> Dict:
+        if not self.memory_samples:
+            return {"error": "No memory samples collected"}
+        total_time = time.time() - self.start_time
+        valid_samples = [s['memory_mb'] for s in self.memory_samples if s['memory_mb'] is not None]
+        start_mem = valid_samples[0] if valid_samples else None
+        end_mem = valid_samples[-1] if valid_samples else None
+        max_mem = max(valid_samples) if valid_samples else None
+        avg_mem = sum(valid_samples) / len(valid_samples) if valid_samples else None
+        growth = (end_mem - start_mem) if start_mem is not None and end_mem is not None else None
+        return {"test_id": self.test_id, "total_time_seconds": total_time, "sample_count": len(self.memory_samples),
+                "valid_sample_count": len(valid_samples), "csv_path": str(self.csv_path), "platform": sys.platform,
+                "start_memory_mb": start_mem, "end_memory_mb": end_mem, "max_memory_mb": max_mem,
+                "average_memory_mb": avg_mem, "memory_growth_mb": growth}
+
+
+# --- CrawlerStressTest Class (Refactored for Per-Batch Logging) ---
+class CrawlerStressTest:
+ """Orchestrates the stress test using arun_many per chunk and a dispatcher."""
+
+ def __init__(
+ self,
+ url_count: int = DEFAULT_URL_COUNT,
+ port: int = DEFAULT_PORT,
+ max_sessions: int = DEFAULT_MAX_SESSIONS,
+ chunk_size: int = DEFAULT_CHUNK_SIZE, # Added chunk_size
+ report_path: str = DEFAULT_REPORT_PATH,
+ stream_mode: bool = DEFAULT_STREAM_MODE,
+ monitor_mode: str = DEFAULT_MONITOR_MODE,
+ use_rate_limiter: bool = False
+ ):
+ self.url_count = url_count
+ self.server_port = port
+ self.max_sessions = max_sessions
+ self.chunk_size = chunk_size # Store chunk size
+ self.report_path = pathlib.Path(report_path)
+ self.report_path.mkdir(parents=True, exist_ok=True)
+ self.stream_mode = stream_mode
+ self.monitor_mode = DisplayMode[monitor_mode.upper()]
+ self.use_rate_limiter = use_rate_limiter
+
+ self.test_id = time.strftime("%Y%m%d_%H%M%S")
+ self.results_summary = {
+ "test_id": self.test_id, "url_count": url_count, "max_sessions": max_sessions,
+ "chunk_size": chunk_size, "stream_mode": stream_mode, "monitor_mode": monitor_mode,
+ "rate_limiter_used": use_rate_limiter, "start_time": "", "end_time": "",
+ "total_time_seconds": 0, "successful_urls": 0, "failed_urls": 0,
+ "urls_processed": 0, "chunks_processed": 0
+ }
+
+ async def run(self) -> Dict:
+ """Run the stress test and return results."""
+ memory_tracker = SimpleMemoryTracker(report_path=self.report_path, test_id=self.test_id)
+ urls = [f"http://localhost:{self.server_port}/page_{i}.html" for i in range(self.url_count)]
+ # Split URLs into chunks based on self.chunk_size
+ url_chunks = [urls[i:i+self.chunk_size] for i in range(0, len(urls), self.chunk_size)]
+
+ self.results_summary["start_time"] = time.strftime("%Y-%m-%d %H:%M:%S")
+ start_time = time.time()
+
+ config = CrawlerRunConfig(
+ wait_for_images=False, verbose=False,
+ stream=self.stream_mode, # Still pass stream mode, affects arun_many return type
+ cache_mode=CacheMode.BYPASS
+ )
+
+ total_successful_urls = 0
+ total_failed_urls = 0
+ total_urls_processed = 0
+ start_memory_sample = memory_tracker.sample()
+ start_memory_str = start_memory_sample.get("memory_str", "Unknown")
+
+        # Live CrawlerMonitor display is currently disabled; uncomment to re-enable:
+        # monitor = CrawlerMonitor(display_mode=self.monitor_mode, total_urls=self.url_count)
+        monitor = None
+ rate_limiter = RateLimiter(base_delay=(0.1, 0.3)) if self.use_rate_limiter else None
+ dispatcher = MemoryAdaptiveDispatcher(max_session_permit=self.max_sessions, monitor=monitor, rate_limiter=rate_limiter)
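+        # max_session_permit caps concurrent crawl sessions; the dispatcher is
+        # expected to throttle further under memory pressure (hence the name).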
+
+ console.print(f"\n[bold cyan]Crawl4AI Stress Test - {self.url_count} URLs, {self.max_sessions} max sessions[/bold cyan]")
+ console.print(f"[bold cyan]Mode:[/bold cyan] {'Streaming' if self.stream_mode else 'Batch'}, [bold cyan]Monitor:[/bold cyan] {self.monitor_mode.name}, [bold cyan]Chunk Size:[/bold cyan] {self.chunk_size}")
+ console.print(f"[bold cyan]Initial Memory:[/bold cyan] {start_memory_str}")
+
+ # Print batch log header only if not streaming
+ if not self.stream_mode:
+ console.print("\n[bold]Batch Progress:[/bold] (Monitor below shows overall progress)")
+ console.print("[bold] Batch | Progress | Start Mem | End Mem | URLs/sec | Success/Fail | Time (s) | Status [/bold]")
+ console.print("─" * 90)
+
+ monitor_task = asyncio.create_task(self._periodic_memory_sample(memory_tracker, 2.0))
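+        # Background task samples process memory every 2 s for the CSV report.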
+
+ try:
+            async with AsyncWebCrawler(config=BrowserConfig(verbose=False)) as crawler:
+ # Process URLs chunk by chunk
+ for chunk_idx, url_chunk in enumerate(url_chunks):
+ batch_start_time = time.time()
+ chunk_success = 0
+ chunk_failed = 0
+
+ # Sample memory before the chunk
+ start_mem_sample = memory_tracker.sample()
+ start_mem_str = start_mem_sample.get("memory_str", "Unknown")
+
+ # --- Call arun_many for the current chunk ---
+ try:
+ # Note: dispatcher/monitor persist across calls
+ results_gen_or_list: Union[AsyncGenerator[CrawlResult, None], List[CrawlResult]] = \
+ await crawler.arun_many(
+ urls=url_chunk,
+ config=config,
+ dispatcher=dispatcher # Reuse the same dispatcher
+ )
+
+ if self.stream_mode:
+ # Process stream results if needed, but batch logging is less relevant
+ async for result in results_gen_or_list:
+ total_urls_processed += 1
+ if result.success: chunk_success += 1
+ else: chunk_failed += 1
+ # In stream mode, batch summary isn't as meaningful here
+ # We could potentially track completion per chunk async, but it's complex
+
+ else: # Batch mode
+ # Process the list of results for this chunk
+ for result in results_gen_or_list:
+ total_urls_processed += 1
+ if result.success: chunk_success += 1
+ else: chunk_failed += 1
+
+ except Exception as e:
+ console.print(f"[bold red]Error processing chunk {chunk_idx+1}: {e}[/bold red]")
+ chunk_failed = len(url_chunk) # Assume all failed in the chunk on error
+ total_urls_processed += len(url_chunk) # Count them as processed (failed)
+
+ # --- Log batch results (only if not streaming) ---
+ if not self.stream_mode:
+ batch_time = time.time() - batch_start_time
+ urls_per_sec = len(url_chunk) / batch_time if batch_time > 0 else 0
+ end_mem_sample = memory_tracker.sample()
+ end_mem_str = end_mem_sample.get("memory_str", "Unknown")
+
+ progress_pct = (total_urls_processed / self.url_count) * 100
+
+ if chunk_failed == 0: status_color, status = "green", "Success"
+ elif chunk_success == 0: status_color, status = "red", "Failed"
+ else: status_color, status = "yellow", "Partial"
+
+ console.print(
+ f" {chunk_idx+1:<5} | {progress_pct:6.1f}% | {start_mem_str:>9} | {end_mem_str:>9} | {urls_per_sec:8.1f} | "
+ f"{chunk_success:^7}/{chunk_failed:<6} | {batch_time:8.2f} | [{status_color}]{status:<7}[/{status_color}]"
+ )
+
+ # Accumulate totals
+ total_successful_urls += chunk_success
+ total_failed_urls += chunk_failed
+ self.results_summary["chunks_processed"] += 1
+
+ # Optional small delay between starting chunks if needed
+ # await asyncio.sleep(0.1)
+
+ except Exception as e:
+ console.print(f"[bold red]An error occurred during the main crawl loop: {e}[/bold red]")
+ finally:
+            if not monitor_task.done():
+ monitor_task.cancel()
+ try: await monitor_task
+ except asyncio.CancelledError: pass
+
+ end_time = time.time()
+ self.results_summary.update({
+ "end_time": time.strftime("%Y-%m-%d %H:%M:%S"),
+ "total_time_seconds": end_time - start_time,
+ "successful_urls": total_successful_urls,
+ "failed_urls": total_failed_urls,
+ "urls_processed": total_urls_processed,
+ "memory": memory_tracker.get_report()
+ })
+ self._save_results()
+ return self.results_summary
+
+ async def _periodic_memory_sample(self, tracker: SimpleMemoryTracker, interval: float):
+ """Background task to sample memory periodically."""
+ while True:
+ tracker.sample()
+ try:
+ await asyncio.sleep(interval)
+ except asyncio.CancelledError:
+ break # Exit loop on cancellation
+
+ def _save_results(self) -> None:
+ results_path = self.report_path / f"test_summary_{self.test_id}.json"
+ try:
+ with open(results_path, 'w', encoding='utf-8') as f: json.dump(self.results_summary, f, indent=2, default=str)
+ # console.print(f"\n[bold green]Results summary saved to {results_path}[/bold green]") # Moved summary print to run_full_test
+ except Exception as e: console.print(f"[bold red]Failed to save results summary: {e}[/bold red]")
+
+
+# --- run_full_test Function (Adjusted) ---
+async def run_full_test(args):
+ """Run the complete test process from site generation to crawling."""
+ server = None
+ site_generated = False
+
+ # --- Site Generation --- (Same as before)
+    if not args.use_existing_site and not args.skip_generation:
+        if os.path.exists(args.site_path):
+            console.print(f"[yellow]Removing existing site directory: {args.site_path}[/yellow]")
+            shutil.rmtree(args.site_path)
+        site_generator = SiteGenerator(site_path=args.site_path, page_count=args.urls)
+        site_generator.generate_site()
+        site_generated = True
+    elif args.use_existing_site:
+        console.print(f"[cyan]Using existing site assumed to be running on port {args.port}[/cyan]")
+    elif args.skip_generation:
+        console.print(f"[cyan]Skipping site generation, using existing directory: {args.site_path}[/cyan]")
+        if not os.path.isdir(args.site_path):
+            console.print(f"[bold red]Error: Site path '{args.site_path}' does not exist or is not a directory.[/bold red]")
+            return
+
+ # --- Start Local Server --- (Same as before)
+ server_started = False
+ if not args.use_existing_site:
+ server = LocalHttpServer(site_path=args.site_path, port=args.port)
+        try:
+            server.start()
+            server_started = True
+        except Exception as e:
+            console.print(f"[bold red]Failed to start local server: {e}. Aborting test.[/bold red]")
+            if site_generated and not args.keep_site:
+                console.print(f"[yellow]Cleaning up generated site: {args.site_path}[/yellow]")
+                shutil.rmtree(args.site_path)
+            return
+
+ try:
+ # --- Run the Stress Test ---
+ test = CrawlerStressTest(
+ url_count=args.urls,
+ port=args.port,
+ max_sessions=args.max_sessions,
+ chunk_size=args.chunk_size, # Pass chunk_size
+ report_path=args.report_path,
+ stream_mode=args.stream,
+ monitor_mode=args.monitor_mode,
+ use_rate_limiter=args.use_rate_limiter
+ )
+ results = await test.run() # Run the test which now handles chunks internally
+
+ # --- Print Summary ---
+ console.print("\n" + "=" * 80)
+ console.print("[bold green]Test Completed[/bold green]")
+ console.print("=" * 80)
+
+ # (Summary printing logic remains largely the same)
+ success_rate = results["successful_urls"] / results["url_count"] * 100 if results["url_count"] > 0 else 0
+ urls_per_second = results["urls_processed"] / results["total_time_seconds"] if results["total_time_seconds"] > 0 else 0
+
+ console.print(f"[bold cyan]Test ID:[/bold cyan] {results['test_id']}")
+ console.print(f"[bold cyan]Configuration:[/bold cyan] {results['url_count']} URLs, {results['max_sessions']} sessions, Chunk: {results['chunk_size']}, Stream: {results['stream_mode']}, Monitor: {results['monitor_mode']}")
+ console.print(f"[bold cyan]Results:[/bold cyan] {results['successful_urls']} successful, {results['failed_urls']} failed ({results['urls_processed']} processed, {success_rate:.1f}% success)")
+ console.print(f"[bold cyan]Performance:[/bold cyan] {results['total_time_seconds']:.2f} seconds total, {urls_per_second:.2f} URLs/second avg")
+
+ mem_report = results.get("memory", {})
+ mem_info_str = "Memory tracking data unavailable."
+ if mem_report and not mem_report.get("error"):
+ start_mb = mem_report.get('start_memory_mb'); end_mb = mem_report.get('end_memory_mb'); max_mb = mem_report.get('max_memory_mb'); growth_mb = mem_report.get('memory_growth_mb')
+ mem_parts = []
+ if start_mb is not None: mem_parts.append(f"Start: {start_mb:.1f} MB")
+ if end_mb is not None: mem_parts.append(f"End: {end_mb:.1f} MB")
+ if max_mb is not None: mem_parts.append(f"Max: {max_mb:.1f} MB")
+ if growth_mb is not None: mem_parts.append(f"Growth: {growth_mb:.1f} MB")
+ if mem_parts: mem_info_str = ", ".join(mem_parts)
+ csv_path = mem_report.get('csv_path')
+ if csv_path: console.print(f"[dim]Memory samples saved to: {csv_path}[/dim]")
+
+ console.print(f"[bold cyan]Memory Usage:[/bold cyan] {mem_info_str}")
+ console.print(f"[bold green]Results summary saved to {results['memory']['csv_path'].replace('memory_samples', 'test_summary').replace('.csv', '.json')}[/bold green]") # Infer summary path
+
+
+ if results["failed_urls"] > 0: console.print(f"\n[bold yellow]Warning: {results['failed_urls']} URLs failed to process ({100-success_rate:.1f}% failure rate)[/bold yellow]")
+ if results["urls_processed"] < results["url_count"]: console.print(f"\n[bold red]Error: Only {results['urls_processed']} out of {results['url_count']} URLs were processed![/bold red]")
+
+
+ finally:
+ # --- Stop Server / Cleanup --- (Same as before)
+        if server_started and server and not args.keep_server_alive:
+            server.stop()
+        elif server_started and server and args.keep_server_alive:
+            console.print(f"[bold cyan]Server is kept running on port {args.port}. Press Ctrl+C to stop it.[/bold cyan]")
+            try:
+                await asyncio.Future()  # block until interrupted
+            except (KeyboardInterrupt, asyncio.CancelledError):
+                # Under asyncio.run(), Ctrl+C typically surfaces here as task cancellation.
+                console.print("\n[bold yellow]Stopping server due to user interrupt...[/bold yellow]")
+                server.stop()
+
+        if site_generated and not args.keep_site:
+            console.print(f"[yellow]Cleaning up generated site: {args.site_path}[/yellow]")
+            shutil.rmtree(args.site_path)
+        elif args.clean_site and os.path.exists(args.site_path):
+            console.print(f"[yellow]Cleaning up site directory as requested: {args.site_path}[/yellow]")
+            shutil.rmtree(args.site_path)
+
+
+# --- main Function (Added chunk_size argument) ---
+def main():
+ """Main entry point for the script."""
+ parser = argparse.ArgumentParser(description="Crawl4AI SDK High Volume Stress Test using arun_many")
+
+ # Test parameters
+ parser.add_argument("--urls", type=int, default=DEFAULT_URL_COUNT, help=f"Number of URLs to test (default: {DEFAULT_URL_COUNT})")
+ parser.add_argument("--max-sessions", type=int, default=DEFAULT_MAX_SESSIONS, help=f"Maximum concurrent crawling sessions (default: {DEFAULT_MAX_SESSIONS})")
+ parser.add_argument("--chunk-size", type=int, default=DEFAULT_CHUNK_SIZE, help=f"Number of URLs per batch for logging (default: {DEFAULT_CHUNK_SIZE})") # Added
+ parser.add_argument("--stream", action="store_true", default=DEFAULT_STREAM_MODE, help=f"Enable streaming mode (disables batch logging) (default: {DEFAULT_STREAM_MODE})")
+ parser.add_argument("--monitor-mode", type=str, default=DEFAULT_MONITOR_MODE, choices=["DETAILED", "AGGREGATED"], help=f"Display mode for the live monitor (default: {DEFAULT_MONITOR_MODE})")
+ parser.add_argument("--use-rate-limiter", action="store_true", default=False, help="Enable a basic rate limiter (default: False)")
+
+ # Environment parameters
+ parser.add_argument("--site-path", type=str, default=DEFAULT_SITE_PATH, help=f"Path to generate/use the test site (default: {DEFAULT_SITE_PATH})")
+ parser.add_argument("--port", type=int, default=DEFAULT_PORT, help=f"Port for the local HTTP server (default: {DEFAULT_PORT})")
+ parser.add_argument("--report-path", type=str, default=DEFAULT_REPORT_PATH, help=f"Path to save reports and logs (default: {DEFAULT_REPORT_PATH})")
+
+ # Site/Server management
+ parser.add_argument("--skip-generation", action="store_true", help="Use existing test site folder without regenerating")
+ parser.add_argument("--use-existing-site", action="store_true", help="Do not generate site or start local server; assume site exists on --port")
+ parser.add_argument("--keep-server-alive", action="store_true", help="Keep the local HTTP server running after test")
+ parser.add_argument("--keep-site", action="store_true", help="Keep the generated test site files after test")
+ parser.add_argument("--clean-reports", action="store_true", help="Clean up report directory before running")
+ parser.add_argument("--clean-site", action="store_true", help="Clean up site directory before running (if generating) or after")
+
+ args = parser.parse_args()
+
+ # Display config
+ console.print("[bold underline]Crawl4AI SDK Stress Test Configuration[/bold underline]")
+ console.print(f"URLs: {args.urls}, Max Sessions: {args.max_sessions}, Chunk Size: {args.chunk_size}") # Added chunk size
+ console.print(f"Mode: {'Streaming' if args.stream else 'Batch'}, Monitor: {args.monitor_mode}, Rate Limit: {args.use_rate_limiter}")
+ console.print(f"Site Path: {args.site_path}, Port: {args.port}, Report Path: {args.report_path}")
+ console.print("-" * 40)
+ # (Rest of config display and cleanup logic is the same)
+ if args.use_existing_site: console.print("[cyan]Mode: Using existing external site/server[/cyan]")
+ elif args.skip_generation: console.print("[cyan]Mode: Using existing site files, starting local server[/cyan]")
+ else: console.print("[cyan]Mode: Generating site files, starting local server[/cyan]")
+ if args.keep_server_alive: console.print("[cyan]Option: Keep server alive after test[/cyan]")
+ if args.keep_site: console.print("[cyan]Option: Keep site files after test[/cyan]")
+ if args.clean_reports: console.print("[cyan]Option: Clean reports before test[/cyan]")
+ if args.clean_site: console.print("[cyan]Option: Clean site directory[/cyan]")
+ console.print("-" * 40)
+
+    if args.clean_reports:
+        if os.path.exists(args.report_path):
+            console.print(f"[yellow]Cleaning up reports directory: {args.report_path}[/yellow]")
+            shutil.rmtree(args.report_path)
+        os.makedirs(args.report_path, exist_ok=True)
+    if args.clean_site and not args.use_existing_site:
+        if os.path.exists(args.site_path):
+            console.print(f"[yellow]Cleaning up site directory as requested: {args.site_path}[/yellow]")
+            shutil.rmtree(args.site_path)
+
+ # Run
+    try:
+        asyncio.run(run_full_test(args))
+    except KeyboardInterrupt:
+        console.print("\n[bold yellow]Test interrupted by user.[/bold yellow]")
+    except Exception as e:
+        console.print(f"\n[bold red]An unexpected error occurred:[/bold red] {e}"); import traceback; traceback.print_exc()
+
+if __name__ == "__main__":
+ main()
\ No newline at end of file