fix(browser): correct headless mode default behavior

Modify BrowserConfig to respect an explicitly passed headless parameter instead of forcing it to True. Bump the version to 0.6.2 and clean up code formatting in the examples.

BREAKING CHANGE: BrowserConfig no longer overrides an explicit headless=False with True; headless=True now applies only when the parameter is omitted.
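The root cause is visible in the BrowserConfig hunk below: the constructor assigned "self.headless = headless or True", and since "False or True" evaluates to True, an explicit headless=False was silently discarded. A minimal standalone sketch of the before/after semantics (illustrative helpers, not the library source):

def old_headless(headless: bool = True) -> bool:
    # Old behavior: "headless or True" coerces any falsy value to True,
    # so headful mode was unreachable.
    return headless or True

def new_headless(headless: bool = True) -> bool:
    # New behavior: the explicit value is respected.
    return headless

assert old_headless(headless=False) is True   # the bug
assert new_headless(headless=False) is False  # the fix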
UncleCode
2025-04-26 21:09:50 +08:00
parent ccec40ed17
commit 2140d9aca4
4 changed files with 404 additions and 211 deletions

View File

@@ -1,3 +1,3 @@
 # crawl4ai/_version.py
-__version__ = "0.6.1"
+__version__ = "0.6.2"

View File

@@ -427,7 +427,7 @@ class BrowserConfig:
         host: str = "localhost",
     ):
         self.browser_type = browser_type
-        self.headless = headless or True
+        self.headless = headless
         self.browser_mode = browser_mode
         self.use_managed_browser = use_managed_browser
         self.cdp_url = cdp_url
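A quick way to exercise the fix, assuming the top-level crawl4ai import path used by the examples in this commit (a sketch, not a test shipped with the repository):

from crawl4ai import BrowserConfig

cfg = BrowserConfig(headless=False)
assert cfg.headless is False  # before 0.6.2 this attribute was forced to True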

View File

@@ -21,17 +21,20 @@ console = Console()
 BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020")
 BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235")
 # Target URLs
+SIMPLE_URL = "https://example.com"  # For demo purposes
 SIMPLE_URL = "https://httpbin.org/html"
 LINKS_URL = "https://httpbin.org/links/10/0"
 FORMS_URL = "https://httpbin.org/forms/post"  # For JS demo
 BOOKS_URL = "http://books.toscrape.com/"  # For CSS extraction
 PYTHON_URL = "https://python.org"  # For deeper crawl
 # Use the same sample site as deep crawl tests for consistency
-DEEP_CRAWL_BASE_URL = os.getenv("DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/")
+DEEP_CRAWL_BASE_URL = os.getenv(
+    "DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/")
 DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com"
 # --- Helper Functions ---
 async def check_server_health(client: httpx.AsyncClient):
     """Check if the server is healthy before running tests."""
     console.print("[bold cyan]Checking server health...[/]", end="")
@@ -39,7 +42,8 @@ async def check_server_health(client: httpx.AsyncClient):
         response = await client.get("/health", timeout=10.0)
         response.raise_for_status()
         health_data = response.json()
-        console.print(f"[bold green] Server OK! Version: {health_data.get('version', 'N/A')}[/]")
+        console.print(
+            f"[bold green] Server OK! Version: {health_data.get('version', 'N/A')}[/]")
         return True
     except (httpx.RequestError, httpx.HTTPStatusError) as e:
         console.print(f"\n[bold red]Server health check FAILED:[/]")
@@ -47,10 +51,12 @@ async def check_server_health(client: httpx.AsyncClient):
console.print(f"Is the server running at {BASE_URL}?") console.print(f"Is the server running at {BASE_URL}?")
return False return False
except Exception as e: except Exception as e:
console.print(f"\n[bold red]An unexpected error occurred during health check:[/]") console.print(
f"\n[bold red]An unexpected error occurred during health check:[/]")
console.print(e) console.print(e)
return False return False
def print_payload(payload: Dict[str, Any]): def print_payload(payload: Dict[str, Any]):
"""Prints the JSON payload nicely with a dark theme.""" """Prints the JSON payload nicely with a dark theme."""
syntax = Syntax( syntax = Syntax(
@@ -60,7 +66,9 @@ def print_payload(payload: Dict[str, Any]):
         line_numbers=False,
         word_wrap=True  # Added word wrap for potentially long payloads
     )
-    console.print(Panel(syntax, title="Request Payload", border_style="blue", expand=False))
+    console.print(Panel(syntax, title="Request Payload",
+                  border_style="blue", expand=False))
 def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Results Summary", max_items: int = 3):
     """Prints a concise summary of crawl results."""
@@ -68,11 +76,13 @@ def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Resu
console.print(f"[yellow]{title}: No results received.[/]") console.print(f"[yellow]{title}: No results received.[/]")
return return
console.print(Panel(f"[bold]{title}[/]", border_style="green", expand=False)) console.print(Panel(f"[bold]{title}[/]",
border_style="green", expand=False))
count = 0 count = 0
for result in results: for result in results:
if count >= max_items: if count >= max_items:
console.print(f"... (showing first {max_items} of {len(results)} results)") console.print(
f"... (showing first {max_items} of {len(results)} results)")
break break
count += 1 count += 1
success_icon = "[green]✔[/]" if result.get('success') else "[red]✘[/]" success_icon = "[green]✔[/]" if result.get('success') else "[red]✘[/]"
@@ -81,14 +91,16 @@ def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Resu
content_info = "" content_info = ""
if result.get('extracted_content'): if result.get('extracted_content'):
content_str = json.dumps(result['extracted_content']) content_str = json.dumps(result['extracted_content'])
snippet = (content_str[:70] + '...') if len(content_str) > 70 else content_str snippet = (
content_str[:70] + '...') if len(content_str) > 70 else content_str
content_info = f" | Extracted: [cyan]{snippet}[/]" content_info = f" | Extracted: [cyan]{snippet}[/]"
elif result.get('markdown'): elif result.get('markdown'):
content_info = f" | Markdown: [cyan]Present[/]" content_info = f" | Markdown: [cyan]Present[/]"
elif result.get('html'): elif result.get('html'):
content_info = f" | HTML Size: [cyan]{len(result['html'])}[/]" content_info = f" | HTML Size: [cyan]{len(result['html'])}[/]"
console.print(f"{success_icon} URL: [link={url}]{url}[/link] (Status: {status}){content_info}") console.print(
f"{success_icon} URL: [link={url}]{url}[/link] (Status: {status}){content_info}")
if "metadata" in result and "depth" in result["metadata"]: if "metadata" in result and "depth" in result["metadata"]:
console.print(f" Depth: {result['metadata']['depth']}") console.print(f" Depth: {result['metadata']['depth']}")
if not result.get('success') and result.get('error_message'): if not result.get('success') and result.get('error_message'):
@@ -104,7 +116,8 @@ async def make_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[s
     start_time = time.time()
         response = await client.post(endpoint, json=payload)
         duration = time.time() - start_time
-        console.print(f"Response Status: [bold {'green' if response.is_success else 'red'}]{response.status_code}[/] (took {duration:.2f}s)")
+        console.print(
+            f"Response Status: [bold {'green' if response.is_success else 'red'}]{response.status_code}[/] (took {duration:.2f}s)")
         response.raise_for_status()
         data = response.json()
         if data.get("success"):
@@ -119,7 +132,8 @@ async def make_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[s
console.print(f"[bold red]HTTP Error:[/]") console.print(f"[bold red]HTTP Error:[/]")
console.print(f"Status: {e.response.status_code}") console.print(f"Status: {e.response.status_code}")
try: try:
console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="default"), title="Error Response")) console.print(Panel(Syntax(json.dumps(
e.response.json(), indent=2), "json", theme="default"), title="Error Response"))
except json.JSONDecodeError: except json.JSONDecodeError:
console.print(f"Response Body: {e.response.text}") console.print(f"Response Body: {e.response.text}")
except httpx.RequestError as e: except httpx.RequestError as e:
@@ -128,11 +142,13 @@ async def make_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[s
console.print(f"[bold red]Unexpected Error: {e}[/]") console.print(f"[bold red]Unexpected Error: {e}[/]")
return None return None
async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[str, Any], title: str): async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[str, Any], title: str):
"""Handles streaming POST requests.""" """Handles streaming POST requests."""
console.rule(f"[bold magenta]{title}[/]", style="magenta") console.rule(f"[bold magenta]{title}[/]", style="magenta")
print_payload(payload) print_payload(payload)
console.print(f"Sending POST stream request to {client.base_url}{endpoint}...") console.print(
f"Sending POST stream request to {client.base_url}{endpoint}...")
all_results = [] all_results = []
initial_status_code = None # Store initial status code initial_status_code = None # Store initial status code
@@ -141,7 +157,8 @@ async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict
         async with client.stream("POST", endpoint, json=payload) as response:
             initial_status_code = response.status_code  # Capture initial status
             duration = time.time() - start_time  # Time to first byte potentially
-            console.print(f"Initial Response Status: [bold {'green' if response.is_success else 'red'}]{initial_status_code}[/] (first byte ~{duration:.2f}s)")
+            console.print(
+                f"Initial Response Status: [bold {'green' if response.is_success else 'red'}]{initial_status_code}[/] (first byte ~{duration:.2f}s)")
             response.raise_for_status()  # Raise exception for bad *initial* status codes
             console.print("[magenta]--- Streaming Results ---[/]")
@@ -152,24 +169,31 @@ async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict
                     data = json.loads(line)
                     if data.get("status") == "completed":
                         completed = True
-                        console.print("[bold green]--- Stream Completed ---[/]")
+                        console.print(
+                            "[bold green]--- Stream Completed ---[/]")
                         break
                     elif data.get("url"):  # Looks like a result dictionary
                         all_results.append(data)
                         # Display summary info as it arrives
-                        success_icon = "[green]✔[/]" if data.get('success') else "[red]✘[/]"
+                        success_icon = "[green]✔[/]" if data.get(
+                            'success') else "[red]✘[/]"
                         url = data.get('url', 'N/A')
                         # Display status code FROM THE RESULT DATA if available
                         result_status = data.get('status_code', 'N/A')
-                        console.print(f"  {success_icon} Received: [link={url}]{url}[/link] (Status: {result_status})")
+                        console.print(
+                            f"  {success_icon} Received: [link={url}]{url}[/link] (Status: {result_status})")
                         if not data.get('success') and data.get('error_message'):
-                            console.print(f"    [red]Error: {data['error_message']}[/]")
+                            console.print(
+                                f"    [red]Error: {data['error_message']}[/]")
                     else:
-                        console.print(f"  [yellow]Stream meta-data:[/yellow] {data}")
+                        console.print(
+                            f"  [yellow]Stream meta-data:[/yellow] {data}")
                 except json.JSONDecodeError:
-                    console.print(f"  [red]Stream decode error for line:[/red] {line}")
+                    console.print(
+                        f"  [red]Stream decode error for line:[/red] {line}")
         if not completed:
-            console.print("[bold yellow]Warning: Stream ended without 'completed' marker.[/]")
+            console.print(
+                "[bold yellow]Warning: Stream ended without 'completed' marker.[/]")
     except httpx.HTTPStatusError as e:
         # Use the captured initial status code if available, otherwise from the exception
@@ -177,18 +201,21 @@ async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict
console.print(f"[bold red]HTTP Error (Initial Request):[/]") console.print(f"[bold red]HTTP Error (Initial Request):[/]")
console.print(f"Status: {status}") console.print(f"Status: {status}")
try: try:
console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="default"), title="Error Response")) console.print(Panel(Syntax(json.dumps(
e.response.json(), indent=2), "json", theme="default"), title="Error Response"))
except json.JSONDecodeError: except json.JSONDecodeError:
console.print(f"Response Body: {e.response.text}") console.print(f"Response Body: {e.response.text}")
except httpx.RequestError as e: except httpx.RequestError as e:
console.print(f"[bold red]Request Error: {e}[/]") console.print(f"[bold red]Request Error: {e}[/]")
except Exception as e: except Exception as e:
console.print(f"[bold red]Unexpected Error during streaming: {e}[/]") console.print(f"[bold red]Unexpected Error during streaming: {e}[/]")
console.print_exception(show_locals=False) # Print stack trace for unexpected errors # Print stack trace for unexpected errors
console.print_exception(show_locals=False)
# Call print_result_summary with the *collected* results AFTER the stream is done # Call print_result_summary with the *collected* results AFTER the stream is done
print_result_summary(all_results, title=f"{title} Collected Results") print_result_summary(all_results, title=f"{title} Collected Results")
def load_proxies_from_env() -> List[Dict]: def load_proxies_from_env() -> List[Dict]:
""" """
Load proxies from the PROXIES environment variable. Load proxies from the PROXIES environment variable.
@@ -226,23 +253,25 @@ def load_proxies_from_env() -> List[Dict]:
# "ip": ip # "ip": ip
} }
else: else:
console.print(f"[yellow]Skipping invalid proxy string format:[/yellow] {entry}") console.print(
f"[yellow]Skipping invalid proxy string format:[/yellow] {entry}")
continue continue
proxies_params_list.append(proxy_dict) proxies_params_list.append(proxy_dict)
except Exception as e: except Exception as e:
console.print(f"[red]Error loading proxies from environment:[/red] {e}") console.print(
f"[red]Error loading proxies from environment:[/red] {e}")
if proxies_params_list: if proxies_params_list:
console.print(f"[cyan]Loaded {len(proxies_params_list)} proxies from environment.[/]") console.print(
f"[cyan]Loaded {len(proxies_params_list)} proxies from environment.[/]")
# else: # else:
# console.print("[yellow]No valid proxies loaded from environment.[/]") # console.print("[yellow]No valid proxies loaded from environment.[/]")
return proxies_params_list return proxies_params_list
# --- Demo Functions --- # --- Demo Functions ---
# 1. Basic Crawling # 1. Basic Crawling
@@ -250,11 +279,17 @@ async def demo_basic_single_url(client: httpx.AsyncClient):
     payload = {
         "urls": [SIMPLE_URL],
         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
-        "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS"}}
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "cache_mode": "BYPASS"
+            }
+        }
     }
     result = await make_request(client, "/crawl", payload, "Demo 1a: Basic Single URL Crawl")
     return result
 async def demo_basic_multi_url(client: httpx.AsyncClient):
     payload = {
         "urls": [SIMPLE_URL, LINKS_URL],
@@ -264,16 +299,31 @@ async def demo_basic_multi_url(client: httpx.AsyncClient):
     result = await make_request(client, "/crawl", payload, "Demo 1b: Basic Multi URL Crawl")
     return result
 async def demo_streaming_multi_url(client: httpx.AsyncClient):
     payload = {
-        "urls": [SIMPLE_URL, LINKS_URL, FORMS_URL],  # Add another URL
+        # "urls": [SIMPLE_URL, LINKS_URL, FORMS_URL, SIMPLE_URL, LINKS_URL, FORMS_URL], # Add another URL
+        "urls": [
+            "https://example.com/page1",
+            "https://example.com/page2",
+            "https://example.com/page3",
+            "https://example.com/page4",
+            "https://example.com/page5"
+        ],  # Add another URL
         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
-        "crawler_config": {"type": "CrawlerRunConfig", "params": {"stream": True, "cache_mode": "BYPASS"}}
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "stream": True,
+            }
+        }
     }
-    result = stream_request(client, "/crawl/stream", payload, "Demo 1c: Streaming Multi URL Crawl")
+    result = await stream_request(client, "/crawl/stream", payload, "Demo 1c: Streaming Multi URL Crawl")
     return result
 # 2. Markdown Generation & Content Filtering
 async def demo_markdown_default(client: httpx.AsyncClient):
     payload = {
         "urls": [SIMPLE_URL],
@@ -281,14 +331,25 @@ async def demo_markdown_default(client: httpx.AsyncClient):
"crawler_config": { "crawler_config": {
"type": "CrawlerRunConfig", "type": "CrawlerRunConfig",
"params": { "params": {
"cache_mode": "BYPASS", "markdown_generator": {
"markdown_generator": {"type": "DefaultMarkdownGenerator", "params": {}} # Explicitly default "type": "DefaultMarkdownGenerator",
"params": {
"content_source": "fit_html",
"options": {
"type": "dict",
"value": {
"ignore_links": True
}
}
}
} # Explicitly default
} }
} }
} }
result = await make_request(client, "/crawl", payload, "Demo 2a: Default Markdown Generation") result = await make_request(client, "/crawl", payload, "Demo 2a: Default Markdown Generation")
return result return result
async def demo_markdown_pruning(client: httpx.AsyncClient): async def demo_markdown_pruning(client: httpx.AsyncClient):
payload = { payload = {
"urls": [PYTHON_URL], # Use a more complex page "urls": [PYTHON_URL], # Use a more complex page
@@ -302,7 +363,10 @@ async def demo_markdown_pruning(client: httpx.AsyncClient):
"params": { "params": {
"content_filter": { "content_filter": {
"type": "PruningContentFilter", "type": "PruningContentFilter",
"params": {"threshold": 0.6, "threshold_type": "relative"} "params": {
"threshold": 0.6,
"threshold_type": "relative"
}
} }
} }
} }
@@ -312,6 +376,7 @@ async def demo_markdown_pruning(client: httpx.AsyncClient):
     result = await make_request(client, "/crawl", payload, "Demo 2b: Markdown with Pruning Filter")
     return result
 async def demo_markdown_bm25(client: httpx.AsyncClient):
     payload = {
         "urls": [PYTHON_URL],
@@ -325,7 +390,9 @@ async def demo_markdown_bm25(client: httpx.AsyncClient):
"params": { "params": {
"content_filter": { "content_filter": {
"type": "BM25ContentFilter", "type": "BM25ContentFilter",
"params": {"user_query": "Python documentation language reference"} "params": {
"user_query": "Python documentation language reference"
}
} }
} }
} }
@@ -337,21 +404,22 @@ async def demo_markdown_bm25(client: httpx.AsyncClient):
 # 3. Specific Parameters
 # Corrected Demo Function: demo_param_css_selector
 async def demo_param_css_selector(client: httpx.AsyncClient):
-    target_selector = ".main-content"  # Using the suggested correct selector
+    css_selector = ".main-content"  # Using the suggested correct selector
     payload = {
         "urls": [PYTHON_URL],
         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
         "crawler_config": {
             "type": "CrawlerRunConfig",
             "params": {
-                "cache_mode": "BYPASS",
-                "css_selector": target_selector  # Target specific div
+                "css_selector": css_selector  # Target specific div
                 # No extraction strategy is needed to demo this parameter's effect on input HTML
             }
         }
     }
-    results = await make_request(client, "/crawl", payload, f"Demo 3a: Using css_selector ('{target_selector}')")
+    results = await make_request(client, "/crawl", payload, f"Demo 3a: Using css_selector ('{css_selector}')")
     if results:
         result = results[0]
@@ -360,24 +428,32 @@ async def demo_param_css_selector(client: httpx.AsyncClient):
             # A simple check: does it contain expected content from within the selector,
             # and does it LACK content known to be outside (like footer links)?
             html_content = result['html']
-            content_present = 'Python Software Foundation' in html_content  # Text likely within .main-content somewhere
-            footer_absent = 'Legal Statements' not in html_content  # Text likely in the footer, outside .main-content
-            console.print(f"  Content Check: Text inside '{target_selector}' likely present? {'[green]Yes[/]' if content_present else '[red]No[/]'}")
-            console.print(f"  Content Check: Text outside '{target_selector}' (footer) likely absent? {'[green]Yes[/]' if footer_absent else '[red]No[/]'}")
+            # Text likely within .main-content somewhere
+            content_present = 'Python Software Foundation' in html_content
+            # Text likely in the footer, outside .main-content
+            footer_absent = 'Legal Statements' not in html_content
+            console.print(
+                f"  Content Check: Text inside '{css_selector}' likely present? {'[green]Yes[/]' if content_present else '[red]No[/]'}")
+            console.print(
+                f"  Content Check: Text outside '{css_selector}' (footer) likely absent? {'[green]Yes[/]' if footer_absent else '[red]No[/]'}")
             if not content_present or not footer_absent:
-                console.print(f"  [yellow]Note:[/yellow] HTML filtering might not be precise or page structure changed. Result HTML length: {len(html_content)}")
+                console.print(
+                    f"  [yellow]Note:[/yellow] HTML filtering might not be precise or page structure changed. Result HTML length: {len(html_content)}")
             else:
-                console.print(f"  [green]Verified:[/green] Returned HTML appears limited by css_selector. Result HTML length: {len(html_content)}")
+                console.print(
+                    f"  [green]Verified:[/green] Returned HTML appears limited by css_selector. Result HTML length: {len(html_content)}")
         elif result['success']:
-            console.print("[yellow]HTML content was empty in the successful result.[/]")
+            console.print(
+                "[yellow]HTML content was empty in the successful result.[/]")
     # Error message is handled by print_result_summary called by make_request
 async def demo_param_js_execution(client: httpx.AsyncClient):
     payload = {
-        "urls": [FORMS_URL],  # Use a page with a form
+        "urls": ["https://example.com"],  # Use a page with a form
         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
         "crawler_config": {
             "type": "CrawlerRunConfig",
@@ -385,10 +461,10 @@ async def demo_param_js_execution(client: httpx.AsyncClient):
"cache_mode": "BYPASS", "cache_mode": "BYPASS",
# Simple JS to fill and maybe click (won't submit without more complex setup) # Simple JS to fill and maybe click (won't submit without more complex setup)
"js_code": """ "js_code": """
() => { (() => {
document.querySelector('[name="custname"]').value = 'Crawl4AI Demo'; document.querySelector('h1').innerText = 'Crawl4AI Demo';
return { filled_name: document.querySelector('[name="custname"]').value }; return { filled_name: document.querySelector('h1').innerText };
} })();
""", """,
"delay_before_return_html": 0.5 # Give JS time to potentially run "delay_before_return_html": 0.5 # Give JS time to potentially run
} }
@@ -396,10 +472,12 @@ async def demo_param_js_execution(client: httpx.AsyncClient):
     }
     results = await make_request(client, "/crawl", payload, "Demo 3b: Using js_code Parameter")
     if results and results[0].get("js_execution_result"):
-        console.print("[cyan]JS Execution Result:[/]", results[0]["js_execution_result"])
+        console.print("[cyan]JS Execution Result:[/]",
+                      results[0]["js_execution_result"])
     elif results:
         console.print("[yellow]JS Execution Result not found in response.[/]")
 async def demo_param_screenshot(client: httpx.AsyncClient):
     payload = {
         "urls": [SIMPLE_URL],
@@ -411,10 +489,12 @@ async def demo_param_screenshot(client: httpx.AsyncClient):
     }
     results = await make_request(client, "/crawl", payload, "Demo 3c: Taking a Screenshot")
     if results and results[0].get("screenshot"):
-        console.print(f"[cyan]Screenshot data received (length):[/] {len(results[0]['screenshot'])}")
+        console.print(
+            f"[cyan]Screenshot data received (length):[/] {len(results[0]['screenshot'])}")
     elif results:
         console.print("[yellow]Screenshot data not found in response.[/]")
 async def demo_param_ssl_fetch(client: httpx.AsyncClient):
     payload = {
         "urls": [PYTHON_URL],  # Needs HTTPS
@@ -431,10 +511,12 @@ async def demo_param_ssl_fetch(client: httpx.AsyncClient):
     elif results:
         console.print("[yellow]SSL Certificate data not found in response.[/]")
 async def demo_param_proxy(client: httpx.AsyncClient):
     proxy_params_list = load_proxies_from_env()  # Get the list of parameter dicts
     if not proxy_params_list:
-        console.rule("[bold yellow]Demo 3e: Using Proxies (SKIPPED)[/]", style="yellow")
+        console.rule(
+            "[bold yellow]Demo 3e: Using Proxies (SKIPPED)[/]", style="yellow")
         console.print("Set the PROXIES environment variable to run this demo.")
         console.print("Format: IP:PORT:USR:PWD,IP:PORT,...")
         return
@@ -450,8 +532,21 @@ async def demo_param_proxy(client: httpx.AsyncClient):
"type": "RoundRobinProxyStrategy", "type": "RoundRobinProxyStrategy",
"params": { "params": {
"proxies": [ "proxies": [
# [
# {
# "type": "ProxyConfig",
# "params": {
# server:"...",
# "username": "...",
# "password": "..."
# }
# },
# ...
# ]
# Filter out the 'ip' key when sending to server, as it's not part of ProxyConfig # Filter out the 'ip' key when sending to server, as it's not part of ProxyConfig
{"type": "ProxyConfig", "params": {k: v for k, v in p.items() if k != 'ip'}} {"type": "ProxyConfig", "params": {
k: v for k, v in p.items() if k != 'ip'}}
for p in proxy_params_list for p in proxy_params_list
] ]
} }
@@ -480,37 +575,49 @@ async def demo_param_proxy(client: httpx.AsyncClient):
             if json_str:
                 ip_data = json.loads(json_str)
                 origin_ip = ip_data.get("origin")
-                console.print(f"  Origin IP reported by httpbin: [bold yellow]{origin_ip}[/]")
+                console.print(
+                    f"  Origin IP reported by httpbin: [bold yellow]{origin_ip}[/]")
                 # Extract the IPs from the proxy list for comparison
-                proxy_ips = {p.get("server").split(":")[1][2:] for p in proxy_params_list}
+                proxy_ips = {p.get("server").split(
+                    ":")[1][2:] for p in proxy_params_list}
                 if origin_ip and origin_ip in proxy_ips:
-                    console.print("[bold green]  Verification SUCCESS: Origin IP matches one of the provided proxies![/]")
+                    console.print(
+                        "[bold green]  Verification SUCCESS: Origin IP matches one of the provided proxies![/]")
                 elif origin_ip:
-                    console.print("[bold red]  Verification FAILED: Origin IP does not match any provided proxy IPs.[/]")
+                    console.print(
+                        "[bold red]  Verification FAILED: Origin IP does not match any provided proxy IPs.[/]")
                     console.print(f"  Provided Proxy IPs: {proxy_ips}")
                 else:
-                    console.print("[yellow]  Verification SKIPPED: Could not extract origin IP from response.[/]")
+                    console.print(
+                        "[yellow]  Verification SKIPPED: Could not extract origin IP from response.[/]")
             else:
-                console.print("[yellow]  Verification SKIPPED: Could not find JSON in httpbin response HTML.[/]")
+                console.print(
+                    "[yellow]  Verification SKIPPED: Could not find JSON in httpbin response HTML.[/]")
                 # console.print(f"HTML Received:\n{html_content[:500]}...")  # Uncomment for debugging
         except json.JSONDecodeError:
-            console.print("[red]  Verification FAILED: Could not parse JSON from httpbin response HTML.[/]")
+            console.print(
+                "[red]  Verification FAILED: Could not parse JSON from httpbin response HTML.[/]")
         except Exception as e:
-            console.print(f"[red]  Verification Error: An unexpected error occurred during IP check: {e}[/]")
+            console.print(
+                f"[red]  Verification Error: An unexpected error occurred during IP check: {e}[/]")
     elif results:
-        console.print("[yellow]  Verification SKIPPED: Crawl for IP check was not successful.[/]")
+        console.print(
+            "[yellow]  Verification SKIPPED: Crawl for IP check was not successful.[/]")
-# 4. Extraction Strategies
+# 4. Extraction Strategies (Non-Deep)
 async def demo_extract_css(client: httpx.AsyncClient):
     # Schema to extract book titles and prices
     book_schema = {
         "name": "BookList",
         "baseSelector": "ol.row li.col-xs-6",
         "fields": [
-            {"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"},
+            {"name": "title", "selector": "article.product_pod h3 a",
+             "type": "attribute", "attribute": "title"},
             {"name": "price", "selector": "article.product_pod .price_color", "type": "text"},
         ]
     }
@@ -523,7 +630,12 @@ async def demo_extract_css(client: httpx.AsyncClient):
"cache_mode": "BYPASS", "cache_mode": "BYPASS",
"extraction_strategy": { "extraction_strategy": {
"type": "JsonCssExtractionStrategy", "type": "JsonCssExtractionStrategy",
"params": {"schema": {"type": "dict", "value": book_schema}} "params": {
"schema": {
"type": "dict",
"value": book_schema
}
}
} }
} }
} }
@@ -539,21 +651,28 @@ async def demo_extract_css(client: httpx.AsyncClient):
table.add_column("Title", style="dim") table.add_column("Title", style="dim")
table.add_column("Price") table.add_column("Price")
for item in extracted_data[:5]: # Show first 5 for item in extracted_data[:5]: # Show first 5
table.add_row(item.get('title', 'N/A'), item.get('price', 'N/A')) table.add_row(item.get('title', 'N/A'),
item.get('price', 'N/A'))
console.print(table) console.print(table)
else: else:
console.print("[yellow]CSS extraction did not return a list of results.[/]") console.print(
"[yellow]CSS extraction did not return a list of results.[/]")
console.print(extracted_data) console.print(extracted_data)
except json.JSONDecodeError: except json.JSONDecodeError:
console.print("[red]Failed to parse extracted_content as JSON.[/]") console.print("[red]Failed to parse extracted_content as JSON.[/]")
except Exception as e: except Exception as e:
console.print(f"[red]Error processing extracted CSS content: {e}[/]") console.print(
f"[red]Error processing extracted CSS content: {e}[/]")
# 5. LLM Extraction # 5. LLM Extraction
async def demo_extract_llm(client: httpx.AsyncClient): async def demo_extract_llm(client: httpx.AsyncClient):
if not os.getenv("OPENAI_API_KEY"): # Basic check for a common key if not os.getenv("OPENAI_API_KEY"): # Basic check for a common key
console.rule("[bold yellow]Demo 4b: LLM Extraction (SKIPPED)[/]", style="yellow") console.rule(
console.print("Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.") "[bold yellow]Demo 4b: LLM Extraction (SKIPPED)[/]", style="yellow")
console.print(
"Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.")
return return
payload = { payload = {
@@ -571,7 +690,10 @@ async def demo_extract_llm(client: httpx.AsyncClient):
"type": "LLMConfig", "type": "LLMConfig",
"params": {} "params": {}
# Relies on server's default provider from config.yml & keys from .llm.env # Relies on server's default provider from config.yml & keys from .llm.env
# "params": {"provider": "openai/gpt-4o-mini"} # "params": {
# "provider": "openai/gpt-4o-mini",
# "api_key": os.getenv("OPENAI_API_KEY") # Optional: Override key
# }
}, },
"schema": { # Request structured output "schema": { # Request structured output
"type": "dict", "type": "dict",
@@ -599,17 +721,23 @@ async def demo_extract_llm(client: httpx.AsyncClient):
             if isinstance(extracted_data, dict):
                 console.print("[cyan]Extracted Data (LLM):[/]")
-                syntax = Syntax(json.dumps(extracted_data, indent=2), "json", theme="monokai", line_numbers=False)
+                syntax = Syntax(json.dumps(extracted_data, indent=2),
+                                "json", theme="monokai", line_numbers=False)
                 console.print(Panel(syntax, border_style="cyan", expand=False))
             else:
-                console.print("[yellow]LLM extraction did not return expected dictionary.[/]")
+                console.print(
+                    "[yellow]LLM extraction did not return expected dictionary.[/]")
                 console.print(extracted_data)
         except json.JSONDecodeError:
-            console.print("[red]Failed to parse LLM extracted_content as JSON.[/]")
+            console.print(
+                "[red]Failed to parse LLM extracted_content as JSON.[/]")
         except Exception as e:
-            console.print(f"[red]Error processing extracted LLM content: {e}[/]")
+            console.print(
+                f"[red]Error processing extracted LLM content: {e}[/]")
 # 6. Deep Crawling
 async def demo_deep_basic(client: httpx.AsyncClient):
     payload = {
         "urls": [DEEP_CRAWL_BASE_URL],
@@ -625,7 +753,17 @@ async def demo_deep_basic(client: httpx.AsyncClient):
"max_pages": 4, "max_pages": 4,
"filter_chain": { "filter_chain": {
"type": "FilterChain", "type": "FilterChain",
"params": {"filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]} "params": {
"filters": [
{
"type": "DomainFilter",
"params":
{
"allowed_domains": [DEEP_CRAWL_DOMAIN]
}
}
]
}
} }
} }
} }
@@ -642,6 +780,8 @@ async def demo_deep_basic(client: httpx.AsyncClient):
console.print(f" [red]Error: {result['error_message']}[/]") console.print(f" [red]Error: {result['error_message']}[/]")
# 5. Streaming Deep Crawl # 5. Streaming Deep Crawl
async def demo_deep_streaming(client: httpx.AsyncClient): async def demo_deep_streaming(client: httpx.AsyncClient):
payload = { payload = {
"urls": [DEEP_CRAWL_BASE_URL], "urls": [DEEP_CRAWL_BASE_URL],
@@ -669,6 +809,8 @@ async def demo_deep_streaming(client: httpx.AsyncClient):
     await stream_request(client, "/crawl/stream", payload, "Demo 5b: Streaming Deep Crawl")
 # 5a. Deep Crawl with Filtering & Scoring
 async def demo_deep_filtering_scoring(client: httpx.AsyncClient):
     """Demonstrates deep crawl with advanced URL filtering and scoring."""
     max_depth = 2  # Go a bit deeper to see scoring/filtering effects
@@ -717,7 +859,8 @@ async def demo_deep_filtering_scoring(client: httpx.AsyncClient):
"scorers": [ "scorers": [
{ # Boost score for URLs containing the keyword { # Boost score for URLs containing the keyword
"type": "KeywordRelevanceScorer", "type": "KeywordRelevanceScorer",
"params": {"keywords": [keyword_to_score], "weight": 1.5} # Higher weight # Higher weight
"params": {"keywords": [keyword_to_score], "weight": 1.5}
}, },
{ # Slightly penalize deeper pages { # Slightly penalize deeper pages
"type": "PathDepthScorer", "type": "PathDepthScorer",
@@ -747,39 +890,51 @@ async def demo_deep_filtering_scoring(client: httpx.AsyncClient):
             depth = result.get("metadata", {}).get("depth", -1)
             # Check Filtering
-            if excluded_pattern.strip('*') in url:  # Check if the excluded part is present
-                console.print(f"  [bold red]Filter FAILED:[/bold red] Excluded pattern part '{excluded_pattern.strip('*')}' found in URL: {url}")
+            # Check if the excluded part is present
+            if excluded_pattern.strip('*') in url:
+                console.print(
+                    f"  [bold red]Filter FAILED:[/bold red] Excluded pattern part '{excluded_pattern.strip('*')}' found in URL: {url}")
                 excluded_found = True
             # Check Scoring (Observation)
             if keyword_to_score in url:
                 prioritized_found_overall = True
-                if depth == 1:  # Check if prioritized keywords appeared early (depth 1)
+                # Check if prioritized keywords appeared early (depth 1)
+                if depth == 1:
                     prioritized_found_at_depth1 = True
         if not excluded_found:
-            console.print(f"  [green]Filter Check:[/green] No URLs matching excluded pattern '{excluded_pattern}' found.")
+            console.print(
+                f"  [green]Filter Check:[/green] No URLs matching excluded pattern '{excluded_pattern}' found.")
         else:
-            console.print(f"  [red]Filter Check:[/red] URLs matching excluded pattern '{excluded_pattern}' were found (unexpected).")
+            console.print(
+                f"  [red]Filter Check:[/red] URLs matching excluded pattern '{excluded_pattern}' were found (unexpected).")
         if prioritized_found_at_depth1:
-            console.print(f"  [green]Scoring Check:[/green] URLs with keyword '{keyword_to_score}' were found at depth 1 (scoring likely influenced).")
+            console.print(
+                f"  [green]Scoring Check:[/green] URLs with keyword '{keyword_to_score}' were found at depth 1 (scoring likely influenced).")
        elif prioritized_found_overall:
-            console.print(f"  [yellow]Scoring Check:[/yellow] URLs with keyword '{keyword_to_score}' found, but not necessarily prioritized early (check max_pages/depth limits).")
+            console.print(
+                f"  [yellow]Scoring Check:[/yellow] URLs with keyword '{keyword_to_score}' found, but not necessarily prioritized early (check max_pages/depth limits).")
        else:
-            console.print(f"  [yellow]Scoring Check:[/yellow] No URLs with keyword '{keyword_to_score}' found within crawl limits.")
+            console.print(
+                f"  [yellow]Scoring Check:[/yellow] No URLs with keyword '{keyword_to_score}' found within crawl limits.")
     # print_result_summary called by make_request already shows URLs and depths
 # 6. Deep Crawl with Extraction
 async def demo_deep_with_css_extraction(client: httpx.AsyncClient):
     # Schema to extract H1 and first paragraph from any page
     general_schema = {
         "name": "PageContent",
         "baseSelector": "body",  # Apply to whole body
         "fields": [
-            {"name": "page_title", "selector": "h1", "type": "text", "default": "N/A"},
-            {"name": "first_p", "selector": "p", "type": "text", "default": "N/A"},  # Gets first p tag
+            {"name": "page_title", "selector": "h1",
+             "type": "text", "default": "N/A"},
+            {"name": "first_p", "selector": "p", "type": "text",
+             "default": "N/A"},  # Gets first p tag
         ]
     }
     payload = {
@@ -801,8 +956,10 @@ async def demo_deep_with_css_extraction(client: httpx.AsyncClient):
"filter_chain": { "filter_chain": {
"type": "FilterChain", "type": "FilterChain",
"params": {"filters": [ "params": {"filters": [
{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}, {"type": "DomainFilter", "params": {
{"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}} "allowed_domains": [DEEP_CRAWL_DOMAIN]}},
{"type": "ContentTypeFilter", "params": {
"allowed_types": ["text/html"]}}
]} ]}
} }
} }
@@ -818,21 +975,31 @@ async def demo_deep_with_css_extraction(client: httpx.AsyncClient):
if result.get("success") and result.get("extracted_content"): if result.get("success") and result.get("extracted_content"):
try: try:
extracted = json.loads(result["extracted_content"]) extracted = json.loads(result["extracted_content"])
if isinstance(extracted, list) and extracted: extracted = extracted[0] # Use first item if isinstance(extracted, list) and extracted:
title = extracted.get('page_title', 'N/A') if isinstance(extracted, dict) else 'Parse Error' extracted = extracted[0] # Use first item
console.print(f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Title: {title}") title = extracted.get(
'page_title', 'N/A') if isinstance(extracted, dict) else 'Parse Error'
console.print(
f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Title: {title}")
except Exception: except Exception:
console.print(f" [yellow]![/] URL: [link={result['url']}]{result['url']}[/link] | Failed to parse extracted content") console.print(
f" [yellow]![/] URL: [link={result['url']}]{result['url']}[/link] | Failed to parse extracted content")
elif result.get("success"): elif result.get("success"):
console.print(f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.") console.print(
f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.")
else: else:
console.print(f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.") console.print(
f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
# 6b. Deep Crawl with LLM Extraction # 6b. Deep Crawl with LLM Extraction
async def demo_deep_with_llm_extraction(client: httpx.AsyncClient): async def demo_deep_with_llm_extraction(client: httpx.AsyncClient):
if not os.getenv("OPENAI_API_KEY"): # Basic check if not os.getenv("OPENAI_API_KEY"): # Basic check
console.rule("[bold yellow]Demo 6b: Deep Crawl + LLM Extraction (SKIPPED)[/]", style="yellow") console.rule(
console.print("Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.") "[bold yellow]Demo 6b: Deep Crawl + LLM Extraction (SKIPPED)[/]", style="yellow")
console.print(
"Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.")
return return
payload = { payload = {
@@ -857,8 +1024,10 @@ async def demo_deep_with_llm_extraction(client: httpx.AsyncClient):
"filter_chain": { "filter_chain": {
"type": "FilterChain", "type": "FilterChain",
"params": {"filters": [ "params": {"filters": [
{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}, {"type": "DomainFilter", "params": {
{"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}} "allowed_domains": [DEEP_CRAWL_DOMAIN]}},
{"type": "ContentTypeFilter", "params": {
"allowed_types": ["text/html"]}}
]} ]}
} }
} }
@@ -872,23 +1041,28 @@ async def demo_deep_with_llm_extraction(client: httpx.AsyncClient):
console.print("[cyan]LLM Extraction Summary from Deep Crawl:[/]") console.print("[cyan]LLM Extraction Summary from Deep Crawl:[/]")
for result in results: for result in results:
if result.get("success") and result.get("extracted_content"): if result.get("success") and result.get("extracted_content"):
console.print(f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Topic: {result['extracted_content']}") console.print(
f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Topic: {result['extracted_content']}")
elif result.get("success"): elif result.get("success"):
console.print(f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.") console.print(
f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.")
else: else:
console.print(f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.") console.print(
f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
# 6c. Deep Crawl with Proxies # 6c. Deep Crawl with Proxies
async def demo_deep_with_proxy(client: httpx.AsyncClient): async def demo_deep_with_proxy(client: httpx.AsyncClient):
proxy_params_list = load_proxies_from_env() # Get the list of parameter dicts proxy_params_list = load_proxies_from_env() # Get the list of parameter dicts
if not proxy_params_list: if not proxy_params_list:
console.rule("[bold yellow]Demo 6c: Deep Crawl + Proxies (SKIPPED)[/]", style="yellow") console.rule(
"[bold yellow]Demo 6c: Deep Crawl + Proxies (SKIPPED)[/]", style="yellow")
console.print("Set the PROXIES environment variable to run this demo.") console.print("Set the PROXIES environment variable to run this demo.")
return return
payload = { payload = {
"urls": [DEEP_CRAWL_BASE_URL], # Use a site likely accessible via proxies # Use a site likely accessible via proxies
"urls": [DEEP_CRAWL_BASE_URL],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": { "crawler_config": {
"type": "CrawlerRunConfig", "type": "CrawlerRunConfig",
@@ -899,7 +1073,8 @@ async def demo_deep_with_proxy(client: httpx.AsyncClient):
"params": { "params": {
# Correctly create the list of {"type": ..., "params": ...} structures, excluding the demo 'ip' key # Correctly create the list of {"type": ..., "params": ...} structures, excluding the demo 'ip' key
"proxies": [ "proxies": [
{"type": "ProxyConfig", "params": {k: v for k, v in p.items() if k != 'ip'}} {"type": "ProxyConfig", "params": {
k: v for k, v in p.items() if k != 'ip'}}
for p in proxy_params_list for p in proxy_params_list
] ]
} }
@@ -965,13 +1140,18 @@ async def demo_deep_with_ssl(client: httpx.AsyncClient):
                 issuer_org = cert.get('issuer', {}).get('O', 'N/A')
                 valid_from = cert.get('not_before', 'N/A')
                 valid_to = cert.get('not_after', 'N/A')
-                console.print(f"  [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Issuer: {issuer_org} | Valid: {valid_from} - {valid_to}")
+                console.print(
+                    f"  [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Issuer: {issuer_org} | Valid: {valid_from} - {valid_to}")
             elif result.get("success"):
-                console.print(f"  [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | SSL cert not fetched or N/A.")
+                console.print(
+                    f"  [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | SSL cert not fetched or N/A.")
             else:
-                console.print(f"  [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
+                console.print(
+                    f"  [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
 # 7. Markdown helper endpoint
 async def demo_markdown_endpoint(client: httpx.AsyncClient):
     """
     One-shot helper around /md.
@@ -987,15 +1167,19 @@ async def demo_markdown_endpoint(client: httpx.AsyncClient):
         t0 = time.time()
         resp = await client.post("/md", json=payload)
         dt = time.time() - t0
-        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
+        console.print(
+            f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
         resp.raise_for_status()
         md = resp.json().get("markdown", "")
         snippet = (md[:500] + "...") if len(md) > 500 else md
-        console.print(Panel(snippet, title="Markdown snippet", border_style="cyan", expand=False))
+        console.print(Panel(snippet, title="Markdown snippet",
+                      border_style="cyan", expand=False))
     except Exception as e:
         console.print(f"[bold red]Error hitting /md:[/] {e}")
 # 8. LLM QA helper endpoint
 async def demo_llm_endpoint(client: httpx.AsyncClient):
     """
     Quick QA round-trip with /llm.
@@ -1012,10 +1196,12 @@ async def demo_llm_endpoint(client: httpx.AsyncClient):
         t0 = time.time()
         resp = await client.get(f"/llm/{enc}", params={"q": question})
         dt = time.time() - t0
-        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
+        console.print(
+            f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
         resp.raise_for_status()
         answer = resp.json().get("answer", "")
-        console.print(Panel(answer or "No answer returned", title="LLM answer", border_style="magenta", expand=False))
+        console.print(Panel(answer or "No answer returned",
+                      title="LLM answer", border_style="magenta", expand=False))
     except Exception as e:
         console.print(f"[bold red]Error hitting /llm:[/] {e}")
@@ -1036,10 +1222,12 @@ async def demo_config_dump_valid(client: httpx.AsyncClient):
         t0 = time.time()
         resp = await client.post("/config/dump", json=payload)
         dt = time.time() - t0
-        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
+        console.print(
+            f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
         resp.raise_for_status()
         dump_json = resp.json()
-        console.print(Panel(Syntax(json.dumps(dump_json, indent=2), "json", theme="monokai"), title="Dump()", border_style="cyan"))
+        console.print(Panel(Syntax(json.dumps(dump_json, indent=2),
+                      "json", theme="monokai"), title="Dump()", border_style="cyan"))
     except Exception as e:
         console.print(f"[bold red]Error in valid /config/dump call:[/] {e}")
@@ -1053,21 +1241,25 @@ async def demo_config_dump_invalid(client: httpx.AsyncClient):
""").strip() """).strip()
payload = {"code": bad_code} payload = {"code": bad_code}
console.rule("[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta") console.rule(
"[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta")
print_payload(payload) print_payload(payload)
try: try:
resp = await client.post("/config/dump", json=payload) resp = await client.post("/config/dump", json=payload)
console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]") console.print(
f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]")
resp.raise_for_status() # should throw -> except resp.raise_for_status() # should throw -> except
except httpx.HTTPStatusError as e: except httpx.HTTPStatusError as e:
console.print("[cyan]Expected parse/validation failure captured:[/]") console.print("[cyan]Expected parse/validation failure captured:[/]")
try: try:
console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="fruity"), title="Error payload")) console.print(Panel(Syntax(json.dumps(
e.response.json(), indent=2), "json", theme="fruity"), title="Error payload"))
except Exception: except Exception:
console.print(e.response.text) console.print(e.response.text)
except Exception as e: except Exception as e:
console.print(f"[bold red]Unexpected error during invalid test:[/] {e}") console.print(
f"[bold red]Unexpected error during invalid test:[/] {e}")
# --- Update Main Runner to include new demo --- # --- Update Main Runner to include new demo ---
@@ -1077,31 +1269,31 @@ async def main_demo():
         return
     # --- Run Demos ---
-    await demo_basic_single_url(client)
-    await demo_basic_multi_url(client)
-    await demo_streaming_multi_url(client)
-    await demo_markdown_default(client)
-    await demo_markdown_pruning(client)
-    await demo_markdown_bm25(client)
-    await demo_param_css_selector(client)
-    await demo_param_js_execution(client)
-    await demo_param_screenshot(client)
-    await demo_param_ssl_fetch(client)
-    await demo_param_proxy(client)  # Skips if no PROXIES env var
-    await demo_extract_css(client)
-    await demo_extract_llm(client)  # Skips if no common LLM key env var
-    await demo_deep_basic(client)
-    await demo_deep_streaming(client)  # This need extra work
-    await demo_deep_with_css_extraction(client)
-    await demo_deep_with_llm_extraction(client)  # Skips if no common LLM key env var
-    await demo_deep_with_proxy(client)  # Skips if no PROXIES env var
-    await demo_deep_with_ssl(client)  # Added the new demo
+    # await demo_basic_single_url(client)
+    # await demo_basic_multi_url(client)
+    # await demo_streaming_multi_url(client)
+    # await demo_markdown_default(client)
+    # await demo_markdown_pruning(client)
+    # await demo_markdown_bm25(client)
+    # await demo_param_css_selector(client)
+    # await demo_param_js_execution(client)
+    # await demo_param_screenshot(client)
+    # await demo_param_ssl_fetch(client)
+    # await demo_param_proxy(client)  # Skips if no PROXIES env var
+    # await demo_extract_css(client)
+    # await demo_extract_llm(client)  # Skips if no common LLM key env var
+    # await demo_deep_basic(client)
+    # await demo_deep_streaming(client)  # This need extra work
+    # await demo_deep_with_css_extraction(client)
+    # # Skips if no common LLM key env var
+    # await demo_deep_with_llm_extraction(client)
+    # await demo_deep_with_proxy(client)  # Skips if no PROXIES env var
+    # await demo_deep_with_ssl(client)  # Added the new demo
     # --- Helper endpoints ---
     await demo_markdown_endpoint(client)
@@ -1120,5 +1312,6 @@ if __name__ == "__main__":
     except KeyboardInterrupt:
         console.print("\n[yellow]Demo interrupted by user.[/]")
     except Exception as e:
-        console.print(f"\n[bold red]An error occurred during demo execution:[/]")
+        console.print(
+            f"\n[bold red]An error occurred during demo execution:[/]")
         console.print_exception(show_locals=False)

View File

@@ -31,7 +31,7 @@ async def example_cdp():
 async def main():
-    browser_config = BrowserConfig(headless=True, verbose=True)
+    browser_config = BrowserConfig(headless=False, verbose=True)
     async with AsyncWebCrawler(config=browser_config) as crawler:
         crawler_config = CrawlerRunConfig(
             cache_mode=CacheMode.BYPASS,