diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py
index fe6f9b8a..8a5cb2c4 100644
--- a/crawl4ai/__version__.py
+++ b/crawl4ai/__version__.py
@@ -1,3 +1,3 @@
 # crawl4ai/_version.py
-__version__ = "0.6.1"
+__version__ = "0.6.2"
diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index dd5c584a..c93516bd 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -427,7 +427,7 @@ class BrowserConfig:
         host: str = "localhost",
     ):
         self.browser_type = browser_type
-        self.headless = headless or True
+        self.headless = headless
         self.browser_mode = browser_mode
         self.use_managed_browser = use_managed_browser
         self.cdp_url = cdp_url
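# Note on the async_configs.py hunk above: `headless or True` can never
# yield False, because Python's `or` returns its second operand whenever the
# first is falsy, so a caller passing headless=False still got a headless
# browser. A minimal sketch of the difference:
#
#     def old(headless): return headless or True
#     def new(headless): return headless
#
#     old(False)  # -> True  (caller's choice silently ignored)
#     new(False)  # -> False (fix: the caller's value is kept)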
diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py
index d989e030..0a3d51af 100644
--- a/docs/examples/docker/demo_docker_api.py
+++ b/docs/examples/docker/demo_docker_api.py
@@ -21,17 +21,20 @@ console = Console()
 BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020")
 BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235")
 # Target URLs
+SIMPLE_URL = "https://example.com"  # For demo purposes
 SIMPLE_URL = "https://httpbin.org/html"
 LINKS_URL = "https://httpbin.org/links/10/0"
-FORMS_URL = "https://httpbin.org/forms/post" # For JS demo
-BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction
-PYTHON_URL = "https://python.org" # For deeper crawl
+FORMS_URL = "https://httpbin.org/forms/post"  # For JS demo
+BOOKS_URL = "http://books.toscrape.com/"  # For CSS extraction
+PYTHON_URL = "https://python.org"  # For deeper crawl
 # Use the same sample site as deep crawl tests for consistency
-DEEP_CRAWL_BASE_URL = os.getenv("DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/")
+DEEP_CRAWL_BASE_URL = os.getenv(
+    "DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/")
 DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com"

 # --- Helper Functions ---

+
 async def check_server_health(client: httpx.AsyncClient):
     """Check if the server is healthy before running tests."""
     console.print("[bold cyan]Checking server health...[/]", end="")
@@ -39,7 +42,8 @@
         response = await client.get("/health", timeout=10.0)
         response.raise_for_status()
         health_data = response.json()
-        console.print(f"[bold green] Server OK! Version: {health_data.get('version', 'N/A')}[/]")
+        console.print(
+            f"[bold green] Server OK! Version: {health_data.get('version', 'N/A')}[/]")
         return True
     except (httpx.RequestError, httpx.HTTPStatusError) as e:
         console.print(f"\n[bold red]Server health check FAILED:[/]")
@@ -47,10 +51,12 @@
         console.print(f"Is the server running at {BASE_URL}?")
         return False
     except Exception as e:
-        console.print(f"\n[bold red]An unexpected error occurred during health check:[/]")
+        console.print(
+            f"\n[bold red]An unexpected error occurred during health check:[/]")
        console.print(e)
        return False

+
 def print_payload(payload: Dict[str, Any]):
     """Prints the JSON payload nicely with a dark theme."""
     syntax = Syntax(
@@ -60,7 +66,9 @@
         line_numbers=False,
         word_wrap=True  # Added word wrap for potentially long payloads
     )
-    console.print(Panel(syntax, title="Request Payload", border_style="blue", expand=False))
+    console.print(Panel(syntax, title="Request Payload",
+                  border_style="blue", expand=False))

+
 def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Results Summary", max_items: int = 3):
     """Prints a concise summary of crawl results."""
@@ -68,11 +76,13 @@
         console.print(f"[yellow]{title}: No results received.[/]")
         return

-    console.print(Panel(f"[bold]{title}[/]", border_style="green", expand=False))
+    console.print(Panel(f"[bold]{title}[/]",
+                  border_style="green", expand=False))
     count = 0
     for result in results:
         if count >= max_items:
-            console.print(f"... (showing first {max_items} of {len(results)} results)")
+            console.print(
+                f"... (showing first {max_items} of {len(results)} results)")
             break
         count += 1
         success_icon = "[green]✔[/]" if result.get('success') else "[red]✘[/]"
@@ -81,14 +91,16 @@
         content_info = ""
         if result.get('extracted_content'):
             content_str = json.dumps(result['extracted_content'])
-            snippet = (content_str[:70] + '...') if len(content_str) > 70 else content_str
+            snippet = (
+                content_str[:70] + '...') if len(content_str) > 70 else content_str
             content_info = f" | Extracted: [cyan]{snippet}[/]"
         elif result.get('markdown'):
-            content_info = f" | Markdown: [cyan]Present[/]"
+            content_info = f" | Markdown: [cyan]Present[/]"
         elif result.get('html'):
             content_info = f" | HTML Size: [cyan]{len(result['html'])}[/]"

-        console.print(f"{success_icon} URL: [link={url}]{url}[/link] (Status: {status}){content_info}")
+        console.print(
+            f"{success_icon} URL: [link={url}]{url}[/link] (Status: {status}){content_info}")
         if "metadata" in result and "depth" in result["metadata"]:
             console.print(f"    Depth: {result['metadata']['depth']}")
         if not result.get('success') and result.get('error_message'):
@@ -104,7 +116,8 @@
         start_time = time.time()
         response = await client.post(endpoint, json=payload)
         duration = time.time() - start_time
-        console.print(f"Response Status: [bold {'green' if response.is_success else 'red'}]{response.status_code}[/] (took {duration:.2f}s)")
+        console.print(
+            f"Response Status: [bold {'green' if response.is_success else 'red'}]{response.status_code}[/] (took {duration:.2f}s)")
         response.raise_for_status()
         data = response.json()
         if data.get("success"):
@@ -119,7 +132,8 @@
         console.print(f"[bold red]HTTP Error:[/]")
         console.print(f"Status: {e.response.status_code}")
{e.response.status_code}") try: - console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="default"), title="Error Response")) + console.print(Panel(Syntax(json.dumps( + e.response.json(), indent=2), "json", theme="default"), title="Error Response")) except json.JSONDecodeError: console.print(f"Response Body: {e.response.text}") except httpx.RequestError as e: @@ -128,21 +142,24 @@ async def make_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[s console.print(f"[bold red]Unexpected Error: {e}[/]") return None + async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[str, Any], title: str): """Handles streaming POST requests.""" console.rule(f"[bold magenta]{title}[/]", style="magenta") print_payload(payload) - console.print(f"Sending POST stream request to {client.base_url}{endpoint}...") + console.print( + f"Sending POST stream request to {client.base_url}{endpoint}...") all_results = [] - initial_status_code = None # Store initial status code + initial_status_code = None # Store initial status code try: start_time = time.time() async with client.stream("POST", endpoint, json=payload) as response: - initial_status_code = response.status_code # Capture initial status - duration = time.time() - start_time # Time to first byte potentially - console.print(f"Initial Response Status: [bold {'green' if response.is_success else 'red'}]{initial_status_code}[/] (first byte ~{duration:.2f}s)") - response.raise_for_status() # Raise exception for bad *initial* status codes + initial_status_code = response.status_code # Capture initial status + duration = time.time() - start_time # Time to first byte potentially + console.print( + f"Initial Response Status: [bold {'green' if response.is_success else 'red'}]{initial_status_code}[/] (first byte ~{duration:.2f}s)") + response.raise_for_status() # Raise exception for bad *initial* status codes console.print("[magenta]--- Streaming Results ---[/]") completed = False @@ -152,24 +169,31 @@ async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict data = json.loads(line) if data.get("status") == "completed": completed = True - console.print("[bold green]--- Stream Completed ---[/]") + console.print( + "[bold green]--- Stream Completed ---[/]") break - elif data.get("url"): # Looks like a result dictionary + elif data.get("url"): # Looks like a result dictionary all_results.append(data) # Display summary info as it arrives - success_icon = "[green]✔[/]" if data.get('success') else "[red]✘[/]" + success_icon = "[green]✔[/]" if data.get( + 'success') else "[red]✘[/]" url = data.get('url', 'N/A') # Display status code FROM THE RESULT DATA if available result_status = data.get('status_code', 'N/A') - console.print(f" {success_icon} Received: [link={url}]{url}[/link] (Status: {result_status})") + console.print( + f" {success_icon} Received: [link={url}]{url}[/link] (Status: {result_status})") if not data.get('success') and data.get('error_message'): - console.print(f" [red]Error: {data['error_message']}[/]") + console.print( + f" [red]Error: {data['error_message']}[/]") else: - console.print(f" [yellow]Stream meta-data:[/yellow] {data}") + console.print( + f" [yellow]Stream meta-data:[/yellow] {data}") except json.JSONDecodeError: - console.print(f" [red]Stream decode error for line:[/red] {line}") + console.print( + f" [red]Stream decode error for line:[/red] {line}") if not completed: - console.print("[bold yellow]Warning: Stream ended without 'completed' marker.[/]") + 
 def load_proxies_from_env() -> List[Dict]:
     """
     Load proxies from the PROXIES environment variable.
@@ -199,7 +226,7 @@
     proxies_str = os.getenv("PROXIES", "")
     if not proxies_str:
         # console.print("[yellow]PROXIES environment variable not set or empty.[/]")
-        return proxies_params_list # Return empty list if not set
+        return proxies_params_list  # Return empty list if not set

     try:
         proxy_entries = proxies_str.split(",")
@@ -211,38 +238,40 @@
             parts = entry.split(":")
             proxy_dict = {}

-            if len(parts) == 4: # Format: IP:PORT:USER:PASS
+            if len(parts) == 4:  # Format: IP:PORT:USER:PASS
                 ip, port, username, password = parts
                 proxy_dict = {
-                    "server": f"http://{ip}:{port}", # Assuming http protocol
+                    "server": f"http://{ip}:{port}",  # Assuming http protocol
                     "username": username,
                     "password": password,
                     # "ip": ip # 'ip' is not a standard ProxyConfig param, 'server' contains it
                 }
-            elif len(parts) == 2: # Format: IP:PORT
+            elif len(parts) == 2:  # Format: IP:PORT
                 ip, port = parts
                 proxy_dict = {
                     "server": f"http://{ip}:{port}",
                     # "ip": ip
                 }
             else:
-                console.print(f"[yellow]Skipping invalid proxy string format:[/yellow] {entry}")
-                continue
+                console.print(
+                    f"[yellow]Skipping invalid proxy string format:[/yellow] {entry}")
+                continue

             proxies_params_list.append(proxy_dict)

     except Exception as e:
-        console.print(f"[red]Error loading proxies from environment:[/red] {e}")
+        console.print(
+            f"[red]Error loading proxies from environment:[/red] {e}")

     if proxies_params_list:
-        console.print(f"[cyan]Loaded {len(proxies_params_list)} proxies from environment.[/]")
+        console.print(
+            f"[cyan]Loaded {len(proxies_params_list)} proxies from environment.[/]")
     # else:
     #     console.print("[yellow]No valid proxies loaded from environment.[/]")

     return proxies_params_list
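# load_proxies_from_env() accepts comma-separated IP:PORT:USER:PASS or
# IP:PORT entries. A hypothetical PROXIES value and the dicts it would
# produce:
#
#     PROXIES="203.0.113.7:8080:user:secret,203.0.113.9:3128"
#
#     [{"server": "http://203.0.113.7:8080",
#       "username": "user", "password": "secret"},
#      {"server": "http://203.0.113.9:3128"}]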
-
 # --- Demo Functions ---
 # 1. Basic Crawling
@@ -250,11 +279,17 @@
     payload = {
         "urls": [SIMPLE_URL],
         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
-        "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS"}}
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "cache_mode": "BYPASS"
+            }
+        }
     }
     result = await make_request(client, "/crawl", payload, "Demo 1a: Basic Single URL Crawl")
     return result

+
 async def demo_basic_multi_url(client: httpx.AsyncClient):
     payload = {
         "urls": [SIMPLE_URL, LINKS_URL],
@@ -264,16 +299,31 @@
     result = await make_request(client, "/crawl", payload, "Demo 1b: Basic Multi URL Crawl")
     return result

+
 async def demo_streaming_multi_url(client: httpx.AsyncClient):
     payload = {
-        "urls": [SIMPLE_URL, LINKS_URL, FORMS_URL], # Add another URL
+        # "urls": [SIMPLE_URL, LINKS_URL, FORMS_URL, SIMPLE_URL, LINKS_URL, FORMS_URL], # Add another URL
+        "urls": [
+            "https://example.com/page1",
+            "https://example.com/page2",
+            "https://example.com/page3",
+            "https://example.com/page4",
+            "https://example.com/page5"
+        ],  # Add another URL
         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
-        "crawler_config": {"type": "CrawlerRunConfig", "params": {"stream": True, "cache_mode": "BYPASS"}}
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "stream": True,
+            }
+        }
     }
-    result = stream_request(client, "/crawl/stream", payload, "Demo 1c: Streaming Multi URL Crawl")
+    result = await stream_request(client, "/crawl/stream", payload, "Demo 1c: Streaming Multi URL Crawl")
     return result

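# The one functional change in Demo 1c above: stream_request() is a
# coroutine function, and the old `result = stream_request(...)` only
# created a coroutine object without running it (Python warns "coroutine
# ... was never awaited"), so the streaming demo never executed. A minimal
# sketch:
#
#     result = stream_request(...)        # created, never runs
#     result = await stream_request(...)  # actually runs the stream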
 # 2. Markdown Generation & Content Filtering
+
+
 async def demo_markdown_default(client: httpx.AsyncClient):
     payload = {
         "urls": [SIMPLE_URL],
@@ -281,17 +331,28 @@
         "crawler_config": {
             "type": "CrawlerRunConfig",
             "params": {
-                "cache_mode": "BYPASS",
-                "markdown_generator": {"type": "DefaultMarkdownGenerator", "params": {}} # Explicitly default
+                "markdown_generator": {
+                    "type": "DefaultMarkdownGenerator",
+                    "params": {
+                        "content_source": "fit_html",
+                        "options": {
+                            "type": "dict",
+                            "value": {
+                                "ignore_links": True
+                            }
+                        }
+                    }
+                }  # Explicitly default
             }
         }
     }
     result = await make_request(client, "/crawl", payload, "Demo 2a: Default Markdown Generation")
     return result

+
 async def demo_markdown_pruning(client: httpx.AsyncClient):
     payload = {
-        "urls": [PYTHON_URL], # Use a more complex page
+        "urls": [PYTHON_URL],  # Use a more complex page
         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
         "crawler_config": {
             "type": "CrawlerRunConfig",
@@ -302,7 +363,10 @@
                     "params": {
                         "content_filter": {
                             "type": "PruningContentFilter",
-                            "params": {"threshold": 0.6, "threshold_type": "relative"}
+                            "params": {
+                                "threshold": 0.6,
+                                "threshold_type": "relative"
+                            }
                         }
                     }
                 }
@@ -312,6 +376,7 @@
     result = await make_request(client, "/crawl", payload, "Demo 2b: Markdown with Pruning Filter")
     return result

+
 async def demo_markdown_bm25(client: httpx.AsyncClient):
     payload = {
         "urls": [PYTHON_URL],
@@ -325,7 +390,9 @@
                     "params": {
                         "content_filter": {
                             "type": "BM25ContentFilter",
-                            "params": {"user_query": "Python documentation language reference"}
+                            "params": {
+                                "user_query": "Python documentation language reference"
+                            }
                         }
                     }
                 }
@@ -337,21 +404,22 @@

 # 3. Specific Parameters
 # Corrected Demo Function: demo_param_css_selector
+
+
 async def demo_param_css_selector(client: httpx.AsyncClient):
-    target_selector = ".main-content" # Using the suggested correct selector
+    css_selector = ".main-content"  # Using the suggested correct selector
     payload = {
         "urls": [PYTHON_URL],
         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
         "crawler_config": {
             "type": "CrawlerRunConfig",
             "params": {
-                "cache_mode": "BYPASS",
-                "css_selector": target_selector # Target specific div
+                "css_selector": css_selector  # Target specific div
                 # No extraction strategy is needed to demo this parameter's effect on input HTML
             }
         }
     }
-    results = await make_request(client, "/crawl", payload, f"Demo 3a: Using css_selector ('{target_selector}')")
+    results = await make_request(client, "/crawl", payload, f"Demo 3a: Using css_selector ('{css_selector}')")

     if results:
         result = results[0]
@@ -360,45 +428,55 @@
         # A simple check: does it contain expected content from within the selector,
         # and does it LACK content known to be outside (like footer links)?
         html_content = result['html']
-        content_present = 'Python Software Foundation' in html_content # Text likely within .main-content somewhere
-        footer_absent = 'Legal Statements' not in html_content # Text likely in the footer, outside .main-content
+        # Text likely within .main-content somewhere
+        content_present = 'Python Software Foundation' in html_content
+        # Text likely in the footer, outside .main-content
+        footer_absent = 'Legal Statements' not in html_content

-        console.print(f"  Content Check: Text inside '{target_selector}' likely present? {'[green]Yes[/]' if content_present else '[red]No[/]'}")
-        console.print(f"  Content Check: Text outside '{target_selector}' (footer) likely absent? {'[green]Yes[/]' if footer_absent else '[red]No[/]'}")
+        console.print(
+            f"  Content Check: Text inside '{css_selector}' likely present? {'[green]Yes[/]' if content_present else '[red]No[/]'}")
+        console.print(
+            f"  Content Check: Text outside '{css_selector}' (footer) likely absent? {'[green]Yes[/]' if footer_absent else '[red]No[/]'}")
         if not content_present or not footer_absent:
-            console.print(f"  [yellow]Note:[/yellow] HTML filtering might not be precise or page structure changed. Result HTML length: {len(html_content)}")
+            console.print(
+                f"  [yellow]Note:[/yellow] HTML filtering might not be precise or page structure changed. Result HTML length: {len(html_content)}")
         else:
-            console.print(f"  [green]Verified:[/green] Returned HTML appears limited by css_selector. Result HTML length: {len(html_content)}")
+            console.print(
+                f"  [green]Verified:[/green] Returned HTML appears limited by css_selector. Result HTML length: {len(html_content)}")
     elif result['success']:
-        console.print("[yellow]HTML content was empty in the successful result.[/]")
+        console.print(
+            "[yellow]HTML content was empty in the successful result.[/]")
     # Error message is handled by print_result_summary called by make_request

+
 async def demo_param_js_execution(client: httpx.AsyncClient):
     payload = {
-        "urls": [FORMS_URL], # Use a page with a form
+        "urls": ["https://example.com"],  # Use a page with a form
         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
         "crawler_config": {
             "type": "CrawlerRunConfig",
             "params": {
                 "cache_mode": "BYPASS",
-                # Simple JS to fill and maybe click (won't submit without more complex setup)
+                # Simple JS to fill and maybe click (won't submit without more complex setup)
                 "js_code": """
-                () => {
-                    document.querySelector('[name="custname"]').value = 'Crawl4AI Demo';
-                    return { filled_name: document.querySelector('[name="custname"]').value };
-                }
+                (() => {
+                    document.querySelector('h1').innerText = 'Crawl4AI Demo';
+                    return { filled_name: document.querySelector('h1').innerText };
+                })();
                 """,
-                "delay_before_return_html": 0.5 # Give JS time to potentially run
+                "delay_before_return_html": 0.5  # Give JS time to potentially run
             }
         }
     }
     results = await make_request(client, "/crawl", payload, "Demo 3b: Using js_code Parameter")
     if results and results[0].get("js_execution_result"):
-        console.print("[cyan]JS Execution Result:[/]", results[0]["js_execution_result"])
+        console.print("[cyan]JS Execution Result:[/]",
+                      results[0]["js_execution_result"])
     elif results:
-        console.print("[yellow]JS Execution Result not found in response.[/]")
+        console.print("[yellow]JS Execution Result not found in response.[/]")

+
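# On the js_code change above: the old snippet was a bare arrow-function
# expression, so evaluating the string yields a function object rather than
# the object it returns. Rewritten as an IIFE, `(() => { ... })();`, the
# string evaluates to the returned value itself, which is presumably what
# lets the `{"filled_name": ...}` object come back in js_execution_result.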
"urls": ["https://httpbin.org/ip"], # URL that shows originating IP + "urls": ["https://httpbin.org/ip"], # URL that shows originating IP "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, "crawler_config": { "type": "CrawlerRunConfig", @@ -450,8 +532,21 @@ async def demo_param_proxy(client: httpx.AsyncClient): "type": "RoundRobinProxyStrategy", "params": { "proxies": [ + # [ + # { + # "type": "ProxyConfig", + # "params": { + # server:"...", + # "username": "...", + # "password": "..." + # } + # }, + # ... + # ] + # Filter out the 'ip' key when sending to server, as it's not part of ProxyConfig - {"type": "ProxyConfig", "params": {k: v for k, v in p.items() if k != 'ip'}} + {"type": "ProxyConfig", "params": { + k: v for k, v in p.items() if k != 'ip'}} for p in proxy_params_list ] } @@ -474,43 +569,55 @@ async def demo_param_proxy(client: httpx.AsyncClient): end = html_content.rfind('}') if start != -1 and end != -1: json_str = html_content[start:end+1] - elif html_content.strip().startswith('{'): # Maybe it's just JSON - json_str = html_content.strip() + elif html_content.strip().startswith('{'): # Maybe it's just JSON + json_str = html_content.strip() if json_str: ip_data = json.loads(json_str) origin_ip = ip_data.get("origin") - console.print(f" Origin IP reported by httpbin: [bold yellow]{origin_ip}[/]") + console.print( + f" Origin IP reported by httpbin: [bold yellow]{origin_ip}[/]") # Extract the IPs from the proxy list for comparison - proxy_ips = {p.get("server").split(":")[1][2:] for p in proxy_params_list} + proxy_ips = {p.get("server").split( + ":")[1][2:] for p in proxy_params_list} if origin_ip and origin_ip in proxy_ips: - console.print("[bold green] Verification SUCCESS: Origin IP matches one of the provided proxies![/]") + console.print( + "[bold green] Verification SUCCESS: Origin IP matches one of the provided proxies![/]") elif origin_ip: - console.print("[bold red] Verification FAILED: Origin IP does not match any provided proxy IPs.[/]") + console.print( + "[bold red] Verification FAILED: Origin IP does not match any provided proxy IPs.[/]") console.print(f" Provided Proxy IPs: {proxy_ips}") else: - console.print("[yellow] Verification SKIPPED: Could not extract origin IP from response.[/]") + console.print( + "[yellow] Verification SKIPPED: Could not extract origin IP from response.[/]") else: - console.print("[yellow] Verification SKIPPED: Could not find JSON in httpbin response HTML.[/]") - # console.print(f"HTML Received:\n{html_content[:500]}...") # Uncomment for debugging + console.print( + "[yellow] Verification SKIPPED: Could not find JSON in httpbin response HTML.[/]") + # console.print(f"HTML Received:\n{html_content[:500]}...") # Uncomment for debugging except json.JSONDecodeError: - console.print("[red] Verification FAILED: Could not parse JSON from httpbin response HTML.[/]") + console.print( + "[red] Verification FAILED: Could not parse JSON from httpbin response HTML.[/]") except Exception as e: - console.print(f"[red] Verification Error: An unexpected error occurred during IP check: {e}[/]") + console.print( + f"[red] Verification Error: An unexpected error occurred during IP check: {e}[/]") elif results: - console.print("[yellow] Verification SKIPPED: Crawl for IP check was not successful.[/]") + console.print( + "[yellow] Verification SKIPPED: Crawl for IP check was not successful.[/]") + +# 4. Extraction Strategies + -# 4. 
 async def demo_extract_css(client: httpx.AsyncClient):
     # Schema to extract book titles and prices
     book_schema = {
         "name": "BookList",
         "baseSelector": "ol.row li.col-xs-6",
         "fields": [
-            {"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"},
+            {"name": "title", "selector": "article.product_pod h3 a",
+             "type": "attribute", "attribute": "title"},
             {"name": "price", "selector": "article.product_pod .price_color", "type": "text"},
         ]
     }
@@ -523,7 +630,12 @@
             "cache_mode": "BYPASS",
             "extraction_strategy": {
                 "type": "JsonCssExtractionStrategy",
-                "params": {"schema": {"type": "dict", "value": book_schema}}
+                "params": {
+                    "schema": {
+                        "type": "dict",
+                        "value": book_schema
+                    }
+                }
             }
         }
     }
@@ -534,27 +646,34 @@
         try:
             extracted_data = json.loads(results[0]["extracted_content"])
             if isinstance(extracted_data, list) and extracted_data:
-                 console.print("[cyan]Sample Extracted Books (CSS):[/]")
-                 table = Table(show_header=True, header_style="bold magenta")
-                 table.add_column("Title", style="dim")
-                 table.add_column("Price")
-                 for item in extracted_data[:5]: # Show first 5
-                     table.add_row(item.get('title', 'N/A'), item.get('price', 'N/A'))
-                 console.print(table)
+                console.print("[cyan]Sample Extracted Books (CSS):[/]")
+                table = Table(show_header=True, header_style="bold magenta")
+                table.add_column("Title", style="dim")
+                table.add_column("Price")
+                for item in extracted_data[:5]:  # Show first 5
+                    table.add_row(item.get('title', 'N/A'),
+                                  item.get('price', 'N/A'))
+                console.print(table)
             else:
-                 console.print("[yellow]CSS extraction did not return a list of results.[/]")
-                 console.print(extracted_data)
+                console.print(
+                    "[yellow]CSS extraction did not return a list of results.[/]")
+                console.print(extracted_data)
         except json.JSONDecodeError:
-             console.print("[red]Failed to parse extracted_content as JSON.[/]")
+            console.print("[red]Failed to parse extracted_content as JSON.[/]")
        except Exception as e:
-             console.print(f"[red]Error processing extracted CSS content: {e}[/]")
+            console.print(
+                f"[red]Error processing extracted CSS content: {e}[/]")
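# extracted_content arrives as a JSON string; once parsed, the schema above
# yields one dict per matched <li>, e.g. (illustrative values from
# books.toscrape.com):
#
#     [{"title": "A Light in the Attic", "price": "£51.77"},
#      {"title": "Tipping the Velvet", "price": "£53.74"}, ...]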

 # 5. LLM Extraction
+
+
 async def demo_extract_llm(client: httpx.AsyncClient):
-    if not os.getenv("OPENAI_API_KEY"): # Basic check for a common key
-        console.rule("[bold yellow]Demo 4b: LLM Extraction (SKIPPED)[/]", style="yellow")
-        console.print("Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.")
-        return
+    if not os.getenv("OPENAI_API_KEY"):  # Basic check for a common key
+        console.rule(
+            "[bold yellow]Demo 4b: LLM Extraction (SKIPPED)[/]", style="yellow")
+        console.print(
+            "Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.")
+        return

     payload = {
         "urls": [SIMPLE_URL],
@@ -567,13 +686,16 @@
                 "type": "LLMExtractionStrategy",
                 "params": {
                     "instruction": "Extract title and author into JSON.",
-                    "llm_config": { # Optional: Specify provider if not default
+                    "llm_config": {  # Optional: Specify provider if not default
                         "type": "LLMConfig",
                         "params": {}  # Relies on server's default provider from config.yml & keys from .llm.env
-                        # "params": {"provider": "openai/gpt-4o-mini"}
+                        # "params": {
+                        #     "provider": "openai/gpt-4o-mini",
+                        #     "api_key": os.getenv("OPENAI_API_KEY") # Optional: Override key
+                        # }
                     },
-                    "schema": { # Request structured output
+                    "schema": {  # Request structured output
                         "type": "dict",
                         "value": {
                             "title": "BookInfo", "type": "object",
@@ -598,18 +720,24 @@
                 extracted_data = extracted_data[0]

             if isinstance(extracted_data, dict):
-                 console.print("[cyan]Extracted Data (LLM):[/]")
-                 syntax = Syntax(json.dumps(extracted_data, indent=2), "json", theme="monokai", line_numbers=False)
-                 console.print(Panel(syntax, border_style="cyan", expand=False))
+                console.print("[cyan]Extracted Data (LLM):[/]")
+                syntax = Syntax(json.dumps(extracted_data, indent=2),
+                                "json", theme="monokai", line_numbers=False)
+                console.print(Panel(syntax, border_style="cyan", expand=False))
             else:
-                 console.print("[yellow]LLM extraction did not return expected dictionary.[/]")
-                 console.print(extracted_data)
+                console.print(
+                    "[yellow]LLM extraction did not return expected dictionary.[/]")
+                console.print(extracted_data)
         except json.JSONDecodeError:
-             console.print("[red]Failed to parse LLM extracted_content as JSON.[/]")
+            console.print(
+                "[red]Failed to parse LLM extracted_content as JSON.[/]")
        except Exception as e:
-             console.print(f"[red]Error processing extracted LLM content: {e}[/]")
+            console.print(
+                f"[red]Error processing extracted LLM content: {e}[/]")
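# Given the instruction "Extract title and author into JSON." and the object
# schema above, a successful run should parse into a single dict, something
# like (illustrative; SIMPLE_URL, httpbin's /html page, is a Moby-Dick
# excerpt, but the exact fields depend on the page and the model):
#
#     {"title": "Herman Melville - Moby-Dick", "author": "Herman Melville"}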

 # 6. Deep Crawling
+
+
 async def demo_deep_basic(client: httpx.AsyncClient):
     payload = {
         "urls": [DEEP_CRAWL_BASE_URL],
@@ -625,7 +753,17 @@
                     "max_pages": 4,
                     "filter_chain": {
                         "type": "FilterChain",
-                        "params": {"filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
+                        "params": {
+                            "filters": [
+                                {
+                                    "type": "DomainFilter",
+                                    "params":
+                                    {
+                                        "allowed_domains": [DEEP_CRAWL_DOMAIN]
+                                    }
+                                }
+                            ]
+                        }
                     }
                 }
             }
@@ -642,6 +780,8 @@
             console.print(f"    [red]Error: {result['error_message']}[/]")

 # 5. Streaming Deep Crawl
+
+
 async def demo_deep_streaming(client: httpx.AsyncClient):
     payload = {
         "urls": [DEEP_CRAWL_BASE_URL],
@@ -649,7 +789,7 @@
         "crawler_config": {
             "type": "CrawlerRunConfig",
             "params": {
-                "stream": True, # Enable streaming
+                "stream": True,  # Enable streaming
                 "cache_mode": "BYPASS",
                 "deep_crawl_strategy": {
                     "type": "BFSDeepCrawlStrategy",
@@ -669,11 +809,13 @@
     await stream_request(client, "/crawl/stream", payload, "Demo 5b: Streaming Deep Crawl")

 # 5a. Deep Crawl with Filtering & Scoring
+
+
 async def demo_deep_filtering_scoring(client: httpx.AsyncClient):
     """Demonstrates deep crawl with advanced URL filtering and scoring."""
-    max_depth = 2 # Go a bit deeper to see scoring/filtering effects
+    max_depth = 2  # Go a bit deeper to see scoring/filtering effects
     max_pages = 6
-    excluded_pattern = "*/category-1/*" # Example pattern to exclude
+    excluded_pattern = "*/category-1/*"  # Example pattern to exclude
     keyword_to_score = "product"       # Example keyword to prioritize

     payload = {
@@ -705,7 +847,7 @@
                                 "type": "URLPatternFilter",
                                 "params": {
                                     "patterns": [excluded_pattern],
-                                    "reverse": True # Block if match
+                                    "reverse": True  # Block if match
                                 }
                             }
                         ]
@@ -717,7 +859,8 @@
                         "scorers": [
                             { # Boost score for URLs containing the keyword
                                 "type": "KeywordRelevanceScorer",
-                                "params": {"keywords": [keyword_to_score], "weight": 1.5} # Higher weight
+                                # Higher weight
+                                "params": {"keywords": [keyword_to_score], "weight": 1.5}
                            },
                            { # Slightly penalize deeper pages
                                "type": "PathDepthScorer",
@@ -747,39 +890,51 @@
         depth = result.get("metadata", {}).get("depth", -1)

         # Check Filtering
-        if excluded_pattern.strip('*') in url: # Check if the excluded part is present
-            console.print(f"  [bold red]Filter FAILED:[/bold red] Excluded pattern part '{excluded_pattern.strip('*')}' found in URL: {url}")
+        # Check if the excluded part is present
+        if excluded_pattern.strip('*') in url:
+            console.print(
+                f"  [bold red]Filter FAILED:[/bold red] Excluded pattern part '{excluded_pattern.strip('*')}' found in URL: {url}")
             excluded_found = True

         # Check Scoring (Observation)
         if keyword_to_score in url:
-             prioritized_found_overall = True
-             if depth == 1: # Check if prioritized keywords appeared early (depth 1)
-                 prioritized_found_at_depth1 = True
+            prioritized_found_overall = True
+            # Check if prioritized keywords appeared early (depth 1)
+            if depth == 1:
+                prioritized_found_at_depth1 = True

     if not excluded_found:
-        console.print(f"  [green]Filter Check:[/green] No URLs matching excluded pattern '{excluded_pattern}' found.")
+        console.print(
+            f"  [green]Filter Check:[/green] No URLs matching excluded pattern '{excluded_pattern}' found.")
     else:
-        console.print(f"  [red]Filter Check:[/red] URLs matching excluded pattern '{excluded_pattern}' were found (unexpected).")
+        console.print(
+            f"  [red]Filter Check:[/red] URLs matching excluded pattern '{excluded_pattern}' were found (unexpected).")

     if prioritized_found_at_depth1:
-        console.print(f"  [green]Scoring Check:[/green] URLs with keyword '{keyword_to_score}' were found at depth 1 (scoring likely influenced).")
+        console.print(
+            f"  [green]Scoring Check:[/green] URLs with keyword '{keyword_to_score}' were found at depth 1 (scoring likely influenced).")
     elif prioritized_found_overall:
-        console.print(f"  [yellow]Scoring Check:[/yellow] URLs with keyword '{keyword_to_score}' found, but not necessarily prioritized early (check max_pages/depth limits).")
+        console.print(
+            f"  [yellow]Scoring Check:[/yellow] URLs with keyword '{keyword_to_score}' found, but not necessarily prioritized early (check max_pages/depth limits).")
     else:
-        console.print(f"  [yellow]Scoring Check:[/yellow] No URLs with keyword '{keyword_to_score}' found within crawl limits.")
+        console.print(
+            f"  [yellow]Scoring Check:[/yellow] No URLs with keyword '{keyword_to_score}' found within crawl limits.")

     # print_result_summary called by make_request already shows URLs and depths
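# How the chain above is meant to behave, per the inline comments:
# URLPatternFilter with reverse=True acts as a blocklist (URLs matching
# "*/category-1/*" are dropped), while KeywordRelevanceScorer with
# weight=1.5 raises the score of URLs containing "product" so the deep-crawl
# strategy can visit them earlier within the max_depth/max_pages budget.
# The checks above only observe and report this; they never fail the run.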

 # 6. Deep Crawl with Extraction
+
+
 async def demo_deep_with_css_extraction(client: httpx.AsyncClient):
     # Schema to extract H1 and first paragraph from any page
     general_schema = {
         "name": "PageContent",
-        "baseSelector": "body", # Apply to whole body
+        "baseSelector": "body",  # Apply to whole body
         "fields": [
-            {"name": "page_title", "selector": "h1", "type": "text", "default": "N/A"},
-            {"name": "first_p", "selector": "p", "type": "text", "default": "N/A"}, # Gets first p tag
+            {"name": "page_title", "selector": "h1",
+             "type": "text", "default": "N/A"},
+            {"name": "first_p", "selector": "p", "type": "text",
+             "default": "N/A"},  # Gets first p tag
         ]
     }
     payload = {
@@ -789,7 +944,7 @@
             "type": "CrawlerRunConfig",
             "params": {
                 "cache_mode": "BYPASS",
-                "extraction_strategy": { # Apply CSS extraction to each page
+                "extraction_strategy": {  # Apply CSS extraction to each page
                     "type": "JsonCssExtractionStrategy",
                     "params": {"schema": {"type": "dict", "value": general_schema}}
                 },
@@ -801,9 +956,11 @@
                     "filter_chain": {
                         "type": "FilterChain",
                         "params": {"filters": [
-                            {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
-                            {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
-                        ]}
+                            {"type": "DomainFilter", "params": {
+                                "allowed_domains": [DEEP_CRAWL_DOMAIN]}},
+                            {"type": "ContentTypeFilter", "params": {
+                                "allowed_types": ["text/html"]}}
+                        ]}
                     }
                 }
             }
@@ -818,22 +975,32 @@
             if result.get("success") and result.get("extracted_content"):
                 try:
                     extracted = json.loads(result["extracted_content"])
-                    if isinstance(extracted, list) and extracted: extracted = extracted[0] # Use first item
-                    title = extracted.get('page_title', 'N/A') if isinstance(extracted, dict) else 'Parse Error'
-                    console.print(f"  [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Title: {title}")
+                    if isinstance(extracted, list) and extracted:
+                        extracted = extracted[0]  # Use first item
+                    title = extracted.get(
+                        'page_title', 'N/A') if isinstance(extracted, dict) else 'Parse Error'
+                    console.print(
+                        f"  [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Title: {title}")
                 except Exception:
-                    console.print(f"  [yellow]![/] URL: [link={result['url']}]{result['url']}[/link] | Failed to parse extracted content")
+                    console.print(
+                        f"  [yellow]![/] URL: [link={result['url']}]{result['url']}[/link] | Failed to parse extracted content")
             elif result.get("success"):
-                console.print(f"  [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.")
+                console.print(
+                    f"  [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.")
             else:
-                console.print(f"  [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
+                console.print(
+                    f"  [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")

 # 6b. Deep Crawl with LLM Extraction
+
+
 async def demo_deep_with_llm_extraction(client: httpx.AsyncClient):
-    if not os.getenv("OPENAI_API_KEY"): # Basic check
-        console.rule("[bold yellow]Demo 6b: Deep Crawl + LLM Extraction (SKIPPED)[/]", style="yellow")
-        console.print("Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.")
-        return
+    if not os.getenv("OPENAI_API_KEY"):  # Basic check
+        console.rule(
+            "[bold yellow]Demo 6b: Deep Crawl + LLM Extraction (SKIPPED)[/]", style="yellow")
+        console.print(
+            "Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.")
+        return

     payload = {
         "urls": [DEEP_CRAWL_BASE_URL],
@@ -842,7 +1009,7 @@
             "type": "CrawlerRunConfig",
             "params": {
                 "cache_mode": "BYPASS",
-                "extraction_strategy": { # Apply LLM extraction to each page
+                "extraction_strategy": {  # Apply LLM extraction to each page
                     "type": "LLMExtractionStrategy",
                     "params": {
                         "instruction": "What is the main topic of this page based on the H1 and first paragraph? Respond with just the topic.",
@@ -853,13 +1020,15 @@
                     "type": "BFSDeepCrawlStrategy",
                     "params": {
                         "max_depth": 1,
-                        "max_pages": 2, # Reduce pages for LLM cost/time
-                        "filter_chain": {
+                        "max_pages": 2,  # Reduce pages for LLM cost/time
+                        "filter_chain": {
                             "type": "FilterChain",
                             "params": {"filters": [
-                                {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
-                                {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
-                            ]}
+                                {"type": "DomainFilter", "params": {
+                                    "allowed_domains": [DEEP_CRAWL_DOMAIN]}},
+                                {"type": "ContentTypeFilter", "params": {
+                                    "allowed_types": ["text/html"]}}
+                            ]}
                         }
                     }
                 }
@@ -872,34 +1041,40 @@
         console.print("[cyan]LLM Extraction Summary from Deep Crawl:[/]")
         for result in results:
             if result.get("success") and result.get("extracted_content"):
-                console.print(f"  [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Topic: {result['extracted_content']}")
+                console.print(
+                    f"  [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Topic: {result['extracted_content']}")
             elif result.get("success"):
-                console.print(f"  [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.")
+                console.print(
+                    f"  [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.")
             else:
-                console.print(f"  [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
+                console.print(
+                    f"  [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")

 # 6c. Deep Crawl with Proxies
 async def demo_deep_with_proxy(client: httpx.AsyncClient):
-    proxy_params_list = load_proxies_from_env() # Get the list of parameter dicts
+    proxy_params_list = load_proxies_from_env()  # Get the list of parameter dicts
     if not proxy_params_list:
-        console.rule("[bold yellow]Demo 6c: Deep Crawl + Proxies (SKIPPED)[/]", style="yellow")
+        console.rule(
+            "[bold yellow]Demo 6c: Deep Crawl + Proxies (SKIPPED)[/]", style="yellow")
         console.print("Set the PROXIES environment variable to run this demo.")
         return

     payload = {
-        "urls": [DEEP_CRAWL_BASE_URL], # Use a site likely accessible via proxies
+        # Use a site likely accessible via proxies
+        "urls": [DEEP_CRAWL_BASE_URL],
         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
         "crawler_config": {
             "type": "CrawlerRunConfig",
             "params": {
                 "cache_mode": "BYPASS",
-                "proxy_rotation_strategy": {
+                "proxy_rotation_strategy": {
                     "type": "RoundRobinProxyStrategy",
-                    "params": {
+                    "params": {
                         # Correctly create the list of {"type": ..., "params": ...} structures, excluding the demo 'ip' key
                         "proxies": [
-                            {"type": "ProxyConfig", "params": {k: v for k, v in p.items() if k != 'ip'}}
+                            {"type": "ProxyConfig", "params": {
+                                k: v for k, v in p.items() if k != 'ip'}}
                             for p in proxy_params_list
                         ]
                     }
@@ -907,7 +1082,7 @@
                 "deep_crawl_strategy": {
                     "type": "BFSDeepCrawlStrategy",
                     "params": {
-                        "max_depth": 1, # Just crawl start URL via proxy
+                        "max_depth": 1,  # Just crawl start URL via proxy
                         "max_pages": 5,
                     }
                 }
@@ -933,18 +1108,18 @@
 async def demo_deep_with_ssl(client: httpx.AsyncClient):
     """Test BFS deep crawl with fetch_ssl_certificate enabled."""
     payload = {
-        "urls": [DEEP_CRAWL_BASE_URL], # Needs HTTPS
+        "urls": [DEEP_CRAWL_BASE_URL],  # Needs HTTPS
         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
         "crawler_config": {
             "type": "CrawlerRunConfig",
             "params": {
                 "stream": False,
                 "cache_mode": "BYPASS",
-                "fetch_ssl_certificate": True, # <-- Enable SSL fetching
+                "fetch_ssl_certificate": True,  # <-- Enable SSL fetching
                 "deep_crawl_strategy": {
                     "type": "BFSDeepCrawlStrategy",
                     "params": {
-                        "max_depth": 1, # Crawl a bit deeper
+                        "max_depth": 1,  # Crawl a bit deeper
                         "max_pages": 3,
                         "filter_chain": {
                             "type": "FilterChain",
@@ -961,17 +1136,22 @@
         console.print("[cyan]SSL Certificate Summary from Deep Crawl:[/]")
         for result in results:
             if result.get("success") and result.get("ssl_certificate"):
-                 cert = result["ssl_certificate"]
-                 issuer_org = cert.get('issuer', {}).get('O', 'N/A')
-                 valid_from = cert.get('not_before', 'N/A')
-                 valid_to = cert.get('not_after', 'N/A')
-                 console.print(f"  [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Issuer: {issuer_org} | Valid: {valid_from} - {valid_to}")
+                cert = result["ssl_certificate"]
+                issuer_org = cert.get('issuer', {}).get('O', 'N/A')
+                valid_from = cert.get('not_before', 'N/A')
+                valid_to = cert.get('not_after', 'N/A')
+                console.print(
+                    f"  [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Issuer: {issuer_org} | Valid: {valid_from} - {valid_to}")
             elif result.get("success"):
-                console.print(f"  [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | SSL cert not fetched or N/A.")
+                console.print(
+                    f"  [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | SSL cert not fetched or N/A.")
             else:
-                console.print(f"  [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
+                console.print(
+                    f"  [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
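# The ssl_certificate field consumed above is a plain dict. From the keys
# the demo reads, its shape is roughly (illustrative values):
#
#     {"issuer": {"O": "Let's Encrypt", ...},
#      "not_before": "2025-01-01 00:00:00",
#      "not_after": "2025-04-01 00:00:00", ...}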

 # 7. Markdown helper endpoint
+
+
 async def demo_markdown_endpoint(client: httpx.AsyncClient):
     """
     One-shot helper around /md.
@@ -987,15 +1167,19 @@
         t0 = time.time()
         resp = await client.post("/md", json=payload)
         dt = time.time() - t0
-        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
+        console.print(
+            f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
         resp.raise_for_status()
         md = resp.json().get("markdown", "")
         snippet = (md[:500] + "...") if len(md) > 500 else md
-        console.print(Panel(snippet, title="Markdown snippet", border_style="cyan", expand=False))
+        console.print(Panel(snippet, title="Markdown snippet",
+                      border_style="cyan", expand=False))
     except Exception as e:
         console.print(f"[bold red]Error hitting /md:[/] {e}")

 # 8. LLM QA helper endpoint
+
+
 async def demo_llm_endpoint(client: httpx.AsyncClient):
     """
     Quick QA round-trip with /llm.
@@ -1012,10 +1196,12 @@
         t0 = time.time()
         resp = await client.get(f"/llm/{enc}", params={"q": question})
         dt = time.time() - t0
-        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
+        console.print(
+            f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
         resp.raise_for_status()
         answer = resp.json().get("answer", "")
-        console.print(Panel(answer or "No answer returned", title="LLM answer", border_style="magenta", expand=False))
+        console.print(Panel(answer or "No answer returned",
+                      title="LLM answer", border_style="magenta", expand=False))
     except Exception as e:
         console.print(f"[bold red]Error hitting /llm:[/] {e}")

@@ -1036,10 +1222,12 @@
         t0 = time.time()
         resp = await client.post("/config/dump", json=payload)
         dt = time.time() - t0
-        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
+        console.print(
+            f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
         resp.raise_for_status()
         dump_json = resp.json()
-        console.print(Panel(Syntax(json.dumps(dump_json, indent=2), "json", theme="monokai"), title="Dump()", border_style="cyan"))
+        console.print(Panel(Syntax(json.dumps(dump_json, indent=2),
+                      "json", theme="monokai"), title="Dump()", border_style="cyan"))
     except Exception as e:
         console.print(f"[bold red]Error in valid /config/dump call:[/] {e}")

@@ -1053,21 +1241,25 @@
     """).strip()
     payload = {"code": bad_code}

-    console.rule("[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta")
+    console.rule(
+        "[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta")
     print_payload(payload)
     try:
         resp = await client.post("/config/dump", json=payload)
-        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]")
+        console.print(
+            f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]")
         resp.raise_for_status()   # should throw -> except
     except httpx.HTTPStatusError as e:
         console.print("[cyan]Expected parse/validation failure captured:[/]")
         try:
-            console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="fruity"), title="Error payload"))
+            console.print(Panel(Syntax(json.dumps(
+                e.response.json(), indent=2), "json", theme="fruity"), title="Error payload"))
         except Exception:
             console.print(e.response.text)
     except Exception as e:
-        console.print(f"[bold red]Unexpected error during invalid test:[/] {e}")
+        console.print(
+            f"[bold red]Unexpected error during invalid test:[/] {e}")


 # --- Update Main Runner to include new demo ---
@@ -1075,33 +1267,33 @@
 async def main_demo():
     async with httpx.AsyncClient(base_url=BASE_URL, timeout=300.0) as client:
         if not await check_server_health(client):
             return
-
+
         # --- Run Demos ---
-        await demo_basic_single_url(client)
-        await demo_basic_multi_url(client)
-        await demo_streaming_multi_url(client)
+        # await demo_basic_single_url(client)
+        # await demo_basic_multi_url(client)
+        # await demo_streaming_multi_url(client)

-        await demo_markdown_default(client)
-        await demo_markdown_pruning(client)
-        await demo_markdown_bm25(client)
+        # await demo_markdown_default(client)
+        # await demo_markdown_pruning(client)
+        # await demo_markdown_bm25(client)

-        await demo_param_css_selector(client)
-        await demo_param_js_execution(client)
-        await demo_param_screenshot(client)
-        await demo_param_ssl_fetch(client)
-        await demo_param_proxy(client) # Skips if no PROXIES env var
+        # await demo_param_css_selector(client)
+        # await demo_param_js_execution(client)
+        # await demo_param_screenshot(client)
+        # await demo_param_ssl_fetch(client)
+        # await demo_param_proxy(client)  # Skips if no PROXIES env var

-        await demo_extract_css(client)
-        await demo_extract_llm(client) # Skips if no common LLM key env var
+        # await demo_extract_css(client)
+        # await demo_extract_llm(client)  # Skips if no common LLM key env var

-        await demo_deep_basic(client)
-        await demo_deep_streaming(client) # This need extra work
-
+        # await demo_deep_basic(client)
+        # await demo_deep_streaming(client)  # This need extra work

-        await demo_deep_with_css_extraction(client)
-        await demo_deep_with_llm_extraction(client) # Skips if no common LLM key env var
-        await demo_deep_with_proxy(client) # Skips if no PROXIES env var
-        await demo_deep_with_ssl(client) # Added the new demo
+        # await demo_deep_with_css_extraction(client)
+        # # Skips if no common LLM key env var
+        # await demo_deep_with_llm_extraction(client)
+        # await demo_deep_with_proxy(client)  # Skips if no PROXIES env var
+        # await demo_deep_with_ssl(client)  # Added the new demo

         # --- Helper endpoints ---
         await demo_markdown_endpoint(client)
@@ -1120,5 +1312,6 @@
     except KeyboardInterrupt:
         console.print("\n[yellow]Demo interrupted by user.[/]")
     except Exception as e:
-        console.print(f"\n[bold red]An error occurred during demo execution:[/]")
-        console.print_exception(show_locals=False)
\ No newline at end of file
+        console.print(
+            f"\n[bold red]An error occurred during demo execution:[/]")
+        console.print_exception(show_locals=False)
diff --git a/docs/examples/hello_world.py b/docs/examples/hello_world.py
index fbdd5283..0d351fa8 100644
--- a/docs/examples/hello_world.py
+++ b/docs/examples/hello_world.py
@@ -31,7 +31,7 @@ async def example_cdp():


 async def main():
-    browser_config = BrowserConfig(headless=True, verbose=True)
+    browser_config = BrowserConfig(headless=False, verbose=True)
     async with AsyncWebCrawler(config=browser_config) as crawler:
         crawler_config = CrawlerRunConfig(
             cache_mode=CacheMode.BYPASS,
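# Tying the two fixes together: with the async_configs.py change, the
# headless=False in hello_world.py is now honored end to end. A minimal,
# self-contained sketch of that round trip (assumes a local crawl4ai
# install; slicing result.markdown follows the repo's own hello_world
# example):

import asyncio

from crawl4ai import AsyncWebCrawler, BrowserConfig, CacheMode, CrawlerRunConfig


async def main():
    # No longer coerced to True by `headless or True`
    browser_config = BrowserConfig(headless=False, verbose=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
        )
        print(result.markdown[:300])

if __name__ == "__main__":
    asyncio.run(main())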