diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 9ba508b2..5cdc95b9 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -111,7 +111,8 @@ class AsyncWebCrawler:
         self,
         crawler_strategy: AsyncCrawlerStrategy = None,
         config: BrowserConfig = None,
-        base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
+        base_directory: str = str(
+            os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
         thread_safe: bool = False,
         logger: AsyncLoggerBase = None,
         **kwargs,
@@ -139,7 +140,8 @@ class AsyncWebCrawler:
         )

         # Initialize crawler strategy
-        params = {k: v for k, v in kwargs.items() if k in ["browser_config", "logger"]}
+        params = {k: v for k, v in kwargs.items() if k in [
+            "browser_config", "logger"]}
         self.crawler_strategy = crawler_strategy or AsyncPlaywrightCrawlerStrategy(
             browser_config=browser_config,
             logger=self.logger,
@@ -237,7 +239,8 @@ class AsyncWebCrawler:
         config = config or CrawlerRunConfig()

         if not isinstance(url, str) or not url:
-            raise ValueError("Invalid URL, make sure the URL is a non-empty string")
+            raise ValueError(
+                "Invalid URL, make sure the URL is a non-empty string")

         async with self._lock or self.nullcontext():
             try:
@@ -291,12 +294,12 @@ class AsyncWebCrawler:

                 # Update proxy configuration from rotation strategy if available
                 if config and config.proxy_rotation_strategy:
-                    next_proxy : ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy()
+                    next_proxy: ProxyConfig = await config.proxy_rotation_strategy.get_next_proxy()
                     if next_proxy:
                         self.logger.info(
                             message="Switch proxy: {proxy}",
                             tag="PROXY",
-                            params={"proxy": next_proxy.server}
+                            params={"proxy": next_proxy.server}
                         )
                         config.proxy_config = next_proxy
                         # config = config.clone(proxy_config=next_proxy)
@@ -306,7 +309,8 @@ class AsyncWebCrawler:
                 t1 = time.perf_counter()

                 if config.user_agent:
-                    self.crawler_strategy.update_user_agent(config.user_agent)
+                    self.crawler_strategy.update_user_agent(
+                        config.user_agent)

                 # Check robots.txt if enabled
                 if config and config.check_robots_txt:
@@ -372,7 +376,8 @@ class AsyncWebCrawler:
                     crawl_result.console_messages = async_response.console_messages

                     crawl_result.success = bool(html)
-                    crawl_result.session_id = getattr(config, "session_id", None)
+                    crawl_result.session_id = getattr(
+                        config, "session_id", None)

                     self.logger.success(
                         message="{url:.50}... | Status: {status} | Total: {timing}",
@@ -407,7 +412,8 @@ class AsyncWebCrawler:
                 )

                 cached_result.success = bool(html)
-                cached_result.session_id = getattr(config, "session_id", None)
+                cached_result.session_id = getattr(
+                    config, "session_id", None)
                 cached_result.redirected_url = cached_result.redirected_url or url

                 return CrawlResultContainer(cached_result)
@@ -474,12 +480,14 @@ class AsyncWebCrawler:
             params = config.__dict__.copy()
             params.pop("url", None)
             # add keys from kwargs to params that doesn't exist in params
-            params.update({k: v for k, v in kwargs.items() if k not in params.keys()})
+            params.update({k: v for k, v in kwargs.items()
+                           if k not in params.keys()})

             ################################
             # Scraping Strategy Execution #
             ################################
-            result: ScrapingResult = scraping_strategy.scrap(url, html, **params)
+            result: ScrapingResult = scraping_strategy.scrap(
+                url, html, **params)

             if result is None:
                 raise ValueError(
@@ -495,7 +503,8 @@ class AsyncWebCrawler:

             # Extract results - handle both dict and ScrapingResult
             if isinstance(result, dict):
-                cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
+                cleaned_html = sanitize_input_encode(
+                    result.get("cleaned_html", ""))
                 media = result.get("media", {})
                 links = result.get("links", {})
                 metadata = result.get("metadata", {})
diff --git a/docs/examples/docker/demo_docker_api.py b/docs/examples/docker/demo_docker_api.py
new file mode 100644
index 00000000..56d0173c
--- /dev/null
+++ b/docs/examples/docker/demo_docker_api.py
@@ -0,0 +1,883 @@
+import asyncio
+import httpx
+import json
+import os
+import time
+from typing import List, Dict, Any, AsyncGenerator, Optional
+from dotenv import load_dotenv
+from rich.console import Console
+from rich.syntax import Syntax
+from rich.panel import Panel
+from rich.table import Table
+
+# --- Setup & Configuration ---
+load_dotenv()  # Load environment variables from .env file
+
+console = Console()
+
+# --- Configuration ---
+BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235")
+# Target URLs
+SIMPLE_URL = "https://httpbin.org/html"
+LINKS_URL = "https://httpbin.org/links/10/0"
+FORMS_URL = "https://httpbin.org/forms/post"  # For JS demo
+BOOKS_URL = "http://books.toscrape.com/"  # For CSS extraction
+PYTHON_URL = "https://python.org"  # For deeper crawl
+# Use the same sample site as deep crawl tests for consistency
+DEEP_CRAWL_BASE_URL = os.getenv("DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/")
+DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com"
+
+# --- Helper Functions ---
+
+async def check_server_health(client: httpx.AsyncClient):
+    """Check if the server is healthy before running tests."""
+    console.print("[bold cyan]Checking server health...[/]", end="")
+    try:
+        response = await client.get("/health", timeout=10.0)
+        response.raise_for_status()
+        health_data = response.json()
+        console.print(f"[bold green] Server OK! Version: {health_data.get('version', 'N/A')}[/]")
+        return True
+    except (httpx.RequestError, httpx.HTTPStatusError) as e:
+        console.print(f"\n[bold red]Server health check FAILED:[/]")
+        console.print(f"Error: {e}")
+        console.print(f"Is the server running at {BASE_URL}?")
+        return False
+    except Exception as e:
+        console.print(f"\n[bold red]An unexpected error occurred during health check:[/]")
+        console.print(e)
+        return False
+
+def print_payload(payload: Dict[str, Any]):
+    """Prints the JSON payload nicely."""
+    syntax = Syntax(json.dumps(payload, indent=2), "json", theme="default", line_numbers=False)
+    console.print(Panel(syntax, title="Request Payload", border_style="blue", expand=False))
+
+def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Results Summary", max_items: int = 3):
+    """Prints a concise summary of crawl results."""
+    if not results:
+        console.print(f"[yellow]{title}: No results received.[/]")
+        return
+
+    console.print(Panel(f"[bold]{title}[/]", border_style="green", expand=False))
+    count = 0
+    for result in results:
+        if count >= max_items:
+            console.print(f"... (showing first {max_items} of {len(results)} results)")
+            break
+        count += 1
+        success_icon = "[green]✔[/]" if result.get('success') else "[red]✘[/]"
+        url = result.get('url', 'N/A')
+        status = result.get('status_code', 'N/A')
+        content_info = ""
+        if result.get('extracted_content'):
+            content_str = json.dumps(result['extracted_content'])
+            snippet = (content_str[:70] + '...') if len(content_str) > 70 else content_str
+            content_info = f" | Extracted: [cyan]{snippet}[/]"
+        elif result.get('markdown'):
+            content_info = f" | Markdown: [cyan]Present[/]"
+        elif result.get('html'):
+            content_info = f" | HTML Size: [cyan]{len(result['html'])}[/]"
+
+        console.print(f"{success_icon} URL: [link={url}]{url}[/link] (Status: {status}){content_info}")
+        if "metadata" in result and "depth" in result["metadata"]:
+            console.print(f"    Depth: {result['metadata']['depth']}")
+        if not result.get('success') and result.get('error_message'):
+            console.print(f"    [red]Error: {result['error_message']}[/]")
+
+
+async def make_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[str, Any], title: str) -> Optional[List[Dict[str, Any]]]:
+    """Handles non-streaming POST requests."""
+    console.rule(f"[bold blue]{title}[/]", style="blue")
+    print_payload(payload)
+    console.print(f"Sending POST request to {client.base_url}{endpoint}...")
+    try:
+        start_time = time.time()
+        response = await client.post(endpoint, json=payload)
+        duration = time.time() - start_time
+        console.print(f"Response Status: [bold {'green' if response.is_success else 'red'}]{response.status_code}[/] (took {duration:.2f}s)")
+        response.raise_for_status()
+        data = response.json()
+        if data.get("success"):
+            results = data.get("results", [])
+            print_result_summary(results, title=f"{title} Results")
+            return results
+        else:
+            console.print("[bold red]Request reported failure:[/]")
+            console.print(data)
+            return None
+    except httpx.HTTPStatusError as e:
+        console.print(f"[bold red]HTTP Error:[/]")
+        console.print(f"Status: {e.response.status_code}")
+        try:
+            console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="default"), title="Error Response"))
+        except json.JSONDecodeError:
+            console.print(f"Response Body: {e.response.text}")
+    except httpx.RequestError as e:
+        console.print(f"[bold red]Request Error: {e}[/]")
+    except Exception as e:
+        console.print(f"[bold red]Unexpected Error: {e}[/]")
+    return None
+
+async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[str, Any], title: str):
+    """Handles streaming POST requests."""
+    console.rule(f"[bold magenta]{title}[/]", style="magenta")
+    print_payload(payload)
+    console.print(f"Sending POST stream request to {client.base_url}{endpoint}...")
+    all_results = []
+    try:
+        start_time = time.time()
+        async with client.stream("POST", endpoint, json=payload) as response:
+            duration = time.time() - start_time  # Time to first byte potentially
+            console.print(f"Initial Response Status: [bold {'green' if response.status_code == 200 else 'red'}]{response.status_code}[/] (first byte ~{duration:.2f}s)")
+            response.raise_for_status()
+
+            console.print("[magenta]--- Streaming Results ---[/]")
+            completed = False
+            async for line in response.aiter_lines():
+                if line:
+                    try:
+                        data = json.loads(line)
+                        if data.get("status") == "completed":
+                            completed = True
+                            console.print("[bold green]--- Stream Completed ---[/]")
+                            break
+                        elif data.get("url"):  # Looks like a result
+                            all_results.append(data)
+                            success_icon = "[green]✔[/]" if data.get('success') else "[red]✘[/]"
+                            url = data.get('url', 'N/A')
+                            console.print(f"  {success_icon} Received: [link={url}]{url}[/link]")
+                        else:
+                            console.print(f"  [yellow]Stream meta-data:[/yellow] {data}")
+                    except json.JSONDecodeError:
+                        console.print(f"  [red]Stream decode error for line:[/red] {line}")
+            if not completed:
+                console.print("[bold yellow]Warning: Stream ended without 'completed' marker.[/]")
+
+    except httpx.HTTPStatusError as e:
+        console.print(f"[bold red]HTTP Error:[/]")
+        console.print(f"Status: {e.response.status_code}")
+        try:
+            console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="default"), title="Error Response"))
+        except json.JSONDecodeError:
+            console.print(f"Response Body: {e.response.text}")
+    except httpx.RequestError as e:
+        console.print(f"[bold red]Request Error: {e}[/]")
+    except Exception as e:
+        console.print(f"[bold red]Unexpected Error: {e}[/]")
+
+    print_result_summary(all_results, title=f"{title} Collected Results")
+
+
+def load_proxies_from_env() -> List[Dict]:
+    """
+    Load proxies from the PROXIES environment variable.
+    Expected format: IP:PORT:USER:PASS,IP:PORT,IP2:PORT2:USER2:PASS2,...
+    Returns a list of dictionaries suitable for the 'params' of ProxyConfig.
+ """ + proxies_params_list = [] + proxies_str = os.getenv("PROXIES", "") + if not proxies_str: + # console.print("[yellow]PROXIES environment variable not set or empty.[/]") + return proxies_params_list # Return empty list if not set + + try: + proxy_entries = proxies_str.split(",") + for entry in proxy_entries: + entry = entry.strip() + if not entry: + continue + + parts = entry.split(":") + proxy_dict = {} + + if len(parts) == 4: # Format: IP:PORT:USER:PASS + ip, port, username, password = parts + proxy_dict = { + "server": f"http://{ip}:{port}", # Assuming http protocol + "username": username, + "password": password, + # "ip": ip # 'ip' is not a standard ProxyConfig param, 'server' contains it + } + elif len(parts) == 2: # Format: IP:PORT + ip, port = parts + proxy_dict = { + "server": f"http://{ip}:{port}", + # "ip": ip + } + else: + console.print(f"[yellow]Skipping invalid proxy string format:[/yellow] {entry}") + continue + + proxies_params_list.append(proxy_dict) + + except Exception as e: + console.print(f"[red]Error loading proxies from environment:[/red] {e}") + + if proxies_params_list: + console.print(f"[cyan]Loaded {len(proxies_params_list)} proxies from environment.[/]") + # else: + # console.print("[yellow]No valid proxies loaded from environment.[/]") + + return proxies_params_list + + + +# --- Demo Functions --- + +# 1. Basic Crawling +async def demo_basic_single_url(client: httpx.AsyncClient): + payload = { + "urls": [SIMPLE_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS"}} + } + result = await make_request(client, "/crawl", payload, "Demo 1a: Basic Single URL Crawl") + return result + +async def demo_basic_multi_url(client: httpx.AsyncClient): + payload = { + "urls": [SIMPLE_URL, LINKS_URL], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS"}} + } + result = await make_request(client, "/crawl", payload, "Demo 1b: Basic Multi URL Crawl") + return result + +async def demo_streaming_multi_url(client: httpx.AsyncClient): + payload = { + "urls": [SIMPLE_URL, LINKS_URL, FORMS_URL], # Add another URL + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {"stream": True, "cache_mode": "BYPASS"}} + } + result = stream_request(client, "/crawl/stream", payload, "Demo 1c: Streaming Multi URL Crawl") + return result + +# 2. 
+async def demo_markdown_default(client: httpx.AsyncClient):
+    payload = {
+        "urls": [SIMPLE_URL],
+        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "cache_mode": "BYPASS",
+                "markdown_generator": {"type": "DefaultMarkdownGenerator", "params": {}}  # Explicitly default
+            }
+        }
+    }
+    result = await make_request(client, "/crawl", payload, "Demo 2a: Default Markdown Generation")
+    return result
+
+async def demo_markdown_pruning(client: httpx.AsyncClient):
+    payload = {
+        "urls": [PYTHON_URL],  # Use a more complex page
+        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "cache_mode": "BYPASS",
+                "markdown_generator": {
+                    "type": "DefaultMarkdownGenerator",
+                    "params": {
+                        "content_filter": {
+                            "type": "PruningContentFilter",
+                            "params": {"threshold": 0.6, "threshold_type": "relative"}
+                        }
+                    }
+                }
+            }
+        }
+    }
+    result = await make_request(client, "/crawl", payload, "Demo 2b: Markdown with Pruning Filter")
+    return result
+
+async def demo_markdown_bm25(client: httpx.AsyncClient):
+    payload = {
+        "urls": [PYTHON_URL],
+        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "cache_mode": "BYPASS",
+                "markdown_generator": {
+                    "type": "DefaultMarkdownGenerator",
+                    "params": {
+                        "content_filter": {
+                            "type": "BM25ContentFilter",
+                            "params": {"user_query": "Python documentation language reference"}
+                        }
+                    }
+                }
+            }
+        }
+    }
+    result = await make_request(client, "/crawl", payload, "Demo 2c: Markdown with BM25 Filter")
+    return result
+
+# 3. Specific Parameters
+# Corrected Demo Function: demo_param_css_selector
+async def demo_param_css_selector(client: httpx.AsyncClient):
+    target_selector = ".main-content"  # Using the suggested correct selector
+    payload = {
+        "urls": [PYTHON_URL],
+        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "cache_mode": "BYPASS",
+                "css_selector": target_selector  # Target specific div
+                # No extraction strategy is needed to demo this parameter's effect on input HTML
+            }
+        }
+    }
+    results = await make_request(client, "/crawl", payload, f"Demo 3a: Using css_selector ('{target_selector}')")
+
+    if results:
+        result = results[0]
+        if result['success'] and result.get('html'):
+            # Check if the returned HTML is likely constrained
+            # A simple check: does it contain expected content from within the selector,
+            # and does it LACK content known to be outside (like footer links)?
+            html_content = result['html']
+            content_present = 'Python Software Foundation' in html_content  # Text likely within .main-content somewhere
+            footer_absent = 'Legal Statements' not in html_content  # Text likely in the footer, outside .main-content
+
+            console.print(f"  Content Check: Text inside '{target_selector}' likely present? {'[green]Yes[/]' if content_present else '[red]No[/]'}")
+            console.print(f"  Content Check: Text outside '{target_selector}' (footer) likely absent? {'[green]Yes[/]' if footer_absent else '[red]No[/]'}")
+
+            if not content_present or not footer_absent:
+                console.print(f"  [yellow]Note:[/yellow] HTML filtering might not be precise or page structure changed. Result HTML length: {len(html_content)}")
+            else:
+                console.print(f"  [green]Verified:[/green] Returned HTML appears limited by css_selector. Result HTML length: {len(html_content)}")
+
+        elif result['success']:
+            console.print("[yellow]HTML content was empty in the successful result.[/]")
+        # Error message is handled by print_result_summary called by make_request
+
+async def demo_param_js_execution(client: httpx.AsyncClient):
+    payload = {
+        "urls": [FORMS_URL],  # Use a page with a form
+        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "cache_mode": "BYPASS",
+                # Simple JS to fill and maybe click (won't submit without more complex setup)
+                "js_code": """
+                    () => {
+                        document.querySelector('[name="custname"]').value = 'Crawl4AI Demo';
+                        return { filled_name: document.querySelector('[name="custname"]').value };
+                    }
+                """,
+                "delay_before_return_html": 0.5  # Give JS time to potentially run
+            }
+        }
+    }
+    results = await make_request(client, "/crawl", payload, "Demo 3b: Using js_code Parameter")
+    if results and results[0].get("js_execution_result"):
+        console.print("[cyan]JS Execution Result:[/]", results[0]["js_execution_result"])
+    elif results:
+        console.print("[yellow]JS Execution Result not found in response.[/]")
+
+
+async def demo_param_screenshot(client: httpx.AsyncClient):
+    payload = {
+        "urls": [SIMPLE_URL],
+        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {"cache_mode": "BYPASS", "screenshot": True}
+        }
+    }
+    results = await make_request(client, "/crawl", payload, "Demo 3c: Taking a Screenshot")
+    if results and results[0].get("screenshot"):
+        console.print(f"[cyan]Screenshot data received (length):[/] {len(results[0]['screenshot'])}")
+    elif results:
+        console.print("[yellow]Screenshot data not found in response.[/]")
+
+async def demo_param_ssl_fetch(client: httpx.AsyncClient):
+    payload = {
+        "urls": [PYTHON_URL],  # Needs HTTPS
+        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {"cache_mode": "BYPASS", "fetch_ssl_certificate": True}
+        }
+    }
+    results = await make_request(client, "/crawl", payload, "Demo 3d: Fetching SSL Certificate")
+    if results and results[0].get("ssl_certificate"):
+        console.print("[cyan]SSL Certificate Info:[/]")
+        console.print(results[0]["ssl_certificate"])
+    elif results:
+        console.print("[yellow]SSL Certificate data not found in response.[/]")
+
+
+
+async def demo_param_proxy(client: httpx.AsyncClient):
+    proxy_params_list = load_proxies_from_env()  # Get the list of parameter dicts
+    if not proxy_params_list:
+        console.rule("[bold yellow]Demo 3e: Using Proxies (SKIPPED)[/]", style="yellow")
+        console.print("Set the PROXIES environment variable to run this demo.")
+        console.print("Format: IP:PORT:USR:PWD,IP:PORT,...")
+        return
+
+    payload = {
+        "urls": ["https://httpbin.org/ip"],  # URL that shows originating IP
+        "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "cache_mode": "BYPASS",
+                "proxy_rotation_strategy": {
+                    "type": "RoundRobinProxyStrategy",
+                    "params": {
+                        "proxies": [
+                            # Filter out the 'ip' key when sending to server, as it's not part of ProxyConfig
+                            {"type": "ProxyConfig", "params": {k: v for k, v in p.items() if k != 'ip'}}
+                            for p in proxy_params_list
+                        ]
+                    }
+                }
+            }
+        }
+    }
+    results = await make_request(client, "/crawl", payload, "Demo 3e: Using Proxies")
+
+    # --- Verification Logic ---
+    if results and results[0].get("success"):
+        result = results[0]
+        try:
+            # httpbin.org/ip returns JSON within the HTML body's <pre> tag
+            html_content = result.get('html', '')
+            # Basic extraction - find JSON within <pre> tags or just the JSON itself
+            json_str = None
+            if '