@@ -21,17 +21,20 @@ console = Console()
-BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:8020")
+BASE_URL = os.getenv("CRAWL4AI_TEST_URL", "http://localhost:11235")
 
 # Target URLs
-SIMPLE_URL = "https://example.com" # For demo purposes
+SIMPLE_URL = "https://httpbin.org/html"
 LINKS_URL = "https://httpbin.org/links/10/0"
 FORMS_URL = "https://httpbin.org/forms/post" # For JS demo
 BOOKS_URL = "http://books.toscrape.com/" # For CSS extraction
 PYTHON_URL = "https://python.org" # For deeper crawl
 
 # Use the same sample site as deep crawl tests for consistency
-DEEP_CRAWL_BASE_URL = os.getenv("DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/")
+DEEP_CRAWL_BASE_URL = os.getenv(
+    "DEEP_CRAWL_TEST_SITE", "https://docs.crawl4ai.com/samples/deepcrawl/")
 DEEP_CRAWL_DOMAIN = "docs.crawl4ai.com"
 
 # --- Helper Functions ---
+
+
 async def check_server_health(client: httpx.AsyncClient):
     """Check if the server is healthy before running tests."""
     console.print("[bold cyan]Checking server health...[/]", end="")
@@ -39,7 +42,8 @@ async def check_server_health(client: httpx.AsyncClient):
         response = await client.get("/health", timeout=10.0)
         response.raise_for_status()
         health_data = response.json()
-        console.print(f"[bold green] Server OK! Version: {health_data.get('version', 'N/A')}[/]")
+        console.print(
+            f"[bold green] Server OK! Version: {health_data.get('version', 'N/A')}[/]")
         return True
     except (httpx.RequestError, httpx.HTTPStatusError) as e:
         console.print(f"\n[bold red]Server health check FAILED:[/]")
@@ -47,10 +51,12 @@ async def check_server_health(client: httpx.AsyncClient):
         console.print(f"Is the server running at {BASE_URL}?")
         return False
     except Exception as e:
-        console.print(f"\n[bold red]An unexpected error occurred during health check:[/]")
+        console.print(
+            f"\n[bold red]An unexpected error occurred during health check:[/]")
         console.print(e)
         return False
 
+
 def print_payload(payload: Dict[str, Any]):
     """Prints the JSON payload nicely with a dark theme."""
     syntax = Syntax(
@@ -60,7 +66,9 @@ def print_payload(payload: Dict[str, Any]):
         line_numbers=False,
         word_wrap=True # Added word wrap for potentially long payloads
     )
-    console.print(Panel(syntax, title="Request Payload", border_style="blue", expand=False))
+    console.print(Panel(syntax, title="Request Payload",
+                  border_style="blue", expand=False))
 
+
 def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Results Summary", max_items: int = 3):
     """Prints a concise summary of crawl results."""
@@ -68,11 +76,13 @@ def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Resu
         console.print(f"[yellow]{title}: No results received.[/]")
         return
 
-    console.print(Panel(f"[bold]{title}[/]", border_style="green", expand=False))
+    console.print(Panel(f"[bold]{title}[/]",
+                  border_style="green", expand=False))
     count = 0
     for result in results:
         if count >= max_items:
-            console.print(f"... (showing first {max_items} of {len(results)} results)")
+            console.print(
+                f"... (showing first {max_items} of {len(results)} results)")
             break
         count += 1
         success_icon = "[green]✔[/]" if result.get('success') else "[red]✘[/]"
@@ -81,14 +91,16 @@ def print_result_summary(results: List[Dict[str, Any]], title: str = "Crawl Resu
         content_info = ""
         if result.get('extracted_content'):
             content_str = json.dumps(result['extracted_content'])
-            snippet = (content_str[:70] + '...') if len(content_str) > 70 else content_str
+            snippet = (
+                content_str[:70] + '...') if len(content_str) > 70 else content_str
             content_info = f" | Extracted: [cyan]{snippet}[/]"
         elif result.get('markdown'):
             content_info = f" | Markdown: [cyan]Present[/]"
         elif result.get('html'):
             content_info = f" | HTML Size: [cyan]{len(result['html'])}[/]"
 
-        console.print(f"{success_icon} URL: [link={url}]{url}[/link] (Status: {status}){content_info}")
+        console.print(
+            f"{success_icon} URL: [link={url}]{url}[/link] (Status: {status}){content_info}")
         if "metadata" in result and "depth" in result["metadata"]:
             console.print(f" Depth: {result['metadata']['depth']}")
         if not result.get('success') and result.get('error_message'):
@@ -104,7 +116,8 @@ async def make_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[s
         start_time = time.time()
         response = await client.post(endpoint, json=payload)
         duration = time.time() - start_time
-        console.print(f"Response Status: [bold {'green' if response.is_success else 'red'}]{response.status_code}[/] (took {duration:.2f}s)")
+        console.print(
+            f"Response Status: [bold {'green' if response.is_success else 'red'}]{response.status_code}[/] (took {duration:.2f}s)")
         response.raise_for_status()
         data = response.json()
         if data.get("success"):
@@ -119,7 +132,8 @@ async def make_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[s
         console.print(f"[bold red]HTTP Error:[/]")
         console.print(f"Status: {e.response.status_code}")
         try:
-            console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="default"), title="Error Response"))
+            console.print(Panel(Syntax(json.dumps(
+                e.response.json(), indent=2), "json", theme="default"), title="Error Response"))
         except json.JSONDecodeError:
             console.print(f"Response Body: {e.response.text}")
     except httpx.RequestError as e:
@@ -128,11 +142,13 @@ async def make_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[s
         console.print(f"[bold red]Unexpected Error: {e}[/]")
         return None
 
+
 async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict[str, Any], title: str):
     """Handles streaming POST requests."""
     console.rule(f"[bold magenta]{title}[/]", style="magenta")
     print_payload(payload)
-    console.print(f"Sending POST stream request to {client.base_url}{endpoint}...")
+    console.print(
+        f"Sending POST stream request to {client.base_url}{endpoint}...")
     all_results = []
     initial_status_code = None # Store initial status code
 
@@ -141,7 +157,8 @@ async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict
         async with client.stream("POST", endpoint, json=payload) as response:
             initial_status_code = response.status_code # Capture initial status
             duration = time.time() - start_time # Time to first byte potentially
-            console.print(f"Initial Response Status: [bold {'green' if response.is_success else 'red'}]{initial_status_code}[/] (first byte ~{duration:.2f}s)")
+            console.print(
+                f"Initial Response Status: [bold {'green' if response.is_success else 'red'}]{initial_status_code}[/] (first byte ~{duration:.2f}s)")
             response.raise_for_status() # Raise exception for bad *initial* status codes
 
             console.print("[magenta]--- Streaming Results ---[/]")
@@ -152,24 +169,31 @@ async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict
                     data = json.loads(line)
                     if data.get("status") == "completed":
                         completed = True
-                        console.print("[bold green]--- Stream Completed ---[/]")
+                        console.print(
+                            "[bold green]--- Stream Completed ---[/]")
                         break
                     elif data.get("url"): # Looks like a result dictionary
                         all_results.append(data)
                         # Display summary info as it arrives
-                        success_icon = "[green]✔[/]" if data.get('success') else "[red]✘[/]"
+                        success_icon = "[green]✔[/]" if data.get(
+                            'success') else "[red]✘[/]"
                         url = data.get('url', 'N/A')
                         # Display status code FROM THE RESULT DATA if available
                         result_status = data.get('status_code', 'N/A')
-                        console.print(f" {success_icon} Received: [link={url}]{url}[/link] (Status: {result_status})")
+                        console.print(
+                            f" {success_icon} Received: [link={url}]{url}[/link] (Status: {result_status})")
                         if not data.get('success') and data.get('error_message'):
-                            console.print(f" [red]Error: {data['error_message']}[/]")
+                            console.print(
+                                f" [red]Error: {data['error_message']}[/]")
                     else:
-                        console.print(f" [yellow]Stream meta-data:[/yellow] {data}")
+                        console.print(
+                            f" [yellow]Stream meta-data:[/yellow] {data}")
                 except json.JSONDecodeError:
-                    console.print(f" [red]Stream decode error for line:[/red] {line}")
+                    console.print(
+                        f" [red]Stream decode error for line:[/red] {line}")
         if not completed:
-            console.print("[bold yellow]Warning: Stream ended without 'completed' marker.[/]")
+            console.print(
+                "[bold yellow]Warning: Stream ended without 'completed' marker.[/]")
 
     except httpx.HTTPStatusError as e:
         # Use the captured initial status code if available, otherwise from the exception
@@ -177,18 +201,21 @@ async def stream_request(client: httpx.AsyncClient, endpoint: str, payload: Dict
         console.print(f"[bold red]HTTP Error (Initial Request):[/]")
         console.print(f"Status: {status}")
         try:
-            console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="default"), title="Error Response"))
+            console.print(Panel(Syntax(json.dumps(
+                e.response.json(), indent=2), "json", theme="default"), title="Error Response"))
         except json.JSONDecodeError:
             console.print(f"Response Body: {e.response.text}")
     except httpx.RequestError as e:
         console.print(f"[bold red]Request Error: {e}[/]")
     except Exception as e:
         console.print(f"[bold red]Unexpected Error during streaming: {e}[/]")
-        console.print_exception(show_locals=False) # Print stack trace for unexpected errors
+        # Print stack trace for unexpected errors
+        console.print_exception(show_locals=False)
 
     # Call print_result_summary with the *collected* results AFTER the stream is done
     print_result_summary(all_results, title=f"{title} Collected Results")
 
+
 def load_proxies_from_env() -> List[Dict]:
     """
     Load proxies from the PROXIES environment variable.
@@ -226,23 +253,25 @@ def load_proxies_from_env() -> List[Dict]:
                     # "ip": ip
                 }
             else:
-                console.print(f"[yellow]Skipping invalid proxy string format:[/yellow] {entry}")
+                console.print(
+                    f"[yellow]Skipping invalid proxy string format:[/yellow] {entry}")
                 continue
 
             proxies_params_list.append(proxy_dict)
 
     except Exception as e:
-        console.print(f"[red]Error loading proxies from environment:[/red] {e}")
+        console.print(
+            f"[red]Error loading proxies from environment:[/red] {e}")
 
     if proxies_params_list:
-        console.print(f"[cyan]Loaded {len(proxies_params_list)} proxies from environment.[/]")
+        console.print(
+            f"[cyan]Loaded {len(proxies_params_list)} proxies from environment.[/]")
     # else:
     #     console.print("[yellow]No valid proxies loaded from environment.[/]")
 
     return proxies_params_list
 
 
 # --- Demo Functions ---
 
 # 1. Basic Crawling
@@ -250,11 +279,17 @@ async def demo_basic_single_url(client: httpx.AsyncClient):
     payload = {
         "urls": [SIMPLE_URL],
         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
-        "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "BYPASS"}}
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "cache_mode": "BYPASS"
+            }
+        }
     }
     result = await make_request(client, "/crawl", payload, "Demo 1a: Basic Single URL Crawl")
     return result
 
+
 async def demo_basic_multi_url(client: httpx.AsyncClient):
     payload = {
         "urls": [SIMPLE_URL, LINKS_URL],
@@ -264,16 +299,31 @@ async def demo_basic_multi_url(client: httpx.AsyncClient):
     result = await make_request(client, "/crawl", payload, "Demo 1b: Basic Multi URL Crawl")
     return result
 
+
 async def demo_streaming_multi_url(client: httpx.AsyncClient):
     payload = {
-        "urls": [SIMPLE_URL, LINKS_URL, FORMS_URL], # Add another URL
+        # "urls": [SIMPLE_URL, LINKS_URL, FORMS_URL, SIMPLE_URL, LINKS_URL, FORMS_URL], # Add another URL
+        "urls": [
+            "https://example.com/page1",
+            "https://example.com/page2",
+            "https://example.com/page3",
+            "https://example.com/page4",
+            "https://example.com/page5"
+        ], # Add another URL
         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
-        "crawler_config": {"type": "CrawlerRunConfig", "params": {"stream": True, "cache_mode": "BYPASS"}}
+        "crawler_config": {
+            "type": "CrawlerRunConfig",
+            "params": {
+                "stream": True,
+            }
+        }
     }
-    result = stream_request(client, "/crawl/stream", payload, "Demo 1c: Streaming Multi URL Crawl")
+    result = await stream_request(client, "/crawl/stream", payload, "Demo 1c: Streaming Multi URL Crawl")
     return result
 
 # 2. Markdown Generation & Content Filtering
+
+
 async def demo_markdown_default(client: httpx.AsyncClient):
     payload = {
         "urls": [SIMPLE_URL],
@@ -281,14 +331,25 @@ async def demo_markdown_default(client: httpx.AsyncClient):
         "crawler_config": {
             "type": "CrawlerRunConfig",
             "params": {
                 "cache_mode": "BYPASS",
-                "markdown_generator": {"type": "DefaultMarkdownGenerator", "params": {}} # Explicitly default
+                "markdown_generator": {
+                    "type": "DefaultMarkdownGenerator",
+                    "params": {
+                        "content_source": "fit_html",
+                        "options": {
+                            "type": "dict",
+                            "value": {
+                                "ignore_links": True
+                            }
+                        }
+                    }
+                } # Explicitly default
             }
         }
     }
     result = await make_request(client, "/crawl", payload, "Demo 2a: Default Markdown Generation")
     return result
 
+
 async def demo_markdown_pruning(client: httpx.AsyncClient):
     payload = {
         "urls": [PYTHON_URL], # Use a more complex page
@@ -302,7 +363,10 @@ async def demo_markdown_pruning(client: httpx.AsyncClient):
                 "params": {
                     "content_filter": {
                         "type": "PruningContentFilter",
-                        "params": {"threshold": 0.6, "threshold_type": "relative"}
+                        "params": {
+                            "threshold": 0.6,
+                            "threshold_type": "relative"
+                        }
                     }
                 }
             }
@@ -312,6 +376,7 @@ async def demo_markdown_pruning(client: httpx.AsyncClient):
     result = await make_request(client, "/crawl", payload, "Demo 2b: Markdown with Pruning Filter")
     return result
 
+
 async def demo_markdown_bm25(client: httpx.AsyncClient):
     payload = {
         "urls": [PYTHON_URL],
@@ -325,7 +390,9 @@ async def demo_markdown_bm25(client: httpx.AsyncClient):
                 "params": {
                     "content_filter": {
                         "type": "BM25ContentFilter",
-                        "params": {"user_query": "Python documentation language reference"}
+                        "params": {
+                            "user_query": "Python documentation language reference"
+                        }
                     }
                 }
             }
@@ -337,21 +404,22 @@ async def demo_markdown_bm25(client: httpx.AsyncClient):
 
 # 3. Specific Parameters
 # Corrected Demo Function: demo_param_css_selector
+
 async def demo_param_css_selector(client: httpx.AsyncClient):
-    target_selector = ".main-content" # Using the suggested correct selector
+    css_selector = ".main-content" # Using the suggested correct selector
     payload = {
         "urls": [PYTHON_URL],
         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
         "crawler_config": {
             "type": "CrawlerRunConfig",
             "params": {
                 "cache_mode": "BYPASS",
-                "css_selector": target_selector # Target specific div
+                "css_selector": css_selector # Target specific div
                 # No extraction strategy is needed to demo this parameter's effect on input HTML
             }
         }
     }
-    results = await make_request(client, "/crawl", payload, f"Demo 3a: Using css_selector ('{target_selector}')")
+    results = await make_request(client, "/crawl", payload, f"Demo 3a: Using css_selector ('{css_selector}')")
 
     if results:
         result = results[0]
@@ -360,24 +428,32 @@ async def demo_param_css_selector(client: httpx.AsyncClient):
             # A simple check: does it contain expected content from within the selector,
             # and does it LACK content known to be outside (like footer links)?
             html_content = result['html']
-            content_present = 'Python Software Foundation' in html_content # Text likely within .main-content somewhere
-            footer_absent = 'Legal Statements' not in html_content # Text likely in the footer, outside .main-content
+            # Text likely within .main-content somewhere
+            content_present = 'Python Software Foundation' in html_content
+            # Text likely in the footer, outside .main-content
+            footer_absent = 'Legal Statements' not in html_content
 
-            console.print(f" Content Check: Text inside '{target_selector}' likely present? {'[green]Yes[/]' if content_present else '[red]No[/]'}")
-            console.print(f" Content Check: Text outside '{target_selector}' (footer) likely absent? {'[green]Yes[/]' if footer_absent else '[red]No[/]'}")
+            console.print(
+                f" Content Check: Text inside '{css_selector}' likely present? {'[green]Yes[/]' if content_present else '[red]No[/]'}")
+            console.print(
+                f" Content Check: Text outside '{css_selector}' (footer) likely absent? {'[green]Yes[/]' if footer_absent else '[red]No[/]'}")
 
             if not content_present or not footer_absent:
-                console.print(f" [yellow]Note:[/yellow] HTML filtering might not be precise or page structure changed. Result HTML length: {len(html_content)}")
+                console.print(
+                    f" [yellow]Note:[/yellow] HTML filtering might not be precise or page structure changed. Result HTML length: {len(html_content)}")
             else:
-                console.print(f" [green]Verified:[/green] Returned HTML appears limited by css_selector. Result HTML length: {len(html_content)}")
+                console.print(
+                    f" [green]Verified:[/green] Returned HTML appears limited by css_selector. Result HTML length: {len(html_content)}")
         elif result['success']:
-            console.print("[yellow]HTML content was empty in the successful result.[/]")
+            console.print(
+                "[yellow]HTML content was empty in the successful result.[/]")
         # Error message is handled by print_result_summary called by make_request
 
+
 async def demo_param_js_execution(client: httpx.AsyncClient):
     payload = {
-        "urls": [FORMS_URL], # Use a page with a form
+        "urls": ["https://example.com"], # Use a page with a form
         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
         "crawler_config": {
             "type": "CrawlerRunConfig",
@@ -385,10 +461,10 @@ async def demo_param_js_execution(client: httpx.AsyncClient):
                 "cache_mode": "BYPASS",
                 # Simple JS to fill and maybe click (won't submit without more complex setup)
                 "js_code": """
-                () => {
-                    document.querySelector('[name="custname"]').value = 'Crawl4AI Demo';
-                    return { filled_name: document.querySelector('[name="custname"]').value };
-                }
+                (() => {
+                    document.querySelector('h1').innerText = 'Crawl4AI Demo';
+                    return { filled_name: document.querySelector('h1').innerText };
+                })();
                 """,
                 "delay_before_return_html": 0.5 # Give JS time to potentially run
             }
@@ -396,10 +472,12 @@ async def demo_param_js_execution(client: httpx.AsyncClient):
     }
     results = await make_request(client, "/crawl", payload, "Demo 3b: Using js_code Parameter")
     if results and results[0].get("js_execution_result"):
-        console.print("[cyan]JS Execution Result:[/]", results[0]["js_execution_result"])
+        console.print("[cyan]JS Execution Result:[/]",
+                      results[0]["js_execution_result"])
     elif results:
         console.print("[yellow]JS Execution Result not found in response.[/]")
 
+
 async def demo_param_screenshot(client: httpx.AsyncClient):
     payload = {
         "urls": [SIMPLE_URL],
@@ -411,10 +489,12 @@ async def demo_param_screenshot(client: httpx.AsyncClient):
     }
     results = await make_request(client, "/crawl", payload, "Demo 3c: Taking a Screenshot")
     if results and results[0].get("screenshot"):
-        console.print(f"[cyan]Screenshot data received (length):[/] {len(results[0]['screenshot'])}")
+        console.print(
+            f"[cyan]Screenshot data received (length):[/] {len(results[0]['screenshot'])}")
     elif results:
         console.print("[yellow]Screenshot data not found in response.[/]")
 
+
 async def demo_param_ssl_fetch(client: httpx.AsyncClient):
     payload = {
         "urls": [PYTHON_URL], # Needs HTTPS
@@ -431,10 +511,12 @@ async def demo_param_ssl_fetch(client: httpx.AsyncClient):
     elif results:
         console.print("[yellow]SSL Certificate data not found in response.[/]")
 
+
 async def demo_param_proxy(client: httpx.AsyncClient):
     proxy_params_list = load_proxies_from_env() # Get the list of parameter dicts
     if not proxy_params_list:
-        console.rule("[bold yellow]Demo 3e: Using Proxies (SKIPPED)[/]", style="yellow")
+        console.rule(
+            "[bold yellow]Demo 3e: Using Proxies (SKIPPED)[/]", style="yellow")
         console.print("Set the PROXIES environment variable to run this demo.")
         console.print("Format: IP:PORT:USR:PWD,IP:PORT,...")
         return
@@ -450,8 +532,21 @@ async def demo_param_proxy(client: httpx.AsyncClient):
             "type": "RoundRobinProxyStrategy",
             "params": {
                 "proxies": [
+                    # [
+                    #     {
+                    #         "type": "ProxyConfig",
+                    #         "params": {
+                    #             "server": "...",
+                    #             "username": "...",
+                    #             "password": "..."
+                    #         }
+                    #     },
+                    #     ...
+                    # ]
                     # Filter out the 'ip' key when sending to server, as it's not part of ProxyConfig
-                    {"type": "ProxyConfig", "params": {k: v for k, v in p.items() if k != 'ip'}}
+                    {"type": "ProxyConfig", "params": {
+                        k: v for k, v in p.items() if k != 'ip'}}
                     for p in proxy_params_list
                 ]
             }
@@ -480,37 +575,49 @@ async def demo_param_proxy(client: httpx.AsyncClient):
             if json_str:
                 ip_data = json.loads(json_str)
                 origin_ip = ip_data.get("origin")
-                console.print(f" Origin IP reported by httpbin: [bold yellow]{origin_ip}[/]")
+                console.print(
+                    f" Origin IP reported by httpbin: [bold yellow]{origin_ip}[/]")
 
                 # Extract the IPs from the proxy list for comparison
-                proxy_ips = {p.get("server").split(":")[1][2:] for p in proxy_params_list}
+                proxy_ips = {p.get("server").split(
+                    ":")[1][2:] for p in proxy_params_list}
 
                 if origin_ip and origin_ip in proxy_ips:
-                    console.print("[bold green] Verification SUCCESS: Origin IP matches one of the provided proxies![/]")
+                    console.print(
+                        "[bold green] Verification SUCCESS: Origin IP matches one of the provided proxies![/]")
                 elif origin_ip:
-                    console.print("[bold red] Verification FAILED: Origin IP does not match any provided proxy IPs.[/]")
+                    console.print(
+                        "[bold red] Verification FAILED: Origin IP does not match any provided proxy IPs.[/]")
                     console.print(f" Provided Proxy IPs: {proxy_ips}")
                 else:
-                    console.print("[yellow] Verification SKIPPED: Could not extract origin IP from response.[/]")
+                    console.print(
+                        "[yellow] Verification SKIPPED: Could not extract origin IP from response.[/]")
             else:
-                console.print("[yellow] Verification SKIPPED: Could not find JSON in httpbin response HTML.[/]")
+                console.print(
+                    "[yellow] Verification SKIPPED: Could not find JSON in httpbin response HTML.[/]")
                 # console.print(f"HTML Received:\n{html_content[:500]}...") # Uncomment for debugging
 
         except json.JSONDecodeError:
-            console.print("[red] Verification FAILED: Could not parse JSON from httpbin response HTML.[/]")
+            console.print(
+                "[red] Verification FAILED: Could not parse JSON from httpbin response HTML.[/]")
         except Exception as e:
-            console.print(f"[red] Verification Error: An unexpected error occurred during IP check: {e}[/]")
+            console.print(
+                f"[red] Verification Error: An unexpected error occurred during IP check: {e}[/]")
     elif results:
-        console.print("[yellow] Verification SKIPPED: Crawl for IP check was not successful.[/]")
+        console.print(
+            "[yellow] Verification SKIPPED: Crawl for IP check was not successful.[/]")
 
-# 4. Extraction Strategies
+
+# 4. Extraction Strategies (Non-Deep)
 async def demo_extract_css(client: httpx.AsyncClient):
     # Schema to extract book titles and prices
     book_schema = {
         "name": "BookList",
         "baseSelector": "ol.row li.col-xs-6",
         "fields": [
-            {"name": "title", "selector": "article.product_pod h3 a", "type": "attribute", "attribute": "title"},
+            {"name": "title", "selector": "article.product_pod h3 a",
+             "type": "attribute", "attribute": "title"},
             {"name": "price", "selector": "article.product_pod .price_color", "type": "text"},
         ]
     }
@@ -523,7 +630,12 @@ async def demo_extract_css(client: httpx.AsyncClient):
             "cache_mode": "BYPASS",
             "extraction_strategy": {
                 "type": "JsonCssExtractionStrategy",
-                "params": {"schema": {"type": "dict", "value": book_schema}}
+                "params": {
+                    "schema": {
+                        "type": "dict",
+                        "value": book_schema
+                    }
+                }
             }
         }
     }
@@ -539,21 +651,28 @@ async def demo_extract_css(client: httpx.AsyncClient):
             table.add_column("Title", style="dim")
             table.add_column("Price")
             for item in extracted_data[:5]: # Show first 5
-                table.add_row(item.get('title', 'N/A'), item.get('price', 'N/A'))
+                table.add_row(item.get('title', 'N/A'),
+                              item.get('price', 'N/A'))
             console.print(table)
         else:
-            console.print("[yellow]CSS extraction did not return a list of results.[/]")
+            console.print(
+                "[yellow]CSS extraction did not return a list of results.[/]")
             console.print(extracted_data)
     except json.JSONDecodeError:
         console.print("[red]Failed to parse extracted_content as JSON.[/]")
     except Exception as e:
-        console.print(f"[red]Error processing extracted CSS content: {e}[/]")
+        console.print(
+            f"[red]Error processing extracted CSS content: {e}[/]")
 
 # 5. LLM Extraction
+
+
 async def demo_extract_llm(client: httpx.AsyncClient):
     if not os.getenv("OPENAI_API_KEY"): # Basic check for a common key
-        console.rule("[bold yellow]Demo 4b: LLM Extraction (SKIPPED)[/]", style="yellow")
-        console.print("Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.")
+        console.rule(
+            "[bold yellow]Demo 4b: LLM Extraction (SKIPPED)[/]", style="yellow")
+        console.print(
+            "Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.")
         return
 
     payload = {
@@ -571,7 +690,10 @@ async def demo_extract_llm(client: httpx.AsyncClient):
                 "type": "LLMConfig",
                 "params": {}
                 # Relies on server's default provider from config.yml & keys from .llm.env
-                # "params": {"provider": "openai/gpt-4o-mini"}
+                # "params": {
+                #     "provider": "openai/gpt-4o-mini",
+                #     "api_key": os.getenv("OPENAI_API_KEY") # Optional: Override key
+                # }
             },
             "schema": { # Request structured output
                 "type": "dict",
@@ -599,17 +721,23 @@ async def demo_extract_llm(client: httpx.AsyncClient):
             if isinstance(extracted_data, dict):
                 console.print("[cyan]Extracted Data (LLM):[/]")
-                syntax = Syntax(json.dumps(extracted_data, indent=2), "json", theme="monokai", line_numbers=False)
+                syntax = Syntax(json.dumps(extracted_data, indent=2),
+                                "json", theme="monokai", line_numbers=False)
                 console.print(Panel(syntax, border_style="cyan", expand=False))
             else:
-                console.print("[yellow]LLM extraction did not return expected dictionary.[/]")
+                console.print(
+                    "[yellow]LLM extraction did not return expected dictionary.[/]")
                 console.print(extracted_data)
         except json.JSONDecodeError:
-            console.print("[red]Failed to parse LLM extracted_content as JSON.[/]")
+            console.print(
+                "[red]Failed to parse LLM extracted_content as JSON.[/]")
         except Exception as e:
-            console.print(f"[red]Error processing extracted LLM content: {e}[/]")
+            console.print(
+                f"[red]Error processing extracted LLM content: {e}[/]")
 
 # 6. Deep Crawling
+
+
 async def demo_deep_basic(client: httpx.AsyncClient):
     payload = {
         "urls": [DEEP_CRAWL_BASE_URL],
@@ -625,7 +753,17 @@ async def demo_deep_basic(client: httpx.AsyncClient):
                     "max_pages": 4,
                     "filter_chain": {
                         "type": "FilterChain",
-                        "params": {"filters": [{"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}}]}
+                        "params": {
+                            "filters": [
+                                {
+                                    "type": "DomainFilter",
+                                    "params":
+                                    {
+                                        "allowed_domains": [DEEP_CRAWL_DOMAIN]
+                                    }
+                                }
+                            ]
+                        }
                     }
                 }
             }
@@ -642,6 +780,8 @@ async def demo_deep_basic(client: httpx.AsyncClient):
             console.print(f" [red]Error: {result['error_message']}[/]")
 
 # 5. Streaming Deep Crawl
+
+
 async def demo_deep_streaming(client: httpx.AsyncClient):
     payload = {
         "urls": [DEEP_CRAWL_BASE_URL],
@@ -669,6 +809,8 @@ async def demo_deep_streaming(client: httpx.AsyncClient):
     await stream_request(client, "/crawl/stream", payload, "Demo 5b: Streaming Deep Crawl")
 
 # 5a. Deep Crawl with Filtering & Scoring
+
+
 async def demo_deep_filtering_scoring(client: httpx.AsyncClient):
     """Demonstrates deep crawl with advanced URL filtering and scoring."""
     max_depth = 2 # Go a bit deeper to see scoring/filtering effects
@@ -717,7 +859,8 @@ async def demo_deep_filtering_scoring(client: httpx.AsyncClient):
                     "scorers": [
                         { # Boost score for URLs containing the keyword
                             "type": "KeywordRelevanceScorer",
-                            "params": {"keywords": [keyword_to_score], "weight": 1.5} # Higher weight
+                            # Higher weight
+                            "params": {"keywords": [keyword_to_score], "weight": 1.5}
                         },
                         { # Slightly penalize deeper pages
                             "type": "PathDepthScorer",
@@ -747,39 +890,51 @@ async def demo_deep_filtering_scoring(client: httpx.AsyncClient):
         depth = result.get("metadata", {}).get("depth", -1)
 
         # Check Filtering
-        if excluded_pattern.strip('*') in url: # Check if the excluded part is present
-            console.print(f" [bold red]Filter FAILED:[/bold red] Excluded pattern part '{excluded_pattern.strip('*')}' found in URL: {url}")
+        # Check if the excluded part is present
+        if excluded_pattern.strip('*') in url:
+            console.print(
+                f" [bold red]Filter FAILED:[/bold red] Excluded pattern part '{excluded_pattern.strip('*')}' found in URL: {url}")
             excluded_found = True
 
         # Check Scoring (Observation)
         if keyword_to_score in url:
             prioritized_found_overall = True
-            if depth == 1: # Check if prioritized keywords appeared early (depth 1)
+            # Check if prioritized keywords appeared early (depth 1)
+            if depth == 1:
                 prioritized_found_at_depth1 = True
 
     if not excluded_found:
-        console.print(f" [green]Filter Check:[/green] No URLs matching excluded pattern '{excluded_pattern}' found.")
+        console.print(
+            f" [green]Filter Check:[/green] No URLs matching excluded pattern '{excluded_pattern}' found.")
     else:
-        console.print(f" [red]Filter Check:[/red] URLs matching excluded pattern '{excluded_pattern}' were found (unexpected).")
+        console.print(
+            f" [red]Filter Check:[/red] URLs matching excluded pattern '{excluded_pattern}' were found (unexpected).")
 
     if prioritized_found_at_depth1:
-        console.print(f" [green]Scoring Check:[/green] URLs with keyword '{keyword_to_score}' were found at depth 1 (scoring likely influenced).")
+        console.print(
+            f" [green]Scoring Check:[/green] URLs with keyword '{keyword_to_score}' were found at depth 1 (scoring likely influenced).")
     elif prioritized_found_overall:
-        console.print(f" [yellow]Scoring Check:[/yellow] URLs with keyword '{keyword_to_score}' found, but not necessarily prioritized early (check max_pages/depth limits).")
+        console.print(
+            f" [yellow]Scoring Check:[/yellow] URLs with keyword '{keyword_to_score}' found, but not necessarily prioritized early (check max_pages/depth limits).")
     else:
-        console.print(f" [yellow]Scoring Check:[/yellow] No URLs with keyword '{keyword_to_score}' found within crawl limits.")
+        console.print(
+            f" [yellow]Scoring Check:[/yellow] No URLs with keyword '{keyword_to_score}' found within crawl limits.")
 
     # print_result_summary called by make_request already shows URLs and depths
 
+
+
 # 6. Deep Crawl with Extraction
 async def demo_deep_with_css_extraction(client: httpx.AsyncClient):
     # Schema to extract H1 and first paragraph from any page
     general_schema = {
         "name": "PageContent",
         "baseSelector": "body", # Apply to whole body
         "fields": [
-            {"name": "page_title", "selector": "h1", "type": "text", "default": "N/A"},
-            {"name": "first_p", "selector": "p", "type": "text", "default": "N/A"}, # Gets first p tag
+            {"name": "page_title", "selector": "h1",
+             "type": "text", "default": "N/A"},
+            {"name": "first_p", "selector": "p", "type": "text",
+             "default": "N/A"}, # Gets first p tag
         ]
     }
     payload = {
@@ -801,8 +956,10 @@ async def demo_deep_with_css_extraction(client: httpx.AsyncClient):
                     "filter_chain": {
                         "type": "FilterChain",
                         "params": {"filters": [
-                            {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
-                            {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
+                            {"type": "DomainFilter", "params": {
+                                "allowed_domains": [DEEP_CRAWL_DOMAIN]}},
+                            {"type": "ContentTypeFilter", "params": {
+                                "allowed_types": ["text/html"]}}
                         ]}
                     }
                 }
@@ -818,21 +975,31 @@ async def demo_deep_with_css_extraction(client: httpx.AsyncClient):
             if result.get("success") and result.get("extracted_content"):
                 try:
                     extracted = json.loads(result["extracted_content"])
-                    if isinstance(extracted, list) and extracted: extracted = extracted[0] # Use first item
-                    title = extracted.get('page_title', 'N/A') if isinstance(extracted, dict) else 'Parse Error'
-                    console.print(f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Title: {title}")
+                    if isinstance(extracted, list) and extracted:
+                        extracted = extracted[0] # Use first item
+                    title = extracted.get(
+                        'page_title', 'N/A') if isinstance(extracted, dict) else 'Parse Error'
+                    console.print(
+                        f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Title: {title}")
                 except Exception:
-                    console.print(f" [yellow]![/] URL: [link={result['url']}]{result['url']}[/link] | Failed to parse extracted content")
+                    console.print(
+                        f" [yellow]![/] URL: [link={result['url']}]{result['url']}[/link] | Failed to parse extracted content")
             elif result.get("success"):
-                console.print(f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.")
+                console.print(
+                    f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.")
             else:
-                console.print(f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
+                console.print(
+                    f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
 
+
+
 # 6b. Deep Crawl with LLM Extraction
 async def demo_deep_with_llm_extraction(client: httpx.AsyncClient):
     if not os.getenv("OPENAI_API_KEY"): # Basic check
-        console.rule("[bold yellow]Demo 6b: Deep Crawl + LLM Extraction (SKIPPED)[/]", style="yellow")
-        console.print("Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.")
+        console.rule(
+            "[bold yellow]Demo 6b: Deep Crawl + LLM Extraction (SKIPPED)[/]", style="yellow")
+        console.print(
+            "Set an LLM API key (e.g., OPENAI_API_KEY) in your .env file or environment.")
         return
 
     payload = {
@@ -857,8 +1024,10 @@ async def demo_deep_with_llm_extraction(client: httpx.AsyncClient):
                     "filter_chain": {
                         "type": "FilterChain",
                         "params": {"filters": [
-                            {"type": "DomainFilter", "params": {"allowed_domains": [DEEP_CRAWL_DOMAIN]}},
-                            {"type": "ContentTypeFilter", "params": {"allowed_types": ["text/html"]}}
+                            {"type": "DomainFilter", "params": {
+                                "allowed_domains": [DEEP_CRAWL_DOMAIN]}},
+                            {"type": "ContentTypeFilter", "params": {
+                                "allowed_types": ["text/html"]}}
                         ]}
                    }
                }
@@ -872,23 +1041,28 @@ async def demo_deep_with_llm_extraction(client: httpx.AsyncClient):
         console.print("[cyan]LLM Extraction Summary from Deep Crawl:[/]")
         for result in results:
             if result.get("success") and result.get("extracted_content"):
-                console.print(f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Topic: {result['extracted_content']}")
+                console.print(
+                    f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Topic: {result['extracted_content']}")
             elif result.get("success"):
-                console.print(f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.")
+                console.print(
+                    f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | No content extracted.")
             else:
-                console.print(f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
+                console.print(
+                    f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
 
 
 # 6c. Deep Crawl with Proxies
 async def demo_deep_with_proxy(client: httpx.AsyncClient):
     proxy_params_list = load_proxies_from_env() # Get the list of parameter dicts
     if not proxy_params_list:
-        console.rule("[bold yellow]Demo 6c: Deep Crawl + Proxies (SKIPPED)[/]", style="yellow")
+        console.rule(
+            "[bold yellow]Demo 6c: Deep Crawl + Proxies (SKIPPED)[/]", style="yellow")
         console.print("Set the PROXIES environment variable to run this demo.")
         return
 
     payload = {
-        "urls": [DEEP_CRAWL_BASE_URL], # Use a site likely accessible via proxies
+        # Use a site likely accessible via proxies
+        "urls": [DEEP_CRAWL_BASE_URL],
         "browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
         "crawler_config": {
             "type": "CrawlerRunConfig",
@@ -899,7 +1073,8 @@ async def demo_deep_with_proxy(client: httpx.AsyncClient):
             "params": {
                 # Correctly create the list of {"type": ..., "params": ...} structures, excluding the demo 'ip' key
                 "proxies": [
-                    {"type": "ProxyConfig", "params": {k: v for k, v in p.items() if k != 'ip'}}
+                    {"type": "ProxyConfig", "params": {
+                        k: v for k, v in p.items() if k != 'ip'}}
                     for p in proxy_params_list
                 ]
             }
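load_proxies_from_env() is defined earlier in the script; the comprehension above implies each returned dict carries a demo-only 'ip' key that must be stripped before the payload is sent. A sketch of what such a loader could look like, assuming a comma-separated PROXIES value of host:port:user:pass entries and assuming ProxyConfig's field names — both may differ in the real script:

import os

def proxies_payload() -> list[dict]:
    entries = []
    for raw in os.getenv("PROXIES", "").split(","):
        if not raw.strip():
            continue
        host, port, user, pwd = raw.strip().split(":")
        entries.append({
            "server": f"http://{host}:{port}",  # field names assumed
            "username": user,
            "password": pwd,
            "ip": host,  # demo-only key, excluded below
        })
    # Wrap each entry, dropping the demo-only 'ip' key as the hunk above does.
    return [
        {"type": "ProxyConfig", "params": {k: v for k, v in p.items() if k != "ip"}}
        for p in entries
    ]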
@@ -965,13 +1140,18 @@ async def demo_deep_with_ssl(client: httpx.AsyncClient):
             issuer_org = cert.get('issuer', {}).get('O', 'N/A')
             valid_from = cert.get('not_before', 'N/A')
             valid_to = cert.get('not_after', 'N/A')
-            console.print(f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Issuer: {issuer_org} | Valid: {valid_from} - {valid_to}")
+            console.print(
+                f" [green]✔[/] URL: [link={result['url']}]{result['url']}[/link] | Issuer: {issuer_org} | Valid: {valid_from} - {valid_to}")
         elif result.get("success"):
-            console.print(f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | SSL cert not fetched or N/A.")
+            console.print(
+                f" [yellow]-[/] URL: [link={result['url']}]{result['url']}[/link] | SSL cert not fetched or N/A.")
         else:
-            console.print(f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")
+            console.print(
+                f" [red]✘[/] URL: [link={result['url']}]{result['url']}[/link] | Crawl failed.")

+
 # 7. Markdown helper endpoint
 async def demo_markdown_endpoint(client: httpx.AsyncClient):
     """
     One-shot helper around /md.
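The cert.get(...) reads above only work if the run config asked the crawler to capture certificates. A hedged payload fragment, assuming the CrawlerRunConfig parameter is named fetch_ssl_certificate and the result key is ssl_certificate — verify both against your server version:

crawler_config = {
    "type": "CrawlerRunConfig",
    "params": {
        "fetch_ssl_certificate": True,  # should populate result["ssl_certificate"]
    },
}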
@@ -987,15 +1167,19 @@ async def demo_markdown_endpoint(client: httpx.AsyncClient):
         t0 = time.time()
         resp = await client.post("/md", json=payload)
         dt = time.time() - t0
-        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
+        console.print(
+            f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
         resp.raise_for_status()
         md = resp.json().get("markdown", "")
         snippet = (md[:500] + "...") if len(md) > 500 else md
-        console.print(Panel(snippet, title="Markdown snippet", border_style="cyan", expand=False))
+        console.print(Panel(snippet, title="Markdown snippet",
+                            border_style="cyan", expand=False))
     except Exception as e:
         console.print(f"[bold red]Error hitting /md:[/] {e}")


 # 8. LLM QA helper endpoint
 async def demo_llm_endpoint(client: httpx.AsyncClient):
     """
     Quick QA round-trip with /llm.
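Stripped of the timing and Rich output, the /md round-trip reduces to one POST. A self-contained sketch, assuming the endpoint accepts at least a "url" field (the demo's full payload is built earlier, outside this hunk):

import asyncio
import httpx

BASE_URL = "http://localhost:11235"  # mirrors the script's default

async def fetch_markdown(url: str) -> str:
    # POST the target URL to /md and return the generated markdown.
    async with httpx.AsyncClient(base_url=BASE_URL) as client:
        resp = await client.post("/md", json={"url": url})
        resp.raise_for_status()
        return resp.json().get("markdown", "")

# asyncio.run(fetch_markdown("https://httpbin.org/html"))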
@@ -1012,10 +1196,12 @@ async def demo_llm_endpoint(client: httpx.AsyncClient):
         t0 = time.time()
         resp = await client.get(f"/llm/{enc}", params={"q": question})
         dt = time.time() - t0
-        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
+        console.print(
+            f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
         resp.raise_for_status()
         answer = resp.json().get("answer", "")
-        console.print(Panel(answer or "No answer returned", title="LLM answer", border_style="magenta", expand=False))
+        console.print(Panel(answer or "No answer returned",
+                            title="LLM answer", border_style="magenta", expand=False))
     except Exception as e:
         console.print(f"[bold red]Error hitting /llm:[/] {e}")
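The f"/llm/{enc}" route implies the target URL is percent-encoded into the path; enc and question are built earlier in the demo. A sketch under that assumption:

from urllib.parse import quote

import httpx

async def ask_llm(client: httpx.AsyncClient, url: str, question: str) -> str:
    # Encode the target URL into the path and ask one question about it.
    enc = quote(url, safe="")  # assumed path-safe encoding of the target URL
    resp = await client.get(f"/llm/{enc}", params={"q": question})
    resp.raise_for_status()
    return resp.json().get("answer", "")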
@@ -1036,10 +1222,12 @@ async def demo_config_dump_valid(client: httpx.AsyncClient):
         t0 = time.time()
         resp = await client.post("/config/dump", json=payload)
         dt = time.time() - t0
-        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
+        console.print(
+            f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/] (took {dt:.2f}s)")
         resp.raise_for_status()
         dump_json = resp.json()
-        console.print(Panel(Syntax(json.dumps(dump_json, indent=2), "json", theme="monokai"), title="Dump()", border_style="cyan"))
+        console.print(Panel(Syntax(json.dumps(dump_json, indent=2),
+                            "json", theme="monokai"), title="Dump()", border_style="cyan"))
     except Exception as e:
         console.print(f"[bold red]Error in valid /config/dump call:[/] {e}")
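/config/dump round-trips a config expression into its serialized dump, which is what the Syntax panel above renders. A compact sketch; "CrawlerRunConfig()" is assumed to be a minimal valid input (the demo builds a richer snippet earlier):

import httpx

async def dump_config(client: httpx.AsyncClient, code: str = "CrawlerRunConfig()") -> dict:
    # Send a single config expression; the server returns its JSON dump.
    resp = await client.post("/config/dump", json={"code": code})
    resp.raise_for_status()
    return resp.json()  # the serialized {"type": ..., "params": ...} dump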
@@ -1053,21 +1241,25 @@ async def demo_config_dump_invalid(client: httpx.AsyncClient):
     """).strip()
     payload = {"code": bad_code}

-    console.rule("[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta")
+    console.rule(
+        "[bold magenta]Demo 8b: /config/dump (invalid)[/]", style="magenta")
     print_payload(payload)

     try:
         resp = await client.post("/config/dump", json=payload)
-        console.print(f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]")
+        console.print(
+            f"Response Status: [bold {'green' if resp.is_success else 'red'}]{resp.status_code}[/]")
         resp.raise_for_status()  # should throw -> except
     except httpx.HTTPStatusError as e:
         console.print("[cyan]Expected parse/validation failure captured:[/]")
         try:
-            console.print(Panel(Syntax(json.dumps(e.response.json(), indent=2), "json", theme="fruity"), title="Error payload"))
+            console.print(Panel(Syntax(json.dumps(
+                e.response.json(), indent=2), "json", theme="fruity"), title="Error payload"))
         except Exception:
             console.print(e.response.text)
     except Exception as e:
-        console.print(f"[bold red]Unexpected error during invalid test:[/] {e}")
+        console.print(
+            f"[bold red]Unexpected error during invalid test:[/] {e}")


 # --- Update Main Runner to include new demo ---
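The invalid variant above treats the HTTPStatusError path as success: the endpoint must reject code it cannot parse. The assertion style, reduced to its core (the literal here is a hypothetical stand-in for the demo's bad_code):

import httpx

async def expect_dump_rejection(client: httpx.AsyncClient) -> None:
    # A 4xx/5xx status is the expected outcome for unparseable input.
    resp = await client.post("/config/dump", json={"code": "not valid python ("})
    assert resp.is_error, "server should reject unparseable config code"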
@@ -1077,31 +1269,31 @@ async def main_demo():
             return

         # --- Run Demos ---
-        await demo_basic_single_url(client)
-        await demo_basic_multi_url(client)
-        await demo_streaming_multi_url(client)
+        # await demo_basic_single_url(client)
+        # await demo_basic_multi_url(client)
+        # await demo_streaming_multi_url(client)

-        await demo_markdown_default(client)
-        await demo_markdown_pruning(client)
-        await demo_markdown_bm25(client)
+        # await demo_markdown_default(client)
+        # await demo_markdown_pruning(client)
+        # await demo_markdown_bm25(client)

-        await demo_param_css_selector(client)
-        await demo_param_js_execution(client)
-        await demo_param_screenshot(client)
-        await demo_param_ssl_fetch(client)
-        await demo_param_proxy(client)  # Skips if no PROXIES env var
+        # await demo_param_css_selector(client)
+        # await demo_param_js_execution(client)
+        # await demo_param_screenshot(client)
+        # await demo_param_ssl_fetch(client)
+        # await demo_param_proxy(client)  # Skips if no PROXIES env var

-        await demo_extract_css(client)
-        await demo_extract_llm(client)  # Skips if no common LLM key env var
+        # await demo_extract_css(client)
+        # await demo_extract_llm(client)  # Skips if no common LLM key env var

-        await demo_deep_basic(client)
-        await demo_deep_streaming(client)  # This needs extra work
+        # await demo_deep_basic(client)
+        # await demo_deep_streaming(client)  # This needs extra work

-        # await demo_deep_with_css_extraction(client)
-        await demo_deep_with_llm_extraction(client)  # Skips if no common LLM key env var
-        await demo_deep_with_proxy(client)  # Skips if no PROXIES env var
-        await demo_deep_with_ssl(client)  # Added the new demo
+        await demo_deep_with_css_extraction(client)
+        # # Skips if no common LLM key env var
+        # await demo_deep_with_llm_extraction(client)
+        # await demo_deep_with_proxy(client)  # Skips if no PROXIES env var
+        # await demo_deep_with_ssl(client)  # Added the new demo

         # --- Helper endpoints ---
         await demo_markdown_endpoint(client)
@@ -1120,5 +1312,6 @@ if __name__ == "__main__":
     except KeyboardInterrupt:
         console.print("\n[yellow]Demo interrupted by user.[/]")
     except Exception as e:
-        console.print(f"\n[bold red]An error occurred during demo execution:[/]")
+        console.print(
+            f"\n[bold red]An error occurred during demo execution:[/]")
         console.print_exception(show_locals=False)