Merge pull request #1390 from unclecode/fix/docker-raw-html

Check for raw: and raw:// URLs before auto-appending https:// prefix
This commit is contained in:
Nasrin
2025-08-13 13:56:33 +08:00
committed by GitHub
4 changed files with 42 additions and 11 deletions

View File

@@ -168,7 +168,7 @@ class SimpleApiTester:
print("\n=== CORE APIs ===")
test_url = "https://example.com"
test_raw_html_url = "raw://<html><body><h1>Hello, World!</h1></body></html>"
# Test markdown endpoint
md_payload = {
"url": test_url,
@@ -180,6 +180,17 @@ class SimpleApiTester:
# print(result['data'].get('markdown', ''))
self.print_result(result)
# Test markdown endpoint with raw HTML
raw_md_payload = {
"url": test_raw_html_url,
"f": "fit",
"q": "test query",
"c": "0"
}
result = self.test_post_endpoint("/md", raw_md_payload)
self.print_result(result)
# Test HTML endpoint
html_payload = {"url": test_url}
result = self.test_post_endpoint("/html", html_payload)
@@ -215,6 +226,15 @@ class SimpleApiTester:
result = self.test_post_endpoint("/crawl", crawl_payload)
self.print_result(result)
# Test crawl endpoint with raw HTML
crawl_payload = {
"urls": [test_raw_html_url],
"browser_config": {},
"crawler_config": {}
}
result = self.test_post_endpoint("/crawl", crawl_payload)
self.print_result(result)
# Test config dump
config_payload = {"code": "CrawlerRunConfig()"}
result = self.test_post_endpoint("/config/dump", config_payload)

View File

@@ -74,7 +74,7 @@ async def test_direct_api():
# Make direct API call
async with httpx.AsyncClient() as client:
response = await client.post(
"http://localhost:8000/crawl",
"http://localhost:11235/crawl",
json=request_data,
timeout=300
)
@@ -100,13 +100,24 @@ async def test_direct_api():
async with httpx.AsyncClient() as client:
response = await client.post(
"http://localhost:8000/crawl",
"http://localhost:11235/crawl",
json=request_data
)
assert response.status_code == 200
result = response.json()
print("Structured extraction result:", result["success"])
# Test 3: Raw HTML
request_data["urls"] = ["raw://<html><body><h1>Hello, World!</h1><a href='https://example.com'>Example</a></body></html>"]
async with httpx.AsyncClient() as client:
response = await client.post(
"http://localhost:11235/crawl",
json=request_data
)
assert response.status_code == 200
result = response.json()
print("Raw HTML result:", result["success"])
# Test 4: Get schema
# async with httpx.AsyncClient() as client:
# response = await client.get("http://localhost:11235/schema")
@@ -118,7 +129,7 @@ async def test_with_client():
"""Test using the Crawl4AI Docker client SDK"""
print("\n=== Testing Client SDK ===")
async with Crawl4aiDockerClient(verbose=True) as client:
async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
# Test 1: Basic crawl
browser_config = BrowserConfig(headless=True)
crawler_config = CrawlerRunConfig(