fix: Check for raw: and raw:// URLs before auto-appending https:// prefix

- Add raw HTML URL validation alongside http/https checks
- Fix URL preprocessing logic to handle raw: and raw:// prefixes
- Update error message and add comprehensive test cases
This commit is contained in:
Soham Kukreti
2025-08-11 22:10:53 +05:30
parent f0ce7b2710
commit f30811b524
4 changed files with 42 additions and 11 deletions

View File

@@ -74,7 +74,7 @@ async def test_direct_api():
# Make direct API call
async with httpx.AsyncClient() as client:
response = await client.post(
"http://localhost:8000/crawl",
"http://localhost:11235/crawl",
json=request_data,
timeout=300
)
@@ -100,13 +100,24 @@ async def test_direct_api():
async with httpx.AsyncClient() as client:
response = await client.post(
"http://localhost:8000/crawl",
"http://localhost:11235/crawl",
json=request_data
)
assert response.status_code == 200
result = response.json()
print("Structured extraction result:", result["success"])
# Test 3: Raw HTML
request_data["urls"] = ["raw://<html><body><h1>Hello, World!</h1><a href='https://example.com'>Example</a></body></html>"]
async with httpx.AsyncClient() as client:
response = await client.post(
"http://localhost:11235/crawl",
json=request_data
)
assert response.status_code == 200
result = response.json()
print("Raw HTML result:", result["success"])
# Test 4: Get schema
# async with httpx.AsyncClient() as client:
# response = await client.get("http://localhost:11235/schema")
@@ -118,7 +129,7 @@ async def test_with_client():
"""Test using the Crawl4AI Docker client SDK"""
print("\n=== Testing Client SDK ===")
async with Crawl4aiDockerClient(verbose=True) as client:
async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
# Test 1: Basic crawl
browser_config = BrowserConfig(headless=True)
crawler_config = CrawlerRunConfig(