fix: Check for raw: and raw:// URLs before auto-appending https:// prefix

- Add raw HTML URL validation alongside http/https checks
- Fix URL preprocessing logic to handle raw: and raw:// prefixes
- Update error message and add comprehensive test cases
This commit is contained in:
Author: Soham Kukreti
Date: 2025-08-11 22:10:53 +05:30
parent f0ce7b2710
commit f30811b524
4 changed files with 42 additions and 11 deletions

View File

@@ -65,7 +65,7 @@ async def handle_llm_qa(
) -> str: ) -> str:
"""Process QA using LLM with crawled content as context.""" """Process QA using LLM with crawled content as context."""
try: try:
if not url.startswith(('http://', 'https://')): if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")):
url = 'https://' + url url = 'https://' + url
# Extract base URL by finding last '?q=' occurrence # Extract base URL by finding last '?q=' occurrence
last_q_index = url.rfind('?q=') last_q_index = url.rfind('?q=')
@@ -191,7 +191,7 @@ async def handle_markdown_request(
detail=error_msg detail=error_msg
) )
decoded_url = unquote(url) decoded_url = unquote(url)
if not decoded_url.startswith(('http://', 'https://')): if not decoded_url.startswith(('http://', 'https://')) and not decoded_url.startswith(("raw:", "raw://")):
decoded_url = 'https://' + decoded_url decoded_url = 'https://' + decoded_url
if filter_type == FilterType.RAW: if filter_type == FilterType.RAW:
@@ -328,7 +328,7 @@ async def create_new_task(
) -> JSONResponse: ) -> JSONResponse:
"""Create and initialize a new task.""" """Create and initialize a new task."""
decoded_url = unquote(input_path) decoded_url = unquote(input_path)
if not decoded_url.startswith(('http://', 'https://')): if not decoded_url.startswith(('http://', 'https://')) and not decoded_url.startswith(("raw:", "raw://")):
decoded_url = 'https://' + decoded_url decoded_url = 'https://' + decoded_url
from datetime import datetime from datetime import datetime
@@ -428,7 +428,7 @@ async def handle_crawl_request(
peak_mem_mb = start_mem_mb peak_mem_mb = start_mem_mb
try: try:
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) else url for url in urls] urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
browser_config = BrowserConfig.load(browser_config) browser_config = BrowserConfig.load(browser_config)
crawler_config = CrawlerRunConfig.load(crawler_config) crawler_config = CrawlerRunConfig.load(crawler_config)

View File

@@ -237,9 +237,9 @@ async def get_markdown(
body: MarkdownRequest, body: MarkdownRequest,
_td: Dict = Depends(token_dep), _td: Dict = Depends(token_dep),
): ):
if not body.url.startswith(("http://", "https://")): if not body.url.startswith(("http://", "https://")) and not body.url.startswith(("raw:", "raw://")):
raise HTTPException( raise HTTPException(
400, "URL must be absolute and start with http/https") 400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)")
markdown = await handle_markdown_request( markdown = await handle_markdown_request(
body.url, body.f, body.q, body.c, config, body.provider body.url, body.f, body.q, body.c, config, body.provider
) )
@@ -401,7 +401,7 @@ async def llm_endpoint(
): ):
if not q: if not q:
raise HTTPException(400, "Query parameter 'q' is required") raise HTTPException(400, "Query parameter 'q' is required")
if not url.startswith(("http://", "https://")): if not url.startswith(("http://", "https://")) and not url.startswith(("raw:", "raw://")):
url = "https://" + url url = "https://" + url
answer = await handle_llm_qa(url, q, config) answer = await handle_llm_qa(url, q, config)
return JSONResponse({"answer": answer}) return JSONResponse({"answer": answer})

View File

@@ -168,7 +168,7 @@ class SimpleApiTester:
print("\n=== CORE APIs ===") print("\n=== CORE APIs ===")
test_url = "https://example.com" test_url = "https://example.com"
test_raw_html_url = "raw://<html><body><h1>Hello, World!</h1></body></html>"
# Test markdown endpoint # Test markdown endpoint
md_payload = { md_payload = {
"url": test_url, "url": test_url,
@@ -180,6 +180,17 @@ class SimpleApiTester:
# print(result['data'].get('markdown', '')) # print(result['data'].get('markdown', ''))
self.print_result(result) self.print_result(result)
# Test markdown endpoint with raw HTML
raw_md_payload = {
"url": test_raw_html_url,
"f": "fit",
"q": "test query",
"c": "0"
}
result = self.test_post_endpoint("/md", raw_md_payload)
self.print_result(result)
# Test HTML endpoint # Test HTML endpoint
html_payload = {"url": test_url} html_payload = {"url": test_url}
result = self.test_post_endpoint("/html", html_payload) result = self.test_post_endpoint("/html", html_payload)
@@ -215,6 +226,15 @@ class SimpleApiTester:
result = self.test_post_endpoint("/crawl", crawl_payload) result = self.test_post_endpoint("/crawl", crawl_payload)
self.print_result(result) self.print_result(result)
# Test crawl endpoint with raw HTML
crawl_payload = {
"urls": [test_raw_html_url],
"browser_config": {},
"crawler_config": {}
}
result = self.test_post_endpoint("/crawl", crawl_payload)
self.print_result(result)
# Test config dump # Test config dump
config_payload = {"code": "CrawlerRunConfig()"} config_payload = {"code": "CrawlerRunConfig()"}
result = self.test_post_endpoint("/config/dump", config_payload) result = self.test_post_endpoint("/config/dump", config_payload)

View File

@@ -74,7 +74,7 @@ async def test_direct_api():
# Make direct API call # Make direct API call
async with httpx.AsyncClient() as client: async with httpx.AsyncClient() as client:
response = await client.post( response = await client.post(
"http://localhost:8000/crawl", "http://localhost:11235/crawl",
json=request_data, json=request_data,
timeout=300 timeout=300
) )
@@ -100,13 +100,24 @@ async def test_direct_api():
async with httpx.AsyncClient() as client: async with httpx.AsyncClient() as client:
response = await client.post( response = await client.post(
"http://localhost:8000/crawl", "http://localhost:11235/crawl",
json=request_data json=request_data
) )
assert response.status_code == 200 assert response.status_code == 200
result = response.json() result = response.json()
print("Structured extraction result:", result["success"]) print("Structured extraction result:", result["success"])
# Test 3: Raw HTML
request_data["urls"] = ["raw://<html><body><h1>Hello, World!</h1><a href='https://example.com'>Example</a></body></html>"]
async with httpx.AsyncClient() as client:
response = await client.post(
"http://localhost:11235/crawl",
json=request_data
)
assert response.status_code == 200
result = response.json()
print("Raw HTML result:", result["success"])
# Test 3: Get schema # Test 3: Get schema
# async with httpx.AsyncClient() as client: # async with httpx.AsyncClient() as client:
# response = await client.get("http://localhost:8000/schema") # response = await client.get("http://localhost:8000/schema")
@@ -118,7 +129,7 @@ async def test_with_client():
"""Test using the Crawl4AI Docker client SDK""" """Test using the Crawl4AI Docker client SDK"""
print("\n=== Testing Client SDK ===") print("\n=== Testing Client SDK ===")
async with Crawl4aiDockerClient(verbose=True) as client: async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
# Test 1: Basic crawl # Test 1: Basic crawl
browser_config = BrowserConfig(headless=True) browser_config = BrowserConfig(headless=True)
crawler_config = CrawlerRunConfig( crawler_config = CrawlerRunConfig(