fix: Check for raw: and raw:// URLs before auto-appending https:// prefix
- Add raw HTML URL validation alongside http/https checks
- Fix URL preprocessing logic to handle raw: and raw:// prefixes
- Update error message and add comprehensive test cases
This commit is contained in:
@@ -65,7 +65,7 @@ async def handle_llm_qa(
|
|||||||
) -> str:
|
) -> str:
|
||||||
"""Process QA using LLM with crawled content as context."""
|
"""Process QA using LLM with crawled content as context."""
|
||||||
try:
|
try:
|
||||||
if not url.startswith(('http://', 'https://')):
|
if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")):
|
||||||
url = 'https://' + url
|
url = 'https://' + url
|
||||||
# Extract base URL by finding last '?q=' occurrence
|
# Extract base URL by finding last '?q=' occurrence
|
||||||
last_q_index = url.rfind('?q=')
|
last_q_index = url.rfind('?q=')
|
||||||
@@ -191,7 +191,7 @@ async def handle_markdown_request(
|
|||||||
detail=error_msg
|
detail=error_msg
|
||||||
)
|
)
|
||||||
decoded_url = unquote(url)
|
decoded_url = unquote(url)
|
||||||
if not decoded_url.startswith(('http://', 'https://')):
|
if not decoded_url.startswith(('http://', 'https://')) and not decoded_url.startswith(("raw:", "raw://")):
|
||||||
decoded_url = 'https://' + decoded_url
|
decoded_url = 'https://' + decoded_url
|
||||||
|
|
||||||
if filter_type == FilterType.RAW:
|
if filter_type == FilterType.RAW:
|
||||||
@@ -328,7 +328,7 @@ async def create_new_task(
|
|||||||
) -> JSONResponse:
|
) -> JSONResponse:
|
||||||
"""Create and initialize a new task."""
|
"""Create and initialize a new task."""
|
||||||
decoded_url = unquote(input_path)
|
decoded_url = unquote(input_path)
|
||||||
if not decoded_url.startswith(('http://', 'https://')):
|
if not decoded_url.startswith(('http://', 'https://')) and not decoded_url.startswith(("raw:", "raw://")):
|
||||||
decoded_url = 'https://' + decoded_url
|
decoded_url = 'https://' + decoded_url
|
||||||
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
@@ -428,7 +428,7 @@ async def handle_crawl_request(
|
|||||||
peak_mem_mb = start_mem_mb
|
peak_mem_mb = start_mem_mb
|
||||||
|
|
||||||
try:
|
try:
|
||||||
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) else url for url in urls]
|
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
|
||||||
browser_config = BrowserConfig.load(browser_config)
|
browser_config = BrowserConfig.load(browser_config)
|
||||||
crawler_config = CrawlerRunConfig.load(crawler_config)
|
crawler_config = CrawlerRunConfig.load(crawler_config)
|
||||||
|
|
||||||
|
|||||||
@@ -237,9 +237,9 @@ async def get_markdown(
|
|||||||
body: MarkdownRequest,
|
body: MarkdownRequest,
|
||||||
_td: Dict = Depends(token_dep),
|
_td: Dict = Depends(token_dep),
|
||||||
):
|
):
|
||||||
if not body.url.startswith(("http://", "https://")):
|
if not body.url.startswith(("http://", "https://")) and not body.url.startswith(("raw:", "raw://")):
|
||||||
raise HTTPException(
|
raise HTTPException(
|
||||||
400, "URL must be absolute and start with http/https")
|
400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)")
|
||||||
markdown = await handle_markdown_request(
|
markdown = await handle_markdown_request(
|
||||||
body.url, body.f, body.q, body.c, config, body.provider
|
body.url, body.f, body.q, body.c, config, body.provider
|
||||||
)
|
)
|
||||||
@@ -401,7 +401,7 @@ async def llm_endpoint(
|
|||||||
):
|
):
|
||||||
if not q:
|
if not q:
|
||||||
raise HTTPException(400, "Query parameter 'q' is required")
|
raise HTTPException(400, "Query parameter 'q' is required")
|
||||||
if not url.startswith(("http://", "https://")):
|
if not url.startswith(("http://", "https://")) and not url.startswith(("raw:", "raw://")):
|
||||||
url = "https://" + url
|
url = "https://" + url
|
||||||
answer = await handle_llm_qa(url, q, config)
|
answer = await handle_llm_qa(url, q, config)
|
||||||
return JSONResponse({"answer": answer})
|
return JSONResponse({"answer": answer})
|
||||||
|
|||||||
@@ -168,7 +168,7 @@ class SimpleApiTester:
|
|||||||
print("\n=== CORE APIs ===")
|
print("\n=== CORE APIs ===")
|
||||||
|
|
||||||
test_url = "https://example.com"
|
test_url = "https://example.com"
|
||||||
|
test_raw_html_url = "raw://<html><body><h1>Hello, World!</h1></body></html>"
|
||||||
# Test markdown endpoint
|
# Test markdown endpoint
|
||||||
md_payload = {
|
md_payload = {
|
||||||
"url": test_url,
|
"url": test_url,
|
||||||
@@ -180,6 +180,17 @@ class SimpleApiTester:
|
|||||||
# print(result['data'].get('markdown', ''))
|
# print(result['data'].get('markdown', ''))
|
||||||
self.print_result(result)
|
self.print_result(result)
|
||||||
|
|
||||||
|
# Test markdown endpoint with raw HTML
|
||||||
|
raw_md_payload = {
|
||||||
|
"url": test_raw_html_url,
|
||||||
|
"f": "fit",
|
||||||
|
"q": "test query",
|
||||||
|
"c": "0"
|
||||||
|
}
|
||||||
|
result = self.test_post_endpoint("/md", raw_md_payload)
|
||||||
|
self.print_result(result)
|
||||||
|
|
||||||
|
|
||||||
# Test HTML endpoint
|
# Test HTML endpoint
|
||||||
html_payload = {"url": test_url}
|
html_payload = {"url": test_url}
|
||||||
result = self.test_post_endpoint("/html", html_payload)
|
result = self.test_post_endpoint("/html", html_payload)
|
||||||
@@ -215,6 +226,15 @@ class SimpleApiTester:
|
|||||||
result = self.test_post_endpoint("/crawl", crawl_payload)
|
result = self.test_post_endpoint("/crawl", crawl_payload)
|
||||||
self.print_result(result)
|
self.print_result(result)
|
||||||
|
|
||||||
|
# Test crawl endpoint with raw HTML
|
||||||
|
crawl_payload = {
|
||||||
|
"urls": [test_raw_html_url],
|
||||||
|
"browser_config": {},
|
||||||
|
"crawler_config": {}
|
||||||
|
}
|
||||||
|
result = self.test_post_endpoint("/crawl", crawl_payload)
|
||||||
|
self.print_result(result)
|
||||||
|
|
||||||
# Test config dump
|
# Test config dump
|
||||||
config_payload = {"code": "CrawlerRunConfig()"}
|
config_payload = {"code": "CrawlerRunConfig()"}
|
||||||
result = self.test_post_endpoint("/config/dump", config_payload)
|
result = self.test_post_endpoint("/config/dump", config_payload)
|
||||||
|
|||||||
@@ -74,7 +74,7 @@ async def test_direct_api():
|
|||||||
# Make direct API call
|
# Make direct API call
|
||||||
async with httpx.AsyncClient() as client:
|
async with httpx.AsyncClient() as client:
|
||||||
response = await client.post(
|
response = await client.post(
|
||||||
"http://localhost:8000/crawl",
|
"http://localhost:11235/crawl",
|
||||||
json=request_data,
|
json=request_data,
|
||||||
timeout=300
|
timeout=300
|
||||||
)
|
)
|
||||||
@@ -100,13 +100,24 @@ async def test_direct_api():
|
|||||||
|
|
||||||
async with httpx.AsyncClient() as client:
|
async with httpx.AsyncClient() as client:
|
||||||
response = await client.post(
|
response = await client.post(
|
||||||
"http://localhost:8000/crawl",
|
"http://localhost:11235/crawl",
|
||||||
json=request_data
|
json=request_data
|
||||||
)
|
)
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
result = response.json()
|
result = response.json()
|
||||||
print("Structured extraction result:", result["success"])
|
print("Structured extraction result:", result["success"])
|
||||||
|
|
||||||
|
# Test 3: Raw HTML
|
||||||
|
request_data["urls"] = ["raw://<html><body><h1>Hello, World!</h1><a href='https://example.com'>Example</a></body></html>"]
|
||||||
|
async with httpx.AsyncClient() as client:
|
||||||
|
response = await client.post(
|
||||||
|
"http://localhost:11235/crawl",
|
||||||
|
json=request_data
|
||||||
|
)
|
||||||
|
assert response.status_code == 200
|
||||||
|
result = response.json()
|
||||||
|
print("Raw HTML result:", result["success"])
|
||||||
|
|
||||||
# Test 3: Get schema
|
# Test 3: Get schema
|
||||||
# async with httpx.AsyncClient() as client:
|
# async with httpx.AsyncClient() as client:
|
||||||
# response = await client.get("http://localhost:8000/schema")
|
# response = await client.get("http://localhost:8000/schema")
|
||||||
@@ -118,7 +129,7 @@ async def test_with_client():
|
|||||||
"""Test using the Crawl4AI Docker client SDK"""
|
"""Test using the Crawl4AI Docker client SDK"""
|
||||||
print("\n=== Testing Client SDK ===")
|
print("\n=== Testing Client SDK ===")
|
||||||
|
|
||||||
async with Crawl4aiDockerClient(verbose=True) as client:
|
async with Crawl4aiDockerClient(base_url="http://localhost:11235", verbose=True) as client:
|
||||||
# Test 1: Basic crawl
|
# Test 1: Basic crawl
|
||||||
browser_config = BrowserConfig(headless=True)
|
browser_config = BrowserConfig(headless=True)
|
||||||
crawler_config = CrawlerRunConfig(
|
crawler_config = CrawlerRunConfig(
|
||||||
|
|||||||
Reference in New Issue
Block a user