fix: Check for raw: and raw:// URLs before auto-appending https:// prefix
- Add raw HTML URL validation alongside http/https checks - Fix URL preprocessing logic to handle raw: and raw:// prefixes - Update error message and add comprehensive test cases
This commit is contained in:
@@ -65,7 +65,7 @@ async def handle_llm_qa(
|
||||
) -> str:
|
||||
"""Process QA using LLM with crawled content as context."""
|
||||
try:
|
||||
if not url.startswith(('http://', 'https://')):
|
||||
if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")):
|
||||
url = 'https://' + url
|
||||
# Extract base URL by finding last '?q=' occurrence
|
||||
last_q_index = url.rfind('?q=')
|
||||
@@ -191,7 +191,7 @@ async def handle_markdown_request(
|
||||
detail=error_msg
|
||||
)
|
||||
decoded_url = unquote(url)
|
||||
if not decoded_url.startswith(('http://', 'https://')):
|
||||
if not decoded_url.startswith(('http://', 'https://')) and not decoded_url.startswith(("raw:", "raw://")):
|
||||
decoded_url = 'https://' + decoded_url
|
||||
|
||||
if filter_type == FilterType.RAW:
|
||||
@@ -328,7 +328,7 @@ async def create_new_task(
|
||||
) -> JSONResponse:
|
||||
"""Create and initialize a new task."""
|
||||
decoded_url = unquote(input_path)
|
||||
if not decoded_url.startswith(('http://', 'https://')):
|
||||
if not decoded_url.startswith(('http://', 'https://')) and not decoded_url.startswith(("raw:", "raw://")):
|
||||
decoded_url = 'https://' + decoded_url
|
||||
|
||||
from datetime import datetime
|
||||
@@ -428,7 +428,7 @@ async def handle_crawl_request(
|
||||
peak_mem_mb = start_mem_mb
|
||||
|
||||
try:
|
||||
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) else url for url in urls]
|
||||
urls = [('https://' + url) if not url.startswith(('http://', 'https://')) and not url.startswith(("raw:", "raw://")) else url for url in urls]
|
||||
browser_config = BrowserConfig.load(browser_config)
|
||||
crawler_config = CrawlerRunConfig.load(crawler_config)
|
||||
|
||||
|
||||
@@ -237,9 +237,9 @@ async def get_markdown(
|
||||
body: MarkdownRequest,
|
||||
_td: Dict = Depends(token_dep),
|
||||
):
|
||||
if not body.url.startswith(("http://", "https://")):
|
||||
if not body.url.startswith(("http://", "https://")) and not body.url.startswith(("raw:", "raw://")):
|
||||
raise HTTPException(
|
||||
400, "URL must be absolute and start with http/https")
|
||||
400, "Invalid URL format. Must start with http://, https://, or for raw HTML (raw:, raw://)")
|
||||
markdown = await handle_markdown_request(
|
||||
body.url, body.f, body.q, body.c, config, body.provider
|
||||
)
|
||||
@@ -401,7 +401,7 @@ async def llm_endpoint(
|
||||
):
|
||||
if not q:
|
||||
raise HTTPException(400, "Query parameter 'q' is required")
|
||||
if not url.startswith(("http://", "https://")):
|
||||
if not url.startswith(("http://", "https://")) and not url.startswith(("raw:", "raw://")):
|
||||
url = "https://" + url
|
||||
answer = await handle_llm_qa(url, q, config)
|
||||
return JSONResponse({"answer": answer})
|
||||
|
||||
Reference in New Issue
Block a user