Merge pull request #752 from jl-martins/fix-raw-url-parsing

Fix `raw://` URL parsing logic. issue ref #1118
This commit is contained in:
Nasrin
2025-06-03 11:10:29 +02:00
committed by GitHub
2 changed files with 27 additions and 2 deletions

View File

@@ -466,9 +466,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
console_messages=captured_console,
)
elif url.startswith("raw:") or url.startswith("raw://"):
elif url.startswith("raw:"):
# Process raw HTML content
raw_html = url[4:] if url[:4] == "raw:" else url[7:]
raw_html = url[6:] if url.startswith("raw://") else url[4:]
html = raw_html
if config.screenshot:
screenshot_data = await self._generate_screenshot_from_html(html)

View File

@@ -15,6 +15,24 @@ CRAWL4AI_HOME_DIR = Path(os.path.expanduser("~")).joinpath(".crawl4ai")
if not CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").exists():
CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").mkdir(parents=True)
@pytest.fixture
def basic_html():
return """
<html lang="en">
<head>
<title>Basic HTML</title>
</head>
<body>
<h1>Main Heading</h1>
<main>
<div class="container">
<p>Basic HTML document for testing purposes.</p>
</div>
</main>
</body>
</html>
"""
# Test Config Files
@pytest.fixture
def basic_browser_config():
@@ -325,6 +343,13 @@ async def test_stealth_mode(crawler_strategy):
)
assert response.status_code == 200
@pytest.mark.asyncio
@pytest.mark.parametrize("prefix", ("raw:", "raw://"))
async def test_raw_urls(crawler_strategy, basic_html, prefix):
url = f"{prefix}{basic_html}"
response = await crawler_strategy.crawl(url, CrawlerRunConfig())
assert response.html == basic_html
# Error Handling Tests
@pytest.mark.asyncio
async def test_invalid_url():