Fix "raw://" URL parsing logic

Closes https://github.com/unclecode/crawl4ai/issues/686
This commit is contained in:
João Martins
2025-02-15 15:34:59 +00:00
parent dde14eba7d
commit 27af4cc27b
2 changed files with 27 additions and 2 deletions

View File

@@ -1231,9 +1231,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
get_delayed_content=None,
)
elif url.startswith("raw:") or url.startswith("raw://"):
elif url.startswith("raw:"):
# Process raw HTML content
raw_html = url[4:] if url[:4] == "raw:" else url[7:]
raw_html = url[6:] if url.startswith("raw://") else url[4:]
html = raw_html
if config.screenshot:
screenshot_data = await self._generate_screenshot_from_html(html)

View File

@@ -15,6 +15,24 @@ CRAWL4AI_HOME_DIR = Path(os.path.expanduser("~")).joinpath(".crawl4ai")
# Make sure the "profiles/test_profile" directory exists under CRAWL4AI_HOME_DIR.
# exist_ok=True replaces the separate exists() check, so two processes importing
# this module at the same time cannot fail with FileExistsError.
CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").mkdir(parents=True, exist_ok=True)
@pytest.fixture
def basic_html():
    """Provide a minimal HTML document (title, heading, one paragraph) for tests."""
    document = """
<html lang="en">
<head>
<title>Basic HTML</title>
</head>
<body>
<h1>Main Heading</h1>
<main>
<div class="container">
<p>Basic HTML document for testing purposes.</p>
</div>
</main>
</body>
</html>
"""
    return document
# Test Config Files
@pytest.fixture
def basic_browser_config():
@@ -325,6 +343,13 @@ async def test_stealth_mode(crawler_strategy):
)
assert response.status_code == 200
@pytest.mark.asyncio
@pytest.mark.parametrize("prefix", ("raw:", "raw://"))
async def test_raw_urls(crawler_strategy, basic_html, prefix):
    """Check that crawling a raw-prefixed URL returns the HTML payload unchanged.

    Parametrized over both accepted prefixes, "raw:" and "raw://".
    """
    # Build the pseudo-URL by putting the prefix directly in front of the markup.
    raw_url = prefix + basic_html
    run_config = CrawlerRunConfig()
    result = await crawler_strategy.crawl(raw_url, run_config)
    # The prefix must be stripped completely, leaving exactly the fixture HTML.
    assert result.html == basic_html
# Error Handling Tests
@pytest.mark.asyncio
async def test_invalid_url():