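Fix "raw://" URL parsing logic
Previously the "raw:" prefix check also matched "raw://" URLs, so only four characters were stripped and the extracted HTML kept a leading "//" (the url[7:] branch was never reached). The six-character "raw://" prefix is now checked first, falling back to stripping "raw:".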
Closes https://github.com/unclecode/crawl4ai/issues/686
This commit is contained in:
@@ -1231,9 +1231,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
get_delayed_content=None,
|
get_delayed_content=None,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif url.startswith("raw:") or url.startswith("raw://"):
|
elif url.startswith("raw:"):
|
||||||
# Process raw HTML content
|
# Process raw HTML content
|
||||||
raw_html = url[4:] if url[:4] == "raw:" else url[7:]
|
raw_html = url[6:] if url.startswith("raw://") else url[4:]
|
||||||
html = raw_html
|
html = raw_html
|
||||||
if config.screenshot:
|
if config.screenshot:
|
||||||
screenshot_data = await self._generate_screenshot_from_html(html)
|
screenshot_data = await self._generate_screenshot_from_html(html)
|
||||||
|
|||||||
@@ -15,6 +15,24 @@ CRAWL4AI_HOME_DIR = Path(os.path.expanduser("~")).joinpath(".crawl4ai")
|
|||||||
if not CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").exists():
|
if not CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").exists():
|
||||||
CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").mkdir(parents=True)
|
CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").mkdir(parents=True)
|
||||||
|
|
||||||
|
@pytest.fixture
|
||||||
|
def basic_html():
|
||||||
|
return """
|
||||||
|
<html lang="en">
|
||||||
|
<head>
|
||||||
|
<title>Basic HTML</title>
|
||||||
|
</head>
|
||||||
|
<body>
|
||||||
|
<h1>Main Heading</h1>
|
||||||
|
<main>
|
||||||
|
<div class="container">
|
||||||
|
<p>Basic HTML document for testing purposes.</p>
|
||||||
|
</div>
|
||||||
|
</main>
|
||||||
|
</body>
|
||||||
|
</html>
|
||||||
|
"""
|
||||||
|
|
||||||
# Test Config Files
|
# Test Config Files
|
||||||
@pytest.fixture
|
@pytest.fixture
|
||||||
def basic_browser_config():
|
def basic_browser_config():
|
||||||
@@ -325,6 +343,13 @@ async def test_stealth_mode(crawler_strategy):
|
|||||||
)
|
)
|
||||||
assert response.status_code == 200
|
assert response.status_code == 200
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
|
||||||
|
@pytest.mark.parametrize("prefix", ("raw:", "raw://"))
|
||||||
|
async def test_raw_urls(crawler_strategy, basic_html, prefix):
|
||||||
|
url = f"{prefix}{basic_html}"
|
||||||
|
response = await crawler_strategy.crawl(url, CrawlerRunConfig())
|
||||||
|
assert response.html == basic_html
|
||||||
|
|
||||||
# Error Handling Tests
|
# Error Handling Tests
|
||||||
@pytest.mark.asyncio
|
@pytest.mark.asyncio
|
||||||
async def test_invalid_url():
|
async def test_invalid_url():
|
||||||
|
|||||||
Reference in New Issue
Block a user