Basic HTML document for testing purposes.
+From 27af4cc27bafa5bfa60849c5c124eb3ae47ec987 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jo=C3=A3o=20Martins?= <11438285+jl-martins@users.noreply.github.com> Date: Sat, 15 Feb 2025 15:34:59 +0000 Subject: [PATCH] Fix "raw://" URL parsing logic Closes https://github.com/unclecode/crawl4ai/issues/686 --- crawl4ai/async_crawler_strategy.py | 4 +-- tests/20241401/test_async_crawler_strategy.py | 25 +++++++++++++++++++ 2 files changed, 27 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 62ee4c65..5e3c2519 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1231,9 +1231,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): get_delayed_content=None, ) - elif url.startswith("raw:") or url.startswith("raw://"): + elif url.startswith("raw:"): # Process raw HTML content - raw_html = url[4:] if url[:4] == "raw:" else url[7:] + raw_html = url[6:] if url.startswith("raw://") else url[4:] html = raw_html if config.screenshot: screenshot_data = await self._generate_screenshot_from_html(html) diff --git a/tests/20241401/test_async_crawler_strategy.py b/tests/20241401/test_async_crawler_strategy.py index 68fe4a88..8426fe0a 100644 --- a/tests/20241401/test_async_crawler_strategy.py +++ b/tests/20241401/test_async_crawler_strategy.py @@ -15,6 +15,24 @@ CRAWL4AI_HOME_DIR = Path(os.path.expanduser("~")).joinpath(".crawl4ai") if not CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").exists(): CRAWL4AI_HOME_DIR.joinpath("profiles", "test_profile").mkdir(parents=True) +@pytest.fixture +def basic_html(): + return """ + +
+Basic HTML document for testing purposes.
+