From edd0b576b103df4acb251e41cfd4d55b53e25ae7 Mon Sep 17 00:00:00 2001
From: rbushria <rbushri@gmail.com>
Date: Thu, 28 Aug 2025 10:46:44 +0300
Subject: [PATCH] Fix: Use correct URL variable for raw HTML extraction (#1116)

- Prevents the full HTML content from being passed as the URL to extraction strategies
- Adds unit tests to verify raw HTML and regular URL processing

Fixes the wrong URL variable being used for extraction of raw HTML.
---
 crawl4ai/async_webcrawler.py           |  2 +-
 tests/general/test_async_webcrawler.py | 81 +++++++++++++++++++++++++-
 2 files changed, 81 insertions(+), 2 deletions(-)
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index ebd2859d..359aa73c 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -615,7 +615,7 @@ class AsyncWebCrawler:
                 else config.chunking_strategy
             )
             sections = chunking.chunk(content)
-            extracted_content = config.extraction_strategy.run(url, sections)
+            extracted_content = config.extraction_strategy.run(_url, sections)
             extracted_content = json.dumps(
                 extracted_content, indent=4, default=str, ensure_ascii=False
             )
diff --git a/tests/general/test_async_webcrawler.py b/tests/general/test_async_webcrawler.py
index 4d7aa815..80d4acbe 100644
--- a/tests/general/test_async_webcrawler.py
+++ b/tests/general/test_async_webcrawler.py
@@ -9,6 +9,21 @@ from crawl4ai import (
     RateLimiter,
     CacheMode
 )
+from crawl4ai.extraction_strategy import ExtractionStrategy
+
+class MockExtractionStrategy(ExtractionStrategy):
+    """Mock extraction strategy that records each URL passed to run(), for testing URL parameter handling"""
+
+    def __init__(self):
+        super().__init__()
+        self.run_calls = []
+
+    def extract(self, url: str, html: str, *args, **kwargs):
+        return [{"test": "data"}]
+
+    def run(self, url: str, sections: List[str], *args, **kwargs):
+        self.run_calls.append(url)
+        return super().run(url, sections, *args, **kwargs)
 
 @pytest.mark.asyncio
 @pytest.mark.parametrize("viewport", [
@@ -142,8 +157,72 @@ async def test_error_handling(error_url):
         assert not result.success
         assert result.error_message is not None
 
+@pytest.mark.asyncio
+async def test_extraction_strategy_run_with_regular_url():
+    """
+    Regression test for extraction_strategy.run URL parameter handling with regular URLs.
+
+    This test verifies that when is_raw_html=False (regular URL),
+    extraction_strategy.run is called with the actual URL.
+    """
+    browser_config = BrowserConfig(
+        browser_type="chromium",
+        headless=True
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        mock_strategy = MockExtractionStrategy()
+
+        # Test regular URL (is_raw_html=False)
+        regular_url = "https://example.com"
+        result = await crawler.arun(
+            url=regular_url,
+            config=CrawlerRunConfig(
+                page_timeout=30000,
+                extraction_strategy=mock_strategy,
+                cache_mode=CacheMode.BYPASS
+            )
+        )
+
+        assert result.success
+        assert len(mock_strategy.run_calls) == 1
+        assert mock_strategy.run_calls[0] == regular_url, f"Expected '{regular_url}', got '{mock_strategy.run_calls[0]}'"
+
+@pytest.mark.asyncio
+async def test_extraction_strategy_run_with_raw_html():
+    """
+    Regression test for extraction_strategy.run URL parameter handling with raw HTML.
+
+    This test verifies that when is_raw_html=True (URL starts with "raw:"),
+    extraction_strategy.run is called with "Raw HTML" instead of the actual URL.
+    """
+    browser_config = BrowserConfig(
+        browser_type="chromium",
+        headless=True
+    )
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        mock_strategy = MockExtractionStrategy()
+
+        # Test raw HTML input (the "raw:" URL prefix causes is_raw_html=True to be set automatically)
+        raw_html_url = "raw:<html><body><h1>Test HTML</h1><p>This is a test.</p></body></html>"
+        result = await crawler.arun(
+            url=raw_html_url,
+            config=CrawlerRunConfig(
+                page_timeout=30000,
+                extraction_strategy=mock_strategy,
+                cache_mode=CacheMode.BYPASS
+            )
+        )
+
+        assert result.success
+        assert len(mock_strategy.run_calls) == 1
+        assert mock_strategy.run_calls[0] == "Raw HTML", f"Expected 'Raw HTML', got '{mock_strategy.run_calls[0]}'"
+
 if __name__ == "__main__":
     asyncio.run(test_viewport_config((1024, 768)))
     asyncio.run(test_memory_management())
     asyncio.run(test_rate_limiting())
-    asyncio.run(test_javascript_execution())
\ No newline at end of file
+    asyncio.run(test_javascript_execution())
+    asyncio.run(test_extraction_strategy_run_with_regular_url())
+    asyncio.run(test_extraction_strategy_run_with_raw_html())