From 3937efcf0b0f2d2971e1935f58884d8b63d30b83 Mon Sep 17 00:00:00 2001
From: unclecode <unclecode@kidocode.com>
Date: Wed, 24 Dec 2025 06:05:55 +0000
Subject: [PATCH] Add base_url parameter to CrawlerRunConfig for raw HTML
 processing

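When processing raw: HTML (e.g., content loaded from a cache), the url
argument carries the HTML payload itself rather than a real address, so it
cannot serve as the base for resolving relative links during markdown
generation. This adds a base_url parameter that can be set explicitly to
provide the proper URL resolution context.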

Changes:
- Add base_url parameter to CrawlerRunConfig.__init__
- Add base_url to CrawlerRunConfig.from_kwargs
- Update aprocess_html to use base_url for markdown generation, falling back
  to redirected_url and then url when base_url is unset or empty

Usage:
  config = CrawlerRunConfig(base_url='https://example.com')
  result = await crawler.arun(url=f'raw:{html}', config=config)
---
 crawl4ai/async_configs.py    | 3 +++
 crawl4ai/async_webcrawler.py | 3 ++-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index 0ee0f728..f289fc28 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1292,6 +1292,7 @@ class CrawlerRunConfig():
         method: str = "GET",
         stream: bool = False,
         url: str = None,
+        base_url: str = None,  # Base URL for markdown link resolution (used with raw: HTML)
         check_robots_txt: bool = False,
         user_agent: str = None,
         user_agent_mode: str = None,
@@ -1310,6 +1311,7 @@ class CrawlerRunConfig():
     ):
         # TODO: Planning to set properties dynamically based on the __init__ signature
         self.url = url
+        self.base_url = base_url  # Base URL for markdown link resolution
 
         # Content Processing Parameters
         self.word_count_threshold = word_count_threshold
@@ -1701,6 +1703,7 @@ class CrawlerRunConfig():
             # Link Extraction Parameters
             link_preview_config=kwargs.get("link_preview_config"),
             url=kwargs.get("url"),
+            base_url=kwargs.get("base_url"),
             # URL Matching Parameters
             url_matcher=kwargs.get("url_matcher"),
             match_mode=kwargs.get("match_mode", MatchMode.OR),
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 50adfaba..94aec156 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -618,7 +618,8 @@ class AsyncWebCrawler:
         markdown_result: MarkdownGenerationResult = (
             markdown_generator.generate_markdown(
                 input_html=markdown_input_html,
-                base_url=params.get("redirected_url", url)
+                # Use explicit base_url if provided (for raw: HTML), otherwise redirected_url, then url
+                base_url=params.get("base_url") or params.get("redirected_url") or url
                 # html2text_options=kwargs.get('html2text', {})
             )
         )