From 3937efcf0b0f2d2971e1935f58884d8b63d30b83 Mon Sep 17 00:00:00 2001 From: unclecode Date: Wed, 24 Dec 2025 06:05:55 +0000 Subject: [PATCH] Add base_url parameter to CrawlerRunConfig for raw HTML processing When processing raw: HTML (e.g., from cache), the URL parameter is meaningless for markdown link resolution. This adds a base_url parameter that can be set explicitly to provide proper URL resolution context. Changes: - Add base_url parameter to CrawlerRunConfig.__init__ - Add base_url to CrawlerRunConfig.from_kwargs - Update aprocess_html to use base_url for markdown generation Usage: config = CrawlerRunConfig(base_url='https://example.com') result = await crawler.arun(url=f'raw:{html}', config=config) --- crawl4ai/async_configs.py | 3 +++ crawl4ai/async_webcrawler.py | 3 ++- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 0ee0f728..f289fc28 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1292,6 +1292,7 @@ class CrawlerRunConfig(): method: str = "GET", stream: bool = False, url: str = None, + base_url: str = None, # Base URL for markdown link resolution (used with raw: HTML) check_robots_txt: bool = False, user_agent: str = None, user_agent_mode: str = None, @@ -1310,6 +1311,7 @@ class CrawlerRunConfig(): ): # TODO: Planning to set properties dynamically based on the __init__ signature self.url = url + self.base_url = base_url # Base URL for markdown link resolution # Content Processing Parameters self.word_count_threshold = word_count_threshold @@ -1701,6 +1703,7 @@ class CrawlerRunConfig(): # Link Extraction Parameters link_preview_config=kwargs.get("link_preview_config"), url=kwargs.get("url"), + base_url=kwargs.get("base_url"), # URL Matching Parameters url_matcher=kwargs.get("url_matcher"), match_mode=kwargs.get("match_mode", MatchMode.OR), diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 50adfaba..94aec156 100644 --- 
a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -618,7 +618,8 @@ class AsyncWebCrawler: markdown_result: MarkdownGenerationResult = ( markdown_generator.generate_markdown( input_html=markdown_input_html, - base_url=params.get("redirected_url", url) + # Use explicit base_url if provided (for raw: HTML), otherwise redirected_url, then url + base_url=params.get("base_url") or params.get("redirected_url") or url # html2text_options=kwargs.get('html2text', {}) ) )