Add base_url parameter to CrawlerRunConfig for raw HTML processing

When processing raw: HTML (e.g., from cache), the url argument holds the HTML itself rather than an address, so it is meaningless as a base for markdown link resolution. This commit adds a base_url parameter that can be set explicitly to provide the proper URL-resolution context.

Changes:
- Add base_url parameter to CrawlerRunConfig.__init__
- Add base_url to CrawlerRunConfig.from_kwargs
- Update aprocess_html to use base_url for markdown generation (falling back to redirected_url, then url, when base_url is not set)

Usage:
    config = CrawlerRunConfig(base_url='https://example.com')
    result = await crawler.arun(url='raw:{html}', config=config)
2025-12-24 06:05:55 +00:00
parent 624e34164d
commit 3937efcf0b
2 changed files with 5 additions and 1 deletions
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -1292,6 +1292,7 @@ class CrawlerRunConfig():
        method: str = "GET",
        stream: bool = False,
        url: str = None,
+        base_url: str = None,  # Base URL for markdown link resolution (used with raw: HTML)
        check_robots_txt: bool = False,
        user_agent: str = None,
        user_agent_mode: str = None,
@@ -1310,6 +1311,7 @@ class CrawlerRunConfig():
    ):
        # TODO: Planning to set properties dynamically based on the __init__ signature
        self.url = url
+        self.base_url = base_url  # Base URL for markdown link resolution

        # Content Processing Parameters
        self.word_count_threshold = word_count_threshold
@@ -1701,6 +1703,7 @@ class CrawlerRunConfig():
            # Link Extraction Parameters
            link_preview_config=kwargs.get("link_preview_config"),
            url=kwargs.get("url"),
+            base_url=kwargs.get("base_url"),
            # URL Matching Parameters
            url_matcher=kwargs.get("url_matcher"),
            match_mode=kwargs.get("match_mode", MatchMode.OR),
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -618,7 +618,8 @@ class AsyncWebCrawler:
        markdown_result: MarkdownGenerationResult = (
            markdown_generator.generate_markdown(
                input_html=markdown_input_html,
-                base_url=params.get("redirected_url", url)
+                # Use explicit base_url if provided (for raw: HTML), otherwise redirected_url, then url
+                base_url=params.get("base_url") or params.get("redirected_url") or url
                # html2text_options=kwargs.get('html2text', {})
            )
        )