diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index 0ee0f728..f289fc28 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -1292,6 +1292,7 @@ class CrawlerRunConfig(): method: str = "GET", stream: bool = False, url: str = None, + base_url: str = None, # Base URL for markdown link resolution (used with raw: HTML) check_robots_txt: bool = False, user_agent: str = None, user_agent_mode: str = None, @@ -1310,6 +1311,7 @@ class CrawlerRunConfig(): ): # TODO: Planning to set properties dynamically based on the __init__ signature self.url = url + self.base_url = base_url # Base URL for markdown link resolution # Content Processing Parameters self.word_count_threshold = word_count_threshold @@ -1701,6 +1703,7 @@ class CrawlerRunConfig(): # Link Extraction Parameters link_preview_config=kwargs.get("link_preview_config"), url=kwargs.get("url"), + base_url=kwargs.get("base_url"), # URL Matching Parameters url_matcher=kwargs.get("url_matcher"), match_mode=kwargs.get("match_mode", MatchMode.OR), diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 50adfaba..94aec156 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -618,7 +618,8 @@ class AsyncWebCrawler: markdown_result: MarkdownGenerationResult = ( markdown_generator.generate_markdown( input_html=markdown_input_html, - base_url=params.get("redirected_url", url) + # Use explicit base_url if provided (for raw: HTML), otherwise redirected_url, then url + base_url=params.get("base_url") or params.get("redirected_url") or url # html2text_options=kwargs.get('html2text', {}) ) )