Add base_url parameter to CrawlerRunConfig for raw HTML processing
When processing `raw:` HTML (e.g., from cache), the `url` parameter is meaningless for markdown link resolution. This commit adds a `base_url` parameter that can be set explicitly to provide the proper URL resolution context.

Changes:
- Add `base_url` parameter to `CrawlerRunConfig.__init__`
- Add `base_url` to `CrawlerRunConfig.from_kwargs`
- Update `aprocess_html` to use `base_url` for markdown generation

Usage:

    config = CrawlerRunConfig(base_url='https://example.com')
    result = await crawler.arun(url='raw:{html}', config=config)
This commit is contained in:
@@ -1292,6 +1292,7 @@ class CrawlerRunConfig():
|
||||
method: str = "GET",
|
||||
stream: bool = False,
|
||||
url: str = None,
|
||||
base_url: str = None, # Base URL for markdown link resolution (used with raw: HTML)
|
||||
check_robots_txt: bool = False,
|
||||
user_agent: str = None,
|
||||
user_agent_mode: str = None,
|
||||
@@ -1310,6 +1311,7 @@ class CrawlerRunConfig():
|
||||
):
|
||||
# TODO: Planning to set properties dynamically based on the __init__ signature
|
||||
self.url = url
|
||||
self.base_url = base_url # Base URL for markdown link resolution
|
||||
|
||||
# Content Processing Parameters
|
||||
self.word_count_threshold = word_count_threshold
|
||||
@@ -1701,6 +1703,7 @@ class CrawlerRunConfig():
|
||||
# Link Extraction Parameters
|
||||
link_preview_config=kwargs.get("link_preview_config"),
|
||||
url=kwargs.get("url"),
|
||||
base_url=kwargs.get("base_url"),
|
||||
# URL Matching Parameters
|
||||
url_matcher=kwargs.get("url_matcher"),
|
||||
match_mode=kwargs.get("match_mode", MatchMode.OR),
|
||||
|
||||
@@ -618,7 +618,8 @@ class AsyncWebCrawler:
|
||||
markdown_result: MarkdownGenerationResult = (
|
||||
markdown_generator.generate_markdown(
|
||||
input_html=markdown_input_html,
|
||||
base_url=params.get("redirected_url", url)
|
||||
# Use explicit base_url if provided (for raw: HTML), otherwise redirected_url, then url
|
||||
base_url=params.get("base_url") or params.get("redirected_url") or url
|
||||
# html2text_options=kwargs.get('html2text', {})
|
||||
)
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user