Add base_url parameter to CrawlerRunConfig for raw HTML processing
When processing `raw:` HTML (e.g., from cache), the `url` parameter is meaningless for markdown link resolution. This commit adds a `base_url` parameter that can be set explicitly to provide the proper URL-resolution context. When `base_url` is not set, markdown generation falls back to `redirected_url`, and then to `url`.

Changes:
- Add `base_url` parameter to `CrawlerRunConfig.__init__`
- Add `base_url` to `CrawlerRunConfig.from_kwargs`
- Update `aprocess_html` to use `base_url` for markdown generation

Usage:

    config = CrawlerRunConfig(base_url='https://example.com')
    result = await crawler.arun(url='raw:{html}', config=config)
This commit is contained in:
@@ -1292,6 +1292,7 @@ class CrawlerRunConfig():
|
|||||||
method: str = "GET",
|
method: str = "GET",
|
||||||
stream: bool = False,
|
stream: bool = False,
|
||||||
url: str = None,
|
url: str = None,
|
||||||
|
base_url: str = None, # Base URL for markdown link resolution (used with raw: HTML)
|
||||||
check_robots_txt: bool = False,
|
check_robots_txt: bool = False,
|
||||||
user_agent: str = None,
|
user_agent: str = None,
|
||||||
user_agent_mode: str = None,
|
user_agent_mode: str = None,
|
||||||
@@ -1310,6 +1311,7 @@ class CrawlerRunConfig():
|
|||||||
):
|
):
|
||||||
# TODO: Planning to set properties dynamically based on the __init__ signature
|
# TODO: Planning to set properties dynamically based on the __init__ signature
|
||||||
self.url = url
|
self.url = url
|
||||||
|
self.base_url = base_url # Base URL for markdown link resolution
|
||||||
|
|
||||||
# Content Processing Parameters
|
# Content Processing Parameters
|
||||||
self.word_count_threshold = word_count_threshold
|
self.word_count_threshold = word_count_threshold
|
||||||
@@ -1701,6 +1703,7 @@ class CrawlerRunConfig():
|
|||||||
# Link Extraction Parameters
|
# Link Extraction Parameters
|
||||||
link_preview_config=kwargs.get("link_preview_config"),
|
link_preview_config=kwargs.get("link_preview_config"),
|
||||||
url=kwargs.get("url"),
|
url=kwargs.get("url"),
|
||||||
|
base_url=kwargs.get("base_url"),
|
||||||
# URL Matching Parameters
|
# URL Matching Parameters
|
||||||
url_matcher=kwargs.get("url_matcher"),
|
url_matcher=kwargs.get("url_matcher"),
|
||||||
match_mode=kwargs.get("match_mode", MatchMode.OR),
|
match_mode=kwargs.get("match_mode", MatchMode.OR),
|
||||||
|
|||||||
@@ -618,7 +618,8 @@ class AsyncWebCrawler:
|
|||||||
markdown_result: MarkdownGenerationResult = (
|
markdown_result: MarkdownGenerationResult = (
|
||||||
markdown_generator.generate_markdown(
|
markdown_generator.generate_markdown(
|
||||||
input_html=markdown_input_html,
|
input_html=markdown_input_html,
|
||||||
base_url=params.get("redirected_url", url)
|
# Use explicit base_url if provided (for raw: HTML), otherwise redirected_url, then url
|
||||||
|
base_url=params.get("base_url") or params.get("redirected_url") or url
|
||||||
# html2text_options=kwargs.get('html2text', {})
|
# html2text_options=kwargs.get('html2text', {})
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|||||||
Reference in New Issue
Block a user