From 585e5e5973a264ac22343f9a4fdef54048b3b31f Mon Sep 17 00:00:00 2001 From: Aravind Karnam Date: Tue, 25 Mar 2025 15:17:59 +0530 Subject: [PATCH] fix: https://github.com/unclecode/crawl4ai/issues/733 --- crawl4ai/async_webcrawler.py | 3 ++- crawl4ai/content_scraping_strategy.py | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 98111e4b..91b98d7f 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -448,6 +448,7 @@ class AsyncWebCrawler: pdf_data=pdf_data, verbose=config.verbose, is_raw_html=True if url.startswith("raw:") else False, + redirected_url=async_response.redirected_url, **kwargs, ) @@ -596,7 +597,7 @@ class AsyncWebCrawler: markdown_result: MarkdownGenerationResult = ( markdown_generator.generate_markdown( cleaned_html=cleaned_html, - base_url=url, + base_url=params.get("redirected_url", url), # html2text_options=kwargs.get('html2text', {}) ) ) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 215e7cda..0848d655 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -128,7 +128,8 @@ class WebScrapingStrategy(ContentScrapingStrategy): Returns: ScrapingResult: A structured result containing the scraped content. """ - raw_result = self._scrap(url, html, is_async=False, **kwargs) + actual_url = kwargs.get("redirected_url", url) + raw_result = self._scrap(actual_url, html, is_async=False, **kwargs) if raw_result is None: return ScrapingResult( cleaned_html="",