diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 758157a5..786d2fb9 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1241,6 +1241,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): config.url = url response_headers = {} status_code = None + final_url = url # Reset downloaded files list for new crawl self._downloaded_files = [] @@ -1322,6 +1323,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): response = await page.goto( url, wait_until=config.wait_until, timeout=config.page_timeout ) + final_url = page.url except Error as e: raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}") @@ -1601,6 +1603,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): downloaded_files=( self._downloaded_files if self._downloaded_files else None ), + final_url=final_url, ) except Exception as e: diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index 61dc4a51..61cfc18f 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -433,7 +433,7 @@ class AsyncWebCrawler: ) # Process the HTML content - crawl_result = await self.aprocess_html( + crawl_result: CrawlResult = await self.aprocess_html( url=url, html=html, extracted_content=extracted_content, @@ -446,6 +446,7 @@ class AsyncWebCrawler: ) crawl_result.status_code = async_response.status_code + crawl_result.redirected_url = async_response.final_url or url crawl_result.response_headers = async_response.response_headers crawl_result.downloaded_files = async_response.downloaded_files crawl_result.ssl_certificate = ( @@ -509,6 +510,7 @@ class AsyncWebCrawler: cached_result.success = bool(html) cached_result.session_id = getattr(config, "session_id", None) + cached_result.redirected_url = cached_result.redirected_url or url return cached_result except Exception as e: diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 217aced4..81e08b0c 100644 ---
a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -117,6 +117,7 @@ class CrawlResult(BaseModel): status_code: Optional[int] = None ssl_certificate: Optional[SSLCertificate] = None dispatch_result: Optional[DispatchResult] = None + redirected_url: Optional[str] = None class Config: arbitrary_types_allowed = True @@ -131,6 +132,7 @@ class AsyncCrawlResponse(BaseModel): get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None downloaded_files: Optional[List[str]] = None ssl_certificate: Optional[SSLCertificate] = None + final_url: Optional[str] = None class Config: arbitrary_types_allowed = True diff --git a/docs/md_v2/index.md b/docs/md_v2/index.md index 250c977d..7a230d5d 100644 --- a/docs/md_v2/index.md +++ b/docs/md_v2/index.md @@ -132,4 +132,4 @@ Throughout these sections, you’ll find code samples you can **copy-paste** int Thank you for joining me on this journey. Let’s keep building an **open, democratic** approach to data extraction and AI together. Happy Crawling! -— *Unclecde, Founder & Maintainer of Crawl4AI* +— *Unclecode, Founder & Maintainer of Crawl4AI*