From 4b1309cbf252d307365456f5a8f7043aef817d00 Mon Sep 17 00:00:00 2001
From: UncleCode
Date: Sun, 19 Jan 2025 19:53:38 +0800
Subject: [PATCH] feat(crawler): add URL redirection tracking

Add capability to track and return final URLs after redirects in crawler
responses. This enhancement helps users understand the actual destination
of crawled URLs after any redirections.

Changes include:
- Added final_url tracking in AsyncPlaywrightCrawlerStrategy
- Added redirected_url field to CrawlResult model
- Updated AsyncWebCrawler to properly handle and store redirect URLs
- Fixed typo in documentation signature
---
 crawl4ai/async_crawler_strategy.py | 3 +++
 crawl4ai/async_webcrawler.py       | 4 +++-
 crawl4ai/models.py                 | 2 ++
 docs/md_v2/index.md                | 2 +-
 4 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 758157a5..786d2fb9 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -1241,6 +1241,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
         config.url = url
         response_headers = {}
         status_code = None
+        final_url = url
 
         # Reset downloaded files list for new crawl
         self._downloaded_files = []
@@ -1322,6 +1323,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 response = await page.goto(
                     url, wait_until=config.wait_until, timeout=config.page_timeout
                 )
+                final_url = page.url
             except Error as e:
                 raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
 
@@ -1601,6 +1603,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                 downloaded_files=(
                     self._downloaded_files if self._downloaded_files else None
                 ),
+                final_url=final_url,
             )
 
         except Exception as e:
diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py
index 61dc4a51..61cfc18f 100644
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -433,7 +433,7 @@ class AsyncWebCrawler:
             )
 
             # Process the HTML content
-            crawl_result = await self.aprocess_html(
+            crawl_result : CrawlResult = await self.aprocess_html(
                 url=url,
                 html=html,
                 extracted_content=extracted_content,
@@ -446,6 +446,7 @@
             )
 
             crawl_result.status_code = async_response.status_code
+            crawl_result.redirected_url = async_response.final_url or url
             crawl_result.response_headers = async_response.response_headers
             crawl_result.downloaded_files = async_response.downloaded_files
             crawl_result.ssl_certificate = (
@@ -509,6 +510,7 @@
 
                 cached_result.success = bool(html)
                 cached_result.session_id = getattr(config, "session_id", None)
+                cached_result.redirected_url = cached_result.redirected_url or url
                 return cached_result
 
         except Exception as e:
diff --git a/crawl4ai/models.py b/crawl4ai/models.py
index 217aced4..81e08b0c 100644
--- a/crawl4ai/models.py
+++ b/crawl4ai/models.py
@@ -117,6 +117,7 @@ class CrawlResult(BaseModel):
     status_code: Optional[int] = None
     ssl_certificate: Optional[SSLCertificate] = None
     dispatch_result: Optional[DispatchResult] = None
+    redirected_url: Optional[str] = None
 
     class Config:
         arbitrary_types_allowed = True
@@ -131,6 +132,7 @@
     get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
     downloaded_files: Optional[List[str]] = None
     ssl_certificate: Optional[SSLCertificate] = None
+    final_url: Optional[str] = None
 
     class Config:
         arbitrary_types_allowed = True
diff --git a/docs/md_v2/index.md b/docs/md_v2/index.md
index 250c977d..7a230d5d 100644
--- a/docs/md_v2/index.md
+++ b/docs/md_v2/index.md
@@ -132,4 +132,4 @@ Throughout these sections, you’ll find code samples you can **copy-paste** int
 Thank you for joining me on this journey. Let’s keep building an **open, democratic** approach to data extraction and AI together.
 
 Happy Crawling!
-— *Unclecde, Founder & Maintainer of Crawl4AI*
+— *Unclecode, Founder & Maintainer of Crawl4AI*