feat(crawler): add URL redirection tracking

Add capability to track and return final URLs after redirects in crawler responses. This enhancement helps users understand the actual destination of crawled URLs after any redirections.

Changes include:
- Added final_url tracking in AsyncPlaywrightCrawlerStrategy
- Added redirected_url field to CrawlResult model
- Updated AsyncWebCrawler to properly handle and store redirect URLs
- Fixed typo in documentation signature
This commit is contained in:
UncleCode
2025-01-19 19:53:38 +08:00
parent 8b6fe6a98f
commit 4b1309cbf2
4 changed files with 9 additions and 2 deletions

View File

@@ -1241,6 +1241,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
config.url = url
response_headers = {}
status_code = None
final_url = url
# Reset downloaded files list for new crawl
self._downloaded_files = []
@@ -1322,6 +1323,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
response = await page.goto(
url, wait_until=config.wait_until, timeout=config.page_timeout
)
final_url = page.url
except Error as e:
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
@@ -1601,6 +1603,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
downloaded_files=(
self._downloaded_files if self._downloaded_files else None
),
final_url=final_url,
)
except Exception as e:

View File

@@ -433,7 +433,7 @@ class AsyncWebCrawler:
)
# Process the HTML content
crawl_result = await self.aprocess_html(
crawl_result : CrawlResult = await self.aprocess_html(
url=url,
html=html,
extracted_content=extracted_content,
@@ -446,6 +446,7 @@ class AsyncWebCrawler:
)
crawl_result.status_code = async_response.status_code
crawl_result.redirected_url = async_response.final_url or url
crawl_result.response_headers = async_response.response_headers
crawl_result.downloaded_files = async_response.downloaded_files
crawl_result.ssl_certificate = (
@@ -509,6 +510,7 @@ class AsyncWebCrawler:
cached_result.success = bool(html)
cached_result.session_id = getattr(config, "session_id", None)
cached_result.redirected_url = cached_result.redirected_url or url
return cached_result
except Exception as e:

View File

@@ -117,6 +117,7 @@ class CrawlResult(BaseModel):
status_code: Optional[int] = None
ssl_certificate: Optional[SSLCertificate] = None
dispatch_result: Optional[DispatchResult] = None
redirected_url: Optional[str] = None
class Config:
arbitrary_types_allowed = True
@@ -131,6 +132,7 @@ class AsyncCrawlResponse(BaseModel):
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
downloaded_files: Optional[List[str]] = None
ssl_certificate: Optional[SSLCertificate] = None
final_url: Optional[str] = None
class Config:
arbitrary_types_allowed = True

View File

@@ -132,4 +132,4 @@ Throughout these sections, you'll find code samples you can **copy-paste** int
Thank you for joining me on this journey. Let's keep building an **open, democratic** approach to data extraction and AI together.
Happy Crawling!
*Unclecde, Founder & Maintainer of Crawl4AI*
*Unclecode, Founder & Maintainer of Crawl4AI*