feat(crawler): add URL redirection tracking
Add capability to track and return final URLs after redirects in crawler responses. This enhancement helps users understand the actual destination of crawled URLs after any redirections.

Changes include:
- Added final_url tracking in AsyncPlaywrightCrawlerStrategy
- Added redirected_url field to CrawlResult model
- Updated AsyncWebCrawler to properly handle and store redirect URLs
- Fixed typo in documentation signature
This commit is contained in:
@@ -1241,6 +1241,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
config.url = url
|
||||
response_headers = {}
|
||||
status_code = None
|
||||
final_url = url
|
||||
|
||||
# Reset downloaded files list for new crawl
|
||||
self._downloaded_files = []
|
||||
@@ -1322,6 +1323,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
response = await page.goto(
|
||||
url, wait_until=config.wait_until, timeout=config.page_timeout
|
||||
)
|
||||
final_url = page.url
|
||||
except Error as e:
|
||||
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
|
||||
|
||||
@@ -1601,6 +1603,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
downloaded_files=(
|
||||
self._downloaded_files if self._downloaded_files else None
|
||||
),
|
||||
final_url=final_url,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
|
||||
@@ -433,7 +433,7 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
# Process the HTML content
|
||||
crawl_result = await self.aprocess_html(
|
||||
crawl_result : CrawlResult = await self.aprocess_html(
|
||||
url=url,
|
||||
html=html,
|
||||
extracted_content=extracted_content,
|
||||
@@ -446,6 +446,7 @@ class AsyncWebCrawler:
|
||||
)
|
||||
|
||||
crawl_result.status_code = async_response.status_code
|
||||
crawl_result.redirected_url = async_response.final_url or url
|
||||
crawl_result.response_headers = async_response.response_headers
|
||||
crawl_result.downloaded_files = async_response.downloaded_files
|
||||
crawl_result.ssl_certificate = (
|
||||
@@ -509,6 +510,7 @@ class AsyncWebCrawler:
|
||||
|
||||
cached_result.success = bool(html)
|
||||
cached_result.session_id = getattr(config, "session_id", None)
|
||||
cached_result.redirected_url = cached_result.redirected_url or url
|
||||
return cached_result
|
||||
|
||||
except Exception as e:
|
||||
|
||||
@@ -117,6 +117,7 @@ class CrawlResult(BaseModel):
|
||||
status_code: Optional[int] = None
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
dispatch_result: Optional[DispatchResult] = None
|
||||
redirected_url: Optional[str] = None
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
@@ -131,6 +132,7 @@ class AsyncCrawlResponse(BaseModel):
|
||||
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
|
||||
downloaded_files: Optional[List[str]] = None
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
final_url: Optional[str] = None
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
@@ -132,4 +132,4 @@ Throughout these sections, you’ll find code samples you can **copy-paste** int
|
||||
Thank you for joining me on this journey. Let’s keep building an **open, democratic** approach to data extraction and AI together.
|
||||
|
||||
Happy Crawling!
|
||||
— *Unclecde, Founder & Maintainer of Crawl4AI*
|
||||
— *Unclecode, Founder & Maintainer of Crawl4AI*
|
||||
|
||||
Reference in New Issue
Block a user