Merge PR #1435: Add redirected_status_code to CrawlResult

Applied manually because the PR was based on older code and conflicted.
Also initialized redirected_status_code to None up front; without this,
the non-goto paths (file://, raw:, js_only) would have raised NameError.

Closes #1434
This commit is contained in:
unclecode
2026-02-06 09:23:54 +00:00
parent 0aacafed0a
commit 37a49c5315
3 changed files with 7 additions and 1 deletion

View File

@@ -523,7 +523,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
response_headers = {} response_headers = {}
execution_result = None execution_result = None
status_code = None status_code = None
redirected_url = url redirected_url = url
redirected_status_code = None
# Reset downloaded files list for new crawl # Reset downloaded files list for new crawl
self._downloaded_files = [] self._downloaded_files = []
@@ -709,6 +710,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
url, wait_until=config.wait_until, timeout=config.page_timeout url, wait_until=config.wait_until, timeout=config.page_timeout
) )
redirected_url = page.url redirected_url = page.url
redirected_status_code = response.status if response else None
except Error as e: except Error as e:
# Allow navigation to be aborted when downloading files # Allow navigation to be aborted when downloading files
# This is expected behavior for downloads in some browser engines # This is expected behavior for downloads in some browser engines
@@ -1072,6 +1074,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
self._downloaded_files if self._downloaded_files else None self._downloaded_files if self._downloaded_files else None
), ),
redirected_url=redirected_url, redirected_url=redirected_url,
redirected_status_code=redirected_status_code,
# Include captured data if enabled # Include captured data if enabled
network_requests=captured_requests if config.capture_network_requests else None, network_requests=captured_requests if config.capture_network_requests else None,
console_messages=captured_console if config.capture_console_messages else None, console_messages=captured_console if config.capture_console_messages else None,

View File

@@ -438,6 +438,7 @@ class AsyncWebCrawler:
# For raw: URLs, don't fall back to the raw HTML string as redirected_url # For raw: URLs, don't fall back to the raw HTML string as redirected_url
is_raw_url = url.startswith("raw:") or url.startswith("raw://") is_raw_url = url.startswith("raw:") or url.startswith("raw://")
crawl_result.redirected_url = async_response.redirected_url or (None if is_raw_url else url) crawl_result.redirected_url = async_response.redirected_url or (None if is_raw_url else url)
crawl_result.redirected_status_code = async_response.redirected_status_code
crawl_result.response_headers = async_response.response_headers crawl_result.response_headers = async_response.response_headers
crawl_result.downloaded_files = async_response.downloaded_files crawl_result.downloaded_files = async_response.downloaded_files
crawl_result.js_execution_result = js_execution_result crawl_result.js_execution_result = js_execution_result

View File

@@ -149,6 +149,7 @@ class CrawlResult(BaseModel):
ssl_certificate: Optional[SSLCertificate] = None ssl_certificate: Optional[SSLCertificate] = None
dispatch_result: Optional[DispatchResult] = None dispatch_result: Optional[DispatchResult] = None
redirected_url: Optional[str] = None redirected_url: Optional[str] = None
redirected_status_code: Optional[int] = None
network_requests: Optional[List[Dict[str, Any]]] = None network_requests: Optional[List[Dict[str, Any]]] = None
console_messages: Optional[List[Dict[str, Any]]] = None console_messages: Optional[List[Dict[str, Any]]] = None
tables: List[Dict] = Field(default_factory=list) # NEW [{headers,rows,caption,summary}] tables: List[Dict] = Field(default_factory=list) # NEW [{headers,rows,caption,summary}]
@@ -332,6 +333,7 @@ class AsyncCrawlResponse(BaseModel):
downloaded_files: Optional[List[str]] = None downloaded_files: Optional[List[str]] = None
ssl_certificate: Optional[SSLCertificate] = None ssl_certificate: Optional[SSLCertificate] = None
redirected_url: Optional[str] = None redirected_url: Optional[str] = None
redirected_status_code: Optional[int] = None
network_requests: Optional[List[Dict[str, Any]]] = None network_requests: Optional[List[Dict[str, Any]]] = None
console_messages: Optional[List[Dict[str, Any]]] = None console_messages: Optional[List[Dict[str, Any]]] = None