Merge PR #1435: Add redirected_status_code to CrawlResult
Applied manually because of merge conflicts (the PR was based on older code). Also initialized redirected_status_code up front so that the non-goto paths (file://, raw:, js_only) no longer raise a NameError. Closes #1434.
This commit is contained in:
@@ -523,7 +523,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
response_headers = {}
|
||||
execution_result = None
|
||||
status_code = None
|
||||
redirected_url = url
|
||||
redirected_url = url
|
||||
redirected_status_code = None
|
||||
|
||||
# Reset downloaded files list for new crawl
|
||||
self._downloaded_files = []
|
||||
@@ -709,6 +710,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
url, wait_until=config.wait_until, timeout=config.page_timeout
|
||||
)
|
||||
redirected_url = page.url
|
||||
redirected_status_code = response.status if response else None
|
||||
except Error as e:
|
||||
# Allow navigation to be aborted when downloading files
|
||||
# This is expected behavior for downloads in some browser engines
|
||||
@@ -1072,6 +1074,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
self._downloaded_files if self._downloaded_files else None
|
||||
),
|
||||
redirected_url=redirected_url,
|
||||
redirected_status_code=redirected_status_code,
|
||||
# Include captured data if enabled
|
||||
network_requests=captured_requests if config.capture_network_requests else None,
|
||||
console_messages=captured_console if config.capture_console_messages else None,
|
||||
|
||||
@@ -438,6 +438,7 @@ class AsyncWebCrawler:
|
||||
# For raw: URLs, don't fall back to the raw HTML string as redirected_url
|
||||
is_raw_url = url.startswith("raw:") or url.startswith("raw://")
|
||||
crawl_result.redirected_url = async_response.redirected_url or (None if is_raw_url else url)
|
||||
crawl_result.redirected_status_code = async_response.redirected_status_code
|
||||
crawl_result.response_headers = async_response.response_headers
|
||||
crawl_result.downloaded_files = async_response.downloaded_files
|
||||
crawl_result.js_execution_result = js_execution_result
|
||||
|
||||
@@ -149,6 +149,7 @@ class CrawlResult(BaseModel):
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
dispatch_result: Optional[DispatchResult] = None
|
||||
redirected_url: Optional[str] = None
|
||||
redirected_status_code: Optional[int] = None
|
||||
network_requests: Optional[List[Dict[str, Any]]] = None
|
||||
console_messages: Optional[List[Dict[str, Any]]] = None
|
||||
tables: List[Dict] = Field(default_factory=list) # NEW – [{headers,rows,caption,summary}]
|
||||
@@ -332,6 +333,7 @@ class AsyncCrawlResponse(BaseModel):
|
||||
downloaded_files: Optional[List[str]] = None
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
redirected_url: Optional[str] = None
|
||||
redirected_status_code: Optional[int] = None
|
||||
network_requests: Optional[List[Dict[str, Any]]] = None
|
||||
console_messages: Optional[List[Dict[str, Any]]] = None
|
||||
|
||||
|
||||
Reference in New Issue
Block a user