Merge PR #1435: Add redirected_status_code to CrawlResult
Applied manually due to conflicts (the PR was based on older code). Also fixed a missing variable initialization for the non-goto paths (file://, raw:, js_only) that would otherwise have caused a NameError. Closes #1434.
This commit is contained in:
@@ -523,7 +523,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
response_headers = {}
|
response_headers = {}
|
||||||
execution_result = None
|
execution_result = None
|
||||||
status_code = None
|
status_code = None
|
||||||
redirected_url = url
|
redirected_url = url
|
||||||
|
redirected_status_code = None
|
||||||
|
|
||||||
# Reset downloaded files list for new crawl
|
# Reset downloaded files list for new crawl
|
||||||
self._downloaded_files = []
|
self._downloaded_files = []
|
||||||
@@ -709,6 +710,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
url, wait_until=config.wait_until, timeout=config.page_timeout
|
url, wait_until=config.wait_until, timeout=config.page_timeout
|
||||||
)
|
)
|
||||||
redirected_url = page.url
|
redirected_url = page.url
|
||||||
|
redirected_status_code = response.status if response else None
|
||||||
except Error as e:
|
except Error as e:
|
||||||
# Allow navigation to be aborted when downloading files
|
# Allow navigation to be aborted when downloading files
|
||||||
# This is expected behavior for downloads in some browser engines
|
# This is expected behavior for downloads in some browser engines
|
||||||
@@ -1072,6 +1074,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
self._downloaded_files if self._downloaded_files else None
|
self._downloaded_files if self._downloaded_files else None
|
||||||
),
|
),
|
||||||
redirected_url=redirected_url,
|
redirected_url=redirected_url,
|
||||||
|
redirected_status_code=redirected_status_code,
|
||||||
# Include captured data if enabled
|
# Include captured data if enabled
|
||||||
network_requests=captured_requests if config.capture_network_requests else None,
|
network_requests=captured_requests if config.capture_network_requests else None,
|
||||||
console_messages=captured_console if config.capture_console_messages else None,
|
console_messages=captured_console if config.capture_console_messages else None,
|
||||||
|
|||||||
@@ -438,6 +438,7 @@ class AsyncWebCrawler:
|
|||||||
# For raw: URLs, don't fall back to the raw HTML string as redirected_url
|
# For raw: URLs, don't fall back to the raw HTML string as redirected_url
|
||||||
is_raw_url = url.startswith("raw:") or url.startswith("raw://")
|
is_raw_url = url.startswith("raw:") or url.startswith("raw://")
|
||||||
crawl_result.redirected_url = async_response.redirected_url or (None if is_raw_url else url)
|
crawl_result.redirected_url = async_response.redirected_url or (None if is_raw_url else url)
|
||||||
|
crawl_result.redirected_status_code = async_response.redirected_status_code
|
||||||
crawl_result.response_headers = async_response.response_headers
|
crawl_result.response_headers = async_response.response_headers
|
||||||
crawl_result.downloaded_files = async_response.downloaded_files
|
crawl_result.downloaded_files = async_response.downloaded_files
|
||||||
crawl_result.js_execution_result = js_execution_result
|
crawl_result.js_execution_result = js_execution_result
|
||||||
|
|||||||
@@ -149,6 +149,7 @@ class CrawlResult(BaseModel):
|
|||||||
ssl_certificate: Optional[SSLCertificate] = None
|
ssl_certificate: Optional[SSLCertificate] = None
|
||||||
dispatch_result: Optional[DispatchResult] = None
|
dispatch_result: Optional[DispatchResult] = None
|
||||||
redirected_url: Optional[str] = None
|
redirected_url: Optional[str] = None
|
||||||
|
redirected_status_code: Optional[int] = None
|
||||||
network_requests: Optional[List[Dict[str, Any]]] = None
|
network_requests: Optional[List[Dict[str, Any]]] = None
|
||||||
console_messages: Optional[List[Dict[str, Any]]] = None
|
console_messages: Optional[List[Dict[str, Any]]] = None
|
||||||
tables: List[Dict] = Field(default_factory=list) # NEW – [{headers,rows,caption,summary}]
|
tables: List[Dict] = Field(default_factory=list) # NEW – [{headers,rows,caption,summary}]
|
||||||
@@ -332,6 +333,7 @@ class AsyncCrawlResponse(BaseModel):
|
|||||||
downloaded_files: Optional[List[str]] = None
|
downloaded_files: Optional[List[str]] = None
|
||||||
ssl_certificate: Optional[SSLCertificate] = None
|
ssl_certificate: Optional[SSLCertificate] = None
|
||||||
redirected_url: Optional[str] = None
|
redirected_url: Optional[str] = None
|
||||||
|
redirected_status_code: Optional[int] = None
|
||||||
network_requests: Optional[List[Dict[str, Any]]] = None
|
network_requests: Optional[List[Dict[str, Any]]] = None
|
||||||
console_messages: Optional[List[Dict[str, Any]]] = None
|
console_messages: Optional[List[Dict[str, Any]]] = None
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user