From 37a49c53153fe30b175bcf2a8f4887229e4fe9b9 Mon Sep 17 00:00:00 2001 From: unclecode Date: Fri, 6 Feb 2026 09:23:54 +0000 Subject: [PATCH] Merge PR #1435: Add redirected_status_code to CrawlResult Applied manually due to conflicts (PR based on older code). Also fixed missing variable initialization for non-goto paths (file://, raw:, js_only) that would have caused NameError. Closes #1434 --- crawl4ai/async_crawler_strategy.py | 5 ++++- crawl4ai/async_webcrawler.py | 1 + crawl4ai/models.py | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index a18264fb..944bb2b7 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -523,7 +523,8 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): response_headers = {} execution_result = None status_code = None - redirected_url = url + redirected_url = url + redirected_status_code = None # Reset downloaded files list for new crawl self._downloaded_files = [] @@ -709,6 +710,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): url, wait_until=config.wait_until, timeout=config.page_timeout ) redirected_url = page.url + redirected_status_code = response.status if response else None except Error as e: # Allow navigation to be aborted when downloading files # This is expected behavior for downloads in some browser engines @@ -1072,6 +1074,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self._downloaded_files if self._downloaded_files else None ), redirected_url=redirected_url, + redirected_status_code=redirected_status_code, # Include captured data if enabled network_requests=captured_requests if config.capture_network_requests else None, console_messages=captured_console if config.capture_console_messages else None, diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index d0213415..70473b04 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -438,6 +438,7 @@ class AsyncWebCrawler: # For raw: URLs, don't fall back to the raw HTML string as redirected_url is_raw_url = url.startswith("raw:") or url.startswith("raw://") crawl_result.redirected_url = async_response.redirected_url or (None if is_raw_url else url) + crawl_result.redirected_status_code = async_response.redirected_status_code crawl_result.response_headers = async_response.response_headers crawl_result.downloaded_files = async_response.downloaded_files crawl_result.js_execution_result = js_execution_result diff --git a/crawl4ai/models.py b/crawl4ai/models.py index 930cda23..f8449576 100644 --- a/crawl4ai/models.py +++ b/crawl4ai/models.py @@ -149,6 +149,7 @@ class CrawlResult(BaseModel): ssl_certificate: Optional[SSLCertificate] = None dispatch_result: Optional[DispatchResult] = None redirected_url: Optional[str] = None + redirected_status_code: Optional[int] = None network_requests: Optional[List[Dict[str, Any]]] = None console_messages: Optional[List[Dict[str, Any]]] = None tables: List[Dict] = Field(default_factory=list) # NEW – [{headers,rows,caption,summary}] @@ -332,6 +333,7 @@ class AsyncCrawlResponse(BaseModel): downloaded_files: Optional[List[str]] = None ssl_certificate: Optional[SSLCertificate] = None redirected_url: Optional[str] = None + redirected_status_code: Optional[int] = None network_requests: Optional[List[Dict[str, Any]]] = None console_messages: Optional[List[Dict[str, Any]]] = None