From 1d6a2b9979d530703ec76708a385a2d87a1b5f7d Mon Sep 17 00:00:00 2001 From: ntohidi Date: Wed, 30 Apr 2025 12:29:17 +0200 Subject: [PATCH] fix(crawler): surface real redirect status codes and keep redirect chain. the 30x response instead of always returning 200. Refs #660 --- crawl4ai/async_crawler_strategy.py | 25 +++++++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 3162bd54..da5490b6 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -744,12 +744,33 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): "after_goto", page, context=context, url=url, response=response, config=config ) + # ────────────────────────────────────────────────────────────── + # Walk the redirect chain. Playwright returns only the last + # hop, so we trace the `request.redirected_from` links until the + # first response that differs from the final one and surface its + # status-code. + # ────────────────────────────────────────────────────────────── if response is None: status_code = 200 response_headers = {} else: - status_code = response.status - response_headers = response.headers + first_resp = response + req = response.request + while req and req.redirected_from: + prev_req = req.redirected_from + prev_resp = await prev_req.response() + if prev_resp: # keep earliest + first_resp = prev_resp + req = prev_req + + status_code = first_resp.status + response_headers = first_resp.headers + # if response is None: + # status_code = 200 + # response_headers = {} + # else: + # status_code = response.status + # response_headers = response.headers else: status_code = 200