Subject: Capture post-JS-redirect URL in AsyncPlaywrightCrawlerStrategy, plus repro script

NOTE(review): this free text precedes the first `diff --git` header, so
`git apply` / `git am` ignore it as the patch description; the patch bytes
below are unchanged.

- The strategy change reads `page.url` immediately before constructing
  AsyncCrawlResponse, so `redirected_url` reflects client-side (JavaScript)
  navigation that happened after load — presumably the fix for the
  "redirected_url still shows the original URL" bug the new script probes.
  TODO confirm `redirected_url` is actually passed into AsyncCrawlResponse;
  the assignment alone is only visible here, the constructor arguments in
  the context lines do not show it.
- tests/async/test_redirect_url_resolution.py is a print-based repro
  script, not a pytest test: it contains no assertions, so it can never
  fail in CI. TODO: turn the "BUG #1" / "BUG #2" prints into asserts.
- The script hard-codes a machine-specific path
  (sys.path.insert(0, '/Users/nasrin/vscode/c4ai-uc/develop')) and a fixed
  port 8769; both should be removed/parameterized before merging.
- Several check-mark glyphs appear mojibake-encoded in this copy of the
  patch ("āœ“"/"āœ—" where "✓"/"✗" were presumably intended) — verify the
  file is committed as UTF-8.
- The HTML payloads inside do_GET look truncated in this copy (the
  <script> performing the 200 ms redirect and the "subpage" anchor tags are
  missing); confirm against the original patch before applying, since the
  test's wait_for="css:#target-nav" and link-resolution checks depend on them.

diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 76977bb9..2850b36a 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -1023,6 +1023,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): final_messages = await self.adapter.retrieve_console_messages(page) captured_console.extend(final_messages) + ### + # This ensures we capture the current page URL at the time we return the response, + # which correctly reflects any JavaScript navigation that occurred. + ### + redirected_url = page.url # Use current page URL to capture JS redirects + # Return complete response return AsyncCrawlResponse( html=html, diff --git a/tests/async/test_redirect_url_resolution.py b/tests/async/test_redirect_url_resolution.py new file mode 100644 index 00000000..cce3e512 --- /dev/null +++ b/tests/async/test_redirect_url_resolution.py @@ -0,0 +1,118 @@ +"""Test delayed redirect WITH wait_for - does link resolution use correct URL?""" +import asyncio +import threading +from http.server import HTTPServer, SimpleHTTPRequestHandler + +class RedirectTestHandler(SimpleHTTPRequestHandler): + def log_message(self, format, *args): + pass + + def do_GET(self): + if self.path == "/page-a": + self.send_response(200) + self.send_header("Content-type", "text/html") + self.end_headers() + content = """ + + + Page A + +

Page A - Will redirect after 200ms

+ + + + """ + self.wfile.write(content.encode()) + elif self.path.startswith("/redirect-target"): + self.send_response(200) + self.send_header("Content-type", "text/html") + self.end_headers() + content = """ + + + Redirect Target + +

Redirect Target

+ + + + """ + self.wfile.write(content.encode()) + else: + self.send_response(404) + self.end_headers() + +async def main(): + import socket + class ReuseAddrHTTPServer(HTTPServer): + allow_reuse_address = True + + server = ReuseAddrHTTPServer(("localhost", 8769), RedirectTestHandler) + thread = threading.Thread(target=server.serve_forever) + thread.daemon = True + thread.start() + + try: + import sys + sys.path.insert(0, '/Users/nasrin/vscode/c4ai-uc/develop') + from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + + print("=" * 60) + print("TEST: Delayed JS redirect WITH wait_for='css:#target-nav'") + print("This waits for the redirect to complete") + print("=" * 60) + + browser_config = BrowserConfig(headless=True, verbose=False) + crawl_config = CrawlerRunConfig( + cache_mode="bypass", + wait_for="css:#target-nav" # Wait for element on redirect target + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="http://localhost:8769/page-a", + config=crawl_config + ) + + print(f"Original URL: http://localhost:8769/page-a") + print(f"Redirected URL returned: {result.redirected_url}") + print(f"HTML contains 'Redirect Target': {'Redirect Target' in result.html}") + print() + + if "/redirect-target" in (result.redirected_url or ""): + print("āœ“ redirected_url is CORRECT") + else: + print("āœ— BUG #1: redirected_url is WRONG - still shows original URL!") + + # Check links + all_links = [] + if isinstance(result.links, dict): + all_links = result.links.get("internal", []) + result.links.get("external", []) + + print(f"\nLinks found ({len(all_links)} total):") + bug_found = False + for link in all_links: + href = link.get("href", "") if isinstance(link, dict) else getattr(link, 'href', "") + if "subpage" in href: + print(f" {href}") + if "/page-a/" in href: + print(" ^^^ BUG #2: Link resolved with WRONG base URL!") + bug_found = True + elif "/redirect-target/" in href: + print(" ^^^ CORRECT") + + if 
not bug_found and all_links: + print("\nāœ“ Link resolution is CORRECT") + + finally: + server.shutdown() + +if __name__ == "__main__": + asyncio.run(main())