diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py index 3ca96aed..274b4232 100644 --- a/crawl4ai/browser_manager.py +++ b/crawl4ai/browser_manager.py @@ -674,6 +674,11 @@ class BrowserManager: self.default_context = await self.create_browser_context() await self.setup_context(self.default_context) else: + # Handle --disable-web-security requiring a separate user data directory + if "--disable-web-security" in (self.config.extra_args or []) and not self.config.user_data_dir: + import tempfile + self.config.user_data_dir = tempfile.mkdtemp() + browser_args = self._build_browser_args() # Launch appropriate browser type @@ -682,9 +687,15 @@ class BrowserManager: elif self.config.browser_type == "webkit": self.browser = await self.playwright.webkit.launch(**browser_args) else: - self.browser = await self.playwright.chromium.launch(**browser_args) - - self.default_context = self.browser + if "--disable-web-security" in (self.config.extra_args or []): + # Use persistent context for --disable-web-security + browser_args["args"] = [arg for arg in browser_args["args"] if not arg.startswith("--user-data-dir")] + self.default_context = await self.playwright.chromium.launch_persistent_context(self.config.user_data_dir, **browser_args) + self.browser = self.default_context + self.config.use_managed_browser = True # Treat as managed for get_page logic + else: + self.browser = await self.playwright.chromium.launch(**browser_args) + self.default_context = self.browser async def _verify_cdp_ready(self, cdp_url: str) -> bool: """Verify CDP endpoint is ready with exponential backoff""" @@ -748,6 +759,9 @@ class BrowserManager: if self.config.extra_args: args.extend(self.config.extra_args) + if self.config.user_data_dir: + args.append(f"--user-data-dir={self.config.user_data_dir}") + # Deduplicate args args = list(dict.fromkeys(args)) diff --git a/tests/test_browser_manager_cors.py b/tests/test_browser_manager_cors.py new file mode 100644 index 00000000..d91ca8f2 
--- /dev/null
+++ b/tests/test_browser_manager_cors.py
@@ -0,0 +1,79 @@
+"""Tests for browser launch with and without ``--disable-web-security``.
+
+These tests run a real crawl against live URLs (example.com and
+raw.githubusercontent.com), so they require outbound network access.
+"""
+
+import os
+import sys
+
+import pytest
+
+# Make the repository root importable when running from a source checkout.
+# This file lives in <repo>/tests/, so the root is exactly two levels above it.
+parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
+sys.path.append(parent_dir)
+
+from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
+
+
+@pytest.mark.asyncio
+async def test_normal_browser_launch():
+    """Test that the browser manager launches normally without --disable-web-security"""
+    async with AsyncWebCrawler() as crawler:
+        result = await crawler.arun(url="https://example.com", bypass_cache=True)
+        assert result.success
+        assert result.html
+        assert result.markdown
+
+
+@pytest.mark.asyncio
+async def test_cors_bypass_with_disable_web_security():
+    """Test that --disable-web-security allows XMLHttpRequest to bypass CORS"""
+    browser_config = BrowserConfig(
+        extra_args=['--disable-web-security'],
+        headless=True,  # Run headless for test
+    )
+
+    # JS code that attempts XMLHttpRequest to a cross-origin URL that normally blocks CORS
+    js_code = """
+    var xhr = new XMLHttpRequest();
+    xhr.open('GET', 'https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv', false);
+    xhr.send();
+    if (xhr.status == 200) {
+        return {success: true, length: xhr.responseText.length};
+    } else {
+        return {success: false, status: xhr.status, error: xhr.statusText};
+    }
+    """
+
+    crawler_config = CrawlerRunConfig(js_code=js_code)
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(url="https://example.com", config=crawler_config, bypass_cache=True)
+        assert result.success, f"Crawl failed: {result.error_message}"
+        js_result = result.js_execution_result
+        assert js_result is not None, "JS execution result is None"
+        assert js_result.get('success') is True, f"XMLHttpRequest failed: {js_result}"
+        # The result is wrapped in 'results' list
+        results = js_result.get('results', [])
+        assert results, "No results in JS execution"
+        xhr_result = results[0]
+        assert xhr_result.get('success') is True, f"XMLHttpRequest failed: {xhr_result}"
+        assert xhr_result.get('length', 0) > 0, f"No data received from XMLHttpRequest: {xhr_result}"
+
+
+@pytest.mark.asyncio
+async def test_browser_manager_without_cors_flag():
+    """Ensure that without --disable-web-security, normal functionality still works"""
+    browser_config = BrowserConfig(headless=True)
+
+    async with AsyncWebCrawler(config=browser_config) as crawler:
+        result = await crawler.arun(url="https://example.com", bypass_cache=True)
+        assert result.success
+        assert result.html
+
+
+# Entry point for debugging
+if __name__ == "__main__":
+    pytest.main([__file__, "-v"])