Compare commits

..

1 Commit

3 changed files with 95 additions and 6 deletions

View File

@@ -674,6 +674,11 @@ class BrowserManager:
self.default_context = await self.create_browser_context()
await self.setup_context(self.default_context)
else:
# Handle --disable-web-security requiring a separate user data directory
if "--disable-web-security" in (self.config.extra_args or []) and not self.config.user_data_dir:
import tempfile
self.config.user_data_dir = tempfile.mkdtemp()
browser_args = self._build_browser_args()
# Launch appropriate browser type
@@ -682,9 +687,15 @@ class BrowserManager:
elif self.config.browser_type == "webkit":
self.browser = await self.playwright.webkit.launch(**browser_args)
else:
self.browser = await self.playwright.chromium.launch(**browser_args)
self.default_context = self.browser
if "--disable-web-security" in (self.config.extra_args or []):
# Use persistent context for --disable-web-security
browser_args["args"] = [arg for arg in browser_args["args"] if not arg.startswith("--user-data-dir")]
self.default_context = await self.playwright.chromium.launch_persistent_context(self.config.user_data_dir, **browser_args)
self.browser = self.default_context
self.config.use_managed_browser = True # Treat as managed for get_page logic
else:
self.browser = await self.playwright.chromium.launch(**browser_args)
self.default_context = self.browser
async def _verify_cdp_ready(self, cdp_url: str) -> bool:
"""Verify CDP endpoint is ready with exponential backoff"""
@@ -748,6 +759,9 @@ class BrowserManager:
if self.config.extra_args:
args.extend(self.config.extra_args)
if self.config.user_data_dir:
args.append(f"--user-data-dir={self.config.user_data_dir}")
# Deduplicate args
args = list(dict.fromkeys(args))

View File

@@ -1,4 +1,4 @@
from pydantic import BaseModel, HttpUrl, PrivateAttr, Field, ConfigDict
from pydantic import BaseModel, HttpUrl, PrivateAttr, Field
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
from typing import AsyncGenerator
from typing import Generic, TypeVar
@@ -153,7 +153,8 @@ class CrawlResult(BaseModel):
console_messages: Optional[List[Dict[str, Any]]] = None
tables: List[Dict] = Field(default_factory=list) # NEW [{headers,rows,caption,summary}]
model_config = ConfigDict(arbitrary_types_allowed=True)
class Config:
arbitrary_types_allowed = True
# NOTE: The StringCompatibleMarkdown class, custom __init__ method, property getters/setters,
# and model_dump override all exist to support a smooth transition from markdown as a string
@@ -331,7 +332,8 @@ class AsyncCrawlResponse(BaseModel):
network_requests: Optional[List[Dict[str, Any]]] = None
console_messages: Optional[List[Dict[str, Any]]] = None
model_config = ConfigDict(arbitrary_types_allowed=True)
class Config:
arbitrary_types_allowed = True
###############################
# Scraping Models

View File

@@ -0,0 +1,73 @@
import os
import sys

import pytest

# Make the project root importable when the tests are executed from this
# directory (two levels up from this file's directory).
_PROJECT_ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..")
)
sys.path.append(_PROJECT_ROOT)

from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
@pytest.mark.asyncio
async def test_normal_browser_launch():
    """Test that the browser manager launches normally without --disable-web-security"""
    async with AsyncWebCrawler() as crawler:
        crawl_result = await crawler.arun(url="https://example.com", bypass_cache=True)
        # A successful crawl must produce both the raw HTML and the
        # generated markdown representation of the page.
        assert crawl_result.success
        assert crawl_result.html
        assert crawl_result.markdown
@pytest.mark.asyncio
async def test_cors_bypass_with_disable_web_security():
    """Test that --disable-web-security allows XMLHttpRequest to bypass CORS"""
    browser_cfg = BrowserConfig(
        extra_args=['--disable-web-security'],
        headless=True  # Run headless for test
    )
    # JS code that attempts XMLHttpRequest to a cross-origin URL that normally blocks CORS
    js_code = """
var xhr = new XMLHttpRequest();
xhr.open('GET', 'https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv', false);
xhr.send();
if (xhr.status == 200) {
return {success: true, length: xhr.responseText.length};
} else {
return {success: false, status: xhr.status, error: xhr.statusText};
}
"""
    run_cfg = CrawlerRunConfig(js_code=js_code)

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        outcome = await crawler.arun(
            url="https://example.com", config=run_cfg, bypass_cache=True
        )
        assert outcome.success, f"Crawl failed: {outcome.error_message}"

        payload = outcome.js_execution_result
        assert payload is not None, "JS execution result is None"
        # NOTE(review): both a top-level 'success' flag and a per-entry
        # 'success' inside 'results' are asserted — confirm against the
        # actual js_execution_result schema that both keys exist.
        assert payload.get('success') == True, f"XMLHttpRequest failed: {payload}"
        # The result is wrapped in 'results' list
        wrapped = payload.get('results', [])
        assert len(wrapped) > 0, "No results in JS execution"
        first_entry = wrapped[0]
        assert first_entry.get('success') == True, f"XMLHttpRequest failed: {first_entry}"
        assert first_entry.get('length', 0) > 0, f"No data received from XMLHttpRequest: {first_entry}"
@pytest.mark.asyncio
async def test_browser_manager_without_cors_flag():
    """Ensure that without --disable-web-security, normal functionality still works"""
    plain_cfg = BrowserConfig(headless=True)
    async with AsyncWebCrawler(config=plain_cfg) as crawler:
        outcome = await crawler.arun(url="https://example.com", bypass_cache=True)
        # The default (CORS-enforcing) launch path must still crawl fine.
        assert outcome.success
        assert outcome.html
# Entry point for debugging
if __name__ == "__main__":
    pytest.main(["-v", __file__])