Compare commits
2 Commits
main
...
fix-cors-d
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
af77800a6b | ||
|
|
c2c4d42be4 |
@@ -674,6 +674,11 @@ class BrowserManager:
|
|||||||
self.default_context = await self.create_browser_context()
|
self.default_context = await self.create_browser_context()
|
||||||
await self.setup_context(self.default_context)
|
await self.setup_context(self.default_context)
|
||||||
else:
|
else:
|
||||||
|
# Handle --disable-web-security requiring a separate user data directory
|
||||||
|
if "--disable-web-security" in (self.config.extra_args or []) and not self.config.user_data_dir:
|
||||||
|
import tempfile
|
||||||
|
self.config.user_data_dir = tempfile.mkdtemp()
|
||||||
|
|
||||||
browser_args = self._build_browser_args()
|
browser_args = self._build_browser_args()
|
||||||
|
|
||||||
# Launch appropriate browser type
|
# Launch appropriate browser type
|
||||||
@@ -682,9 +687,15 @@ class BrowserManager:
|
|||||||
elif self.config.browser_type == "webkit":
|
elif self.config.browser_type == "webkit":
|
||||||
self.browser = await self.playwright.webkit.launch(**browser_args)
|
self.browser = await self.playwright.webkit.launch(**browser_args)
|
||||||
else:
|
else:
|
||||||
self.browser = await self.playwright.chromium.launch(**browser_args)
|
if "--disable-web-security" in (self.config.extra_args or []):
|
||||||
|
# Use persistent context for --disable-web-security
|
||||||
self.default_context = self.browser
|
browser_args["args"] = [arg for arg in browser_args["args"] if not arg.startswith("--user-data-dir")]
|
||||||
|
self.default_context = await self.playwright.chromium.launch_persistent_context(self.config.user_data_dir, **browser_args)
|
||||||
|
self.browser = self.default_context
|
||||||
|
self.config.use_managed_browser = True # Treat as managed for get_page logic
|
||||||
|
else:
|
||||||
|
self.browser = await self.playwright.chromium.launch(**browser_args)
|
||||||
|
self.default_context = self.browser
|
||||||
|
|
||||||
async def _verify_cdp_ready(self, cdp_url: str) -> bool:
|
async def _verify_cdp_ready(self, cdp_url: str) -> bool:
|
||||||
"""Verify CDP endpoint is ready with exponential backoff"""
|
"""Verify CDP endpoint is ready with exponential backoff"""
|
||||||
@@ -748,6 +759,9 @@ class BrowserManager:
|
|||||||
if self.config.extra_args:
|
if self.config.extra_args:
|
||||||
args.extend(self.config.extra_args)
|
args.extend(self.config.extra_args)
|
||||||
|
|
||||||
|
if self.config.user_data_dir:
|
||||||
|
args.append(f"--user-data-dir={self.config.user_data_dir}")
|
||||||
|
|
||||||
# Deduplicate args
|
# Deduplicate args
|
||||||
args = list(dict.fromkeys(args))
|
args = list(dict.fromkeys(args))
|
||||||
|
|
||||||
|
|||||||
@@ -542,6 +542,19 @@ class LXMLWebScrapingStrategy(ContentScrapingStrategy):
|
|||||||
if el.tag in bypass_tags:
|
if el.tag in bypass_tags:
|
||||||
continue
|
continue
|
||||||
|
|
||||||
|
# Skip elements inside <pre> or <code> tags where whitespace is significant
|
||||||
|
# This preserves whitespace-only spans (e.g., <span class="w"> </span>) in code blocks
|
||||||
|
is_in_code_block = False
|
||||||
|
ancestor = el.getparent()
|
||||||
|
while ancestor is not None:
|
||||||
|
if ancestor.tag in ("pre", "code"):
|
||||||
|
is_in_code_block = True
|
||||||
|
break
|
||||||
|
ancestor = ancestor.getparent()
|
||||||
|
|
||||||
|
if is_in_code_block:
|
||||||
|
continue
|
||||||
|
|
||||||
text_content = (el.text_content() or "").strip()
|
text_content = (el.text_content() or "").strip()
|
||||||
if (
|
if (
|
||||||
len(text_content.split()) < word_count_threshold
|
len(text_content.split()) < word_count_threshold
|
||||||
|
|||||||
73
tests/test_browser_manager_cors.py
Normal file
73
tests/test_browser_manager_cors.py
Normal file
@@ -0,0 +1,73 @@
|
|||||||
|
import os
|
||||||
|
import sys
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
# Add the repository root to the Python path so that `crawl4ai` is imported
# from this checkout. This file lives directly in tests/, so the root is
# exactly two directory levels above it (tests/ -> repository root).
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
|
||||||
|
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_normal_browser_launch():
    """Check that a default crawler (no --disable-web-security) starts and crawls.

    The crawl of https://example.com must report success and yield both HTML
    and markdown output.
    """
    # No browser configuration is supplied, so the crawler uses its defaults.
    async with AsyncWebCrawler() as default_crawler:
        crawl_result = await default_crawler.arun(
            url="https://example.com",
            bypass_cache=True,
        )
        assert crawl_result.success
        assert crawl_result.html
        assert crawl_result.markdown
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_cors_bypass_with_disable_web_security():
    """Test that --disable-web-security allows XMLHttpRequest to bypass CORS.

    Starts a headless browser with ``--disable-web-security`` in
    ``extra_args``, loads https://example.com, and runs a script that issues a
    cross-origin XMLHttpRequest. The test passes when the script reports
    HTTP 200 and a non-empty response body.
    """
    browser_config = BrowserConfig(
        extra_args=["--disable-web-security"],
        headless=True,  # Run headless for test
    )

    # JS code that attempts XMLHttpRequest to a cross-origin URL that normally
    # blocks CORS. The third argument to open() is `false`, i.e. the request
    # is synchronous, so the status can be inspected right after send().
    js_code = """
    var xhr = new XMLHttpRequest();
    xhr.open('GET', 'https://raw.githubusercontent.com/tatsu-lab/alpaca_eval/main/docs/data_AlpacaEval_2/weighted_alpaca_eval_gpt4_turbo_leaderboard.csv', false);
    xhr.send();
    if (xhr.status == 200) {
        return {success: true, length: xhr.responseText.length};
    } else {
        return {success: false, status: xhr.status, error: xhr.statusText};
    }
    """

    crawler_config = CrawlerRunConfig(js_code=js_code)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=crawler_config,
            bypass_cache=True,
        )
        assert result.success, f"Crawl failed: {result.error_message}"

        js_result = result.js_execution_result
        assert js_result is not None, "JS execution result is None"
        # Identity check against the True singleton rather than `== True`,
        # so a merely truthy/equal value (e.g. 1) does not pass.
        assert js_result.get("success") is True, f"XMLHttpRequest failed: {js_result}"

        # The result is wrapped in 'results' list
        results = js_result.get("results", [])
        assert len(results) > 0, "No results in JS execution"

        # The first entry is the object returned by the script above.
        xhr_result = results[0]
        assert xhr_result.get("success") is True, f"XMLHttpRequest failed: {xhr_result}"
        assert xhr_result.get("length", 0) > 0, f"No data received from XMLHttpRequest: {xhr_result}"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_browser_manager_without_cors_flag():
    """Verify that a headless crawl still works when --disable-web-security is absent.

    The browser is configured with ``headless=True`` only; the crawl of
    https://example.com must succeed and return HTML.
    """
    plain_config = BrowserConfig(headless=True)

    async with AsyncWebCrawler(config=plain_config) as plain_crawler:
        outcome = await plain_crawler.arun(
            url="https://example.com",
            bypass_cache=True,
        )
        assert outcome.success
        assert outcome.html
|
||||||
|
|
||||||
|
|
||||||
|
# Entry point for debugging: run this file's tests verbosely when executed
# directly. pytest.main() returns the exit status, which is propagated so the
# process exits non-zero when a test fails.
if __name__ == "__main__":
    raise SystemExit(pytest.main([__file__, "-v"]))
|
||||||
Reference in New Issue
Block a user