def _with_defaults(cls):
    """Class decorator: adds set_defaults/get_defaults/reset_defaults classmethods.

    After decorating, every new instance resolves parameters as:
        explicit arg > class-level user defaults > hardcoded default

    Overrides are stored per class. A subclass sees the overrides of its
    ancestors (nearest class wins), while overrides set on a subclass never
    modify the ancestors' stored values.

    Usage::

        BrowserConfig.set_defaults(headless=False, viewport_width=1920)
        cfg = BrowserConfig()               # headless=False, viewport_width=1920
        cfg = BrowserConfig(headless=True)  # explicit wins → headless=True

    Args:
        cls: The class whose ``__init__`` should honour user defaults.

    Returns:
        The same class object, with ``__init__`` wrapped and the three
        classmethods attached.
    """
    original_init = cls.__init__
    parameters = [
        p
        for name, p in inspect.signature(original_init).parameters.items()
        if name != "self"
    ]
    # Names a positional argument can bind to, in order. ``*args`` is left
    # out: surplus positionals do not correspond to any named parameter.
    positional_names = [
        p.name
        for p in parameters
        if p.kind
        in (inspect.Parameter.POSITIONAL_ONLY, inspect.Parameter.POSITIONAL_OR_KEYWORD)
    ]
    # Only parameters that can be passed by keyword may carry a default;
    # the ``*args`` / ``**kwargs`` collector names are not real parameters.
    valid_params = frozenset(
        p.name
        for p in parameters
        if p.kind
        in (inspect.Parameter.POSITIONAL_OR_KEYWORD, inspect.Parameter.KEYWORD_ONLY)
    )

    def own_defaults(klass):
        """Return the dict stored on *klass* itself, creating it on first use."""
        if "_user_defaults" not in vars(klass):
            klass._user_defaults = {}
        return klass._user_defaults

    def effective_defaults(klass):
        """Merge overrides along the MRO, most-derived class last (it wins)."""
        merged = {}
        for base in reversed(klass.__mro__):
            merged.update(vars(base).get("_user_defaults", {}))
        return merged

    @functools.wraps(original_init)
    def wrapped_init(self, *args, **kwargs):
        user_defaults = effective_defaults(type(self))
        if user_defaults:
            # Everything the caller supplied, by keyword or by position,
            # counts as explicit and must not be overridden.
            explicit = set(kwargs)
            explicit.update(positional_names[: len(args)])
            for key, value in user_defaults.items():
                if key not in explicit:
                    # Fresh copy per instance so mutable defaults are not shared.
                    kwargs[key] = copy.deepcopy(value)
        original_init(self, *args, **kwargs)

    cls.__init__ = wrapped_init
    cls._user_defaults = {}

    @classmethod
    def set_defaults(klass, **kwargs):
        """Set class-level default overrides for new instances.

        Args:
            **kwargs: Parameter names and their default values.

        Raises:
            ValueError: If any key is not a keyword-passable ``__init__``
                parameter. Nothing is stored in that case.
        """
        invalid = set(kwargs) - valid_params
        if invalid:
            raise ValueError(
                f"Invalid parameter(s) for {klass.__name__}: {invalid}"
            )
        # Copy everything first so a failing deepcopy leaves the stored
        # defaults untouched instead of half-updated.
        copied = {k: copy.deepcopy(v) for k, v in kwargs.items()}
        own_defaults(klass).update(copied)

    @classmethod
    def get_defaults(klass):
        """Return a deep copy of the defaults in effect for this class."""
        return copy.deepcopy(effective_defaults(klass))

    @classmethod
    def reset_defaults(klass, *names):
        """Clear class-level defaults stored on this class.

        With no arguments, removes all overrides.
        With arguments, removes only the named overrides.
        Overrides stored on ancestor classes are left alone.
        """
        stored = vars(klass).get("_user_defaults")
        if not stored:
            return
        if names:
            for n in names:
                stored.pop(n, None)
        else:
            stored.clear()

    cls.set_defaults = set_defaults
    cls.get_defaults = get_defaults
    cls.reset_defaults = reset_defaults
    return cls
"""Tests for the class-level default overrides of BrowserConfig and CrawlerRunConfig.

Covers the set_defaults / get_defaults / reset_defaults API, injection of the
overrides into new instances, precedence of explicit arguments, isolation of
mutable values, interaction with constructor post-processing, clone/from_kwargs,
dump/load round-trips and separation between the two config classes.
"""

import pytest
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig


def _clear_all_defaults():
    """Drop every override on both config classes."""
    BrowserConfig.reset_defaults()
    CrawlerRunConfig.reset_defaults()


@pytest.fixture(autouse=True)
def _reset_defaults():
    """Give each test a clean slate, and clean up after it as well."""
    _clear_all_defaults()
    yield
    _clear_all_defaults()


# ---------------------------------------------------------------------------
# Basic API
# ---------------------------------------------------------------------------


class TestBasicAPI:
    def test_set_and_get_defaults(self):
        BrowserConfig.set_defaults(headless=False, viewport_width=1920)
        stored = BrowserConfig.get_defaults()
        assert stored == {"headless": False, "viewport_width": 1920}

    def test_get_defaults_returns_copy(self):
        BrowserConfig.set_defaults(headers={"X-Foo": "bar"})
        snapshot = BrowserConfig.get_defaults()
        snapshot["headers"]["X-Foo"] = "changed"
        fresh = BrowserConfig.get_defaults()
        assert fresh["headers"]["X-Foo"] == "bar"

    def test_reset_all(self):
        BrowserConfig.set_defaults(headless=False)
        BrowserConfig.reset_defaults()
        remaining = BrowserConfig.get_defaults()
        assert remaining == {}

    def test_reset_selective(self):
        BrowserConfig.set_defaults(headless=False, viewport_width=1920)
        BrowserConfig.reset_defaults("headless")
        remaining = BrowserConfig.get_defaults()
        assert remaining == {"viewport_width": 1920}

    def test_invalid_param_raises(self):
        with pytest.raises(ValueError, match="Invalid parameter"):
            BrowserConfig.set_defaults(not_a_real_param=42)

    def test_invalid_param_among_valid(self):
        with pytest.raises(ValueError):
            BrowserConfig.set_defaults(headless=False, bogus=True)
        # The rejected call must not have stored the valid key either.
        stored = BrowserConfig.get_defaults()
        assert stored == {}

    def test_set_defaults_overwrites(self):
        for flag in (False, True):
            BrowserConfig.set_defaults(headless=flag)
        stored = BrowserConfig.get_defaults()
        assert stored["headless"] is True

    def test_crawler_run_config_basic(self):
        CrawlerRunConfig.set_defaults(verbose=False, scan_full_page=True)
        stored = CrawlerRunConfig.get_defaults()
        assert stored == {"verbose": False, "scan_full_page": True}


# ---------------------------------------------------------------------------
# Default injection
# ---------------------------------------------------------------------------


class TestDefaultInjection:
    def test_browser_config_defaults_applied(self):
        BrowserConfig.set_defaults(
            headless=False,
            cache_cdp_connection=True,
            cdp_close_delay=0,
        )
        browser = BrowserConfig()
        assert browser.headless is False
        assert browser.cache_cdp_connection is True
        assert browser.cdp_close_delay == 0

    def test_crawler_run_config_defaults_applied(self):
        CrawlerRunConfig.set_defaults(verbose=False, scan_full_page=True)
        run = CrawlerRunConfig()
        assert run.verbose is False
        assert run.scan_full_page is True

    def test_partial_defaults(self):
        BrowserConfig.set_defaults(headless=False)
        browser = BrowserConfig()
        assert browser.headless is False
        # Parameters without an override fall back to the hardcoded values.
        assert browser.browser_type == "chromium"
        assert browser.viewport_width == 1080

    def test_multiple_instances_get_defaults(self):
        BrowserConfig.set_defaults(headless=False)
        first = BrowserConfig()
        second = BrowserConfig()
        assert first.headless is False
        assert second.headless is False


# ---------------------------------------------------------------------------
# Explicit override
# ---------------------------------------------------------------------------


class TestExplicitOverride:
    def test_explicit_kwarg_wins(self):
        BrowserConfig.set_defaults(headless=False)
        browser = BrowserConfig(headless=True)
        assert browser.headless is True

    def test_explicit_same_as_default_still_wins(self):
        """Passing the same value as the user default still counts as explicit."""
        BrowserConfig.set_defaults(headless=False)
        browser = BrowserConfig(headless=False)
        assert browser.headless is False

    def test_explicit_none_wins(self):
        BrowserConfig.set_defaults(cdp_url="ws://localhost:9222")
        browser = BrowserConfig(cdp_url=None)
        assert browser.cdp_url is None

    def test_mixed_explicit_and_default(self):
        BrowserConfig.set_defaults(headless=False, viewport_width=1920)
        browser = BrowserConfig(headless=True)
        # headless was passed explicitly; viewport_width comes from the override.
        assert browser.headless is True
        assert browser.viewport_width == 1920


# ---------------------------------------------------------------------------
# Mutable isolation
# ---------------------------------------------------------------------------


class TestMutableIsolation:
    def test_list_default_not_shared(self):
        BrowserConfig.set_defaults(cookies=[{"name": "a", "value": "1"}])
        first = BrowserConfig()
        second = BrowserConfig()
        first.cookies.append({"name": "b", "value": "2"})
        # Mutating one instance's list must not show up on the other.
        assert len(second.cookies) == 1

    def test_dict_default_not_shared(self):
        BrowserConfig.set_defaults(headers={"X-Foo": "bar"})
        first = BrowserConfig()
        second = BrowserConfig()
        first.headers["X-New"] = "val"
        assert "X-New" not in second.headers

    def test_set_defaults_input_not_mutated(self):
        caller_headers = {"X-Foo": "bar"}
        BrowserConfig.set_defaults(headers=caller_headers)
        browser = BrowserConfig()
        browser.headers["X-Added"] = "val"
        assert "X-Added" not in caller_headers
        stored_headers = BrowserConfig.get_defaults()["headers"]
        assert "X-Added" not in stored_headers


# ---------------------------------------------------------------------------
# Special processing
# ---------------------------------------------------------------------------


class TestSpecialProcessing:
    def test_browser_mode_builtin_sets_managed(self):
        BrowserConfig.set_defaults(browser_mode="builtin")
        browser = BrowserConfig()
        assert browser.use_managed_browser is True

    def test_viewport_dict_overrides_dimensions(self):
        BrowserConfig.set_defaults(viewport={"width": 1920, "height": 1080})
        browser = BrowserConfig()
        assert browser.viewport_width == 1920
        assert browser.viewport_height == 1080

    def test_proxy_string_converted_to_proxy_config(self):
        BrowserConfig.set_defaults(proxy="http://user:pass@proxy:8080")
        browser = BrowserConfig()
        proxy = browser.proxy_config
        assert proxy is not None
        assert browser.proxy_config.server == "http://proxy:8080"

    def test_crawler_run_config_proxy_dict_converted(self):
        from crawl4ai.async_configs import ProxyConfig

        CrawlerRunConfig.set_defaults(
            proxy_config={"server": "http://proxy:8080"}
        )
        run = CrawlerRunConfig()
        assert isinstance(run.proxy_config, ProxyConfig)


# ---------------------------------------------------------------------------
# Clone / from_kwargs
# ---------------------------------------------------------------------------


class TestCloneAndFromKwargs:
    def test_clone_preserves_user_default_values(self):
        BrowserConfig.set_defaults(headless=False, viewport_width=1920)
        source = BrowserConfig()
        duplicate = source.clone()
        assert duplicate.headless is False
        assert duplicate.viewport_width == 1920

    def test_clone_with_override(self):
        BrowserConfig.set_defaults(headless=False)
        source = BrowserConfig()
        duplicate = source.clone(headless=True)
        assert duplicate.headless is True

    def test_from_kwargs_explicit_values(self):
        BrowserConfig.set_defaults(headless=False)
        browser = BrowserConfig.from_kwargs({"headless": True})
        assert browser.headless is True


# ---------------------------------------------------------------------------
# Dump / Load round-trip
# ---------------------------------------------------------------------------


class TestDumpLoad:
    def test_dump_load_preserves_user_defaults(self):
        BrowserConfig.set_defaults(headless=False, viewport_width=1920)
        serialized = BrowserConfig().dump()
        restored = BrowserConfig.load(serialized)
        assert restored.headless is False
        assert restored.viewport_width == 1920

    def test_dump_load_survives_reset(self):
        """Serialized data carries the values itself, independent of class defaults."""
        BrowserConfig.set_defaults(headless=False)
        serialized = BrowserConfig().dump()
        BrowserConfig.reset_defaults()
        restored = BrowserConfig.load(serialized)
        assert restored.headless is False

    def test_crawler_run_config_dump_load(self):
        CrawlerRunConfig.set_defaults(verbose=False, scan_full_page=True)
        serialized = CrawlerRunConfig().dump()
        CrawlerRunConfig.reset_defaults()
        restored = CrawlerRunConfig.load(serialized)
        assert restored.verbose is False
        assert restored.scan_full_page is True

    def test_to_dict_includes_user_default_values(self):
        BrowserConfig.set_defaults(headless=False)
        as_dict = BrowserConfig().to_dict()
        assert as_dict["headless"] is False


# ---------------------------------------------------------------------------
# Class isolation
# ---------------------------------------------------------------------------


class TestClassIsolation:
    def test_browser_defaults_dont_leak_to_crawler(self):
        BrowserConfig.set_defaults(verbose=False)
        run = CrawlerRunConfig()
        # CrawlerRunConfig keeps its own hardcoded default.
        assert run.verbose is True

    def test_crawler_defaults_dont_leak_to_browser(self):
        CrawlerRunConfig.set_defaults(verbose=False)
        browser = BrowserConfig()
        # BrowserConfig keeps its own hardcoded default.
        assert browser.verbose is True

    def test_independent_reset(self):
        BrowserConfig.set_defaults(headless=False)
        CrawlerRunConfig.set_defaults(verbose=False)
        BrowserConfig.reset_defaults()
        browser_defaults = BrowserConfig.get_defaults()
        crawler_defaults = CrawlerRunConfig.get_defaults()
        assert browser_defaults == {}
        assert crawler_defaults == {"verbose": False}