From fbc52813a42ce1af9ae16dd68b73c1487b305173 Mon Sep 17 00:00:00 2001 From: unclecode Date: Fri, 6 Feb 2026 09:30:19 +0000 Subject: [PATCH] Add tests, docs, and contributors for PRs #1463 and #1435 - Add tests for device_scale_factor (config + integration) - Add tests for redirected_status_code (model + redirect + raw HTML) - Document device_scale_factor in browser config docs and API reference - Document redirected_status_code in crawler result docs and API reference - Add TristanDonze and charlaie to CONTRIBUTORS.md - Update PR-TODOLIST with session results --- .context/PR-TODOLIST.md | 7 +- CONTRIBUTORS.md | 2 + docs/md_v2/api/crawl-result.md | 15 +++- docs/md_v2/api/parameters.md | 1 + docs/md_v2/core/browser-crawler-config.md | 11 ++- docs/md_v2/core/crawler-result.md | 2 + tests/test_pr_1435_redirected_status_code.py | 72 ++++++++++++++++++++ tests/test_pr_1463_device_scale_factor.py | 62 +++++++++++++++++ 8 files changed, 164 insertions(+), 8 deletions(-) create mode 100644 tests/test_pr_1435_redirected_status_code.py create mode 100644 tests/test_pr_1463_device_scale_factor.py diff --git a/.context/PR-TODOLIST.md b/.context/PR-TODOLIST.md index 52aee53b..2a19e934 100644 --- a/.context/PR-TODOLIST.md +++ b/.context/PR-TODOLIST.md @@ -59,8 +59,8 @@ | #1668 | microHoffman | Add `--json-ensure-ascii` CLI flag for Unicode handling. Clean, small. | pending | | #1650 | KennyStryker | Add support for Vertex AI in LLM Extraction Strategy. | pending | | #1580 | arpagon | Add Azure OpenAI configuration support to crwl config. | pending | -| #1463 | TristanDonze | Add configurable `device_scale_factor` for screenshot quality. 3 files, clean. | pending | -| #1435 | charlaie | Add `redirected_status_code` to CrawlResult. 3 files, clean. | pending | +| ~~#1463~~ | ~~TristanDonze~~ | ~~Add configurable `device_scale_factor` for screenshot quality. 3 files, clean.~~ | **merged** | +| ~~#1435~~ | ~~charlaie~~ | ~~Add `redirected_status_code` to CrawlResult. 
3 files, clean.~~ | **merged** | | #1425 | denrusio | Add OpenRouter API support. | pending | | #1417 | NickMandylas | Add CDP headers support for remote browser auth (AWS Bedrock etc). | pending | | #1290 | 130347665 | Support type-list pipeline in JsonElementExtraction (multi-step extract). | pending | @@ -212,3 +212,6 @@ | #1696 | majiayu000 | closed: duplicate of #1722 | 2026-02-02 | | #1478 | e1codes | closed: duplicate of #1715 | 2026-02-02 | | #1465 | fardhanrasya | closed: duplicate of #1715 | 2026-02-02 | +| #1450 | rbushri | closed: litellm handles response field normalization | 2026-02-06 | +| #1463 | TristanDonze | feat: add configurable device_scale_factor for screenshot quality | 2026-02-06 | +| #1435 | charlaie | feat: add redirected_status_code to CrawlResult | 2026-02-06 | diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 8c7bfab8..9b53422e 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -37,6 +37,8 @@ We would like to thank the following people for their contributions to Crawl4AI: - [stevenaldinger](https://github.com/stevenaldinger) - identified: duplicate PROMPT_EXTRACT_BLOCKS dead code in prompts.py [#931](https://github.com/unclecode/crawl4ai/pull/931) - [chrizzly2309](https://github.com/chrizzly2309) - identified: JWT auth bypass when no credentials provided [#1133](https://github.com/unclecode/crawl4ai/pull/1133) - [complete-dope](https://github.com/complete-dope) - identified: console logging error attribute issue [#729](https://github.com/unclecode/crawl4ai/pull/729) +- [TristanDonze](https://github.com/TristanDonze) - feat: add configurable device_scale_factor for screenshot quality [#1463](https://github.com/unclecode/crawl4ai/pull/1463) +- [charlaie](https://github.com/charlaie) - feat: add redirected_status_code to CrawlResult [#1435](https://github.com/unclecode/crawl4ai/pull/1435) #### Feb-Alpha-1 - [sufianuddin](https://github.com/sufianuddin) - fix: [Documentation for 
JsonCssExtractionStrategy](https://github.com/unclecode/crawl4ai/issues/651) diff --git a/docs/md_v2/api/crawl-result.md b/docs/md_v2/api/crawl-result.md index a27a87d2..f511913d 100644 --- a/docs/md_v2/api/crawl-result.md +++ b/docs/md_v2/api/crawl-result.md @@ -24,6 +24,7 @@ class CrawlResult(BaseModel): session_id: Optional[str] = None response_headers: Optional[dict] = None status_code: Optional[int] = None + redirected_status_code: Optional[int] = None ssl_certificate: Optional[SSLCertificate] = None dispatch_result: Optional[DispatchResult] = None ... @@ -50,15 +51,23 @@ if not result.success: print(f"Crawl failed: {result.error_message}") ``` -### 1.3 **`status_code`** *(Optional[int])* -**What**: The page's HTTP status code (e.g., 200, 404). +### 1.3 **`status_code`** *(Optional[int])* +**What**: The page's HTTP status code (e.g., 200, 404). When the page was reached via redirect, this is the status code of the **first** response in the redirect chain (e.g., 301 or 302). **Usage**: ```python if result.status_code == 404: print("Page not found!") ``` -### 1.4 **`error_message`** *(Optional[str])* +### 1.4 **`redirected_status_code`** *(Optional[int])* +**What**: The HTTP status code of the **final** redirect destination. For a 302→200 redirect, `status_code` is 302 and `redirected_status_code` is 200. `None` for non-HTTP requests (raw HTML, local files). +**Usage**: +```python +if result.status_code in (301, 302) and result.redirected_status_code == 200: + print(f"Redirected to {result.redirected_url} (OK)") +``` + +### 1.5 **`error_message`** *(Optional[str])* **What**: If `success=False`, a textual description of the failure. **Usage**: ```python diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index 8e372fb3..dc5699aa 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -29,6 +29,7 @@ browser_cfg = BrowserConfig( | **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). 
Useful for testing responsive layouts. | | **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). | | **`viewport`** | `dict` (default: `None`) | Viewport dimensions dict. If set, overrides `viewport_width` and `viewport_height`. | +| **`device_scale_factor`** | `float` (default: `1.0`) | Device pixel ratio for rendering. Use `2.0` for Retina-quality screenshots. Higher values produce larger images and use more memory. | | **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. | | **`proxy_config`** | `ProxyConfig or dict` (default: `None`)| For advanced or multi-proxy needs, specify `ProxyConfig` object or dict like `{"server": "...", "username": "...", "password": "..."}`. | | **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. | diff --git a/docs/md_v2/core/browser-crawler-config.md b/docs/md_v2/core/browser-crawler-config.md index adf370ea..4b35ee7c 100644 --- a/docs/md_v2/core/browser-crawler-config.md +++ b/docs/md_v2/core/browser-crawler-config.md @@ -84,11 +84,16 @@ class BrowserConfig: ``` - Leave as `None` if a proxy is not required. -7.⠀**`viewport_width` & `viewport_height`** - - The initial window size. +7.⠀**`viewport_width` & `viewport_height`** + - The initial window size. - Some sites behave differently with smaller or bigger viewports. -8.⠀**`verbose`** +8.⠀**`device_scale_factor`** + - Controls the device pixel ratio (DPR) for rendering. Default is `1.0`. + - Set to `2.0` for Retina-quality screenshots (e.g., a 1920×1080 viewport produces 3840×2160 images). + - Higher values increase screenshot size and rendering time proportionally. + +9.⠀**`verbose`** - If `True`, prints extra logs. - Handy for debugging. 
diff --git a/docs/md_v2/core/crawler-result.md b/docs/md_v2/core/crawler-result.md index 116782e1..292f70e3 100644 --- a/docs/md_v2/core/crawler-result.md +++ b/docs/md_v2/core/crawler-result.md @@ -39,6 +39,7 @@ class CrawlResult(BaseModel): ssl_certificate: Optional[SSLCertificate] = None dispatch_result: Optional[DispatchResult] = None redirected_url: Optional[str] = None + redirected_status_code: Optional[int] = None network_requests: Optional[List[Dict[str, Any]]] = None console_messages: Optional[List[Dict[str, Any]]] = None tables: List[Dict] = Field(default_factory=list) @@ -73,6 +74,7 @@ class CrawlResult(BaseModel): | **ssl_certificate (`Optional[SSLCertificate]`)** | SSL certificate info if `fetch_ssl_certificate=True`. | | **dispatch_result (`Optional[DispatchResult]`)** | Additional concurrency and resource usage information when crawling URLs in parallel. | | **redirected_url (`Optional[str]`)** | The URL after any redirects (different from `url` which is the final URL). | +| **redirected_status_code (`Optional[int]`)** | HTTP status code of the final redirect destination (e.g., 200). `None` for non-HTTP requests (raw HTML, local files). | | **network_requests (`Optional[List[Dict[str, Any]]]`)** | List of network requests, responses, and failures captured during the crawl if `capture_network_requests=True`. | | **console_messages (`Optional[List[Dict[str, Any]]]`)** | List of browser console messages captured during the crawl if `capture_console_messages=True`. | | **tables (`List[Dict]`)** | Table data extracted from HTML tables with structure `[{headers, rows, caption, summary}]`. 
| diff --git a/tests/test_pr_1435_redirected_status_code.py b/tests/test_pr_1435_redirected_status_code.py new file mode 100644 index 00000000..f87d1e86 --- /dev/null +++ b/tests/test_pr_1435_redirected_status_code.py @@ -0,0 +1,72 @@ +"""Tests for PR #1435: redirected_status_code in CrawlResult.""" + +import pytest +import pytest_asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from crawl4ai.models import CrawlResult, AsyncCrawlResponse + + +class TestRedirectedStatusCodeModel: + """Test that the field exists and defaults correctly on both models.""" + + def test_crawl_result_default_none(self): + result = CrawlResult(url="http://example.com", html="", success=True) + assert result.redirected_status_code is None + + def test_crawl_result_set_value(self): + result = CrawlResult(url="http://example.com", html="", success=True, redirected_status_code=200) + assert result.redirected_status_code == 200 + + def test_async_crawl_response_default_none(self): + resp = AsyncCrawlResponse(html="", response_headers={}, status_code=200) + assert resp.redirected_status_code is None + + def test_async_crawl_response_set_value(self): + resp = AsyncCrawlResponse(html="", response_headers={}, status_code=200, redirected_status_code=301) + assert resp.redirected_status_code == 301 + + +@pytest.mark.asyncio +async def test_redirected_status_code_on_direct_request(): + """A non-redirected request should have redirected_status_code equal to the final status.""" + browser_config = BrowserConfig(headless=True) + run_config = CrawlerRunConfig() + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://httpbin.org/get", config=run_config) + + assert result.success + # Direct request — redirected_status_code should be the final response status (200) + assert result.redirected_status_code == 200 + + +@pytest.mark.asyncio +async def test_redirected_status_code_on_redirect(): + """A redirected request should capture 
the final destination's status code.""" + browser_config = BrowserConfig(headless=True) + run_config = CrawlerRunConfig() + + # httpbin /redirect/1 does a 302 redirect to /get (which returns 200) + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://httpbin.org/redirect/1", config=run_config) + + assert result.success + # status_code should be 302 (the first hop, per crawl4ai's redirect chain walking) + assert result.status_code == 302 + # redirected_status_code should be 200 (the final destination) + assert result.redirected_status_code == 200 + # redirected_url should point to the final destination + assert "/get" in (result.redirected_url or "") + + +@pytest.mark.asyncio +async def test_redirected_status_code_on_raw_html(): + """Raw HTML input should have redirected_status_code = None (no network request).""" + browser_config = BrowserConfig(headless=True) + run_config = CrawlerRunConfig() + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("raw:test", config=run_config) + + assert result.success + assert result.redirected_status_code is None diff --git a/tests/test_pr_1463_device_scale_factor.py b/tests/test_pr_1463_device_scale_factor.py new file mode 100644 index 00000000..7fbc717a --- /dev/null +++ b/tests/test_pr_1463_device_scale_factor.py @@ -0,0 +1,62 @@ +"""Tests for PR #1463: configurable device_scale_factor in BrowserConfig.""" + +import pytest +import pytest_asyncio +import base64 +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + + +class TestDeviceScaleFactorConfig: + """Test that device_scale_factor flows correctly through BrowserConfig.""" + + def test_default_value(self): + config = BrowserConfig() + assert config.device_scale_factor == 1.0 + + def test_custom_value(self): + config = BrowserConfig(device_scale_factor=2.0) + assert config.device_scale_factor == 2.0 + + def test_to_dict_includes_field(self): + config = 
BrowserConfig(device_scale_factor=3.0) + d = config.to_dict() + assert d["device_scale_factor"] == 3.0 + + def test_clone_preserves(self): + config = BrowserConfig(device_scale_factor=2.5) + cloned = config.clone() + assert cloned.device_scale_factor == 2.5 + + def test_from_kwargs(self): + config = BrowserConfig.from_kwargs({"device_scale_factor": 1.5}) + assert config.device_scale_factor == 1.5 + + def test_from_kwargs_default(self): + config = BrowserConfig.from_kwargs({}) + assert config.device_scale_factor == 1.0 + + +@pytest.mark.asyncio +async def test_device_scale_factor_produces_larger_screenshot(): + """Integration test: higher device_scale_factor should produce a larger screenshot.""" + html = "
<html><body><h1>Scale Test</h1></body></html>
" + raw_url = f"raw:{html}" + run_config = CrawlerRunConfig(screenshot=True) + + # Take screenshot at scale 1.0 + browser_1x = BrowserConfig(headless=True, device_scale_factor=1.0, viewport_width=800, viewport_height=600) + async with AsyncWebCrawler(config=browser_1x) as crawler: + result_1x = await crawler.arun(raw_url, config=run_config) + + # Take screenshot at scale 2.0 + browser_2x = BrowserConfig(headless=True, device_scale_factor=2.0, viewport_width=800, viewport_height=600) + async with AsyncWebCrawler(config=browser_2x) as crawler: + result_2x = await crawler.arun(raw_url, config=run_config) + + assert result_1x.screenshot is not None + assert result_2x.screenshot is not None + + # 2x scale should produce more pixel data (larger base64 string) + size_1x = len(base64.b64decode(result_1x.screenshot)) + size_2x = len(base64.b64decode(result_2x.screenshot)) + assert size_2x > size_1x, f"2x screenshot ({size_2x} bytes) should be larger than 1x ({size_1x} bytes)"