- Add tests for device_scale_factor (config + integration)
- Add tests for redirected_status_code (model + redirect + raw HTML)
- Document device_scale_factor in browser config docs and API reference
- Document redirected_status_code in crawler result docs and API reference
- Add TristanDonze and charlaie to CONTRIBUTORS.md
- Update PR-TODOLIST with session results
This commit is contained in:
@@ -59,8 +59,8 @@
|
||||
| #1668 | microHoffman | Add `--json-ensure-ascii` CLI flag for Unicode handling. Clean, small. | pending |
|
||||
| #1650 | KennyStryker | Add support for Vertex AI in LLM Extraction Strategy. | pending |
|
||||
| #1580 | arpagon | Add Azure OpenAI configuration support to crwl config. | pending |
|
||||
| #1463 | TristanDonze | Add configurable `device_scale_factor` for screenshot quality. 3 files, clean. | pending |
|
||||
| #1435 | charlaie | Add `redirected_status_code` to CrawlResult. 3 files, clean. | pending |
|
||||
| ~~#1463~~ | ~~TristanDonze~~ | ~~Add configurable `device_scale_factor` for screenshot quality. 3 files, clean.~~ | **merged** |
|
||||
| ~~#1435~~ | ~~charlaie~~ | ~~Add `redirected_status_code` to CrawlResult. 3 files, clean.~~ | **merged** |
|
||||
| #1425 | denrusio | Add OpenRouter API support. | pending |
|
||||
| #1417 | NickMandylas | Add CDP headers support for remote browser auth (AWS Bedrock etc). | pending |
|
||||
| #1290 | 130347665 | Support type-list pipeline in JsonElementExtraction (multi-step extract). | pending |
|
||||
@@ -212,3 +212,6 @@
|
||||
| #1696 | majiayu000 | closed: duplicate of #1722 | 2026-02-02 |
|
||||
| #1478 | e1codes | closed: duplicate of #1715 | 2026-02-02 |
|
||||
| #1465 | fardhanrasya | closed: duplicate of #1715 | 2026-02-02 |
|
||||
| #1450 | rbushri | closed: litellm handles response field normalization | 2026-02-06 |
|
||||
| #1463 | TristanDonze | feat: add configurable device_scale_factor for screenshot quality | 2026-02-06 |
|
||||
| #1435 | charlaie | feat: add redirected_status_code to CrawlResult | 2026-02-06 |
|
||||
|
||||
@@ -37,6 +37,8 @@ We would like to thank the following people for their contributions to Crawl4AI:
|
||||
- [stevenaldinger](https://github.com/stevenaldinger) - identified: duplicate PROMPT_EXTRACT_BLOCKS dead code in prompts.py [#931](https://github.com/unclecode/crawl4ai/pull/931)
|
||||
- [chrizzly2309](https://github.com/chrizzly2309) - identified: JWT auth bypass when no credentials provided [#1133](https://github.com/unclecode/crawl4ai/pull/1133)
|
||||
- [complete-dope](https://github.com/complete-dope) - identified: console logging error attribute issue [#729](https://github.com/unclecode/crawl4ai/pull/729)
|
||||
- [TristanDonze](https://github.com/TristanDonze) - feat: add configurable device_scale_factor for screenshot quality [#1463](https://github.com/unclecode/crawl4ai/pull/1463)
|
||||
- [charlaie](https://github.com/charlaie) - feat: add redirected_status_code to CrawlResult [#1435](https://github.com/unclecode/crawl4ai/pull/1435)
|
||||
|
||||
#### Feb-Alpha-1
|
||||
- [sufianuddin](https://github.com/sufianuddin) - fix: [Documentation for JsonCssExtractionStrategy](https://github.com/unclecode/crawl4ai/issues/651)
|
||||
|
||||
@@ -24,6 +24,7 @@ class CrawlResult(BaseModel):
|
||||
session_id: Optional[str] = None
|
||||
response_headers: Optional[dict] = None
|
||||
status_code: Optional[int] = None
|
||||
redirected_status_code: Optional[int] = None
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
dispatch_result: Optional[DispatchResult] = None
|
||||
...
|
||||
@@ -51,14 +52,22 @@ if not result.success:
|
||||
```
|
||||
|
||||
### 1.3 **`status_code`** *(Optional[int])*
|
||||
**What**: The page's HTTP status code (e.g., 200, 404).
|
||||
**What**: The page's HTTP status code (e.g., 200, 404). When the page was reached via redirect, this is the status code of the **first** response in the redirect chain (e.g., 301 or 302).
|
||||
**Usage**:
|
||||
```python
|
||||
if result.status_code == 404:
|
||||
print("Page not found!")
|
||||
```
|
||||
|
||||
### 1.4 **`error_message`** *(Optional[str])*
|
||||
### 1.4 **`redirected_status_code`** *(Optional[int])*
|
||||
**What**: The HTTP status code of the **final** redirect destination. For a 302→200 redirect, `status_code` is 302 and `redirected_status_code` is 200. `None` for non-HTTP requests (raw HTML, local files).
|
||||
**Usage**:
|
||||
```python
|
||||
if result.status_code in (301, 302) and result.redirected_status_code == 200:
|
||||
print(f"Redirected to {result.redirected_url} (OK)")
|
||||
```
|
||||
|
||||
### 1.5 **`error_message`** *(Optional[str])*
|
||||
**What**: If `success=False`, a textual description of the failure.
|
||||
**Usage**:
|
||||
```python
|
||||
|
||||
@@ -29,6 +29,7 @@ browser_cfg = BrowserConfig(
|
||||
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
|
||||
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
|
||||
| **`viewport`** | `dict` (default: `None`) | Viewport dimensions dict. If set, overrides `viewport_width` and `viewport_height`. |
|
||||
| **`device_scale_factor`** | `float` (default: `1.0`) | Device pixel ratio for rendering. Use `2.0` for Retina-quality screenshots. Higher values produce larger images and use more memory. |
|
||||
| **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. |
|
||||
| **`proxy_config`** | `ProxyConfig or dict` (default: `None`)| For advanced or multi-proxy needs, specify `ProxyConfig` object or dict like `{"server": "...", "username": "...", "password": "..."}`. |
|
||||
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
|
||||
|
||||
@@ -88,7 +88,12 @@ class BrowserConfig:
|
||||
- The initial window size.
|
||||
- Some sites behave differently with smaller or bigger viewports.
|
||||
|
||||
8. **`verbose`**
|
||||
8. **`device_scale_factor`**
|
||||
- Controls the device pixel ratio (DPR) for rendering. Default is `1.0`.
|
||||
- Set to `2.0` for Retina-quality screenshots (e.g., a 1920×1080 viewport produces 3840×2160 images).
|
||||
- Higher values increase screenshot size and rendering time proportionally.
|
||||
|
||||
9. **`verbose`**
|
||||
- If `True`, prints extra logs.
|
||||
- Handy for debugging.
|
||||
|
||||
|
||||
@@ -39,6 +39,7 @@ class CrawlResult(BaseModel):
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
dispatch_result: Optional[DispatchResult] = None
|
||||
redirected_url: Optional[str] = None
|
||||
redirected_status_code: Optional[int] = None
|
||||
network_requests: Optional[List[Dict[str, Any]]] = None
|
||||
console_messages: Optional[List[Dict[str, Any]]] = None
|
||||
tables: List[Dict] = Field(default_factory=list)
|
||||
@@ -73,6 +74,7 @@ class CrawlResult(BaseModel):
|
||||
| **ssl_certificate (`Optional[SSLCertificate]`)** | SSL certificate info if `fetch_ssl_certificate=True`. |
|
||||
| **dispatch_result (`Optional[DispatchResult]`)** | Additional concurrency and resource usage information when crawling URLs in parallel. |
|
||||
| **redirected_url (`Optional[str]`)** | The final URL after any redirects (may differ from the originally requested `url`). |
|
||||
| **redirected_status_code (`Optional[int]`)** | HTTP status code of the final redirect destination (e.g., 200). `None` for non-HTTP requests (raw HTML, local files). |
|
||||
| **network_requests (`Optional[List[Dict[str, Any]]]`)** | List of network requests, responses, and failures captured during the crawl if `capture_network_requests=True`. |
|
||||
| **console_messages (`Optional[List[Dict[str, Any]]]`)** | List of browser console messages captured during the crawl if `capture_console_messages=True`. |
|
||||
| **tables (`List[Dict]`)** | Table data extracted from HTML tables with structure `[{headers, rows, caption, summary}]`. |
|
||||
|
||||
72
tests/test_pr_1435_redirected_status_code.py
Normal file
72
tests/test_pr_1435_redirected_status_code.py
Normal file
@@ -0,0 +1,72 @@
|
||||
"""Tests for PR #1435: redirected_status_code in CrawlResult."""
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.models import CrawlResult, AsyncCrawlResponse
|
||||
|
||||
|
||||
class TestRedirectedStatusCodeModel:
|
||||
"""Test that the field exists and defaults correctly on both models."""
|
||||
|
||||
def test_crawl_result_default_none(self):
|
||||
result = CrawlResult(url="http://example.com", html="", success=True)
|
||||
assert result.redirected_status_code is None
|
||||
|
||||
def test_crawl_result_set_value(self):
|
||||
result = CrawlResult(url="http://example.com", html="", success=True, redirected_status_code=200)
|
||||
assert result.redirected_status_code == 200
|
||||
|
||||
def test_async_crawl_response_default_none(self):
|
||||
resp = AsyncCrawlResponse(html="<html></html>", response_headers={}, status_code=200)
|
||||
assert resp.redirected_status_code is None
|
||||
|
||||
def test_async_crawl_response_set_value(self):
|
||||
resp = AsyncCrawlResponse(html="<html></html>", response_headers={}, status_code=200, redirected_status_code=301)
|
||||
assert resp.redirected_status_code == 301
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_redirected_status_code_on_direct_request():
|
||||
"""A non-redirected request should have redirected_status_code equal to the final status."""
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
run_config = CrawlerRunConfig()
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun("https://httpbin.org/get", config=run_config)
|
||||
|
||||
assert result.success
|
||||
# Direct request — redirected_status_code should be the final response status (200)
|
||||
assert result.redirected_status_code == 200
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_redirected_status_code_on_redirect():
|
||||
"""A redirected request should capture the final destination's status code."""
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
run_config = CrawlerRunConfig()
|
||||
|
||||
# httpbin /redirect/1 does a 302 redirect to /get (which returns 200)
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun("https://httpbin.org/redirect/1", config=run_config)
|
||||
|
||||
assert result.success
|
||||
# status_code should be 302 (the first hop, per crawl4ai's redirect chain walking)
|
||||
assert result.status_code == 302
|
||||
# redirected_status_code should be 200 (the final destination)
|
||||
assert result.redirected_status_code == 200
|
||||
# redirected_url should point to the final destination
|
||||
assert "/get" in (result.redirected_url or "")
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_redirected_status_code_on_raw_html():
|
||||
"""Raw HTML input should have redirected_status_code = None (no network request)."""
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
run_config = CrawlerRunConfig()
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun("raw:<html><body>test</body></html>", config=run_config)
|
||||
|
||||
assert result.success
|
||||
assert result.redirected_status_code is None
|
||||
62
tests/test_pr_1463_device_scale_factor.py
Normal file
62
tests/test_pr_1463_device_scale_factor.py
Normal file
@@ -0,0 +1,62 @@
|
||||
"""Tests for PR #1463: configurable device_scale_factor in BrowserConfig."""
|
||||
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
import base64
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
|
||||
class TestDeviceScaleFactorConfig:
|
||||
"""Test that device_scale_factor flows correctly through BrowserConfig."""
|
||||
|
||||
def test_default_value(self):
|
||||
config = BrowserConfig()
|
||||
assert config.device_scale_factor == 1.0
|
||||
|
||||
def test_custom_value(self):
|
||||
config = BrowserConfig(device_scale_factor=2.0)
|
||||
assert config.device_scale_factor == 2.0
|
||||
|
||||
def test_to_dict_includes_field(self):
|
||||
config = BrowserConfig(device_scale_factor=3.0)
|
||||
d = config.to_dict()
|
||||
assert d["device_scale_factor"] == 3.0
|
||||
|
||||
def test_clone_preserves(self):
|
||||
config = BrowserConfig(device_scale_factor=2.5)
|
||||
cloned = config.clone()
|
||||
assert cloned.device_scale_factor == 2.5
|
||||
|
||||
def test_from_kwargs(self):
|
||||
config = BrowserConfig.from_kwargs({"device_scale_factor": 1.5})
|
||||
assert config.device_scale_factor == 1.5
|
||||
|
||||
def test_from_kwargs_default(self):
|
||||
config = BrowserConfig.from_kwargs({})
|
||||
assert config.device_scale_factor == 1.0
|
||||
|
||||
|
||||
@pytest.mark.asyncio
|
||||
async def test_device_scale_factor_produces_larger_screenshot():
|
||||
"""Integration test: higher device_scale_factor should produce a larger screenshot."""
|
||||
html = "<html><body><h1>Scale Test</h1></body></html>"
|
||||
raw_url = f"raw:{html}"
|
||||
run_config = CrawlerRunConfig(screenshot=True)
|
||||
|
||||
# Take screenshot at scale 1.0
|
||||
browser_1x = BrowserConfig(headless=True, device_scale_factor=1.0, viewport_width=800, viewport_height=600)
|
||||
async with AsyncWebCrawler(config=browser_1x) as crawler:
|
||||
result_1x = await crawler.arun(raw_url, config=run_config)
|
||||
|
||||
# Take screenshot at scale 2.0
|
||||
browser_2x = BrowserConfig(headless=True, device_scale_factor=2.0, viewport_width=800, viewport_height=600)
|
||||
async with AsyncWebCrawler(config=browser_2x) as crawler:
|
||||
result_2x = await crawler.arun(raw_url, config=run_config)
|
||||
|
||||
assert result_1x.screenshot is not None
|
||||
assert result_2x.screenshot is not None
|
||||
|
||||
# 2x scale should produce more pixel data (larger base64 string)
|
||||
size_1x = len(base64.b64decode(result_1x.screenshot))
|
||||
size_2x = len(base64.b64decode(result_2x.screenshot))
|
||||
assert size_2x > size_1x, f"2x screenshot ({size_2x} bytes) should be larger than 1x ({size_1x} bytes)"
|
||||
Reference in New Issue
Block a user