- Add tests for device_scale_factor (config + integration) - Add tests for redirected_status_code (model + redirect + raw HTML) - Document device_scale_factor in browser config docs and API reference - Document redirected_status_code in crawler result docs and API reference - Add TristanDonze and charlaie to CONTRIBUTORS.md - Update PR-TODOLIST with session results
This commit is contained in:
@@ -59,8 +59,8 @@
|
|||||||
| #1668 | microHoffman | Add `--json-ensure-ascii` CLI flag for Unicode handling. Clean, small. | pending |
|
| #1668 | microHoffman | Add `--json-ensure-ascii` CLI flag for Unicode handling. Clean, small. | pending |
|
||||||
| #1650 | KennyStryker | Add support for Vertex AI in LLM Extraction Strategy. | pending |
|
| #1650 | KennyStryker | Add support for Vertex AI in LLM Extraction Strategy. | pending |
|
||||||
| #1580 | arpagon | Add Azure OpenAI configuration support to crwl config. | pending |
|
| #1580 | arpagon | Add Azure OpenAI configuration support to crwl config. | pending |
|
||||||
| #1463 | TristanDonze | Add configurable `device_scale_factor` for screenshot quality. 3 files, clean. | pending |
|
| ~~#1463~~ | ~~TristanDonze~~ | ~~Add configurable `device_scale_factor` for screenshot quality. 3 files, clean.~~ | **merged** |
|
||||||
| #1435 | charlaie | Add `redirected_status_code` to CrawlResult. 3 files, clean. | pending |
|
| ~~#1435~~ | ~~charlaie~~ | ~~Add `redirected_status_code` to CrawlResult. 3 files, clean.~~ | **merged** |
|
||||||
| #1425 | denrusio | Add OpenRouter API support. | pending |
|
| #1425 | denrusio | Add OpenRouter API support. | pending |
|
||||||
| #1417 | NickMandylas | Add CDP headers support for remote browser auth (AWS Bedrock etc). | pending |
|
| #1417 | NickMandylas | Add CDP headers support for remote browser auth (AWS Bedrock etc). | pending |
|
||||||
| #1290 | 130347665 | Support type-list pipeline in JsonElementExtraction (multi-step extract). | pending |
|
| #1290 | 130347665 | Support type-list pipeline in JsonElementExtraction (multi-step extract). | pending |
|
||||||
@@ -212,3 +212,6 @@
|
|||||||
| #1696 | majiayu000 | closed: duplicate of #1722 | 2026-02-02 |
|
| #1696 | majiayu000 | closed: duplicate of #1722 | 2026-02-02 |
|
||||||
| #1478 | e1codes | closed: duplicate of #1715 | 2026-02-02 |
|
| #1478 | e1codes | closed: duplicate of #1715 | 2026-02-02 |
|
||||||
| #1465 | fardhanrasya | closed: duplicate of #1715 | 2026-02-02 |
|
| #1465 | fardhanrasya | closed: duplicate of #1715 | 2026-02-02 |
|
||||||
|
| #1450 | rbushri | closed: litellm handles response field normalization | 2026-02-06 |
|
||||||
|
| #1463 | TristanDonze | feat: add configurable device_scale_factor for screenshot quality | 2026-02-06 |
|
||||||
|
| #1435 | charlaie | feat: add redirected_status_code to CrawlResult | 2026-02-06 |
|
||||||
|
|||||||
@@ -37,6 +37,8 @@ We would like to thank the following people for their contributions to Crawl4AI:
|
|||||||
- [stevenaldinger](https://github.com/stevenaldinger) - identified: duplicate PROMPT_EXTRACT_BLOCKS dead code in prompts.py [#931](https://github.com/unclecode/crawl4ai/pull/931)
|
- [stevenaldinger](https://github.com/stevenaldinger) - identified: duplicate PROMPT_EXTRACT_BLOCKS dead code in prompts.py [#931](https://github.com/unclecode/crawl4ai/pull/931)
|
||||||
- [chrizzly2309](https://github.com/chrizzly2309) - identified: JWT auth bypass when no credentials provided [#1133](https://github.com/unclecode/crawl4ai/pull/1133)
|
- [chrizzly2309](https://github.com/chrizzly2309) - identified: JWT auth bypass when no credentials provided [#1133](https://github.com/unclecode/crawl4ai/pull/1133)
|
||||||
- [complete-dope](https://github.com/complete-dope) - identified: console logging error attribute issue [#729](https://github.com/unclecode/crawl4ai/pull/729)
|
- [complete-dope](https://github.com/complete-dope) - identified: console logging error attribute issue [#729](https://github.com/unclecode/crawl4ai/pull/729)
|
||||||
|
- [TristanDonze](https://github.com/TristanDonze) - feat: add configurable device_scale_factor for screenshot quality [#1463](https://github.com/unclecode/crawl4ai/pull/1463)
|
||||||
|
- [charlaie](https://github.com/charlaie) - feat: add redirected_status_code to CrawlResult [#1435](https://github.com/unclecode/crawl4ai/pull/1435)
|
||||||
|
|
||||||
#### Feb-Alpha-1
|
#### Feb-Alpha-1
|
||||||
- [sufianuddin](https://github.com/sufianuddin) - fix: [Documentation for JsonCssExtractionStrategy](https://github.com/unclecode/crawl4ai/issues/651)
|
- [sufianuddin](https://github.com/sufianuddin) - fix: [Documentation for JsonCssExtractionStrategy](https://github.com/unclecode/crawl4ai/issues/651)
|
||||||
|
|||||||
@@ -24,6 +24,7 @@ class CrawlResult(BaseModel):
|
|||||||
session_id: Optional[str] = None
|
session_id: Optional[str] = None
|
||||||
response_headers: Optional[dict] = None
|
response_headers: Optional[dict] = None
|
||||||
status_code: Optional[int] = None
|
status_code: Optional[int] = None
|
||||||
|
redirected_status_code: Optional[int] = None
|
||||||
ssl_certificate: Optional[SSLCertificate] = None
|
ssl_certificate: Optional[SSLCertificate] = None
|
||||||
dispatch_result: Optional[DispatchResult] = None
|
dispatch_result: Optional[DispatchResult] = None
|
||||||
...
|
...
|
||||||
@@ -50,15 +51,23 @@ if not result.success:
|
|||||||
print(f"Crawl failed: {result.error_message}")
|
print(f"Crawl failed: {result.error_message}")
|
||||||
```
|
```
|
||||||
|
|
||||||
### 1.3 **`status_code`** *(Optional[int])*
|
### 1.3 **`status_code`** *(Optional[int])*
|
||||||
**What**: The page's HTTP status code (e.g., 200, 404).
|
**What**: The page's HTTP status code (e.g., 200, 404). When the page was reached via redirect, this is the status code of the **first** response in the redirect chain (e.g., 301 or 302).
|
||||||
**Usage**:
|
**Usage**:
|
||||||
```python
|
```python
|
||||||
if result.status_code == 404:
|
if result.status_code == 404:
|
||||||
print("Page not found!")
|
print("Page not found!")
|
||||||
```
|
```
|
||||||
|
|
||||||
### 1.4 **`error_message`** *(Optional[str])*
|
### 1.4 **`redirected_status_code`** *(Optional[int])*
|
||||||
|
**What**: The HTTP status code of the **final** redirect destination. For a 302→200 redirect, `status_code` is 302 and `redirected_status_code` is 200. `None` for non-HTTP requests (raw HTML, local files).
|
||||||
|
**Usage**:
|
||||||
|
```python
|
||||||
|
if result.status_code in (301, 302) and result.redirected_status_code == 200:
|
||||||
|
print(f"Redirected to {result.redirected_url} (OK)")
|
||||||
|
```
|
||||||
|
|
||||||
|
### 1.5 **`error_message`** *(Optional[str])*
|
||||||
**What**: If `success=False`, a textual description of the failure.
|
**What**: If `success=False`, a textual description of the failure.
|
||||||
**Usage**:
|
**Usage**:
|
||||||
```python
|
```python
|
||||||
|
|||||||
@@ -29,6 +29,7 @@ browser_cfg = BrowserConfig(
|
|||||||
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
|
| **`viewport_width`** | `int` (default: `1080`) | Initial page width (in px). Useful for testing responsive layouts. |
|
||||||
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
|
| **`viewport_height`** | `int` (default: `600`) | Initial page height (in px). |
|
||||||
| **`viewport`** | `dict` (default: `None`) | Viewport dimensions dict. If set, overrides `viewport_width` and `viewport_height`. |
|
| **`viewport`** | `dict` (default: `None`) | Viewport dimensions dict. If set, overrides `viewport_width` and `viewport_height`. |
|
||||||
|
| **`device_scale_factor`** | `float` (default: `1.0`) | Device pixel ratio for rendering. Use `2.0` for Retina-quality screenshots. Higher values produce larger images and use more memory. |
|
||||||
| **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. |
|
| **`proxy`** | `str` (deprecated) | Deprecated. Use `proxy_config` instead. If set, it will be auto-converted internally. |
|
||||||
| **`proxy_config`** | `ProxyConfig or dict` (default: `None`)| For advanced or multi-proxy needs, specify `ProxyConfig` object or dict like `{"server": "...", "username": "...", "password": "..."}`. |
|
| **`proxy_config`** | `ProxyConfig or dict` (default: `None`)| For advanced or multi-proxy needs, specify `ProxyConfig` object or dict like `{"server": "...", "username": "...", "password": "..."}`. |
|
||||||
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
|
| **`use_persistent_context`** | `bool` (default: `False`) | If `True`, uses a **persistent** browser context (keep cookies, sessions across runs). Also sets `use_managed_browser=True`. |
|
||||||
|
|||||||
@@ -84,11 +84,16 @@ class BrowserConfig:
|
|||||||
```
|
```
|
||||||
- Leave as `None` if a proxy is not required.
|
- Leave as `None` if a proxy is not required.
|
||||||
|
|
||||||
7.⠀**`viewport_width` & `viewport_height`**
|
7.⠀**`viewport_width` & `viewport_height`**
|
||||||
- The initial window size.
|
- The initial window size.
|
||||||
- Some sites behave differently with smaller or bigger viewports.
|
- Some sites behave differently with smaller or bigger viewports.
|
||||||
|
|
||||||
8.⠀**`verbose`**
|
8.⠀**`device_scale_factor`**
|
||||||
|
- Controls the device pixel ratio (DPR) for rendering. Default is `1.0`.
|
||||||
|
- Set to `2.0` for Retina-quality screenshots (e.g., a 1920×1080 viewport produces 3840×2160 images).
|
||||||
|
- Higher values increase screenshot size and rendering time proportionally.
|
||||||
|
|
||||||
|
9.⠀**`verbose`**
|
||||||
- If `True`, prints extra logs.
|
- If `True`, prints extra logs.
|
||||||
- Handy for debugging.
|
- Handy for debugging.
|
||||||
|
|
||||||
|
|||||||
@@ -39,6 +39,7 @@ class CrawlResult(BaseModel):
|
|||||||
ssl_certificate: Optional[SSLCertificate] = None
|
ssl_certificate: Optional[SSLCertificate] = None
|
||||||
dispatch_result: Optional[DispatchResult] = None
|
dispatch_result: Optional[DispatchResult] = None
|
||||||
redirected_url: Optional[str] = None
|
redirected_url: Optional[str] = None
|
||||||
|
redirected_status_code: Optional[int] = None
|
||||||
network_requests: Optional[List[Dict[str, Any]]] = None
|
network_requests: Optional[List[Dict[str, Any]]] = None
|
||||||
console_messages: Optional[List[Dict[str, Any]]] = None
|
console_messages: Optional[List[Dict[str, Any]]] = None
|
||||||
tables: List[Dict] = Field(default_factory=list)
|
tables: List[Dict] = Field(default_factory=list)
|
||||||
@@ -73,6 +74,7 @@ class CrawlResult(BaseModel):
|
|||||||
| **ssl_certificate (`Optional[SSLCertificate]`)** | SSL certificate info if `fetch_ssl_certificate=True`. |
|
| **ssl_certificate (`Optional[SSLCertificate]`)** | SSL certificate info if `fetch_ssl_certificate=True`. |
|
||||||
| **dispatch_result (`Optional[DispatchResult]`)** | Additional concurrency and resource usage information when crawling URLs in parallel. |
|
| **dispatch_result (`Optional[DispatchResult]`)** | Additional concurrency and resource usage information when crawling URLs in parallel. |
|
||||||
| **redirected_url (`Optional[str]`)** | The final URL after any redirects (may differ from `url`, the originally requested URL). |
|
| **redirected_url (`Optional[str]`)** | The final URL after any redirects (may differ from `url`, the originally requested URL). |
|
||||||
|
| **redirected_status_code (`Optional[int]`)** | HTTP status code of the final redirect destination (e.g., 200). `None` for non-HTTP requests (raw HTML, local files). |
|
||||||
| **network_requests (`Optional[List[Dict[str, Any]]]`)** | List of network requests, responses, and failures captured during the crawl if `capture_network_requests=True`. |
|
| **network_requests (`Optional[List[Dict[str, Any]]]`)** | List of network requests, responses, and failures captured during the crawl if `capture_network_requests=True`. |
|
||||||
| **console_messages (`Optional[List[Dict[str, Any]]]`)** | List of browser console messages captured during the crawl if `capture_console_messages=True`. |
|
| **console_messages (`Optional[List[Dict[str, Any]]]`)** | List of browser console messages captured during the crawl if `capture_console_messages=True`. |
|
||||||
| **tables (`List[Dict]`)** | Table data extracted from HTML tables with structure `[{headers, rows, caption, summary}]`. |
|
| **tables (`List[Dict]`)** | Table data extracted from HTML tables with structure `[{headers, rows, caption, summary}]`. |
|
||||||
|
|||||||
72
tests/test_pr_1435_redirected_status_code.py
Normal file
72
tests/test_pr_1435_redirected_status_code.py
Normal file
@@ -0,0 +1,72 @@
|
|||||||
|
"""Tests for PR #1435: redirected_status_code in CrawlResult."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import pytest_asyncio
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
|
from crawl4ai.models import CrawlResult, AsyncCrawlResponse
|
||||||
|
|
||||||
|
|
||||||
|
class TestRedirectedStatusCodeModel:
    """The `redirected_status_code` field exists and defaults correctly on both models."""

    def test_crawl_result_default_none(self):
        # Omitting the field must leave it as None.
        crawl_result = CrawlResult(url="http://example.com", html="", success=True)
        assert crawl_result.redirected_status_code is None

    def test_crawl_result_set_value(self):
        # An explicitly supplied value must round-trip unchanged.
        crawl_result = CrawlResult(
            url="http://example.com",
            html="",
            success=True,
            redirected_status_code=200,
        )
        assert crawl_result.redirected_status_code == 200

    def test_async_crawl_response_default_none(self):
        # Same default behaviour on the lower-level response model.
        response = AsyncCrawlResponse(
            html="<html></html>", response_headers={}, status_code=200
        )
        assert response.redirected_status_code is None

    def test_async_crawl_response_set_value(self):
        response = AsyncCrawlResponse(
            html="<html></html>",
            response_headers={},
            status_code=200,
            redirected_status_code=301,
        )
        assert response.redirected_status_code == 301
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_redirected_status_code_on_direct_request():
    """A non-redirected request should have redirected_status_code equal to the final status."""
    browser_cfg = BrowserConfig(headless=True)
    crawl_cfg = CrawlerRunConfig()

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        outcome = await crawler.arun("https://httpbin.org/get", config=crawl_cfg)

    assert outcome.success
    # No redirect happened, so the "final destination" status is just the
    # direct response status (200).
    assert outcome.redirected_status_code == 200
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_redirected_status_code_on_redirect():
    """A redirected request should capture the final destination's status code."""
    browser_cfg = BrowserConfig(headless=True)
    crawl_cfg = CrawlerRunConfig()

    # httpbin /redirect/1 does a 302 redirect to /get (which returns 200)
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        outcome = await crawler.arun("https://httpbin.org/redirect/1", config=crawl_cfg)

    assert outcome.success
    # First hop of the chain (crawl4ai walks the redirect chain backwards).
    assert outcome.status_code == 302
    # Final destination of the chain.
    assert outcome.redirected_status_code == 200
    # And the final URL should be the redirect target.
    assert "/get" in (outcome.redirected_url or "")
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_redirected_status_code_on_raw_html():
    """Raw HTML input should have redirected_status_code = None (no network request)."""
    browser_cfg = BrowserConfig(headless=True)
    crawl_cfg = CrawlerRunConfig()

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        outcome = await crawler.arun("raw:<html><body>test</body></html>", config=crawl_cfg)

    assert outcome.success
    # raw: inputs never hit the network, so there is no redirect chain at all.
    assert outcome.redirected_status_code is None
|
||||||
62
tests/test_pr_1463_device_scale_factor.py
Normal file
62
tests/test_pr_1463_device_scale_factor.py
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
"""Tests for PR #1463: configurable device_scale_factor in BrowserConfig."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
import pytest_asyncio
|
||||||
|
import base64
|
||||||
|
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||||
|
|
||||||
|
|
||||||
|
class TestDeviceScaleFactorConfig:
    """device_scale_factor flows correctly through every BrowserConfig pathway."""

    def test_default_value(self):
        # Unset -> documented default of 1.0.
        assert BrowserConfig().device_scale_factor == 1.0

    def test_custom_value(self):
        # Constructor keyword is honoured.
        assert BrowserConfig(device_scale_factor=2.0).device_scale_factor == 2.0

    def test_to_dict_includes_field(self):
        # Serialization must carry the field.
        cfg = BrowserConfig(device_scale_factor=3.0)
        serialized = cfg.to_dict()
        assert serialized["device_scale_factor"] == 3.0

    def test_clone_preserves(self):
        # clone() must not reset the value to the default.
        original = BrowserConfig(device_scale_factor=2.5)
        assert original.clone().device_scale_factor == 2.5

    def test_from_kwargs(self):
        # Dict-based construction path.
        cfg = BrowserConfig.from_kwargs({"device_scale_factor": 1.5})
        assert cfg.device_scale_factor == 1.5

    def test_from_kwargs_default(self):
        # Dict-based construction without the key falls back to the default.
        cfg = BrowserConfig.from_kwargs({})
        assert cfg.device_scale_factor == 1.0
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.mark.asyncio
async def test_device_scale_factor_produces_larger_screenshot():
    """Integration test: higher device_scale_factor should scale up the screenshot.

    Improvement over a plain byte-size comparison: compressed size is an
    unreliable proxy for pixel count (PNG compression ratios vary), so when the
    screenshot is a PNG we parse the actual pixel dimensions from the IHDR
    chunk and assert exact 2x scaling. The size comparison is kept only as a
    fallback for non-PNG output.
    """
    import struct

    def _png_dimensions(data):
        # Return (width, height) from the PNG IHDR chunk, or None if `data`
        # is not a PNG. IHDR is always the first chunk: width and height are
        # big-endian u32 values at byte offsets 16 and 20.
        if len(data) < 24 or data[:8] != b"\x89PNG\r\n\x1a\n":
            return None
        return struct.unpack(">II", data[16:24])

    html = "<html><body><h1>Scale Test</h1></body></html>"
    raw_url = f"raw:{html}"
    run_config = CrawlerRunConfig(screenshot=True)

    # Take screenshot at scale 1.0
    browser_1x = BrowserConfig(headless=True, device_scale_factor=1.0, viewport_width=800, viewport_height=600)
    async with AsyncWebCrawler(config=browser_1x) as crawler:
        result_1x = await crawler.arun(raw_url, config=run_config)

    # Take screenshot at scale 2.0
    browser_2x = BrowserConfig(headless=True, device_scale_factor=2.0, viewport_width=800, viewport_height=600)
    async with AsyncWebCrawler(config=browser_2x) as crawler:
        result_2x = await crawler.arun(raw_url, config=run_config)

    assert result_1x.screenshot is not None
    assert result_2x.screenshot is not None

    bytes_1x = base64.b64decode(result_1x.screenshot)
    bytes_2x = base64.b64decode(result_2x.screenshot)

    dims_1x = _png_dimensions(bytes_1x)
    dims_2x = _png_dimensions(bytes_2x)
    if dims_1x and dims_2x:
        # Exact check: doubling the DPR doubles both pixel dimensions for the
        # same CSS viewport.
        assert dims_2x[0] == 2 * dims_1x[0], f"2x width {dims_2x[0]} should be twice 1x width {dims_1x[0]}"
        assert dims_2x[1] == 2 * dims_1x[1], f"2x height {dims_2x[1]} should be twice 1x height {dims_1x[1]}"
    else:
        # Fallback for non-PNG screenshot formats: more pixels should still
        # mean more encoded data.
        size_1x, size_2x = len(bytes_1x), len(bytes_2x)
        assert size_2x > size_1x, f"2x screenshot ({size_2x} bytes) should be larger than 1x ({size_1x} bytes)"
|
||||||
Reference in New Issue
Block a user