docs(api): add streaming mode documentation and examples
Add comprehensive documentation for the new streaming mode feature in arun_many(): - Update arun_many() API docs to reflect streaming return type - Add streaming examples in quickstart and multi-url guides - Document stream parameter in configuration classes - Add clone() helper method documentation for configs This change improves documentation for processing large numbers of URLs efficiently.
This commit is contained in:
@@ -93,44 +93,75 @@ dispatcher = SemaphoreDispatcher(
|
|||||||
|
|
||||||
## 4. Usage Examples
|
## 4. Usage Examples
|
||||||
|
|
||||||
### 4.1 Simple Usage (Default MemoryAdaptiveDispatcher)
|
### 4.1 Batch Processing (Default)
|
||||||
|
|
||||||
```python
|
```python
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
async def crawl_batch():
|
||||||
results = await crawler.arun_many(urls, config=run_config)
|
|
||||||
```
|
|
||||||
|
|
||||||
### 4.2 Memory Adaptive with Rate Limiting
|
|
||||||
|
|
||||||
```python
|
|
||||||
async def crawl_with_memory_adaptive(urls):
|
|
||||||
browser_config = BrowserConfig(headless=True, verbose=False)
|
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||||
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
run_config = CrawlerRunConfig(
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
stream=False # Default: get all results at once
|
||||||
|
)
|
||||||
|
|
||||||
dispatcher = MemoryAdaptiveDispatcher(
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
memory_threshold_percent=70.0,
|
memory_threshold_percent=70.0,
|
||||||
|
check_interval=1.0,
|
||||||
max_session_permit=10,
|
max_session_permit=10,
|
||||||
rate_limiter=RateLimiter(
|
|
||||||
base_delay=(1.0, 2.0),
|
|
||||||
max_delay=30.0,
|
|
||||||
max_retries=2
|
|
||||||
),
|
|
||||||
monitor=CrawlerMonitor(
|
monitor=CrawlerMonitor(
|
||||||
max_visible_rows=15,
|
|
||||||
display_mode=DisplayMode.DETAILED
|
display_mode=DisplayMode.DETAILED
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
# Get all results at once
|
||||||
results = await crawler.arun_many(
|
results = await crawler.arun_many(
|
||||||
urls,
|
urls=urls,
|
||||||
config=run_config,
|
config=run_config,
|
||||||
dispatcher=dispatcher
|
dispatcher=dispatcher
|
||||||
)
|
)
|
||||||
return results
|
|
||||||
|
# Process all results after completion
|
||||||
|
for result in results:
|
||||||
|
if result.success:
|
||||||
|
await process_result(result)
|
||||||
|
else:
|
||||||
|
print(f"Failed to crawl {result.url}: {result.error_message}")
|
||||||
```
|
```
|
||||||
|
|
||||||
### 4.3 Semaphore with Rate Limiting
|
### 4.2 Streaming Mode
|
||||||
|
|
||||||
|
```python
|
||||||
|
async def crawl_streaming():
|
||||||
|
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||||
|
run_config = CrawlerRunConfig(
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
stream=True # Enable streaming mode
|
||||||
|
)
|
||||||
|
|
||||||
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
memory_threshold_percent=70.0,
|
||||||
|
check_interval=1.0,
|
||||||
|
max_session_permit=10,
|
||||||
|
monitor=CrawlerMonitor(
|
||||||
|
display_mode=DisplayMode.DETAILED
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
|
# Process results as they become available
|
||||||
|
async for result in await crawler.arun_many(
|
||||||
|
urls=urls,
|
||||||
|
config=run_config,
|
||||||
|
dispatcher=dispatcher
|
||||||
|
):
|
||||||
|
if result.success:
|
||||||
|
# Process each result immediately
|
||||||
|
await process_result(result)
|
||||||
|
else:
|
||||||
|
print(f"Failed to crawl {result.url}: {result.error_message}")
|
||||||
|
```
|
||||||
|
|
||||||
|
### 4.3 Semaphore-based Crawling
|
||||||
|
|
||||||
```python
|
```python
|
||||||
async def crawl_with_semaphore(urls):
|
async def crawl_with_semaphore(urls):
|
||||||
|
|||||||
@@ -10,7 +10,7 @@ async def arun_many(
|
|||||||
config: Optional[CrawlerRunConfig] = None,
|
config: Optional[CrawlerRunConfig] = None,
|
||||||
dispatcher: Optional[BaseDispatcher] = None,
|
dispatcher: Optional[BaseDispatcher] = None,
|
||||||
...
|
...
|
||||||
) -> List[CrawlResult]:
|
) -> Union[List[CrawlResult], AsyncGenerator[CrawlResult, None]]:
|
||||||
"""
|
"""
|
||||||
Crawl multiple URLs concurrently or in batches.
|
Crawl multiple URLs concurrently or in batches.
|
||||||
|
|
||||||
@@ -18,7 +18,7 @@ async def arun_many(
|
|||||||
:param config: (Optional) A default `CrawlerRunConfig` applying to each crawl.
|
:param config: (Optional) A default `CrawlerRunConfig` applying to each crawl.
|
||||||
:param dispatcher: (Optional) A concurrency controller (e.g. MemoryAdaptiveDispatcher).
|
:param dispatcher: (Optional) A concurrency controller (e.g. MemoryAdaptiveDispatcher).
|
||||||
...
|
...
|
||||||
:return: A list of `CrawlResult` objects, one per URL.
|
:return: Either a list of `CrawlResult` objects, or an async generator if streaming is enabled.
|
||||||
"""
|
"""
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -26,24 +26,29 @@ async def arun_many(
|
|||||||
|
|
||||||
1. **Multiple URLs**:
|
1. **Multiple URLs**:
|
||||||
- Instead of crawling a single URL, you pass a list of them (strings or tasks).
|
- Instead of crawling a single URL, you pass a list of them (strings or tasks).
|
||||||
- The function returns a **list** of `CrawlResult`, in the same order as `urls`.
|
- The function returns either a **list** of `CrawlResult` or an **async generator** if streaming is enabled.
|
||||||
|
|
||||||
2. **Concurrency & Dispatchers**:
|
2. **Concurrency & Dispatchers**:
|
||||||
- **`dispatcher`** param allows advanced concurrency control.
|
- **`dispatcher`** param allows advanced concurrency control.
|
||||||
- If omitted, a default dispatcher (like `MemoryAdaptiveDispatcher`) is used internally.
|
- If omitted, a default dispatcher (like `MemoryAdaptiveDispatcher`) is used internally.
|
||||||
- Dispatchers handle concurrency, rate limiting, and memory-based adaptive throttling (see [Multi-URL Crawling](../advanced/multi-url-crawling.md)).
|
- Dispatchers handle concurrency, rate limiting, and memory-based adaptive throttling (see [Multi-URL Crawling](../advanced/multi-url-crawling.md)).
|
||||||
|
|
||||||
3. **Parallel Execution**:
|
3. **Streaming Support**:
|
||||||
|
- Enable streaming by setting `stream=True` in your `CrawlerRunConfig`.
|
||||||
|
- When streaming, use `async for` to process results as they become available.
|
||||||
|
- Ideal for processing large numbers of URLs without waiting for all to complete.
|
||||||
|
|
||||||
|
4. **Parallel Execution**:
|
||||||
- `arun_many()` can run multiple requests concurrently under the hood.
|
- `arun_many()` can run multiple requests concurrently under the hood.
|
||||||
- Each `CrawlResult` might also include a **`dispatch_result`** with concurrency details (like memory usage, start/end times).
|
- Each `CrawlResult` might also include a **`dispatch_result`** with concurrency details (like memory usage, start/end times).
|
||||||
|
|
||||||
### Basic Example
|
### Basic Example (Batch Mode)
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Minimal usage: The default dispatcher will be used
|
# Minimal usage: The default dispatcher will be used
|
||||||
results = await crawler.arun_many(
|
results = await crawler.arun_many(
|
||||||
urls=["https://site1.com", "https://site2.com"],
|
urls=["https://site1.com", "https://site2.com"],
|
||||||
config=my_run_config
|
config=CrawlerRunConfig(stream=False) # Default behavior
|
||||||
)
|
)
|
||||||
|
|
||||||
for res in results:
|
for res in results:
|
||||||
@@ -53,6 +58,25 @@ for res in results:
|
|||||||
print("Failed:", res.url, "-", res.error_message)
|
print("Failed:", res.url, "-", res.error_message)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Streaming Example
|
||||||
|
|
||||||
|
```python
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
stream=True, # Enable streaming mode
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process results as they complete
|
||||||
|
async for result in await crawler.arun_many(
|
||||||
|
urls=["https://site1.com", "https://site2.com", "https://site3.com"],
|
||||||
|
config=config
|
||||||
|
):
|
||||||
|
if result.success:
|
||||||
|
print(f"Just completed: {result.url}")
|
||||||
|
# Process each result immediately
|
||||||
|
process_result(result)
|
||||||
|
```
|
||||||
|
|
||||||
### With a Custom Dispatcher
|
### With a Custom Dispatcher
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@@ -74,7 +98,7 @@ results = await crawler.arun_many(
|
|||||||
|
|
||||||
### Return Value
|
### Return Value
|
||||||
|
|
||||||
A **list** of [`CrawlResult`](./crawl-result.md) objects, one per URL. You can iterate to check `result.success` or read each item’s `extracted_content`, `markdown`, or `dispatch_result`.
|
Either a **list** of [`CrawlResult`](./crawl-result.md) objects, or an **async generator** if streaming is enabled. You can iterate to check `result.success` or read each item’s `extracted_content`, `markdown`, or `dispatch_result`.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
|
|||||||
@@ -56,6 +56,7 @@ run_cfg = CrawlerRunConfig(
|
|||||||
word_count_threshold=15,
|
word_count_threshold=15,
|
||||||
excluded_tags=["nav", "footer"],
|
excluded_tags=["nav", "footer"],
|
||||||
exclude_external_links=True,
|
exclude_external_links=True,
|
||||||
|
stream=True, # Enable streaming for arun_many()
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -191,7 +192,28 @@ The `RateLimitConfig` class has these fields:
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 2.2 Example Usage
|
## 2.2 Helper Methods
|
||||||
|
|
||||||
|
Both `BrowserConfig` and `CrawlerRunConfig` provide a `clone()` method to create modified copies:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Create a base configuration
|
||||||
|
base_config = CrawlerRunConfig(
|
||||||
|
cache_mode=CacheMode.ENABLED,
|
||||||
|
word_count_threshold=200
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create variations using clone()
|
||||||
|
stream_config = base_config.clone(stream=True)
|
||||||
|
no_cache_config = base_config.clone(
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
stream=True
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
The `clone()` method is particularly useful when you need slightly different configurations for different use cases, without modifying the original config.
|
||||||
|
|
||||||
|
## 2.3 Example Usage
|
||||||
|
|
||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
@@ -226,7 +248,8 @@ async def main():
|
|||||||
memory_threshold_percent=70.0,
|
memory_threshold_percent=70.0,
|
||||||
check_interval=1.0,
|
check_interval=1.0,
|
||||||
max_session_permit=20,
|
max_session_permit=20,
|
||||||
display_mode="DETAILED"
|
display_mode="DETAILED",
|
||||||
|
stream=True
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||||
@@ -259,3 +282,10 @@ if __name__ == "__main__":
|
|||||||
- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.
|
- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.
|
||||||
- **Use** `CrawlerRunConfig` for each crawl’s **context**: how to filter content, handle caching, wait for dynamic elements, or run JS.
|
- **Use** `CrawlerRunConfig` for each crawl’s **context**: how to filter content, handle caching, wait for dynamic elements, or run JS.
|
||||||
- **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`).
|
- **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`).
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Create a modified copy with the clone() method
|
||||||
|
stream_cfg = run_cfg.clone(
|
||||||
|
stream=True,
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
|||||||
@@ -85,6 +85,25 @@ class BrowserConfig:
|
|||||||
- Additional flags for the underlying browser.
|
- Additional flags for the underlying browser.
|
||||||
- E.g. `["--disable-extensions"]`.
|
- E.g. `["--disable-extensions"]`.
|
||||||
|
|
||||||
|
### Helper Methods
|
||||||
|
|
||||||
|
Both configuration classes provide a `clone()` method to create modified copies:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Create a base browser config
|
||||||
|
base_browser = BrowserConfig(
|
||||||
|
browser_type="chromium",
|
||||||
|
headless=True,
|
||||||
|
text_mode=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create a visible browser config for debugging
|
||||||
|
debug_browser = base_browser.clone(
|
||||||
|
headless=False,
|
||||||
|
verbose=True
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
**Minimal Example**:
|
**Minimal Example**:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
@@ -123,6 +142,7 @@ class CrawlerRunConfig:
|
|||||||
max_session_permit=20,
|
max_session_permit=20,
|
||||||
display_mode=None,
|
display_mode=None,
|
||||||
verbose=True,
|
verbose=True,
|
||||||
|
stream=False, # Enable streaming for arun_many()
|
||||||
# ... other advanced parameters omitted
|
# ... other advanced parameters omitted
|
||||||
):
|
):
|
||||||
...
|
...
|
||||||
@@ -186,6 +206,36 @@ class CrawlerRunConfig:
|
|||||||
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
|
- The display mode for progress information (`DETAILED`, `BRIEF`, etc.).
|
||||||
- Affects how much information is printed during the crawl.
|
- Affects how much information is printed during the crawl.
|
||||||
|
|
||||||
|
### Helper Methods
|
||||||
|
|
||||||
|
The `clone()` method is particularly useful for creating variations of your crawler configuration:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Create a base configuration
|
||||||
|
base_config = CrawlerRunConfig(
|
||||||
|
cache_mode=CacheMode.ENABLED,
|
||||||
|
word_count_threshold=200,
|
||||||
|
wait_until="networkidle"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Create variations for different use cases
|
||||||
|
stream_config = base_config.clone(
|
||||||
|
stream=True, # Enable streaming mode
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
|
||||||
|
debug_config = base_config.clone(
|
||||||
|
page_timeout=120000, # Longer timeout for debugging
|
||||||
|
verbose=True
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
The `clone()` method:
|
||||||
|
- Creates a new instance with all the same settings
|
||||||
|
- Updates only the specified parameters
|
||||||
|
- Leaves the original configuration unchanged
|
||||||
|
- Perfect for creating variations without repeating all parameters
|
||||||
|
|
||||||
### Rate Limiting & Resource Management
|
### Rate Limiting & Resource Management
|
||||||
|
|
||||||
For batch processing with `arun_many()`, you can enable intelligent rate limiting:
|
For batch processing with `arun_many()`, you can enable intelligent rate limiting:
|
||||||
@@ -229,7 +279,8 @@ crawl_conf = CrawlerRunConfig(
|
|||||||
max_delay=60.0,
|
max_delay=60.0,
|
||||||
max_retries=3,
|
max_retries=3,
|
||||||
rate_limit_codes=[429, 503]
|
rate_limit_codes=[429, 503]
|
||||||
)
|
),
|
||||||
|
stream=True # Enable streaming
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
|||||||
@@ -265,9 +265,21 @@ async def quick_parallel_example():
|
|||||||
"https://example.com/page3"
|
"https://example.com/page3"
|
||||||
]
|
]
|
||||||
|
|
||||||
run_conf = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
run_conf = CrawlerRunConfig(
|
||||||
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
stream=True # Enable streaming mode
|
||||||
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
# Stream results as they complete
|
||||||
|
async for result in await crawler.arun_many(urls, config=run_conf):
|
||||||
|
if result.success:
|
||||||
|
print(f"[OK] {result.url}, length: {len(result.markdown_v2.raw_markdown)}")
|
||||||
|
else:
|
||||||
|
print(f"[ERROR] {result.url} => {result.error_message}")
|
||||||
|
|
||||||
|
# Or get all results at once (default behavior)
|
||||||
|
run_conf = run_conf.clone(stream=False)
|
||||||
results = await crawler.arun_many(urls, config=run_conf)
|
results = await crawler.arun_many(urls, config=run_conf)
|
||||||
for res in results:
|
for res in results:
|
||||||
if res.success:
|
if res.success:
|
||||||
@@ -279,8 +291,13 @@ if __name__ == "__main__":
|
|||||||
asyncio.run(quick_parallel_example())
|
asyncio.run(quick_parallel_example())
|
||||||
```
|
```
|
||||||
|
|
||||||
|
The example above shows two ways to handle multiple URLs:
|
||||||
|
1. **Streaming mode** (`stream=True`): Process results as they become available using `async for`
|
||||||
|
2. **Batch mode** (`stream=False`): Wait for all results to complete
|
||||||
|
|
||||||
For more advanced concurrency (e.g., a **semaphore-based** approach, **adaptive memory usage throttling**, or customized rate limiting), see [Advanced Multi-URL Crawling](../advanced/multi-url-crawling.md).
|
For more advanced concurrency (e.g., a **semaphore-based** approach, **adaptive memory usage throttling**, or customized rate limiting), see [Advanced Multi-URL Crawling](../advanced/multi-url-crawling.md).
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
## 8. Dynamic Content Example
|
## 8. Dynamic Content Example
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user