## Multi-URL Crawling

Concurrent crawling of multiple URLs with intelligent resource management, rate limiting, and real-time monitoring.

### Basic Multi-URL Crawling

```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

# URLs shared by both examples below
urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
]

# Batch processing (default) - get all results at once
async def batch_crawl():
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        stream=False  # Default: batch mode
    )

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(urls, config=config)

        for result in results:
            if result.success:
                print(f"✅ {result.url}: {len(result.markdown)} chars")
            else:
                print(f"❌ {result.url}: {result.error_message}")

# Streaming processing - handle results as they complete
async def streaming_crawl():
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        stream=True  # Enable streaming
    )

    async with AsyncWebCrawler() as crawler:
        # Process results as they become available
        async for result in await crawler.arun_many(urls, config=config):
            if result.success:
                print(f"🔥 Just completed: {result.url}")
                await process_result_immediately(result)
            else:
                print(f"❌ Failed: {result.url}")
```
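
The streaming example above calls `process_result_immediately`, which is not a Crawl4AI API; it stands in for whatever per-result handling you need. A minimal sketch, assuming you simply want to write each page's markdown to a local `output/` directory as it arrives:

```python
import re
from pathlib import Path

async def process_result_immediately(result):
    # Hypothetical helper: persist each page's markdown as soon as it completes.
    # Derive a filesystem-safe filename from the URL.
    safe_name = re.sub(r"[^a-zA-Z0-9]+", "_", result.url).strip("_")
    out_dir = Path("output")
    out_dir.mkdir(exist_ok=True)
    (out_dir / f"{safe_name}.md").write_text(str(result.markdown), encoding="utf-8")
```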

### Memory-Adaptive Dispatching

```python
from crawl4ai import AsyncWebCrawler, MemoryAdaptiveDispatcher, CrawlerMonitor, DisplayMode

# Automatically manages concurrency based on system memory
async def memory_adaptive_crawl():
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=80.0,  # Pause if memory exceeds 80%
        check_interval=1.0,             # Check memory every second
        max_session_permit=15,          # Max concurrent tasks
        memory_wait_timeout=300.0       # Wait up to 5 minutes for memory
    )

    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=50
    )

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls=large_url_list,
            config=config,
            dispatcher=dispatcher
        )

        # Each result includes dispatch information
        for result in results:
            if result.dispatch_result:
                dr = result.dispatch_result
                print(f"Memory used: {dr.memory_usage:.1f}MB")
                print(f"Duration: {dr.end_time - dr.start_time}")
```

### Rate-Limited Crawling

```python
from crawl4ai import RateLimiter, SemaphoreDispatcher

# Control request pacing and handle server rate limits
async def rate_limited_crawl():
    rate_limiter = RateLimiter(
        base_delay=(1.0, 3.0),       # Random delay 1-3 seconds
        max_delay=60.0,              # Cap backoff at 60 seconds
        max_retries=3,               # Retry failed requests 3 times
        rate_limit_codes=[429, 503]  # Handle these status codes
    )

    dispatcher = SemaphoreDispatcher(
        max_session_permit=5,  # Fixed concurrency limit
        rate_limiter=rate_limiter
    )

    config = CrawlerRunConfig(
        stream=True,               # Required for the async-for loop below
        user_agent_mode="random",  # Randomize user agents
        simulate_user=True         # Simulate human behavior
    )

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun_many(
            urls=urls,
            config=config,
            dispatcher=dispatcher
        ):
            print(f"Processed: {result.url}")
```

### Real-Time Monitoring

```python
from crawl4ai import AsyncWebCrawler, MemoryAdaptiveDispatcher, CrawlerMonitor, DisplayMode

# Monitor crawling progress in real-time
async def monitored_crawl():
    monitor = CrawlerMonitor(
        max_visible_rows=20,               # Show 20 tasks in display
        display_mode=DisplayMode.DETAILED  # Show individual task details
    )

    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=75.0,
        max_session_permit=10,
        monitor=monitor  # Attach monitor to dispatcher
    )

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls=urls,
            dispatcher=dispatcher
        )
```

### Advanced Dispatcher Configurations

```python
# Memory-adaptive with comprehensive monitoring
memory_dispatcher = MemoryAdaptiveDispatcher(
    memory_threshold_percent=85.0,  # Higher memory tolerance
    check_interval=0.5,             # Check memory more frequently
    max_session_permit=20,          # More concurrent tasks
    memory_wait_timeout=600.0,      # Wait longer for memory
    rate_limiter=RateLimiter(
        base_delay=(0.5, 1.5),
        max_delay=30.0,
        max_retries=5
    ),
    monitor=CrawlerMonitor(
        max_visible_rows=15,
        display_mode=DisplayMode.AGGREGATED  # Summary view
    )
)

# Simple semaphore-based dispatcher
semaphore_dispatcher = SemaphoreDispatcher(
    max_session_permit=8,  # Fixed concurrency
    rate_limiter=RateLimiter(
        base_delay=(1.0, 2.0),
        max_delay=20.0
    )
)

# Usage with custom dispatcher
async with AsyncWebCrawler() as crawler:
    results = await crawler.arun_many(
        urls=urls,
        config=config,
        dispatcher=memory_dispatcher  # or semaphore_dispatcher
    )
```

### Handling Large-Scale Crawling

```python
async def large_scale_crawl():
    # For thousands of URLs
    urls = load_urls_from_file("large_url_list.txt")  # 10,000+ URLs

    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,  # Conservative memory usage
        max_session_permit=25,          # Higher concurrency
        rate_limiter=RateLimiter(
            base_delay=(0.1, 0.5),  # Faster for large batches
            max_retries=2           # Fewer retries for speed
        ),
        monitor=CrawlerMonitor(display_mode=DisplayMode.AGGREGATED)
    )

    config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,  # Use caching for efficiency
        stream=True,                   # Stream for memory efficiency
        word_count_threshold=100,      # Skip short content
        exclude_external_links=True    # Reduce processing overhead
    )

    successful_crawls = 0
    failed_crawls = 0

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun_many(
            urls=urls,
            config=config,
            dispatcher=dispatcher
        ):
            if result.success:
                successful_crawls += 1
                await save_result_to_database(result)
            else:
                failed_crawls += 1
                await log_failure(result.url, result.error_message)

            # Progress reporting
            if (successful_crawls + failed_crawls) % 100 == 0:
                print(f"Progress: {successful_crawls + failed_crawls}/{len(urls)}")

    print(f"Completed: {successful_crawls} successful, {failed_crawls} failed")
```
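
`load_urls_from_file`, `save_result_to_database`, and `log_failure` in the example above are placeholders, not Crawl4AI APIs. A minimal sketch of what they might look like, assuming one URL per line in the input file and a local SQLite database named `crawl_results.db` (synchronous SQLite is used here for brevity; swap in an async driver for heavy workloads):

```python
import sqlite3

def load_urls_from_file(path: str) -> list[str]:
    # Hypothetical helper: one URL per line, blank lines ignored.
    with open(path, encoding="utf-8") as f:
        return [line.strip() for line in f if line.strip()]

async def save_result_to_database(result):
    # Hypothetical helper: store URL and markdown in a local SQLite table.
    conn = sqlite3.connect("crawl_results.db")
    conn.execute("CREATE TABLE IF NOT EXISTS pages (url TEXT PRIMARY KEY, markdown TEXT)")
    conn.execute(
        "INSERT OR REPLACE INTO pages (url, markdown) VALUES (?, ?)",
        (result.url, str(result.markdown)),
    )
    conn.commit()
    conn.close()

async def log_failure(url: str, error_message: str):
    # Hypothetical helper: append failures to a plain-text log.
    with open("failed_urls.log", "a", encoding="utf-8") as f:
        f.write(f"{url}\t{error_message}\n")
```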

### Robots.txt Compliance

```python
async def compliant_crawl():
    config = CrawlerRunConfig(
        check_robots_txt=True,   # Respect robots.txt
        user_agent="MyBot/1.0",  # Identify your bot
        mean_delay=2.0,          # Be polite with delays
        max_range=1.0,
        stream=True              # Required for the async-for loop below
    )

    dispatcher = SemaphoreDispatcher(
        max_session_permit=3,  # Conservative concurrency
        rate_limiter=RateLimiter(
            base_delay=(2.0, 5.0),  # Slower, more respectful
            max_retries=1
        )
    )

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun_many(
            urls=urls,
            config=config,
            dispatcher=dispatcher
        ):
            if result.success:
                print(f"✅ Crawled: {result.url}")
            elif "robots.txt" in result.error_message:
                print(f"🚫 Blocked by robots.txt: {result.url}")
            else:
                print(f"❌ Error: {result.url}")
```

### Performance Analysis

```python
import time

async def analyze_crawl_performance():
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=80.0,
        max_session_permit=12,
        monitor=CrawlerMonitor(display_mode=DisplayMode.DETAILED)
    )

    start_time = time.time()

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls=urls,
            dispatcher=dispatcher
        )

    end_time = time.time()

    # Analyze results
    successful = [r for r in results if r.success]
    failed = [r for r in results if not r.success]

    print(f"Total time: {end_time - start_time:.2f}s")
    print(f"Success rate: {len(successful)}/{len(results)} ({len(successful)/len(results)*100:.1f}%)")
    print(f"Avg time per URL: {(end_time - start_time)/len(results):.2f}s")

    # Memory usage analysis
    if successful and successful[0].dispatch_result:
        memory_usage = [r.dispatch_result.memory_usage for r in successful if r.dispatch_result]
        peak_memory = [r.dispatch_result.peak_memory for r in successful if r.dispatch_result]

        print(f"Avg memory usage: {sum(memory_usage)/len(memory_usage):.1f}MB")
        print(f"Peak memory usage: {max(peak_memory):.1f}MB")
```

### Error Handling and Recovery

```python
async def robust_multi_crawl():
    failed_urls = []

    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        stream=True,
        page_timeout=30000  # 30 second timeout
    )

    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=85.0,
        max_session_permit=10
    )

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun_many(
            urls=urls,
            config=config,
            dispatcher=dispatcher
        ):
            if result.success:
                await process_successful_result(result)
            else:
                failed_urls.append({
                    'url': result.url,
                    'error': result.error_message,
                    'status_code': result.status_code
                })

                # Retry logic for specific errors
                if result.status_code in [503, 429]:  # Server errors
                    await schedule_retry(result.url)

    # Report failures
    if failed_urls:
        print(f"Failed to crawl {len(failed_urls)} URLs:")
        for failure in failed_urls[:10]:  # Show first 10
            print(f"  {failure['url']}: {failure['error']}")
```
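
`process_successful_result` and `schedule_retry` above are placeholders for your own handling. One simple approach, sketched under the assumption that a single delayed second pass is enough, is to collect retry candidates in a queue and push them through `arun_many` again after a pause:

```python
import asyncio
from crawl4ai import AsyncWebCrawler

retry_queue: list[str] = []

async def process_successful_result(result):
    # Hypothetical handler: replace with your own storage or processing.
    print(f"Stored {result.url} ({len(str(result.markdown))} chars)")

async def schedule_retry(url: str):
    # Hypothetical handler: remember the URL for a second pass.
    retry_queue.append(url)

async def retry_failed_urls(config, dispatcher, delay: float = 30.0):
    # Wait, then re-crawl everything that hit 429/503 on the first pass.
    # Assumes `config` has stream=True, as in robust_multi_crawl above.
    if not retry_queue:
        return
    await asyncio.sleep(delay)
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun_many(
            urls=list(retry_queue),
            config=config,
            dispatcher=dispatcher
        ):
            status = "✅" if result.success else "❌"
            print(f"{status} retry: {result.url}")
```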

**📖 Learn more:** [Advanced Multi-URL Crawling](https://docs.crawl4ai.com/advanced/multi-url-crawling/), [Crawl Dispatcher](https://docs.crawl4ai.com/advanced/crawl-dispatcher/), [arun_many() API Reference](https://docs.crawl4ai.com/api/arun_many/)