## Multi-URL Crawling
Concurrent crawling of multiple URLs with intelligent resource management, rate limiting, and real-time monitoring.
### Basic Multi-URL Crawling
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode

# Batch processing (default) - get all results at once
async def batch_crawl():
    urls = [
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page3"
    ]

    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        stream=False  # Default: batch mode
    )

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(urls, config=config)

        for result in results:
            if result.success:
                print(f"✅ {result.url}: {len(result.markdown)} chars")
            else:
                print(f"❌ {result.url}: {result.error_message}")

# Streaming processing - handle results as they complete
async def streaming_crawl():
    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        stream=True  # Enable streaming
    )

    async with AsyncWebCrawler() as crawler:
        # Process results as they become available
        async for result in await crawler.arun_many(urls, config=config):
            if result.success:
                print(f"🔥 Just completed: {result.url}")
                await process_result_immediately(result)
            else:
                print(f"❌ Failed: {result.url}")
```
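The streaming example assumes a `urls` list (the same one defined in `batch_crawl()`) and a `process_result_immediately` coroutine. A minimal sketch of those placeholders, purely illustrative:

```python
# Hypothetical placeholders assumed by streaming_crawl() above
urls = [
    "https://example.com/page1",
    "https://example.com/page2",
    "https://example.com/page3"
]

async def process_result_immediately(result):
    # Stand-in for your own handling: persist, index, or post-process each
    # result as soon as it arrives instead of waiting for the whole batch
    print(f"  -> {result.url}: {len(result.markdown)} chars")
```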
### Memory-Adaptive Dispatching
```python
from crawl4ai import AsyncWebCrawler, MemoryAdaptiveDispatcher, CrawlerMonitor, DisplayMode

# Automatically manages concurrency based on system memory
async def memory_adaptive_crawl():
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=80.0,  # Pause if memory exceeds 80%
        check_interval=1.0,             # Check memory every second
        max_session_permit=15,          # Max concurrent tasks
        memory_wait_timeout=300.0       # Wait up to 5 minutes for memory
    )

    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=50
    )

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls=large_url_list,
            config=config,
            dispatcher=dispatcher
        )

        # Each result includes dispatch information
        for result in results:
            if result.dispatch_result:
                dr = result.dispatch_result
                print(f"Memory used: {dr.memory_usage:.1f}MB")
                print(f"Duration: {dr.end_time - dr.start_time}")
```
### Rate-Limited Crawling
```python
from crawl4ai import RateLimiter, SemaphoreDispatcher

# Control request pacing and handle server rate limits
async def rate_limited_crawl():
    rate_limiter = RateLimiter(
        base_delay=(1.0, 3.0),       # Random delay 1-3 seconds
        max_delay=60.0,              # Cap backoff at 60 seconds
        max_retries=3,               # Retry failed requests 3 times
        rate_limit_codes=[429, 503]  # Handle these status codes
    )

    dispatcher = SemaphoreDispatcher(
        max_session_permit=5,  # Fixed concurrency limit
        rate_limiter=rate_limiter
    )

    config = CrawlerRunConfig(
        user_agent_mode="random",  # Randomize user agents
        simulate_user=True         # Simulate human behavior
    )

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun_many(
            urls=urls,
            config=config,
            dispatcher=dispatcher
        ):
            print(f"Processed: {result.url}")
```
### Real-Time Monitoring
```python
from crawl4ai import CrawlerMonitor, DisplayMode

# Monitor crawling progress in real-time
async def monitored_crawl():
    monitor = CrawlerMonitor(
        max_visible_rows=20,               # Show 20 tasks in display
        display_mode=DisplayMode.DETAILED  # Show individual task details
    )

    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=75.0,
        max_session_permit=10,
        monitor=monitor  # Attach monitor to dispatcher
    )

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls=urls,
            dispatcher=dispatcher
        )
```
### Advanced Dispatcher Configurations
```python
# Memory-adaptive with comprehensive monitoring
memory_dispatcher = MemoryAdaptiveDispatcher(
    memory_threshold_percent=85.0,  # Higher memory tolerance
    check_interval=0.5,             # Check memory more frequently
    max_session_permit=20,          # More concurrent tasks
    memory_wait_timeout=600.0,      # Wait longer for memory
    rate_limiter=RateLimiter(
        base_delay=(0.5, 1.5),
        max_delay=30.0,
        max_retries=5
    ),
    monitor=CrawlerMonitor(
        max_visible_rows=15,
        display_mode=DisplayMode.AGGREGATED  # Summary view
    )
)

# Simple semaphore-based dispatcher
semaphore_dispatcher = SemaphoreDispatcher(
    max_session_permit=8,  # Fixed concurrency
    rate_limiter=RateLimiter(
        base_delay=(1.0, 2.0),
        max_delay=20.0
    )
)

# Usage with custom dispatcher
async with AsyncWebCrawler() as crawler:
    results = await crawler.arun_many(
        urls=urls,
        config=config,
        dispatcher=memory_dispatcher  # or semaphore_dispatcher
    )
```
### Handling Large-Scale Crawling
```python
async def large_scale_crawl():
    # For thousands of URLs
    urls = load_urls_from_file("large_url_list.txt")  # 10,000+ URLs

    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,  # Conservative memory usage
        max_session_permit=25,          # Higher concurrency
        rate_limiter=RateLimiter(
            base_delay=(0.1, 0.5),  # Faster for large batches
            max_retries=2           # Fewer retries for speed
        ),
        monitor=CrawlerMonitor(display_mode=DisplayMode.AGGREGATED)
    )

    config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,  # Use caching for efficiency
        stream=True,                   # Stream for memory efficiency
        word_count_threshold=100,      # Skip short content
        exclude_external_links=True    # Reduce processing overhead
    )

    successful_crawls = 0
    failed_crawls = 0

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun_many(
            urls=urls,
            config=config,
            dispatcher=dispatcher
        ):
            if result.success:
                successful_crawls += 1
                await save_result_to_database(result)
            else:
                failed_crawls += 1
                await log_failure(result.url, result.error_message)

            # Progress reporting
            if (successful_crawls + failed_crawls) % 100 == 0:
                print(f"Progress: {successful_crawls + failed_crawls}/{len(urls)}")

    print(f"Completed: {successful_crawls} successful, {failed_crawls} failed")
```
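The helpers referenced above (`load_urls_from_file`, `save_result_to_database`, `log_failure`) are not part of Crawl4AI; a minimal stand-in version could be:

```python
# Hypothetical helpers for the large-scale example; replace with your own I/O layer
def load_urls_from_file(path):
    # One URL per line, blank lines ignored
    with open(path) as f:
        return [line.strip() for line in f if line.strip()]

async def save_result_to_database(result):
    # Stand-in: swap in your real storage (SQLite, Postgres, object storage, ...)
    print(f"saved {result.url} ({len(result.markdown)} chars)")

async def log_failure(url, error_message):
    with open("failed_urls.log", "a") as f:
        f.write(f"{url}\t{error_message}\n")
```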
### Robots.txt Compliance
```python
async def compliant_crawl():
    config = CrawlerRunConfig(
        check_robots_txt=True,   # Respect robots.txt
        user_agent="MyBot/1.0",  # Identify your bot
        mean_delay=2.0,          # Be polite with delays
        max_range=1.0
    )

    dispatcher = SemaphoreDispatcher(
        max_session_permit=3,  # Conservative concurrency
        rate_limiter=RateLimiter(
            base_delay=(2.0, 5.0),  # Slower, more respectful
            max_retries=1
        )
    )

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun_many(
            urls=urls,
            config=config,
            dispatcher=dispatcher
        ):
            if result.success:
                print(f"✅ Crawled: {result.url}")
            elif "robots.txt" in result.error_message:
                print(f"🚫 Blocked by robots.txt: {result.url}")
            else:
                print(f"❌ Error: {result.url}")
```
### Performance Analysis
```python
import time

async def analyze_crawl_performance():
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=80.0,
        max_session_permit=12,
        monitor=CrawlerMonitor(display_mode=DisplayMode.DETAILED)
    )

    start_time = time.time()

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls=urls,
            dispatcher=dispatcher
        )

    end_time = time.time()

    # Analyze results
    successful = [r for r in results if r.success]
    failed = [r for r in results if not r.success]

    print(f"Total time: {end_time - start_time:.2f}s")
    print(f"Success rate: {len(successful)}/{len(results)} ({len(successful)/len(results)*100:.1f}%)")
    print(f"Avg time per URL: {(end_time - start_time)/len(results):.2f}s")

    # Memory usage analysis
    if successful and successful[0].dispatch_result:
        memory_usage = [r.dispatch_result.memory_usage for r in successful if r.dispatch_result]
        peak_memory = [r.dispatch_result.peak_memory for r in successful if r.dispatch_result]
        print(f"Avg memory usage: {sum(memory_usage)/len(memory_usage):.1f}MB")
        print(f"Peak memory usage: {max(peak_memory):.1f}MB")
```
### Error Handling and Recovery
```python
async def robust_multi_crawl():
    failed_urls = []

    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        stream=True,
        page_timeout=30000  # 30 second timeout
    )

    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=85.0,
        max_session_permit=10
    )

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun_many(
            urls=urls,
            config=config,
            dispatcher=dispatcher
        ):
            if result.success:
                await process_successful_result(result)
            else:
                failed_urls.append({
                    'url': result.url,
                    'error': result.error_message,
                    'status_code': result.status_code
                })

                # Retry logic for specific errors
                if result.status_code in [503, 429]:  # Server errors
                    await schedule_retry(result.url)

    # Report failures
    if failed_urls:
        print(f"Failed to crawl {len(failed_urls)} URLs:")
        for failure in failed_urls[:10]:  # Show first 10
            print(f"  {failure['url']}: {failure['error']}")
```
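`process_successful_result` and `schedule_retry` are also left to the caller; one minimal sketch queues rate-limited URLs for a later second pass:

```python
import asyncio

retry_queue = asyncio.Queue()  # URLs to re-crawl after the first pass

async def process_successful_result(result):
    # Stand-in for your own handling of good results
    print(f"processed {result.url}")

async def schedule_retry(url):
    # Non-blocking: just record the URL; drain retry_queue into a second
    # arun_many() call once the first pass finishes
    await retry_queue.put(url)
```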
**📖 Learn more:** [Advanced Multi-URL Crawling](https://docs.crawl4ai.com/advanced/multi-url-crawling/), [Crawl Dispatcher](https://docs.crawl4ai.com/advanced/crawl-dispatcher/), [arun_many() API Reference](https://docs.crawl4ai.com/api/arun_many/)