feat: add Script Builder to Chrome Extension and reorganize LLM context files

This commit introduces significant enhancements to the Crawl4AI ecosystem:

  Chrome Extension - Script Builder (Alpha):
  - Add recording functionality to capture user interactions (clicks, typing, scrolling)
  - Implement smart event grouping for cleaner script generation
  - Support export to both JavaScript and C4A script formats
  - Add timeline view for visualizing and editing recorded actions
  - Include wait commands (time-based and element-based)
  - Add saved flows functionality for reusing automation scripts
  - Update UI with consistent dark terminal theme (Dank Mono font, green/pink accents)
  - Release new extension versions: v1.1.0, v1.2.0, v1.2.1

  LLM Context Builder Improvements:
  - Reorganize context files from llmtxt/ to llm.txt/ with better structure
  - Separate diagram templates from text content (diagrams/ and txt/ subdirectories)
  - Add comprehensive context files for all major Crawl4AI components
  - Improve file naming convention for better discoverability

  Documentation Updates:
  - Update apps index page to match main documentation theme
  - Standardize color scheme: "Available" tags use primary color (#50ffff)
  - Change "Coming Soon" tags to dark gray for better visual hierarchy
  - Add interactive two-column layout for extension landing page
  - Include code examples for both Schema Builder and Script Builder features

  Technical Improvements:
  - Enhance event capture mechanism with better element selection
  - Add support for contenteditable elements and complex form interactions
  - Implement proper scroll event handling for both window and element scrolling
  - Add meta key support for keyboard shortcuts
  - Improve selector generation for more reliable element targeting

  The Script Builder is released as Alpha, acknowledging potential bugs while providing
  early access to this powerful automation recording feature.
This commit is contained in:
UncleCode
2025-06-08 22:02:12 +08:00
parent 926592649e
commit 40640badad
72 changed files with 28600 additions and 100986 deletions

View File

@@ -0,0 +1,339 @@
## Multi-URL Crawling
Concurrent crawling of multiple URLs with intelligent resource management, rate limiting, and real-time monitoring.
### Basic Multi-URL Crawling
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
# Batch processing (default) - get all results at once
async def batch_crawl():
    """Crawl three example pages in batch mode and print one status line each.

    With ``stream=False`` the awaited ``arun_many`` call hands back all
    results together, and they are then walked with a regular ``for`` loop.
    """
    targets = [
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page3",
    ]
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        stream=False,  # batch mode (the default)
    )

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(targets, config=run_config)

        for result in results:
            # Handle the failure case first so the success path is unindented.
            if not result.success:
                print(f"❌ {result.url}: {result.error_message}")
                continue
            print(f"✅ {result.url}: {len(result.markdown)} chars")
# Streaming processing - handle results as they complete
async def streaming_crawl():
    """Crawl ``urls`` in streaming mode and handle each result as it arrives.

    NOTE(review): ``urls`` and ``process_result_immediately`` are not defined
    in this snippet; the surrounding code has to provide them.
    """
    streaming_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        stream=True,  # results are consumed with ``async for`` below
    )

    async with AsyncWebCrawler() as crawler:
        # Awaiting the call yields the stream that is then iterated.
        result_stream = await crawler.arun_many(urls, config=streaming_config)

        async for result in result_stream:
            if not result.success:
                print(f"❌ Failed: {result.url}")
                continue
            print(f"🔥 Just completed: {result.url}")
            await process_result_immediately(result)
```
### Memory-Adaptive Dispatching
```python
from crawl4ai import AsyncWebCrawler, MemoryAdaptiveDispatcher, CrawlerMonitor, DisplayMode
# Automatically manages concurrency based on system memory
async def memory_adaptive_crawl():
    """Batch-crawl ``large_url_list`` through a memory-adaptive dispatcher.

    After the crawl, prints the memory usage and duration recorded on each
    result that carries a ``dispatch_result``.

    NOTE(review): ``large_url_list`` is not defined in this snippet; the
    surrounding code has to provide it.
    """
    adaptive_dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=80.0,  # pause when memory exceeds 80%
        check_interval=1.0,             # check memory every second
        max_session_permit=15,          # max concurrent tasks
        memory_wait_timeout=300.0,      # wait up to 5 minutes for memory
    )
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=50,
    )

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls=large_url_list,
            config=run_config,
            dispatcher=adaptive_dispatcher,
        )

        for result in results:
            dr = result.dispatch_result
            if not dr:
                # No dispatch information attached to this result.
                continue
            print(f"Memory used: {dr.memory_usage:.1f}MB")
            print(f"Duration: {dr.end_time - dr.start_time}")
```
### Rate-Limited Crawling
```python
from crawl4ai import RateLimiter, SemaphoreDispatcher
# Control request pacing and handle server rate limits
async def rate_limited_crawl():
    """Crawl ``urls`` with a fixed concurrency limit and paced requests.

    Results are consumed with ``async for``, so the run config must enable
    ``stream=True``. Without it the crawl runs in batch mode, where the
    awaited ``arun_many`` value is a collection meant for a plain ``for``
    loop rather than an async iterator.

    NOTE(review): ``urls`` is not defined in this snippet; the surrounding
    code has to provide it.
    """
    rate_limiter = RateLimiter(
        base_delay=(1.0, 3.0),        # random delay of 1-3 seconds
        max_delay=60.0,               # cap backoff at 60 seconds
        max_retries=3,                # retry failed requests 3 times
        rate_limit_codes=[429, 503],  # status codes treated as rate limiting
    )
    dispatcher = SemaphoreDispatcher(
        max_session_permit=5,  # fixed concurrency limit
        rate_limiter=rate_limiter,
    )
    config = CrawlerRunConfig(
        user_agent_mode="random",  # randomize user agents
        simulate_user=True,        # simulate human behavior
        stream=True,               # required for the ``async for`` below
    )

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun_many(
            urls=urls,
            config=config,
            dispatcher=dispatcher,
        ):
            print(f"Processed: {result.url}")
```
### Real-Time Monitoring
```python
from crawl4ai import CrawlerMonitor, DisplayMode
# Monitor crawling progress in real-time
async def monitored_crawl():
    """Batch-crawl ``urls`` with a ``CrawlerMonitor`` attached to the dispatcher.

    NOTE(review): ``urls`` is not defined in this snippet; the surrounding
    code has to provide it. The collected results are not used further here.
    """
    live_monitor = CrawlerMonitor(
        max_visible_rows=20,                # show 20 tasks in the display
        display_mode=DisplayMode.DETAILED,  # show individual task details
    )
    monitored_dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=75.0,
        max_session_permit=10,
        monitor=live_monitor,  # the monitor is handed to the dispatcher
    )

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(urls=urls, dispatcher=monitored_dispatcher)
```
### Advanced Dispatcher Configurations
```python
# Memory-adaptive dispatcher with rate limiting and an aggregated monitor
memory_dispatcher = MemoryAdaptiveDispatcher(
    memory_threshold_percent=85.0,  # higher memory tolerance
    check_interval=0.5,             # check memory more frequently
    max_session_permit=20,          # more concurrent tasks
    memory_wait_timeout=600.0,      # wait longer for memory
    rate_limiter=RateLimiter(
        base_delay=(0.5, 1.5),
        max_delay=30.0,
        max_retries=5,
    ),
    monitor=CrawlerMonitor(
        max_visible_rows=15,
        display_mode=DisplayMode.AGGREGATED,  # summary view
    ),
)

# Simple semaphore-based dispatcher
semaphore_dispatcher = SemaphoreDispatcher(
    max_session_permit=8,  # fixed concurrency
    rate_limiter=RateLimiter(
        base_delay=(1.0, 2.0),
        max_delay=20.0,
    ),
)


# Usage with a custom dispatcher. ``async with`` is only valid inside a
# coroutine, so the call is wrapped in one like the other examples.
async def custom_dispatcher_crawl(dispatcher=None):
    """Crawl ``urls`` with the given dispatcher and return the results.

    Args:
        dispatcher: Dispatcher to use; falls back to ``memory_dispatcher``
            when omitted (``semaphore_dispatcher`` can be passed instead).

    NOTE(review): ``urls`` and ``config`` are not defined in this snippet;
    the surrounding code has to provide them.
    """
    if dispatcher is None:
        dispatcher = memory_dispatcher

    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls=urls,
            config=config,
            dispatcher=dispatcher,
        )
    return results
```
### Handling Large-Scale Crawling
```python
async def large_scale_crawl():
    """Stream-crawl a large URL list, saving successes and logging failures.

    Prints a progress line every 100 processed URLs and a summary at the end.

    NOTE(review): ``load_urls_from_file``, ``save_result_to_database`` and
    ``log_failure`` are not defined in this snippet; the surrounding code has
    to provide them.
    """
    urls = load_urls_from_file("large_url_list.txt")  # intended for 10,000+ URLs

    pacing = RateLimiter(
        base_delay=(0.1, 0.5),  # faster for large batches
        max_retries=2,          # fewer retries for speed
    )
    progress_monitor = CrawlerMonitor(display_mode=DisplayMode.AGGREGATED)
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=70.0,  # conservative memory usage
        max_session_permit=25,          # higher concurrency
        rate_limiter=pacing,
        monitor=progress_monitor,
    )
    config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,  # use caching for efficiency
        stream=True,                   # stream for memory efficiency
        word_count_threshold=100,      # skip short content
        exclude_external_links=True,   # reduce processing overhead
    )

    successful_crawls = failed_crawls = 0

    async with AsyncWebCrawler() as crawler:
        result_stream = await crawler.arun_many(
            urls=urls,
            config=config,
            dispatcher=dispatcher,
        )
        async for result in result_stream:
            if not result.success:
                failed_crawls += 1
                await log_failure(result.url, result.error_message)
            else:
                successful_crawls += 1
                await save_result_to_database(result)

            # Progress line every 100 processed URLs.
            if (successful_crawls + failed_crawls) % 100 == 0:
                print(f"Progress: {successful_crawls + failed_crawls}/{len(urls)}")

    print(f"Completed: {successful_crawls} successful, {failed_crawls} failed")
```
### Robots.txt Compliance
```python
async def compliant_crawl():
    """Crawl ``urls`` politely: robots.txt checks, low concurrency, slow pacing.

    Results are consumed with ``async for``, so the run config enables
    ``stream=True``; in batch mode the awaited ``arun_many`` value is a
    collection meant for a plain ``for`` loop.

    NOTE(review): ``urls`` is not defined in this snippet; the surrounding
    code has to provide it.
    """
    config = CrawlerRunConfig(
        check_robots_txt=True,    # respect robots.txt
        user_agent="MyBot/1.0",   # identify your bot
        mean_delay=2.0,           # be polite with delays
        max_range=1.0,
        stream=True,              # required for the ``async for`` below
    )
    dispatcher = SemaphoreDispatcher(
        max_session_permit=3,  # conservative concurrency
        rate_limiter=RateLimiter(
            base_delay=(2.0, 5.0),  # slower, more respectful
            max_retries=1,
        ),
    )

    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun_many(
            urls=urls,
            config=config,
            dispatcher=dispatcher,
        ):
            if result.success:
                print(f"✅ Crawled: {result.url}")
            # ``error_message`` may be empty/None on some failures; fall back
            # to "" so the substring test cannot raise TypeError.
            elif "robots.txt" in (result.error_message or ""):
                print(f"🚫 Blocked by robots.txt: {result.url}")
            else:
                print(f"❌ Error: {result.url}")
```
### Performance Analysis
```python
async def analyze_crawl_performance():
    """Batch-crawl ``urls`` and print timing, success-rate and memory statistics.

    Prints the total elapsed time, then (when at least one result came back)
    the success rate and average time per URL, followed by average and peak
    memory figures taken from the successful results that carry a
    ``dispatch_result``.

    NOTE(review): ``urls`` is not defined in this snippet; the surrounding
    code has to provide it.
    """
    import time  # local import keeps the example self-contained

    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=80.0,
        max_session_permit=12,
        monitor=CrawlerMonitor(display_mode=DisplayMode.DETAILED),
    )

    # perf_counter is monotonic, so the measured duration cannot be skewed by
    # system clock adjustments during the crawl.
    started = time.perf_counter()
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun_many(
            urls=urls,
            dispatcher=dispatcher,
        )
    elapsed = time.perf_counter() - started

    print(f"Total time: {elapsed:.2f}s")

    # Without results the ratios below would divide by zero.
    if not results:
        print("No results to analyze")
        return

    successful = [r for r in results if r.success]
    success_pct = len(successful) / len(results) * 100
    print(f"Success rate: {len(successful)}/{len(results)} ({success_pct:.1f}%)")
    print(f"Avg time per URL: {elapsed / len(results):.2f}s")

    # Memory usage analysis: consider every successful result that has
    # dispatch information, not just the first one.
    dispatch_stats = [r.dispatch_result for r in successful if r.dispatch_result]
    if dispatch_stats:
        avg_memory = sum(d.memory_usage for d in dispatch_stats) / len(dispatch_stats)
        peak_memory = max(d.peak_memory for d in dispatch_stats)
        print(f"Avg memory usage: {avg_memory:.1f}MB")
        print(f"Peak memory usage: {peak_memory:.1f}MB")
```
### Error Handling and Recovery
```python
async def robust_multi_crawl():
    """Stream-crawl ``urls``, collecting failures and scheduling selected retries.

    Successful results go to ``process_successful_result``. Failures are
    recorded with URL, error message and status code; those with status 503
    or 429 are additionally passed to ``schedule_retry``. At the end, up to
    ten recorded failures are printed.

    NOTE(review): ``urls``, ``process_successful_result`` and
    ``schedule_retry`` are not defined in this snippet; the surrounding code
    has to provide them.
    """
    retryable_statuses = [503, 429]  # service unavailable / too many requests
    failed_urls = []

    config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        stream=True,
        page_timeout=30000,  # 30 second timeout
    )
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=85.0,
        max_session_permit=10,
    )

    async with AsyncWebCrawler() as crawler:
        result_stream = await crawler.arun_many(
            urls=urls,
            config=config,
            dispatcher=dispatcher,
        )
        async for result in result_stream:
            if result.success:
                await process_successful_result(result)
                continue

            failed_urls.append({
                'url': result.url,
                'error': result.error_message,
                'status_code': result.status_code
            })

            # Only these status codes are handed to the retry scheduler.
            if result.status_code in retryable_statuses:
                await schedule_retry(result.url)

    # Report failures
    if failed_urls:
        print(f"Failed to crawl {len(failed_urls)} URLs:")
        for failure in failed_urls[:10]:  # show first 10
            print(f"  {failure['url']}: {failure['error']}")
```
**📖 Learn more:** [Advanced Multi-URL Crawling](https://docs.crawl4ai.com/advanced/multi-url-crawling/), [Crawl Dispatcher](https://docs.crawl4ai.com/advanced/crawl-dispatcher/), [arun_many() API Reference](https://docs.crawl4ai.com/api/arun_many/)