feat: add Script Builder to Chrome Extension and reorganize LLM context files

This commit introduces significant enhancements to the Crawl4AI ecosystem: Chrome Extension - Script Builder (Alpha): - Add recording functionality to capture user interactions (clicks, typing, scrolling) - Implement smart event grouping for cleaner script generation - Support export to both JavaScript and C4A script formats - Add timeline view for visualizing and editing recorded actions - Include wait commands (time-based and element-based) - Add saved flows functionality for reusing automation scripts - Update UI with consistent dark terminal theme (Dank Mono font, green/pink accents) - Release new extension versions: v1.1.0, v1.2.0, v1.2.1 LLM Context Builder Improvements: - Reorganize context files from llmtxt/ to llm.txt/ with better structure - Separate diagram templates from text content (diagrams/ and txt/ subdirectories) - Add comprehensive context files for all major Crawl4AI components - Improve file naming convention for better discoverability Documentation Updates: - Update apps index page to match main documentation theme - Standardize color scheme: "Available" tags use primary color (#50ffff) - Change "Coming Soon" tags to dark gray for better visual hierarchy - Add interactive two-column layout for extension landing page - Include code examples for both Schema Builder and Script Builder features Technical Improvements: - Enhance event capture mechanism with better element selection - Add support for contenteditable elements and complex form interactions - Implement proper scroll event handling for both window and element scrolling - Add meta key support for keyboard shortcuts - Improve selector generation for more reliable element targeting The Script Builder is released as Alpha, acknowledging potential bugs while providing early access to this powerful automation recording feature.
2025-06-08 22:02:12 +08:00
parent 926592649e
commit 40640badad
72 changed files with 28600 additions and 100986 deletions
--- a/docs/md_v2/assets/llm.txt/txt/multi_urls_crawling.txt
+++ b/docs/md_v2/assets/llm.txt/txt/multi_urls_crawling.txt
@@ -0,0 +1,339 @@
+## Multi-URL Crawling
+
+Concurrent crawling of multiple URLs with intelligent resource management, rate limiting, and real-time monitoring.
+
+### Basic Multi-URL Crawling
+
+```python
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
+
+# Batch processing (default) - get all results at once
+async def batch_crawl():
+    urls = [
+        "https://example.com/page1",
+        "https://example.com/page2", 
+        "https://example.com/page3"
+    ]
+    
+    config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        stream=False  # Default: batch mode
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        results = await crawler.arun_many(urls, config=config)
+        
+        for result in results:
+            if result.success:
+                print(f"✅ {result.url}: {len(result.markdown)} chars")
+            else:
+                print(f"❌ {result.url}: {result.error_message}")
+
+# Streaming processing - handle results as they complete
+async def streaming_crawl():
+    config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        stream=True  # Enable streaming
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        # Process results as they become available
+        async for result in await crawler.arun_many(urls, config=config):
+            if result.success:
+                print(f"🔥 Just completed: {result.url}")
+                await process_result_immediately(result)
+            else:
+                print(f"❌ Failed: {result.url}")
+```
+
+### Memory-Adaptive Dispatching
+
+```python
+from crawl4ai import AsyncWebCrawler, MemoryAdaptiveDispatcher, CrawlerMonitor, DisplayMode
+
+# Automatically manages concurrency based on system memory
+async def memory_adaptive_crawl():
+    dispatcher = MemoryAdaptiveDispatcher(
+        memory_threshold_percent=80.0,  # Pause if memory exceeds 80%
+        check_interval=1.0,             # Check memory every second
+        max_session_permit=15,          # Max concurrent tasks
+        memory_wait_timeout=300.0       # Wait up to 5 minutes for memory
+    )
+    
+    config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        word_count_threshold=50
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        results = await crawler.arun_many(
+            urls=large_url_list,
+            config=config,
+            dispatcher=dispatcher
+        )
+        
+        # Each result includes dispatch information
+        for result in results:
+            if result.dispatch_result:
+                dr = result.dispatch_result
+                print(f"Memory used: {dr.memory_usage:.1f}MB")
+                print(f"Duration: {dr.end_time - dr.start_time}")
+```
+
+### Rate-Limited Crawling
+
+```python
+from crawl4ai import RateLimiter, SemaphoreDispatcher
+
+# Control request pacing and handle server rate limits
+async def rate_limited_crawl():
+    rate_limiter = RateLimiter(
+        base_delay=(1.0, 3.0),          # Random delay 1-3 seconds
+        max_delay=60.0,                 # Cap backoff at 60 seconds
+        max_retries=3,                  # Retry failed requests 3 times
+        rate_limit_codes=[429, 503]     # Handle these status codes
+    )
+    
+    dispatcher = SemaphoreDispatcher(
+        max_session_permit=5,           # Fixed concurrency limit
+        rate_limiter=rate_limiter
+    )
+    
+    config = CrawlerRunConfig(
+        user_agent_mode="random",       # Randomize user agents
+        simulate_user=True              # Simulate human behavior
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        async for result in await crawler.arun_many(
+            urls=urls,
+            config=config,
+            dispatcher=dispatcher
+        ):
+            print(f"Processed: {result.url}")
+```
+
+### Real-Time Monitoring
+
+```python
+from crawl4ai import CrawlerMonitor, DisplayMode
+
+# Monitor crawling progress in real-time
+async def monitored_crawl():
+    monitor = CrawlerMonitor(
+        max_visible_rows=20,                    # Show 20 tasks in display
+        display_mode=DisplayMode.DETAILED       # Show individual task details
+    )
+    
+    dispatcher = MemoryAdaptiveDispatcher(
+        memory_threshold_percent=75.0,
+        max_session_permit=10,
+        monitor=monitor  # Attach monitor to dispatcher
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        results = await crawler.arun_many(
+            urls=urls,
+            dispatcher=dispatcher
+        )
+```
+
+### Advanced Dispatcher Configurations
+
+```python
+# Two ready-made dispatcher objects built at module level. The class names
+# used here (MemoryAdaptiveDispatcher, SemaphoreDispatcher, RateLimiter,
+# CrawlerMonitor, DisplayMode) are imported from `crawl4ai` in the earlier
+# snippets of this document and must be in scope.
+
+# Memory-adaptive with comprehensive monitoring
+memory_dispatcher = MemoryAdaptiveDispatcher(
+    memory_threshold_percent=85.0,      # Higher memory tolerance
+    check_interval=0.5,                 # Check memory more frequently
+    max_session_permit=20,              # More concurrent tasks
+    memory_wait_timeout=600.0,          # Wait longer for memory
+    # Rate limiter passed to this dispatcher (rate_limit_codes left unset)
+    rate_limiter=RateLimiter(
+        base_delay=(0.5, 1.5),
+        max_delay=30.0,
+        max_retries=5
+    ),
+    # Progress monitor passed to this dispatcher
+    monitor=CrawlerMonitor(
+        max_visible_rows=15,
+        display_mode=DisplayMode.AGGREGATED  # Summary view
+    )
+)
+
+# Simple semaphore-based dispatcher
+semaphore_dispatcher = SemaphoreDispatcher(
+    max_session_permit=8,               # Fixed concurrency
+    # No monitor and no max_retries are passed here
+    rate_limiter=RateLimiter(
+        base_delay=(1.0, 2.0),
+        max_delay=20.0
+    )
+)
+
+# Usage with custom dispatcher
+async with AsyncWebCrawler() as crawler:
+    results = await crawler.arun_many(
+        urls=urls,
+        config=config,
+        dispatcher=memory_dispatcher  # or semaphore_dispatcher
+    )
+```
+
+### Handling Large-Scale Crawling
+
+```python
+async def large_scale_crawl():
+    # For thousands of URLs
+    urls = load_urls_from_file("large_url_list.txt")  # 10,000+ URLs
+    
+    dispatcher = MemoryAdaptiveDispatcher(
+        memory_threshold_percent=70.0,  # Conservative memory usage
+        max_session_permit=25,          # Higher concurrency
+        rate_limiter=RateLimiter(
+            base_delay=(0.1, 0.5),      # Faster for large batches
+            max_retries=2               # Fewer retries for speed
+        ),
+        monitor=CrawlerMonitor(display_mode=DisplayMode.AGGREGATED)
+    )
+    
+    config = CrawlerRunConfig(
+        cache_mode=CacheMode.ENABLED,   # Use caching for efficiency
+        stream=True,                    # Stream for memory efficiency
+        word_count_threshold=100,       # Skip short content
+        exclude_external_links=True     # Reduce processing overhead
+    )
+    
+    successful_crawls = 0
+    failed_crawls = 0
+    
+    async with AsyncWebCrawler() as crawler:
+        async for result in await crawler.arun_many(
+            urls=urls,
+            config=config,
+            dispatcher=dispatcher
+        ):
+            if result.success:
+                successful_crawls += 1
+                await save_result_to_database(result)
+            else:
+                failed_crawls += 1
+                await log_failure(result.url, result.error_message)
+            
+            # Progress reporting
+            if (successful_crawls + failed_crawls) % 100 == 0:
+                print(f"Progress: {successful_crawls + failed_crawls}/{len(urls)}")
+    
+    print(f"Completed: {successful_crawls} successful, {failed_crawls} failed")
+```
+
+### Robots.txt Compliance
+
+```python
+async def compliant_crawl():
+    config = CrawlerRunConfig(
+        check_robots_txt=True,          # Respect robots.txt
+        user_agent="MyBot/1.0",         # Identify your bot
+        mean_delay=2.0,                 # Be polite with delays
+        max_range=1.0
+    )
+    
+    dispatcher = SemaphoreDispatcher(
+        max_session_permit=3,           # Conservative concurrency
+        rate_limiter=RateLimiter(
+            base_delay=(2.0, 5.0),      # Slower, more respectful
+            max_retries=1
+        )
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        async for result in await crawler.arun_many(
+            urls=urls,
+            config=config,
+            dispatcher=dispatcher
+        ):
+            if result.success:
+                print(f"✅ Crawled: {result.url}")
+            elif "robots.txt" in result.error_message:
+                print(f"🚫 Blocked by robots.txt: {result.url}")
+            else:
+                print(f"❌ Error: {result.url}")
+```
+
+### Performance Analysis
+
+```python
+async def analyze_crawl_performance():
+    dispatcher = MemoryAdaptiveDispatcher(
+        memory_threshold_percent=80.0,
+        max_session_permit=12,
+        monitor=CrawlerMonitor(display_mode=DisplayMode.DETAILED)
+    )
+    
+    start_time = time.time()
+    
+    async with AsyncWebCrawler() as crawler:
+        results = await crawler.arun_many(
+            urls=urls,
+            dispatcher=dispatcher
+        )
+    
+    end_time = time.time()
+    
+    # Analyze results
+    successful = [r for r in results if r.success]
+    failed = [r for r in results if not r.success]
+    
+    print(f"Total time: {end_time - start_time:.2f}s")
+    print(f"Success rate: {len(successful)}/{len(results)} ({len(successful)/len(results)*100:.1f}%)")
+    print(f"Avg time per URL: {(end_time - start_time)/len(results):.2f}s")
+    
+    # Memory usage analysis
+    if successful and successful[0].dispatch_result:
+        memory_usage = [r.dispatch_result.memory_usage for r in successful if r.dispatch_result]
+        peak_memory = [r.dispatch_result.peak_memory for r in successful if r.dispatch_result]
+        
+        print(f"Avg memory usage: {sum(memory_usage)/len(memory_usage):.1f}MB")
+        print(f"Peak memory usage: {max(peak_memory):.1f}MB")
+```
+
+### Error Handling and Recovery
+
+```python
+async def robust_multi_crawl():
+    failed_urls = []
+    
+    config = CrawlerRunConfig(
+        cache_mode=CacheMode.BYPASS,
+        stream=True,
+        page_timeout=30000  # 30 second timeout
+    )
+    
+    dispatcher = MemoryAdaptiveDispatcher(
+        memory_threshold_percent=85.0,
+        max_session_permit=10
+    )
+    
+    async with AsyncWebCrawler() as crawler:
+        async for result in await crawler.arun_many(
+            urls=urls,
+            config=config,
+            dispatcher=dispatcher
+        ):
+            if result.success:
+                await process_successful_result(result)
+            else:
+                failed_urls.append({
+                    'url': result.url,
+                    'error': result.error_message,
+                    'status_code': result.status_code
+                })
+                
+                # Retry logic for specific errors
+                if result.status_code in [503, 429]:  # Server errors
+                    await schedule_retry(result.url)
+    
+    # Report failures
+    if failed_urls:
+        print(f"Failed to crawl {len(failed_urls)} URLs:")
+        for failure in failed_urls[:10]:  # Show first 10
+            print(f"  {failure['url']}: {failure['error']}")
+```
+
+**📖 Learn more:** [Advanced Multi-URL Crawling](https://docs.crawl4ai.com/advanced/multi-url-crawling/), [Crawl Dispatcher](https://docs.crawl4ai.com/advanced/crawl-dispatcher/), [arun_many() API Reference](https://docs.crawl4ai.com/api/arun_many/)