fix: allow custom LLM providers for adaptive crawler embedding config. ref: #1291

- Change embedding_llm_config from Dict to Union[LLMConfig, Dict] for type safety - Add backward-compatible conversion property _embedding_llm_config_dict - Replace all hardcoded OpenAI embedding configs with configurable options - Fix LLMConfig object attribute access in query expansion logic - Add comprehensive example demonstrating multiple provider configurations - Update documentation with both LLMConfig object and dictionary usage patterns Users can now specify any LLM provider for query expansion in embedding strategy: - New: embedding_llm_config=LLMConfig(provider='anthropic/claude-3', api_token='key') - Old: embedding_llm_config={'provider': 'openai/gpt-4', 'api_token': 'key'} (still works)
2025-09-09 12:49:55 +08:00
parent 0482c1eafc
commit 3bc56dd028
4 changed files with 381 additions and 19 deletions
--- a/docs/examples/adaptive_crawling/llm_config_example.py
+++ b/docs/examples/adaptive_crawling/llm_config_example.py
@@ -0,0 +1,154 @@
+import asyncio
+import os
+from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig, LLMConfig
+
+
+async def test_configuration(name: str, config: AdaptiveConfig, url: str, query: str):
+    """Test a specific configuration"""
+    print(f"\n{'='*60}")
+    print(f"Configuration: {name}")
+    print(f"{'='*60}")
+    
+    async with AsyncWebCrawler(verbose=False) as crawler:
+        adaptive = AdaptiveCrawler(crawler, config)
+        result = await adaptive.digest(start_url=url, query=query)
+        
+        print("\n" + "="*50)
+        print("CRAWL STATISTICS")
+        print("="*50)
+        adaptive.print_stats(detailed=False)
+        
+        # Get the most relevant content found
+        print("\n" + "="*50)
+        print("MOST RELEVANT PAGES")
+        print("="*50)
+        
+        relevant_pages = adaptive.get_relevant_content(top_k=5)
+        for i, page in enumerate(relevant_pages, 1):
+            print(f"\n{i}. {page['url']}")
+            print(f"   Relevance Score: {page['score']:.2%}")
+            
+            # Show a snippet of the content
+            content = page['content'] or ""
+            if content:
+                snippet = content[:200].replace('\n', ' ')
+                if len(content) > 200:
+                    snippet += "..."
+                print(f"   Preview: {snippet}")
+        
+        print(f"\n{'='*50}")
+        print(f"Pages crawled: {len(result.crawled_urls)}")
+        print(f"Final confidence: {adaptive.confidence:.1%}")
+        print(f"Stopped reason: {result.metrics.get('stopped_reason', 'max_pages')}")
+        
+        if result.metrics.get('is_irrelevant', False):
+            print("⚠️  Query detected as irrelevant!")
+        
+        return result
+
+
+async def llm_embedding():
+    """Demonstrate various embedding configurations"""
+    
+    print("EMBEDDING STRATEGY CONFIGURATION EXAMPLES")
+    print("=" * 60)
+    
+    # Base URL and query for testing
+    test_url = "https://docs.python.org/3/library/asyncio.html"
+    
+    openai_llm_config = LLMConfig(
+        provider='openai/text-embedding-3-small',
+        api_token=os.getenv('OPENAI_API_KEY'),
+        temperature=0.7,
+        max_tokens=2000
+    )
+    config_openai = AdaptiveConfig(
+        strategy="embedding",
+        max_pages=10,
+        
+        # Use OpenAI embeddings
+        embedding_llm_config=openai_llm_config,
+        # embedding_llm_config={
+        #     'provider': 'openai/text-embedding-3-small',
+        #     'api_token': os.getenv('OPENAI_API_KEY')
+        # },
+        
+        # OpenAI embeddings are high quality, can be stricter
+        embedding_k_exp=4.0,
+        n_query_variations=12
+    )
+    
+    await test_configuration(
+        "OpenAI Embeddings",
+        config_openai,
+        test_url,
+        # "event-driven architecture patterns"
+        "async await context managers coroutines"
+    )
+    return
+    
+    
+
+async def basic_adaptive_crawling():
+    """Basic adaptive crawling example"""
+    
+    # Initialize the crawler
+    async with AsyncWebCrawler(verbose=True) as crawler:
+        # Create an adaptive crawler with default settings (statistical strategy)
+        adaptive = AdaptiveCrawler(crawler)
+        
+        # Note: You can also use embedding strategy for semantic understanding:
+        # from crawl4ai import AdaptiveConfig
+        # config = AdaptiveConfig(strategy="embedding")
+        # adaptive = AdaptiveCrawler(crawler, config)
+        
+        # Start adaptive crawling
+        print("Starting adaptive crawl for Python async programming information...")
+        result = await adaptive.digest(
+            start_url="https://docs.python.org/3/library/asyncio.html",
+            query="async await context managers coroutines"
+        )
+        
+        # Display crawl statistics
+        print("\n" + "="*50)
+        print("CRAWL STATISTICS")
+        print("="*50)
+        adaptive.print_stats(detailed=False)
+        
+        # Get the most relevant content found
+        print("\n" + "="*50)
+        print("MOST RELEVANT PAGES")
+        print("="*50)
+        
+        relevant_pages = adaptive.get_relevant_content(top_k=5)
+        for i, page in enumerate(relevant_pages, 1):
+            print(f"\n{i}. {page['url']}")
+            print(f"   Relevance Score: {page['score']:.2%}")
+            
+            # Show a snippet of the content
+            content = page['content'] or ""
+            if content:
+                snippet = content[:200].replace('\n', ' ')
+                if len(content) > 200:
+                    snippet += "..."
+                print(f"   Preview: {snippet}")
+        
+        # Show final confidence
+        print(f"\n{'='*50}")
+        print(f"Final Confidence: {adaptive.confidence:.2%}")
+        print(f"Total Pages Crawled: {len(result.crawled_urls)}")
+        print(f"Knowledge Base Size: {len(adaptive.state.knowledge_base)} documents")
+        
+        
+        if adaptive.confidence >= 0.8:
+            print("✓ High confidence - can answer detailed questions about async Python")
+        elif adaptive.confidence >= 0.6:
+            print("~ Moderate confidence - can answer basic questions") 
+        else:
+            print("✗ Low confidence - need more information")
+
+
+
+if __name__ == "__main__":
+    asyncio.run(llm_embedding())
+    # asyncio.run(basic_adaptive_crawling())
--- a/docs/md_v2/core/adaptive-crawling.md
+++ b/docs/md_v2/core/adaptive-crawling.md
@@ -108,7 +108,19 @@ config = AdaptiveConfig(
    embedding_min_confidence_threshold=0.1  # Stop if completely irrelevant
 )

-# With custom embedding provider (e.g., OpenAI)
+# With custom LLM provider for query expansion (recommended)
+from crawl4ai import LLMConfig
+
+config = AdaptiveConfig(
+    strategy="embedding",
+    embedding_llm_config=LLMConfig(
+        provider='openai/text-embedding-3-small',
+        api_token='your-api-key',
+        temperature=0.7
+    )
+)
+
+# Alternative: Dictionary format (backward compatible)
 config = AdaptiveConfig(
    strategy="embedding",
    embedding_llm_config={