Merge pull request #1426 from unclecode/fix/update-quickstart-and-adaptive-strategies-docs

Update Quickstart and Adaptive Strategies documentation
2025-08-26 16:53:47 +08:00
parent 4fe2d01361 c09a57644f
commit cce3390a2d
2 changed files with 10 additions and 42 deletions
--- a/docs/md_v2/advanced/adaptive-strategies.md
+++ b/docs/md_v2/advanced/adaptive-strategies.md
@@ -126,30 +126,6 @@ Factors:
 - URL depth (fewer slashes = higher authority)
 - Clean URL structure
 ### Custom Link Scoring
 ```python
 class CustomLinkScorer:
    def score(self, link: Link, query: str, state: CrawlState) -> float:
        # Prioritize specific URL patterns
        if "/api/reference/" in link.href:
            return 2.0  # Double the score
        # Deprioritize certain sections
        if "/archive/" in link.href:
            return 0.1  # Reduce score by 90%
        # Default scoring
        return 1.0
 # Use with adaptive crawler
 adaptive = AdaptiveCrawler(
    crawler,
    config=config,
    link_scorer=CustomLinkScorer()
 )
 ```
 ## Domain-Specific Configurations
 ### Technical Documentation
@@ -230,8 +206,12 @@ config = AdaptiveConfig(
 # Periodically clean state
 if len(state.knowledge_base) > 1000:
-    # Keep only most relevant
+    # Keep only the top 500 most relevant docs
-    state.knowledge_base = get_top_relevant(state.knowledge_base, 500)
+    top_content = adaptive.get_relevant_content(top_k=500)
    keep_indices = {d["index"] for d in top_content}
    state.knowledge_base = [
        doc for i, doc in enumerate(state.knowledge_base) if i in keep_indices
    ]
 ```
 ### Parallel Processing
@@ -252,18 +232,6 @@ tasks = [
 results = await asyncio.gather(*tasks)
 ```
 ### Caching Strategy
 ```python
 # Enable caching for repeated crawls
 async with AsyncWebCrawler(
    config=BrowserConfig(
        cache_mode=CacheMode.ENABLED
    )
 ) as crawler:
    adaptive = AdaptiveCrawler(crawler, config)
 ```
 ## Debugging & Analysis
 ### Enable Verbose Logging
@@ -322,9 +290,9 @@ with open("crawl_analysis.json", "w") as f:
 ### Implementing a Custom Strategy
 ```python
-from crawl4ai.adaptive_crawler import BaseStrategy
+from crawl4ai.adaptive_crawler import CrawlStrategy
-class DomainSpecificStrategy(BaseStrategy):
+class DomainSpecificStrategy(CrawlStrategy):
    def calculate_coverage(self, state: CrawlState) -> float:
        # Custom coverage calculation
        # e.g., weight certain terms more heavily
@@ -351,7 +319,7 @@ adaptive = AdaptiveCrawler(
 ### Combining Strategies
 ```python
-class HybridStrategy(BaseStrategy):
+class HybridStrategy(CrawlStrategy):
    def __init__(self):
        self.strategies = [
            TechnicalDocStrategy(),
--- a/docs/md_v2/core/quickstart.md
+++ b/docs/md_v2/core/quickstart.md
@@ -79,7 +79,7 @@ if __name__ == "__main__":
    asyncio.run(main())
 ```
-> IMPORTANT: By default cache mode is set to `CacheMode.ENABLED`. So to have fresh content, you need to set it to `CacheMode.BYPASS`
+> IMPORTANT: By default, the cache mode is `CacheMode.BYPASS`, so every run fetches fresh content. Set it to `CacheMode.ENABLED` to enable caching.
 We’ll explore more advanced configuration in later tutorials (such as enabling proxies, PDF output, and multi-tab sessions). For now, just note how you pass these objects to manage crawling.