From c09a57644f2b210f3af945cace6565890c892b27 Mon Sep 17 00:00:00 2001 From: Soham Kukreti Date: Thu, 21 Aug 2025 19:11:31 +0530 Subject: [PATCH] =?UTF-8?q?docs:=20update=20adaptive=20crawler=20docs=20an?= =?UTF-8?q?d=20cache=20defaults;=20remove=20deprecated=20examples=20(#1330?= =?UTF-8?q?)=20-=20Replace=20BaseStrategy=20with=20CrawlStrategy=20in=20cu?= =?UTF-8?q?stom=20strategy=20examples=20(DomainSpecificStrategy,=20HybridS?= =?UTF-8?q?trategy)=20-=20Remove=20=E2=80=9CCustom=20Link=20Scoring?= =?UTF-8?q?=E2=80=9D=20and=20=E2=80=9CCaching=20Strategy=E2=80=9D=20sectio?= =?UTF-8?q?ns=20no=20longer=20aligned=20with=20current=20library=20-=20Rev?= =?UTF-8?q?ise=20memory=20pruning=20example=20to=20use=20adaptive.get=5Fre?= =?UTF-8?q?levant=5Fcontent=20and=20index-based=20retention=20of=20top=205?= =?UTF-8?q?00=20docs=20-=20Correct=20Quickstart=20note:=20default=20cache?= =?UTF-8?q?=20mode=20is=20CacheMode.BYPASS;=20instruct=20enabling=20with?= =?UTF-8?q?=20CacheMode.ENABLED?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- docs/md_v2/advanced/adaptive-strategies.md | 50 ++++------------------ docs/md_v2/core/quickstart.md | 2 +- 2 files changed, 10 insertions(+), 42 deletions(-) diff --git a/docs/md_v2/advanced/adaptive-strategies.md b/docs/md_v2/advanced/adaptive-strategies.md index 4ab5b4cd..11c5585a 100644 --- a/docs/md_v2/advanced/adaptive-strategies.md +++ b/docs/md_v2/advanced/adaptive-strategies.md @@ -126,30 +126,6 @@ Factors: - URL depth (fewer slashes = higher authority) - Clean URL structure -### Custom Link Scoring - -```python -class CustomLinkScorer: - def score(self, link: Link, query: str, state: CrawlState) -> float: - # Prioritize specific URL patterns - if "/api/reference/" in link.href: - return 2.0 # Double the score - - # Deprioritize certain sections - if "/archive/" in link.href: - return 0.1 # Reduce score by 90% - - # Default scoring - return 1.0 - -# Use with adaptive crawler -adaptive = 
AdaptiveCrawler( - crawler, - config=config, - link_scorer=CustomLinkScorer() -) -``` - ## Domain-Specific Configurations ### Technical Documentation @@ -230,8 +206,12 @@ config = AdaptiveConfig( # Periodically clean state if len(state.knowledge_base) > 1000: - # Keep only most relevant - state.knowledge_base = get_top_relevant(state.knowledge_base, 500) + # Keep only the top 500 most relevant docs + top_content = adaptive.get_relevant_content(top_k=500) + keep_indices = {d["index"] for d in top_content} + state.knowledge_base = [ + doc for i, doc in enumerate(state.knowledge_base) if i in keep_indices + ] ``` ### Parallel Processing @@ -252,18 +232,6 @@ tasks = [ results = await asyncio.gather(*tasks) ``` -### Caching Strategy - -```python -# Enable caching for repeated crawls -async with AsyncWebCrawler( - config=BrowserConfig( - cache_mode=CacheMode.ENABLED - ) -) as crawler: - adaptive = AdaptiveCrawler(crawler, config) -``` - ## Debugging & Analysis ### Enable Verbose Logging @@ -322,9 +290,9 @@ with open("crawl_analysis.json", "w") as f: ### Implementing a Custom Strategy ```python -from crawl4ai.adaptive_crawler import BaseStrategy +from crawl4ai.adaptive_crawler import CrawlStrategy -class DomainSpecificStrategy(BaseStrategy): +class DomainSpecificStrategy(CrawlStrategy): def calculate_coverage(self, state: CrawlState) -> float: # Custom coverage calculation # e.g., weight certain terms more heavily @@ -351,7 +319,7 @@ adaptive = AdaptiveCrawler( ### Combining Strategies ```python -class HybridStrategy(BaseStrategy): +class HybridStrategy(CrawlStrategy): def __init__(self): self.strategies = [ TechnicalDocStrategy(), diff --git a/docs/md_v2/core/quickstart.md b/docs/md_v2/core/quickstart.md index e9a4b987..83cb6cef 100644 --- a/docs/md_v2/core/quickstart.md +++ b/docs/md_v2/core/quickstart.md @@ -79,7 +79,7 @@ if __name__ == "__main__": asyncio.run(main()) ``` -> IMPORTANT: By default cache mode is set to `CacheMode.ENABLED`. 
So to have fresh content, you need to set it to `CacheMode.BYPASS` +> IMPORTANT: By default, the cache mode is set to `CacheMode.BYPASS` so you get fresh content. Set it to `CacheMode.ENABLED` to enable caching. We’ll explore more advanced config in later tutorials (like enabling proxies, PDF output, multi-tab sessions, etc.). For now, just note how you pass these objects to manage crawling.