Merge pull request #1426 from unclecode/fix/update-quickstart-and-adaptive-strategies-docs
Update Quickstart and Adaptive Strategies documentation
This commit is contained in:
@@ -126,30 +126,6 @@ Factors:
|
|||||||
- URL depth (fewer slashes = higher authority)
|
- URL depth (fewer slashes = higher authority)
|
||||||
- Clean URL structure
|
- Clean URL structure
|
||||||
|
|
||||||
### Custom Link Scoring
|
|
||||||
|
|
||||||
```python
|
|
||||||
class CustomLinkScorer:
|
|
||||||
def score(self, link: Link, query: str, state: CrawlState) -> float:
|
|
||||||
# Prioritize specific URL patterns
|
|
||||||
if "/api/reference/" in link.href:
|
|
||||||
return 2.0 # Double the score
|
|
||||||
|
|
||||||
# Deprioritize certain sections
|
|
||||||
if "/archive/" in link.href:
|
|
||||||
return 0.1 # Reduce score by 90%
|
|
||||||
|
|
||||||
# Default scoring
|
|
||||||
return 1.0
|
|
||||||
|
|
||||||
# Use with adaptive crawler
|
|
||||||
adaptive = AdaptiveCrawler(
|
|
||||||
crawler,
|
|
||||||
config=config,
|
|
||||||
link_scorer=CustomLinkScorer()
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Domain-Specific Configurations
|
## Domain-Specific Configurations
|
||||||
|
|
||||||
### Technical Documentation
|
### Technical Documentation
|
||||||
@@ -230,8 +206,12 @@ config = AdaptiveConfig(
|
|||||||
|
|
||||||
# Periodically clean state
|
# Periodically clean state
|
||||||
if len(state.knowledge_base) > 1000:
|
if len(state.knowledge_base) > 1000:
|
||||||
# Keep only most relevant
|
# Keep only the top 500 most relevant docs
|
||||||
state.knowledge_base = get_top_relevant(state.knowledge_base, 500)
|
top_content = adaptive.get_relevant_content(top_k=500)
|
||||||
|
keep_indices = {d["index"] for d in top_content}
|
||||||
|
state.knowledge_base = [
|
||||||
|
doc for i, doc in enumerate(state.knowledge_base) if i in keep_indices
|
||||||
|
]
|
||||||
```
|
```
|
||||||
|
|
||||||
### Parallel Processing
|
### Parallel Processing
|
||||||
@@ -252,18 +232,6 @@ tasks = [
|
|||||||
results = await asyncio.gather(*tasks)
|
results = await asyncio.gather(*tasks)
|
||||||
```
|
```
|
||||||
|
|
||||||
### Caching Strategy
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Enable caching for repeated crawls
|
|
||||||
async with AsyncWebCrawler(
|
|
||||||
config=BrowserConfig(
|
|
||||||
cache_mode=CacheMode.ENABLED
|
|
||||||
)
|
|
||||||
) as crawler:
|
|
||||||
adaptive = AdaptiveCrawler(crawler, config)
|
|
||||||
```
|
|
||||||
|
|
||||||
## Debugging & Analysis
|
## Debugging & Analysis
|
||||||
|
|
||||||
### Enable Verbose Logging
|
### Enable Verbose Logging
|
||||||
@@ -322,9 +290,9 @@ with open("crawl_analysis.json", "w") as f:
|
|||||||
### Implementing a Custom Strategy
|
### Implementing a Custom Strategy
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.adaptive_crawler import BaseStrategy
|
from crawl4ai.adaptive_crawler import CrawlStrategy
|
||||||
|
|
||||||
class DomainSpecificStrategy(BaseStrategy):
|
class DomainSpecificStrategy(CrawlStrategy):
|
||||||
def calculate_coverage(self, state: CrawlState) -> float:
|
def calculate_coverage(self, state: CrawlState) -> float:
|
||||||
# Custom coverage calculation
|
# Custom coverage calculation
|
||||||
# e.g., weight certain terms more heavily
|
# e.g., weight certain terms more heavily
|
||||||
@@ -351,7 +319,7 @@ adaptive = AdaptiveCrawler(
|
|||||||
### Combining Strategies
|
### Combining Strategies
|
||||||
|
|
||||||
```python
|
```python
|
||||||
class HybridStrategy(BaseStrategy):
|
class HybridStrategy(CrawlStrategy):
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
self.strategies = [
|
self.strategies = [
|
||||||
TechnicalDocStrategy(),
|
TechnicalDocStrategy(),
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ if __name__ == "__main__":
|
|||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
```
|
```
|
||||||
|
|
||||||
> IMPORTANT: By default cache mode is set to `CacheMode.ENABLED`. So to have fresh content, you need to set it to `CacheMode.BYPASS`
|
> IMPORTANT: By default, the cache mode is set to `CacheMode.BYPASS`, so every crawl fetches fresh content. Set it to `CacheMode.ENABLED` to enable caching.
|
||||||
|
|
||||||
We'll explore more advanced config in later tutorials (like enabling proxies, PDF output, multi-tab sessions, etc.). For now, just note how you pass these objects to manage crawling.
|
We'll explore more advanced config in later tutorials (like enabling proxies, PDF output, multi-tab sessions, etc.). For now, just note how you pass these objects to manage crawling.
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user