feat: clean up unused code and enhance documentation for v0.7.1

- Remove unused StealthConfig from browser_manager.py
- Update LinkPreviewConfig import path in __init__.py and examples
- Fix infinity handling in content_scraping_strategy.py (use 0 instead of float('inf'))
- Remove sanitize_json_data functions from API endpoints
- Add comprehensive C4A Script documentation to release notes
- Update v0.7.0 release notes with improved code examples
- Create v0.7.1 release notes focusing on cleanup and documentation improvements
- Update demo files with corrected import paths and examples
- Fix virtual scroll and adaptive crawling examples across documentation

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
ntohidi
2025-07-17 11:35:16 +02:00
parent ccbe3c105c
commit cf8badfe27
13 changed files with 241 additions and 343 deletions

View File

@@ -30,33 +30,40 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
```python
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
import asyncio
# Initialize with custom adaptive parameters
config = AdaptiveConfig(
confidence_threshold=0.7, # Min confidence to stop crawling
max_depth=5, # Maximum crawl depth
max_pages=20, # Maximum number of pages to crawl
top_k_links=3, # Number of top links to follow per page
strategy="statistical", # 'statistical' or 'embedding'
coverage_weight=0.4, # Weight for coverage in confidence calculation
consistency_weight=0.3, # Weight for consistency in confidence calculation
saturation_weight=0.3 # Weight for saturation in confidence calculation
)
# Initialize adaptive crawler with web crawler
async with AsyncWebCrawler() as crawler:
adaptive_crawler = AdaptiveCrawler(crawler, config)
async def main():
# Crawl and learn patterns
state = await adaptive_crawler.digest(
start_url="https://news.example.com/article/12345",
query="latest news articles and content"
# Configure adaptive crawler
config = AdaptiveConfig(
strategy="statistical", # or "embedding" for semantic understanding
max_pages=10,
confidence_threshold=0.7, # Stop at 70% confidence
top_k_links=3, # Follow top 3 links per page
min_gain_threshold=0.05 # Need 5% information gain to continue
)
# Access results and confidence
print(f"Confidence Level: {adaptive_crawler.confidence:.0%}")
print(f"Pages Crawled: {len(state.crawled_urls)}")
print(f"Knowledge Base: {len(adaptive_crawler.state.knowledge_base)} documents")
async with AsyncWebCrawler(verbose=False) as crawler:
adaptive = AdaptiveCrawler(crawler, config)
print("Starting adaptive crawl about Python decorators...")
result = await adaptive.digest(
start_url="https://docs.python.org/3/glossary.html",
query="python decorators functions wrapping"
)
print(f"\n✅ Crawling Complete!")
print(f"• Confidence Level: {adaptive.confidence:.0%}")
print(f"• Pages Crawled: {len(result.crawled_urls)}")
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
# Get most relevant content
relevant = adaptive.get_relevant_content(top_k=3)
print(f"\nMost Relevant Pages:")
for i, page in enumerate(relevant, 1):
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
asyncio.run(main())
```
**Expected Real-World Impact:**
@@ -141,56 +148,47 @@ async with AsyncWebCrawler() as crawler:
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
### The Three-Layer Scoring System
### Intelligent Link Analysis and Scoring
```python
from crawl4ai import LinkPreviewConfig, CrawlerRunConfig, CacheMode
import asyncio
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
from crawl4ai.adaptive_crawler import LinkPreviewConfig
# Configure intelligent link analysis
link_config = LinkPreviewConfig(
include_internal=True,
include_external=False,
max_links=10,
concurrency=5,
query="python tutorial", # For contextual scoring
score_threshold=0.3,
verbose=True
)
# Use in your crawl
result = await crawler.arun(
"https://tech-blog.example.com",
config=CrawlerRunConfig(
link_preview_config=link_config,
score_links=True, # Enable intrinsic scoring
cache_mode=CacheMode.BYPASS
async def main():
# Configure intelligent link analysis
link_config = LinkPreviewConfig(
include_internal=True,
include_external=False,
max_links=10,
concurrency=5,
query="python tutorial", # For contextual scoring
score_threshold=0.3,
verbose=True
)
)
# Use in your crawl
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
"https://www.geeksforgeeks.org/",
config=CrawlerRunConfig(
link_preview_config=link_config,
score_links=True, # Enable intrinsic scoring
cache_mode=CacheMode.BYPASS
)
)
# Access scored and sorted links
if result.success and result.links:
# Get scored links
internal_links = result.links.get("internal", [])
scored_links = [l for l in internal_links if l.get("total_score")]
scored_links.sort(key=lambda x: x.get("total_score", 0), reverse=True)
# Access scored and sorted links
if result.success and result.links:
for link in result.links.get("internal", []):
text = link.get('text', 'No text')[:40]
print(
text,
f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
)
# Create a scoring table
table = Table(title="Link Scoring Results", box=box.ROUNDED)
table.add_column("Link Text", style="cyan", width=40)
table.add_column("Intrinsic Score", justify="center")
table.add_column("Contextual Score", justify="center")
table.add_column("Total Score", justify="center", style="bold green")
for link in scored_links[:5]:
text = link.get('text', 'No text')[:40]
table.add_row(
text,
f"{link.get('intrinsic_score', 0):.1f}/10",
f"{link.get('contextual_score', 0):.2f}/1",
f"{link.get('total_score', 0):.3f}"
)
console.print(table)
asyncio.run(main())
```
**Scoring Components:**
@@ -223,58 +221,34 @@ console.print(table)
### Technical Architecture
```python
import asyncio
from crawl4ai import AsyncUrlSeeder, SeedingConfig
# Basic discovery - find all product pages
seeder_config = SeedingConfig(
# Discovery sources
source="cc+sitemap", # Sitemap + Common Crawl
# Filtering
pattern="*/product/*", # URL pattern matching
# Validation
live_check=True, # Verify URLs are alive
max_urls=50, # Stop at 50 URLs
# Performance
concurrency=100, # Maximum concurrent requests for live checks/head extraction
hits_per_sec=10 # Rate limit in requests per second to avoid overwhelming servers
)
async def main():
async with AsyncUrlSeeder() as seeder:
# Discover Python tutorial URLs
config = SeedingConfig(
source="sitemap", # Use sitemap
pattern="*python*", # URL pattern filter
extract_head=True, # Get metadata
query="python tutorial", # For relevance scoring
scoring_method="bm25",
score_threshold=0.2,
max_urls=10
)
print("Discovering Python async tutorial URLs...")
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
print(f"\n✅ Found {len(urls)} relevant URLs:")
for i, url_info in enumerate(urls[:5], 1):
print(f"\n{i}. {url_info['url']}")
if url_info.get('relevance_score'):
print(f" Relevance: {url_info['relevance_score']:.3f}")
if url_info.get('head_data', {}).get('title'):
print(f" Title: {url_info['head_data']['title'][:60]}...")
async with AsyncUrlSeeder() as seeder:
console.print("Discovering URLs from Python docs...")
urls = await seeder.urls("docs.python.org", seeding_config)
console.print(f"\n✓ Discovered {len(urls)} URLs")
# Advanced: Relevance-based discovery
research_config = SeedingConfig(
source="sitemap+cc", # Sitemap + Common Crawl
pattern="*/blog/*", # Blog posts only
# Content relevance
extract_head=True, # Get meta tags
query="quantum computing tutorials",
scoring_method="bm25", # BM25 scoring method
score_threshold=0.4, # High relevance only
# Smart filtering
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
force=True # Bypass cache
)
# Discover with progress tracking
discovered = []
async with AsyncUrlSeeder() as seeder:
discovered = await seeder.urls("https://physics-blog.com", research_config)
console.print(f"\n✓ Discovered {len(discovered)} URLs")
# Results include scores and metadata
for url_data in discovered[:5]:
print(f"URL: {url_data['url']}")
print(f"Score: {url_data['relevance_score']:.3f}")
print(f"Title: {url_data['head_data']['title']}")
asyncio.run(main())
```
**Discovery Methods:**