Compare commits

...

11 Commits

Author SHA1 Message Date
unclecode
0163bd797c Merge branch 'release/v0.7.1' 2025-07-17 17:42:04 +08:00
ntohidi
26bad799e4 chore: update version to 0.7.1 2025-07-17 11:37:41 +02:00
ntohidi
cf8badfe27 feat: cleanup unused code and enhance documentation for v0.7.1
- Remove unused StealthConfig from browser_manager.py
- Update LinkPreviewConfig import path in __init__.py and examples
- Fix infinity handling in content_scraping_strategy.py (use 0 instead of float('inf'))
- Remove sanitize_json_data functions from API endpoints
- Add comprehensive C4A Script documentation to release notes
- Update v0.7.0 release notes with improved code examples
- Create v0.7.1 release notes focusing on cleanup and documentation improvements
- Update demo files with corrected import paths and examples
- Fix virtual scroll and adaptive crawling examples across documentation

🤖 Generated with Claude Code

Co-Authored-By: Claude <noreply@anthropic.com>
2025-07-17 11:35:16 +02:00
ntohidi
ccbe3c105c refactor: improve link scoring output format in release notes 2025-07-17 09:13:20 +02:00
Nasrin
761c19d54b Merge pull request #1307 from unclecode/fix/json-infinity-serialization
fix: Handle infinity values in JSON serialization for API responses
2025-07-16 13:34:25 +02:00
Nasrin
14b0ecb137 Merge pull request #1305 from unclecode/fix/release-notes-demo-code
Fix: Update release notes and demo code
2025-07-16 13:33:53 +02:00
ntohidi
1d1970ae69 docs: Update release notes and docs for v0.7.0 with the correct parameters and explanations 2025-07-15 11:32:04 +02:00
ntohidi
205df1e330 docs: Fix virtual scroll configuration 2025-07-15 10:29:47 +02:00
ntohidi
2640dc73a5 docs: Enhance session management example for dynamic content crawling with improved JavaScript handling and extraction schema. ref #226 2025-07-15 10:19:29 +02:00
ntohidi
58024755c5 docs: Update adaptive crawling parameters and examples in README and release notes 2025-07-15 10:15:05 +02:00
UncleCode
bde1bba6a2 docs: Add missing documentation pages to mkdocs.yml
- Added Adaptive Crawling to Core section
- Added URL Seeding to Core section
- Added Adaptive Strategies to Advanced section
2025-07-12 19:56:33 +08:00
19 changed files with 368 additions and 535 deletions

View File

@@ -523,15 +523,18 @@ async def test_news_crawl():
- **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically: - **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically:
```python ```python
config = AdaptiveConfig( config = AdaptiveConfig(
confidence_threshold=0.7, confidence_threshold=0.7, # Min confidence to stop crawling
max_history=100, max_depth=5, # Maximum crawl depth
learning_rate=0.2 max_pages=20, # Maximum number of pages to crawl
strategy="statistical"
) )
result = await crawler.arun( async with AsyncWebCrawler() as crawler:
"https://news.example.com", adaptive_crawler = AdaptiveCrawler(crawler, config)
config=CrawlerRunConfig(adaptive_config=config) state = await adaptive_crawler.digest(
) start_url="https://news.example.com",
query="latest news content"
)
# Crawler learns patterns and improves extraction over time # Crawler learns patterns and improves extraction over time
``` ```

View File

@@ -3,7 +3,7 @@ import warnings
from .async_webcrawler import AsyncWebCrawler, CacheMode from .async_webcrawler import AsyncWebCrawler, CacheMode
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here # MODIFIED: Add SeedingConfig and VirtualScrollConfig here
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig
from .content_scraping_strategy import ( from .content_scraping_strategy import (
ContentScrapingStrategy, ContentScrapingStrategy,
@@ -173,6 +173,7 @@ __all__ = [
"CompilationResult", "CompilationResult",
"ValidationResult", "ValidationResult",
"ErrorDetail", "ErrorDetail",
"LinkPreviewConfig"
] ]

View File

@@ -1,7 +1,7 @@
# crawl4ai/__version__.py # crawl4ai/__version__.py
# This is the version that will be used for stable releases # This is the version that will be used for stable releases
__version__ = "0.7.0" __version__ = "0.7.1"
# For nightly builds, this gets set during build process # For nightly builds, this gets set during build process
__nightly_version__ = None __nightly_version__ = None

View File

@@ -14,23 +14,8 @@ import hashlib
from .js_snippet import load_js_script from .js_snippet import load_js_script
from .config import DOWNLOAD_PAGE_TIMEOUT from .config import DOWNLOAD_PAGE_TIMEOUT
from .async_configs import BrowserConfig, CrawlerRunConfig from .async_configs import BrowserConfig, CrawlerRunConfig
from playwright_stealth import StealthConfig
from .utils import get_chromium_path from .utils import get_chromium_path
stealth_config = StealthConfig(
webdriver=True,
chrome_app=True,
chrome_csi=True,
chrome_load_times=True,
chrome_runtime=True,
navigator_languages=True,
navigator_plugins=True,
navigator_permissions=True,
webgl_vendor=True,
outerdimensions=True,
navigator_hardware_concurrency=True,
media_codecs=True,
)
BROWSER_DISABLE_OPTIONS = [ BROWSER_DISABLE_OPTIONS = [
"--disable-background-networking", "--disable-background-networking",

View File

@@ -1145,10 +1145,10 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
link_data["intrinsic_score"] = intrinsic_score link_data["intrinsic_score"] = intrinsic_score
except Exception: except Exception:
# Fail gracefully - assign default score # Fail gracefully - assign default score
link_data["intrinsic_score"] = float('inf') link_data["intrinsic_score"] = 0
else: else:
# No scoring enabled - assign infinity (all links equal priority) # No scoring enabled - assign 0 (all links equal priority)
link_data["intrinsic_score"] = float('inf') link_data["intrinsic_score"] = 0
is_external = is_external_url(normalized_href, base_domain) is_external = is_external_url(normalized_href, base_domain)
if is_external: if is_external:

View File

@@ -54,27 +54,6 @@ def _get_memory_mb():
logger.warning(f"Could not get memory info: {e}") logger.warning(f"Could not get memory info: {e}")
return None return None
# --- Helper to sanitize JSON data ---
def sanitize_json_data(data):
"""
Recursively sanitize data to handle infinity and NaN values that are not JSON compliant.
"""
import math
if isinstance(data, dict):
return {k: sanitize_json_data(v) for k, v in data.items()}
elif isinstance(data, list):
return [sanitize_json_data(item) for item in data]
elif isinstance(data, float):
if math.isinf(data):
return "Infinity" if data > 0 else "-Infinity"
elif math.isnan(data):
return "NaN"
else:
return data
else:
return data
async def handle_llm_qa( async def handle_llm_qa(
url: str, url: str,
@@ -392,10 +371,8 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
server_memory_mb = _get_memory_mb() server_memory_mb = _get_memory_mb()
result_dict = result.model_dump() result_dict = result.model_dump()
result_dict['server_memory_mb'] = server_memory_mb result_dict['server_memory_mb'] = server_memory_mb
# Sanitize data to handle infinity values logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
sanitized_dict = sanitize_json_data(result_dict) data = json.dumps(result_dict, default=datetime_handler) + "\n"
logger.info(f"Streaming result for {sanitized_dict.get('url', 'unknown')}")
data = json.dumps(sanitized_dict, default=datetime_handler) + "\n"
yield data.encode('utf-8') yield data.encode('utf-8')
except Exception as e: except Exception as e:
logger.error(f"Serialization error: {e}") logger.error(f"Serialization error: {e}")
@@ -469,7 +446,7 @@ async def handle_crawl_request(
return { return {
"success": True, "success": True,
"results": [sanitize_json_data(result.model_dump()) for result in results], "results": [result.model_dump() for result in results],
"server_processing_time_s": end_time - start_time, "server_processing_time_s": end_time - start_time,
"server_memory_delta_mb": mem_delta_mb, "server_memory_delta_mb": mem_delta_mb,
"server_peak_memory_mb": peak_mem_mb "server_peak_memory_mb": peak_mem_mb

View File

@@ -331,27 +331,6 @@ async def generate_pdf(
return {"success": True, "pdf": base64.b64encode(pdf_data).decode()} return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
def sanitize_json_data(data):
"""
Recursively sanitize data to handle infinity and NaN values that are not JSON compliant.
"""
import math
if isinstance(data, dict):
return {k: sanitize_json_data(v) for k, v in data.items()}
elif isinstance(data, list):
return [sanitize_json_data(item) for item in data]
elif isinstance(data, float):
if math.isinf(data):
return "Infinity" if data > 0 else "-Infinity"
elif math.isnan(data):
return "NaN"
else:
return data
else:
return data
@app.post("/execute_js") @app.post("/execute_js")
@limiter.limit(config["rate_limiting"]["default_limit"]) @limiter.limit(config["rate_limiting"]["default_limit"])
@mcp_tool("execute_js") @mcp_tool("execute_js")
@@ -410,9 +389,7 @@ async def execute_js(
results = await crawler.arun(url=body.url, config=cfg) results = await crawler.arun(url=body.url, config=cfg)
# Return JSON-serializable dict of the first CrawlResult # Return JSON-serializable dict of the first CrawlResult
data = results[0].model_dump() data = results[0].model_dump()
# Sanitize data to handle infinity values return JSONResponse(data)
sanitized_data = sanitize_json_data(data)
return JSONResponse(sanitized_data)
@app.get("/llm/{url:path}") @app.get("/llm/{url:path}")

View File

@@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns - **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages - **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization - **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering - **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
- **PDF Parsing**: Extract data from PDF documents
- **Performance Optimizations**: Significant speed and memory improvements - **Performance Optimizations**: Significant speed and memory improvements
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning ## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
@@ -30,44 +29,41 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
- Extraction confidence scores - Extraction confidence scores
```python ```python
from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
import asyncio
# Initialize with custom learning parameters async def main():
config = AdaptiveConfig(
confidence_threshold=0.7, # Min confidence to use learned patterns # Configure adaptive crawler
max_history=100, # Remember last 100 crawls per domain config = AdaptiveConfig(
learning_rate=0.2, # How quickly to adapt to changes strategy="statistical", # or "embedding" for semantic understanding
patterns_per_page=3, # Patterns to learn per page type max_pages=10,
extraction_strategy='css' # 'css' or 'xpath' confidence_threshold=0.7, # Stop at 70% confidence
) top_k_links=3, # Follow top 3 links per page
min_gain_threshold=0.05 # Need 5% information gain to continue
adaptive_crawler = AdaptiveCrawler(config)
# First crawl - crawler learns the structure
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
"https://news.example.com/article/12345",
config=CrawlerRunConfig(
adaptive_config=config,
extraction_hints={ # Optional hints to speed up learning
"title": "article h1",
"content": "article .body-content"
}
)
) )
# Crawler identifies and stores patterns async with AsyncWebCrawler(verbose=False) as crawler:
if result.success: adaptive = AdaptiveCrawler(crawler, config)
state = adaptive_crawler.get_state("news.example.com")
print(f"Learned {len(state.patterns)} patterns") print("Starting adaptive crawl about Python decorators...")
print(f"Confidence: {state.avg_confidence:.2%}") result = await adaptive.digest(
start_url="https://docs.python.org/3/glossary.html",
query="python decorators functions wrapping"
)
print(f"\n✅ Crawling Complete!")
print(f"• Confidence Level: {adaptive.confidence:.0%}")
print(f"• Pages Crawled: {len(result.crawled_urls)}")
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
# Get most relevant content
relevant = adaptive.get_relevant_content(top_k=3)
print(f"\nMost Relevant Pages:")
for i, page in enumerate(relevant, 1):
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
# Subsequent crawls - uses learned patterns asyncio.run(main())
result2 = await crawler.arun(
"https://news.example.com/article/67890",
config=CrawlerRunConfig(adaptive_config=config)
)
# Automatically extracts using learned patterns!
``` ```
**Expected Real-World Impact:** **Expected Real-World Impact:**
@@ -92,9 +88,7 @@ twitter_config = VirtualScrollConfig(
container_selector="[data-testid='primaryColumn']", container_selector="[data-testid='primaryColumn']",
scroll_count=20, # Number of scrolls scroll_count=20, # Number of scrolls
scroll_by="container_height", # Smart scrolling by container size scroll_by="container_height", # Smart scrolling by container size
wait_after_scroll=1.0, # Let content load wait_after_scroll=1.0 # Let content load
capture_method="incremental", # Capture new content on each scroll
deduplicate=True # Remove duplicate elements
) )
# For e-commerce product grids (Instagram style) # For e-commerce product grids (Instagram style)
@@ -102,8 +96,7 @@ grid_config = VirtualScrollConfig(
container_selector="main .product-grid", container_selector="main .product-grid",
scroll_count=30, scroll_count=30,
scroll_by=800, # Fixed pixel scrolling scroll_by=800, # Fixed pixel scrolling
wait_after_scroll=1.5, # Images need time wait_after_scroll=1.5 # Images need time
stop_on_no_change=True # Smart stopping
) )
# For news feeds with lazy loading # For news feeds with lazy loading
@@ -111,9 +104,7 @@ news_config = VirtualScrollConfig(
container_selector=".article-feed", container_selector=".article-feed",
scroll_count=50, scroll_count=50,
scroll_by="page_height", # Viewport-based scrolling scroll_by="page_height", # Viewport-based scrolling
wait_after_scroll=0.5, wait_after_scroll=0.5 # Wait for content to load
wait_for_selector=".article-card", # Wait for specific elements
timeout=30000 # Max 30 seconds total
) )
# Use it in your crawl # Use it in your crawl
@@ -157,68 +148,63 @@ async with AsyncWebCrawler() as crawler:
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals. **My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
### The Three-Layer Scoring System ### Intelligent Link Analysis and Scoring
```python ```python
from crawl4ai import LinkPreviewConfig import asyncio
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
from crawl4ai.adaptive_crawler import LinkPreviewConfig
# Configure intelligent link analysis async def main():
link_config = LinkPreviewConfig( # Configure intelligent link analysis
# What to analyze link_config = LinkPreviewConfig(
include_internal=True, include_internal=True,
include_external=True, include_external=False,
max_links=100, # Analyze top 100 links max_links=10,
concurrency=5,
# Relevance scoring query="python tutorial", # For contextual scoring
query="machine learning tutorials", # Your interest score_threshold=0.3,
score_threshold=0.3, # Minimum relevance score verbose=True
# Performance
concurrent_requests=10, # Parallel processing
timeout_per_link=5000, # 5s per link
# Advanced scoring weights
scoring_weights={
"intrinsic": 0.3, # Link quality indicators
"contextual": 0.5, # Relevance to query
"popularity": 0.2 # Link prominence
}
)
# Use in your crawl
result = await crawler.arun(
"https://tech-blog.example.com",
config=CrawlerRunConfig(
link_preview_config=link_config,
score_links=True
) )
) # Use in your crawl
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
"https://www.geeksforgeeks.org/",
config=CrawlerRunConfig(
link_preview_config=link_config,
score_links=True, # Enable intrinsic scoring
cache_mode=CacheMode.BYPASS
)
)
# Access scored and sorted links # Access scored and sorted links
for link in result.links["internal"][:10]: # Top 10 internal links if result.success and result.links:
print(f"Score: {link['total_score']:.3f}") for link in result.links.get("internal", []):
print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes text = link.get('text', 'No text')[:40]
print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query print(
print(f" URL: {link['href']}") text,
print(f" Title: {link['head_data']['title']}") f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
print(f" Description: {link['head_data']['meta']['description'][:100]}...") f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
)
asyncio.run(main())
``` ```
**Scoring Components:** **Scoring Components:**
1. **Intrinsic Score (0-10)**: Based on link quality indicators 1. **Intrinsic Score**: Based on link quality indicators
- Position on page (navigation, content, footer) - Position on page (navigation, content, footer)
- Link attributes (rel, title, class names) - Link attributes (rel, title, class names)
- Anchor text quality and length - Anchor text quality and length
- URL structure and depth - URL structure and depth
2. **Contextual Score (0-1)**: Relevance to your query 2. **Contextual Score**: Relevance to your query using BM25 algorithm
- Semantic similarity using embeddings
- Keyword matching in link text and title - Keyword matching in link text and title
- Meta description analysis - Meta description analysis
- Content preview scoring - Content preview scoring
3. **Total Score**: Weighted combination for final ranking 3. **Total Score**: Combined score for final ranking
**Expected Real-World Impact:** **Expected Real-World Impact:**
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links - **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
@@ -235,58 +221,34 @@ for link in result.links["internal"][:10]: # Top 10 internal links
### Technical Architecture ### Technical Architecture
```python ```python
import asyncio
from crawl4ai import AsyncUrlSeeder, SeedingConfig from crawl4ai import AsyncUrlSeeder, SeedingConfig
# Basic discovery - find all product pages async def main():
seeder_config = SeedingConfig( async with AsyncUrlSeeder() as seeder:
# Discovery sources # Discover Python tutorial URLs
source="sitemap+cc", # Sitemap + Common Crawl config = SeedingConfig(
source="sitemap", # Use sitemap
# Filtering pattern="*python*", # URL pattern filter
pattern="*/product/*", # URL pattern matching extract_head=True, # Get metadata
ignore_patterns=["*/reviews/*", "*/questions/*"], query="python tutorial", # For relevance scoring
scoring_method="bm25",
# Validation score_threshold=0.2,
live_check=True, # Verify URLs are alive max_urls=10
max_urls=5000, # Stop at 5000 URLs )
# Performance print("Discovering Python async tutorial URLs...")
concurrency=100, # Parallel requests urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
hits_per_sec=10 # Rate limiting
) print(f"\n✅ Found {len(urls)} relevant URLs:")
for i, url_info in enumerate(urls[:5], 1):
print(f"\n{i}. {url_info['url']}")
if url_info.get('relevance_score'):
print(f" Relevance: {url_info['relevance_score']:.3f}")
if url_info.get('head_data', {}).get('title'):
print(f" Title: {url_info['head_data']['title'][:60]}...")
seeder = AsyncUrlSeeder(seeder_config) asyncio.run(main())
urls = await seeder.discover("https://shop.example.com")
# Advanced: Relevance-based discovery
research_config = SeedingConfig(
source="crawl+sitemap", # Deep crawl + sitemap
pattern="*/blog/*", # Blog posts only
# Content relevance
extract_head=True, # Get meta tags
query="quantum computing tutorials",
scoring_method="bm25", # Or "semantic" (coming soon)
score_threshold=0.4, # High relevance only
# Smart filtering
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
min_content_length=500, # Skip thin content
force=True # Bypass cache
)
# Discover with progress tracking
discovered = []
async for batch in seeder.discover_iter("https://physics-blog.com", research_config):
discovered.extend(batch)
print(f"Found {len(discovered)} relevant URLs so far...")
# Results include scores and metadata
for url_data in discovered[:5]:
print(f"URL: {url_data['url']}")
print(f"Score: {url_data['score']:.3f}")
print(f"Title: {url_data['title']}")
``` ```
**Discovery Methods:** **Discovery Methods:**
@@ -309,35 +271,18 @@ This release includes significant performance improvements through optimized res
### What We Optimized ### What We Optimized
```python ```python
# Before v0.7.0 (slow) # Optimized crawling with v0.7.0 improvements
results = [] results = []
for url in urls: for url in urls:
result = await crawler.arun(url) result = await crawler.arun(
results.append(result) url,
config=CrawlerRunConfig(
# After v0.7.0 (fast) # Performance optimizations
# Automatic batching and connection pooling wait_until="domcontentloaded", # Faster than networkidle
results = await crawler.arun_batch( cache_mode=CacheMode.ENABLED # Enable caching
urls, )
config=CrawlerRunConfig(
# New performance options
batch_size=10, # Process 10 URLs concurrently
reuse_browser=True, # Keep browser warm
eager_loading=False, # Load only what's needed
streaming_extraction=True, # Stream large extractions
# Optimized defaults
wait_until="domcontentloaded", # Faster than networkidle
exclude_external_resources=True, # Skip third-party assets
block_ads=True # Ad blocking built-in
) )
) results.append(result)
# Memory-efficient streaming for large crawls
async for result in crawler.arun_stream(large_url_list):
# Process results as they complete
await process_result(result)
# Memory is freed after each iteration
``` ```
**Performance Gains:** **Performance Gains:**
@@ -347,24 +292,6 @@ async for result in crawler.arun_stream(large_url_list):
- **Memory Usage**: 60% reduction with streaming processing - **Memory Usage**: 60% reduction with streaming processing
- **Concurrent Crawls**: Handle 5x more parallel requests - **Concurrent Crawls**: Handle 5x more parallel requests
## 📄 PDF Support
PDF extraction is now natively supported in Crawl4AI.
```python
# Extract data from PDF documents
result = await crawler.arun(
"https://example.com/report.pdf",
config=CrawlerRunConfig(
pdf_extraction=True,
extraction_strategy=JsonCssExtractionStrategy({
# Works on converted PDF structure
"title": {"selector": "h1", "type": "text"},
"sections": {"selector": "h2", "type": "list"}
})
)
)
```
## 🔧 Important Changes ## 🔧 Important Changes

View File

@@ -0,0 +1,43 @@
# 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
*July 17, 2025 • 2 min read*
---
A small maintenance release that removes unused code and improves documentation.
## 🎯 What's Changed
- **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
- **Updated documentation** with better examples and parameter explanations
- **Fixed virtual scroll configuration** examples in docs
## 🧹 Code Cleanup
Removed the `StealthConfig` import and its configuration object, neither of which was used anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
```python
# Removed unused code:
from playwright_stealth import StealthConfig
stealth_config = StealthConfig(...) # This was never used
```
## 📖 Documentation Updates
- Fixed adaptive crawling parameter examples
- Updated session management documentation
- Corrected virtual scroll configuration examples
## 🚀 Installation
```bash
pip install crawl4ai==0.7.1
```
No breaking changes - upgrade directly from v0.7.0.
---
Questions? Issues?
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)

View File

@@ -18,7 +18,7 @@ Usage:
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.async_configs import LinkPreviewConfig from crawl4ai import LinkPreviewConfig
async def basic_link_head_extraction(): async def basic_link_head_extraction():

View File

@@ -49,46 +49,75 @@ from crawl4ai import JsonCssExtractionStrategy
from crawl4ai.cache_context import CacheMode from crawl4ai.cache_context import CacheMode
async def crawl_dynamic_content(): async def crawl_dynamic_content():
async with AsyncWebCrawler() as crawler: url = "https://github.com/microsoft/TypeScript/commits/main"
session_id = "github_commits_session" session_id = "wait_for_session"
url = "https://github.com/microsoft/TypeScript/commits/main" all_commits = []
all_commits = []
# Define extraction schema js_next_page = """
schema = { const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
"name": "Commit Extractor", if (commits.length > 0) {
"baseSelector": "li.Box-sc-g0xbh4-0", window.lastCommit = commits[0].textContent.trim();
"fields": [{ }
"name": "title", "selector": "h4.markdown-title", "type": "text" const button = document.querySelector('a[data-testid="pagination-next-button"]');
}], if (button) {button.click(); console.log('button clicked') }
} """
extraction_strategy = JsonCssExtractionStrategy(schema)
# JavaScript and wait configurations wait_for = """() => {
js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();""" const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0""" if (commits.length === 0) return false;
const firstCommit = commits[0].textContent.trim();
# Crawl multiple pages return firstCommit !== window.lastCommit;
}"""
schema = {
"name": "Commit Extractor",
"baseSelector": "li[data-testid='commit-row-item']",
"fields": [
{
"name": "title",
"selector": "h4 a",
"type": "text",
"transform": "strip",
},
],
}
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
browser_config = BrowserConfig(
verbose=True,
headless=False,
)
async with AsyncWebCrawler(config=browser_config) as crawler:
for page in range(3): for page in range(3):
config = CrawlerRunConfig( crawler_config = CrawlerRunConfig(
url=url,
session_id=session_id, session_id=session_id,
css_selector="li[data-testid='commit-row-item']",
extraction_strategy=extraction_strategy, extraction_strategy=extraction_strategy,
js_code=js_next_page if page > 0 else None, js_code=js_next_page if page > 0 else None,
wait_for=wait_for if page > 0 else None, wait_for=wait_for if page > 0 else None,
js_only=page > 0, js_only=page > 0,
cache_mode=CacheMode.BYPASS cache_mode=CacheMode.BYPASS,
capture_console_messages=True,
) )
result = await crawler.arun(config=config) result = await crawler.arun(url=url, config=crawler_config)
if result.success:
if result.console_messages:
print(f"Page {page + 1} console messages:", result.console_messages)
if result.extracted_content:
# print(f"Page {page + 1} result:", result.extracted_content)
commits = json.loads(result.extracted_content) commits = json.loads(result.extracted_content)
all_commits.extend(commits) all_commits.extend(commits)
print(f"Page {page + 1}: Found {len(commits)} commits") print(f"Page {page + 1}: Found {len(commits)} commits")
else:
print(f"Page {page + 1}: No content extracted")
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
# Clean up session # Clean up session
await crawler.crawler_strategy.kill_session(session_id) await crawler.crawler_strategy.kill_session(session_id)
return all_commits
``` ```
--- ---

View File

@@ -91,13 +91,12 @@ async def crawl_twitter_timeline():
wait_after_scroll=1.0 # Twitter needs time to load wait_after_scroll=1.0 # Twitter needs time to load
) )
browser_config = BrowserConfig(headless=True) # Set to False to watch it work
config = CrawlerRunConfig( config = CrawlerRunConfig(
virtual_scroll_config=virtual_config, virtual_scroll_config=virtual_config
# Optional: Set headless=False to watch it work
# browser_config=BrowserConfig(headless=False)
) )
async with AsyncWebCrawler() as crawler: async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun( result = await crawler.arun(
url="https://twitter.com/search?q=AI", url="https://twitter.com/search?q=AI",
config=config config=config
@@ -200,7 +199,7 @@ Use **scan_full_page** when:
Virtual Scroll works seamlessly with extraction strategies: Virtual Scroll works seamlessly with extraction strategies:
```python ```python
from crawl4ai import LLMExtractionStrategy from crawl4ai import LLMExtractionStrategy, LLMConfig
# Define extraction schema # Define extraction schema
schema = { schema = {
@@ -222,7 +221,7 @@ config = CrawlerRunConfig(
scroll_count=20 scroll_count=20
), ),
extraction_strategy=LLMExtractionStrategy( extraction_strategy=LLMExtractionStrategy(
provider="openai/gpt-4o-mini", llm_config=LLMConfig(provider="openai/gpt-4o-mini"),
schema=schema schema=schema
) )
) )

View File

@@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns - **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages - **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization - **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering - **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
- **PDF Parsing**: Extract data from PDF documents
- **Performance Optimizations**: Significant speed and memory improvements - **Performance Optimizations**: Significant speed and memory improvements
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning ## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
@@ -30,44 +29,41 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
- Extraction confidence scores - Extraction confidence scores
```python ```python
from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
import asyncio
# Initialize with custom learning parameters async def main():
config = AdaptiveConfig(
confidence_threshold=0.7, # Min confidence to use learned patterns # Configure adaptive crawler
max_history=100, # Remember last 100 crawls per domain config = AdaptiveConfig(
learning_rate=0.2, # How quickly to adapt to changes strategy="statistical", # or "embedding" for semantic understanding
patterns_per_page=3, # Patterns to learn per page type max_pages=10,
extraction_strategy='css' # 'css' or 'xpath' confidence_threshold=0.7, # Stop at 70% confidence
) top_k_links=3, # Follow top 3 links per page
min_gain_threshold=0.05 # Need 5% information gain to continue
adaptive_crawler = AdaptiveCrawler(config)
# First crawl - crawler learns the structure
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
"https://news.example.com/article/12345",
config=CrawlerRunConfig(
adaptive_config=config,
extraction_hints={ # Optional hints to speed up learning
"title": "article h1",
"content": "article .body-content"
}
)
) )
# Crawler identifies and stores patterns async with AsyncWebCrawler(verbose=False) as crawler:
if result.success: adaptive = AdaptiveCrawler(crawler, config)
state = adaptive_crawler.get_state("news.example.com")
print(f"Learned {len(state.patterns)} patterns") print("Starting adaptive crawl about Python decorators...")
print(f"Confidence: {state.avg_confidence:.2%}") result = await adaptive.digest(
start_url="https://docs.python.org/3/glossary.html",
query="python decorators functions wrapping"
)
print(f"\n✅ Crawling Complete!")
print(f"• Confidence Level: {adaptive.confidence:.0%}")
print(f"• Pages Crawled: {len(result.crawled_urls)}")
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
# Get most relevant content
relevant = adaptive.get_relevant_content(top_k=3)
print(f"\nMost Relevant Pages:")
for i, page in enumerate(relevant, 1):
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
# Subsequent crawls - uses learned patterns asyncio.run(main())
result2 = await crawler.arun(
"https://news.example.com/article/67890",
config=CrawlerRunConfig(adaptive_config=config)
)
# Automatically extracts using learned patterns!
``` ```
**Expected Real-World Impact:** **Expected Real-World Impact:**
@@ -92,9 +88,7 @@ twitter_config = VirtualScrollConfig(
container_selector="[data-testid='primaryColumn']", container_selector="[data-testid='primaryColumn']",
scroll_count=20, # Number of scrolls scroll_count=20, # Number of scrolls
scroll_by="container_height", # Smart scrolling by container size scroll_by="container_height", # Smart scrolling by container size
wait_after_scroll=1.0, # Let content load wait_after_scroll=1.0 # Let content load
capture_method="incremental", # Capture new content on each scroll
deduplicate=True # Remove duplicate elements
) )
# For e-commerce product grids (Instagram style) # For e-commerce product grids (Instagram style)
@@ -102,8 +96,7 @@ grid_config = VirtualScrollConfig(
container_selector="main .product-grid", container_selector="main .product-grid",
scroll_count=30, scroll_count=30,
scroll_by=800, # Fixed pixel scrolling scroll_by=800, # Fixed pixel scrolling
wait_after_scroll=1.5, # Images need time wait_after_scroll=1.5 # Images need time
stop_on_no_change=True # Smart stopping
) )
# For news feeds with lazy loading # For news feeds with lazy loading
@@ -111,9 +104,7 @@ news_config = VirtualScrollConfig(
container_selector=".article-feed", container_selector=".article-feed",
scroll_count=50, scroll_count=50,
scroll_by="page_height", # Viewport-based scrolling scroll_by="page_height", # Viewport-based scrolling
wait_after_scroll=0.5, wait_after_scroll=0.5 # Wait for content to load
wait_for_selector=".article-card", # Wait for specific elements
timeout=30000 # Max 30 seconds total
) )
# Use it in your crawl # Use it in your crawl
@@ -157,68 +148,63 @@ async with AsyncWebCrawler() as crawler:
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals. **My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
### The Three-Layer Scoring System ### Intelligent Link Analysis and Scoring
```python ```python
from crawl4ai import LinkPreviewConfig import asyncio
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
from crawl4ai.adaptive_crawler import LinkPreviewConfig
# Configure intelligent link analysis async def main():
link_config = LinkPreviewConfig( # Configure intelligent link analysis
# What to analyze link_config = LinkPreviewConfig(
include_internal=True, include_internal=True,
include_external=True, include_external=False,
max_links=100, # Analyze top 100 links max_links=10,
concurrency=5,
# Relevance scoring query="python tutorial", # For contextual scoring
query="machine learning tutorials", # Your interest score_threshold=0.3,
score_threshold=0.3, # Minimum relevance score verbose=True
# Performance
concurrent_requests=10, # Parallel processing
timeout_per_link=5000, # 5s per link
# Advanced scoring weights
scoring_weights={
"intrinsic": 0.3, # Link quality indicators
"contextual": 0.5, # Relevance to query
"popularity": 0.2 # Link prominence
}
)
# Use in your crawl
result = await crawler.arun(
"https://tech-blog.example.com",
config=CrawlerRunConfig(
link_preview_config=link_config,
score_links=True
) )
) # Use in your crawl
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
"https://www.geeksforgeeks.org/",
config=CrawlerRunConfig(
link_preview_config=link_config,
score_links=True, # Enable intrinsic scoring
cache_mode=CacheMode.BYPASS
)
)
# Access scored and sorted links # Access scored and sorted links
for link in result.links["internal"][:10]: # Top 10 internal links if result.success and result.links:
print(f"Score: {link['total_score']:.3f}") for link in result.links.get("internal", []):
print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes text = link.get('text', 'No text')[:40]
print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query print(
print(f" URL: {link['href']}") text,
print(f" Title: {link['head_data']['title']}") f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
print(f" Description: {link['head_data']['meta']['description'][:100]}...") f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
)
asyncio.run(main())
``` ```
**Scoring Components:** **Scoring Components:**
1. **Intrinsic Score (0-10)**: Based on link quality indicators 1. **Intrinsic Score**: Based on link quality indicators
- Position on page (navigation, content, footer) - Position on page (navigation, content, footer)
- Link attributes (rel, title, class names) - Link attributes (rel, title, class names)
- Anchor text quality and length - Anchor text quality and length
- URL structure and depth - URL structure and depth
2. **Contextual Score (0-1)**: Relevance to your query 2. **Contextual Score**: Relevance to your query using BM25 algorithm
- Semantic similarity using embeddings
- Keyword matching in link text and title - Keyword matching in link text and title
- Meta description analysis - Meta description analysis
- Content preview scoring - Content preview scoring
3. **Total Score**: Weighted combination for final ranking 3. **Total Score**: Combined score for final ranking
**Expected Real-World Impact:** **Expected Real-World Impact:**
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links - **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
@@ -235,58 +221,34 @@ for link in result.links["internal"][:10]: # Top 10 internal links
### Technical Architecture ### Technical Architecture
```python ```python
import asyncio
from crawl4ai import AsyncUrlSeeder, SeedingConfig from crawl4ai import AsyncUrlSeeder, SeedingConfig
# Basic discovery - find all product pages async def main():
seeder_config = SeedingConfig( async with AsyncUrlSeeder() as seeder:
# Discovery sources # Discover Python tutorial URLs
source="sitemap+cc", # Sitemap + Common Crawl config = SeedingConfig(
source="sitemap", # Use sitemap
# Filtering pattern="*python*", # URL pattern filter
pattern="*/product/*", # URL pattern matching extract_head=True, # Get metadata
ignore_patterns=["*/reviews/*", "*/questions/*"], query="python tutorial", # For relevance scoring
scoring_method="bm25",
# Validation score_threshold=0.2,
live_check=True, # Verify URLs are alive max_urls=10
max_urls=5000, # Stop at 5000 URLs )
# Performance print("Discovering Python async tutorial URLs...")
concurrency=100, # Parallel requests urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
hits_per_sec=10 # Rate limiting
) print(f"\n✅ Found {len(urls)} relevant URLs:")
for i, url_info in enumerate(urls[:5], 1):
print(f"\n{i}. {url_info['url']}")
if url_info.get('relevance_score'):
print(f" Relevance: {url_info['relevance_score']:.3f}")
if url_info.get('head_data', {}).get('title'):
print(f" Title: {url_info['head_data']['title'][:60]}...")
seeder = AsyncUrlSeeder(seeder_config) asyncio.run(main())
urls = await seeder.discover("https://shop.example.com")
# Advanced: Relevance-based discovery
research_config = SeedingConfig(
source="crawl+sitemap", # Deep crawl + sitemap
pattern="*/blog/*", # Blog posts only
# Content relevance
extract_head=True, # Get meta tags
query="quantum computing tutorials",
scoring_method="bm25", # Or "semantic" (coming soon)
score_threshold=0.4, # High relevance only
# Smart filtering
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
min_content_length=500, # Skip thin content
force=True # Bypass cache
)
# Discover with progress tracking
discovered = []
async for batch in seeder.discover_iter("https://physics-blog.com", research_config):
discovered.extend(batch)
print(f"Found {len(discovered)} relevant URLs so far...")
# Results include scores and metadata
for url_data in discovered[:5]:
print(f"URL: {url_data['url']}")
print(f"Score: {url_data['score']:.3f}")
print(f"Title: {url_data['title']}")
``` ```
**Discovery Methods:** **Discovery Methods:**
@@ -309,35 +271,18 @@ This release includes significant performance improvements through optimized res
### What We Optimized ### What We Optimized
```python ```python
# Before v0.7.0 (slow) # Optimized crawling with v0.7.0 improvements
results = [] results = []
for url in urls: for url in urls:
result = await crawler.arun(url) result = await crawler.arun(
results.append(result) url,
config=CrawlerRunConfig(
# After v0.7.0 (fast) # Performance optimizations
# Automatic batching and connection pooling wait_until="domcontentloaded", # Faster than networkidle
results = await crawler.arun_batch( cache_mode=CacheMode.ENABLED # Enable caching
urls, )
config=CrawlerRunConfig(
# New performance options
batch_size=10, # Process 10 URLs concurrently
reuse_browser=True, # Keep browser warm
eager_loading=False, # Load only what's needed
streaming_extraction=True, # Stream large extractions
# Optimized defaults
wait_until="domcontentloaded", # Faster than networkidle
exclude_external_resources=True, # Skip third-party assets
block_ads=True # Ad blocking built-in
) )
) results.append(result)
# Memory-efficient streaming for large crawls
async for result in crawler.arun_stream(large_url_list):
# Process results as they complete
await process_result(result)
# Memory is freed after each iteration
``` ```
**Performance Gains:** **Performance Gains:**
@@ -347,24 +292,6 @@ async for result in crawler.arun_stream(large_url_list):
- **Memory Usage**: 60% reduction with streaming processing - **Memory Usage**: 60% reduction with streaming processing
- **Concurrent Crawls**: Handle 5x more parallel requests - **Concurrent Crawls**: Handle 5x more parallel requests
## 📄 PDF Support
PDF extraction is now natively supported in Crawl4AI.
```python
# Extract data from PDF documents
result = await crawler.arun(
"https://example.com/report.pdf",
config=CrawlerRunConfig(
pdf_extraction=True,
extraction_strategy=JsonCssExtractionStrategy({
# Works on converted PDF structure
"title": {"selector": "h1", "type": "text"},
"sections": {"selector": "h2", "type": "list"}
})
)
)
```
## 🔧 Important Changes ## 🔧 Important Changes

View File

@@ -35,7 +35,7 @@ from crawl4ai import AsyncWebCrawler, AdaptiveCrawler
async def main(): async def main():
async with AsyncWebCrawler() as crawler: async with AsyncWebCrawler() as crawler:
# Create an adaptive crawler # Create an adaptive crawler (config is optional)
adaptive = AdaptiveCrawler(crawler) adaptive = AdaptiveCrawler(crawler)
# Start crawling with a query # Start crawling with a query
@@ -59,13 +59,13 @@ async def main():
from crawl4ai import AdaptiveConfig from crawl4ai import AdaptiveConfig
config = AdaptiveConfig( config = AdaptiveConfig(
confidence_threshold=0.7, # Stop when 70% confident (default: 0.8) confidence_threshold=0.8, # Stop when 80% confident (default: 0.7)
max_pages=20, # Maximum pages to crawl (default: 50) max_pages=30, # Maximum pages to crawl (default: 20)
top_k_links=3, # Links to follow per page (default: 5) top_k_links=5, # Links to follow per page (default: 3)
min_gain_threshold=0.05 # Minimum expected gain to continue (default: 0.1) min_gain_threshold=0.05 # Minimum expected gain to continue (default: 0.1)
) )
adaptive = AdaptiveCrawler(crawler, config=config) adaptive = AdaptiveCrawler(crawler, config)
``` ```
## Crawling Strategies ## Crawling Strategies
@@ -198,8 +198,8 @@ if result.metrics.get('is_irrelevant', False):
The confidence score (0-1) indicates how sufficient the gathered information is: The confidence score (0-1) indicates how sufficient the gathered information is:
- **0.0-0.3**: Insufficient information, needs more crawling - **0.0-0.3**: Insufficient information, needs more crawling
- **0.3-0.6**: Partial information, may answer basic queries - **0.3-0.6**: Partial information, may answer basic queries
- **0.6-0.8**: Good coverage, can answer most queries - **0.6-0.7**: Good coverage, can answer most queries
- **0.8-1.0**: Excellent coverage, comprehensive information - **0.7-1.0**: Excellent coverage, comprehensive information
### Statistics Display ### Statistics Display
@@ -257,9 +257,9 @@ new_adaptive.import_knowledge_base("knowledge_base.jsonl")
- Avoid overly broad queries - Avoid overly broad queries
### 2. Threshold Tuning ### 2. Threshold Tuning
- Start with default (0.8) for general use - Start with default (0.7) for general use
- Lower to 0.6-0.7 for exploratory crawling - Lower to 0.5-0.6 for exploratory crawling
- Raise to 0.9+ for exhaustive coverage - Raise to 0.8+ for exhaustive coverage
### 3. Performance Optimization ### 3. Performance Optimization
- Use appropriate `max_pages` limits - Use appropriate `max_pages` limits

View File

@@ -125,7 +125,7 @@ Here's a full example you can copy, paste, and run immediately:
```python ```python
import asyncio import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.async_configs import LinkPreviewConfig from crawl4ai import LinkPreviewConfig
async def extract_link_heads_example(): async def extract_link_heads_example():
""" """
@@ -237,7 +237,7 @@ if __name__ == "__main__":
The `LinkPreviewConfig` class supports these options: The `LinkPreviewConfig` class supports these options:
```python ```python
from crawl4ai.async_configs import LinkPreviewConfig from crawl4ai import LinkPreviewConfig
link_preview_config = LinkPreviewConfig( link_preview_config = LinkPreviewConfig(
# BASIC SETTINGS # BASIC SETTINGS

View File

@@ -137,7 +137,7 @@ async def smart_blog_crawler():
word_count_threshold=300 # Only substantial articles word_count_threshold=300 # Only substantial articles
) )
# Extract URLs and stream results as they come # Extract URLs and crawl them
tutorial_urls = [t["url"] for t in tutorials[:10]] tutorial_urls = [t["url"] for t in tutorials[:10]]
results = await crawler.arun_many(tutorial_urls, config=config) results = await crawler.arun_many(tutorial_urls, config=config)
@@ -231,7 +231,7 @@ Common Crawl is a massive public dataset that regularly crawls the entire web. I
```python ```python
# Use both sources # Use both sources
config = SeedingConfig(source="cc+sitemap") config = SeedingConfig(source="sitemap+cc")
urls = await seeder.urls("example.com", config) urls = await seeder.urls("example.com", config)
``` ```
@@ -241,13 +241,13 @@ The `SeedingConfig` object is your control panel. Here's everything you can conf
| Parameter | Type | Default | Description | | Parameter | Type | Default | Description |
|-----------|------|---------|-------------| |-----------|------|---------|-------------|
| `source` | str | "cc" | URL source: "cc" (Common Crawl), "sitemap", or "cc+sitemap" | | `source` | str | "sitemap+cc" | URL source: "cc" (Common Crawl), "sitemap", or "sitemap+cc" |
| `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") | | `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") |
| `extract_head` | bool | False | Extract metadata from page `<head>` | | `extract_head` | bool | False | Extract metadata from page `<head>` |
| `live_check` | bool | False | Verify URLs are accessible | | `live_check` | bool | False | Verify URLs are accessible |
| `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) | | `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) |
| `concurrency` | int | 10 | Parallel workers for fetching | | `concurrency` | int | 10 | Parallel workers for fetching |
| `hits_per_sec` | int | None | Rate limit for requests | | `hits_per_sec` | int | 5 | Rate limit for requests |
| `force` | bool | False | Bypass cache, fetch fresh data | | `force` | bool | False | Bypass cache, fetch fresh data |
| `verbose` | bool | False | Show detailed progress | | `verbose` | bool | False | Show detailed progress |
| `query` | str | None | Search query for BM25 scoring | | `query` | str | None | Search query for BM25 scoring |
@@ -522,7 +522,7 @@ urls = await seeder.urls("docs.example.com", config)
```python ```python
# Find specific products # Find specific products
config = SeedingConfig( config = SeedingConfig(
source="cc+sitemap", # Use both sources source="sitemap+cc", # Use both sources
extract_head=True, extract_head=True,
query="wireless headphones noise canceling", query="wireless headphones noise canceling",
scoring_method="bm25", scoring_method="bm25",
@@ -782,7 +782,7 @@ class ResearchAssistant:
# Step 1: Discover relevant URLs # Step 1: Discover relevant URLs
config = SeedingConfig( config = SeedingConfig(
source="cc+sitemap", # Maximum coverage source="sitemap+cc", # Maximum coverage
extract_head=True, # Get metadata extract_head=True, # Get metadata
query=topic, # Research topic query=topic, # Research topic
scoring_method="bm25", # Smart scoring scoring_method="bm25", # Smart scoring
@@ -832,7 +832,8 @@ class ResearchAssistant:
# Extract URLs and crawl all articles # Extract URLs and crawl all articles
article_urls = [article['url'] for article in top_articles] article_urls = [article['url'] for article in top_articles]
results = [] results = []
async for result in await crawler.arun_many(article_urls, config=config): crawl_results = await crawler.arun_many(article_urls, config=config)
async for result in crawl_results:
if result.success: if result.success:
results.append({ results.append({
'url': result.url, 'url': result.url,
@@ -933,10 +934,10 @@ config = SeedingConfig(concurrency=10, hits_per_sec=5)
# When crawling many URLs # When crawling many URLs
async with AsyncWebCrawler() as crawler: async with AsyncWebCrawler() as crawler:
# Assuming urls is a list of URL strings # Assuming urls is a list of URL strings
results = await crawler.arun_many(urls, config=config) crawl_results = await crawler.arun_many(urls, config=config)
# Process as they arrive # Process as they arrive
async for result in results: async for result in crawl_results:
process_immediately(result) # Don't wait for all process_immediately(result) # Don't wait for all
``` ```
@@ -1020,7 +1021,7 @@ config = SeedingConfig(
# E-commerce product discovery # E-commerce product discovery
config = SeedingConfig( config = SeedingConfig(
source="cc+sitemap", source="sitemap+cc",
pattern="*/product/*", pattern="*/product/*",
extract_head=True, extract_head=True,
live_check=True live_check=True

View File

@@ -28,7 +28,7 @@ from rich import box
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, AdaptiveCrawler, AdaptiveConfig, BrowserConfig, CacheMode from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, AdaptiveCrawler, AdaptiveConfig, BrowserConfig, CacheMode
from crawl4ai import AsyncUrlSeeder, SeedingConfig from crawl4ai import AsyncUrlSeeder, SeedingConfig
from crawl4ai.async_configs import LinkPreviewConfig, VirtualScrollConfig from crawl4ai import LinkPreviewConfig, VirtualScrollConfig
from crawl4ai import c4a_compile, CompilationResult from crawl4ai import c4a_compile, CompilationResult
# Initialize Rich console for beautiful output # Initialize Rich console for beautiful output

View File

@@ -13,14 +13,13 @@ from crawl4ai import (
BrowserConfig, BrowserConfig,
CacheMode, CacheMode,
# New imports for v0.7.0 # New imports for v0.7.0
LinkPreviewConfig,
VirtualScrollConfig, VirtualScrollConfig,
LinkPreviewConfig,
AdaptiveCrawler, AdaptiveCrawler,
AdaptiveConfig, AdaptiveConfig,
AsyncUrlSeeder, AsyncUrlSeeder,
SeedingConfig, SeedingConfig,
c4a_compile, c4a_compile,
CompilationResult
) )
@@ -170,16 +169,16 @@ async def demo_url_seeder():
# Discover Python tutorial URLs # Discover Python tutorial URLs
config = SeedingConfig( config = SeedingConfig(
source="sitemap", # Use sitemap source="sitemap", # Use sitemap
pattern="*tutorial*", # URL pattern filter pattern="*python*", # URL pattern filter
extract_head=True, # Get metadata extract_head=True, # Get metadata
query="python async programming", # For relevance scoring query="python tutorial", # For relevance scoring
scoring_method="bm25", scoring_method="bm25",
score_threshold=0.2, score_threshold=0.2,
max_urls=10 max_urls=10
) )
print("Discovering Python async tutorial URLs...") print("Discovering Python async tutorial URLs...")
urls = await seeder.urls("docs.python.org", config) urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
print(f"\n✅ Found {len(urls)} relevant URLs:") print(f"\n✅ Found {len(urls)} relevant URLs:")
for i, url_info in enumerate(urls[:5], 1): for i, url_info in enumerate(urls[:5], 1):
@@ -245,39 +244,6 @@ IF (EXISTS `.price-filter`) THEN CLICK `input[data-max-price="100"]`
print(f"❌ Compilation error: {result.first_error.message}") print(f"❌ Compilation error: {result.first_error.message}")
async def demo_pdf_support():
"""
Demo 6: PDF Parsing Support
Shows how to extract content from PDF files.
Note: Requires 'pip install crawl4ai[pdf]'
"""
print("\n" + "="*60)
print("📄 DEMO 6: PDF Parsing Support")
print("="*60)
try:
# Check if PDF support is installed
import PyPDF2
# Example: Process a PDF URL
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
pdf=True, # Enable PDF generation
extract_text_from_pdf=True # Extract text content
)
print("PDF parsing is available!")
print("You can now crawl PDF URLs and extract their content.")
print("\nExample usage:")
print(' result = await crawler.arun("https://example.com/document.pdf")')
print(' pdf_text = result.extracted_content # Contains extracted text')
except ImportError:
print("⚠️ PDF support not installed.")
print("Install with: pip install crawl4ai[pdf]")
async def main(): async def main():
"""Run all demos""" """Run all demos"""
print("\n🚀 Crawl4AI v0.7.0 Feature Demonstrations") print("\n🚀 Crawl4AI v0.7.0 Feature Demonstrations")
@@ -289,7 +255,6 @@ async def main():
("Virtual Scroll", demo_virtual_scroll), ("Virtual Scroll", demo_virtual_scroll),
("URL Seeder", demo_url_seeder), ("URL Seeder", demo_url_seeder),
("C4A Script", demo_c4a_script), ("C4A Script", demo_c4a_script),
("PDF Support", demo_pdf_support)
] ]
for name, demo_func in demos: for name, demo_func in demos:
@@ -309,7 +274,6 @@ async def main():
print("• Virtual Scroll: Capture all content from modern web pages") print("• Virtual Scroll: Capture all content from modern web pages")
print("• URL Seeder: Pre-discover and filter URLs efficiently") print("• URL Seeder: Pre-discover and filter URLs efficiently")
print("• C4A Script: Simple language for complex automations") print("• C4A Script: Simple language for complex automations")
print("• PDF Support: Extract content from PDF documents")
if __name__ == "__main__": if __name__ == "__main__":

View File

@@ -5,7 +5,7 @@ Test script for Link Extractor functionality
from crawl4ai.models import Link from crawl4ai.models import Link
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.async_configs import LinkPreviewConfig from crawl4ai import LinkPreviewConfig
import asyncio import asyncio
import sys import sys
import os import os
@@ -237,7 +237,7 @@ def test_config_examples():
print(f" {key}: {value}") print(f" {key}: {value}")
print(" Usage:") print(" Usage:")
print(" from crawl4ai.async_configs import LinkPreviewConfig") print(" from crawl4ai import LinkPreviewConfig")
print(" config = CrawlerRunConfig(") print(" config = CrawlerRunConfig(")
print(" link_preview_config=LinkPreviewConfig(") print(" link_preview_config=LinkPreviewConfig(")
for key, value in config_dict.items(): for key, value in config_dict.items():