Compare commits
11 Commits
fix/json-i
...
v0.7.1
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
0163bd797c | ||
|
|
26bad799e4 | ||
|
|
cf8badfe27 | ||
|
|
ccbe3c105c | ||
|
|
761c19d54b | ||
|
|
14b0ecb137 | ||
|
|
1d1970ae69 | ||
|
|
205df1e330 | ||
|
|
2640dc73a5 | ||
|
|
58024755c5 | ||
|
|
bde1bba6a2 |
17
README.md
17
README.md
@@ -523,15 +523,18 @@ async def test_news_crawl():
|
|||||||
- **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically:
|
- **🧠 Adaptive Crawling**: Your crawler now learns and adapts to website patterns automatically:
|
||||||
```python
|
```python
|
||||||
config = AdaptiveConfig(
|
config = AdaptiveConfig(
|
||||||
confidence_threshold=0.7,
|
confidence_threshold=0.7, # Min confidence to stop crawling
|
||||||
max_history=100,
|
max_depth=5, # Maximum crawl depth
|
||||||
learning_rate=0.2
|
max_pages=20, # Maximum number of pages to crawl
|
||||||
|
strategy="statistical"
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await crawler.arun(
|
async with AsyncWebCrawler() as crawler:
|
||||||
"https://news.example.com",
|
adaptive_crawler = AdaptiveCrawler(crawler, config)
|
||||||
config=CrawlerRunConfig(adaptive_config=config)
|
state = await adaptive_crawler.digest(
|
||||||
)
|
start_url="https://news.example.com",
|
||||||
|
query="latest news content"
|
||||||
|
)
|
||||||
# Crawler learns patterns and improves extraction over time
|
# Crawler learns patterns and improves extraction over time
|
||||||
```
|
```
|
||||||
|
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ import warnings
|
|||||||
|
|
||||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||||
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
|
# MODIFIED: Add SeedingConfig and VirtualScrollConfig here
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig, ProxyConfig, GeolocationConfig, SeedingConfig, VirtualScrollConfig, LinkPreviewConfig
|
||||||
|
|
||||||
from .content_scraping_strategy import (
|
from .content_scraping_strategy import (
|
||||||
ContentScrapingStrategy,
|
ContentScrapingStrategy,
|
||||||
@@ -173,6 +173,7 @@ __all__ = [
|
|||||||
"CompilationResult",
|
"CompilationResult",
|
||||||
"ValidationResult",
|
"ValidationResult",
|
||||||
"ErrorDetail",
|
"ErrorDetail",
|
||||||
|
"LinkPreviewConfig"
|
||||||
]
|
]
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,7 @@
|
|||||||
# crawl4ai/__version__.py
|
# crawl4ai/__version__.py
|
||||||
|
|
||||||
# This is the version that will be used for stable releases
|
# This is the version that will be used for stable releases
|
||||||
__version__ = "0.7.0"
|
__version__ = "0.7.1"
|
||||||
|
|
||||||
# For nightly builds, this gets set during build process
|
# For nightly builds, this gets set during build process
|
||||||
__nightly_version__ = None
|
__nightly_version__ = None
|
||||||
|
|||||||
@@ -14,23 +14,8 @@ import hashlib
|
|||||||
from .js_snippet import load_js_script
|
from .js_snippet import load_js_script
|
||||||
from .config import DOWNLOAD_PAGE_TIMEOUT
|
from .config import DOWNLOAD_PAGE_TIMEOUT
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
from playwright_stealth import StealthConfig
|
|
||||||
from .utils import get_chromium_path
|
from .utils import get_chromium_path
|
||||||
|
|
||||||
stealth_config = StealthConfig(
|
|
||||||
webdriver=True,
|
|
||||||
chrome_app=True,
|
|
||||||
chrome_csi=True,
|
|
||||||
chrome_load_times=True,
|
|
||||||
chrome_runtime=True,
|
|
||||||
navigator_languages=True,
|
|
||||||
navigator_plugins=True,
|
|
||||||
navigator_permissions=True,
|
|
||||||
webgl_vendor=True,
|
|
||||||
outerdimensions=True,
|
|
||||||
navigator_hardware_concurrency=True,
|
|
||||||
media_codecs=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
BROWSER_DISABLE_OPTIONS = [
|
BROWSER_DISABLE_OPTIONS = [
|
||||||
"--disable-background-networking",
|
"--disable-background-networking",
|
||||||
|
|||||||
@@ -1145,10 +1145,10 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
|||||||
link_data["intrinsic_score"] = intrinsic_score
|
link_data["intrinsic_score"] = intrinsic_score
|
||||||
except Exception:
|
except Exception:
|
||||||
# Fail gracefully - assign default score
|
# Fail gracefully - assign default score
|
||||||
link_data["intrinsic_score"] = float('inf')
|
link_data["intrinsic_score"] = 0
|
||||||
else:
|
else:
|
||||||
# No scoring enabled - assign infinity (all links equal priority)
|
# No scoring enabled - assign 0 (all links equal priority)
|
||||||
link_data["intrinsic_score"] = float('inf')
|
link_data["intrinsic_score"] = 0
|
||||||
|
|
||||||
is_external = is_external_url(normalized_href, base_domain)
|
is_external = is_external_url(normalized_href, base_domain)
|
||||||
if is_external:
|
if is_external:
|
||||||
|
|||||||
@@ -54,27 +54,6 @@ def _get_memory_mb():
|
|||||||
logger.warning(f"Could not get memory info: {e}")
|
logger.warning(f"Could not get memory info: {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
# --- Helper to sanitize JSON data ---
|
|
||||||
def sanitize_json_data(data):
|
|
||||||
"""
|
|
||||||
Recursively sanitize data to handle infinity and NaN values that are not JSON compliant.
|
|
||||||
"""
|
|
||||||
import math
|
|
||||||
|
|
||||||
if isinstance(data, dict):
|
|
||||||
return {k: sanitize_json_data(v) for k, v in data.items()}
|
|
||||||
elif isinstance(data, list):
|
|
||||||
return [sanitize_json_data(item) for item in data]
|
|
||||||
elif isinstance(data, float):
|
|
||||||
if math.isinf(data):
|
|
||||||
return "Infinity" if data > 0 else "-Infinity"
|
|
||||||
elif math.isnan(data):
|
|
||||||
return "NaN"
|
|
||||||
else:
|
|
||||||
return data
|
|
||||||
else:
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
async def handle_llm_qa(
|
async def handle_llm_qa(
|
||||||
url: str,
|
url: str,
|
||||||
@@ -392,10 +371,8 @@ async def stream_results(crawler: AsyncWebCrawler, results_gen: AsyncGenerator)
|
|||||||
server_memory_mb = _get_memory_mb()
|
server_memory_mb = _get_memory_mb()
|
||||||
result_dict = result.model_dump()
|
result_dict = result.model_dump()
|
||||||
result_dict['server_memory_mb'] = server_memory_mb
|
result_dict['server_memory_mb'] = server_memory_mb
|
||||||
# Sanitize data to handle infinity values
|
logger.info(f"Streaming result for {result_dict.get('url', 'unknown')}")
|
||||||
sanitized_dict = sanitize_json_data(result_dict)
|
data = json.dumps(result_dict, default=datetime_handler) + "\n"
|
||||||
logger.info(f"Streaming result for {sanitized_dict.get('url', 'unknown')}")
|
|
||||||
data = json.dumps(sanitized_dict, default=datetime_handler) + "\n"
|
|
||||||
yield data.encode('utf-8')
|
yield data.encode('utf-8')
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logger.error(f"Serialization error: {e}")
|
logger.error(f"Serialization error: {e}")
|
||||||
@@ -469,7 +446,7 @@ async def handle_crawl_request(
|
|||||||
|
|
||||||
return {
|
return {
|
||||||
"success": True,
|
"success": True,
|
||||||
"results": [sanitize_json_data(result.model_dump()) for result in results],
|
"results": [result.model_dump() for result in results],
|
||||||
"server_processing_time_s": end_time - start_time,
|
"server_processing_time_s": end_time - start_time,
|
||||||
"server_memory_delta_mb": mem_delta_mb,
|
"server_memory_delta_mb": mem_delta_mb,
|
||||||
"server_peak_memory_mb": peak_mem_mb
|
"server_peak_memory_mb": peak_mem_mb
|
||||||
|
|||||||
@@ -331,27 +331,6 @@ async def generate_pdf(
|
|||||||
return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
|
return {"success": True, "pdf": base64.b64encode(pdf_data).decode()}
|
||||||
|
|
||||||
|
|
||||||
def sanitize_json_data(data):
|
|
||||||
"""
|
|
||||||
Recursively sanitize data to handle infinity and NaN values that are not JSON compliant.
|
|
||||||
"""
|
|
||||||
import math
|
|
||||||
|
|
||||||
if isinstance(data, dict):
|
|
||||||
return {k: sanitize_json_data(v) for k, v in data.items()}
|
|
||||||
elif isinstance(data, list):
|
|
||||||
return [sanitize_json_data(item) for item in data]
|
|
||||||
elif isinstance(data, float):
|
|
||||||
if math.isinf(data):
|
|
||||||
return "Infinity" if data > 0 else "-Infinity"
|
|
||||||
elif math.isnan(data):
|
|
||||||
return "NaN"
|
|
||||||
else:
|
|
||||||
return data
|
|
||||||
else:
|
|
||||||
return data
|
|
||||||
|
|
||||||
|
|
||||||
@app.post("/execute_js")
|
@app.post("/execute_js")
|
||||||
@limiter.limit(config["rate_limiting"]["default_limit"])
|
@limiter.limit(config["rate_limiting"]["default_limit"])
|
||||||
@mcp_tool("execute_js")
|
@mcp_tool("execute_js")
|
||||||
@@ -410,9 +389,7 @@ async def execute_js(
|
|||||||
results = await crawler.arun(url=body.url, config=cfg)
|
results = await crawler.arun(url=body.url, config=cfg)
|
||||||
# Return JSON-serializable dict of the first CrawlResult
|
# Return JSON-serializable dict of the first CrawlResult
|
||||||
data = results[0].model_dump()
|
data = results[0].model_dump()
|
||||||
# Sanitize data to handle infinity values
|
return JSONResponse(data)
|
||||||
sanitized_data = sanitize_json_data(data)
|
|
||||||
return JSONResponse(sanitized_data)
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/llm/{url:path}")
|
@app.get("/llm/{url:path}")
|
||||||
|
|||||||
@@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel
|
|||||||
|
|
||||||
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
||||||
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
||||||
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization
|
- **Link Preview with Intelligent Scoring**: Intelligent link analysis and prioritization
|
||||||
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
||||||
- **PDF Parsing**: Extract data from PDF documents
|
|
||||||
- **Performance Optimizations**: Significant speed and memory improvements
|
- **Performance Optimizations**: Significant speed and memory improvements
|
||||||
|
|
||||||
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
||||||
@@ -30,44 +29,41 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
|||||||
- Extraction confidence scores
|
- Extraction confidence scores
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState
|
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||||
|
import asyncio
|
||||||
|
|
||||||
# Initialize with custom learning parameters
|
async def main():
|
||||||
config = AdaptiveConfig(
|
|
||||||
confidence_threshold=0.7, # Min confidence to use learned patterns
|
# Configure adaptive crawler
|
||||||
max_history=100, # Remember last 100 crawls per domain
|
config = AdaptiveConfig(
|
||||||
learning_rate=0.2, # How quickly to adapt to changes
|
strategy="statistical", # or "embedding" for semantic understanding
|
||||||
patterns_per_page=3, # Patterns to learn per page type
|
max_pages=10,
|
||||||
extraction_strategy='css' # 'css' or 'xpath'
|
confidence_threshold=0.7, # Stop at 70% confidence
|
||||||
)
|
top_k_links=3, # Follow top 3 links per page
|
||||||
|
min_gain_threshold=0.05 # Need 5% information gain to continue
|
||||||
adaptive_crawler = AdaptiveCrawler(config)
|
|
||||||
|
|
||||||
# First crawl - crawler learns the structure
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
result = await crawler.arun(
|
|
||||||
"https://news.example.com/article/12345",
|
|
||||||
config=CrawlerRunConfig(
|
|
||||||
adaptive_config=config,
|
|
||||||
extraction_hints={ # Optional hints to speed up learning
|
|
||||||
"title": "article h1",
|
|
||||||
"content": "article .body-content"
|
|
||||||
}
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Crawler identifies and stores patterns
|
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||||
if result.success:
|
adaptive = AdaptiveCrawler(crawler, config)
|
||||||
state = adaptive_crawler.get_state("news.example.com")
|
|
||||||
print(f"Learned {len(state.patterns)} patterns")
|
print("Starting adaptive crawl about Python decorators...")
|
||||||
print(f"Confidence: {state.avg_confidence:.2%}")
|
result = await adaptive.digest(
|
||||||
|
start_url="https://docs.python.org/3/glossary.html",
|
||||||
|
query="python decorators functions wrapping"
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"\n✅ Crawling Complete!")
|
||||||
|
print(f"• Confidence Level: {adaptive.confidence:.0%}")
|
||||||
|
print(f"• Pages Crawled: {len(result.crawled_urls)}")
|
||||||
|
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||||
|
|
||||||
|
# Get most relevant content
|
||||||
|
relevant = adaptive.get_relevant_content(top_k=3)
|
||||||
|
print(f"\nMost Relevant Pages:")
|
||||||
|
for i, page in enumerate(relevant, 1):
|
||||||
|
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||||
|
|
||||||
# Subsequent crawls - uses learned patterns
|
asyncio.run(main())
|
||||||
result2 = await crawler.arun(
|
|
||||||
"https://news.example.com/article/67890",
|
|
||||||
config=CrawlerRunConfig(adaptive_config=config)
|
|
||||||
)
|
|
||||||
# Automatically extracts using learned patterns!
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Expected Real-World Impact:**
|
**Expected Real-World Impact:**
|
||||||
@@ -92,9 +88,7 @@ twitter_config = VirtualScrollConfig(
|
|||||||
container_selector="[data-testid='primaryColumn']",
|
container_selector="[data-testid='primaryColumn']",
|
||||||
scroll_count=20, # Number of scrolls
|
scroll_count=20, # Number of scrolls
|
||||||
scroll_by="container_height", # Smart scrolling by container size
|
scroll_by="container_height", # Smart scrolling by container size
|
||||||
wait_after_scroll=1.0, # Let content load
|
wait_after_scroll=1.0 # Let content load
|
||||||
capture_method="incremental", # Capture new content on each scroll
|
|
||||||
deduplicate=True # Remove duplicate elements
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# For e-commerce product grids (Instagram style)
|
# For e-commerce product grids (Instagram style)
|
||||||
@@ -102,8 +96,7 @@ grid_config = VirtualScrollConfig(
|
|||||||
container_selector="main .product-grid",
|
container_selector="main .product-grid",
|
||||||
scroll_count=30,
|
scroll_count=30,
|
||||||
scroll_by=800, # Fixed pixel scrolling
|
scroll_by=800, # Fixed pixel scrolling
|
||||||
wait_after_scroll=1.5, # Images need time
|
wait_after_scroll=1.5 # Images need time
|
||||||
stop_on_no_change=True # Smart stopping
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# For news feeds with lazy loading
|
# For news feeds with lazy loading
|
||||||
@@ -111,9 +104,7 @@ news_config = VirtualScrollConfig(
|
|||||||
container_selector=".article-feed",
|
container_selector=".article-feed",
|
||||||
scroll_count=50,
|
scroll_count=50,
|
||||||
scroll_by="page_height", # Viewport-based scrolling
|
scroll_by="page_height", # Viewport-based scrolling
|
||||||
wait_after_scroll=0.5,
|
wait_after_scroll=0.5 # Wait for content to load
|
||||||
wait_for_selector=".article-card", # Wait for specific elements
|
|
||||||
timeout=30000 # Max 30 seconds total
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Use it in your crawl
|
# Use it in your crawl
|
||||||
@@ -157,68 +148,63 @@ async with AsyncWebCrawler() as crawler:
|
|||||||
|
|
||||||
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
||||||
|
|
||||||
### The Three-Layer Scoring System
|
### Intelligent Link Analysis and Scoring
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import LinkPreviewConfig
|
import asyncio
|
||||||
|
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
|
||||||
|
from crawl4ai.adaptive_crawler import LinkPreviewConfig
|
||||||
|
|
||||||
# Configure intelligent link analysis
|
async def main():
|
||||||
link_config = LinkPreviewConfig(
|
# Configure intelligent link analysis
|
||||||
# What to analyze
|
link_config = LinkPreviewConfig(
|
||||||
include_internal=True,
|
include_internal=True,
|
||||||
include_external=True,
|
include_external=False,
|
||||||
max_links=100, # Analyze top 100 links
|
max_links=10,
|
||||||
|
concurrency=5,
|
||||||
# Relevance scoring
|
query="python tutorial", # For contextual scoring
|
||||||
query="machine learning tutorials", # Your interest
|
score_threshold=0.3,
|
||||||
score_threshold=0.3, # Minimum relevance score
|
verbose=True
|
||||||
|
|
||||||
# Performance
|
|
||||||
concurrent_requests=10, # Parallel processing
|
|
||||||
timeout_per_link=5000, # 5s per link
|
|
||||||
|
|
||||||
# Advanced scoring weights
|
|
||||||
scoring_weights={
|
|
||||||
"intrinsic": 0.3, # Link quality indicators
|
|
||||||
"contextual": 0.5, # Relevance to query
|
|
||||||
"popularity": 0.2 # Link prominence
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Use in your crawl
|
|
||||||
result = await crawler.arun(
|
|
||||||
"https://tech-blog.example.com",
|
|
||||||
config=CrawlerRunConfig(
|
|
||||||
link_preview_config=link_config,
|
|
||||||
score_links=True
|
|
||||||
)
|
)
|
||||||
)
|
# Use in your crawl
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
"https://www.geeksforgeeks.org/",
|
||||||
|
config=CrawlerRunConfig(
|
||||||
|
link_preview_config=link_config,
|
||||||
|
score_links=True, # Enable intrinsic scoring
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
# Access scored and sorted links
|
# Access scored and sorted links
|
||||||
for link in result.links["internal"][:10]: # Top 10 internal links
|
if result.success and result.links:
|
||||||
print(f"Score: {link['total_score']:.3f}")
|
for link in result.links.get("internal", []):
|
||||||
print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes
|
text = link.get('text', 'No text')[:40]
|
||||||
print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query
|
print(
|
||||||
print(f" URL: {link['href']}")
|
text,
|
||||||
print(f" Title: {link['head_data']['title']}")
|
f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
|
||||||
print(f" Description: {link['head_data']['meta']['description'][:100]}...")
|
f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
|
||||||
|
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
|
||||||
|
)
|
||||||
|
|
||||||
|
asyncio.run(main())
|
||||||
```
|
```
|
||||||
|
|
||||||
**Scoring Components:**
|
**Scoring Components:**
|
||||||
|
|
||||||
1. **Intrinsic Score (0-10)**: Based on link quality indicators
|
1. **Intrinsic Score**: Based on link quality indicators
|
||||||
- Position on page (navigation, content, footer)
|
- Position on page (navigation, content, footer)
|
||||||
- Link attributes (rel, title, class names)
|
- Link attributes (rel, title, class names)
|
||||||
- Anchor text quality and length
|
- Anchor text quality and length
|
||||||
- URL structure and depth
|
- URL structure and depth
|
||||||
|
|
||||||
2. **Contextual Score (0-1)**: Relevance to your query
|
2. **Contextual Score**: Relevance to your query using BM25 algorithm
|
||||||
- Semantic similarity using embeddings
|
|
||||||
- Keyword matching in link text and title
|
- Keyword matching in link text and title
|
||||||
- Meta description analysis
|
- Meta description analysis
|
||||||
- Content preview scoring
|
- Content preview scoring
|
||||||
|
|
||||||
3. **Total Score**: Weighted combination for final ranking
|
3. **Total Score**: Combined score for final ranking
|
||||||
|
|
||||||
**Expected Real-World Impact:**
|
**Expected Real-World Impact:**
|
||||||
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
||||||
@@ -235,58 +221,34 @@ for link in result.links["internal"][:10]: # Top 10 internal links
|
|||||||
### Technical Architecture
|
### Technical Architecture
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
import asyncio
|
||||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||||
|
|
||||||
# Basic discovery - find all product pages
|
async def main():
|
||||||
seeder_config = SeedingConfig(
|
async with AsyncUrlSeeder() as seeder:
|
||||||
# Discovery sources
|
# Discover Python tutorial URLs
|
||||||
source="sitemap+cc", # Sitemap + Common Crawl
|
config = SeedingConfig(
|
||||||
|
source="sitemap", # Use sitemap
|
||||||
# Filtering
|
pattern="*python*", # URL pattern filter
|
||||||
pattern="*/product/*", # URL pattern matching
|
extract_head=True, # Get metadata
|
||||||
ignore_patterns=["*/reviews/*", "*/questions/*"],
|
query="python tutorial", # For relevance scoring
|
||||||
|
scoring_method="bm25",
|
||||||
# Validation
|
score_threshold=0.2,
|
||||||
live_check=True, # Verify URLs are alive
|
max_urls=10
|
||||||
max_urls=5000, # Stop at 5000 URLs
|
)
|
||||||
|
|
||||||
# Performance
|
print("Discovering Python async tutorial URLs...")
|
||||||
concurrency=100, # Parallel requests
|
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||||
hits_per_sec=10 # Rate limiting
|
|
||||||
)
|
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||||
|
for i, url_info in enumerate(urls[:5], 1):
|
||||||
|
print(f"\n{i}. {url_info['url']}")
|
||||||
|
if url_info.get('relevance_score'):
|
||||||
|
print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||||
|
if url_info.get('head_data', {}).get('title'):
|
||||||
|
print(f" Title: {url_info['head_data']['title'][:60]}...")
|
||||||
|
|
||||||
seeder = AsyncUrlSeeder(seeder_config)
|
asyncio.run(main())
|
||||||
urls = await seeder.discover("https://shop.example.com")
|
|
||||||
|
|
||||||
# Advanced: Relevance-based discovery
|
|
||||||
research_config = SeedingConfig(
|
|
||||||
source="crawl+sitemap", # Deep crawl + sitemap
|
|
||||||
pattern="*/blog/*", # Blog posts only
|
|
||||||
|
|
||||||
# Content relevance
|
|
||||||
extract_head=True, # Get meta tags
|
|
||||||
query="quantum computing tutorials",
|
|
||||||
scoring_method="bm25", # Or "semantic" (coming soon)
|
|
||||||
score_threshold=0.4, # High relevance only
|
|
||||||
|
|
||||||
# Smart filtering
|
|
||||||
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
|
|
||||||
min_content_length=500, # Skip thin content
|
|
||||||
|
|
||||||
force=True # Bypass cache
|
|
||||||
)
|
|
||||||
|
|
||||||
# Discover with progress tracking
|
|
||||||
discovered = []
|
|
||||||
async for batch in seeder.discover_iter("https://physics-blog.com", research_config):
|
|
||||||
discovered.extend(batch)
|
|
||||||
print(f"Found {len(discovered)} relevant URLs so far...")
|
|
||||||
|
|
||||||
# Results include scores and metadata
|
|
||||||
for url_data in discovered[:5]:
|
|
||||||
print(f"URL: {url_data['url']}")
|
|
||||||
print(f"Score: {url_data['score']:.3f}")
|
|
||||||
print(f"Title: {url_data['title']}")
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Discovery Methods:**
|
**Discovery Methods:**
|
||||||
@@ -309,35 +271,18 @@ This release includes significant performance improvements through optimized res
|
|||||||
### What We Optimized
|
### What We Optimized
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Before v0.7.0 (slow)
|
# Optimized crawling with v0.7.0 improvements
|
||||||
results = []
|
results = []
|
||||||
for url in urls:
|
for url in urls:
|
||||||
result = await crawler.arun(url)
|
result = await crawler.arun(
|
||||||
results.append(result)
|
url,
|
||||||
|
config=CrawlerRunConfig(
|
||||||
# After v0.7.0 (fast)
|
# Performance optimizations
|
||||||
# Automatic batching and connection pooling
|
wait_until="domcontentloaded", # Faster than networkidle
|
||||||
results = await crawler.arun_batch(
|
cache_mode=CacheMode.ENABLED # Enable caching
|
||||||
urls,
|
)
|
||||||
config=CrawlerRunConfig(
|
|
||||||
# New performance options
|
|
||||||
batch_size=10, # Process 10 URLs concurrently
|
|
||||||
reuse_browser=True, # Keep browser warm
|
|
||||||
eager_loading=False, # Load only what's needed
|
|
||||||
streaming_extraction=True, # Stream large extractions
|
|
||||||
|
|
||||||
# Optimized defaults
|
|
||||||
wait_until="domcontentloaded", # Faster than networkidle
|
|
||||||
exclude_external_resources=True, # Skip third-party assets
|
|
||||||
block_ads=True # Ad blocking built-in
|
|
||||||
)
|
)
|
||||||
)
|
results.append(result)
|
||||||
|
|
||||||
# Memory-efficient streaming for large crawls
|
|
||||||
async for result in crawler.arun_stream(large_url_list):
|
|
||||||
# Process results as they complete
|
|
||||||
await process_result(result)
|
|
||||||
# Memory is freed after each iteration
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Performance Gains:**
|
**Performance Gains:**
|
||||||
@@ -347,24 +292,6 @@ async for result in crawler.arun_stream(large_url_list):
|
|||||||
- **Memory Usage**: 60% reduction with streaming processing
|
- **Memory Usage**: 60% reduction with streaming processing
|
||||||
- **Concurrent Crawls**: Handle 5x more parallel requests
|
- **Concurrent Crawls**: Handle 5x more parallel requests
|
||||||
|
|
||||||
## 📄 PDF Support
|
|
||||||
|
|
||||||
PDF extraction is now natively supported in Crawl4AI.
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Extract data from PDF documents
|
|
||||||
result = await crawler.arun(
|
|
||||||
"https://example.com/report.pdf",
|
|
||||||
config=CrawlerRunConfig(
|
|
||||||
pdf_extraction=True,
|
|
||||||
extraction_strategy=JsonCssExtractionStrategy({
|
|
||||||
# Works on converted PDF structure
|
|
||||||
"title": {"selector": "h1", "type": "text"},
|
|
||||||
"sections": {"selector": "h2", "type": "list"}
|
|
||||||
})
|
|
||||||
)
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🔧 Important Changes
|
## 🔧 Important Changes
|
||||||
|
|
||||||
|
|||||||
43
docs/blog/release-v0.7.1.md
Normal file
43
docs/blog/release-v0.7.1.md
Normal file
@@ -0,0 +1,43 @@
|
|||||||
|
# 🛠️ Crawl4AI v0.7.1: Minor Cleanup Update
|
||||||
|
|
||||||
|
*July 17, 2025 • 2 min read*
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
A small maintenance release that removes unused code and improves documentation.
|
||||||
|
|
||||||
|
## 🎯 What's Changed
|
||||||
|
|
||||||
|
- **Removed unused StealthConfig** from `crawl4ai/browser_manager.py`
|
||||||
|
- **Updated documentation** with better examples and parameter explanations
|
||||||
|
- **Fixed virtual scroll configuration** examples in docs
|
||||||
|
|
||||||
|
## 🧹 Code Cleanup
|
||||||
|
|
||||||
|
Removed the `StealthConfig` import and its configuration object, neither of which was used anywhere in the codebase. The project uses its own custom stealth implementation through JavaScript injection instead.
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Removed unused code:
|
||||||
|
from playwright_stealth import StealthConfig
|
||||||
|
stealth_config = StealthConfig(...) # This was never used
|
||||||
|
```
|
||||||
|
|
||||||
|
## 📖 Documentation Updates
|
||||||
|
|
||||||
|
- Fixed adaptive crawling parameter examples
|
||||||
|
- Updated session management documentation
|
||||||
|
- Corrected virtual scroll configuration examples
|
||||||
|
|
||||||
|
## 🚀 Installation
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install crawl4ai==0.7.1
|
||||||
|
```
|
||||||
|
|
||||||
|
No breaking changes - upgrade directly from v0.7.0.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
Questions? Issues?
|
||||||
|
- GitHub: [github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)
|
||||||
|
- Discord: [discord.gg/crawl4ai](https://discord.gg/jP8KfhDhyN)
|
||||||
@@ -18,7 +18,7 @@ Usage:
|
|||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.async_configs import LinkPreviewConfig
|
from crawl4ai import LinkPreviewConfig
|
||||||
|
|
||||||
|
|
||||||
async def basic_link_head_extraction():
|
async def basic_link_head_extraction():
|
||||||
|
|||||||
@@ -49,46 +49,75 @@ from crawl4ai import JsonCssExtractionStrategy
|
|||||||
from crawl4ai.cache_context import CacheMode
|
from crawl4ai.cache_context import CacheMode
|
||||||
|
|
||||||
async def crawl_dynamic_content():
|
async def crawl_dynamic_content():
|
||||||
async with AsyncWebCrawler() as crawler:
|
url = "https://github.com/microsoft/TypeScript/commits/main"
|
||||||
session_id = "github_commits_session"
|
session_id = "wait_for_session"
|
||||||
url = "https://github.com/microsoft/TypeScript/commits/main"
|
all_commits = []
|
||||||
all_commits = []
|
|
||||||
|
|
||||||
# Define extraction schema
|
js_next_page = """
|
||||||
schema = {
|
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
|
||||||
"name": "Commit Extractor",
|
if (commits.length > 0) {
|
||||||
"baseSelector": "li.Box-sc-g0xbh4-0",
|
window.lastCommit = commits[0].textContent.trim();
|
||||||
"fields": [{
|
}
|
||||||
"name": "title", "selector": "h4.markdown-title", "type": "text"
|
const button = document.querySelector('a[data-testid="pagination-next-button"]');
|
||||||
}],
|
if (button) {button.click(); console.log('button clicked') }
|
||||||
}
|
"""
|
||||||
extraction_strategy = JsonCssExtractionStrategy(schema)
|
|
||||||
|
|
||||||
# JavaScript and wait configurations
|
wait_for = """() => {
|
||||||
js_next_page = """document.querySelector('a[data-testid="pagination-next-button"]').click();"""
|
const commits = document.querySelectorAll('li[data-testid="commit-row-item"] h4');
|
||||||
wait_for = """() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0"""
|
if (commits.length === 0) return false;
|
||||||
|
const firstCommit = commits[0].textContent.trim();
|
||||||
# Crawl multiple pages
|
return firstCommit !== window.lastCommit;
|
||||||
|
}"""
|
||||||
|
|
||||||
|
schema = {
|
||||||
|
"name": "Commit Extractor",
|
||||||
|
"baseSelector": "li[data-testid='commit-row-item']",
|
||||||
|
"fields": [
|
||||||
|
{
|
||||||
|
"name": "title",
|
||||||
|
"selector": "h4 a",
|
||||||
|
"type": "text",
|
||||||
|
"transform": "strip",
|
||||||
|
},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)
|
||||||
|
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(
|
||||||
|
verbose=True,
|
||||||
|
headless=False,
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
for page in range(3):
|
for page in range(3):
|
||||||
config = CrawlerRunConfig(
|
crawler_config = CrawlerRunConfig(
|
||||||
url=url,
|
|
||||||
session_id=session_id,
|
session_id=session_id,
|
||||||
|
css_selector="li[data-testid='commit-row-item']",
|
||||||
extraction_strategy=extraction_strategy,
|
extraction_strategy=extraction_strategy,
|
||||||
js_code=js_next_page if page > 0 else None,
|
js_code=js_next_page if page > 0 else None,
|
||||||
wait_for=wait_for if page > 0 else None,
|
wait_for=wait_for if page > 0 else None,
|
||||||
js_only=page > 0,
|
js_only=page > 0,
|
||||||
cache_mode=CacheMode.BYPASS
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
capture_console_messages=True,
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await crawler.arun(config=config)
|
result = await crawler.arun(url=url, config=crawler_config)
|
||||||
if result.success:
|
|
||||||
|
if result.console_messages:
|
||||||
|
print(f"Page {page + 1} console messages:", result.console_messages)
|
||||||
|
|
||||||
|
if result.extracted_content:
|
||||||
|
# print(f"Page {page + 1} result:", result.extracted_content)
|
||||||
commits = json.loads(result.extracted_content)
|
commits = json.loads(result.extracted_content)
|
||||||
all_commits.extend(commits)
|
all_commits.extend(commits)
|
||||||
print(f"Page {page + 1}: Found {len(commits)} commits")
|
print(f"Page {page + 1}: Found {len(commits)} commits")
|
||||||
|
else:
|
||||||
|
print(f"Page {page + 1}: No content extracted")
|
||||||
|
|
||||||
|
print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
|
||||||
# Clean up session
|
# Clean up session
|
||||||
await crawler.crawler_strategy.kill_session(session_id)
|
await crawler.crawler_strategy.kill_session(session_id)
|
||||||
return all_commits
|
|
||||||
```
|
```
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|||||||
@@ -91,13 +91,12 @@ async def crawl_twitter_timeline():
|
|||||||
wait_after_scroll=1.0 # Twitter needs time to load
|
wait_after_scroll=1.0 # Twitter needs time to load
|
||||||
)
|
)
|
||||||
|
|
||||||
|
browser_config = BrowserConfig(headless=True) # Set to False to watch it work
|
||||||
config = CrawlerRunConfig(
|
config = CrawlerRunConfig(
|
||||||
virtual_scroll_config=virtual_config,
|
virtual_scroll_config=virtual_config
|
||||||
# Optional: Set headless=False to watch it work
|
|
||||||
# browser_config=BrowserConfig(headless=False)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
url="https://twitter.com/search?q=AI",
|
url="https://twitter.com/search?q=AI",
|
||||||
config=config
|
config=config
|
||||||
@@ -200,7 +199,7 @@ Use **scan_full_page** when:
|
|||||||
Virtual Scroll works seamlessly with extraction strategies:
|
Virtual Scroll works seamlessly with extraction strategies:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import LLMExtractionStrategy
|
from crawl4ai import LLMExtractionStrategy, LLMConfig
|
||||||
|
|
||||||
# Define extraction schema
|
# Define extraction schema
|
||||||
schema = {
|
schema = {
|
||||||
@@ -222,7 +221,7 @@ config = CrawlerRunConfig(
|
|||||||
scroll_count=20
|
scroll_count=20
|
||||||
),
|
),
|
||||||
extraction_strategy=LLMExtractionStrategy(
|
extraction_strategy=LLMExtractionStrategy(
|
||||||
provider="openai/gpt-4o-mini",
|
llm_config=LLMConfig(provider="openai/gpt-4o-mini"),
|
||||||
schema=schema
|
schema=schema
|
||||||
)
|
)
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -10,9 +10,8 @@ Today I'm releasing Crawl4AI v0.7.0—the Adaptive Intelligence Update. This rel
|
|||||||
|
|
||||||
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
- **Adaptive Crawling**: Your crawler now learns and adapts to website patterns
|
||||||
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
- **Virtual Scroll Support**: Complete content extraction from infinite scroll pages
|
||||||
- **Link Preview with 3-Layer Scoring**: Intelligent link analysis and prioritization
|
- **Link Preview with Intelligent Scoring**: Link analysis and prioritization
|
||||||
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
- **Async URL Seeder**: Discover thousands of URLs in seconds with intelligent filtering
|
||||||
- **PDF Parsing**: Extract data from PDF documents
|
|
||||||
- **Performance Optimizations**: Significant speed and memory improvements
|
- **Performance Optimizations**: Significant speed and memory improvements
|
||||||
|
|
||||||
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
## 🧠 Adaptive Crawling: Intelligence Through Pattern Learning
|
||||||
@@ -30,44 +29,41 @@ The Adaptive Crawler maintains a persistent state for each domain, tracking:
|
|||||||
- Extraction confidence scores
|
- Extraction confidence scores
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import AdaptiveCrawler, AdaptiveConfig, CrawlState
|
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
|
||||||
|
import asyncio
|
||||||
|
|
||||||
# Initialize with custom learning parameters
|
async def main():
|
||||||
config = AdaptiveConfig(
|
|
||||||
confidence_threshold=0.7, # Min confidence to use learned patterns
|
# Configure adaptive crawler
|
||||||
max_history=100, # Remember last 100 crawls per domain
|
config = AdaptiveConfig(
|
||||||
learning_rate=0.2, # How quickly to adapt to changes
|
strategy="statistical", # or "embedding" for semantic understanding
|
||||||
patterns_per_page=3, # Patterns to learn per page type
|
max_pages=10,
|
||||||
extraction_strategy='css' # 'css' or 'xpath'
|
confidence_threshold=0.7, # Stop at 70% confidence
|
||||||
)
|
top_k_links=3, # Follow top 3 links per page
|
||||||
|
min_gain_threshold=0.05 # Need 5% information gain to continue
|
||||||
adaptive_crawler = AdaptiveCrawler(config)
|
|
||||||
|
|
||||||
# First crawl - crawler learns the structure
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
result = await crawler.arun(
|
|
||||||
"https://news.example.com/article/12345",
|
|
||||||
config=CrawlerRunConfig(
|
|
||||||
adaptive_config=config,
|
|
||||||
extraction_hints={ # Optional hints to speed up learning
|
|
||||||
"title": "article h1",
|
|
||||||
"content": "article .body-content"
|
|
||||||
}
|
|
||||||
)
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Crawler identifies and stores patterns
|
async with AsyncWebCrawler(verbose=False) as crawler:
|
||||||
if result.success:
|
adaptive = AdaptiveCrawler(crawler, config)
|
||||||
state = adaptive_crawler.get_state("news.example.com")
|
|
||||||
print(f"Learned {len(state.patterns)} patterns")
|
print("Starting adaptive crawl about Python decorators...")
|
||||||
print(f"Confidence: {state.avg_confidence:.2%}")
|
result = await adaptive.digest(
|
||||||
|
start_url="https://docs.python.org/3/glossary.html",
|
||||||
|
query="python decorators functions wrapping"
|
||||||
|
)
|
||||||
|
|
||||||
|
print(f"\n✅ Crawling Complete!")
|
||||||
|
print(f"• Confidence Level: {adaptive.confidence:.0%}")
|
||||||
|
print(f"• Pages Crawled: {len(result.crawled_urls)}")
|
||||||
|
print(f"• Knowledge Base: {len(adaptive.state.knowledge_base)} documents")
|
||||||
|
|
||||||
|
# Get most relevant content
|
||||||
|
relevant = adaptive.get_relevant_content(top_k=3)
|
||||||
|
print(f"\nMost Relevant Pages:")
|
||||||
|
for i, page in enumerate(relevant, 1):
|
||||||
|
print(f"{i}. {page['url']} (relevance: {page['score']:.2%})")
|
||||||
|
|
||||||
# Subsequent crawls - uses learned patterns
|
asyncio.run(main())
|
||||||
result2 = await crawler.arun(
|
|
||||||
"https://news.example.com/article/67890",
|
|
||||||
config=CrawlerRunConfig(adaptive_config=config)
|
|
||||||
)
|
|
||||||
# Automatically extracts using learned patterns!
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Expected Real-World Impact:**
|
**Expected Real-World Impact:**
|
||||||
@@ -92,9 +88,7 @@ twitter_config = VirtualScrollConfig(
|
|||||||
container_selector="[data-testid='primaryColumn']",
|
container_selector="[data-testid='primaryColumn']",
|
||||||
scroll_count=20, # Number of scrolls
|
scroll_count=20, # Number of scrolls
|
||||||
scroll_by="container_height", # Smart scrolling by container size
|
scroll_by="container_height", # Smart scrolling by container size
|
||||||
wait_after_scroll=1.0, # Let content load
|
wait_after_scroll=1.0 # Let content load
|
||||||
capture_method="incremental", # Capture new content on each scroll
|
|
||||||
deduplicate=True # Remove duplicate elements
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# For e-commerce product grids (Instagram style)
|
# For e-commerce product grids (Instagram style)
|
||||||
@@ -102,8 +96,7 @@ grid_config = VirtualScrollConfig(
|
|||||||
container_selector="main .product-grid",
|
container_selector="main .product-grid",
|
||||||
scroll_count=30,
|
scroll_count=30,
|
||||||
scroll_by=800, # Fixed pixel scrolling
|
scroll_by=800, # Fixed pixel scrolling
|
||||||
wait_after_scroll=1.5, # Images need time
|
wait_after_scroll=1.5 # Images need time
|
||||||
stop_on_no_change=True # Smart stopping
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# For news feeds with lazy loading
|
# For news feeds with lazy loading
|
||||||
@@ -111,9 +104,7 @@ news_config = VirtualScrollConfig(
|
|||||||
container_selector=".article-feed",
|
container_selector=".article-feed",
|
||||||
scroll_count=50,
|
scroll_count=50,
|
||||||
scroll_by="page_height", # Viewport-based scrolling
|
scroll_by="page_height", # Viewport-based scrolling
|
||||||
wait_after_scroll=0.5,
|
wait_after_scroll=0.5 # Wait for content to load
|
||||||
wait_for_selector=".article-card", # Wait for specific elements
|
|
||||||
timeout=30000 # Max 30 seconds total
|
|
||||||
)
|
)
|
||||||
|
|
||||||
# Use it in your crawl
|
# Use it in your crawl
|
||||||
@@ -157,68 +148,63 @@ async with AsyncWebCrawler() as crawler:
|
|||||||
|
|
||||||
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
**My Solution:** I implemented a three-layer scoring system that analyzes links like a human would—considering their position, context, and relevance to your goals.
|
||||||
|
|
||||||
### The Three-Layer Scoring System
|
### Intelligent Link Analysis and Scoring
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai import LinkPreviewConfig
|
import asyncio
|
||||||
|
from crawl4ai import CrawlerRunConfig, CacheMode, AsyncWebCrawler
|
||||||
|
from crawl4ai.adaptive_crawler import LinkPreviewConfig
|
||||||
|
|
||||||
# Configure intelligent link analysis
|
async def main():
|
||||||
link_config = LinkPreviewConfig(
|
# Configure intelligent link analysis
|
||||||
# What to analyze
|
link_config = LinkPreviewConfig(
|
||||||
include_internal=True,
|
include_internal=True,
|
||||||
include_external=True,
|
include_external=False,
|
||||||
max_links=100, # Analyze top 100 links
|
max_links=10,
|
||||||
|
concurrency=5,
|
||||||
# Relevance scoring
|
query="python tutorial", # For contextual scoring
|
||||||
query="machine learning tutorials", # Your interest
|
score_threshold=0.3,
|
||||||
score_threshold=0.3, # Minimum relevance score
|
verbose=True
|
||||||
|
|
||||||
# Performance
|
|
||||||
concurrent_requests=10, # Parallel processing
|
|
||||||
timeout_per_link=5000, # 5s per link
|
|
||||||
|
|
||||||
# Advanced scoring weights
|
|
||||||
scoring_weights={
|
|
||||||
"intrinsic": 0.3, # Link quality indicators
|
|
||||||
"contextual": 0.5, # Relevance to query
|
|
||||||
"popularity": 0.2 # Link prominence
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
# Use in your crawl
|
|
||||||
result = await crawler.arun(
|
|
||||||
"https://tech-blog.example.com",
|
|
||||||
config=CrawlerRunConfig(
|
|
||||||
link_preview_config=link_config,
|
|
||||||
score_links=True
|
|
||||||
)
|
)
|
||||||
)
|
# Use in your crawl
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
"https://www.geeksforgeeks.org/",
|
||||||
|
config=CrawlerRunConfig(
|
||||||
|
link_preview_config=link_config,
|
||||||
|
score_links=True, # Enable intrinsic scoring
|
||||||
|
cache_mode=CacheMode.BYPASS
|
||||||
|
)
|
||||||
|
)
|
||||||
|
|
||||||
# Access scored and sorted links
|
# Access scored and sorted links
|
||||||
for link in result.links["internal"][:10]: # Top 10 internal links
|
if result.success and result.links:
|
||||||
print(f"Score: {link['total_score']:.3f}")
|
for link in result.links.get("internal", []):
|
||||||
print(f" Intrinsic: {link['intrinsic_score']:.1f}/10") # Position, attributes
|
text = link.get('text', 'No text')[:40]
|
||||||
print(f" Contextual: {link['contextual_score']:.1f}/1") # Relevance to query
|
print(
|
||||||
print(f" URL: {link['href']}")
|
text,
|
||||||
print(f" Title: {link['head_data']['title']}")
|
f"{link.get('intrinsic_score', 0):.1f}/10" if link.get('intrinsic_score') is not None else "0.0/10",
|
||||||
print(f" Description: {link['head_data']['meta']['description'][:100]}...")
|
f"{link.get('contextual_score', 0):.2f}/1" if link.get('contextual_score') is not None else "0.00/1",
|
||||||
|
f"{link.get('total_score', 0):.3f}" if link.get('total_score') is not None else "0.000"
|
||||||
|
)
|
||||||
|
|
||||||
|
asyncio.run(main())
|
||||||
```
|
```
|
||||||
|
|
||||||
**Scoring Components:**
|
**Scoring Components:**
|
||||||
|
|
||||||
1. **Intrinsic Score (0-10)**: Based on link quality indicators
|
1. **Intrinsic Score**: Based on link quality indicators
|
||||||
- Position on page (navigation, content, footer)
|
- Position on page (navigation, content, footer)
|
||||||
- Link attributes (rel, title, class names)
|
- Link attributes (rel, title, class names)
|
||||||
- Anchor text quality and length
|
- Anchor text quality and length
|
||||||
- URL structure and depth
|
- URL structure and depth
|
||||||
|
|
||||||
2. **Contextual Score (0-1)**: Relevance to your query
|
2. **Contextual Score**: Relevance to your query using the BM25 algorithm
|
||||||
- Semantic similarity using embeddings
|
|
||||||
- Keyword matching in link text and title
|
- Keyword matching in link text and title
|
||||||
- Meta description analysis
|
- Meta description analysis
|
||||||
- Content preview scoring
|
- Content preview scoring
|
||||||
|
|
||||||
3. **Total Score**: Weighted combination for final ranking
|
3. **Total Score**: Combined score for final ranking
|
||||||
|
|
||||||
**Expected Real-World Impact:**
|
**Expected Real-World Impact:**
|
||||||
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
- **Research Efficiency**: Find relevant papers 10x faster by following only high-score links
|
||||||
@@ -235,58 +221,34 @@ for link in result.links["internal"][:10]: # Top 10 internal links
|
|||||||
### Technical Architecture
|
### Technical Architecture
|
||||||
|
|
||||||
```python
|
```python
|
||||||
|
import asyncio
|
||||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||||
|
|
||||||
# Basic discovery - find all product pages
|
async def main():
|
||||||
seeder_config = SeedingConfig(
|
async with AsyncUrlSeeder() as seeder:
|
||||||
# Discovery sources
|
# Discover Python tutorial URLs
|
||||||
source="sitemap+cc", # Sitemap + Common Crawl
|
config = SeedingConfig(
|
||||||
|
source="sitemap", # Use sitemap
|
||||||
# Filtering
|
pattern="*python*", # URL pattern filter
|
||||||
pattern="*/product/*", # URL pattern matching
|
extract_head=True, # Get metadata
|
||||||
ignore_patterns=["*/reviews/*", "*/questions/*"],
|
query="python tutorial", # For relevance scoring
|
||||||
|
scoring_method="bm25",
|
||||||
# Validation
|
score_threshold=0.2,
|
||||||
live_check=True, # Verify URLs are alive
|
max_urls=10
|
||||||
max_urls=5000, # Stop at 5000 URLs
|
)
|
||||||
|
|
||||||
# Performance
|
print("Discovering Python async tutorial URLs...")
|
||||||
concurrency=100, # Parallel requests
|
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||||
hits_per_sec=10 # Rate limiting
|
|
||||||
)
|
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||||
|
for i, url_info in enumerate(urls[:5], 1):
|
||||||
|
print(f"\n{i}. {url_info['url']}")
|
||||||
|
if url_info.get('relevance_score'):
|
||||||
|
print(f" Relevance: {url_info['relevance_score']:.3f}")
|
||||||
|
if url_info.get('head_data', {}).get('title'):
|
||||||
|
print(f" Title: {url_info['head_data']['title'][:60]}...")
|
||||||
|
|
||||||
seeder = AsyncUrlSeeder(seeder_config)
|
asyncio.run(main())
|
||||||
urls = await seeder.discover("https://shop.example.com")
|
|
||||||
|
|
||||||
# Advanced: Relevance-based discovery
|
|
||||||
research_config = SeedingConfig(
|
|
||||||
source="crawl+sitemap", # Deep crawl + sitemap
|
|
||||||
pattern="*/blog/*", # Blog posts only
|
|
||||||
|
|
||||||
# Content relevance
|
|
||||||
extract_head=True, # Get meta tags
|
|
||||||
query="quantum computing tutorials",
|
|
||||||
scoring_method="bm25", # Or "semantic" (coming soon)
|
|
||||||
score_threshold=0.4, # High relevance only
|
|
||||||
|
|
||||||
# Smart filtering
|
|
||||||
filter_nonsense_urls=True, # Remove .xml, .txt, etc.
|
|
||||||
min_content_length=500, # Skip thin content
|
|
||||||
|
|
||||||
force=True # Bypass cache
|
|
||||||
)
|
|
||||||
|
|
||||||
# Discover with progress tracking
|
|
||||||
discovered = []
|
|
||||||
async for batch in seeder.discover_iter("https://physics-blog.com", research_config):
|
|
||||||
discovered.extend(batch)
|
|
||||||
print(f"Found {len(discovered)} relevant URLs so far...")
|
|
||||||
|
|
||||||
# Results include scores and metadata
|
|
||||||
for url_data in discovered[:5]:
|
|
||||||
print(f"URL: {url_data['url']}")
|
|
||||||
print(f"Score: {url_data['score']:.3f}")
|
|
||||||
print(f"Title: {url_data['title']}")
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Discovery Methods:**
|
**Discovery Methods:**
|
||||||
@@ -309,35 +271,18 @@ This release includes significant performance improvements through optimized res
|
|||||||
### What We Optimized
|
### What We Optimized
|
||||||
|
|
||||||
```python
|
```python
|
||||||
# Before v0.7.0 (slow)
|
# Optimized crawling with v0.7.0 improvements
|
||||||
results = []
|
results = []
|
||||||
for url in urls:
|
for url in urls:
|
||||||
result = await crawler.arun(url)
|
result = await crawler.arun(
|
||||||
results.append(result)
|
url,
|
||||||
|
config=CrawlerRunConfig(
|
||||||
# After v0.7.0 (fast)
|
# Performance optimizations
|
||||||
# Automatic batching and connection pooling
|
wait_until="domcontentloaded", # Faster than networkidle
|
||||||
results = await crawler.arun_batch(
|
cache_mode=CacheMode.ENABLED # Enable caching
|
||||||
urls,
|
)
|
||||||
config=CrawlerRunConfig(
|
|
||||||
# New performance options
|
|
||||||
batch_size=10, # Process 10 URLs concurrently
|
|
||||||
reuse_browser=True, # Keep browser warm
|
|
||||||
eager_loading=False, # Load only what's needed
|
|
||||||
streaming_extraction=True, # Stream large extractions
|
|
||||||
|
|
||||||
# Optimized defaults
|
|
||||||
wait_until="domcontentloaded", # Faster than networkidle
|
|
||||||
exclude_external_resources=True, # Skip third-party assets
|
|
||||||
block_ads=True # Ad blocking built-in
|
|
||||||
)
|
)
|
||||||
)
|
results.append(result)
|
||||||
|
|
||||||
# Memory-efficient streaming for large crawls
|
|
||||||
async for result in crawler.arun_stream(large_url_list):
|
|
||||||
# Process results as they complete
|
|
||||||
await process_result(result)
|
|
||||||
# Memory is freed after each iteration
|
|
||||||
```
|
```
|
||||||
|
|
||||||
**Performance Gains:**
|
**Performance Gains:**
|
||||||
@@ -347,24 +292,6 @@ async for result in crawler.arun_stream(large_url_list):
|
|||||||
- **Memory Usage**: 60% reduction with streaming processing
|
- **Memory Usage**: 60% reduction with streaming processing
|
||||||
- **Concurrent Crawls**: Handle 5x more parallel requests
|
- **Concurrent Crawls**: Handle 5x more parallel requests
|
||||||
|
|
||||||
## 📄 PDF Support
|
|
||||||
|
|
||||||
PDF extraction is now natively supported in Crawl4AI.
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Extract data from PDF documents
|
|
||||||
result = await crawler.arun(
|
|
||||||
"https://example.com/report.pdf",
|
|
||||||
config=CrawlerRunConfig(
|
|
||||||
pdf_extraction=True,
|
|
||||||
extraction_strategy=JsonCssExtractionStrategy({
|
|
||||||
# Works on converted PDF structure
|
|
||||||
"title": {"selector": "h1", "type": "text"},
|
|
||||||
"sections": {"selector": "h2", "type": "list"}
|
|
||||||
})
|
|
||||||
)
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
## 🔧 Important Changes
|
## 🔧 Important Changes
|
||||||
|
|
||||||
|
|||||||
@@ -35,7 +35,7 @@ from crawl4ai import AsyncWebCrawler, AdaptiveCrawler
|
|||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
# Create an adaptive crawler
|
# Create an adaptive crawler (config is optional)
|
||||||
adaptive = AdaptiveCrawler(crawler)
|
adaptive = AdaptiveCrawler(crawler)
|
||||||
|
|
||||||
# Start crawling with a query
|
# Start crawling with a query
|
||||||
@@ -59,13 +59,13 @@ async def main():
|
|||||||
from crawl4ai import AdaptiveConfig
|
from crawl4ai import AdaptiveConfig
|
||||||
|
|
||||||
config = AdaptiveConfig(
|
config = AdaptiveConfig(
|
||||||
confidence_threshold=0.7, # Stop when 70% confident (default: 0.8)
|
confidence_threshold=0.8, # Stop when 80% confident (default: 0.7)
|
||||||
max_pages=20, # Maximum pages to crawl (default: 50)
|
max_pages=30, # Maximum pages to crawl (default: 20)
|
||||||
top_k_links=3, # Links to follow per page (default: 5)
|
top_k_links=5, # Links to follow per page (default: 3)
|
||||||
min_gain_threshold=0.05 # Minimum expected gain to continue (default: 0.1)
|
min_gain_threshold=0.05 # Minimum expected gain to continue (default: 0.1)
|
||||||
)
|
)
|
||||||
|
|
||||||
adaptive = AdaptiveCrawler(crawler, config=config)
|
adaptive = AdaptiveCrawler(crawler, config)
|
||||||
```
|
```
|
||||||
|
|
||||||
## Crawling Strategies
|
## Crawling Strategies
|
||||||
@@ -198,8 +198,8 @@ if result.metrics.get('is_irrelevant', False):
|
|||||||
The confidence score (0-1) indicates how sufficient the gathered information is:
|
The confidence score (0-1) indicates how sufficient the gathered information is:
|
||||||
- **0.0-0.3**: Insufficient information, needs more crawling
|
- **0.0-0.3**: Insufficient information, needs more crawling
|
||||||
- **0.3-0.6**: Partial information, may answer basic queries
|
- **0.3-0.6**: Partial information, may answer basic queries
|
||||||
- **0.6-0.8**: Good coverage, can answer most queries
|
- **0.6-0.7**: Good coverage, can answer most queries
|
||||||
- **0.8-1.0**: Excellent coverage, comprehensive information
|
- **0.7-1.0**: Excellent coverage, comprehensive information
|
||||||
|
|
||||||
### Statistics Display
|
### Statistics Display
|
||||||
|
|
||||||
@@ -257,9 +257,9 @@ new_adaptive.import_knowledge_base("knowledge_base.jsonl")
|
|||||||
- Avoid overly broad queries
|
- Avoid overly broad queries
|
||||||
|
|
||||||
### 2. Threshold Tuning
|
### 2. Threshold Tuning
|
||||||
- Start with default (0.8) for general use
|
- Start with default (0.7) for general use
|
||||||
- Lower to 0.6-0.7 for exploratory crawling
|
- Lower to 0.5-0.6 for exploratory crawling
|
||||||
- Raise to 0.9+ for exhaustive coverage
|
- Raise to 0.8+ for exhaustive coverage
|
||||||
|
|
||||||
### 3. Performance Optimization
|
### 3. Performance Optimization
|
||||||
- Use appropriate `max_pages` limits
|
- Use appropriate `max_pages` limits
|
||||||
|
|||||||
@@ -125,7 +125,7 @@ Here's a full example you can copy, paste, and run immediately:
|
|||||||
```python
|
```python
|
||||||
import asyncio
|
import asyncio
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.async_configs import LinkPreviewConfig
|
from crawl4ai import LinkPreviewConfig
|
||||||
|
|
||||||
async def extract_link_heads_example():
|
async def extract_link_heads_example():
|
||||||
"""
|
"""
|
||||||
@@ -237,7 +237,7 @@ if __name__ == "__main__":
|
|||||||
The `LinkPreviewConfig` class supports these options:
|
The `LinkPreviewConfig` class supports these options:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.async_configs import LinkPreviewConfig
|
from crawl4ai import LinkPreviewConfig
|
||||||
|
|
||||||
link_preview_config = LinkPreviewConfig(
|
link_preview_config = LinkPreviewConfig(
|
||||||
# BASIC SETTINGS
|
# BASIC SETTINGS
|
||||||
|
|||||||
@@ -137,7 +137,7 @@ async def smart_blog_crawler():
|
|||||||
word_count_threshold=300 # Only substantial articles
|
word_count_threshold=300 # Only substantial articles
|
||||||
)
|
)
|
||||||
|
|
||||||
# Extract URLs and stream results as they come
|
# Extract URLs and crawl them
|
||||||
tutorial_urls = [t["url"] for t in tutorials[:10]]
|
tutorial_urls = [t["url"] for t in tutorials[:10]]
|
||||||
results = await crawler.arun_many(tutorial_urls, config=config)
|
results = await crawler.arun_many(tutorial_urls, config=config)
|
||||||
|
|
||||||
@@ -231,7 +231,7 @@ Common Crawl is a massive public dataset that regularly crawls the entire web. I
|
|||||||
|
|
||||||
```python
|
```python
|
||||||
# Use both sources
|
# Use both sources
|
||||||
config = SeedingConfig(source="cc+sitemap")
|
config = SeedingConfig(source="sitemap+cc")
|
||||||
urls = await seeder.urls("example.com", config)
|
urls = await seeder.urls("example.com", config)
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -241,13 +241,13 @@ The `SeedingConfig` object is your control panel. Here's everything you can conf
|
|||||||
|
|
||||||
| Parameter | Type | Default | Description |
|
| Parameter | Type | Default | Description |
|
||||||
|-----------|------|---------|-------------|
|
|-----------|------|---------|-------------|
|
||||||
| `source` | str | "cc" | URL source: "cc" (Common Crawl), "sitemap", or "cc+sitemap" |
|
| `source` | str | "sitemap+cc" | URL source: "cc" (Common Crawl), "sitemap", or "sitemap+cc" |
|
||||||
| `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") |
|
| `pattern` | str | "*" | URL pattern filter (e.g., "*/blog/*", "*.html") |
|
||||||
| `extract_head` | bool | False | Extract metadata from page `<head>` |
|
| `extract_head` | bool | False | Extract metadata from page `<head>` |
|
||||||
| `live_check` | bool | False | Verify URLs are accessible |
|
| `live_check` | bool | False | Verify URLs are accessible |
|
||||||
| `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) |
|
| `max_urls` | int | -1 | Maximum URLs to return (-1 = unlimited) |
|
||||||
| `concurrency` | int | 10 | Parallel workers for fetching |
|
| `concurrency` | int | 10 | Parallel workers for fetching |
|
||||||
| `hits_per_sec` | int | None | Rate limit for requests |
|
| `hits_per_sec` | int | 5 | Rate limit for requests |
|
||||||
| `force` | bool | False | Bypass cache, fetch fresh data |
|
| `force` | bool | False | Bypass cache, fetch fresh data |
|
||||||
| `verbose` | bool | False | Show detailed progress |
|
| `verbose` | bool | False | Show detailed progress |
|
||||||
| `query` | str | None | Search query for BM25 scoring |
|
| `query` | str | None | Search query for BM25 scoring |
|
||||||
@@ -522,7 +522,7 @@ urls = await seeder.urls("docs.example.com", config)
|
|||||||
```python
|
```python
|
||||||
# Find specific products
|
# Find specific products
|
||||||
config = SeedingConfig(
|
config = SeedingConfig(
|
||||||
source="cc+sitemap", # Use both sources
|
source="sitemap+cc", # Use both sources
|
||||||
extract_head=True,
|
extract_head=True,
|
||||||
query="wireless headphones noise canceling",
|
query="wireless headphones noise canceling",
|
||||||
scoring_method="bm25",
|
scoring_method="bm25",
|
||||||
@@ -782,7 +782,7 @@ class ResearchAssistant:
|
|||||||
|
|
||||||
# Step 1: Discover relevant URLs
|
# Step 1: Discover relevant URLs
|
||||||
config = SeedingConfig(
|
config = SeedingConfig(
|
||||||
source="cc+sitemap", # Maximum coverage
|
source="sitemap+cc", # Maximum coverage
|
||||||
extract_head=True, # Get metadata
|
extract_head=True, # Get metadata
|
||||||
query=topic, # Research topic
|
query=topic, # Research topic
|
||||||
scoring_method="bm25", # Smart scoring
|
scoring_method="bm25", # Smart scoring
|
||||||
@@ -832,7 +832,8 @@ class ResearchAssistant:
|
|||||||
# Extract URLs and crawl all articles
|
# Extract URLs and crawl all articles
|
||||||
article_urls = [article['url'] for article in top_articles]
|
article_urls = [article['url'] for article in top_articles]
|
||||||
results = []
|
results = []
|
||||||
async for result in await crawler.arun_many(article_urls, config=config):
|
crawl_results = await crawler.arun_many(article_urls, config=config)
|
||||||
|
async for result in crawl_results:
|
||||||
if result.success:
|
if result.success:
|
||||||
results.append({
|
results.append({
|
||||||
'url': result.url,
|
'url': result.url,
|
||||||
@@ -933,10 +934,10 @@ config = SeedingConfig(concurrency=10, hits_per_sec=5)
|
|||||||
# When crawling many URLs
|
# When crawling many URLs
|
||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
# Assuming urls is a list of URL strings
|
# Assuming urls is a list of URL strings
|
||||||
results = await crawler.arun_many(urls, config=config)
|
crawl_results = await crawler.arun_many(urls, config=config)
|
||||||
|
|
||||||
# Process as they arrive
|
# Process as they arrive
|
||||||
async for result in results:
|
async for result in crawl_results:
|
||||||
process_immediately(result) # Don't wait for all
|
process_immediately(result) # Don't wait for all
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -1020,7 +1021,7 @@ config = SeedingConfig(
|
|||||||
|
|
||||||
# E-commerce product discovery
|
# E-commerce product discovery
|
||||||
config = SeedingConfig(
|
config = SeedingConfig(
|
||||||
source="cc+sitemap",
|
source="sitemap+cc",
|
||||||
pattern="*/product/*",
|
pattern="*/product/*",
|
||||||
extract_head=True,
|
extract_head=True,
|
||||||
live_check=True
|
live_check=True
|
||||||
|
|||||||
@@ -28,7 +28,7 @@ from rich import box
|
|||||||
|
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, AdaptiveCrawler, AdaptiveConfig, BrowserConfig, CacheMode
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, AdaptiveCrawler, AdaptiveConfig, BrowserConfig, CacheMode
|
||||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||||
from crawl4ai.async_configs import LinkPreviewConfig, VirtualScrollConfig
|
from crawl4ai import LinkPreviewConfig, VirtualScrollConfig
|
||||||
from crawl4ai import c4a_compile, CompilationResult
|
from crawl4ai import c4a_compile, CompilationResult
|
||||||
|
|
||||||
# Initialize Rich console for beautiful output
|
# Initialize Rich console for beautiful output
|
||||||
|
|||||||
@@ -13,14 +13,13 @@ from crawl4ai import (
|
|||||||
BrowserConfig,
|
BrowserConfig,
|
||||||
CacheMode,
|
CacheMode,
|
||||||
# New imports for v0.7.0
|
# New imports for v0.7.0
|
||||||
LinkPreviewConfig,
|
|
||||||
VirtualScrollConfig,
|
VirtualScrollConfig,
|
||||||
|
LinkPreviewConfig,
|
||||||
AdaptiveCrawler,
|
AdaptiveCrawler,
|
||||||
AdaptiveConfig,
|
AdaptiveConfig,
|
||||||
AsyncUrlSeeder,
|
AsyncUrlSeeder,
|
||||||
SeedingConfig,
|
SeedingConfig,
|
||||||
c4a_compile,
|
c4a_compile,
|
||||||
CompilationResult
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
@@ -170,16 +169,16 @@ async def demo_url_seeder():
|
|||||||
# Discover Python tutorial URLs
|
# Discover Python tutorial URLs
|
||||||
config = SeedingConfig(
|
config = SeedingConfig(
|
||||||
source="sitemap", # Use sitemap
|
source="sitemap", # Use sitemap
|
||||||
pattern="*tutorial*", # URL pattern filter
|
pattern="*python*", # URL pattern filter
|
||||||
extract_head=True, # Get metadata
|
extract_head=True, # Get metadata
|
||||||
query="python async programming", # For relevance scoring
|
query="python tutorial", # For relevance scoring
|
||||||
scoring_method="bm25",
|
scoring_method="bm25",
|
||||||
score_threshold=0.2,
|
score_threshold=0.2,
|
||||||
max_urls=10
|
max_urls=10
|
||||||
)
|
)
|
||||||
|
|
||||||
print("Discovering Python async tutorial URLs...")
|
print("Discovering Python async tutorial URLs...")
|
||||||
urls = await seeder.urls("docs.python.org", config)
|
urls = await seeder.urls("https://www.geeksforgeeks.org/", config)
|
||||||
|
|
||||||
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
print(f"\n✅ Found {len(urls)} relevant URLs:")
|
||||||
for i, url_info in enumerate(urls[:5], 1):
|
for i, url_info in enumerate(urls[:5], 1):
|
||||||
@@ -245,39 +244,6 @@ IF (EXISTS `.price-filter`) THEN CLICK `input[data-max-price="100"]`
|
|||||||
print(f"❌ Compilation error: {result.first_error.message}")
|
print(f"❌ Compilation error: {result.first_error.message}")
|
||||||
|
|
||||||
|
|
||||||
async def demo_pdf_support():
|
|
||||||
"""
|
|
||||||
Demo 6: PDF Parsing Support
|
|
||||||
|
|
||||||
Shows how to extract content from PDF files.
|
|
||||||
Note: Requires 'pip install crawl4ai[pdf]'
|
|
||||||
"""
|
|
||||||
print("\n" + "="*60)
|
|
||||||
print("📄 DEMO 6: PDF Parsing Support")
|
|
||||||
print("="*60)
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Check if PDF support is installed
|
|
||||||
import PyPDF2
|
|
||||||
|
|
||||||
# Example: Process a PDF URL
|
|
||||||
config = CrawlerRunConfig(
|
|
||||||
cache_mode=CacheMode.BYPASS,
|
|
||||||
pdf=True, # Enable PDF generation
|
|
||||||
extract_text_from_pdf=True # Extract text content
|
|
||||||
)
|
|
||||||
|
|
||||||
print("PDF parsing is available!")
|
|
||||||
print("You can now crawl PDF URLs and extract their content.")
|
|
||||||
print("\nExample usage:")
|
|
||||||
print(' result = await crawler.arun("https://example.com/document.pdf")')
|
|
||||||
print(' pdf_text = result.extracted_content # Contains extracted text')
|
|
||||||
|
|
||||||
except ImportError:
|
|
||||||
print("⚠️ PDF support not installed.")
|
|
||||||
print("Install with: pip install crawl4ai[pdf]")
|
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
"""Run all demos"""
|
"""Run all demos"""
|
||||||
print("\n🚀 Crawl4AI v0.7.0 Feature Demonstrations")
|
print("\n🚀 Crawl4AI v0.7.0 Feature Demonstrations")
|
||||||
@@ -289,7 +255,6 @@ async def main():
|
|||||||
("Virtual Scroll", demo_virtual_scroll),
|
("Virtual Scroll", demo_virtual_scroll),
|
||||||
("URL Seeder", demo_url_seeder),
|
("URL Seeder", demo_url_seeder),
|
||||||
("C4A Script", demo_c4a_script),
|
("C4A Script", demo_c4a_script),
|
||||||
("PDF Support", demo_pdf_support)
|
|
||||||
]
|
]
|
||||||
|
|
||||||
for name, demo_func in demos:
|
for name, demo_func in demos:
|
||||||
@@ -309,7 +274,6 @@ async def main():
|
|||||||
print("• Virtual Scroll: Capture all content from modern web pages")
|
print("• Virtual Scroll: Capture all content from modern web pages")
|
||||||
print("• URL Seeder: Pre-discover and filter URLs efficiently")
|
print("• URL Seeder: Pre-discover and filter URLs efficiently")
|
||||||
print("• C4A Script: Simple language for complex automations")
|
print("• C4A Script: Simple language for complex automations")
|
||||||
print("• PDF Support: Extract content from PDF documents")
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
|
|||||||
@@ -5,7 +5,7 @@ Test script for Link Extractor functionality
|
|||||||
|
|
||||||
from crawl4ai.models import Link
|
from crawl4ai.models import Link
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||||
from crawl4ai.async_configs import LinkPreviewConfig
|
from crawl4ai import LinkPreviewConfig
|
||||||
import asyncio
|
import asyncio
|
||||||
import sys
|
import sys
|
||||||
import os
|
import os
|
||||||
@@ -237,7 +237,7 @@ def test_config_examples():
|
|||||||
print(f" {key}: {value}")
|
print(f" {key}: {value}")
|
||||||
|
|
||||||
print(" Usage:")
|
print(" Usage:")
|
||||||
print(" from crawl4ai.async_configs import LinkPreviewConfig")
|
print(" from crawl4ai import LinkPreviewConfig")
|
||||||
print(" config = CrawlerRunConfig(")
|
print(" config = CrawlerRunConfig(")
|
||||||
print(" link_preview_config=LinkPreviewConfig(")
|
print(" link_preview_config=LinkPreviewConfig(")
|
||||||
for key, value in config_dict.items():
|
for key, value in config_dict.items():
|
||||||
|
|||||||
Reference in New Issue
Block a user