Merge branch 'next' into 2025-MAR-ALPHA-1

This commit is contained in:
Aravind Karnam
2025-03-13 10:42:22 +05:30
67 changed files with 4194 additions and 729 deletions

View File

@@ -0,0 +1,230 @@
"""
Crawl4AI Crypto Trading Analysis Demo
Author: Unclecode
Date: 2024-03-15
This script demonstrates advanced crypto market analysis using:
1. Web scraping of real-time CoinMarketCap data
2. Smart table extraction with layout detection
3. Hedge fund-grade financial metrics
4. Interactive visualizations for trading signals
Key Features:
- Volume Anomaly Detection: Finds unusual trading activity
- Liquidity Power Score: Identifies easily tradable assets
- Volatility-Weighted Momentum: Surface sustainable trends
- Smart Money Signals: Algorithmic buy/hold recommendations
"""
import asyncio
import pandas as pd
import plotly.express as px
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LXMLWebScrapingStrategy
from crawl4ai import CrawlResult
from typing import List
from IPython.display import HTML
class CryptoAlphaGenerator:
"""
Advanced crypto analysis engine that transforms raw web data into:
- Volume anomaly flags
- Liquidity scores
- Momentum-risk ratios
- Machine learning-inspired trading signals
Methods:
analyze_tables(): Process raw tables into trading insights
create_visuals(): Generate institutional-grade visualizations
generate_insights(): Create plain English trading recommendations
"""
def clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Convert crypto market data to machine-readable format
Handles currency symbols, units (B=Billions), and percentage values
"""
# Clean numeric columns
df['Price'] = df['Price'].str.replace('[^\d.]', '', regex=True).astype(float)
df['Market Cap'] = df['Market Cap'].str.extract(r'\$([\d.]+)B')[0].astype(float) * 1e9
df['Volume(24h)'] = df['Volume(24h)'].str.extract(r'\$([\d.]+)B')[0].astype(float) * 1e9
# Convert percentages to decimal values
for col in ['1h %', '24h %', '7d %']:
df[col] = df[col].str.replace('%', '').astype(float) / 100
return df
def calculate_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
"""
Compute advanced trading metrics used by quantitative funds:
1. Volume/Market Cap Ratio - Measures liquidity efficiency
(High ratio = Underestimated attention)
2. Volatility Score - Risk-adjusted momentum potential
(STD of 1h/24h/7d returns)
3. Momentum Score - Weighted average of returns
(1h:30% + 24h:50% + 7d:20%)
4. Volume Anomaly - 3σ deviation detection
(Flags potential insider activity)
"""
# Liquidity Metrics
df['Volume/Market Cap Ratio'] = df['Volume(24h)'] / df['Market Cap']
# Risk Metrics
df['Volatility Score'] = df[['1h %','24h %','7d %']].std(axis=1)
# Momentum Metrics
df['Momentum Score'] = (df['1h %']*0.3 + df['24h %']*0.5 + df['7d %']*0.2)
# Anomaly Detection
median_vol = df['Volume(24h)'].median()
df['Volume Anomaly'] = df['Volume(24h)'] > 3 * median_vol
# Value Flags
df['Undervalued Flag'] = (df['Market Cap'] < 1e9) & (df['Momentum Score'] > 0.05)
df['Liquid Giant'] = (df['Volume/Market Cap Ratio'] > 0.15) & (df['Market Cap'] > 1e9)
return df
def create_visuals(self, df: pd.DataFrame) -> dict:
"""
Generate three institutional-grade visualizations:
1. 3D Market Map - X:Size, Y:Liquidity, Z:Momentum
2. Liquidity Tree - Color:Volume Efficiency
3. Momentum Leaderboard - Top sustainable movers
"""
# 3D Market Overview
fig1 = px.scatter_3d(
df,
x='Market Cap',
y='Volume/Market Cap Ratio',
z='Momentum Score',
size='Volatility Score',
color='Volume Anomaly',
hover_name='Name',
title='Smart Money Market Map: Spot Overlooked Opportunities',
labels={'Market Cap': 'Size (Log $)', 'Volume/Market Cap Ratio': 'Liquidity Power'},
log_x=True,
template='plotly_dark'
)
# Liquidity Efficiency Tree
fig2 = px.treemap(
df,
path=['Name'],
values='Market Cap',
color='Volume/Market Cap Ratio',
hover_data=['Momentum Score'],
title='Liquidity Forest: Green = High Trading Efficiency',
color_continuous_scale='RdYlGn'
)
# Momentum Leaders
fig3 = px.bar(
df.sort_values('Momentum Score', ascending=False).head(10),
x='Name',
y='Momentum Score',
color='Volatility Score',
title='Sustainable Momentum Leaders (Low Volatility + High Growth)',
text='7d %',
template='plotly_dark'
)
return {'market_map': fig1, 'liquidity_tree': fig2, 'momentum_leaders': fig3}
def generate_insights(self, df: pd.DataFrame) -> str:
"""
Create plain English trading insights explaining:
- Volume spikes and their implications
- Risk-reward ratios of top movers
- Liquidity warnings for large positions
"""
top_coin = df.sort_values('Momentum Score', ascending=False).iloc[0]
anomaly_coins = df[df['Volume Anomaly']].sort_values('Volume(24h)', ascending=False)
report = f"""
🚀 Top Alpha Opportunity: {top_coin['Name']}
- Momentum Score: {top_coin['Momentum Score']:.2%} (Top 1%)
- Risk-Reward Ratio: {top_coin['Momentum Score']/top_coin['Volatility Score']:.1f}
- Liquidity Warning: {'✅ Safe' if top_coin['Liquid Giant'] else '⚠️ Thin Markets'}
🔥 Volume Spikes Detected ({len(anomaly_coins)} coins):
{anomaly_coins[['Name', 'Volume(24h)']].head(3).to_markdown(index=False)}
💡 Smart Money Tip: Coins with Volume/Cap > 15% and Momentum > 5%
historically outperform by 22% weekly returns.
"""
return report
async def main():
"""
Main execution flow:
1. Configure headless browser for scraping
2. Extract live crypto market data
3. Clean and analyze using hedge fund models
4. Generate visualizations and insights
5. Output professional trading report
"""
# Configure browser with anti-detection features
browser_config = BrowserConfig(
headless=True,
stealth=True,
block_resources=["image", "media"]
)
# Initialize crawler with smart table detection
crawler = AsyncWebCrawler(config=browser_config)
await crawler.start()
try:
# Set up scraping parameters
crawl_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
scraping_strategy=LXMLWebScrapingStrategy(
table_score_threshold=8, # Strict table detection
keep_data_attributes=True
)
)
# Execute market data extraction
results: List[CrawlResult] = await crawler.arun(
url='https://coinmarketcap.com/?page=1',
config=crawl_config
)
# Process results
for result in results:
if result.success and result.media['tables']:
# Extract primary market table
raw_df = pd.DataFrame(
result.media['tables'][0]['rows'],
columns=result.media['tables'][0]['headers']
)
# Initialize analysis engine
analyzer = CryptoAlphaGenerator()
clean_df = analyzer.clean_data(raw_df)
analyzed_df = analyzer.calculate_metrics(clean_df)
# Generate outputs
visuals = analyzer.create_visuals(analyzed_df)
insights = analyzer.generate_insights(analyzed_df)
# Save visualizations
visuals['market_map'].write_html("market_map.html")
visuals['liquidity_tree'].write_html("liquidity_tree.html")
# Display results
print("🔑 Key Trading Insights:")
print(insights)
print("\n📊 Open 'market_map.html' for interactive analysis")
finally:
await crawler.close()
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -65,7 +65,6 @@ async def basic_deep_crawl():
f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
)
# 2⃣ Stream vs. Non-Stream Execution
async def stream_vs_nonstream():
"""
@@ -80,7 +79,7 @@ async def stream_vs_nonstream():
base_config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1, include_external=False),
scraping_strategy=LXMLWebScrapingStrategy(),
verbose=True,
verbose=False,
)
async with AsyncWebCrawler() as crawler:
@@ -127,7 +126,6 @@ async def stream_vs_nonstream():
print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
# 3⃣ Introduce Filters & Scorers
async def filters_and_scorers():
"""
@@ -212,11 +210,11 @@ async def filters_and_scorers():
# Create a keyword relevance scorer
keyword_scorer = KeywordRelevanceScorer(
keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=0.3
keywords=["crawl", "example", "async", "configuration","javascript","css"], weight=1
)
config = CrawlerRunConfig(
deep_crawl_strategy=BestFirstCrawlingStrategy( # Note: Changed to BestFirst
deep_crawl_strategy=BestFirstCrawlingStrategy(
max_depth=1, include_external=False, url_scorer=keyword_scorer
),
scraping_strategy=LXMLWebScrapingStrategy(),
@@ -236,11 +234,172 @@ async def filters_and_scorers():
print(f" ✅ Crawler prioritized {len(results)} pages by relevance score")
print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
# 4⃣ Advanced Filters
async def advanced_filters():
"""
PART 4: Demonstrates advanced filtering techniques for specialized crawling.
# 4⃣ Wrap-Up and Key Takeaways
This function covers:
- SEO filters
- Text relevancy filtering
- Combining advanced filters
"""
print("\n===== ADVANCED FILTERS =====")
async with AsyncWebCrawler() as crawler:
# SEO FILTER EXAMPLE
print("\n📊 EXAMPLE 1: SEO FILTERS")
print(
"Quantitative SEO quality assessment filter based searching keywords in the head section"
)
seo_filter = SEOFilter(
threshold=0.5, keywords=["dynamic", "interaction", "javascript"]
)
config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
max_depth=1, filter_chain=FilterChain([seo_filter])
),
scraping_strategy=LXMLWebScrapingStrategy(),
verbose=True,
cache_mode=CacheMode.BYPASS,
)
results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
print(f" ✅ Found {len(results)} pages with relevant keywords")
for result in results:
print(f"{result.url}")
# ADVANCED TEXT RELEVANCY FILTER
print("\n📊 EXAMPLE 2: ADVANCED TEXT RELEVANCY FILTER")
# More sophisticated content relevance filter
relevance_filter = ContentRelevanceFilter(
query="Interact with the web using your authentic digital identity",
threshold=0.7,
)
config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
max_depth=1, filter_chain=FilterChain([relevance_filter])
),
scraping_strategy=LXMLWebScrapingStrategy(),
verbose=True,
cache_mode=CacheMode.BYPASS,
)
results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
print(f" ✅ Found {len(results)} pages")
for result in results:
relevance_score = result.metadata.get("relevance_score", 0)
print(f" → Score: {relevance_score:.2f} | {result.url}")
# 5⃣ Max Pages and Score Thresholds
async def max_pages_and_thresholds():
"""
PART 5: Demonstrates using max_pages and score_threshold parameters with different strategies.
This function shows:
- How to limit the number of pages crawled
- How to set score thresholds for more targeted crawling
- Comparing BFS, DFS, and Best-First strategies with these parameters
"""
print("\n===== MAX PAGES AND SCORE THRESHOLDS =====")
from crawl4ai.deep_crawling import DFSDeepCrawlStrategy
async with AsyncWebCrawler() as crawler:
# Define a common keyword scorer for all examples
keyword_scorer = KeywordRelevanceScorer(
keywords=["browser", "crawler", "web", "automation"],
weight=1.0
)
# EXAMPLE 1: BFS WITH MAX PAGES
print("\n📊 EXAMPLE 1: BFS STRATEGY WITH MAX PAGES LIMIT")
print(" Limit the crawler to a maximum of 5 pages")
bfs_config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
max_depth=2,
include_external=False,
url_scorer=keyword_scorer,
max_pages=5 # Only crawl 5 pages
),
scraping_strategy=LXMLWebScrapingStrategy(),
verbose=True,
cache_mode=CacheMode.BYPASS,
)
results = await crawler.arun(url="https://docs.crawl4ai.com", config=bfs_config)
print(f" ✅ Crawled exactly {len(results)} pages as specified by max_pages")
for result in results:
depth = result.metadata.get("depth", 0)
print(f" → Depth: {depth} | {result.url}")
# EXAMPLE 2: DFS WITH SCORE THRESHOLD
print("\n📊 EXAMPLE 2: DFS STRATEGY WITH SCORE THRESHOLD")
print(" Only crawl pages with a relevance score above 0.5")
dfs_config = CrawlerRunConfig(
deep_crawl_strategy=DFSDeepCrawlStrategy(
max_depth=2,
include_external=False,
url_scorer=keyword_scorer,
score_threshold=0.7, # Only process URLs with scores above 0.5
max_pages=10
),
scraping_strategy=LXMLWebScrapingStrategy(),
verbose=True,
cache_mode=CacheMode.BYPASS,
)
results = await crawler.arun(url="https://docs.crawl4ai.com", config=dfs_config)
print(f" ✅ Crawled {len(results)} pages with scores above threshold")
for result in results:
score = result.metadata.get("score", 0)
depth = result.metadata.get("depth", 0)
print(f" → Depth: {depth} | Score: {score:.2f} | {result.url}")
# EXAMPLE 3: BEST-FIRST WITH BOTH CONSTRAINTS
print("\n📊 EXAMPLE 3: BEST-FIRST STRATEGY WITH BOTH CONSTRAINTS")
print(" Limit to 7 pages with scores above 0.3, prioritizing highest scores")
bf_config = CrawlerRunConfig(
deep_crawl_strategy=BestFirstCrawlingStrategy(
max_depth=2,
include_external=False,
url_scorer=keyword_scorer,
max_pages=7, # Limit to 7 pages total
),
scraping_strategy=LXMLWebScrapingStrategy(),
verbose=True,
cache_mode=CacheMode.BYPASS,
stream=True,
)
results = []
async for result in await crawler.arun(url="https://docs.crawl4ai.com", config=bf_config):
results.append(result)
score = result.metadata.get("score", 0)
depth = result.metadata.get("depth", 0)
print(f" → Depth: {depth} | Score: {score:.2f} | {result.url}")
print(f" ✅ Crawled {len(results)} high-value pages with scores above 0.3")
if results:
avg_score = sum(r.metadata.get('score', 0) for r in results) / len(results)
print(f" ✅ Average score: {avg_score:.2f}")
print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
# 6⃣ Wrap-Up and Key Takeaways
async def wrap_up():
"""
PART 4: Wrap-Up and Key Takeaways
PART 6: Wrap-Up and Key Takeaways
Summarize the key concepts learned in this tutorial.
"""
@@ -308,71 +467,6 @@ async def wrap_up():
print(f" Depth {depth}: {count} pages")
# 5⃣ Advanced Filters
async def advanced_filters():
"""
PART 5: Demonstrates advanced filtering techniques for specialized crawling.
This function covers:
- SEO filters
- Text relevancy filtering
- Combining advanced filters
"""
print("\n===== ADVANCED FILTERS =====")
async with AsyncWebCrawler() as crawler:
# SEO FILTER EXAMPLE
print("\n📊 EXAMPLE 1: SEO FILTERS")
print(
"Quantitative SEO quality assessment filter based searching keywords in the head section"
)
seo_filter = SEOFilter(
threshold=0.5, keywords=["dynamic", "interaction", "javascript"]
)
config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
max_depth=1, filter_chain=FilterChain([seo_filter])
),
scraping_strategy=LXMLWebScrapingStrategy(),
verbose=True,
cache_mode=CacheMode.BYPASS,
)
results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
print(f" ✅ Found {len(results)} pages with relevant keywords")
for result in results:
print(f"{result.url}")
# ADVANCED TEXT RELEVANCY FILTER
print("\n📊 EXAMPLE 2: ADVANCED TEXT RELEVANCY FILTER")
# More sophisticated content relevance filter
relevance_filter = ContentRelevanceFilter(
query="Interact with the web using your authentic digital identity",
threshold=0.7,
)
config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
max_depth=1, filter_chain=FilterChain([relevance_filter])
),
scraping_strategy=LXMLWebScrapingStrategy(),
verbose=True,
cache_mode=CacheMode.BYPASS,
)
results = await crawler.arun(url="https://docs.crawl4ai.com", config=config)
print(f" ✅ Found {len(results)} pages")
for result in results:
relevance_score = result.metadata.get("relevance_score", 0)
print(f" → Score: {relevance_score:.2f} | {result.url}")
# Main function to run the entire tutorial
async def run_tutorial():
"""
Executes all tutorial sections in sequence.
@@ -387,8 +481,9 @@ async def run_tutorial():
basic_deep_crawl,
stream_vs_nonstream,
filters_and_scorers,
wrap_up,
max_pages_and_thresholds,
advanced_filters,
wrap_up,
]
for section in tutorial_sections:
@@ -398,7 +493,6 @@ async def run_tutorial():
print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
print("For more information, check out https://docs.crawl4ai.com")
# Execute the tutorial when run directly
if __name__ == "__main__":
asyncio.run(run_tutorial())

View File

@@ -39,7 +39,7 @@ async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
start = time.perf_counter()
async with AsyncWebCrawler(config=browser_config) as crawler:
dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=70.0,
memory_threshold_percent=95.0,
max_session_permit=10,
rate_limiter=RateLimiter(
base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2

View File

@@ -11,7 +11,7 @@ import asyncio
import os
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.async_configs import LlmConfig
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import (
LLMExtractionStrategy,
JsonCssExtractionStrategy,
@@ -61,19 +61,19 @@ async def main():
# 1. LLM Extraction with different input formats
markdown_strategy = LLMExtractionStrategy(
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
instruction="Extract product information including name, price, and description",
)
html_strategy = LLMExtractionStrategy(
input_format="html",
llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
instruction="Extract product information from HTML including structured data",
)
fit_markdown_strategy = LLMExtractionStrategy(
input_format="fit_markdown",
llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
llm_config=LLMConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
instruction="Extract product information from cleaned markdown",
)

View File

@@ -16,9 +16,9 @@ async def main():
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48, threshold_type="fixed", min_word_threshold=0
)
# content_filter=PruningContentFilter(
# threshold=0.48, threshold_type="fixed", min_word_threshold=0
# )
),
)
result : CrawlResult = await crawler.arun(

View File

@@ -0,0 +1,108 @@
"""
Identity-Based Browsing Example with Crawl4AI
This example demonstrates how to:
1. Create a persistent browser profile interactively
2. List available profiles
3. Use a saved profile for crawling authenticated sites
4. Delete profiles when no longer needed
Uses the new BrowserProfiler class for profile management.
"""
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig
from crawl4ai.browser_profiler import BrowserProfiler
from crawl4ai.async_logger import AsyncLogger
from colorama import Fore, Style, init
# Initialize colorama
init()
# Create a shared logger instance
logger = AsyncLogger(verbose=True)
# Create a shared BrowserProfiler instance
profiler = BrowserProfiler(logger=logger)
async def crawl_with_profile(profile_path, url):
"""Use a profile to crawl an authenticated page"""
logger.info(f"\nCrawling {Fore.CYAN}{url}{Style.RESET_ALL} using profile at {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="CRAWL")
# Create browser config with the profile path
browser_config = BrowserConfig(
headless=False, # Set to False if you want to see the browser window
use_managed_browser=True, # Required for persistent profiles
user_data_dir=profile_path
)
start_time = asyncio.get_event_loop().time()
# Initialize crawler with the browser config
async with AsyncWebCrawler(config=browser_config) as crawler:
# Crawl the URL - You should have access to authenticated content now
result = await crawler.arun(url)
elapsed_time = asyncio.get_event_loop().time() - start_time
if result.success:
# Use url_status method for consistent logging
logger.url_status(url, True, elapsed_time, tag="CRAWL")
# Print page title or some indication of success
title = result.metadata.get("title", "")
logger.success(f"Page title: {Fore.GREEN}{title}{Style.RESET_ALL}", tag="CRAWL")
return result
else:
# Log error status
logger.error_status(url, result.error_message, tag="CRAWL")
return None
async def main():
logger.info(f"{Fore.CYAN}Identity-Based Browsing Example with Crawl4AI{Style.RESET_ALL}", tag="DEMO")
logger.info("This example demonstrates using profiles for authenticated browsing", tag="DEMO")
# Choose between interactive mode and automatic mode
mode = input(f"{Fore.CYAN}Run in [i]nteractive mode or [a]utomatic mode? (i/a): {Style.RESET_ALL}").lower()
if mode == 'i':
# Interactive profile management - use the interactive_manager method
# Pass the crawl_with_profile function as the callback for the "crawl a website" option
await profiler.interactive_manager(crawl_callback=crawl_with_profile)
else:
# Automatic mode - simplified example
profiles = profiler.list_profiles()
if not profiles:
# Create a new profile if none exists
logger.info("No profiles found. Creating a new one...", tag="DEMO")
profile_path = await profiler.create_profile()
if not profile_path:
logger.error("Cannot proceed without a valid profile", tag="DEMO")
return
else:
# Use the first (most recent) profile
profile_path = profiles[0]["path"]
logger.info(f"Using existing profile: {Fore.CYAN}{profiles[0]['name']}{Style.RESET_ALL}", tag="DEMO")
# Example: Crawl an authenticated page
urls_to_crawl = [
"https://github.com/settings/profile", # GitHub requires login
# "https://twitter.com/home", # Twitter requires login
# "https://www.linkedin.com/feed/", # LinkedIn requires login
]
for url in urls_to_crawl:
await crawl_with_profile(profile_path, url)
if __name__ == "__main__":
try:
# Run the async main function
asyncio.run(main())
except KeyboardInterrupt:
logger.warning("Example interrupted by user", tag="DEMO")
except Exception as e:
logger.error(f"Error in example: {str(e)}", tag="DEMO")

View File

@@ -1,10 +1,11 @@
from crawl4ai.async_configs import LlmConfig
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *
from crawl4ai import LLMConfig
from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
import asyncio
import os
import json
from pydantic import BaseModel, Field
url = r"https://openai.com/api/pricing/"
url = "https://openai.com/api/pricing/"
class OpenAIModelFee(BaseModel):
@@ -14,10 +15,6 @@ class OpenAIModelFee(BaseModel):
..., description="Fee for output token for the OpenAI model."
)
from crawl4ai import AsyncWebCrawler
async def main():
# Use AsyncWebCrawler
async with AsyncWebCrawler() as crawler:
@@ -26,7 +23,7 @@ async def main():
word_count_threshold=1,
extraction_strategy=LLMExtractionStrategy(
# provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
llmConfig=LlmConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
llm_config=LLMConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema",
instruction="From the crawled content, extract all mentioned model names along with their "

View File

@@ -1,7 +1,7 @@
import os
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
from crawl4ai.async_configs import LlmConfig
from crawl4ai import LLMConfig
from crawl4ai.content_filter_strategy import LLMContentFilter
async def test_llm_filter():
@@ -23,7 +23,7 @@ async def test_llm_filter():
# Initialize LLM filter with focused instruction
filter = LLMContentFilter(
llmConfig=LlmConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
instruction="""
Focus on extracting the core educational content about Python classes.
Include:
@@ -43,7 +43,7 @@ async def test_llm_filter():
)
filter = LLMContentFilter(
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
chunk_token_threshold=2 ** 12 * 2, # 2048 * 2
ignore_cache = True,
instruction="""

View File

@@ -1,6 +1,6 @@
import os, sys
from crawl4ai.async_configs import LlmConfig
from crawl4ai import LLMConfig
sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
@@ -211,7 +211,7 @@ async def extract_structured_data_using_llm(
word_count_threshold=1,
page_timeout=80000,
extraction_strategy=LLMExtractionStrategy(
llmConfig=LlmConfig(provider=provider,api_token=api_token),
llm_config=LLMConfig(provider=provider,api_token=api_token),
schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
@@ -416,6 +416,7 @@ async def crawl_dynamic_content_pages_method_2():
async def cosine_similarity_extraction():
from crawl4ai.extraction_strategy import CosineStrategy
crawl_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
extraction_strategy=CosineStrategy(
@@ -507,6 +508,9 @@ async def ssl_certification():
if result.success and result.ssl_certificate:
cert = result.ssl_certificate
tmp_dir = os.path.join(__location__, "tmp")
os.makedirs(tmp_dir, exist_ok=True)
# 1. Access certificate properties directly
print("\nCertificate Information:")
print(f"Issuer: {cert.issuer.get('CN', '')}")
@@ -529,67 +533,6 @@ async def ssl_certification():
print(f"- DER: {os.path.join(tmp_dir, 'certificate.der')}")
# Speed Comparison
async def speed_comparison():
print("\n--- Speed Comparison ---")
# Firecrawl comparison
from firecrawl import FirecrawlApp
app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
start = time.time()
scrape_status = app.scrape_url(
"https://www.nbcnews.com/business", params={"formats": ["markdown", "html"]}
)
end = time.time()
print("Firecrawl:")
print(f"Time taken: {end - start:.2f} seconds")
print(f"Content length: {len(scrape_status['markdown'])} characters")
print(f"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}")
print()
# Crawl4AI comparisons
browser_config = BrowserConfig(headless=True)
# Simple crawl
async with AsyncWebCrawler(config=browser_config) as crawler:
start = time.time()
result = await crawler.arun(
url="https://www.nbcnews.com/business",
config=CrawlerRunConfig(
cache_mode=CacheMode.BYPASS, word_count_threshold=0
),
)
end = time.time()
print("Crawl4AI (simple crawl):")
print(f"Time taken: {end - start:.2f} seconds")
print(f"Content length: {len(result.markdown)} characters")
print(f"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}")
print()
# Advanced filtering
start = time.time()
result = await crawler.arun(
url="https://www.nbcnews.com/business",
config=CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
word_count_threshold=0,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48, threshold_type="fixed", min_word_threshold=0
)
),
),
)
end = time.time()
print("Crawl4AI (Markdown Plus):")
print(f"Time taken: {end - start:.2f} seconds")
print(f"Content length: {len(result.markdown.raw_markdown)} characters")
print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
print(f"Images found: {result.markdown.raw_markdown.count('cldnry.s-nbcnews.com')}")
print()
# Main execution
async def main():
# Basic examples

View File

@@ -1,6 +1,6 @@
import os, sys
from crawl4ai.async_configs import LlmConfig
from crawl4ai import LLMConfig
# append parent directory to system path
sys.path.append(
@@ -147,7 +147,7 @@ async def extract_structured_data_using_llm(
url="https://openai.com/api/pricing/",
word_count_threshold=1,
extraction_strategy=LLMExtractionStrategy(
llmConfig=LlmConfig(provider=provider,api_token=api_token),
llm_config=LLMConfig(provider=provider,api_token=api_token),
schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
@@ -570,7 +570,7 @@ async def generate_knowledge_graph():
relationships: List[Relationship]
extraction_strategy = LLMExtractionStrategy(
llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token"
llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token"
schema=KnowledgeGraph.model_json_schema(),
extraction_type="schema",
instruction="""Extract entities and relationships from the given text.""",

View File

@@ -1,6 +1,6 @@
import os
import time
from crawl4ai.async_configs import LlmConfig
from crawl4ai import LLMConfig
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
@@ -179,7 +179,7 @@ def add_llm_extraction_strategy(crawler):
result = crawler.run(
url="https://www.nbcnews.com/business",
extraction_strategy=LLMExtractionStrategy(
llmConfig = LlmConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
llm_config = LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
),
)
cprint(
@@ -198,7 +198,7 @@ def add_llm_extraction_strategy(crawler):
result = crawler.run(
url="https://www.nbcnews.com/business",
extraction_strategy=LLMExtractionStrategy(
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
instruction="I am interested in only financial news",
),
)
@@ -210,7 +210,7 @@ def add_llm_extraction_strategy(crawler):
result = crawler.run(
url="https://www.nbcnews.com/business",
extraction_strategy=LLMExtractionStrategy(
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
instruction="Extract only content related to technology",
),
)

View File

@@ -13,11 +13,11 @@ from crawl4ai.deep_crawling import (
)
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
from crawl4ai.configs import ProxyConfig
from crawl4ai.proxy_strategy import ProxyConfig
from crawl4ai import RoundRobinProxyStrategy
from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai.async_configs import LlmConfig
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
from pprint import pprint
@@ -284,9 +284,9 @@ async def llm_content_filter():
PART 5: LLM Content Filter
This function demonstrates:
- Configuring LLM providers via LlmConfig
- Configuring LLM providers via LLMConfig
- Using LLM to generate focused markdown
- LlmConfig for configuration
- LLMConfig for configuration
Note: Requires a valid API key for the chosen LLM provider
"""
@@ -296,7 +296,7 @@ async def llm_content_filter():
# Create LLM configuration
# Replace with your actual API key or set as environment variable
llm_config = LlmConfig(
llm_config = LLMConfig(
provider="gemini/gemini-1.5-pro",
api_token="env:GEMINI_API_KEY" # Will read from GEMINI_API_KEY environment variable
)
@@ -309,7 +309,7 @@ async def llm_content_filter():
# Create markdown generator with LLM filter
markdown_generator = DefaultMarkdownGenerator(
content_filter=LLMContentFilter(
llmConfig=llm_config,
llm_config=llm_config,
instruction="Extract key concepts and summaries"
)
)
@@ -381,7 +381,7 @@ async def llm_schema_generation():
PART 7: LLM Schema Generation
This function demonstrates:
- Configuring LLM providers via LlmConfig
- Configuring LLM providers via LLMConfig
- Using LLM to generate extraction schemas
- JsonCssExtractionStrategy
@@ -406,9 +406,9 @@ async def llm_schema_generation():
<div class="rating">4.7/5</div>
</div>
"""
print("\n📊 Setting up LlmConfig...")
print("\n📊 Setting up LLMConfig...")
# Create LLM configuration
llm_config = LlmConfig(
llm_config = LLMConfig(
provider="gemini/gemini-1.5-pro",
api_token="env:GEMINI_API_KEY"
)
@@ -416,7 +416,7 @@ async def llm_schema_generation():
print(" This would use the LLM to analyze HTML and create an extraction schema")
schema = JsonCssExtractionStrategy.generate_schema(
html=sample_html,
llmConfig = llm_config,
llm_config = llm_config,
query="Extract product name and price"
)
print("\n✅ Generated Schema:")

View File

@@ -167,13 +167,114 @@ async with AsyncWebCrawler() as crawler:
---
## 6. Summary
## 6. Using the BrowserProfiler Class
- **Create** your user-data directory by launching Chrome/Chromium externally with `--user-data-dir=/some/path`.
- **Log in** or configure sites as needed, then close the browser.
- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`.
- Enjoy **persistent** sessions that reflect your real identity.
- If you only need quick, ephemeral automation, **Magic Mode** might suffice.
Crawl4AI provides a dedicated `BrowserProfiler` class for managing browser profiles, making it easy to create, list, and delete profiles for identity-based browsing.
### Creating and Managing Profiles with BrowserProfiler
The `BrowserProfiler` class offers a comprehensive API for browser profile management:
```python
import asyncio
from crawl4ai import BrowserProfiler
async def manage_profiles():
# Create a profiler instance
profiler = BrowserProfiler()
# Create a profile interactively - opens a browser window
profile_path = await profiler.create_profile(
profile_name="my-login-profile" # Optional: name your profile
)
print(f"Profile saved at: {profile_path}")
# List all available profiles
profiles = profiler.list_profiles()
for profile in profiles:
print(f"Profile: {profile['name']}")
print(f" Path: {profile['path']}")
print(f" Created: {profile['created']}")
print(f" Browser type: {profile['type']}")
# Get a specific profile path by name
specific_profile = profiler.get_profile_path("my-login-profile")
# Delete a profile when no longer needed
success = profiler.delete_profile("old-profile-name")
asyncio.run(manage_profiles())
```
**How profile creation works:**
1. A browser window opens for you to interact with
2. You log in to websites, set preferences, etc.
3. When you're done, press 'q' in the terminal to close the browser
4. The profile is saved in the Crawl4AI profiles directory
5. You can use the returned path with `BrowserConfig.user_data_dir`
### Interactive Profile Management
The `BrowserProfiler` also offers an interactive management console that guides you through profile creation, listing, and deletion:
```python
import asyncio
from crawl4ai import BrowserProfiler, AsyncWebCrawler, BrowserConfig
# Define a function to use a profile for crawling
async def crawl_with_profile(profile_path, url):
browser_config = BrowserConfig(
headless=True,
use_managed_browser=True,
user_data_dir=profile_path
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url)
return result
async def main():
# Create a profiler instance
profiler = BrowserProfiler()
# Launch the interactive profile manager
# Passing the crawl function as a callback adds a "crawl with profile" option
await profiler.interactive_manager(crawl_callback=crawl_with_profile)
asyncio.run(main())
```
### Legacy Methods
For backward compatibility, the previous methods on `ManagedBrowser` are still available, but they delegate to the new `BrowserProfiler` class:
```python
from crawl4ai.browser_manager import ManagedBrowser
# These methods still work but use BrowserProfiler internally
profiles = ManagedBrowser.list_profiles()
```
### Complete Example
See the full example in `docs/examples/identity_based_browsing.py` for a complete demonstration of creating and using profiles for authenticated browsing using the new `BrowserProfiler` class.
---
## 7. Summary
- **Create** your user-data directory either:
- By launching Chrome/Chromium externally with `--user-data-dir=/some/path`
- Or by using the built-in `BrowserProfiler.create_profile()` method
- Or through the interactive interface with `profiler.interactive_manager()`
- **Log in** or configure sites as needed, then close the browser
- **Reference** that folder in `BrowserConfig(user_data_dir="...")` + `use_managed_browser=True`
- **List and reuse** profiles with `BrowserProfiler.list_profiles()`
- **Manage** your profiles with the dedicated `BrowserProfiler` class
- Enjoy **persistent** sessions that reflect your real identity
- If you only need quick, ephemeral automation, **Magic Mode** might suffice
**Recommended**: Always prefer a **Managed Browser** for robust, identity-based crawling and simpler interactions with complex sites. Use **Magic Mode** for quick tasks or prototypes where persistent data is unnecessary.

View File

@@ -71,7 +71,8 @@ We group them by category.
| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). |
| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. |
| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. |
| **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. |
| **`excluded_tags`** | `list` (None) | Removes entire tags (e.g. `["script", "style"]`). |
| **`excluded_selector`** | `str` (None) | Like `css_selector` but to exclude. E.g. `"#ads, .tracker"`. |
| **`only_text`** | `bool` (False) | If `True`, tries to extract text-only content. |
@@ -246,8 +247,8 @@ run_config = CrawlerRunConfig(
)
```
# 3. **LlmConfig** - Setting up LLM providers
LlmConfig is useful to pass LLM provider config to strategies and functions that rely on LLMs to do extraction, filtering, schema generation etc. Currently it can be used in the following -
# 3. **LLMConfig** - Setting up LLM providers
LLMConfig is useful to pass LLM provider config to strategies and functions that rely on LLMs to do extraction, filtering, schema generation etc. Currently it can be used in the following -
1. LLMExtractionStrategy
2. LLMContentFilter
@@ -263,7 +264,7 @@ LlmConfig is useful to pass LLM provider config to strategies and functions that
## 3.2 Example Usage
```python
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
```
## 4. Putting It All Together
@@ -271,7 +272,7 @@ llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI
- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.
- **Use** `CrawlerRunConfig` for each crawls **context**: how to filter content, handle caching, wait for dynamic elements, or run JS.
- **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`).
- **Use** `LlmConfig` for LLM provider configurations that can be used across all extraction, filtering, schema generation tasks. Can be used in - `LLMExtractionStrategy`, `LLMContentFilter`, `JsonCssExtractionStrategy.generate_schema` & `JsonXPathExtractionStrategy.generate_schema`
- **Use** `LLMConfig` for LLM provider configurations that can be used across all extraction, filtering, schema generation tasks. Can be used in - `LLMExtractionStrategy`, `LLMContentFilter`, `JsonCssExtractionStrategy.generate_schema` & `JsonXPathExtractionStrategy.generate_schema`
```python
# Create a modified copy with the clone() method

View File

@@ -131,7 +131,7 @@ OverlappingWindowChunking(
```python
from pydantic import BaseModel
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai.async_configs import LlmConfig
from crawl4ai import LLMConfig
# Define schema
class Article(BaseModel):
@@ -141,7 +141,7 @@ class Article(BaseModel):
# Create strategy
strategy = LLMExtractionStrategy(
llmConfig = LlmConfig(provider="ollama/llama2"),
llm_config = LLMConfig(provider="ollama/llama2"),
schema=Article.schema(),
instruction="Extract article details"
)
@@ -198,7 +198,7 @@ result = await crawler.arun(
```python
from crawl4ai.chunking_strategy import OverlappingWindowChunking
from crawl4ai.async_configs import LlmConfig
from crawl4ai import LLMConfig
# Create chunking strategy
chunker = OverlappingWindowChunking(
@@ -208,7 +208,7 @@ chunker = OverlappingWindowChunking(
# Use with extraction strategy
strategy = LLMExtractionStrategy(
llmConfig = LlmConfig(provider="ollama/llama2"),
llm_config = LLMConfig(provider="ollama/llama2"),
chunking_strategy=chunker
)

View File

@@ -16,7 +16,7 @@ My dear friends and crawlers, there you go, this is the release of Crawl4AI v0.5
* **Multiple Crawler Strategies:** Choose between the full-featured Playwright browser-based crawler or a new, *much* faster HTTP-only crawler for simpler tasks.
* **Docker Deployment:** Deploy Crawl4AI as a scalable, self-contained service with built-in API endpoints and optional JWT authentication.
* **Command-Line Interface (CLI):** Interact with Crawl4AI directly from your terminal. Crawl, configure, and extract data with simple commands.
* **LLM Configuration (`LlmConfig`):** A new, unified way to configure LLM providers (OpenAI, Anthropic, Ollama, etc.) for extraction, filtering, and schema generation. Simplifies API key management and switching between models.
* **LLM Configuration (`LLMConfig`):** A new, unified way to configure LLM providers (OpenAI, Anthropic, Ollama, etc.) for extraction, filtering, and schema generation. Simplifies API key management and switching between models.
**Minor Updates & Improvements:**
@@ -47,7 +47,7 @@ This release includes several breaking changes to improve the library's structur
* **Config**: FastFilterChain has been replaced with FilterChain
* **Deep-Crawl**: DeepCrawlStrategy.arun now returns Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
* **Proxy**: Removed synchronous WebCrawler support and related rate limiting configurations
* **LLM Parameters:** Use the new `LlmConfig` object instead of passing `provider`, `api_token`, `base_url`, and `api_base` directly to `LLMExtractionStrategy` and `LLMContentFilter`.
* **LLM Parameters:** Use the new `LLMConfig` object instead of passing `provider`, `api_token`, `base_url`, and `api_base` directly to `LLMExtractionStrategy` and `LLMContentFilter`.
**In short:** Update imports, adjust `arun_many()` usage, check for optional fields, and review the Docker deployment guide.

View File

@@ -251,7 +251,7 @@ from crawl4ai import (
RoundRobinProxyStrategy,
)
import asyncio
from crawl4ai.configs import ProxyConfig
from crawl4ai.proxy_strategy import ProxyConfig
async def main():
# Load proxies and create rotation strategy
proxies = ProxyConfig.from_env()
@@ -305,13 +305,13 @@ asyncio.run(main())
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai.async_configs import LlmConfig
from crawl4ai import LLMConfig
import asyncio
llm_config = LlmConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
markdown_generator = DefaultMarkdownGenerator(
content_filter=LLMContentFilter(llmConfig=llm_config, instruction="Extract key concepts and summaries")
content_filter=LLMContentFilter(llm_config=llm_config, instruction="Extract key concepts and summaries")
)
config = CrawlerRunConfig(markdown_generator=markdown_generator)
@@ -335,13 +335,13 @@ asyncio.run(main())
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.async_configs import LlmConfig
from crawl4ai import LLMConfig
llm_config = LlmConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
schema = JsonCssExtractionStrategy.generate_schema(
html="<div class='product'><h2>Product Name</h2><span class='price'>$99</span></div>",
llmConfig = llm_config,
llm_config = llm_config,
query="Extract product name and price"
)
print(schema)
@@ -394,20 +394,20 @@ print(schema)
serialization, especially for sets of allowed/blocked domains. No code changes
required.
- **Added: New `LlmConfig` parameter.** This new parameter can be passed for
- **Added: New `LLMConfig` parameter.** This new parameter can be passed for
extraction, filtering, and schema generation tasks. It simplifies passing
provider strings, API tokens, and base URLs across all sections where LLM
configuration is necessary. It also enables reuse and allows for quick
experimentation between different LLM configurations.
```python
from crawl4ai.async_configs import LlmConfig
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
# Example of using LlmConfig with LLMExtractionStrategy
llm_config = LlmConfig(provider="openai/gpt-4o", api_token="YOUR_API_KEY")
strategy = LLMExtractionStrategy(llmConfig=llm_config, schema=...)
# Example of using LLMConfig with LLMExtractionStrategy
llm_config = LLMConfig(provider="openai/gpt-4o", api_token="YOUR_API_KEY")
strategy = LLMExtractionStrategy(llm_config=llm_config, schema=...)
# Example usage within a crawler
async with AsyncWebCrawler() as crawler:
@@ -418,7 +418,7 @@ print(schema)
```
**Breaking Change:** Removed old parameters like `provider`, `api_token`,
`base_url`, and `api_base` from `LLMExtractionStrategy` and
`LLMContentFilter`. Users should migrate to using the `LlmConfig` object.
`LLMContentFilter`. Users should migrate to using the `LLMConfig` object.
- **Changed: Improved browser context management and added shared data support.
(Breaking Change:** `BrowserContext` API updated). Browser contexts are now

View File

@@ -4,7 +4,7 @@ Crawl4AIs flexibility stems from two key classes:
1. **`BrowserConfig`** Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent).
2. **`CrawlerRunConfig`** Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.).
3. **`LlmConfig`** - Dictates **how** LLM providers are configured. (model, api token, base url, temperature etc.)
3. **`LLMConfig`** - Dictates **how** LLM providers are configured. (model, api token, base url, temperature etc.)
In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md).
@@ -239,7 +239,7 @@ The `clone()` method:
## 3. LlmConfig Essentials
## 3. LLMConfig Essentials
### Key fields to note
@@ -256,16 +256,16 @@ The `clone()` method:
- If your provider has a custom endpoint
```python
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
```
## 4. Putting It All Together
In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LlmConfig` depending on each calls needs:
In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` depending on each calls needs:
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
async def main():
@@ -289,14 +289,14 @@ async def main():
# 3) Example LLM content filtering
gemini_config = LlmConfig(
gemini_config = LLMConfig(
provider="gemini/gemini-1.5-pro"
api_token = "env:GEMINI_API_TOKEN"
)
# Initialize LLM filter with specific instruction
filter = LLMContentFilter(
llmConfig=gemini_config, # or your preferred provider
llm_config=gemini_config, # or your preferred provider
instruction="""
Focus on extracting the core educational content.
Include:
@@ -343,7 +343,7 @@ if __name__ == "__main__":
For a **detailed list** of available parameters (including advanced ones), see:
- [BrowserConfig, CrawlerRunConfig & LlmConfig Reference](../api/parameters.md)
- [BrowserConfig, CrawlerRunConfig & LLMConfig Reference](../api/parameters.md)
You can explore topics like:
@@ -356,7 +356,7 @@ You can explore topics like:
## 6. Conclusion
**BrowserConfig**, **CrawlerRunConfig** and **LlmConfig** give you straightforward ways to define:
**BrowserConfig**, **CrawlerRunConfig** and **LLMConfig** give you straightforward ways to define:
- **Which** browser to launch, how it should run, and any proxy or user agent needs.
- **How** each crawl should behave—caching, timeouts, JavaScript code, extraction strategies, etc.

View File

@@ -8,6 +8,10 @@ Below, we show how to configure these parameters and combine them for precise co
## 1. CSS-Based Selection
There are two ways to select content from a page: using `css_selector` or the more flexible `target_elements`.
### 1.1 Using `css_selector`
A straightforward way to **limit** your crawl results to a certain region of the page is **`css_selector`** in **`CrawlerRunConfig`**:
```python
@@ -32,6 +36,33 @@ if __name__ == "__main__":
**Result**: Only elements matching that selector remain in `result.cleaned_html`.
### 1.2 Using `target_elements`
The `target_elements` parameter provides more flexibility by allowing you to target **multiple elements** for content extraction while preserving the entire page context for other features:
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
async def main():
config = CrawlerRunConfig(
# Target article body and sidebar, but not other content
target_elements=["article.main-content", "aside.sidebar"]
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com/blog-post",
config=config
)
print("Markdown focused on target elements")
print("Links from entire page still available:", len(result.links.get("internal", [])))
if __name__ == "__main__":
asyncio.run(main())
```
**Key difference**: With `target_elements`, the markdown generation and structural data extraction focus on those elements, but other page elements (like links, images, and tables) are still extracted from the entire page. This gives you fine-grained control over what appears in your markdown content while preserving full page context for link analysis and media collection.
---
## 2. Content Filtering & Exclusions
@@ -211,7 +242,7 @@ if __name__ == "__main__":
import asyncio
import json
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LlmConfig
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
class ArticleData(BaseModel):
@@ -220,7 +251,7 @@ class ArticleData(BaseModel):
async def main():
llm_strategy = LLMExtractionStrategy(
llmConfig = LlmConfig(provider="openai/gpt-4",api_token="sk-YOUR_API_KEY")
llm_config = LLMConfig(provider="openai/gpt-4",api_token="sk-YOUR_API_KEY")
schema=ArticleData.schema(),
extraction_type="schema",
instruction="Extract 'headline' and a short 'summary' from the content."
@@ -404,15 +435,59 @@ Stick to BeautifulSoup strategy (default) when:
---
## 7. Conclusion
## 7. Combining CSS Selection Methods
By mixing **css_selector** scoping, **content filtering** parameters, and advanced **extraction strategies**, you can precisely **choose** which data to keep. Key parameters in **`CrawlerRunConfig`** for content selection include:
You can combine `css_selector` and `target_elements` in powerful ways to achieve fine-grained control over your output:
1. **`css_selector`** Basic scoping to an element or region.
2. **`word_count_threshold`** Skip short blocks.
3. **`excluded_tags`** Remove entire HTML tags.
4. **`exclude_external_links`**, **`exclude_social_media_links`**, **`exclude_domains`** Filter out unwanted links or domains.
5. **`exclude_external_images`** Remove images from external sources.
6. **`process_iframes`** Merge iframe content if needed.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
async def main():
# Target specific content but preserve page context
config = CrawlerRunConfig(
# Focus markdown on main content and sidebar
target_elements=["#main-content", ".sidebar"],
# Global filters applied to entire page
excluded_tags=["nav", "footer", "header"],
exclude_external_links=True,
# Use basic content thresholds
word_count_threshold=15,
cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com/article",
config=config
)
print(f"Content focuses on specific elements, but all links still analyzed")
print(f"Internal links: {len(result.links.get('internal', []))}")
print(f"External links: {len(result.links.get('external', []))}")
if __name__ == "__main__":
asyncio.run(main())
```
This approach gives you the best of both worlds:
- Markdown generation and content extraction focus on the elements you care about
- Links, images and other page data still give you the full context of the page
- Content filtering still applies globally
## 8. Conclusion
By mixing **target_elements** or **css_selector** scoping, **content filtering** parameters, and advanced **extraction strategies**, you can precisely **choose** which data to keep. Key parameters in **`CrawlerRunConfig`** for content selection include:
1. **`target_elements`** Array of CSS selectors to focus markdown generation and data extraction, while preserving full page context for links and media.
2. **`css_selector`** Basic scoping to an element or region for all extraction processes.
3. **`word_count_threshold`** Skip short blocks.
4. **`excluded_tags`** Remove entire HTML tags.
5. **`exclude_external_links`**, **`exclude_social_media_links`**, **`exclude_domains`** Filter out unwanted links or domains.
6. **`exclude_external_images`** Remove images from external sources.
7. **`process_iframes`** Merge iframe content if needed.
Combine these with structured extraction (CSS, LLM-based, or others) to build powerful crawls that yield exactly the content you want, from raw or cleaned HTML up to sophisticated JSON structures. For more detail, see [Configuration Reference](../api/parameters.md). Enjoy curating your data to the max!

View File

@@ -73,12 +73,18 @@ from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
strategy = BFSDeepCrawlStrategy(
max_depth=2, # Crawl initial page + 2 levels deep
include_external=False, # Stay within the same domain
max_pages=50, # Maximum number of pages to crawl (optional)
score_threshold=0.3, # Minimum score for URLs to be crawled (optional)
)
```
**Key parameters:**
- **`max_depth`**: Number of levels to crawl beyond the starting page
- **`include_external`**: Whether to follow links to other domains
- **`max_pages`**: Maximum number of pages to crawl (default: infinite)
- **`score_threshold`**: Minimum score for URLs to be crawled (default: -inf)
- **`filter_chain`**: FilterChain instance for URL filtering
- **`url_scorer`**: Scorer instance for evaluating URLs
### 2.2 DFSDeepCrawlStrategy (Depth-First Search)
@@ -91,12 +97,18 @@ from crawl4ai.deep_crawling import DFSDeepCrawlStrategy
strategy = DFSDeepCrawlStrategy(
max_depth=2, # Crawl initial page + 2 levels deep
include_external=False, # Stay within the same domain
max_pages=30, # Maximum number of pages to crawl (optional)
score_threshold=0.5, # Minimum score for URLs to be crawled (optional)
)
```
**Key parameters:**
- **`max_depth`**: Number of levels to crawl beyond the starting page
- **`include_external`**: Whether to follow links to other domains
- **`max_pages`**: Maximum number of pages to crawl (default: infinite)
- **`score_threshold`**: Minimum score for URLs to be crawled (default: -inf)
- **`filter_chain`**: FilterChain instance for URL filtering
- **`url_scorer`**: Scorer instance for evaluating URLs
### 2.3 BestFirstCrawlingStrategy (⭐️ - Recommended Deep crawl strategy)
@@ -116,7 +128,8 @@ scorer = KeywordRelevanceScorer(
strategy = BestFirstCrawlingStrategy(
max_depth=2,
include_external=False,
url_scorer=scorer
url_scorer=scorer,
max_pages=25, # Maximum number of pages to crawl (optional)
)
```
@@ -124,6 +137,8 @@ This crawling approach:
- Evaluates each discovered URL based on scorer criteria
- Visits higher-scoring pages first
- Helps focus crawl resources on the most relevant content
- Can limit total pages crawled with `max_pages`
- Does not need `score_threshold` as it naturally prioritizes by score
---
@@ -410,27 +425,64 @@ if __name__ == "__main__":
---
## 8. Common Pitfalls & Tips
## 8. Limiting and Controlling Crawl Size
1.**Set realistic depth limits.** Be cautious with `max_depth` values > 3, which can exponentially increase crawl size.
### 8.1 Using max_pages
You can limit the total number of pages crawled with the `max_pages` parameter:
```python
# Limit to exactly 20 pages regardless of depth
strategy = BFSDeepCrawlStrategy(
max_depth=3,
max_pages=20
)
```
This feature is useful for:
- Controlling API costs
- Setting predictable execution times
- Focusing on the most important content
- Testing crawl configurations before full execution
### 8.2 Using score_threshold
For BFS and DFS strategies, you can set a minimum score threshold to only crawl high-quality pages:
```python
# Only follow links with scores above 0.4
strategy = DFSDeepCrawlStrategy(
max_depth=2,
url_scorer=KeywordRelevanceScorer(keywords=["api", "guide", "reference"]),
score_threshold=0.4 # Skip URLs with scores below this value
)
```
Note that for BestFirstCrawlingStrategy, score_threshold is not needed since pages are already processed in order of highest score first.
## 9. Common Pitfalls & Tips
1.**Set realistic limits.** Be cautious with `max_depth` values > 3, which can exponentially increase crawl size. Use `max_pages` to set hard limits.
2.**Don't neglect the scoring component.** BestFirstCrawling works best with well-tuned scorers. Experiment with keyword weights for optimal prioritization.
3.**Be a good web citizen.** Respect robots.txt. (disabled by default)
4.**Handle page errors gracefully.** Not all pages will be accessible. Check `result.status` when processing results.
4.**Handle page errors gracefully.** Not all pages will be accessible. Check `result.success` and `result.error_message` when processing results.
5.**Balance breadth vs. depth.** Choose your strategy wisely - BFS for comprehensive coverage, DFS for deep exploration, BestFirst for focused relevance-based crawling.
---
## 9. Summary & Next Steps
## 10. Summary & Next Steps
In this **Deep Crawling with Crawl4AI** tutorial, you learned to:
- Configure **BFSDeepCrawlStrategy** and **BestFirstCrawlingStrategy**
- Configure **BFSDeepCrawlStrategy**, **DFSDeepCrawlStrategy**, and **BestFirstCrawlingStrategy**
- Process results in streaming or non-streaming mode
- Apply filters to target specific content
- Use scorers to prioritize the most relevant pages
- Limit crawls with `max_pages` and `score_threshold` parameters
- Build a complete advanced crawler with combined techniques
With these tools, you can efficiently extract structured data from websites at scale, focusing precisely on the content you need for your specific use case.

View File

@@ -133,19 +133,28 @@ This approach is handy when you still want external links but need to block cert
### 3.1 Accessing `result.media`
By default, Crawl4AI collects images, audio, and video URLs it finds on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audio`).
By default, Crawl4AI collects images, audio, video URLs, and data tables it finds on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audio`, `tables`).
**Basic Example**:
```python
if result.success:
# Get images
images_info = result.media.get("images", [])
print(f"Found {len(images_info)} images in total.")
for i, img in enumerate(images_info[:5]): # Inspect just the first 5
for i, img in enumerate(images_info[:3]): # Inspect just the first 3
print(f"[Image {i}] URL: {img['src']}")
print(f" Alt text: {img.get('alt', '')}")
print(f" Score: {img.get('score')}")
print(f" Description: {img.get('desc', '')}\n")
# Get tables
tables = result.media.get("tables", [])
print(f"Found {len(tables)} data tables in total.")
for i, table in enumerate(tables):
print(f"[Table {i}] Caption: {table.get('caption', 'No caption')}")
print(f" Columns: {len(table.get('headers', []))}")
print(f" Rows: {len(table.get('rows', []))}")
```
**Structure Example**:
@@ -171,6 +180,19 @@ result.media = {
],
"audio": [
# Similar structure but with audio-specific fields
],
"tables": [
{
"headers": ["Name", "Age", "Location"],
"rows": [
["John Doe", "34", "New York"],
["Jane Smith", "28", "San Francisco"],
["Alex Johnson", "42", "Chicago"]
],
"caption": "Employee Directory",
"summary": "Directory of company employees"
},
# More tables if present
]
}
```
@@ -199,7 +221,53 @@ crawler_cfg = CrawlerRunConfig(
This setting attempts to discard images from outside the primary domain, keeping only those from the site youre crawling.
### 3.3 Additional Media Config
### 3.3 Working with Tables
Crawl4AI can detect and extract structured data from HTML tables. Tables are analyzed based on various criteria to determine if they are actual data tables (as opposed to layout tables), including:
- Presence of thead and tbody sections
- Use of th elements for headers
- Column consistency
- Text density
- And other factors
Tables that score above the threshold (default: 7) are extracted and stored in `result.media.tables`.
**Accessing Table Data**:
```python
if result.success:
tables = result.media.get("tables", [])
print(f"Found {len(tables)} data tables on the page")
if tables:
# Access the first table
first_table = tables[0]
print(f"Table caption: {first_table.get('caption', 'No caption')}")
print(f"Headers: {first_table.get('headers', [])}")
# Print the first 3 rows
for i, row in enumerate(first_table.get('rows', [])[:3]):
print(f"Row {i+1}: {row}")
```
**Configuring Table Extraction**:
You can adjust the sensitivity of the table detection algorithm with:
```python
crawler_cfg = CrawlerRunConfig(
table_score_threshold=5 # Lower value = more tables detected (default: 7)
)
```
Each extracted table contains:
- `headers`: Column header names
- `rows`: List of rows, each containing cell values
- `caption`: Table caption text (if available)
- `summary`: Table summary attribute (if specified)
### 3.4 Additional Media Config
- **`screenshot`**: Set to `True` if you want a full-page screenshot stored as `base64` in `result.screenshot`.
- **`pdf`**: Set to `True` if you want a PDF version of the page in `result.pdf`.
@@ -273,4 +341,11 @@ if __name__ == "__main__":
---
**Thats it for Link & Media Analysis!** Youre now equipped to filter out unwanted sites and zero in on the images and videos that matter for your project.
**Thats it for Link & Media Analysis!** Youre now equipped to filter out unwanted sites and zero in on the images and videos that matter for your project.
### Table Extraction Tips
- Not all HTML tables are extracted - only those detected as "data tables" vs. layout tables.
- Tables with inconsistent cell counts, nested tables, or those used purely for layout may be skipped.
- If you're missing tables, try adjusting the `table_score_threshold` to a lower value (default is 7).
The table detection algorithm scores tables based on features like consistent columns, presence of headers, text density, and more. Tables scoring above the threshold are considered data tables worth extracting.

View File

@@ -175,13 +175,13 @@ prune_filter = PruningContentFilter(
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LlmConfig
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
from crawl4ai.content_filter_strategy import LLMContentFilter
async def main():
# Initialize LLM filter with specific instruction
filter = LLMContentFilter(
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-api-token"), #or use environment variable
llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-api-token"), #or use environment variable
instruction="""
Focus on extracting the core educational content.
Include:

View File

@@ -128,7 +128,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
from crawl4ai.async_configs import LlmConfig
from crawl4ai import LLMConfig
# Generate a schema (one-time cost)
html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>"
@@ -136,13 +136,13 @@ html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</
# Using OpenAI (requires API token)
schema = JsonCssExtractionStrategy.generate_schema(
html,
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-openai-token") # Required for OpenAI
llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-openai-token") # Required for OpenAI
)
# Or using Ollama (open source, no token needed)
schema = JsonCssExtractionStrategy.generate_schema(
html,
llmConfig = LlmConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
)
# Use the schema for fast, repeated extractions
@@ -211,7 +211,7 @@ import os
import json
import asyncio
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LlmConfig
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
class OpenAIModelFee(BaseModel):
@@ -241,7 +241,7 @@ async def extract_structured_data_using_llm(
word_count_threshold=1,
page_timeout=80000,
extraction_strategy=LLMExtractionStrategy(
llmConfig = LlmConfig(provider=provider,api_token=api_token),
llm_config = LLMConfig(provider=provider,api_token=api_token),
schema=OpenAIModelFee.model_json_schema(),
extraction_type="schema",
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.

View File

@@ -76,7 +76,7 @@ Below is an overview of important LLM extraction parameters. All are typically s
```python
extraction_strategy = LLMExtractionStrategy(
llmConfig = LlmConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"),
llm_config = LLMConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"),
schema=MyModel.model_json_schema(),
extraction_type="schema",
instruction="Extract a list of items from the text with 'name' and 'price' fields.",
@@ -101,7 +101,7 @@ import asyncio
import json
from pydantic import BaseModel, Field
from typing import List
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
class Product(BaseModel):
@@ -111,7 +111,7 @@ class Product(BaseModel):
async def main():
# 1. Define the LLM extraction strategy
llm_strategy = LLMExtractionStrategy(
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
schema=Product.schema_json(), # Or use model_json_schema()
extraction_type="schema",
instruction="Extract all product objects with 'name' and 'price' from the content.",

View File

@@ -415,7 +415,7 @@ The schema generator is available as a static method on both `JsonCssExtractionS
```python
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
from crawl4ai.async_configs import LlmConfig
from crawl4ai import LLMConfig
# Sample HTML with product information
html = """
@@ -435,14 +435,14 @@ html = """
css_schema = JsonCssExtractionStrategy.generate_schema(
html,
schema_type="css",
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-openai-token")
llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-openai-token")
)
# Option 2: Using Ollama (open source, no token needed)
xpath_schema = JsonXPathExtractionStrategy.generate_schema(
html,
schema_type="xpath",
llmConfig = LlmConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
)
# Use the generated schema for fast, repeated extractions

View File

@@ -0,0 +1,78 @@
import asyncio
from typing import List
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
BFSDeepCrawlStrategy,
CrawlResult,
FilterChain,
DomainFilter,
URLPatternFilter,
)
# Import necessary classes from crawl4ai library:
# - AsyncWebCrawler: The main class for web crawling.
# - CrawlerRunConfig: Configuration class for crawler behavior.
# - BFSDeepCrawlStrategy: Breadth-First Search deep crawling strategy.
# - CrawlResult: Data model for individual crawl results.
# - FilterChain: Used to chain multiple URL filters.
# - URLPatternFilter: Filter URLs based on patterns.
# You had from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, which is also correct,
# but for simplicity and consistency, we will use the direct import from crawl4ai in this example, as it is re-exported in __init__.py
async def basic_deep_crawl():
"""
Performs a basic deep crawl starting from a seed URL, demonstrating:
- Breadth-First Search (BFS) deep crawling strategy.
- Filtering URLs based on URL patterns.
- Accessing crawl results and metadata.
"""
# 1. Define URL Filters:
# Create a URLPatternFilter to include only URLs containing "text".
# This filter will be used to restrict crawling to URLs that are likely to contain textual content.
url_filter = URLPatternFilter(
patterns=[
"*text*", # Include URLs that contain "text" in their path or URL
]
)
# Create a DomainFilter to allow only URLs from the "groq.com" domain and block URLs from the "example.com" domain.
# This filter will be used to restrict crawling to URLs within the "groq.com" domain.
domain_filter = DomainFilter(
allowed_domains=["groq.com"],
blocked_domains=["example.com"],
)
# 2. Configure CrawlerRunConfig for Deep Crawling:
# Configure CrawlerRunConfig to use BFSDeepCrawlStrategy for deep crawling.
config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
max_depth=2, # Set the maximum depth of crawling to 2 levels from the start URL
max_pages=10, # Limit the total number of pages to crawl to 10, to prevent excessive crawling
include_external=False, # Set to False to only crawl URLs within the same domain as the start URL
filter_chain=FilterChain(filters=[url_filter, domain_filter]), # Apply the URLPatternFilter and DomainFilter to filter URLs during deep crawl
),
verbose=True, # Enable verbose logging to see detailed output during crawling
)
# 3. Initialize and Run AsyncWebCrawler:
# Use AsyncWebCrawler as a context manager for automatic start and close.
async with AsyncWebCrawler() as crawler:
results: List[CrawlResult] = await crawler.arun(
# url="https://docs.crawl4ai.com", # Uncomment to use crawl4ai documentation as start URL
url="https://console.groq.com/docs", # Set the start URL for deep crawling to Groq documentation
config=config, # Pass the configured CrawlerRunConfig to arun method
)
# 4. Process and Print Crawl Results:
# Iterate through the list of CrawlResult objects returned by the deep crawl.
for result in results:
# Print the URL and its crawl depth from the metadata for each crawled URL.
print(f"URL: {result.url}, Depth: {result.metadata.get('depth', 0)}")
if __name__ == "__main__":
import asyncio
asyncio.run(basic_deep_crawl())

View File

@@ -0,0 +1,162 @@
import asyncio
from typing import List
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
BFSDeepCrawlStrategy,
CrawlResult,
URLFilter, # Base class for filters, not directly used in examples but good to import for context
ContentTypeFilter,
DomainFilter,
FilterChain,
URLPatternFilter,
SEOFilter # Advanced filter, can be introduced later or as bonus
)
async def deep_crawl_filter_tutorial_part_2():
"""
Tutorial demonstrating URL filters in Crawl4AI, focusing on isolated filter behavior
before integrating them into a deep crawl.
This tutorial covers:
- Testing individual filters with synthetic URLs.
- Understanding filter logic and behavior in isolation.
- Combining filters using FilterChain.
- Integrating filters into a deep crawling example.
"""
# === Introduction: URL Filters in Isolation ===
print("\n" + "=" * 40)
print("=== Introduction: URL Filters in Isolation ===")
print("=" * 40 + "\n")
print("In this section, we will explore each filter individually using synthetic URLs.")
print("This allows us to understand exactly how each filter works before using them in a crawl.\n")
# === 2. ContentTypeFilter - Testing in Isolation ===
print("\n" + "=" * 40)
print("=== 2. ContentTypeFilter - Testing in Isolation ===")
print("=" * 40 + "\n")
# 2.1. Create ContentTypeFilter:
# Create a ContentTypeFilter to allow only 'text/html' and 'application/json' content types
# BASED ON URL EXTENSIONS.
content_type_filter = ContentTypeFilter(allowed_types=["text/html", "application/json"])
print("ContentTypeFilter created, allowing types (by extension): ['text/html', 'application/json']")
print("Note: ContentTypeFilter in Crawl4ai works by checking URL file extensions, not HTTP headers.")
# 2.2. Synthetic URLs for Testing:
# ContentTypeFilter checks URL extensions. We provide URLs with different extensions to test.
test_urls_content_type = [
"https://example.com/page.html", # Should pass: .html extension (text/html)
"https://example.com/data.json", # Should pass: .json extension (application/json)
"https://example.com/image.png", # Should reject: .png extension (not allowed type)
"https://example.com/document.pdf", # Should reject: .pdf extension (not allowed type)
"https://example.com/page", # Should pass: no extension (defaults to allow) - check default behaviour!
"https://example.com/page.xhtml", # Should pass: .xhtml extension (text/html)
]
# 2.3. Apply Filter and Show Results:
print("\n=== Testing ContentTypeFilter (URL Extension based) ===")
for url in test_urls_content_type:
passed = content_type_filter.apply(url)
result = "PASSED" if passed else "REJECTED"
extension = ContentTypeFilter._extract_extension(url) # Show extracted extension for clarity
print(f"- URL: {url} - {result} (Extension: '{extension or 'No Extension'}')")
print("=" * 40)
input("Press Enter to continue to DomainFilter example...")
# === 3. DomainFilter - Testing in Isolation ===
print("\n" + "=" * 40)
print("=== 3. DomainFilter - Testing in Isolation ===")
print("=" * 40 + "\n")
# 3.1. Create DomainFilter:
domain_filter = DomainFilter(allowed_domains=["crawl4ai.com", "example.com"])
print("DomainFilter created, allowing domains: ['crawl4ai.com', 'example.com']")
# 3.2. Synthetic URLs for Testing:
test_urls_domain = [
"https://docs.crawl4ai.com/api",
"https://example.com/products",
"https://another-website.org/blog",
"https://sub.example.com/about",
"https://crawl4ai.com.attacker.net", # Corrected example: now should be rejected
]
# 3.3. Apply Filter and Show Results:
print("\n=== Testing DomainFilter ===")
for url in test_urls_domain:
passed = domain_filter.apply(url)
result = "PASSED" if passed else "REJECTED"
print(f"- URL: {url} - {result}")
print("=" * 40)
input("Press Enter to continue to FilterChain example...")
# === 4. FilterChain - Combining Filters ===
print("\n" + "=" * 40)
print("=== 4. FilterChain - Combining Filters ===")
print("=" * 40 + "\n")
combined_filter = FilterChain(
filters=[
URLPatternFilter(patterns=["*api*"]),
ContentTypeFilter(allowed_types=["text/html"]), # Still URL extension based
DomainFilter(allowed_domains=["docs.crawl4ai.com"]),
]
)
print("FilterChain created, combining URLPatternFilter, ContentTypeFilter, and DomainFilter.")
test_urls_combined = [
"https://docs.crawl4ai.com/api/async-webcrawler",
"https://example.com/api/products",
"https://docs.crawl4ai.com/core/crawling",
"https://another-website.org/api/data",
]
# 4.3. Apply FilterChain and Show Results
print("\n=== Testing FilterChain (URLPatternFilter + ContentTypeFilter + DomainFilter) ===")
for url in test_urls_combined:
passed = await combined_filter.apply(url)
result = "PASSED" if passed else "REJECTED"
print(f"- URL: {url} - {result}")
print("=" * 40)
input("Press Enter to continue to Deep Crawl with FilterChain example...")
# === 5. Deep Crawl with FilterChain ===
print("\n" + "=" * 40)
print("=== 5. Deep Crawl with FilterChain ===")
print("=" * 40 + "\n")
print("Finally, let's integrate the FilterChain into a deep crawl example.")
config_final_crawl = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
max_depth=2,
max_pages=10,
include_external=False,
filter_chain=combined_filter
),
verbose=False,
)
async with AsyncWebCrawler() as crawler:
results_final_crawl: List[CrawlResult] = await crawler.arun(
url="https://docs.crawl4ai.com", config=config_final_crawl
)
print("=== Crawled URLs (Deep Crawl with FilterChain) ===")
for result in results_final_crawl:
print(f"- {result.url}, Depth: {result.metadata.get('depth', 0)}")
print("=" * 40)
print("\nTutorial Completed! Review the output of each section to understand URL filters.")
if __name__ == "__main__":
asyncio.run(deep_crawl_filter_tutorial_part_2())