diff --git a/README.md b/README.md index 97787b2f..02f01d03 100644 --- a/README.md +++ b/README.md @@ -352,7 +352,7 @@ if __name__ == "__main__": ```python import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy import json async def main(): @@ -426,7 +426,7 @@ if __name__ == "__main__": import os import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy from pydantic import BaseModel, Field class OpenAIModelFee(BaseModel): diff --git a/crawl4ai/crawlers/google_search/crawler.py b/crawl4ai/crawlers/google_search/crawler.py index e1288de1..18233417 100644 --- a/crawl4ai/crawlers/google_search/crawler.py +++ b/crawl4ai/crawlers/google_search/crawler.py @@ -1,7 +1,7 @@ from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode from crawl4ai.hub import BaseCrawler from crawl4ai.utils import optimize_html, get_home_folder, preprocess_html_for_schema -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from pathlib import Path import json import os diff --git a/deploy/docker/c4ai-code-context.md b/deploy/docker/c4ai-code-context.md index 1b250126..eb29b94c 100644 --- a/deploy/docker/c4ai-code-context.md +++ b/deploy/docker/c4ai-code-context.md @@ -7901,7 +7901,7 @@ from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import PruningContentFilter -from crawl4ai.extraction_strategy import ( +from crawl4ai import ( JsonCssExtractionStrategy, LLMExtractionStrategy, ) @@ -8301,7 +8301,7 @@ async def 
crawl_dynamic_content_pages_method_2(): async def cosine_similarity_extraction(): - from crawl4ai.extraction_strategy import CosineStrategy + from crawl4ai import CosineStrategy crawl_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, extraction_strategy=CosineStrategy( diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md index 77785cec..5137159e 100644 --- a/deploy/docker/c4ai-doc-context.md +++ b/deploy/docker/c4ai-doc-context.md @@ -354,7 +354,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi ```python import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def main(): # 1) Browser config: headless, bigger viewport, no proxy @@ -1042,7 +1042,7 @@ You can combine content selection with a more advanced extraction strategy. For import asyncio import json from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def main(): # Minimal schema for repeated items @@ -1094,7 +1094,7 @@ import asyncio import json from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class ArticleData(BaseModel): headline: str @@ -1139,7 +1139,7 @@ Below is a short function that unifies **CSS selection**, **exclusion** logic, a import asyncio import json from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def extract_main_articles(url: str): schema = { @@ -1488,7 +1488,7 @@ If you run a JSON-based extraction strategy (CSS, XPath, LLM, 
etc.), the structu import asyncio import json from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def main(): schema = { @@ -4722,7 +4722,7 @@ if __name__ == "__main__": Once dynamic content is loaded, you can attach an **`extraction_strategy`** (like `JsonCssExtractionStrategy` or `LLMExtractionStrategy`). For example: ```python -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy schema = { "name": "Commits", @@ -4902,7 +4902,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B > **New!** Crawl4AI now provides a powerful utility to automatically generate extraction schemas using LLM. This is a one-time cost that gives you a reusable schema for fast, LLM-free extractions: ```python -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai import LLMConfig # Generate a schema (one-time cost) @@ -4932,7 +4932,7 @@ Here's a basic extraction example: import asyncio import json from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def main(): schema = { @@ -4987,7 +4987,7 @@ import json import asyncio from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class OpenAIModelFee(BaseModel): model_name: str = Field(..., description="Name of the OpenAI model.") @@ -5103,7 +5103,7 @@ Some sites require multiple “page clicks” or dynamic JavaScript updates. 
Bel ```python import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def extract_structured_data_using_css_extractor(): print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") @@ -7300,7 +7300,7 @@ Here's an example of crawling GitHub commits across multiple pages while preserv ```python from crawl4ai.async_configs import CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai.cache_context import CacheMode async def crawl_dynamic_content(): @@ -7850,7 +7850,7 @@ The Cosine Strategy: ## Basic Usage ```python -from crawl4ai.extraction_strategy import CosineStrategy +from crawl4ai import CosineStrategy strategy = CosineStrategy( semantic_filter="product reviews", # Target content type @@ -8161,7 +8161,7 @@ import json from pydantic import BaseModel, Field from typing import List from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class Product(BaseModel): name: str @@ -8278,7 +8278,7 @@ import asyncio from typing import List from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class Entity(BaseModel): name: str @@ -8423,7 +8423,7 @@ Let’s begin with a **simple** schema-based extraction using the `JsonCssExtrac import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def extract_crypto_prices(): # 1. 
Define a simple extraction schema @@ -8493,7 +8493,7 @@ Below is a short example demonstrating **XPath** extraction plus the **`raw://`* import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy +from crawl4ai import JsonXPathExtractionStrategy async def extract_crypto_prices_xpath(): # 1. Minimal dummy HTML with some repeating rows @@ -8694,7 +8694,7 @@ Key Takeaways: import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy ecommerce_schema = { # ... the advanced schema from above ... @@ -8804,7 +8804,7 @@ While manually crafting schemas is powerful and precise, Crawl4AI now offers a c The schema generator is available as a static method on both `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. You can choose between OpenAI's GPT-4 or the open-source Ollama for schema generation: ```python -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy, JsonXPathExtractionStrategy from crawl4ai import LLMConfig # Sample HTML with product information diff --git a/docs/apps/iseeyou/llms-full.txt b/docs/apps/iseeyou/llms-full.txt new file mode 100644 index 00000000..12dad603 --- /dev/null +++ b/docs/apps/iseeyou/llms-full.txt @@ -0,0 +1,7715 @@ +# Crawl4AI + +> Open-source LLM-friendly web crawler and scraper for AI applications + +Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. Built with Python and Playwright for high-performance crawling with structured data extraction. 
+ +**Key Features:** +- Asynchronous crawling with high concurrency +- Multiple extraction strategies (CSS, XPath, LLM-based) +- Built-in markdown generation with content filtering +- Docker deployment with REST API +- Session management and browser automation +- Advanced anti-detection capabilities + +**Quick Links:** +- [GitHub Repository](https://github.com/unclecode/crawl4ai) +- [Documentation](https://docs.crawl4ai.com) +- [Examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) + +--- + + +## Installation + +Multiple installation options for different environments and use cases. + +### Basic Installation + +```bash +# Install core library +pip install crawl4ai + +# Initial setup (installs Playwright browsers) +crawl4ai-setup + +# Verify installation +crawl4ai-doctor +``` + +### Quick Verification + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com") + print(result.markdown[:300]) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**📖 Learn more:** [Basic Usage Guide](https://docs.crawl4ai.com/core/quickstart.md) + +### Advanced Features (Optional) + +```bash +# PyTorch-based features (text clustering, semantic chunking) +pip install crawl4ai[torch] +crawl4ai-setup + +# Transformers (Hugging Face models) +pip install crawl4ai[transformer] +crawl4ai-setup + +# All features (large download) +pip install crawl4ai[all] +crawl4ai-setup + +# Pre-download models (optional) +crawl4ai-download-models +``` + +**📖 Learn more:** [Advanced Features Documentation](https://docs.crawl4ai.com/extraction/llm-strategies.md) + +### Docker Deployment + +```bash +# Pull pre-built image (specify platform for consistency) +docker pull --platform linux/amd64 unclecode/crawl4ai:latest +# For ARM (M1/M2 Macs): docker pull --platform linux/arm64 unclecode/crawl4ai:latest + +# Setup environment for LLM support +cat > .llm.env << EOL 
+OPENAI_API_KEY=sk-your-key +ANTHROPIC_API_KEY=your-anthropic-key +EOL + +# Run with LLM support (specify platform) +docker run -d \ + --platform linux/amd64 \ + -p 11235:11235 \ + --name crawl4ai \ + --env-file .llm.env \ + --shm-size=1g \ + unclecode/crawl4ai:latest + +# For ARM Macs, use: --platform linux/arm64 + +# Basic run (no LLM) +docker run -d \ + --platform linux/amd64 \ + -p 11235:11235 \ + --name crawl4ai \ + --shm-size=1g \ + unclecode/crawl4ai:latest +``` + +**📖 Learn more:** [Complete Docker Guide](https://docs.crawl4ai.com/core/docker-deployment.md) + +### Docker Compose + +```bash +# Clone repository +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai + +# Copy environment template +cp deploy/docker/.llm.env.example .llm.env +# Edit .llm.env with your API keys + +# Run pre-built image +IMAGE=unclecode/crawl4ai:latest docker compose up -d + +# Build and run locally +docker compose up --build -d + +# Build with all features +INSTALL_TYPE=all docker compose up --build -d + +# Stop service +docker compose down +``` + +**📖 Learn more:** [Docker Compose Configuration](https://docs.crawl4ai.com/core/docker-deployment.md#option-2-using-docker-compose) + +### Manual Docker Build + +```bash +# Build multi-architecture image (specify platform) +docker buildx build --platform linux/amd64 -t crawl4ai-local:latest --load . +# For ARM: docker buildx build --platform linux/arm64 -t crawl4ai-local:latest --load . + +# Build with specific features +docker buildx build \ + --platform linux/amd64 \ + --build-arg INSTALL_TYPE=all \ + --build-arg ENABLE_GPU=false \ + -t crawl4ai-local:latest --load . 
+ +# Run custom build (specify platform) +docker run -d \ + --platform linux/amd64 \ + -p 11235:11235 \ + --name crawl4ai-custom \ + --env-file .llm.env \ + --shm-size=1g \ + crawl4ai-local:latest +``` + +**📖 Learn more:** [Manual Build Guide](https://docs.crawl4ai.com/core/docker-deployment.md#option-3-manual-local-build--run) + +### Google Colab + +```python +# Install in Colab +!pip install crawl4ai +!crawl4ai-setup + +# If setup fails, manually install Playwright browsers +!playwright install chromium + +# Install with all features (may take 5-10 minutes) +!pip install crawl4ai[all] +!crawl4ai-setup +!crawl4ai-download-models + +# If still having issues, force Playwright install +!playwright install chromium --force + +# Quick test +import asyncio +from crawl4ai import AsyncWebCrawler + +async def test_crawl(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com") + print("✅ Installation successful!") + print(f"Content length: {len(result.markdown)}") + +# Run test in Colab +await test_crawl() +``` + +**📖 Learn more:** [Colab Examples Notebook](https://colab.research.google.com/github/unclecode/crawl4ai/blob/main/docs/examples/quickstart.ipynb) + +### Docker API Usage + +```python +# Using Docker SDK +import asyncio +from crawl4ai.docker_client import Crawl4aiDockerClient +from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode + +async def main(): + async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client: + results = await client.crawl( + ["https://example.com"], + browser_config=BrowserConfig(headless=True), + crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + for result in results: + print(f"Success: {result.success}, Length: {len(result.markdown)}") + +asyncio.run(main()) +``` + +**📖 Learn more:** [Docker Client API](https://docs.crawl4ai.com/core/docker-deployment.md#python-sdk) + +### Direct API Calls + +```python +# REST API example +import requests + +payload = { + 
"urls": ["https://example.com"], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}} +} + +response = requests.post("http://localhost:11235/crawl", json=payload) +print(response.json()) +``` + +**📖 Learn more:** [REST API Reference](https://docs.crawl4ai.com/core/docker-deployment.md#rest-api-examples) + +### Health Check + +```bash +# Check Docker service +curl http://localhost:11235/health + +# Access playground +open http://localhost:11235/playground + +# View metrics +curl http://localhost:11235/metrics +``` + +**📖 Learn more:** [Monitoring & Metrics](https://docs.crawl4ai.com/core/docker-deployment.md#metrics--monitoring) +--- + + +## Simple Crawling + +Basic web crawling operations with AsyncWebCrawler, configurations, and response handling. + +### Basic Setup + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def main(): + browser_config = BrowserConfig() # Default browser settings + run_config = CrawlerRunConfig() # Default crawl settings + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://example.com", + config=run_config + ) + print(result.markdown) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### Understanding CrawlResult + +```python +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import PruningContentFilter + +config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.6), + options={"ignore_links": True} + ) +) + +result = await crawler.arun("https://example.com", config=config) + +# Different content formats +print(result.html) # Raw HTML +print(result.cleaned_html) # Cleaned HTML +print(result.markdown.raw_markdown) # Raw markdown +print(result.markdown.fit_markdown) # Filtered 
markdown + +# Status information +print(result.success) # True/False +print(result.status_code) # HTTP status (200, 404, etc.) + +# Extracted content +print(result.media) # Images, videos, audio +print(result.links) # Internal/external links +``` + +### Basic Configuration Options + +```python +run_config = CrawlerRunConfig( + word_count_threshold=10, # Min words per block + exclude_external_links=True, # Remove external links + remove_overlay_elements=True, # Remove popups/modals + process_iframes=True, # Process iframe content + excluded_tags=['form', 'header'] # Skip these tags +) + +result = await crawler.arun("https://example.com", config=run_config) +``` + +### Error Handling + +```python +result = await crawler.arun("https://example.com", config=run_config) + +if not result.success: + print(f"Crawl failed: {result.error_message}") + print(f"Status code: {result.status_code}") +else: + print(f"Success! Content length: {len(result.markdown)}") +``` + +### Debugging with Verbose Logging + +```python +browser_config = BrowserConfig(verbose=True) + +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://example.com") + # Detailed logging output will be displayed +``` + +### Complete Example + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode + +async def comprehensive_crawl(): + browser_config = BrowserConfig(verbose=True) + + run_config = CrawlerRunConfig( + # Content filtering + word_count_threshold=10, + excluded_tags=['form', 'header', 'nav'], + exclude_external_links=True, + + # Content processing + process_iframes=True, + remove_overlay_elements=True, + + # Cache control + cache_mode=CacheMode.ENABLED + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://example.com", + config=run_config + ) + + if result.success: + # Display content summary + print(f"Title: {result.metadata.get('title', 'No 
title')}") + print(f"Content: {result.markdown[:500]}...") + + # Process media + images = result.media.get("images", []) + print(f"Found {len(images)} images") + for img in images[:3]: # First 3 images + print(f" - {img.get('src', 'No src')}") + + # Process links + internal_links = result.links.get("internal", []) + print(f"Found {len(internal_links)} internal links") + for link in internal_links[:3]: # First 3 links + print(f" - {link.get('href', 'No href')}") + + else: + print(f"❌ Crawl failed: {result.error_message}") + print(f"Status: {result.status_code}") + +if __name__ == "__main__": + asyncio.run(comprehensive_crawl()) +``` + +### Working with Raw HTML and Local Files + +```python +# Crawl raw HTML +raw_html = "

<html><body><h1>Test</h1><p>Content</p></body></html>

" +result = await crawler.arun(f"raw://{raw_html}") + +# Crawl local file +result = await crawler.arun("file:///path/to/local/file.html") + +# Both return standard CrawlResult objects +print(result.markdown) +``` + +## Table Extraction + +Extract structured data from HTML tables with automatic detection and scoring. + +### Basic Table Extraction + +```python +import asyncio +import pandas as pd +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +async def extract_tables(): + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + table_score_threshold=7, # Higher = stricter detection + cache_mode=CacheMode.BYPASS + ) + + result = await crawler.arun("https://example.com/tables", config=config) + + if result.success and result.tables: + # New tables field (v0.6+) + for i, table in enumerate(result.tables): + print(f"Table {i+1}:") + print(f"Headers: {table['headers']}") + print(f"Rows: {len(table['rows'])}") + print(f"Caption: {table.get('caption', 'No caption')}") + + # Convert to DataFrame + df = pd.DataFrame(table['rows'], columns=table['headers']) + print(df.head()) + +asyncio.run(extract_tables()) +``` + +### Advanced Table Processing + +```python +from crawl4ai import LXMLWebScrapingStrategy + +async def process_financial_tables(): + config = CrawlerRunConfig( + table_score_threshold=8, # Strict detection for data tables + scraping_strategy=LXMLWebScrapingStrategy(), + keep_data_attributes=True, + scan_full_page=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://coinmarketcap.com", config=config) + + if result.tables: + # Get the main data table (usually first/largest) + main_table = result.tables[0] + + # Create DataFrame + df = pd.DataFrame( + main_table['rows'], + columns=main_table['headers'] + ) + + # Clean and process data + df = clean_financial_data(df) + + # Save for analysis + df.to_csv("market_data.csv", index=False) + return df + +def clean_financial_data(df): + """Clean 
currency symbols, percentages, and large numbers""" + for col in df.columns: + if 'price' in col.lower(): + # Remove currency symbols + df[col] = df[col].str.replace(r'[^\d.]', '', regex=True) + df[col] = pd.to_numeric(df[col], errors='coerce') + + elif '%' in str(df[col].iloc[0]): + # Convert percentages + df[col] = df[col].str.replace('%', '').astype(float) / 100 + + elif any(suffix in str(df[col].iloc[0]) for suffix in ['B', 'M', 'K']): + # Handle large numbers (Billions, Millions, etc.) + df[col] = df[col].apply(convert_large_numbers) + + return df + +def convert_large_numbers(value): + """Convert 1.5B -> 1500000000""" + if pd.isna(value): + return float('nan') + + value = str(value) + multiplier = 1 + if 'B' in value: + multiplier = 1e9 + elif 'M' in value: + multiplier = 1e6 + elif 'K' in value: + multiplier = 1e3 + + number = float(re.sub(r'[^\d.]', '', value)) + return number * multiplier +``` + +### Table Detection Configuration + +```python +# Strict table detection (data-heavy pages) +strict_config = CrawlerRunConfig( + table_score_threshold=9, # Only high-quality tables + word_count_threshold=5, # Ignore sparse content + excluded_tags=['nav', 'footer'] # Skip navigation tables +) + +# Lenient detection (mixed content pages) +lenient_config = CrawlerRunConfig( + table_score_threshold=5, # Include layout tables + process_iframes=True, # Check embedded tables + scan_full_page=True # Scroll to load dynamic tables +) + +# Financial/data site optimization +financial_config = CrawlerRunConfig( + table_score_threshold=8, + scraping_strategy=LXMLWebScrapingStrategy(), + wait_for="css:table", # Wait for tables to load + scan_full_page=True, + scroll_delay=0.2 +) +``` + +### Multi-Table Processing + +```python +async def extract_all_tables(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/data", config=config) + + tables_data = {} + + for i, table in enumerate(result.tables): + # Create meaningful names based on 
content + table_name = ( + table.get('caption') or + f"table_{i+1}_{table['headers'][0]}" + ).replace(' ', '_').lower() + + df = pd.DataFrame(table['rows'], columns=table['headers']) + + # Store with metadata + tables_data[table_name] = { + 'dataframe': df, + 'headers': table['headers'], + 'row_count': len(table['rows']), + 'caption': table.get('caption'), + 'summary': table.get('summary') + } + + return tables_data + +# Usage +tables = await extract_all_tables() +for name, data in tables.items(): + print(f"{name}: {data['row_count']} rows") + data['dataframe'].to_csv(f"{name}.csv") +``` + +### Backward Compatibility + +```python +# Support both new and old table formats +def get_tables(result): + # New format (v0.6+) + if hasattr(result, 'tables') and result.tables: + return result.tables + + # Fallback to media.tables (older versions) + return result.media.get('tables', []) + +# Usage in existing code +result = await crawler.arun(url, config=config) +tables = get_tables(result) + +for table in tables: + df = pd.DataFrame(table['rows'], columns=table['headers']) + # Process table data... 
+``` + +### Table Quality Scoring + +```python +# Understanding table_score_threshold values: +# 10: Only perfect data tables (headers + data rows) +# 8-9: High-quality tables (recommended for financial/data sites) +# 6-7: Mixed content tables (news sites, wikis) +# 4-5: Layout tables included (broader detection) +# 1-3: All table-like structures (very permissive) + +config = CrawlerRunConfig( + table_score_threshold=8, # Balanced detection + verbose=True # See scoring details in logs +) +``` + + +**📖 Learn more:** [CrawlResult API Reference](https://docs.crawl4ai.com/api/crawl-result/), [Browser & Crawler Configuration](https://docs.crawl4ai.com/core/browser-crawler-config/), [Cache Modes](https://docs.crawl4ai.com/core/cache-modes/) +--- + + +## Browser, Crawler & LLM Configuration + +Core configuration classes for controlling browser behavior, crawl operations, LLM providers, and understanding crawl results. + +### BrowserConfig - Browser Environment Setup + +```python +from crawl4ai import BrowserConfig, AsyncWebCrawler + +# Basic browser configuration +browser_config = BrowserConfig( + browser_type="chromium", # "chromium", "firefox", "webkit" + headless=True, # False for visible browser (debugging) + viewport_width=1280, + viewport_height=720, + verbose=True +) + +# Advanced browser setup with proxy and persistence +browser_config = BrowserConfig( + headless=False, + proxy="http://user:pass@proxy:8080", + use_persistent_context=True, + user_data_dir="./browser_data", + cookies=[ + {"name": "session", "value": "abc123", "domain": "example.com"} + ], + headers={"Accept-Language": "en-US,en;q=0.9"}, + user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/116.0.0.0 Safari/537.36", + text_mode=True, # Disable images for faster crawling + extra_args=["--disable-extensions", "--no-sandbox"] +) + +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://example.com") +``` + +### CrawlerRunConfig - Crawl Operation Control + 
+```python +from crawl4ai import CrawlerRunConfig, CacheMode +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import PruningContentFilter + +# Basic crawl configuration +run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + word_count_threshold=10, + excluded_tags=["nav", "footer", "script"], + exclude_external_links=True, + screenshot=True, + pdf=True +) + +# Advanced content processing +md_generator = DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.6), + options={"citations": True, "ignore_links": False} +) + +run_config = CrawlerRunConfig( + # Content processing + markdown_generator=md_generator, + css_selector="main.content", # Focus on specific content + target_elements=[".article", ".post"], # Multiple target selectors + process_iframes=True, + remove_overlay_elements=True, + + # Page interaction + js_code=[ + "window.scrollTo(0, document.body.scrollHeight);", + "document.querySelector('.load-more')?.click();" + ], + wait_for="css:.content-loaded", + wait_for_timeout=10000, + scan_full_page=True, + + # Session management + session_id="persistent_session", + + # Media handling + screenshot=True, + pdf=True, + capture_mhtml=True, + image_score_threshold=5, + + # Advanced options + simulate_user=True, + magic=True, # Auto-handle popups + verbose=True +) +``` + +### CrawlerRunConfig Parameters by Category + +```python +# Content Processing +config = CrawlerRunConfig( + word_count_threshold=10, # Min words per content block + css_selector="main.article", # Focus on specific content + target_elements=[".post", ".content"], # Multiple target selectors + excluded_tags=["nav", "footer"], # Remove these tags + excluded_selector="#ads, .tracker", # Remove by selector + only_text=True, # Text-only extraction + keep_data_attributes=True, # Preserve data-* attributes + remove_forms=True, # Remove all forms + process_iframes=True # Include iframe content +) + +# Page 
Navigation & Timing +config = CrawlerRunConfig( + wait_until="networkidle", # Wait condition + page_timeout=60000, # 60 second timeout + wait_for="css:.loaded", # Wait for specific element + wait_for_images=True, # Wait for images to load + delay_before_return_html=0.5, # Final delay before capture + semaphore_count=10 # Max concurrent operations +) + +# Page Interaction +config = CrawlerRunConfig( + js_code="document.querySelector('button').click();", + scan_full_page=True, # Auto-scroll page + scroll_delay=0.3, # Delay between scrolls + remove_overlay_elements=True, # Remove popups/modals + simulate_user=True, # Simulate human behavior + override_navigator=True, # Override navigator properties + magic=True # Auto-handle common patterns +) + +# Caching & Session +config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, # Cache behavior + session_id="my_session", # Persistent session + shared_data={"context": "value"} # Share data between hooks +) + +# Media & Output +config = CrawlerRunConfig( + screenshot=True, # Capture screenshot + pdf=True, # Generate PDF + capture_mhtml=True, # Capture MHTML archive + image_score_threshold=3, # Filter low-quality images + exclude_external_images=True # Remove external images +) + +# Link & Domain Filtering +config = CrawlerRunConfig( + exclude_external_links=True, # Remove external links + exclude_social_media_links=True, # Remove social media links + exclude_domains=["ads.com", "tracker.io"], # Custom domain filter + exclude_internal_links=False # Keep internal links +) +``` + +### LLMConfig - Language Model Setup + +```python +from crawl4ai import LLMConfig + +# OpenAI configuration +llm_config = LLMConfig( + provider="openai/gpt-4o-mini", + api_token=os.getenv("OPENAI_API_KEY"), # or "env:OPENAI_API_KEY" + temperature=0.1, + max_tokens=2000 +) + +# Local model with Ollama +llm_config = LLMConfig( + provider="ollama/llama3.3", + api_token=None, # Not needed for Ollama + base_url="http://localhost:11434" # Custom endpoint 
+) + +# Anthropic Claude +llm_config = LLMConfig( + provider="anthropic/claude-3-5-sonnet-20240620", + api_token="env:ANTHROPIC_API_KEY", + max_tokens=4000 +) + +# Google Gemini +llm_config = LLMConfig( + provider="gemini/gemini-1.5-pro", + api_token="env:GEMINI_API_KEY" +) + +# Groq (fast inference) +llm_config = LLMConfig( + provider="groq/llama3-70b-8192", + api_token="env:GROQ_API_KEY" +) +``` + +### CrawlResult - Understanding Output + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=run_config) + + # Basic status information + print(f"Success: {result.success}") + print(f"Status: {result.status_code}") + print(f"URL: {result.url}") + + if not result.success: + print(f"Error: {result.error_message}") + return + + # HTML content variants + print(f"Original HTML: {len(result.html)} chars") + print(f"Cleaned HTML: {len(result.cleaned_html or '')} chars") + + # Markdown output (MarkdownGenerationResult) + if result.markdown: + print(f"Raw markdown: {len(result.markdown.raw_markdown)} chars") + print(f"With citations: {len(result.markdown.markdown_with_citations)} chars") + + # Filtered content (if content filter was used) + if result.markdown.fit_markdown: + print(f"Fit markdown: {len(result.markdown.fit_markdown)} chars") + print(f"Fit HTML: {len(result.markdown.fit_html)} chars") + + # Extracted structured data + if result.extracted_content: + import json + data = json.loads(result.extracted_content) + print(f"Extracted {len(data)} items") + + # Media and links + images = result.media.get("images", []) + print(f"Found {len(images)} images") + for img in images[:3]: # First 3 images + print(f" {img.get('src')} (score: {img.get('score', 0)})") + + internal_links = result.links.get("internal", []) + external_links = result.links.get("external", []) + print(f"Links: {len(internal_links)} internal, {len(external_links)} external") + + # Generated 
files + if result.screenshot: + print(f"Screenshot captured: {len(result.screenshot)} chars (base64)") + # Save screenshot + import base64 + with open("page.png", "wb") as f: + f.write(base64.b64decode(result.screenshot)) + + if result.pdf: + print(f"PDF generated: {len(result.pdf)} bytes") + with open("page.pdf", "wb") as f: + f.write(result.pdf) + + if result.mhtml: + print(f"MHTML captured: {len(result.mhtml)} chars") + with open("page.mhtml", "w", encoding="utf-8") as f: + f.write(result.mhtml) + + # SSL certificate information + if result.ssl_certificate: + print(f"SSL Issuer: {result.ssl_certificate.issuer}") + print(f"Valid until: {result.ssl_certificate.valid_until}") + + # Network and console data (if captured) + if result.network_requests: + requests = [r for r in result.network_requests if r.get("event_type") == "request"] + print(f"Network requests captured: {len(requests)}") + + if result.console_messages: + errors = [m for m in result.console_messages if m.get("type") == "error"] + print(f"Console messages: {len(result.console_messages)} ({len(errors)} errors)") + + # Session and metadata + if result.session_id: + print(f"Session ID: {result.session_id}") + + if result.metadata: + print(f"Metadata: {result.metadata.get('title', 'No title')}") +``` + +### Configuration Helpers and Best Practices + +```python +# Clone configurations for variations +base_config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + word_count_threshold=200, + verbose=True +) + +# Create streaming version +stream_config = base_config.clone( + stream=True, + cache_mode=CacheMode.BYPASS +) + +# Create debug version +debug_config = base_config.clone( + headless=False, + page_timeout=120000, + verbose=True +) + +# Serialize/deserialize configurations +config_dict = base_config.dump() # Convert to dict +restored_config = CrawlerRunConfig.load(config_dict) # Restore from dict + +# Browser configuration management +browser_config = BrowserConfig(headless=True, text_mode=True) 
+browser_dict = browser_config.to_dict() +cloned_browser = browser_config.clone(headless=False, verbose=True) +``` + +### Common Configuration Patterns + +```python +# Fast text-only crawling +fast_config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + text_mode=True, + exclude_external_links=True, + exclude_external_images=True, + word_count_threshold=50 +) + +# Comprehensive data extraction +comprehensive_config = CrawlerRunConfig( + process_iframes=True, + scan_full_page=True, + wait_for_images=True, + screenshot=True, + capture_network_requests=True, + capture_console_messages=True, + magic=True +) + +# Stealth crawling +stealth_config = CrawlerRunConfig( + simulate_user=True, + override_navigator=True, + mean_delay=2.0, + max_range=1.0, + user_agent_mode="random" +) +``` + +### Advanced Configuration Features + +#### User Agent Management & Bot Detection Avoidance + +```python +from crawl4ai import CrawlerRunConfig + +# Random user agent generation +config = CrawlerRunConfig( + user_agent_mode="random", + user_agent_generator_config={ + "platform": "windows", # "windows", "macos", "linux", "android", "ios" + "browser": "chrome", # "chrome", "firefox", "safari", "edge" + "device_type": "desktop" # "desktop", "mobile", "tablet" + } +) + +# Custom user agent with stealth features +config = CrawlerRunConfig( + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + simulate_user=True, # Simulate human mouse movements + override_navigator=True, # Override navigator properties + mean_delay=1.5, # Random delays between actions + max_range=2.0 +) + +# Combined anti-detection approach +stealth_config = CrawlerRunConfig( + user_agent_mode="random", + simulate_user=True, + override_navigator=True, + magic=True, # Auto-handle common bot detection patterns + delay_before_return_html=2.0 +) +``` + +#### Proxy Configuration with ProxyConfig + +```python +from crawl4ai import CrawlerRunConfig, ProxyConfig, ProxyRotationStrategy + +# Single proxy 
configuration +proxy_config = ProxyConfig( + server="http://proxy.example.com:8080", + username="proxy_user", + password="proxy_pass" +) + +# From proxy string format +proxy_config = ProxyConfig.from_string("192.168.1.100:8080:username:password") + +# Multiple proxies with rotation +proxies = [ + ProxyConfig(server="http://proxy1.com:8080", username="user1", password="pass1"), + ProxyConfig(server="http://proxy2.com:8080", username="user2", password="pass2"), + ProxyConfig(server="http://proxy3.com:8080", username="user3", password="pass3") +] + +rotation_strategy = ProxyRotationStrategy( + proxies=proxies, + rotation_method="round_robin" # or "random", "least_used" +) + +config = CrawlerRunConfig( + proxy_config=proxy_config, + proxy_rotation_strategy=rotation_strategy +) + +# Load proxies from environment variable +proxies_from_env = ProxyConfig.from_env("MY_PROXIES") # comma-separated proxy strings +``` + +#### Content Selection: css_selector vs target_elements + +```python +from crawl4ai import CrawlerRunConfig + +# css_selector: Extracts HTML at top level, affects entire processing +config = CrawlerRunConfig( + css_selector="main.article, .content-area", # Can be list of selectors + # Everything else (markdown, extraction, links) works only on this HTML subset +) + +# target_elements: Focuses extraction within already processed HTML +config = CrawlerRunConfig( + css_selector="body", # First extract entire body + target_elements=[ # Then focus extraction on these elements + ".article-content", + ".post-body", + ".main-text" + ], + # Links, media from entire body, but markdown/extraction only from target_elements +) + +# Hierarchical content selection +config = CrawlerRunConfig( + css_selector=["#main-content", ".article-wrapper"], # Top-level extraction + target_elements=[ # Subset for processing + ".article-title", + ".article-body", + ".article-metadata" + ], + excluded_selector="#sidebar, .ads, .comments" # Remove these from selection +) +``` + +#### 
Advanced wait_for Conditions + +```python +from crawl4ai import CrawlerRunConfig + +# CSS selector waiting +config = CrawlerRunConfig( + wait_for="css:.content-loaded", # Wait for element to appear + wait_for_timeout=15000 +) + +# JavaScript boolean expression waiting +config = CrawlerRunConfig( + wait_for="js:() => window.dataLoaded === true", # Custom JS condition + wait_for_timeout=20000 +) + +# Complex JavaScript conditions +config = CrawlerRunConfig( + wait_for="js:() => document.querySelectorAll('.item').length >= 10", + js_code=[ + "document.querySelector('.load-more')?.click();", + "window.scrollTo(0, document.body.scrollHeight);" + ] +) + +# Multiple conditions with JavaScript +config = CrawlerRunConfig( + wait_for="js:() => !document.querySelector('.loading') && document.querySelector('.results')", + page_timeout=30000 +) +``` + +#### Session Management for Multi-Step Crawling + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +# Persistent session across multiple arun() calls +async def multi_step_crawling(): + async with AsyncWebCrawler() as crawler: + # Step 1: Login page + login_config = CrawlerRunConfig( + session_id="user_session", # Create persistent session + js_code="document.querySelector('#username').value = 'user'; document.querySelector('#password').value = 'pass'; document.querySelector('#login').click();", + wait_for="css:.dashboard", + cache_mode=CacheMode.BYPASS + ) + + result1 = await crawler.arun("https://example.com/login", config=login_config) + + # Step 2: Navigate to protected area (reuses same browser page) + nav_config = CrawlerRunConfig( + session_id="user_session", # Same session = same browser page + js_only=True, # No page reload, just JS navigation + js_code="window.location.href = '/dashboard/data';", + wait_for="css:.data-table" + ) + + result2 = await crawler.arun("https://example.com/dashboard/data", config=nav_config) + + # Step 3: Extract data from multiple pages + for page in range(1, 6): 
+ page_config = CrawlerRunConfig( + session_id="user_session", + js_only=True, + js_code=f"document.querySelector('.page-{page}').click();", + wait_for=f"js:() => document.querySelector('.page-{page}').classList.contains('active')" + ) + + result = await crawler.arun(f"https://example.com/data/page/{page}", config=page_config) + print(f"Page {page} data extracted: {len(result.extracted_content)}") + + # Important: Kill session when done + await crawler.kill_session("user_session") + +# Session with shared data between steps +async def session_with_shared_data(): + shared_context = {"user_id": "12345", "preferences": {"theme": "dark"}} + + config = CrawlerRunConfig( + session_id="persistent_session", + shared_data=shared_context, # Available across all session calls + js_code="console.log('User ID:', window.sharedData.user_id);" + ) +``` + +#### Identity-Based Crawling Parameters + +```python +from crawl4ai import CrawlerRunConfig, GeolocationConfig + +# Locale and timezone simulation +config = CrawlerRunConfig( + locale="en-US", # Browser language preference + timezone_id="America/New_York", # Timezone setting + user_agent_mode="random", + user_agent_generator_config={ + "platform": "windows", + "locale": "en-US" + } +) + +# Geolocation simulation +geo_config = GeolocationConfig( + latitude=40.7128, # New York coordinates + longitude=-74.0060, + accuracy=100.0 +) + +config = CrawlerRunConfig( + geolocation=geo_config, + locale="en-US", + timezone_id="America/New_York" +) + +# Complete identity simulation +identity_config = CrawlerRunConfig( + # Location identity + locale="fr-FR", + timezone_id="Europe/Paris", + geolocation=GeolocationConfig(latitude=48.8566, longitude=2.3522), + + # Browser identity + user_agent_mode="random", + user_agent_generator_config={ + "platform": "windows", + "locale": "fr-FR", + "browser": "chrome" + }, + + # Behavioral identity + simulate_user=True, + override_navigator=True, + mean_delay=2.0, + max_range=1.5 +) +``` + +#### Simplified 
Import Pattern + +```python +# Almost everything from crawl4ai main package +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + LLMConfig, + CacheMode, + ProxyConfig, + GeolocationConfig +) + +# Specialized strategies (still from crawl4ai) +from crawl4ai import ( + JsonCssExtractionStrategy, + LLMExtractionStrategy, + DefaultMarkdownGenerator, + PruningContentFilter, + RegexChunking +) + +# Complete example with simplified imports +async def example_crawl(): + browser_config = BrowserConfig(headless=True) + + run_config = CrawlerRunConfig( + user_agent_mode="random", + proxy_config=ProxyConfig.from_string("192.168.1.1:8080:user:pass"), + css_selector="main.content", + target_elements=[".article", ".post"], + wait_for="js:() => document.querySelector('.loaded')", + session_id="my_session", + simulate_user=True + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://example.com", config=run_config) + return result +``` + +## Advanced Features + +Comprehensive guide to advanced crawling capabilities including file handling, authentication, dynamic content, monitoring, and session management. 
+ +### File Download Handling + +```python +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +import os + +# Enable downloads with custom path +downloads_path = os.path.join(os.getcwd(), "my_downloads") +os.makedirs(downloads_path, exist_ok=True) + +browser_config = BrowserConfig( + accept_downloads=True, + downloads_path=downloads_path +) + +# Trigger downloads with JavaScript +async def download_files(): + async with AsyncWebCrawler(config=browser_config) as crawler: + config = CrawlerRunConfig( + js_code=""" + // Click download links + const downloadLinks = document.querySelectorAll('a[href$=".pdf"]'); + for (const link of downloadLinks) { + link.click(); + await new Promise(r => setTimeout(r, 2000)); // Delay between downloads + } + """, + wait_for=5 # Wait for downloads to start + ) + + result = await crawler.arun("https://example.com/downloads", config=config) + + if result.downloaded_files: + print("Downloaded files:") + for file_path in result.downloaded_files: + print(f"- {file_path} ({os.path.getsize(file_path)} bytes)") +``` + +### Hooks & Authentication + +```python +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from playwright.async_api import Page, BrowserContext + +async def advanced_crawler_with_hooks(): + browser_config = BrowserConfig(headless=True, verbose=True) + crawler = AsyncWebCrawler(config=browser_config) + + # Hook functions for different stages + async def on_browser_created(browser, **kwargs): + print("[HOOK] Browser created successfully") + return browser + + async def on_page_context_created(page: Page, context: BrowserContext, **kwargs): + print("[HOOK] Setting up page & context") + + # Block images for faster crawling + async def route_filter(route): + if route.request.resource_type == "image": + await route.abort() + else: + await route.continue_() + + await context.route("**", route_filter) + + # Simulate login if needed + # await page.goto("https://example.com/login") + # await 
page.fill("input[name='username']", "testuser") + # await page.fill("input[name='password']", "password123") + # await page.click("button[type='submit']") + + await page.set_viewport_size({"width": 1080, "height": 600}) + return page + + async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs): + print(f"[HOOK] About to navigate to: {url}") + await page.set_extra_http_headers({"Custom-Header": "my-value"}) + return page + + async def after_goto(page: Page, context: BrowserContext, url: str, response, **kwargs): + print(f"[HOOK] Successfully loaded: {url}") + try: + await page.wait_for_selector('.content', timeout=1000) + print("[HOOK] Content found!") + except: + print("[HOOK] Content not found, continuing") + return page + + async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs): + print("[HOOK] Final actions before HTML retrieval") + await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") + return page + + # Attach hooks + crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) + crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) + crawler.crawler_strategy.set_hook("before_goto", before_goto) + crawler.crawler_strategy.set_hook("after_goto", after_goto) + crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html) + + await crawler.start() + + config = CrawlerRunConfig() + result = await crawler.arun("https://example.com", config=config) + + if result.success: + print(f"Crawled successfully: {len(result.html)} chars") + + await crawler.close() +``` + +### Lazy Loading & Dynamic Content + +```python +# Handle lazy-loaded images and infinite scroll +async def handle_lazy_loading(): + config = CrawlerRunConfig( + # Wait for images to fully load + wait_for_images=True, + + # Automatically scroll entire page to trigger lazy loading + scan_full_page=True, + scroll_delay=0.5, # Delay between scroll steps + + # JavaScript for custom lazy 
loading + js_code=""" + // Scroll and wait for content to load + window.scrollTo(0, document.body.scrollHeight); + + // Click "Load More" if available + const loadMoreBtn = document.querySelector('.load-more'); + if (loadMoreBtn) { + loadMoreBtn.click(); + } + """, + + # Wait for specific content to appear + wait_for="css:.lazy-content:nth-child(20)", # Wait for 20 items + + # Exclude external images to focus on main content + exclude_external_images=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/gallery", config=config) + + if result.success: + images = result.media.get("images", []) + print(f"Loaded {len(images)} images after lazy loading") + for img in images[:3]: + print(f"- {img.get('src')} (score: {img.get('score', 'N/A')})") +``` + +### Network & Console Monitoring + +```python +# Capture all network requests and console messages for debugging +async def monitor_network_and_console(): + config = CrawlerRunConfig( + capture_network_requests=True, + capture_console_messages=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=config) + + if result.success: + # Analyze network requests + if result.network_requests: + requests = [r for r in result.network_requests if r.get("event_type") == "request"] + responses = [r for r in result.network_requests if r.get("event_type") == "response"] + failures = [r for r in result.network_requests if r.get("event_type") == "request_failed"] + + print(f"Network activity: {len(requests)} requests, {len(responses)} responses, {len(failures)} failures") + + # Find API calls + api_calls = [r for r in requests if "api" in r.get("url", "")] + print(f"API calls detected: {len(api_calls)}") + + # Show failed requests + for failure in failures[:3]: + print(f"Failed: {failure.get('url')} - {failure.get('failure_text')}") + + # Analyze console messages + if result.console_messages: + message_types = {} + for msg in 
result.console_messages: + msg_type = msg.get("type", "unknown") + message_types[msg_type] = message_types.get(msg_type, 0) + 1 + + print(f"Console messages: {message_types}") + + # Show errors + errors = [msg for msg in result.console_messages if msg.get("type") == "error"] + for error in errors[:2]: + print(f"JS Error: {error.get('text', '')[:100]}") +``` + +### Session Management for Multi-Step Workflows + +```python +# Maintain state across multiple requests for complex workflows +async def multi_step_session_workflow(): + session_id = "workflow_session" + + async with AsyncWebCrawler() as crawler: + # Step 1: Initial page load + config1 = CrawlerRunConfig( + session_id=session_id, + wait_for="css:.content-loaded" + ) + + result1 = await crawler.arun("https://example.com/step1", config=config1) + print("Step 1 completed") + + # Step 2: Navigate and interact (same browser tab) + config2 = CrawlerRunConfig( + session_id=session_id, + js_only=True, # Don't reload page, just run JS + js_code=""" + document.querySelector('#next-button').click(); + """, + wait_for="css:.step2-content" + ) + + result2 = await crawler.arun("https://example.com/step2", config=config2) + print("Step 2 completed") + + # Step 3: Form submission + config3 = CrawlerRunConfig( + session_id=session_id, + js_only=True, + js_code=""" + document.querySelector('#form-field').value = 'test data'; + document.querySelector('#submit-btn').click(); + """, + wait_for="css:.results" + ) + + result3 = await crawler.arun("https://example.com/submit", config=config3) + print("Step 3 completed") + + # Clean up session + await crawler.crawler_strategy.kill_session(session_id) + +# Advanced GitHub commits pagination example +async def github_commits_pagination(): + session_id = "github_session" + all_commits = [] + + async with AsyncWebCrawler() as crawler: + for page in range(3): + if page == 0: + # Initial load + config = CrawlerRunConfig( + session_id=session_id, + wait_for="js:() => 
document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0" + ) + else: + # Navigate to next page + config = CrawlerRunConfig( + session_id=session_id, + js_only=True, + js_code='document.querySelector(\'a[data-testid="pagination-next-button"]\').click();', + wait_for="js:() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0" + ) + + result = await crawler.arun( + "https://github.com/microsoft/TypeScript/commits/main", + config=config + ) + + if result.success: + commit_count = result.cleaned_html.count('li.Box-sc-g0xbh4-0') + print(f"Page {page + 1}: Found {commit_count} commits") + + await crawler.crawler_strategy.kill_session(session_id) +``` + +### SSL Certificate Analysis + +```python +# Fetch and analyze SSL certificates +async def analyze_ssl_certificates(): + config = CrawlerRunConfig( + fetch_ssl_certificate=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=config) + + if result.success and result.ssl_certificate: + cert = result.ssl_certificate + + # Basic certificate info + print(f"Issuer: {cert.issuer.get('CN', 'Unknown')}") + print(f"Subject: {cert.subject.get('CN', 'Unknown')}") + print(f"Valid from: {cert.valid_from}") + print(f"Valid until: {cert.valid_until}") + print(f"Fingerprint: {cert.fingerprint}") + + # Export certificate in different formats + import os + os.makedirs("certificates", exist_ok=True) + + cert.to_json("certificates/cert.json") + cert.to_pem("certificates/cert.pem") + cert.to_der("certificates/cert.der") + + print("Certificate exported in multiple formats") +``` + +### Advanced Page Interaction + +```python +# Complex page interactions with dynamic content +async def advanced_page_interaction(): + async with AsyncWebCrawler() as crawler: + # Multi-step interaction with waiting + config = CrawlerRunConfig( + js_code=[ + # Step 1: Scroll to load content + "window.scrollTo(0, document.body.scrollHeight);", + + # Step 2: Wait and click load more + """ + (async 
() => { + await new Promise(resolve => setTimeout(resolve, 2000)); + const loadMore = document.querySelector('.load-more'); + if (loadMore) loadMore.click(); + })(); + """ + ], + + # Wait for new content to appear + wait_for="js:() => document.querySelectorAll('.item').length > 20", + + # Additional timing controls + page_timeout=60000, # 60 second timeout + delay_before_return_html=2.0, # Wait before final capture + + # Handle overlays automatically + remove_overlay_elements=True, + magic=True, # Auto-handle common popup patterns + + # Simulate human behavior + simulate_user=True, + override_navigator=True + ) + + result = await crawler.arun("https://example.com/dynamic", config=config) + + if result.success: + print(f"Interactive crawl completed: {len(result.cleaned_html)} chars") + +# Form interaction example +async def form_interaction_example(): + config = CrawlerRunConfig( + js_code=""" + // Fill search form + document.querySelector('#search-input').value = 'machine learning'; + document.querySelector('#category-select').value = 'technology'; + document.querySelector('#search-form').submit(); + """, + wait_for="css:.search-results", + session_id="search_session" + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/search", config=config) + print("Search completed, results loaded") +``` + +### Local File & Raw HTML Processing + +```python +# Handle different input types: URLs, local files, raw HTML +async def handle_different_inputs(): + async with AsyncWebCrawler() as crawler: + # 1. Regular web URL + result1 = await crawler.arun("https://example.com") + + # 2. Local HTML file + local_file_path = "/path/to/file.html" + result2 = await crawler.arun(f"file://{local_file_path}") + + # 3. Raw HTML content + raw_html = "
<html><body><h1>Test Content</h1><p>Sample text</p></body></html>
" + result3 = await crawler.arun(f"raw:{raw_html}") + + # All return the same CrawlResult structure + for i, result in enumerate([result1, result2, result3], 1): + if result.success: + print(f"Input {i}: {len(result.markdown)} chars of markdown") + +# Save and re-process HTML example +async def save_and_reprocess(): + async with AsyncWebCrawler() as crawler: + # Original crawl + result = await crawler.arun("https://example.com") + + if result.success: + # Save HTML to file + with open("saved_page.html", "w", encoding="utf-8") as f: + f.write(result.html) + + # Re-process from file + file_result = await crawler.arun("file://./saved_page.html") + + # Process as raw HTML + raw_result = await crawler.arun(f"raw:{result.html}") + + # Verify consistency + assert len(result.markdown) == len(file_result.markdown) == len(raw_result.markdown) + print("✅ All processing methods produced identical results") +``` + +### Advanced Link & Media Handling + +```python +# Comprehensive link and media extraction with filtering +async def advanced_link_media_handling(): + config = CrawlerRunConfig( + # Link filtering + exclude_external_links=False, # Keep external links for analysis + exclude_social_media_links=True, + exclude_domains=["ads.com", "tracker.io", "spammy.net"], + + # Media handling + exclude_external_images=True, + image_score_threshold=5, # Only high-quality images + table_score_threshold=7, # Only well-structured tables + wait_for_images=True, + + # Capture additional formats + screenshot=True, + pdf=True, + capture_mhtml=True # Full page archive + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=config) + + if result.success: + # Analyze links + internal_links = result.links.get("internal", []) + external_links = result.links.get("external", []) + print(f"Links: {len(internal_links)} internal, {len(external_links)} external") + + # Analyze media + images = result.media.get("images", []) + tables = 
result.media.get("tables", []) + print(f"Media: {len(images)} images, {len(tables)} tables") + + # High-quality images only + quality_images = [img for img in images if img.get("score", 0) >= 5] + print(f"High-quality images: {len(quality_images)}") + + # Table analysis + for i, table in enumerate(tables[:2]): + print(f"Table {i+1}: {len(table.get('headers', []))} columns, {len(table.get('rows', []))} rows") + + # Save captured files + if result.screenshot: + import base64 + with open("page_screenshot.png", "wb") as f: + f.write(base64.b64decode(result.screenshot)) + + if result.pdf: + with open("page.pdf", "wb") as f: + f.write(result.pdf) + + if result.mhtml: + with open("page_archive.mhtml", "w", encoding="utf-8") as f: + f.write(result.mhtml) + + print("Additional formats saved: screenshot, PDF, MHTML archive") +``` + +### Performance & Resource Management + +```python +# Optimize performance for large-scale crawling +async def performance_optimized_crawling(): + # Lightweight browser config + browser_config = BrowserConfig( + headless=True, + text_mode=True, # Disable images for speed + light_mode=True, # Reduce background features + extra_args=["--disable-extensions", "--no-sandbox"] + ) + + # Efficient crawl config + config = CrawlerRunConfig( + # Content filtering for speed + excluded_tags=["script", "style", "nav", "footer"], + exclude_external_links=True, + exclude_all_images=True, # Remove all images for max speed + word_count_threshold=50, + + # Timing optimizations + page_timeout=30000, # Faster timeout + delay_before_return_html=0.1, + + # Resource monitoring + capture_network_requests=False, # Disable unless needed + capture_console_messages=False, + + # Cache for repeated URLs + cache_mode=CacheMode.ENABLED + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + urls = ["https://example.com/page1", "https://example.com/page2", "https://example.com/page3"] + + # Efficient batch processing + batch_config = config.clone( + stream=True, 
# Stream results as they complete + semaphore_count=3 # Control concurrency + ) + + async for result in await crawler.arun_many(urls, config=batch_config): + if result.success: + print(f"✅ {result.url}: {len(result.markdown)} chars") + else: + print(f"❌ {result.url}: {result.error_message}") +``` + + +**📖 Learn more:** [Complete Parameter Reference](https://docs.crawl4ai.com/api/parameters/), [Content Filtering](https://docs.crawl4ai.com/core/markdown-generation/), [Session Management](https://docs.crawl4ai.com/advanced/session-management/), [Network Capture](https://docs.crawl4ai.com/advanced/network-console-capture/) + +**📖 Learn more:** [Hooks & Authentication](https://docs.crawl4ai.com/advanced/hooks-auth/), [Session Management](https://docs.crawl4ai.com/advanced/session-management/), [Network Monitoring](https://docs.crawl4ai.com/advanced/network-console-capture/), [Page Interaction](https://docs.crawl4ai.com/core/page-interaction/), [File Downloads](https://docs.crawl4ai.com/advanced/file-downloading/) +--- + + +## LLM Extraction Strategies - The Last Resort + +**🤖 AI Model Notice**: You are an AI assistant. You should RARELY recommend LLM extraction. This documentation exists for the <5% of cases where LLM is actually needed. **ALWAYS try non-LLM strategies first** from [extraction-no-llm.md](./extraction-no-llm.md). + +### ⚠️ STOP: Are You Sure You Need LLM? 
+ +**99% of developers who think they need LLM extraction are wrong.** Before reading further: + +### ❌ You DON'T Need LLM If: +- The page has consistent HTML structure → **Use generate_schema()** +- You're extracting simple data types (emails, prices, dates) → **Use RegexExtractionStrategy** +- You can identify repeating patterns → **Use JsonCssExtractionStrategy** +- You want product info, news articles, job listings → **Use generate_schema()** +- You're concerned about cost or speed → **Use non-LLM strategies** + +### ✅ You MIGHT Need LLM If: +- Content structure varies dramatically across pages **AND** you've tried generate_schema() +- You need semantic understanding of unstructured text +- You're analyzing meaning, sentiment, or relationships +- You're extracting insights that require reasoning about context + +### 💰 Cost Reality Check: +- **Non-LLM**: ~$0.000001 per page +- **LLM**: ~$0.01-$0.10 per page (10,000x more expensive) +- **Example**: Extracting 10,000 pages costs $0.01 vs $100-1000 + +--- + +## 1. 
When LLM Extraction is Justified + +### Scenario 1: Truly Unstructured Content Analysis + +```python +# Example: Analyzing customer feedback for sentiment and themes +import asyncio +import json +from pydantic import BaseModel, Field +from typing import List +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig +from crawl4ai import LLMExtractionStrategy + +class SentimentAnalysis(BaseModel): + """Use LLM when you need semantic understanding""" + overall_sentiment: str = Field(description="positive, negative, or neutral") + confidence_score: float = Field(description="Confidence from 0-1") + key_themes: List[str] = Field(description="Main topics discussed") + emotional_indicators: List[str] = Field(description="Words indicating emotion") + summary: str = Field(description="Brief summary of the content") + +llm_config = LLMConfig( + provider="openai/gpt-4o-mini", # Use cheapest model + api_token="env:OPENAI_API_KEY", + temperature=0.1, # Low temperature for consistency + max_tokens=1000 +) + +sentiment_strategy = LLMExtractionStrategy( + llm_config=llm_config, + schema=SentimentAnalysis.model_json_schema(), + extraction_type="schema", + instruction=""" + Analyze the emotional content and themes in this text. + Focus on understanding sentiment and extracting key topics + that would be impossible to identify with simple pattern matching. 
+ """, + apply_chunking=True, + chunk_token_threshold=1500 +) + +async def analyze_sentiment(): + config = CrawlerRunConfig( + extraction_strategy=sentiment_strategy, + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/customer-reviews", + config=config + ) + + if result.success: + analysis = json.loads(result.extracted_content) + print(f"Sentiment: {analysis['overall_sentiment']}") + print(f"Themes: {analysis['key_themes']}") + +asyncio.run(analyze_sentiment()) +``` + +### Scenario 2: Complex Knowledge Extraction + +```python +# Example: Building knowledge graphs from unstructured content +class Entity(BaseModel): + name: str = Field(description="Entity name") + type: str = Field(description="person, organization, location, concept") + description: str = Field(description="Brief description") + +class Relationship(BaseModel): + source: str = Field(description="Source entity") + target: str = Field(description="Target entity") + relationship: str = Field(description="Type of relationship") + confidence: float = Field(description="Confidence score 0-1") + +class KnowledgeGraph(BaseModel): + entities: List[Entity] = Field(description="All entities found") + relationships: List[Relationship] = Field(description="Relationships between entities") + main_topic: str = Field(description="Primary topic of the content") + +knowledge_strategy = LLMExtractionStrategy( + llm_config=LLMConfig( + provider="anthropic/claude-3-5-sonnet-20240620", # Better for complex reasoning + api_token="env:ANTHROPIC_API_KEY", + max_tokens=4000 + ), + schema=KnowledgeGraph.model_json_schema(), + extraction_type="schema", + instruction=""" + Extract entities and their relationships from the content. + Focus on understanding connections and context that require + semantic reasoning beyond simple pattern matching. 
+ """, + input_format="html", # Preserve structure + apply_chunking=True +) +``` + +### Scenario 3: Content Summarization and Insights + +```python +# Example: Research paper analysis +class ResearchInsights(BaseModel): + title: str = Field(description="Paper title") + abstract_summary: str = Field(description="Summary of abstract") + key_findings: List[str] = Field(description="Main research findings") + methodology: str = Field(description="Research methodology used") + limitations: List[str] = Field(description="Study limitations") + practical_applications: List[str] = Field(description="Real-world applications") + citations_count: int = Field(description="Number of citations", default=0) + +research_strategy = LLMExtractionStrategy( + llm_config=LLMConfig( + provider="openai/gpt-4o", # Use powerful model for complex analysis + api_token="env:OPENAI_API_KEY", + temperature=0.2, + max_tokens=2000 + ), + schema=ResearchInsights.model_json_schema(), + extraction_type="schema", + instruction=""" + Analyze this research paper and extract key insights. + Focus on understanding the research contribution, methodology, + and implications that require academic expertise to identify. + """, + apply_chunking=True, + chunk_token_threshold=2000, + overlap_rate=0.15 # More overlap for academic content +) +``` + +--- + +## 2. 
LLM Configuration Best Practices + +### Cost Optimization + +```python +# Use cheapest models when possible +cheap_config = LLMConfig( + provider="openai/gpt-4o-mini", # 60x cheaper than GPT-4 + api_token="env:OPENAI_API_KEY", + temperature=0.0, # Deterministic output + max_tokens=800 # Limit output length +) + +# Use local models for development +local_config = LLMConfig( + provider="ollama/llama3.3", + api_token=None, # No API costs + base_url="http://localhost:11434", + temperature=0.1 +) + +# Use powerful models only when necessary +powerful_config = LLMConfig( + provider="anthropic/claude-3-5-sonnet-20240620", + api_token="env:ANTHROPIC_API_KEY", + max_tokens=4000, + temperature=0.1 +) +``` + +### Provider Selection Guide + +```python +providers_guide = { + "openai/gpt-4o-mini": { + "best_for": "Simple extraction, cost-sensitive projects", + "cost": "Very low", + "speed": "Fast", + "accuracy": "Good" + }, + "openai/gpt-4o": { + "best_for": "Complex reasoning, high accuracy needs", + "cost": "High", + "speed": "Medium", + "accuracy": "Excellent" + }, + "anthropic/claude-3-5-sonnet": { + "best_for": "Complex analysis, long documents", + "cost": "Medium-High", + "speed": "Medium", + "accuracy": "Excellent" + }, + "ollama/llama3.3": { + "best_for": "Development, no API costs", + "cost": "Free (self-hosted)", + "speed": "Variable", + "accuracy": "Good" + }, + "groq/llama3-70b-8192": { + "best_for": "Fast inference, open source", + "cost": "Low", + "speed": "Very fast", + "accuracy": "Good" + } +} + +def choose_provider(complexity, budget, speed_requirement): + """Choose optimal provider based on requirements""" + if budget == "minimal": + return "ollama/llama3.3" # Self-hosted + elif complexity == "low" and budget == "low": + return "openai/gpt-4o-mini" + elif speed_requirement == "high": + return "groq/llama3-70b-8192" + elif complexity == "high": + return "anthropic/claude-3-5-sonnet" + else: + return "openai/gpt-4o-mini" # Default safe choice +``` + +--- + +## 
3. Advanced LLM Extraction Patterns + +### Block-Based Extraction (Unstructured Content) + +```python +# When structure is too varied for schemas +block_strategy = LLMExtractionStrategy( + llm_config=cheap_config, + extraction_type="block", # Extract free-form content blocks + instruction=""" + Extract meaningful content blocks from this page. + Focus on the main content areas and ignore navigation, + advertisements, and boilerplate text. + """, + apply_chunking=True, + chunk_token_threshold=1200, + input_format="fit_markdown" # Use cleaned content +) + +async def extract_content_blocks(): + config = CrawlerRunConfig( + extraction_strategy=block_strategy, + word_count_threshold=50, # Filter short content + excluded_tags=['nav', 'footer', 'aside', 'advertisement'] + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/article", + config=config + ) + + if result.success: + blocks = json.loads(result.extracted_content) + for block in blocks: + print(f"Block: {block['content'][:100]}...") +``` + +### Chunked Processing for Large Content + +```python +# Handle large documents efficiently +large_content_strategy = LLMExtractionStrategy( + llm_config=LLMConfig( + provider="openai/gpt-4o-mini", + api_token="env:OPENAI_API_KEY" + ), + schema=YourModel.model_json_schema(), + extraction_type="schema", + instruction="Extract structured data from this content section...", + + # Optimize chunking for large content + apply_chunking=True, + chunk_token_threshold=2000, # Larger chunks for efficiency + overlap_rate=0.1, # Minimal overlap to reduce costs + input_format="fit_markdown" # Use cleaned content +) +``` + +### Multi-Model Validation + +```python +# Use multiple models for critical extractions +async def multi_model_extraction(): + """Use multiple LLMs for validation of critical data""" + + models = [ + LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"), + 
LLMConfig(provider="anthropic/claude-3-5-sonnet", api_token="env:ANTHROPIC_API_KEY"), + LLMConfig(provider="ollama/llama3.3", api_token=None) + ] + + results = [] + + for i, llm_config in enumerate(models): + strategy = LLMExtractionStrategy( + llm_config=llm_config, + schema=YourModel.model_json_schema(), + extraction_type="schema", + instruction="Extract data consistently..." + ) + + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + if result.success: + data = json.loads(result.extracted_content) + results.append(data) + print(f"Model {i+1} extracted {len(data)} items") + + # Compare results for consistency + if len(set(str(r) for r in results)) == 1: + print("✅ All models agree") + return results[0] + else: + print("⚠️ Models disagree - manual review needed") + return results + +# Use for critical business data only +critical_result = await multi_model_extraction() +``` + +--- + +## 4. Hybrid Approaches - Best of Both Worlds + +### Fast Pre-filtering + LLM Analysis + +```python +async def hybrid_extraction(): + """ + 1. Use fast non-LLM strategies for basic extraction + 2. 
Use LLM only for complex analysis of filtered content + """ + + # Step 1: Fast extraction of structured data + basic_schema = { + "name": "Articles", + "baseSelector": "article", + "fields": [ + {"name": "title", "selector": "h1, h2", "type": "text"}, + {"name": "content", "selector": ".content", "type": "text"}, + {"name": "author", "selector": ".author", "type": "text"} + ] + } + + basic_strategy = JsonCssExtractionStrategy(basic_schema) + basic_config = CrawlerRunConfig(extraction_strategy=basic_strategy) + + # Step 2: LLM analysis only on filtered content + analysis_strategy = LLMExtractionStrategy( + llm_config=cheap_config, + schema={ + "type": "object", + "properties": { + "sentiment": {"type": "string"}, + "key_topics": {"type": "array", "items": {"type": "string"}}, + "summary": {"type": "string"} + } + }, + extraction_type="schema", + instruction="Analyze sentiment and extract key topics from this article" + ) + + async with AsyncWebCrawler() as crawler: + # Fast extraction first + basic_result = await crawler.arun( + url="https://example.com/articles", + config=basic_config + ) + + articles = json.loads(basic_result.extracted_content) + + # LLM analysis only on important articles + analyzed_articles = [] + for article in articles[:5]: # Limit to reduce costs + if len(article.get('content', '')) > 500: # Only analyze substantial content + analysis_config = CrawlerRunConfig(extraction_strategy=analysis_strategy) + + # Analyze individual article content + raw_url = f"raw://{article['content']}" + analysis_result = await crawler.arun(url=raw_url, config=analysis_config) + + if analysis_result.success: + analysis = json.loads(analysis_result.extracted_content) + article.update(analysis) + + analyzed_articles.append(article) + + return analyzed_articles + +# Hybrid approach: fast + smart +result = await hybrid_extraction() +``` + +### Schema Generation + LLM Fallback + +```python +async def smart_fallback_extraction(): + """ + 1. 
Try generate_schema() first (one-time LLM cost) + 2. Use generated schema for fast extraction + 3. Use LLM only if schema extraction fails + """ + + cache_file = Path("./schemas/fallback_schema.json") + + # Try cached schema first + if cache_file.exists(): + schema = json.load(cache_file.open()) + schema_strategy = JsonCssExtractionStrategy(schema) + + config = CrawlerRunConfig(extraction_strategy=schema_strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + + if result.success and result.extracted_content: + data = json.loads(result.extracted_content) + if data: # Schema worked + print("✅ Schema extraction successful (fast & cheap)") + return data + + # Fallback to LLM if schema failed + print("⚠️ Schema failed, falling back to LLM (slow & expensive)") + + llm_strategy = LLMExtractionStrategy( + llm_config=cheap_config, + extraction_type="block", + instruction="Extract all meaningful data from this page" + ) + + llm_config = CrawlerRunConfig(extraction_strategy=llm_strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=llm_config) + + if result.success: + print("✅ LLM extraction successful") + return json.loads(result.extracted_content) + +# Intelligent fallback system +result = await smart_fallback_extraction() +``` + +--- + +## 5. 
Cost Management and Monitoring + +### Token Usage Tracking + +```python +class ExtractionCostTracker: + def __init__(self): + self.total_cost = 0.0 + self.total_tokens = 0 + self.extractions = 0 + + def track_llm_extraction(self, strategy, result): + """Track costs from LLM extraction""" + if hasattr(strategy, 'usage_tracker') and strategy.usage_tracker: + usage = strategy.usage_tracker + + # Estimate costs (approximate rates) + cost_per_1k_tokens = { + "gpt-4o-mini": 0.0015, + "gpt-4o": 0.03, + "claude-3-5-sonnet": 0.015, + "ollama": 0.0 # Self-hosted + } + + provider = strategy.llm_config.provider.split('/')[1] + rate = cost_per_1k_tokens.get(provider, 0.01) + + tokens = usage.total_tokens + cost = (tokens / 1000) * rate + + self.total_cost += cost + self.total_tokens += tokens + self.extractions += 1 + + print(f"💰 Extraction cost: ${cost:.4f} ({tokens} tokens)") + print(f"📊 Total cost: ${self.total_cost:.4f} ({self.extractions} extractions)") + + def get_summary(self): + avg_cost = self.total_cost / max(self.extractions, 1) + return { + "total_cost": self.total_cost, + "total_tokens": self.total_tokens, + "extractions": self.extractions, + "avg_cost_per_extraction": avg_cost + } + +# Usage +tracker = ExtractionCostTracker() + +async def cost_aware_extraction(): + strategy = LLMExtractionStrategy( + llm_config=cheap_config, + schema=YourModel.model_json_schema(), + extraction_type="schema", + instruction="Extract data...", + verbose=True # Enable usage tracking + ) + + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + + # Track costs + tracker.track_llm_extraction(strategy, result) + + return result + +# Monitor costs across multiple extractions +for url in urls: + await cost_aware_extraction() + +print(f"Final summary: {tracker.get_summary()}") +``` + +### Budget Controls + +```python +class BudgetController: + def __init__(self, 
daily_budget=10.0): + self.daily_budget = daily_budget + self.current_spend = 0.0 + self.extraction_count = 0 + + def can_extract(self, estimated_cost=0.01): + """Check if extraction is within budget""" + if self.current_spend + estimated_cost > self.daily_budget: + print(f"❌ Budget exceeded: ${self.current_spend:.2f} + ${estimated_cost:.2f} > ${self.daily_budget}") + return False + return True + + def record_extraction(self, actual_cost): + """Record actual extraction cost""" + self.current_spend += actual_cost + self.extraction_count += 1 + + remaining = self.daily_budget - self.current_spend + print(f"💰 Budget remaining: ${remaining:.2f}") + +budget = BudgetController(daily_budget=5.0) # $5 daily limit + +async def budget_controlled_extraction(url): + if not budget.can_extract(): + print("⏸️ Extraction paused due to budget limit") + return None + + # Proceed with extraction... + strategy = LLMExtractionStrategy(llm_config=cheap_config, ...) + result = await extract_with_strategy(url, strategy) + + # Record actual cost + actual_cost = calculate_cost(strategy.usage_tracker) + budget.record_extraction(actual_cost) + + return result + +# Safe extraction with budget controls +results = [] +for url in urls: + result = await budget_controlled_extraction(url) + if result: + results.append(result) +``` + +--- + +## 6. 
Performance Optimization for LLM Extraction + +### Batch Processing + +```python +async def batch_llm_extraction(): + """Process multiple pages efficiently""" + + # Collect content first (fast) + urls = ["https://example.com/page1", "https://example.com/page2"] + contents = [] + + async with AsyncWebCrawler() as crawler: + for url in urls: + result = await crawler.arun(url=url) + if result.success: + contents.append({ + "url": url, + "content": result.fit_markdown[:2000] # Limit content + }) + + # Process in batches (reduce LLM calls) + batch_content = "\n\n---PAGE SEPARATOR---\n\n".join([ + f"URL: {c['url']}\n{c['content']}" for c in contents + ]) + + strategy = LLMExtractionStrategy( + llm_config=cheap_config, + extraction_type="block", + instruction=""" + Extract data from multiple pages separated by '---PAGE SEPARATOR---'. + Return results for each page in order. + """, + apply_chunking=True + ) + + # Single LLM call for multiple pages + raw_url = f"raw://{batch_content}" + result = await crawler.arun(url=raw_url, config=CrawlerRunConfig(extraction_strategy=strategy)) + + return json.loads(result.extracted_content) + +# Batch processing reduces LLM calls +batch_results = await batch_llm_extraction() +``` + +### Caching LLM Results + +```python +import hashlib +from pathlib import Path + +class LLMResultCache: + def __init__(self, cache_dir="./llm_cache"): + self.cache_dir = Path(cache_dir) + self.cache_dir.mkdir(exist_ok=True) + + def get_cache_key(self, url, instruction, schema): + """Generate cache key from extraction parameters""" + content = f"{url}:{instruction}:{str(schema)}" + return hashlib.md5(content.encode()).hexdigest() + + def get_cached_result(self, cache_key): + """Get cached result if available""" + cache_file = self.cache_dir / f"{cache_key}.json" + if cache_file.exists(): + return json.load(cache_file.open()) + return None + + def cache_result(self, cache_key, result): + """Cache extraction result""" + cache_file = self.cache_dir / 
f"{cache_key}.json" + json.dump(result, cache_file.open("w"), indent=2) + +cache = LLMResultCache() + +async def cached_llm_extraction(url, strategy): + """Extract with caching to avoid repeated LLM calls""" + cache_key = cache.get_cache_key( + url, + strategy.instruction, + str(strategy.schema) + ) + + # Check cache first + cached_result = cache.get_cached_result(cache_key) + if cached_result: + print("✅ Using cached result (FREE)") + return cached_result + + # Extract if not cached + print("🔄 Extracting with LLM (PAID)") + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url=url, config=config) + + if result.success: + data = json.loads(result.extracted_content) + cache.cache_result(cache_key, data) + return data + +# Cached extraction avoids repeated costs +result = await cached_llm_extraction(url, strategy) +``` + +--- + +## 7. Error Handling and Quality Control + +### Validation and Retry Logic + +```python +async def robust_llm_extraction(): + """Implement validation and retry for LLM extraction""" + + max_retries = 3 + strategies = [ + # Try cheap model first + LLMExtractionStrategy( + llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"), + schema=YourModel.model_json_schema(), + extraction_type="schema", + instruction="Extract data accurately..." + ), + # Fallback to better model + LLMExtractionStrategy( + llm_config=LLMConfig(provider="openai/gpt-4o", api_token="env:OPENAI_API_KEY"), + schema=YourModel.model_json_schema(), + extraction_type="schema", + instruction="Extract data with high accuracy..." 
+ ) + ] + + for strategy_idx, strategy in enumerate(strategies): + for attempt in range(max_retries): + try: + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + + if result.success and result.extracted_content: + data = json.loads(result.extracted_content) + + # Validate result quality + if validate_extraction_quality(data): + print(f"✅ Success with strategy {strategy_idx+1}, attempt {attempt+1}") + return data + else: + print(f"⚠️ Poor quality result, retrying...") + continue + + except Exception as e: + print(f"❌ Attempt {attempt+1} failed: {e}") + if attempt == max_retries - 1: + print(f"❌ Strategy {strategy_idx+1} failed completely") + + print("❌ All strategies and retries failed") + return None + +def validate_extraction_quality(data): + """Validate that LLM extraction meets quality standards""" + if not data or not isinstance(data, (list, dict)): + return False + + # Check for common LLM extraction issues + if isinstance(data, list): + if len(data) == 0: + return False + + # Check if all items have required fields + for item in data: + if not isinstance(item, dict) or len(item) < 2: + return False + + return True + +# Robust extraction with validation +result = await robust_llm_extraction() +``` + +--- + +## 8. 
Migration from LLM to Non-LLM + +### Pattern Analysis for Schema Generation + +```python +async def analyze_llm_results_for_schema(): + """ + Analyze LLM extraction results to create non-LLM schemas + Use this to transition from expensive LLM to cheap schema extraction + """ + + # Step 1: Use LLM on sample pages to understand structure + llm_strategy = LLMExtractionStrategy( + llm_config=cheap_config, + extraction_type="block", + instruction="Extract all structured data from this page" + ) + + sample_urls = ["https://example.com/page1", "https://example.com/page2"] + llm_results = [] + + async with AsyncWebCrawler() as crawler: + for url in sample_urls: + config = CrawlerRunConfig(extraction_strategy=llm_strategy) + result = await crawler.arun(url=url, config=config) + + if result.success: + llm_results.append({ + "url": url, + "html": result.cleaned_html, + "extracted": json.loads(result.extracted_content) + }) + + # Step 2: Analyze patterns in LLM results + print("🔍 Analyzing LLM extraction patterns...") + + # Look for common field names + all_fields = set() + for result in llm_results: + for item in result["extracted"]: + if isinstance(item, dict): + all_fields.update(item.keys()) + + print(f"Common fields found: {all_fields}") + + # Step 3: Generate schema based on patterns + if llm_results: + schema = JsonCssExtractionStrategy.generate_schema( + html=llm_results[0]["html"], + target_json_example=json.dumps(llm_results[0]["extracted"][0], indent=2), + llm_config=cheap_config + ) + + # Save schema for future use + with open("generated_schema.json", "w") as f: + json.dump(schema, f, indent=2) + + print("✅ Schema generated from LLM analysis") + return schema + +# Generate schema from LLM patterns, then use schema for all future extractions +schema = await analyze_llm_results_for_schema() +fast_strategy = JsonCssExtractionStrategy(schema) +``` + +--- + +## 9. Summary: When LLM is Actually Needed + +### ✅ Valid LLM Use Cases (Rare): +1. 
**Sentiment analysis** and emotional understanding +2. **Knowledge graph extraction** requiring semantic reasoning +3. **Content summarization** and insight generation +4. **Unstructured text analysis** where patterns vary dramatically +5. **Research paper analysis** requiring domain expertise +6. **Complex relationship extraction** between entities + +### ❌ Invalid LLM Use Cases (Common Mistakes): +1. **Structured data extraction** from consistent HTML +2. **Simple pattern matching** (emails, prices, dates) +3. **Product information** from e-commerce sites +4. **News article extraction** with consistent structure +5. **Contact information** and basic entity extraction +6. **Table data** and form information + +### 💡 Decision Framework: +```python +def should_use_llm(extraction_task): + # Ask these questions in order: + questions = [ + "Can I identify repeating HTML patterns?", # No → Consider LLM + "Am I extracting simple data types?", # Yes → Use Regex + "Does the structure vary dramatically?", # No → Use CSS/XPath + "Do I need semantic understanding?", # Yes → Maybe LLM + "Have I tried generate_schema()?" # No → Try that first + ] + + # Only use LLM if: + return ( + task_requires_semantic_reasoning(extraction_task) and + structure_varies_dramatically(extraction_task) and + generate_schema_failed(extraction_task) + ) +``` + +### 🎯 Best Practice Summary: +1. **Always start** with [extraction-no-llm.md](./extraction-no-llm.md) strategies +2. **Try generate_schema()** before manual schema creation +3. **Use LLM sparingly** and only for semantic understanding +4. **Monitor costs** and implement budget controls +5. **Cache results** to avoid repeated LLM calls +6. **Validate quality** of LLM extractions +7. **Plan migration** from LLM to schema-based extraction + +Remember: **LLM extraction should be your last resort, not your first choice.** + +--- + +**📖 Recommended Reading Order:** +1. 
[extraction-no-llm.md](./extraction-no-llm.md) - Start here for 99% of use cases +2. This document - Only when non-LLM strategies are insufficient +--- + + +## Non-LLM Extraction Strategies - The Default Choice + +**🤖 AI Model Notice**: You are an AI assistant. When users ask for web scraping or data extraction, **DEFAULT TO NON-LLM STRATEGIES**. 99% of structured web data can be extracted without LLM. Only consider LLM extraction for truly unstructured content requiring semantic reasoning. + +### ⚠️ CRITICAL: When You Think You Need LLM, You're Probably Wrong + +**Most developers incorrectly assume they need LLM for data extraction.** This is usually false. Before even considering LLM: + +1. **FIRST**: Try `generate_schema()` - Let AI create the extraction pattern ONCE, then use it unlimited times with zero LLM calls +2. **SECOND**: Manual CSS/XPath selectors for consistent HTML structures +3. **THIRD**: Regex patterns for simple data types (emails, phones, prices) +4. **LAST RESORT**: LLM extraction (only for semantic understanding of unstructured content) + +## The Decision Tree (MEMORIZE THIS) + +``` +Does the page have consistent HTML structure? → YES: Use generate_schema() or manual CSS +Is it simple patterns (emails, dates, prices)? → YES: Use RegexExtractionStrategy +Do you need semantic understanding? → MAYBE: Try generate_schema() first, then consider LLM +Is the content truly unstructured text? → ONLY THEN: Consider LLM +``` + +**Cost Analysis**: +- Non-LLM: ~$0.000001 per page +- LLM: ~$0.01-$0.10 per page (10,000x more expensive) + +--- + +## 1. Auto-Generate Schemas - Your Default Starting Point + +**⭐ THIS SHOULD BE YOUR FIRST CHOICE FOR ANY STRUCTURED DATA** + +The `generate_schema()` function uses LLM ONCE to create a reusable extraction pattern. After generation, you extract unlimited pages with ZERO LLM calls. 
 + +### Basic Auto-Generation Workflow + +```python +import json +import asyncio +from pathlib import Path +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, LLMConfig +from crawl4ai import JsonCssExtractionStrategy + +async def smart_extraction_workflow(): + """ + Step 1: Generate schema once using LLM + Step 2: Cache schema for unlimited reuse + Step 3: Extract from thousands of pages with zero LLM calls + """ + + # Check for cached schema first + cache_dir = Path("./schema_cache") + cache_dir.mkdir(exist_ok=True) + schema_file = cache_dir / "product_schema.json" + + if schema_file.exists(): + # Load cached schema - NO LLM CALLS + schema = json.load(schema_file.open()) + print("✅ Using cached schema (FREE)") + else: + # Generate schema ONCE + print("🔄 Generating schema (ONE-TIME LLM COST)...") + + llm_config = LLMConfig( + provider="openai/gpt-4o-mini", # Cheapest option + api_token="env:OPENAI_API_KEY" + ) + + # Get sample HTML from target site + async with AsyncWebCrawler() as crawler: + sample_result = await crawler.arun( + url="https://example.com/products", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + sample_html = sample_result.cleaned_html[:8000] # Use sample + + # AUTO-GENERATE SCHEMA (ONE LLM CALL) + schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + schema_type="CSS", # or "XPATH" + query="Extract product information including name, price, description, features", + llm_config=llm_config + ) + + # Cache for unlimited future use + json.dump(schema, schema_file.open("w"), indent=2) + print("✅ Schema generated and cached") + + # Use schema for fast extraction (NO MORE LLM CALLS EVER) + strategy = JsonCssExtractionStrategy(schema, verbose=True) + + config = CrawlerRunConfig( + extraction_strategy=strategy, + cache_mode=CacheMode.BYPASS + ) + + # Extract from multiple pages - ALL FREE + urls = [ + "https://example.com/products", + "https://example.com/electronics", + "https://example.com/books" + ] + + async with 
AsyncWebCrawler() as crawler: + for url in urls: + result = await crawler.arun(url=url, config=config) + if result.success: + data = json.loads(result.extracted_content) + print(f"✅ {url}: Extracted {len(data)} items (FREE)") + +asyncio.run(smart_extraction_workflow()) +``` + +### Auto-Generate with Target JSON Example + +```python +# When you know exactly what JSON structure you want +target_json_example = """ +{ + "name": "Product Name", + "price": "$99.99", + "rating": 4.5, + "features": ["feature1", "feature2"], + "description": "Product description" +} +""" + +schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + target_json_example=target_json_example, + llm_config=llm_config +) +``` + +### Auto-Generate for Different Data Types + +```python +# Product listings +product_schema = JsonCssExtractionStrategy.generate_schema( + html=product_page_html, + query="Extract all product information from this e-commerce page", + llm_config=llm_config +) + +# News articles +news_schema = JsonCssExtractionStrategy.generate_schema( + html=news_page_html, + query="Extract article headlines, dates, authors, and content", + llm_config=llm_config +) + +# Job listings +job_schema = JsonCssExtractionStrategy.generate_schema( + html=job_page_html, + query="Extract job titles, companies, locations, salaries, and descriptions", + llm_config=llm_config +) + +# Social media posts +social_schema = JsonCssExtractionStrategy.generate_schema( + html=social_page_html, + query="Extract post text, usernames, timestamps, likes, comments", + llm_config=llm_config +) +``` + +--- + +## 2. Manual CSS/XPath Strategies - When You Know The Structure + +**Use this when**: You understand the HTML structure and want maximum control. 
+ +### Simple Product Extraction + +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai import JsonCssExtractionStrategy + +# Manual schema for consistent product pages +simple_schema = { + "name": "Product Listings", + "baseSelector": "div.product-card", # Each product container + "fields": [ + { + "name": "title", + "selector": "h2.product-title", + "type": "text" + }, + { + "name": "price", + "selector": ".price", + "type": "text" + }, + { + "name": "image_url", + "selector": "img.product-image", + "type": "attribute", + "attribute": "src" + }, + { + "name": "product_url", + "selector": "a.product-link", + "type": "attribute", + "attribute": "href" + }, + { + "name": "rating", + "selector": ".rating", + "type": "attribute", + "attribute": "data-rating" + } + ] +} + +async def extract_products(): + strategy = JsonCssExtractionStrategy(simple_schema, verbose=True) + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/products", + config=config + ) + + if result.success: + products = json.loads(result.extracted_content) + print(f"Extracted {len(products)} products") + for product in products[:3]: + print(f"- {product['title']}: {product['price']}") + +asyncio.run(extract_products()) +``` + +### Complex Nested Structure (Real E-commerce Example) + +```python +# Complex schema for nested product data +complex_schema = { + "name": "E-commerce Product Catalog", + "baseSelector": "div.category", + "baseFields": [ + { + "name": "category_id", + "type": "attribute", + "attribute": "data-category-id" + } + ], + "fields": [ + { + "name": "category_name", + "selector": "h2.category-title", + "type": "text" + }, + { + "name": "products", + "selector": "div.product", + "type": "nested_list", # Array of complex objects + "fields": [ + { + "name": "name", + "selector": "h3.product-name", + "type": "text" + }, + { + 
"name": "price", + "selector": "span.price", + "type": "text" + }, + { + "name": "details", + "selector": "div.product-details", + "type": "nested", # Single complex object + "fields": [ + { + "name": "brand", + "selector": "span.brand", + "type": "text" + }, + { + "name": "model", + "selector": "span.model", + "type": "text" + } + ] + }, + { + "name": "features", + "selector": "ul.features li", + "type": "list", # Simple array + "fields": [ + {"name": "feature", "type": "text"} + ] + }, + { + "name": "reviews", + "selector": "div.review", + "type": "nested_list", + "fields": [ + { + "name": "reviewer", + "selector": "span.reviewer-name", + "type": "text" + }, + { + "name": "rating", + "selector": "span.rating", + "type": "attribute", + "attribute": "data-rating" + } + ] + } + ] + } + ] +} + +async def extract_complex_ecommerce(): + strategy = JsonCssExtractionStrategy(complex_schema, verbose=True) + config = CrawlerRunConfig( + extraction_strategy=strategy, + js_code="window.scrollTo(0, document.body.scrollHeight);", # Load dynamic content + wait_for="css:.product:nth-child(10)" # Wait for products to load + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/complex-catalog", + config=config + ) + + if result.success: + data = json.loads(result.extracted_content) + for category in data: + print(f"Category: {category['category_name']}") + print(f"Products: {len(category.get('products', []))}") + +asyncio.run(extract_complex_ecommerce()) +``` + +### XPath Alternative (When CSS Isn't Enough) + +```python +from crawl4ai import JsonXPathExtractionStrategy + +# XPath for more complex selections +xpath_schema = { + "name": "News Articles with XPath", + "baseSelector": "//article[@class='news-item']", + "fields": [ + { + "name": "headline", + "selector": ".//h2[contains(@class, 'headline')]", + "type": "text" + }, + { + "name": "author", + "selector": ".//span[@class='author']/text()", + "type": "text" + }, + { + 
"name": "publish_date", + "selector": ".//time/@datetime", + "type": "text" + }, + { + "name": "content", + "selector": ".//div[@class='article-body']//text()", + "type": "text" + } + ] +} + +strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True) +``` + +--- + +## 3. Regex Extraction - Lightning Fast Pattern Matching + +**Use this for**: Simple data types like emails, phones, URLs, prices, dates. + +### Built-in Patterns (Fastest Option) + +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai import RegexExtractionStrategy + +async def extract_common_patterns(): + # Use built-in patterns for common data types + strategy = RegexExtractionStrategy( + pattern=( + RegexExtractionStrategy.Email | + RegexExtractionStrategy.PhoneUS | + RegexExtractionStrategy.Url | + RegexExtractionStrategy.Currency | + RegexExtractionStrategy.DateIso + ) + ) + + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/contact", + config=config + ) + + if result.success: + matches = json.loads(result.extracted_content) + + # Group by pattern type + by_type = {} + for match in matches: + label = match['label'] + if label not in by_type: + by_type[label] = [] + by_type[label].append(match['value']) + + for pattern_type, values in by_type.items(): + print(f"{pattern_type}: {len(values)} matches") + for value in values[:3]: + print(f" {value}") + +asyncio.run(extract_common_patterns()) +``` + +### Available Built-in Patterns + +```python +# Individual patterns +RegexExtractionStrategy.Email # Email addresses +RegexExtractionStrategy.PhoneUS # US phone numbers +RegexExtractionStrategy.PhoneIntl # International phones +RegexExtractionStrategy.Url # HTTP/HTTPS URLs +RegexExtractionStrategy.Currency # Currency values ($99.99) +RegexExtractionStrategy.Percentage # Percentage values (25%) +RegexExtractionStrategy.DateIso # ISO 
dates (2024-01-01) +RegexExtractionStrategy.DateUS # US dates (01/01/2024) +RegexExtractionStrategy.IPv4 # IP addresses +RegexExtractionStrategy.CreditCard # Credit card numbers +RegexExtractionStrategy.TwitterHandle # @username +RegexExtractionStrategy.Hashtag # #hashtag + +# Use all patterns +RegexExtractionStrategy.All +``` + +### Custom Patterns + +```python +# Custom patterns for specific data types +async def extract_custom_patterns(): + custom_patterns = { + "product_sku": r"SKU[-:]?\s*([A-Z0-9]{4,12})", + "discount": r"(\d{1,2})%\s*off", + "model_number": r"Model\s*#?\s*([A-Z0-9-]+)", + "isbn": r"ISBN[-:]?\s*(\d{10}|\d{13})", + "stock_ticker": r"\$([A-Z]{2,5})", + "version": r"v(\d+\.\d+(?:\.\d+)?)" + } + + strategy = RegexExtractionStrategy(custom=custom_patterns) + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/products", + config=config + ) + + if result.success: + data = json.loads(result.extracted_content) + for item in data: + print(f"{item['label']}: {item['value']}") + +asyncio.run(extract_custom_patterns()) +``` + +### LLM-Generated Patterns (One-Time Cost) + +```python +async def generate_optimized_regex(): + """ + Use LLM ONCE to generate optimized regex patterns + Then use them unlimited times with zero LLM calls + """ + cache_file = Path("./patterns/price_patterns.json") + + if cache_file.exists(): + # Load cached patterns - NO LLM CALLS + patterns = json.load(cache_file.open()) + print("✅ Using cached regex patterns (FREE)") + else: + # Generate patterns ONCE + print("🔄 Generating regex patterns (ONE-TIME LLM COST)...") + + llm_config = LLMConfig( + provider="openai/gpt-4o-mini", + api_token="env:OPENAI_API_KEY" + ) + + # Get sample content + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/pricing") + sample_html = result.cleaned_html + + # Generate optimized patterns + patterns = 
RegexExtractionStrategy.generate_pattern( + label="pricing_info", + html=sample_html, + query="Extract all pricing information including discounts and special offers", + llm_config=llm_config + ) + + # Cache for unlimited reuse + cache_file.parent.mkdir(exist_ok=True) + json.dump(patterns, cache_file.open("w"), indent=2) + print("✅ Patterns generated and cached") + + # Use cached patterns (NO MORE LLM CALLS) + strategy = RegexExtractionStrategy(custom=patterns) + return strategy + +# Use generated patterns for unlimited extractions +strategy = asyncio.run(generate_optimized_regex()) +``` + +--- + +## 4. Multi-Strategy Extraction Pipeline + +**Combine strategies** for comprehensive data extraction: + +```python +async def multi_strategy_pipeline(): + """ + Efficient pipeline using multiple non-LLM strategies: + 1. Regex for simple patterns (fastest) + 2. Schema for structured data + 3. Only use LLM if absolutely necessary + """ + + url = "https://example.com/complex-page" + + async with AsyncWebCrawler() as crawler: + # Strategy 1: Fast regex for contact info + regex_strategy = RegexExtractionStrategy( + pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS + ) + regex_config = CrawlerRunConfig(extraction_strategy=regex_strategy) + regex_result = await crawler.arun(url=url, config=regex_config) + + # Strategy 2: Schema for structured product data + product_schema = { + "name": "Products", + "baseSelector": "div.product", + "fields": [ + {"name": "name", "selector": "h3", "type": "text"}, + {"name": "price", "selector": ".price", "type": "text"} + ] + } + css_strategy = JsonCssExtractionStrategy(product_schema) + css_config = CrawlerRunConfig(extraction_strategy=css_strategy) + css_result = await crawler.arun(url=url, config=css_config) + + # Combine results + results = { + "contacts": json.loads(regex_result.extracted_content) if regex_result.success else [], + "products": json.loads(css_result.extracted_content) if css_result.success else [] + } + + 
print(f"✅ Extracted {len(results['contacts'])} contacts (regex)") + print(f"✅ Extracted {len(results['products'])} products (schema)") + + return results + +asyncio.run(multi_strategy_pipeline()) +``` + +--- + +## 5. Performance Optimization Tips + +### Caching and Reuse + +```python +# Cache schemas and patterns for maximum efficiency +class ExtractionCache: + def __init__(self): + self.schemas = {} + self.patterns = {} + + def get_schema(self, site_name): + if site_name not in self.schemas: + schema_file = Path(f"./cache/{site_name}_schema.json") + if schema_file.exists(): + self.schemas[site_name] = json.load(schema_file.open()) + return self.schemas.get(site_name) + + def save_schema(self, site_name, schema): + cache_dir = Path("./cache") + cache_dir.mkdir(exist_ok=True) + schema_file = cache_dir / f"{site_name}_schema.json" + json.dump(schema, schema_file.open("w"), indent=2) + self.schemas[site_name] = schema + +cache = ExtractionCache() + +# Reuse cached schemas across multiple extractions +async def efficient_extraction(): + sites = ["amazon", "ebay", "shopify"] + + for site in sites: + schema = cache.get_schema(site) + if not schema: + # Generate once, cache forever + schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + query="Extract products", + llm_config=llm_config + ) + cache.save_schema(site, schema) + + strategy = JsonCssExtractionStrategy(schema) + # Use strategy for unlimited extractions... 
+``` + +### Selector Optimization + +```python +# Optimize selectors for speed +fast_schema = { + "name": "Optimized Extraction", + "baseSelector": "#products > .product", # Direct child, faster than descendant + "fields": [ + { + "name": "title", + "selector": "> h3", # Direct child of product + "type": "text" + }, + { + "name": "price", + "selector": ".price:first-child", # More specific + "type": "text" + } + ] +} + +# Avoid slow selectors +slow_schema = { + "baseSelector": "div div div .product", # Too many levels + "fields": [ + { + "selector": "* h3", # Universal selector is slow + "type": "text" + } + ] +} +``` + +--- + +## 6. Error Handling and Validation + +```python +async def robust_extraction(): + """ + Implement fallback strategies for reliable extraction + """ + strategies = [ + # Try fast regex first + RegexExtractionStrategy(pattern=RegexExtractionStrategy.Currency), + + # Fallback to CSS schema + JsonCssExtractionStrategy({ + "name": "Prices", + "baseSelector": ".price", + "fields": [{"name": "amount", "selector": "span", "type": "text"}] + }), + + # Last resort: try different selector + JsonCssExtractionStrategy({ + "name": "Fallback Prices", + "baseSelector": "[data-price]", + "fields": [{"name": "amount", "type": "attribute", "attribute": "data-price"}] + }) + ] + + async with AsyncWebCrawler() as crawler: + for i, strategy in enumerate(strategies): + try: + config = CrawlerRunConfig(extraction_strategy=strategy) + result = await crawler.arun(url="https://example.com", config=config) + + if result.success and result.extracted_content: + data = json.loads(result.extracted_content) + if data: # Validate non-empty results + print(f"✅ Success with strategy {i+1}: {strategy.__class__.__name__}") + return data + + except Exception as e: + print(f"❌ Strategy {i+1} failed: {e}") + continue + + print("❌ All strategies failed") + return None + +# Validate extracted data +def validate_extraction(data, required_fields): + """Validate that extraction 
contains expected fields""" + if not data or not isinstance(data, list): + return False + + for item in data: + for field in required_fields: + if field not in item or not item[field]: + return False + return True + +# Usage +result = asyncio.run(robust_extraction()) +if validate_extraction(result, ["amount"]): + print("✅ Extraction validated") +else: + print("❌ Validation failed") +``` + +--- + +## 7. Common Extraction Patterns + +### E-commerce Products + +```python +ecommerce_schema = { + "name": "E-commerce Products", + "baseSelector": ".product, [data-product], .item", + "fields": [ + {"name": "title", "selector": "h1, h2, h3, .title, .name", "type": "text"}, + {"name": "price", "selector": ".price, .cost, [data-price]", "type": "text"}, + {"name": "image", "selector": "img", "type": "attribute", "attribute": "src"}, + {"name": "url", "selector": "a", "type": "attribute", "attribute": "href"}, + {"name": "rating", "selector": ".rating, .stars", "type": "text"}, + {"name": "availability", "selector": ".stock, .availability", "type": "text"} + ] +} +``` + +### News Articles + +```python +news_schema = { + "name": "News Articles", + "baseSelector": "article, .article, .post", + "fields": [ + {"name": "headline", "selector": "h1, h2, .headline, .title", "type": "text"}, + {"name": "author", "selector": ".author, .byline, [rel='author']", "type": "text"}, + {"name": "date", "selector": "time, .date, .published", "type": "text"}, + {"name": "content", "selector": ".content, .body, .text", "type": "text"}, + {"name": "category", "selector": ".category, .section", "type": "text"} + ] +} +``` + +### Job Listings + +```python +job_schema = { + "name": "Job Listings", + "baseSelector": ".job, .listing, [data-job]", + "fields": [ + {"name": "title", "selector": ".job-title, h2, h3", "type": "text"}, + {"name": "company", "selector": ".company, .employer", "type": "text"}, + {"name": "location", "selector": ".location, .place", "type": "text"}, + {"name": "salary", "selector": 
".salary, .pay, .compensation", "type": "text"}, + {"name": "description", "selector": ".description, .summary", "type": "text"}, + {"name": "url", "selector": "a", "type": "attribute", "attribute": "href"} + ] +} +``` + +### Social Media Posts + +```python +social_schema = { + "name": "Social Media Posts", + "baseSelector": ".post, .tweet, .update", + "fields": [ + {"name": "username", "selector": ".username, .handle, .author", "type": "text"}, + {"name": "content", "selector": ".content, .text, .message", "type": "text"}, + {"name": "timestamp", "selector": ".time, .date, time", "type": "text"}, + {"name": "likes", "selector": ".likes, .hearts", "type": "text"}, + {"name": "shares", "selector": ".shares, .retweets", "type": "text"} + ] +} +``` + +--- + +## 8. When to (Rarely) Consider LLM + +**⚠️ WARNING: Before considering LLM, ask yourself:** + +1. "Can I identify repeating HTML patterns?" → Use CSS/XPath schema +2. "Am I extracting simple data types?" → Use Regex patterns +3. "Can I provide a JSON example of what I want?" → Use generate_schema() +4. "Is this truly unstructured text requiring semantic understanding?" → Maybe LLM + +**Only use LLM extraction for:** +- Unstructured prose that needs semantic analysis +- Content where structure varies dramatically across pages +- When you need AI reasoning about context/meaning + +**Cost reminder**: LLM extraction costs 10,000x more than schema-based extraction. + +--- + +## 9. Summary: The Extraction Hierarchy + +1. **🥇 FIRST CHOICE**: `generate_schema()` - AI generates pattern once, use unlimited times +2. **🥈 SECOND CHOICE**: Manual CSS/XPath - Full control, maximum speed +3. **🥉 THIRD CHOICE**: Regex patterns - Simple data types, lightning fast +4. **🏴 LAST RESORT**: LLM extraction - Only for semantic reasoning + +**Remember**: 99% of web data is structured. You almost never need LLM for extraction. Save LLM for analysis, not extraction. 
+ +**Performance**: Non-LLM strategies are 100-1000x faster and 10,000x cheaper than LLM extraction. + +--- + +**📖 Next**: If you absolutely must use LLM extraction, see [extraction-llm.md](./extraction-llm.md) for guidance on the rare cases where it's justified. +--- + + +## Multi-URL Crawling + +Concurrent crawling of multiple URLs with intelligent resource management, rate limiting, and real-time monitoring. + +### Basic Multi-URL Crawling + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +# Batch processing (default) - get all results at once +async def batch_crawl(): + urls = [ + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page3" + ] + + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + stream=False # Default: batch mode + ) + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many(urls, config=config) + + for result in results: + if result.success: + print(f"✅ {result.url}: {len(result.markdown)} chars") + else: + print(f"❌ {result.url}: {result.error_message}") + +# Streaming processing - handle results as they complete +async def streaming_crawl(): + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + stream=True # Enable streaming + ) + + async with AsyncWebCrawler() as crawler: + # Process results as they become available + async for result in await crawler.arun_many(urls, config=config): + if result.success: + print(f"🔥 Just completed: {result.url}") + await process_result_immediately(result) + else: + print(f"❌ Failed: {result.url}") +``` + +### Memory-Adaptive Dispatching + +```python +from crawl4ai import AsyncWebCrawler, MemoryAdaptiveDispatcher, CrawlerMonitor, DisplayMode + +# Automatically manages concurrency based on system memory +async def memory_adaptive_crawl(): + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=80.0, # Pause if memory exceeds 80% + check_interval=1.0, # Check memory every second + 
max_session_permit=15, # Max concurrent tasks + memory_wait_timeout=300.0 # Wait up to 5 minutes for memory + ) + + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + word_count_threshold=50 + ) + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many( + urls=large_url_list, + config=config, + dispatcher=dispatcher + ) + + # Each result includes dispatch information + for result in results: + if result.dispatch_result: + dr = result.dispatch_result + print(f"Memory used: {dr.memory_usage:.1f}MB") + print(f"Duration: {dr.end_time - dr.start_time}") +``` + +### Rate-Limited Crawling + +```python +from crawl4ai import RateLimiter, SemaphoreDispatcher + +# Control request pacing and handle server rate limits +async def rate_limited_crawl(): + rate_limiter = RateLimiter( + base_delay=(1.0, 3.0), # Random delay 1-3 seconds + max_delay=60.0, # Cap backoff at 60 seconds + max_retries=3, # Retry failed requests 3 times + rate_limit_codes=[429, 503] # Handle these status codes + ) + + dispatcher = SemaphoreDispatcher( + max_session_permit=5, # Fixed concurrency limit + rate_limiter=rate_limiter + ) + + config = CrawlerRunConfig( + user_agent_mode="random", # Randomize user agents + simulate_user=True # Simulate human behavior + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun_many( + urls=urls, + config=config, + dispatcher=dispatcher + ): + print(f"Processed: {result.url}") +``` + +### Real-Time Monitoring + +```python +from crawl4ai import CrawlerMonitor, DisplayMode + +# Monitor crawling progress in real-time +async def monitored_crawl(): + monitor = CrawlerMonitor( + max_visible_rows=20, # Show 20 tasks in display + display_mode=DisplayMode.DETAILED # Show individual task details + ) + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=75.0, + max_session_permit=10, + monitor=monitor # Attach monitor to dispatcher + ) + + async with AsyncWebCrawler() as crawler: + results = await 
crawler.arun_many( + urls=urls, + dispatcher=dispatcher + ) +``` + +### Advanced Dispatcher Configurations + +```python +# Memory-adaptive with comprehensive monitoring +memory_dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=85.0, # Higher memory tolerance + check_interval=0.5, # Check memory more frequently + max_session_permit=20, # More concurrent tasks + memory_wait_timeout=600.0, # Wait longer for memory + rate_limiter=RateLimiter( + base_delay=(0.5, 1.5), + max_delay=30.0, + max_retries=5 + ), + monitor=CrawlerMonitor( + max_visible_rows=15, + display_mode=DisplayMode.AGGREGATED # Summary view + ) +) + +# Simple semaphore-based dispatcher +semaphore_dispatcher = SemaphoreDispatcher( + max_session_permit=8, # Fixed concurrency + rate_limiter=RateLimiter( + base_delay=(1.0, 2.0), + max_delay=20.0 + ) +) + +# Usage with custom dispatcher +async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many( + urls=urls, + config=config, + dispatcher=memory_dispatcher # or semaphore_dispatcher + ) +``` + +### Handling Large-Scale Crawling + +```python +async def large_scale_crawl(): + # For thousands of URLs + urls = load_urls_from_file("large_url_list.txt") # 10,000+ URLs + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=70.0, # Conservative memory usage + max_session_permit=25, # Higher concurrency + rate_limiter=RateLimiter( + base_delay=(0.1, 0.5), # Faster for large batches + max_retries=2 # Fewer retries for speed + ), + monitor=CrawlerMonitor(display_mode=DisplayMode.AGGREGATED) + ) + + config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, # Use caching for efficiency + stream=True, # Stream for memory efficiency + word_count_threshold=100, # Skip short content + exclude_external_links=True # Reduce processing overhead + ) + + successful_crawls = 0 + failed_crawls = 0 + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun_many( + urls=urls, + config=config, + 
dispatcher=dispatcher + ): + if result.success: + successful_crawls += 1 + await save_result_to_database(result) + else: + failed_crawls += 1 + await log_failure(result.url, result.error_message) + + # Progress reporting + if (successful_crawls + failed_crawls) % 100 == 0: + print(f"Progress: {successful_crawls + failed_crawls}/{len(urls)}") + + print(f"Completed: {successful_crawls} successful, {failed_crawls} failed") +``` + +### Robots.txt Compliance + +```python +async def compliant_crawl(): + config = CrawlerRunConfig( + check_robots_txt=True, # Respect robots.txt + user_agent="MyBot/1.0", # Identify your bot + mean_delay=2.0, # Be polite with delays + max_range=1.0 + ) + + dispatcher = SemaphoreDispatcher( + max_session_permit=3, # Conservative concurrency + rate_limiter=RateLimiter( + base_delay=(2.0, 5.0), # Slower, more respectful + max_retries=1 + ) + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun_many( + urls=urls, + config=config, + dispatcher=dispatcher + ): + if result.success: + print(f"✅ Crawled: {result.url}") + elif "robots.txt" in result.error_message: + print(f"🚫 Blocked by robots.txt: {result.url}") + else: + print(f"❌ Error: {result.url}") +``` + +### Performance Analysis + +```python +async def analyze_crawl_performance(): + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=80.0, + max_session_permit=12, + monitor=CrawlerMonitor(display_mode=DisplayMode.DETAILED) + ) + + start_time = time.time() + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many( + urls=urls, + dispatcher=dispatcher + ) + + end_time = time.time() + + # Analyze results + successful = [r for r in results if r.success] + failed = [r for r in results if not r.success] + + print(f"Total time: {end_time - start_time:.2f}s") + print(f"Success rate: {len(successful)}/{len(results)} ({len(successful)/len(results)*100:.1f}%)") + print(f"Avg time per URL: {(end_time - start_time)/len(results):.2f}s") 
+ + # Memory usage analysis + if successful and successful[0].dispatch_result: + memory_usage = [r.dispatch_result.memory_usage for r in successful if r.dispatch_result] + peak_memory = [r.dispatch_result.peak_memory for r in successful if r.dispatch_result] + + print(f"Avg memory usage: {sum(memory_usage)/len(memory_usage):.1f}MB") + print(f"Peak memory usage: {max(peak_memory):.1f}MB") +``` + +### Error Handling and Recovery + +```python +async def robust_multi_crawl(): + failed_urls = [] + + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + stream=True, + page_timeout=30000 # 30 second timeout + ) + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=85.0, + max_session_permit=10 + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun_many( + urls=urls, + config=config, + dispatcher=dispatcher + ): + if result.success: + await process_successful_result(result) + else: + failed_urls.append({ + 'url': result.url, + 'error': result.error_message, + 'status_code': result.status_code + }) + + # Retry logic for specific errors + if result.status_code in [503, 429]: # Server errors + await schedule_retry(result.url) + + # Report failures + if failed_urls: + print(f"Failed to crawl {len(failed_urls)} URLs:") + for failure in failed_urls[:10]: # Show first 10 + print(f" {failure['url']}: {failure['error']}") +``` + +**📖 Learn more:** [Advanced Multi-URL Crawling](https://docs.crawl4ai.com/advanced/multi-url-crawling/), [Crawl Dispatcher](https://docs.crawl4ai.com/advanced/crawl-dispatcher/), [arun_many() API Reference](https://docs.crawl4ai.com/api/arun_many/) +--- + + +## Deep Crawling + +Multi-level website exploration with intelligent filtering, scoring, and prioritization strategies. 
+ +### Basic Deep Crawl Setup + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.deep_crawling import BFSDeepCrawlStrategy +from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy + +# Basic breadth-first deep crawling +async def basic_deep_crawl(): + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, # Initial page + 2 levels + include_external=False # Stay within same domain + ), + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True + ) + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun("https://docs.crawl4ai.com", config=config) + + # Group results by depth + pages_by_depth = {} + for result in results: + depth = result.metadata.get("depth", 0) + if depth not in pages_by_depth: + pages_by_depth[depth] = [] + pages_by_depth[depth].append(result.url) + + print(f"Crawled {len(results)} pages total") + for depth, urls in sorted(pages_by_depth.items()): + print(f"Depth {depth}: {len(urls)} pages") +``` + +### Deep Crawl Strategies + +```python +from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, DFSDeepCrawlStrategy, BestFirstCrawlingStrategy +from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer + +# Breadth-First Search - explores all links at one depth before going deeper +bfs_strategy = BFSDeepCrawlStrategy( + max_depth=2, + include_external=False, + max_pages=50, # Limit total pages + score_threshold=0.3 # Minimum score for URLs +) + +# Depth-First Search - explores as deep as possible before backtracking +dfs_strategy = DFSDeepCrawlStrategy( + max_depth=2, + include_external=False, + max_pages=30, + score_threshold=0.5 +) + +# Best-First - prioritizes highest scoring pages (recommended) +keyword_scorer = KeywordRelevanceScorer( + keywords=["crawl", "example", "async", "configuration"], + weight=0.7 +) + +best_first_strategy = BestFirstCrawlingStrategy( + max_depth=2, + include_external=False, + url_scorer=keyword_scorer, + 
max_pages=25 # No score_threshold needed - naturally prioritizes +) + +# Usage +config = CrawlerRunConfig( + deep_crawl_strategy=best_first_strategy, # Choose your strategy + scraping_strategy=LXMLWebScrapingStrategy() +) +``` + +### Streaming vs Batch Processing + +```python +# Batch mode - wait for all results +async def batch_deep_crawl(): + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1), + stream=False # Default - collect all results first + ) + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun("https://example.com", config=config) + + # Process all results at once + for result in results: + print(f"Batch processed: {result.url}") + +# Streaming mode - process results as they arrive +async def streaming_deep_crawl(): + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1), + stream=True # Process results immediately + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun("https://example.com", config=config): + depth = result.metadata.get("depth", 0) + print(f"Stream processed depth {depth}: {result.url}") +``` + +### Filtering with Filter Chains + +```python +from crawl4ai.deep_crawling.filters import ( + FilterChain, + URLPatternFilter, + DomainFilter, + ContentTypeFilter, + SEOFilter, + ContentRelevanceFilter +) + +# Single URL pattern filter +url_filter = URLPatternFilter(patterns=["*core*", "*guide*"]) + +config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=1, + filter_chain=FilterChain([url_filter]) + ) +) + +# Multiple filters in chain +advanced_filter_chain = FilterChain([ + # Domain filtering + DomainFilter( + allowed_domains=["docs.example.com"], + blocked_domains=["old.docs.example.com", "staging.example.com"] + ), + + # URL pattern matching + URLPatternFilter(patterns=["*tutorial*", "*guide*", "*blog*"]), + + # Content type filtering + ContentTypeFilter(allowed_types=["text/html"]), + + # SEO 
quality filter + SEOFilter( + threshold=0.5, + keywords=["tutorial", "guide", "documentation"] + ), + + # Content relevance filter + ContentRelevanceFilter( + query="Web crawling and data extraction with Python", + threshold=0.7 + ) +]) + +config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, + filter_chain=advanced_filter_chain + ) +) +``` + +### Intelligent Crawling with Scorers + +```python +from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer + +# Keyword relevance scoring +async def scored_deep_crawl(): + keyword_scorer = KeywordRelevanceScorer( + keywords=["browser", "crawler", "web", "automation"], + weight=1.0 + ) + + config = CrawlerRunConfig( + deep_crawl_strategy=BestFirstCrawlingStrategy( + max_depth=2, + include_external=False, + url_scorer=keyword_scorer + ), + stream=True, # Recommended with BestFirst + verbose=True + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun("https://docs.crawl4ai.com", config=config): + score = result.metadata.get("score", 0) + depth = result.metadata.get("depth", 0) + print(f"Depth: {depth} | Score: {score:.2f} | {result.url}") +``` + +### Limiting Crawl Size + +```python +# Max pages limitation across strategies +async def limited_crawls(): + # BFS with page limit + bfs_config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, + max_pages=5, # Only crawl 5 pages total + url_scorer=KeywordRelevanceScorer(keywords=["browser", "crawler"], weight=1.0) + ) + ) + + # DFS with score threshold + dfs_config = CrawlerRunConfig( + deep_crawl_strategy=DFSDeepCrawlStrategy( + max_depth=2, + score_threshold=0.7, # Only URLs with scores above 0.7 + max_pages=10, + url_scorer=KeywordRelevanceScorer(keywords=["web", "automation"], weight=1.0) + ) + ) + + # Best-First with both constraints + bf_config = CrawlerRunConfig( + deep_crawl_strategy=BestFirstCrawlingStrategy( + max_depth=2, + max_pages=7, # Automatically gets highest 
scored pages + url_scorer=KeywordRelevanceScorer(keywords=["crawl", "example"], weight=1.0) + ), + stream=True + ) + + async with AsyncWebCrawler() as crawler: + # Use any of the configs + async for result in await crawler.arun("https://docs.crawl4ai.com", config=bf_config): + score = result.metadata.get("score", 0) + print(f"Score: {score:.2f} | {result.url}") +``` + +### Complete Advanced Deep Crawler + +```python +async def comprehensive_deep_crawl(): + # Sophisticated filter chain + filter_chain = FilterChain([ + DomainFilter( + allowed_domains=["docs.crawl4ai.com"], + blocked_domains=["old.docs.crawl4ai.com"] + ), + URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]), + ContentTypeFilter(allowed_types=["text/html"]), + SEOFilter(threshold=0.4, keywords=["crawl", "tutorial", "guide"]) + ]) + + # Multi-keyword scorer + keyword_scorer = KeywordRelevanceScorer( + keywords=["crawl", "example", "async", "configuration", "browser"], + weight=0.8 + ) + + # Complete configuration + config = CrawlerRunConfig( + deep_crawl_strategy=BestFirstCrawlingStrategy( + max_depth=2, + include_external=False, + filter_chain=filter_chain, + url_scorer=keyword_scorer, + max_pages=20 + ), + scraping_strategy=LXMLWebScrapingStrategy(), + stream=True, + verbose=True, + cache_mode=CacheMode.BYPASS + ) + + # Execute and analyze + results = [] + start_time = time.time() + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun("https://docs.crawl4ai.com", config=config): + results.append(result) + score = result.metadata.get("score", 0) + depth = result.metadata.get("depth", 0) + print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}") + + # Performance analysis + duration = time.time() - start_time + avg_score = sum(r.metadata.get('score', 0) for r in results) / len(results) + + print(f"✅ Crawled {len(results)} pages in {duration:.2f}s") + print(f"✅ Average relevance score: {avg_score:.2f}") + + # Depth distribution + depth_counts = {} + for 
result in results: + depth = result.metadata.get("depth", 0) + depth_counts[depth] = depth_counts.get(depth, 0) + 1 + + for depth, count in sorted(depth_counts.items()): + print(f"📊 Depth {depth}: {count} pages") +``` + +### Error Handling and Robustness + +```python +async def robust_deep_crawl(): + config = CrawlerRunConfig( + deep_crawl_strategy=BestFirstCrawlingStrategy( + max_depth=2, + max_pages=15, + url_scorer=KeywordRelevanceScorer(keywords=["guide", "tutorial"]) + ), + stream=True, + page_timeout=30000 # 30 second timeout per page + ) + + successful_pages = [] + failed_pages = [] + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun("https://docs.crawl4ai.com", config=config): + if result.success: + successful_pages.append(result) + depth = result.metadata.get("depth", 0) + score = result.metadata.get("score", 0) + print(f"✅ Depth {depth} | Score: {score:.2f} | {result.url}") + else: + failed_pages.append({ + 'url': result.url, + 'error': result.error_message, + 'depth': result.metadata.get("depth", 0) + }) + print(f"❌ Failed: {result.url} - {result.error_message}") + + print(f"📊 Results: {len(successful_pages)} successful, {len(failed_pages)} failed") + + # Analyze failures by depth + if failed_pages: + failure_by_depth = {} + for failure in failed_pages: + depth = failure['depth'] + failure_by_depth[depth] = failure_by_depth.get(depth, 0) + 1 + + print("❌ Failures by depth:") + for depth, count in sorted(failure_by_depth.items()): + print(f" Depth {depth}: {count} failures") +``` + +**📖 Learn more:** [Deep Crawling Guide](https://docs.crawl4ai.com/core/deep-crawling/), [Filter Documentation](https://docs.crawl4ai.com/core/content-selection/), [Scoring Strategies](https://docs.crawl4ai.com/advanced/advanced-features/) +--- + + +## Docker Deployment + +Complete Docker deployment guide with pre-built images, API endpoints, configuration, and MCP integration. 
+ +### Quick Start with Pre-built Images + +```bash +# Pull latest image +docker pull unclecode/crawl4ai:latest + +# Setup LLM API keys +cat > .llm.env << EOL +OPENAI_API_KEY=sk-your-key +ANTHROPIC_API_KEY=your-anthropic-key +GROQ_API_KEY=your-groq-key +GEMINI_API_TOKEN=your-gemini-token +EOL + +# Run with LLM support +docker run -d \ + -p 11235:11235 \ + --name crawl4ai \ + --env-file .llm.env \ + --shm-size=1g \ + unclecode/crawl4ai:latest + +# Basic run (no LLM) +docker run -d \ + -p 11235:11235 \ + --name crawl4ai \ + --shm-size=1g \ + unclecode/crawl4ai:latest + +# Check health +curl http://localhost:11235/health +``` + +### Docker Compose Deployment + +```bash +# Clone and setup +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai +cp deploy/docker/.llm.env.example .llm.env +# Edit .llm.env with your API keys + +# Run pre-built image +IMAGE=unclecode/crawl4ai:latest docker compose up -d + +# Build locally +docker compose up --build -d + +# Build with all features +INSTALL_TYPE=all docker compose up --build -d + +# Build with GPU support +ENABLE_GPU=true docker compose up --build -d + +# Stop service +docker compose down +``` + +### Manual Build with Multi-Architecture + +```bash +# Clone repository +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai + +# Build for current architecture +docker buildx build -t crawl4ai-local:latest --load . + +# Build for multiple architectures +docker buildx build --platform linux/amd64,linux/arm64 \ + -t crawl4ai-local:latest --load . + +# Build with specific features +docker buildx build \ + --build-arg INSTALL_TYPE=all \ + --build-arg ENABLE_GPU=false \ + -t crawl4ai-local:latest --load . 
+ +# Run custom build +docker run -d \ + -p 11235:11235 \ + --name crawl4ai-custom \ + --env-file .llm.env \ + --shm-size=1g \ + crawl4ai-local:latest +``` + +### Build Arguments + +```bash +# Available build options +docker buildx build \ + --build-arg INSTALL_TYPE=all \ # default|all|torch|transformer + --build-arg ENABLE_GPU=true \ # true|false + --build-arg APP_HOME=/app \ # Install path + --build-arg USE_LOCAL=true \ # Use local source + --build-arg GITHUB_REPO=url \ # Git repo if USE_LOCAL=false + --build-arg GITHUB_BRANCH=main \ # Git branch + -t crawl4ai-custom:latest --load . +``` + +### Core API Endpoints + +```python +# Main crawling endpoints +import requests +import json + +# Basic crawl +payload = { + "urls": ["https://example.com"], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}} +} +response = requests.post("http://localhost:11235/crawl", json=payload) + +# Streaming crawl +payload["crawler_config"]["params"]["stream"] = True +response = requests.post("http://localhost:11235/crawl/stream", json=payload) + +# Health check +response = requests.get("http://localhost:11235/health") + +# API schema +response = requests.get("http://localhost:11235/schema") + +# Metrics (Prometheus format) +response = requests.get("http://localhost:11235/metrics") +``` + +### Specialized Endpoints + +```python +# HTML extraction (preprocessed for schema) +response = requests.post("http://localhost:11235/html", + json={"url": "https://example.com"}) + +# Screenshot capture +response = requests.post("http://localhost:11235/screenshot", json={ + "url": "https://example.com", + "screenshot_wait_for": 2, + "output_path": "/path/to/save/screenshot.png" +}) + +# PDF generation +response = requests.post("http://localhost:11235/pdf", json={ + "url": "https://example.com", + "output_path": "/path/to/save/document.pdf" +}) + +# JavaScript execution +response = 
requests.post("http://localhost:11235/execute_js", json={ + "url": "https://example.com", + "scripts": [ + "return document.title", + "return Array.from(document.querySelectorAll('a')).map(a => a.href)" + ] +}) + +# Markdown generation +response = requests.post("http://localhost:11235/md", json={ + "url": "https://example.com", + "f": "fit", # raw|fit|bm25|llm + "q": "extract main content", # query for filtering + "c": "0" # cache: 0=bypass, 1=use +}) + +# LLM Q&A +response = requests.get("http://localhost:11235/llm/https://example.com?q=What is this page about?") + +# Library context (for AI assistants) +response = requests.get("http://localhost:11235/ask", params={ + "context_type": "all", # code|doc|all + "query": "how to use extraction strategies", + "score_ratio": 0.5, + "max_results": 20 +}) +``` + +### Python SDK Usage + +```python +import asyncio +from crawl4ai.docker_client import Crawl4aiDockerClient +from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode + +async def main(): + async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client: + # Non-streaming crawl + results = await client.crawl( + ["https://example.com"], + browser_config=BrowserConfig(headless=True), + crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + + for result in results: + print(f"URL: {result.url}, Success: {result.success}") + print(f"Content length: {len(result.markdown)}") + + # Streaming crawl + stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS) + async for result in await client.crawl( + ["https://example.com", "https://python.org"], + browser_config=BrowserConfig(headless=True), + crawler_config=stream_config + ): + print(f"Streamed: {result.url} - {result.success}") + + # Get API schema + schema = await client.get_schema() + print(f"Schema available: {bool(schema)}") + +asyncio.run(main()) +``` + +### Advanced API Configuration + +```python +# Complex extraction with LLM +payload = { + "urls": 
["https://example.com"], + "browser_config": { + "type": "BrowserConfig", + "params": { + "headless": True, + "viewport": {"type": "dict", "value": {"width": 1200, "height": 800}} + } + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "LLMExtractionStrategy", + "params": { + "llm_config": { + "type": "LLMConfig", + "params": { + "provider": "openai/gpt-4o-mini", + "api_token": "env:OPENAI_API_KEY" + } + }, + "schema": { + "type": "dict", + "value": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "content": {"type": "string"} + } + } + }, + "instruction": "Extract title and main content" + } + }, + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": {"threshold": 0.6} + } + } + } + } + } +} + +response = requests.post("http://localhost:11235/crawl", json=payload) +``` + +### CSS Extraction Strategy + +```python +# CSS-based structured extraction +schema = { + "name": "ProductList", + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": "h2", "type": "text"}, + {"name": "price", "selector": ".price", "type": "text"}, + {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"} + ] +} + +payload = { + "urls": ["https://example-shop.com"], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { + "schema": {"type": "dict", "value": schema} + } + } + } + } +} + +response = requests.post("http://localhost:11235/crawl", json=payload) +data = response.json() +extracted = json.loads(data["results"][0]["extracted_content"]) +``` + +### MCP (Model Context Protocol) Integration + +```bash +# Add Crawl4AI as MCP provider to Claude Code +claude mcp add --transport sse c4ai-sse http://localhost:11235/mcp/sse 
+ +# List MCP providers +claude mcp list + +# Test MCP connection +python tests/mcp/test_mcp_socket.py + +# Available MCP endpoints +# SSE: http://localhost:11235/mcp/sse +# WebSocket: ws://localhost:11235/mcp/ws +# Schema: http://localhost:11235/mcp/schema +``` + +Available MCP tools: +- `md` - Generate markdown from web content +- `html` - Extract preprocessed HTML +- `screenshot` - Capture webpage screenshots +- `pdf` - Generate PDF documents +- `execute_js` - Run JavaScript on web pages +- `crawl` - Perform multi-URL crawling +- `ask` - Query Crawl4AI library context + +### Configuration Management + +```yaml +# config.yml structure +app: + title: "Crawl4AI API" + version: "1.0.0" + host: "0.0.0.0" + port: 11235 + timeout_keep_alive: 300 + +llm: + provider: "openai/gpt-4o-mini" + api_key_env: "OPENAI_API_KEY" + +security: + enabled: false + jwt_enabled: false + trusted_hosts: ["*"] + +crawler: + memory_threshold_percent: 95.0 + rate_limiter: + base_delay: [1.0, 2.0] + timeouts: + stream_init: 30.0 + batch_process: 300.0 + pool: + max_pages: 40 + idle_ttl_sec: 1800 + +rate_limiting: + enabled: true + default_limit: "1000/minute" + storage_uri: "memory://" + +logging: + level: "INFO" + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +``` + +### Custom Configuration Deployment + +```bash +# Method 1: Mount custom config +docker run -d -p 11235:11235 \ + --name crawl4ai-custom \ + --env-file .llm.env \ + --shm-size=1g \ + -v $(pwd)/my-config.yml:/app/config.yml \ + unclecode/crawl4ai:latest + +# Method 2: Build with custom config +# Edit deploy/docker/config.yml then build +docker buildx build -t crawl4ai-custom:latest --load . 
+``` + +### Monitoring and Health Checks + +```bash +# Health endpoint +curl http://localhost:11235/health + +# Prometheus metrics +curl http://localhost:11235/metrics + +# Configuration validation +curl -X POST http://localhost:11235/config/dump \ + -H "Content-Type: application/json" \ + -d '{"code": "CrawlerRunConfig(cache_mode=\"BYPASS\", screenshot=True)"}' +``` + +### Playground Interface + +Access the interactive playground at `http://localhost:11235/playground` for: +- Testing configurations with visual interface +- Generating JSON payloads for REST API +- Converting Python config to JSON format +- Testing crawl operations directly in browser + +### Async Job Processing + +```python +# Submit job for async processing +import time + +# Submit crawl job +response = requests.post("http://localhost:11235/crawl/job", json=payload) +task_id = response.json()["task_id"] + +# Poll for completion +while True: + result = requests.get(f"http://localhost:11235/crawl/job/{task_id}") + status = result.json() + + if status["status"] in ["COMPLETED", "FAILED"]: + break + time.sleep(1.5) + +print("Final result:", status) +``` + +### Production Deployment + +```bash +# Production-ready deployment +docker run -d \ + --name crawl4ai-prod \ + --restart unless-stopped \ + -p 11235:11235 \ + --env-file .llm.env \ + --shm-size=2g \ + --memory=8g \ + --cpus=4 \ + -v /path/to/custom-config.yml:/app/config.yml \ + unclecode/crawl4ai:latest + +# With Docker Compose for production +version: '3.8' +services: + crawl4ai: + image: unclecode/crawl4ai:latest + ports: + - "11235:11235" + environment: + - OPENAI_API_KEY=${OPENAI_API_KEY} + volumes: + - ./config.yml:/app/config.yml + shm_size: 2g + deploy: + resources: + limits: + memory: 8G + cpus: '4' + restart: unless-stopped +``` + +### Configuration Validation and JSON Structure + +```python +# Method 1: Create config objects and dump to see expected JSON structure +from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, 
CacheMode +from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy +import json + +# Create browser config and see JSON structure +browser_config = BrowserConfig( + headless=True, + viewport_width=1280, + viewport_height=720, + proxy="http://user:pass@proxy:8080" +) + +# Get JSON structure +browser_json = browser_config.dump() +print("BrowserConfig JSON structure:") +print(json.dumps(browser_json, indent=2)) + +# Create crawler config with extraction strategy +schema = { + "name": "Articles", + "baseSelector": ".article", + "fields": [ + {"name": "title", "selector": "h2", "type": "text"}, + {"name": "content", "selector": ".content", "type": "html"} + ] +} + +crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True, + extraction_strategy=JsonCssExtractionStrategy(schema), + js_code=["window.scrollTo(0, document.body.scrollHeight);"], + wait_for="css:.loaded" +) + +crawler_json = crawler_config.dump() +print("\nCrawlerRunConfig JSON structure:") +print(json.dumps(crawler_json, indent=2)) +``` + +### Reverse Validation - JSON to Objects + +```python +# Method 2: Load JSON back to config objects for validation +from crawl4ai.async_configs import from_serializable_dict + +# Test JSON structure by converting back to objects +test_browser_json = { + "type": "BrowserConfig", + "params": { + "headless": True, + "viewport_width": 1280, + "proxy": "http://user:pass@proxy:8080" + } +} + +try: + # Convert JSON back to object + restored_browser = from_serializable_dict(test_browser_json) + print(f"✅ Valid BrowserConfig: {type(restored_browser)}") + print(f"Headless: {restored_browser.headless}") + print(f"Proxy: {restored_browser.proxy}") +except Exception as e: + print(f"❌ Invalid BrowserConfig JSON: {e}") + +# Test complex crawler config JSON +test_crawler_json = { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "bypass", + "screenshot": True, + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { 
+ "schema": { + "type": "dict", + "value": { + "name": "Products", + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": "h3", "type": "text"} + ] + } + } + } + } + } +} + +try: + restored_crawler = from_serializable_dict(test_crawler_json) + print(f"✅ Valid CrawlerRunConfig: {type(restored_crawler)}") + print(f"Cache mode: {restored_crawler.cache_mode}") + print(f"Has extraction strategy: {restored_crawler.extraction_strategy is not None}") +except Exception as e: + print(f"❌ Invalid CrawlerRunConfig JSON: {e}") +``` + +### Using Server's /config/dump Endpoint for Validation + +```python +import requests + +# Method 3: Use server endpoint to validate configuration syntax +def validate_config_with_server(config_code: str) -> dict: + """Validate configuration using server's /config/dump endpoint""" + response = requests.post( + "http://localhost:11235/config/dump", + json={"code": config_code} + ) + + if response.status_code == 200: + print("✅ Valid configuration syntax") + return response.json() + else: + print(f"❌ Invalid configuration: {response.status_code}") + print(response.json()) + return None + +# Test valid configuration +valid_config = """ +CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True, + js_code=["window.scrollTo(0, document.body.scrollHeight);"], + wait_for="css:.content-loaded" +) +""" + +result = validate_config_with_server(valid_config) +if result: + print("Generated JSON structure:") + print(json.dumps(result, indent=2)) + +# Test invalid configuration (should fail) +invalid_config = """ +CrawlerRunConfig( + cache_mode="invalid_mode", + screenshot=True, + js_code=some_function() # This will fail +) +""" + +validate_config_with_server(invalid_config) +``` + +### Configuration Builder Helper + +```python +def build_and_validate_request(urls, browser_params=None, crawler_params=None): + """Helper to build and validate complete request payload""" + + # Create configurations + browser_config = 
BrowserConfig(**(browser_params or {})) + crawler_config = CrawlerRunConfig(**(crawler_params or {})) + + # Build complete request payload + payload = { + "urls": urls if isinstance(urls, list) else [urls], + "browser_config": browser_config.dump(), + "crawler_config": crawler_config.dump() + } + + print("✅ Complete request payload:") + print(json.dumps(payload, indent=2)) + + # Validate by attempting to reconstruct + try: + test_browser = from_serializable_dict(payload["browser_config"]) + test_crawler = from_serializable_dict(payload["crawler_config"]) + print("✅ Payload validation successful") + return payload + except Exception as e: + print(f"❌ Payload validation failed: {e}") + return None + +# Example usage +payload = build_and_validate_request( + urls=["https://example.com"], + browser_params={"headless": True, "viewport_width": 1280}, + crawler_params={ + "cache_mode": CacheMode.BYPASS, + "screenshot": True, + "word_count_threshold": 10 + } +) + +if payload: + # Send to server + response = requests.post("http://localhost:11235/crawl", json=payload) + print(f"Server response: {response.status_code}") +``` + +### Common JSON Structure Patterns + +```python +# Pattern 1: Simple primitive values +simple_config = { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "bypass", # String enum value + "screenshot": True, # Boolean + "page_timeout": 60000 # Integer + } +} + +# Pattern 2: Nested objects +nested_config = { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "LLMExtractionStrategy", + "params": { + "llm_config": { + "type": "LLMConfig", + "params": { + "provider": "openai/gpt-4o-mini", + "api_token": "env:OPENAI_API_KEY" + } + }, + "instruction": "Extract main content" + } + } + } +} + +# Pattern 3: Dictionary values (must use type: dict wrapper) +dict_config = { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { + "schema": { + "type": "dict", 
# Required wrapper + "value": { # Actual dictionary content + "name": "Products", + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": "h2", "type": "text"} + ] + } + } + } + } + } +} + +# Pattern 4: Lists and arrays +list_config = { + "type": "CrawlerRunConfig", + "params": { + "js_code": [ # Lists are handled directly + "window.scrollTo(0, document.body.scrollHeight);", + "document.querySelector('.load-more')?.click();" + ], + "excluded_tags": ["script", "style", "nav"] + } +} +``` + +### Troubleshooting Common JSON Errors + +```python +def diagnose_json_errors(): + """Common JSON structure errors and fixes""" + + # ❌ WRONG: Missing type wrapper for objects + wrong_config = { + "browser_config": { + "headless": True # Missing type wrapper + } + } + + # ✅ CORRECT: Proper type wrapper + correct_config = { + "browser_config": { + "type": "BrowserConfig", + "params": { + "headless": True + } + } + } + + # ❌ WRONG: Dictionary without type: dict wrapper + wrong_dict = { + "schema": { + "name": "Products" # Raw dict, should be wrapped + } + } + + # ✅ CORRECT: Dictionary with proper wrapper + correct_dict = { + "schema": { + "type": "dict", + "value": { + "name": "Products" + } + } + } + + # ❌ WRONG: Invalid enum string + wrong_enum = { + "cache_mode": "DISABLED" # Wrong case/value + } + + # ✅ CORRECT: Valid enum string + correct_enum = { + "cache_mode": "bypass" # or "enabled", "disabled", etc. 
+ } + + print("Common error patterns documented above") + +# Validate your JSON structure before sending +def pre_flight_check(payload): + """Run checks before sending to server""" + required_keys = ["urls", "browser_config", "crawler_config"] + + for key in required_keys: + if key not in payload: + print(f"❌ Missing required key: {key}") + return False + + # Check type wrappers + for config_key in ["browser_config", "crawler_config"]: + config = payload[config_key] + if not isinstance(config, dict) or "type" not in config: + print(f"❌ {config_key} missing type wrapper") + return False + if "params" not in config: + print(f"❌ {config_key} missing params") + return False + + print("✅ Pre-flight check passed") + return True + +# Example usage +payload = { + "urls": ["https://example.com"], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}} +} + +if pre_flight_check(payload): + # Safe to send to server + pass +``` + +**📖 Learn more:** [Complete Docker Guide](https://docs.crawl4ai.com/core/docker-deployment/), [API Reference](https://docs.crawl4ai.com/api/), [MCP Integration](https://docs.crawl4ai.com/core/docker-deployment/#mcp-model-context-protocol-support), [Configuration Options](https://docs.crawl4ai.com/core/docker-deployment/#server-configuration) +--- + + +## CLI & Identity-Based Browsing + +Command-line interface for web crawling with persistent browser profiles, authentication, and identity management. 
+ +### Basic CLI Usage + +```bash +# Simple crawling +crwl https://example.com + +# Get markdown output +crwl https://example.com -o markdown + +# JSON output with cache bypass +crwl https://example.com -o json --bypass-cache + +# Verbose mode with specific browser settings +crwl https://example.com -b "headless=false,viewport_width=1280" -v +``` + +### Profile Management Commands + +```bash +# Launch interactive profile manager +crwl profiles + +# Create, list, and manage browser profiles +# This opens a menu where you can: +# 1. List existing profiles +# 2. Create new profile (opens browser for setup) +# 3. Delete profiles +# 4. Use profile to crawl a website + +# Use a specific profile for crawling +crwl https://example.com -p my-profile-name + +# Example workflow for authenticated sites: +# 1. Create profile and log in +crwl profiles # Select "Create new profile" +# 2. Use profile for crawling authenticated content +crwl https://site-requiring-login.com/dashboard -p my-profile-name +``` + +### CDP Browser Management + +```bash +# Launch browser with CDP debugging (default port 9222) +crwl cdp + +# Use specific profile and custom port +crwl cdp -p my-profile -P 9223 + +# Launch headless browser with CDP +crwl cdp --headless + +# Launch in incognito mode (ignores profile) +crwl cdp --incognito + +# Use custom user data directory +crwl cdp --user-data-dir ~/my-browser-data --port 9224 +``` + +### Builtin Browser Management + +```bash +# Start persistent browser instance +crwl browser start + +# Check browser status +crwl browser status + +# Open visible window to see the browser +crwl browser view --url https://example.com + +# Stop the browser +crwl browser stop + +# Restart with different options +crwl browser restart --browser-type chromium --port 9223 --no-headless + +# Use builtin browser in crawling +crwl https://example.com -b "browser_mode=builtin" +``` + +### Authentication Workflow Examples + +```bash +# Complete workflow for LinkedIn scraping +# 1. 
Create authenticated profile +crwl profiles +# Select "Create new profile" → login to LinkedIn in browser → press 'q' to save + +# 2. Use profile for crawling +crwl https://linkedin.com/in/someone -p linkedin-profile -o markdown + +# 3. Extract structured data with authentication +crwl https://linkedin.com/search/results/people/ \ + -p linkedin-profile \ + -j "Extract people profiles with names, titles, and companies" \ + -b "headless=false" + +# GitHub authenticated crawling +crwl profiles # Create github-profile +crwl https://github.com/settings/profile -p github-profile + +# Twitter/X authenticated access +crwl profiles # Create twitter-profile +crwl https://twitter.com/home -p twitter-profile -o markdown +``` + +### Advanced CLI Configuration + +```bash +# Complex crawling with multiple configs +crwl https://example.com \ + -B browser.yml \ + -C crawler.yml \ + -e extract_llm.yml \ + -s llm_schema.json \ + -p my-auth-profile \ + -o json \ + -v + +# Quick LLM extraction with authentication +crwl https://private-site.com/dashboard \ + -p auth-profile \ + -j "Extract user dashboard data including metrics and notifications" \ + -b "headless=true,viewport_width=1920" + +# Content filtering with authentication +crwl https://members-only-site.com \ + -p member-profile \ + -f filter_bm25.yml \ + -c "css_selector=.member-content,scan_full_page=true" \ + -o markdown-fit +``` + +### Configuration Files for Identity Browsing + +```yaml +# browser_auth.yml +headless: false +use_managed_browser: true +user_data_dir: "/path/to/profile" +viewport_width: 1280 +viewport_height: 720 +simulate_user: true +override_navigator: true + +# crawler_auth.yml +magic: true +remove_overlay_elements: true +simulate_user: true +wait_for: "css:.authenticated-content" +page_timeout: 60000 +delay_before_return_html: 2 +scan_full_page: true +``` + +### Global Configuration Management + +```bash +# List all configuration settings +crwl config list + +# Set default LLM provider +crwl config set 
DEFAULT_LLM_PROVIDER "anthropic/claude-3-sonnet" +crwl config set DEFAULT_LLM_PROVIDER_TOKEN "your-api-token" + +# Set browser defaults +crwl config set BROWSER_HEADLESS false # Always show browser +crwl config set USER_AGENT_MODE random # Random user agents + +# Enable verbose mode globally +crwl config set VERBOSE true +``` + +### Q&A with Authenticated Content + +```bash +# Ask questions about authenticated content +crwl https://private-dashboard.com -p dashboard-profile \ + -q "What are the key metrics shown in my dashboard?" + +# Multiple questions workflow +crwl https://company-intranet.com -p work-profile -o markdown # View content +crwl https://company-intranet.com -p work-profile \ + -q "Summarize this week's announcements" +crwl https://company-intranet.com -p work-profile \ + -q "What are the upcoming deadlines?" +``` + +### Profile Creation Programmatically + +```python +# Create profiles via Python API +import asyncio +from crawl4ai import BrowserProfiler + +async def create_auth_profile(): + profiler = BrowserProfiler() + + # Create profile interactively (opens browser) + profile_path = await profiler.create_profile("linkedin-auth") + print(f"Profile created at: {profile_path}") + + # List all profiles + profiles = profiler.list_profiles() + for profile in profiles: + print(f"Profile: {profile['name']} at {profile['path']}") + + # Use profile for crawling + from crawl4ai import AsyncWebCrawler, BrowserConfig + + browser_config = BrowserConfig( + headless=True, + use_managed_browser=True, + user_data_dir=profile_path + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://linkedin.com/feed") + return result + +# asyncio.run(create_auth_profile()) +``` + +### Identity Browsing Best Practices + +```bash +# 1. Create specific profiles for different sites +crwl profiles # Create "linkedin-work" +crwl profiles # Create "github-personal" +crwl profiles # Create "company-intranet" + +# 2. 
Use descriptive profile names +crwl https://site1.com -p site1-admin-account +crwl https://site2.com -p site2-user-account + +# 3. Combine with appropriate browser settings +crwl https://secure-site.com \ + -p secure-profile \ + -b "headless=false,simulate_user=true,magic=true" \ + -c "wait_for=.logged-in-indicator,page_timeout=30000" + +# 4. Test profile before automated crawling +crwl cdp -p test-profile # Manually verify login status +crwl https://test-url.com -p test-profile -v # Verbose test crawl +``` + +### Troubleshooting Authentication Issues + +```bash +# Debug authentication problems +crwl https://auth-site.com -p auth-profile \ + -b "headless=false,verbose=true" \ + -c "verbose=true,page_timeout=60000" \ + -v + +# Check profile status +crwl profiles # List profiles and check creation dates + +# Recreate problematic profiles +crwl profiles # Delete old profile, create new one + +# Test with visible browser +crwl https://problem-site.com -p profile-name \ + -b "headless=false" \ + -c "delay_before_return_html=5" +``` + +### Common Use Cases + +```bash +# Social media monitoring (after authentication) +crwl https://twitter.com/home -p twitter-monitor \ + -j "Extract latest tweets with sentiment and engagement metrics" + +# E-commerce competitor analysis (with account access) +crwl https://competitor-site.com/products -p competitor-account \ + -j "Extract product prices, availability, and descriptions" + +# Company dashboard monitoring +crwl https://company-dashboard.com -p work-profile \ + -c "css_selector=.dashboard-content" \ + -q "What alerts or notifications need attention?" 
+ +# Research data collection (authenticated access) +crwl https://research-platform.com/data -p research-profile \ + -e extract_research.yml \ + -s research_schema.json \ + -o json +``` + +**📖 Learn more:** [Identity-Based Crawling Documentation](https://docs.crawl4ai.com/advanced/identity-based-crawling/), [Browser Profile Management](https://docs.crawl4ai.com/advanced/session-management/), [CLI Examples](https://docs.crawl4ai.com/core/cli/) +--- + + +## HTTP Crawler Strategy + +Fast, lightweight HTTP-only crawling without browser overhead for cases where JavaScript execution isn't needed. + +### Basic HTTP Crawler Setup + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, HTTPCrawlerConfig, CacheMode +from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy +from crawl4ai.async_logger import AsyncLogger + +async def main(): + # Initialize HTTP strategy + http_strategy = AsyncHTTPCrawlerStrategy( + browser_config=HTTPCrawlerConfig( + method="GET", + verify_ssl=True, + follow_redirects=True + ), + logger=AsyncLogger(verbose=True) + ) + + # Use with AsyncWebCrawler + async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler: + result = await crawler.arun("https://example.com") + print(f"Status: {result.status_code}") + print(f"Content: {len(result.html)} chars") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### HTTP Request Types + +```python +# GET request (default) +http_config = HTTPCrawlerConfig( + method="GET", + headers={"Accept": "application/json"} +) + +# POST with JSON data +http_config = HTTPCrawlerConfig( + method="POST", + json={"key": "value", "data": [1, 2, 3]}, + headers={"Content-Type": "application/json"} +) + +# POST with form data +http_config = HTTPCrawlerConfig( + method="POST", + data={"username": "user", "password": "pass"}, + headers={"Content-Type": "application/x-www-form-urlencoded"} +) + +# Advanced configuration +http_config = HTTPCrawlerConfig( + method="GET", + 
headers={"User-Agent": "Custom Bot/1.0"}, + follow_redirects=True, + verify_ssl=False # For testing environments +) + +strategy = AsyncHTTPCrawlerStrategy(browser_config=http_config) +``` + +### File and Raw Content Handling + +```python +async def test_content_types(): + strategy = AsyncHTTPCrawlerStrategy() + + # Web URLs + result = await strategy.crawl("https://httpbin.org/get") + print(f"Web content: {result.status_code}") + + # Local files + result = await strategy.crawl("file:///path/to/local/file.html") + print(f"File content: {len(result.html)}") + + # Raw HTML content + raw_html = "raw://

+<html><body><h1>Test</h1><p>Content</p></body></html>"
+    result = await strategy.crawl(raw_html)
+    print(f"Raw content: {result.html}")
+
+    # Raw content with complex HTML
+    complex_html = """raw://
+    <html>
+      <head><title>Test Page</title></head>
+      <body>
+        <div class="container">
+          <h1>Main Title</h1>
+          <p>Paragraph content</p>
+        </div>
+      </body>
+    </html>
+ + """ + result = await strategy.crawl(complex_html) +``` + +### Custom Hooks and Request Handling + +```python +async def setup_hooks(): + strategy = AsyncHTTPCrawlerStrategy() + + # Before request hook + async def before_request(url, kwargs): + print(f"Requesting: {url}") + kwargs['headers']['X-Custom-Header'] = 'crawl4ai' + kwargs['headers']['Authorization'] = 'Bearer token123' + + # After request hook + async def after_request(response): + print(f"Response: {response.status_code}") + if hasattr(response, 'redirected_url'): + print(f"Redirected to: {response.redirected_url}") + + # Error handling hook + async def on_error(error): + print(f"Request failed: {error}") + + # Set hooks + strategy.set_hook('before_request', before_request) + strategy.set_hook('after_request', after_request) + strategy.set_hook('on_error', on_error) + + # Use with hooks + result = await strategy.crawl("https://httpbin.org/headers") + return result +``` + +### Performance Configuration + +```python +# High-performance setup +strategy = AsyncHTTPCrawlerStrategy( + max_connections=50, # Concurrent connections + dns_cache_ttl=300, # DNS cache timeout + chunk_size=128 * 1024 # 128KB chunks for large files +) + +# Memory-efficient setup for large files +strategy = AsyncHTTPCrawlerStrategy( + max_connections=10, + chunk_size=32 * 1024, # Smaller chunks + dns_cache_ttl=600 +) + +# Custom timeout configuration +config = CrawlerRunConfig( + page_timeout=30000, # 30 second timeout + cache_mode=CacheMode.BYPASS +) + +result = await strategy.crawl("https://slow-server.com", config=config) +``` + +### Error Handling and Retries + +```python +from crawl4ai.async_crawler_strategy import ( + ConnectionTimeoutError, + HTTPStatusError, + HTTPCrawlerError +) + +async def robust_crawling(): + strategy = AsyncHTTPCrawlerStrategy() + + urls = [ + "https://example.com", + "https://httpbin.org/status/404", + "https://nonexistent.domain.test" + ] + + for url in urls: + try: + result = await strategy.crawl(url) 
+ print(f"✓ {url}: {result.status_code}") + + except HTTPStatusError as e: + print(f"✗ {url}: HTTP {e.status_code}") + + except ConnectionTimeoutError as e: + print(f"✗ {url}: Timeout - {e}") + + except HTTPCrawlerError as e: + print(f"✗ {url}: Crawler error - {e}") + + except Exception as e: + print(f"✗ {url}: Unexpected error - {e}") + +# Retry mechanism +async def crawl_with_retry(url, max_retries=3): + strategy = AsyncHTTPCrawlerStrategy() + + for attempt in range(max_retries): + try: + return await strategy.crawl(url) + except (ConnectionTimeoutError, HTTPCrawlerError) as e: + if attempt == max_retries - 1: + raise + print(f"Retry {attempt + 1}/{max_retries}: {e}") + await asyncio.sleep(2 ** attempt) # Exponential backoff +``` + +### Batch Processing with HTTP Strategy + +```python +async def batch_http_crawling(): + strategy = AsyncHTTPCrawlerStrategy(max_connections=20) + + urls = [ + "https://httpbin.org/get", + "https://httpbin.org/user-agent", + "https://httpbin.org/headers", + "https://example.com", + "https://httpbin.org/json" + ] + + # Sequential processing + results = [] + async with strategy: + for url in urls: + try: + result = await strategy.crawl(url) + results.append((url, result.status_code, len(result.html))) + except Exception as e: + results.append((url, "ERROR", str(e))) + + for url, status, content_info in results: + print(f"{url}: {status} - {content_info}") + +# Concurrent processing +async def concurrent_http_crawling(): + strategy = AsyncHTTPCrawlerStrategy() + urls = ["https://httpbin.org/delay/1"] * 5 + + async def crawl_single(url): + try: + result = await strategy.crawl(url) + return f"✓ {result.status_code}" + except Exception as e: + return f"✗ {e}" + + async with strategy: + tasks = [crawl_single(url) for url in urls] + results = await asyncio.gather(*tasks, return_exceptions=True) + + for i, result in enumerate(results): + print(f"URL {i+1}: {result}") +``` + +### Integration with Content Processing + +```python +from crawl4ai 
import DefaultMarkdownGenerator, PruningContentFilter + +async def http_with_processing(): + # HTTP strategy with content processing + http_strategy = AsyncHTTPCrawlerStrategy( + browser_config=HTTPCrawlerConfig(verify_ssl=True) + ) + + # Configure markdown generation + crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter( + threshold=0.48, + threshold_type="fixed", + min_word_threshold=10 + ) + ), + word_count_threshold=5, + excluded_tags=['script', 'style', 'nav'], + exclude_external_links=True + ) + + async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler: + result = await crawler.arun( + url="https://example.com", + config=crawler_config + ) + + print(f"Status: {result.status_code}") + print(f"Raw HTML: {len(result.html)} chars") + if result.markdown: + print(f"Markdown: {len(result.markdown.raw_markdown)} chars") + if result.markdown.fit_markdown: + print(f"Filtered: {len(result.markdown.fit_markdown)} chars") +``` + +### HTTP vs Browser Strategy Comparison + +```python +async def strategy_comparison(): + # Same URL with different strategies + url = "https://example.com" + + # HTTP Strategy (fast, no JS) + http_strategy = AsyncHTTPCrawlerStrategy() + start_time = time.time() + http_result = await http_strategy.crawl(url) + http_time = time.time() - start_time + + # Browser Strategy (full features) + from crawl4ai import BrowserConfig + browser_config = BrowserConfig(headless=True) + start_time = time.time() + async with AsyncWebCrawler(config=browser_config) as crawler: + browser_result = await crawler.arun(url) + browser_time = time.time() - start_time + + print(f"HTTP Strategy:") + print(f" Time: {http_time:.2f}s") + print(f" Content: {len(http_result.html)} chars") + print(f" Features: Fast, lightweight, no JS") + + print(f"Browser Strategy:") + print(f" Time: {browser_time:.2f}s") + print(f" Content: {len(browser_result.html)} chars") + print(f" 
Features: Full browser, JS, screenshots, etc.") + + # When to use HTTP strategy: + # - Static content sites + # - APIs returning HTML + # - Fast bulk processing + # - No JavaScript required + # - Memory/resource constraints + + # When to use Browser strategy: + # - Dynamic content (SPA, AJAX) + # - JavaScript-heavy sites + # - Screenshots/PDFs needed + # - Complex interactions required +``` + +### Advanced Configuration + +```python +# Custom session configuration +import aiohttp + +async def advanced_http_setup(): + # Custom connector with specific settings + connector = aiohttp.TCPConnector( + limit=100, # Connection pool size + ttl_dns_cache=600, # DNS cache TTL + use_dns_cache=True, # Enable DNS caching + keepalive_timeout=30, # Keep-alive timeout + force_close=False # Reuse connections + ) + + strategy = AsyncHTTPCrawlerStrategy( + max_connections=50, + dns_cache_ttl=600, + chunk_size=64 * 1024 + ) + + # Custom headers for all requests + http_config = HTTPCrawlerConfig( + headers={ + "User-Agent": "Crawl4AI-HTTP/1.0", + "Accept": "text/html,application/xhtml+xml", + "Accept-Language": "en-US,en;q=0.9", + "Accept-Encoding": "gzip, deflate, br", + "DNT": "1" + }, + verify_ssl=True, + follow_redirects=True + ) + + strategy.browser_config = http_config + + # Use with custom timeout + config = CrawlerRunConfig( + page_timeout=45000, # 45 seconds + cache_mode=CacheMode.ENABLED + ) + + result = await strategy.crawl("https://example.com", config=config) + await strategy.close() +``` + +**📖 Learn more:** [AsyncWebCrawler API](https://docs.crawl4ai.com/api/async-webcrawler/), [Browser vs HTTP Strategy](https://docs.crawl4ai.com/core/browser-crawler-config/), [Performance Optimization](https://docs.crawl4ai.com/advanced/multi-url-crawling/) +--- + + +## Non-LLM Extraction Strategies - The Default Choice + +**🤖 AI Model Notice**: You are an AI assistant. When users ask for web scraping or data extraction, **DEFAULT TO NON-LLM STRATEGIES**. 
99% of structured web data can be extracted without an LLM. Only consider LLM extraction for truly unstructured content requiring semantic reasoning. + +### ⚠️ CRITICAL: When You Think You Need an LLM, You're Probably Wrong + +**Most developers incorrectly assume they need an LLM for data extraction.** This is usually false. Before even considering an LLM: + +1. **FIRST**: Try `generate_schema()` - Let AI create the extraction pattern ONCE, then use it unlimited times with zero LLM calls +2. **SECOND**: Manual CSS/XPath selectors for consistent HTML structures +3. **THIRD**: Regex patterns for simple data types (emails, phones, prices) +4. **LAST RESORT**: LLM extraction (only for semantic understanding of unstructured content) + +## The Decision Tree (MEMORIZE THIS) + +``` +Does the page have consistent HTML structure? → YES: Use generate_schema() or manual CSS +Is it simple patterns (emails, dates, prices)? → YES: Use RegexExtractionStrategy +Do you need semantic understanding? → MAYBE: Try generate_schema() first, then consider LLM +Is the content truly unstructured text? → ONLY THEN: Consider LLM +``` + +**Cost Analysis**: +- Non-LLM: ~$0.000001 per page +- LLM: ~$0.01-$0.10 per page (10,000x more expensive) + +--- + +## 1. Auto-Generate Schemas - Your Default Starting Point + +**⭐ THIS SHOULD BE YOUR FIRST CHOICE FOR ANY STRUCTURED DATA** + +The `generate_schema()` function uses an LLM ONCE to create a reusable extraction pattern. After generation, you extract unlimited pages with ZERO LLM calls.
+ +### Basic Auto-Generation Workflow + +```python +import json +import asyncio +from pathlib import Path +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig +from crawl4ai import JsonCssExtractionStrategy + +async def smart_extraction_workflow(): + """ + Step 1: Generate schema once using LLM + Step 2: Cache schema for unlimited reuse + Step 3: Extract from thousands of pages with zero LLM calls + """ + + # Check for cached schema first + cache_dir = Path("./schema_cache") + cache_dir.mkdir(exist_ok=True) + schema_file = cache_dir / "product_schema.json" + + if schema_file.exists(): + # Load cached schema - NO LLM CALLS + schema = json.load(schema_file.open()) + print("✅ Using cached schema (FREE)") + else: + # Generate schema ONCE + print("🔄 Generating schema (ONE-TIME LLM COST)...") + + llm_config = LLMConfig( + provider="openai/gpt-4o-mini", # Cheapest option + api_token="env:OPENAI_API_KEY" + ) + + # Get sample HTML from target site + async with AsyncWebCrawler() as crawler: + sample_result = await crawler.arun( + url="https://example.com/products", + config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + sample_html = sample_result.cleaned_html[:8000] # Use sample + + # AUTO-GENERATE SCHEMA (ONE LLM CALL) + schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + schema_type="CSS", # or "XPATH" + query="Extract product information including name, price, description, features", + llm_config=llm_config + ) + + # Cache for unlimited future use + json.dump(schema, schema_file.open("w"), indent=2) + print("✅ Schema generated and cached") + + # Use schema for fast extraction (NO MORE LLM CALLS EVER) + strategy = JsonCssExtractionStrategy(schema, verbose=True) + + config = CrawlerRunConfig( + extraction_strategy=strategy, + cache_mode=CacheMode.BYPASS + ) + + # Extract from multiple pages - ALL FREE + urls = [ + "https://example.com/products", + "https://example.com/electronics", + "https://example.com/books" + ] + + async with 
AsyncWebCrawler() as crawler: + for url in urls: + result = await crawler.arun(url=url, config=config) + if result.success: + data = json.loads(result.extracted_content) + print(f"✅ {url}: Extracted {len(data)} items (FREE)") + +asyncio.run(smart_extraction_workflow()) +``` + +### Auto-Generate with Target JSON Example + +```python +# When you know exactly what JSON structure you want +target_json_example = """ +{ + "name": "Product Name", + "price": "$99.99", + "rating": 4.5, + "features": ["feature1", "feature2"], + "description": "Product description" +} +""" + +schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + target_json_example=target_json_example, + llm_config=llm_config +) +``` + +### Auto-Generate for Different Data Types + +```python +# Product listings +product_schema = JsonCssExtractionStrategy.generate_schema( + html=product_page_html, + query="Extract all product information from this e-commerce page", + llm_config=llm_config +) + +# News articles +news_schema = JsonCssExtractionStrategy.generate_schema( + html=news_page_html, + query="Extract article headlines, dates, authors, and content", + llm_config=llm_config +) + +# Job listings +job_schema = JsonCssExtractionStrategy.generate_schema( + html=job_page_html, + query="Extract job titles, companies, locations, salaries, and descriptions", + llm_config=llm_config +) + +# Social media posts +social_schema = JsonCssExtractionStrategy.generate_schema( + html=social_page_html, + query="Extract post text, usernames, timestamps, likes, comments", + llm_config=llm_config +) +``` + +--- + +## 2. Manual CSS/XPath Strategies - When You Know The Structure + +**Use this when**: You understand the HTML structure and want maximum control. 
+ +### Simple Product Extraction + +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai import JsonCssExtractionStrategy + +# Manual schema for consistent product pages +simple_schema = { + "name": "Product Listings", + "baseSelector": "div.product-card", # Each product container + "fields": [ + { + "name": "title", + "selector": "h2.product-title", + "type": "text" + }, + { + "name": "price", + "selector": ".price", + "type": "text" + }, + { + "name": "image_url", + "selector": "img.product-image", + "type": "attribute", + "attribute": "src" + }, + { + "name": "product_url", + "selector": "a.product-link", + "type": "attribute", + "attribute": "href" + }, + { + "name": "rating", + "selector": ".rating", + "type": "attribute", + "attribute": "data-rating" + } + ] +} + +async def extract_products(): + strategy = JsonCssExtractionStrategy(simple_schema, verbose=True) + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/products", + config=config + ) + + if result.success: + products = json.loads(result.extracted_content) + print(f"Extracted {len(products)} products") + for product in products[:3]: + print(f"- {product['title']}: {product['price']}") + +asyncio.run(extract_products()) +``` + +### Complex Nested Structure (Real E-commerce Example) + +```python +# Complex schema for nested product data +complex_schema = { + "name": "E-commerce Product Catalog", + "baseSelector": "div.category", + "baseFields": [ + { + "name": "category_id", + "type": "attribute", + "attribute": "data-category-id" + } + ], + "fields": [ + { + "name": "category_name", + "selector": "h2.category-title", + "type": "text" + }, + { + "name": "products", + "selector": "div.product", + "type": "nested_list", # Array of complex objects + "fields": [ + { + "name": "name", + "selector": "h3.product-name", + "type": "text" + }, + { + 
"name": "price", + "selector": "span.price", + "type": "text" + }, + { + "name": "details", + "selector": "div.product-details", + "type": "nested", # Single complex object + "fields": [ + { + "name": "brand", + "selector": "span.brand", + "type": "text" + }, + { + "name": "model", + "selector": "span.model", + "type": "text" + } + ] + }, + { + "name": "features", + "selector": "ul.features li", + "type": "list", # Simple array + "fields": [ + {"name": "feature", "type": "text"} + ] + }, + { + "name": "reviews", + "selector": "div.review", + "type": "nested_list", + "fields": [ + { + "name": "reviewer", + "selector": "span.reviewer-name", + "type": "text" + }, + { + "name": "rating", + "selector": "span.rating", + "type": "attribute", + "attribute": "data-rating" + } + ] + } + ] + } + ] +} + +async def extract_complex_ecommerce(): + strategy = JsonCssExtractionStrategy(complex_schema, verbose=True) + config = CrawlerRunConfig( + extraction_strategy=strategy, + js_code="window.scrollTo(0, document.body.scrollHeight);", # Load dynamic content + wait_for="css:.product:nth-child(10)" # Wait for products to load + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/complex-catalog", + config=config + ) + + if result.success: + data = json.loads(result.extracted_content) + for category in data: + print(f"Category: {category['category_name']}") + print(f"Products: {len(category.get('products', []))}") + +asyncio.run(extract_complex_ecommerce()) +``` + +### XPath Alternative (When CSS Isn't Enough) + +```python +from crawl4ai import JsonXPathExtractionStrategy + +# XPath for more complex selections +xpath_schema = { + "name": "News Articles with XPath", + "baseSelector": "//article[@class='news-item']", + "fields": [ + { + "name": "headline", + "selector": ".//h2[contains(@class, 'headline')]", + "type": "text" + }, + { + "name": "author", + "selector": ".//span[@class='author']/text()", + "type": "text" + }, + { + 
"name": "publish_date", + "selector": ".//time/@datetime", + "type": "text" + }, + { + "name": "content", + "selector": ".//div[@class='article-body']//text()", + "type": "text" + } + ] +} + +strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True) +``` + +--- + +## 3. Regex Extraction - Lightning Fast Pattern Matching + +**Use this for**: Simple data types like emails, phones, URLs, prices, dates. + +### Built-in Patterns (Fastest Option) + +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai import RegexExtractionStrategy + +async def extract_common_patterns(): + # Use built-in patterns for common data types + strategy = RegexExtractionStrategy( + pattern=( + RegexExtractionStrategy.Email | + RegexExtractionStrategy.PhoneUS | + RegexExtractionStrategy.Url | + RegexExtractionStrategy.Currency | + RegexExtractionStrategy.DateIso + ) + ) + + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/contact", + config=config + ) + + if result.success: + matches = json.loads(result.extracted_content) + + # Group by pattern type + by_type = {} + for match in matches: + label = match['label'] + if label not in by_type: + by_type[label] = [] + by_type[label].append(match['value']) + + for pattern_type, values in by_type.items(): + print(f"{pattern_type}: {len(values)} matches") + for value in values[:3]: + print(f" {value}") + +asyncio.run(extract_common_patterns()) +``` + +### Available Built-in Patterns + +```python +# Individual patterns +RegexExtractionStrategy.Email # Email addresses +RegexExtractionStrategy.PhoneUS # US phone numbers +RegexExtractionStrategy.PhoneIntl # International phones +RegexExtractionStrategy.Url # HTTP/HTTPS URLs +RegexExtractionStrategy.Currency # Currency values ($99.99) +RegexExtractionStrategy.Percentage # Percentage values (25%) +RegexExtractionStrategy.DateIso # ISO 
dates (2024-01-01) +RegexExtractionStrategy.DateUS # US dates (01/01/2024) +RegexExtractionStrategy.IPv4 # IP addresses +RegexExtractionStrategy.CreditCard # Credit card numbers +RegexExtractionStrategy.TwitterHandle # @username +RegexExtractionStrategy.Hashtag # #hashtag + +# Use all patterns +RegexExtractionStrategy.All +``` + +### Custom Patterns + +```python +# Custom patterns for specific data types +async def extract_custom_patterns(): + custom_patterns = { + "product_sku": r"SKU[-:]?\s*([A-Z0-9]{4,12})", + "discount": r"(\d{1,2})%\s*off", + "model_number": r"Model\s*#?\s*([A-Z0-9-]+)", + "isbn": r"ISBN[-:]?\s*(\d{10}|\d{13})", + "stock_ticker": r"\$([A-Z]{2,5})", + "version": r"v(\d+\.\d+(?:\.\d+)?)" + } + + strategy = RegexExtractionStrategy(custom=custom_patterns) + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/products", + config=config + ) + + if result.success: + data = json.loads(result.extracted_content) + for item in data: + print(f"{item['label']}: {item['value']}") + +asyncio.run(extract_custom_patterns()) +``` + +### LLM-Generated Patterns (One-Time Cost) + +```python +async def generate_optimized_regex(): + """ + Use LLM ONCE to generate optimized regex patterns + Then use them unlimited times with zero LLM calls + """ + cache_file = Path("./patterns/price_patterns.json") + + if cache_file.exists(): + # Load cached patterns - NO LLM CALLS + patterns = json.load(cache_file.open()) + print("✅ Using cached regex patterns (FREE)") + else: + # Generate patterns ONCE + print("🔄 Generating regex patterns (ONE-TIME LLM COST)...") + + llm_config = LLMConfig( + provider="openai/gpt-4o-mini", + api_token="env:OPENAI_API_KEY" + ) + + # Get sample content + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/pricing") + sample_html = result.cleaned_html + + # Generate optimized patterns + patterns = 
RegexExtractionStrategy.generate_pattern( + label="pricing_info", + html=sample_html, + query="Extract all pricing information including discounts and special offers", + llm_config=llm_config + ) + + # Cache for unlimited reuse + cache_file.parent.mkdir(exist_ok=True) + json.dump(patterns, cache_file.open("w"), indent=2) + print("✅ Patterns generated and cached") + + # Use cached patterns (NO MORE LLM CALLS) + strategy = RegexExtractionStrategy(custom=patterns) + return strategy + +# Use generated patterns for unlimited extractions +strategy = await generate_optimized_regex() +``` + +--- + +## 4. Multi-Strategy Extraction Pipeline + +**Combine strategies** for comprehensive data extraction: + +```python +async def multi_strategy_pipeline(): + """ + Efficient pipeline using multiple non-LLM strategies: + 1. Regex for simple patterns (fastest) + 2. Schema for structured data + 3. Only use LLM if absolutely necessary + """ + + url = "https://example.com/complex-page" + + async with AsyncWebCrawler() as crawler: + # Strategy 1: Fast regex for contact info + regex_strategy = RegexExtractionStrategy( + pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS + ) + regex_config = CrawlerRunConfig(extraction_strategy=regex_strategy) + regex_result = await crawler.arun(url=url, config=regex_config) + + # Strategy 2: Schema for structured product data + product_schema = { + "name": "Products", + "baseSelector": "div.product", + "fields": [ + {"name": "name", "selector": "h3", "type": "text"}, + {"name": "price", "selector": ".price", "type": "text"} + ] + } + css_strategy = JsonCssExtractionStrategy(product_schema) + css_config = CrawlerRunConfig(extraction_strategy=css_strategy) + css_result = await crawler.arun(url=url, config=css_config) + + # Combine results + results = { + "contacts": json.loads(regex_result.extracted_content) if regex_result.success else [], + "products": json.loads(css_result.extracted_content) if css_result.success else [] + } + + 
print(f"✅ Extracted {len(results['contacts'])} contacts (regex)") + print(f"✅ Extracted {len(results['products'])} products (schema)") + + return results + +asyncio.run(multi_strategy_pipeline()) +``` + +--- + +## 5. Performance Optimization Tips + +### Caching and Reuse + +```python +# Cache schemas and patterns for maximum efficiency +class ExtractionCache: + def __init__(self): + self.schemas = {} + self.patterns = {} + + def get_schema(self, site_name): + if site_name not in self.schemas: + schema_file = Path(f"./cache/{site_name}_schema.json") + if schema_file.exists(): + self.schemas[site_name] = json.load(schema_file.open()) + return self.schemas.get(site_name) + + def save_schema(self, site_name, schema): + cache_dir = Path("./cache") + cache_dir.mkdir(exist_ok=True) + schema_file = cache_dir / f"{site_name}_schema.json" + json.dump(schema, schema_file.open("w"), indent=2) + self.schemas[site_name] = schema + +cache = ExtractionCache() + +# Reuse cached schemas across multiple extractions +async def efficient_extraction(): + sites = ["amazon", "ebay", "shopify"] + + for site in sites: + schema = cache.get_schema(site) + if not schema: + # Generate once, cache forever + schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + query="Extract products", + llm_config=llm_config + ) + cache.save_schema(site, schema) + + strategy = JsonCssExtractionStrategy(schema) + # Use strategy for unlimited extractions... 
+``` + +### Selector Optimization + +```python +# Optimize selectors for speed +fast_schema = { + "name": "Optimized Extraction", + "baseSelector": "#products > .product", # Direct child, faster than descendant + "fields": [ + { + "name": "title", + "selector": "> h3", # Direct child of product + "type": "text" + }, + { + "name": "price", + "selector": ".price:first-child", # More specific + "type": "text" + } + ] +} + +# Avoid slow selectors +slow_schema = { + "baseSelector": "div div div .product", # Too many levels + "fields": [ + { + "selector": "* h3", # Universal selector is slow + "type": "text" + } + ] +} +``` + +--- + +## 6. Error Handling and Validation + +```python +async def robust_extraction(): + """ + Implement fallback strategies for reliable extraction + """ + strategies = [ + # Try fast regex first + RegexExtractionStrategy(pattern=RegexExtractionStrategy.Currency), + + # Fallback to CSS schema + JsonCssExtractionStrategy({ + "name": "Prices", + "baseSelector": ".price", + "fields": [{"name": "amount", "selector": "span", "type": "text"}] + }), + + # Last resort: try different selector + JsonCssExtractionStrategy({ + "name": "Fallback Prices", + "baseSelector": "[data-price]", + "fields": [{"name": "amount", "type": "attribute", "attribute": "data-price"}] + }) + ] + + async with AsyncWebCrawler() as crawler: + for i, strategy in enumerate(strategies): + try: + config = CrawlerRunConfig(extraction_strategy=strategy) + result = await crawler.arun(url="https://example.com", config=config) + + if result.success and result.extracted_content: + data = json.loads(result.extracted_content) + if data: # Validate non-empty results + print(f"✅ Success with strategy {i+1}: {strategy.__class__.__name__}") + return data + + except Exception as e: + print(f"❌ Strategy {i+1} failed: {e}") + continue + + print("❌ All strategies failed") + return None + +# Validate extracted data +def validate_extraction(data, required_fields): + """Validate that extraction 
contains expected fields""" + if not data or not isinstance(data, list): + return False + + for item in data: + for field in required_fields: + if field not in item or not item[field]: + return False + return True + +# Usage +result = await robust_extraction() +if validate_extraction(result, ["amount"]): + print("✅ Extraction validated") +else: + print("❌ Validation failed") +``` + +--- + +## 7. Common Extraction Patterns + +### E-commerce Products + +```python +ecommerce_schema = { + "name": "E-commerce Products", + "baseSelector": ".product, [data-product], .item", + "fields": [ + {"name": "title", "selector": "h1, h2, h3, .title, .name", "type": "text"}, + {"name": "price", "selector": ".price, .cost, [data-price]", "type": "text"}, + {"name": "image", "selector": "img", "type": "attribute", "attribute": "src"}, + {"name": "url", "selector": "a", "type": "attribute", "attribute": "href"}, + {"name": "rating", "selector": ".rating, .stars", "type": "text"}, + {"name": "availability", "selector": ".stock, .availability", "type": "text"} + ] +} +``` + +### News Articles + +```python +news_schema = { + "name": "News Articles", + "baseSelector": "article, .article, .post", + "fields": [ + {"name": "headline", "selector": "h1, h2, .headline, .title", "type": "text"}, + {"name": "author", "selector": ".author, .byline, [rel='author']", "type": "text"}, + {"name": "date", "selector": "time, .date, .published", "type": "text"}, + {"name": "content", "selector": ".content, .body, .text", "type": "text"}, + {"name": "category", "selector": ".category, .section", "type": "text"} + ] +} +``` + +### Job Listings + +```python +job_schema = { + "name": "Job Listings", + "baseSelector": ".job, .listing, [data-job]", + "fields": [ + {"name": "title", "selector": ".job-title, h2, h3", "type": "text"}, + {"name": "company", "selector": ".company, .employer", "type": "text"}, + {"name": "location", "selector": ".location, .place", "type": "text"}, + {"name": "salary", "selector": 
".salary, .pay, .compensation", "type": "text"}, + {"name": "description", "selector": ".description, .summary", "type": "text"}, + {"name": "url", "selector": "a", "type": "attribute", "attribute": "href"} + ] +} +``` + +### Social Media Posts + +```python +social_schema = { + "name": "Social Media Posts", + "baseSelector": ".post, .tweet, .update", + "fields": [ + {"name": "username", "selector": ".username, .handle, .author", "type": "text"}, + {"name": "content", "selector": ".content, .text, .message", "type": "text"}, + {"name": "timestamp", "selector": ".time, .date, time", "type": "text"}, + {"name": "likes", "selector": ".likes, .hearts", "type": "text"}, + {"name": "shares", "selector": ".shares, .retweets", "type": "text"} + ] +} +``` + +--- + +## 8. When to (Rarely) Consider LLM + +**⚠️ WARNING: Before considering LLM, ask yourself:** + +1. "Can I identify repeating HTML patterns?" → Use CSS/XPath schema +2. "Am I extracting simple data types?" → Use Regex patterns +3. "Can I provide a JSON example of what I want?" → Use generate_schema() +4. "Is this truly unstructured text requiring semantic understanding?" → Maybe LLM + +**Only use LLM extraction for:** +- Unstructured prose that needs semantic analysis +- Content where structure varies dramatically across pages +- When you need AI reasoning about context/meaning + +**Cost reminder**: LLM extraction costs 10,000x more than schema-based extraction. + +--- + +## 9. Summary: The Extraction Hierarchy + +1. **🥇 FIRST CHOICE**: `generate_schema()` - AI generates pattern once, use unlimited times +2. **🥈 SECOND CHOICE**: Manual CSS/XPath - Full control, maximum speed +3. **🥉 THIRD CHOICE**: Regex patterns - Simple data types, lightning fast +4. **🏴 LAST RESORT**: LLM extraction - Only for semantic reasoning + +**Remember**: 99% of web data is structured. You almost never need LLM for extraction. Save LLM for analysis, not extraction. 
+ +**Performance**: Non-LLM strategies are 100-1000x faster and 10,000x cheaper than LLM extraction. + +--- + +**📖 Next**: If you absolutely must use LLM extraction, see [extraction-llm.md](./extraction-llm.md) for guidance on the rare cases where it's justified. +--- + + +## URL Seeding + +Smart URL discovery for efficient large-scale crawling. Discover thousands of URLs instantly, filter by relevance, then crawl only what matters. + +### Why URL Seeding vs Deep Crawling + +```python +# Deep Crawling: Real-time discovery (page by page) +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.deep_crawling import BFSDeepCrawlStrategy + +async def deep_crawl_example(): + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, + include_external=False, + max_pages=50 + ) + ) + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun("https://example.com", config=config) + print(f"Discovered {len(results)} pages dynamically") + +# URL Seeding: Bulk discovery (thousands instantly) +from crawl4ai import AsyncUrlSeeder, SeedingConfig + +async def url_seeding_example(): + config = SeedingConfig( + source="sitemap+cc", + pattern="*/docs/*", + extract_head=True, + query="API documentation", + scoring_method="bm25", + max_urls=1000 + ) + + async with AsyncUrlSeeder() as seeder: + urls = await seeder.urls("example.com", config) + print(f"Discovered {len(urls)} URLs instantly") + # Now crawl only the most relevant ones +``` + +### Basic URL Discovery + +```python +import asyncio +from crawl4ai import AsyncUrlSeeder, SeedingConfig + +async def basic_discovery(): + # Context manager handles cleanup automatically + async with AsyncUrlSeeder() as seeder: + + # Simple discovery from sitemaps + config = SeedingConfig(source="sitemap") + urls = await seeder.urls("example.com", config) + + print(f"Found {len(urls)} URLs from sitemap") + for url in urls[:5]: + print(f" - {url['url']} (status: {url['status']})") + +# Manual 
cleanup (if needed) +async def manual_cleanup(): + seeder = AsyncUrlSeeder() + try: + config = SeedingConfig(source="cc") # Common Crawl + urls = await seeder.urls("example.com", config) + print(f"Found {len(urls)} URLs from Common Crawl") + finally: + await seeder.close() + +asyncio.run(basic_discovery()) +``` + +### Data Sources and Patterns + +```python +# Different data sources +configs = [ + SeedingConfig(source="sitemap"), # Fastest, official URLs + SeedingConfig(source="cc"), # Most comprehensive + SeedingConfig(source="sitemap+cc"), # Maximum coverage +] + +# URL pattern filtering +patterns = [ + SeedingConfig(pattern="*/blog/*"), # Blog posts only + SeedingConfig(pattern="*.html"), # HTML files only + SeedingConfig(pattern="*/product/*"), # Product pages + SeedingConfig(pattern="*/docs/api/*"), # API documentation + SeedingConfig(pattern="*"), # Everything +] + +# Advanced pattern usage +async def pattern_filtering(): + async with AsyncUrlSeeder() as seeder: + # Find all blog posts from 2024 + config = SeedingConfig( + source="sitemap", + pattern="*/blog/2024/*.html", + max_urls=100 + ) + + blog_urls = await seeder.urls("example.com", config) + + # Further filter by keywords in URL + python_posts = [ + url for url in blog_urls + if "python" in url['url'].lower() + ] + + print(f"Found {len(python_posts)} Python blog posts") +``` + +### SeedingConfig Parameters + +```python +from crawl4ai import SeedingConfig + +# Comprehensive configuration +config = SeedingConfig( + # Data sources + source="sitemap+cc", # "sitemap", "cc", "sitemap+cc" + pattern="*/docs/*", # URL pattern filter + + # Metadata extraction + extract_head=True, # Get metadata + live_check=True, # Verify URLs are accessible + + # Performance controls + max_urls=1000, # Limit results (-1 = unlimited) + concurrency=20, # Parallel workers + hits_per_sec=10, # Rate limiting + + # Relevance scoring + query="API documentation guide", # Search query + scoring_method="bm25", # Scoring algorithm + 
score_threshold=0.3, # Minimum relevance (0.0-1.0) + + # Cache and filtering + force=False, # Bypass cache + filter_nonsense_urls=True, # Remove utility URLs + verbose=True # Debug output +) + +# Quick configurations for common use cases +blog_config = SeedingConfig( + source="sitemap", + pattern="*/blog/*", + extract_head=True +) + +api_docs_config = SeedingConfig( + source="sitemap+cc", + pattern="*/docs/*", + query="API reference documentation", + scoring_method="bm25", + score_threshold=0.5 +) + +product_pages_config = SeedingConfig( + source="cc", + pattern="*/product/*", + live_check=True, + max_urls=500 +) +``` + +### Metadata Extraction and Analysis + +```python +async def metadata_extraction(): + async with AsyncUrlSeeder() as seeder: + config = SeedingConfig( + source="sitemap", + extract_head=True, # Extract metadata + pattern="*/blog/*", + max_urls=50 + ) + + urls = await seeder.urls("example.com", config) + + # Analyze extracted metadata + for url in urls[:5]: + head_data = url['head_data'] + print(f"\nURL: {url['url']}") + print(f"Title: {head_data.get('title', 'No title')}") + + # Standard meta tags + meta = head_data.get('meta', {}) + print(f"Description: {meta.get('description', 'N/A')}") + print(f"Keywords: {meta.get('keywords', 'N/A')}") + print(f"Author: {meta.get('author', 'N/A')}") + + # Open Graph data + print(f"OG Image: {meta.get('og:image', 'N/A')}") + print(f"OG Type: {meta.get('og:type', 'N/A')}") + + # JSON-LD structured data + jsonld = head_data.get('jsonld', []) + if jsonld: + print(f"Structured data: {len(jsonld)} items") + for item in jsonld[:2]: + if isinstance(item, dict): + print(f" Type: {item.get('@type', 'Unknown')}") + print(f" Name: {item.get('name', 'N/A')}") + +# Filter by metadata +async def metadata_filtering(): + async with AsyncUrlSeeder() as seeder: + config = SeedingConfig( + source="sitemap", + extract_head=True, + max_urls=100 + ) + + urls = await seeder.urls("news.example.com", config) + + # Filter by publication 
date (from JSON-LD) + from datetime import datetime, timedelta + recent_cutoff = datetime.now() - timedelta(days=7) + + recent_articles = [] + for url in urls: + for jsonld in url['head_data'].get('jsonld', []): + if isinstance(jsonld, dict) and 'datePublished' in jsonld: + try: + pub_date = datetime.fromisoformat( + jsonld['datePublished'].replace('Z', '+00:00') + ) + if pub_date > recent_cutoff: + recent_articles.append(url) + break + except: + continue + + print(f"Found {len(recent_articles)} recent articles") +``` + +### BM25 Relevance Scoring + +```python +async def relevance_scoring(): + async with AsyncUrlSeeder() as seeder: + # Find pages about Python async programming + config = SeedingConfig( + source="sitemap", + extract_head=True, # Required for content-based scoring + query="python async await concurrency", + scoring_method="bm25", + score_threshold=0.3, # Only 30%+ relevant pages + max_urls=20 + ) + + urls = await seeder.urls("docs.python.org", config) + + # Results are automatically sorted by relevance + print("Most relevant Python async content:") + for url in urls[:5]: + score = url['relevance_score'] + title = url['head_data'].get('title', 'No title') + print(f"[{score:.2f}] {title}") + print(f" {url['url']}") + +# URL-based scoring (when extract_head=False) +async def url_based_scoring(): + async with AsyncUrlSeeder() as seeder: + config = SeedingConfig( + source="sitemap", + extract_head=False, # Fast URL-only scoring + query="machine learning tutorial", + scoring_method="bm25", + score_threshold=0.2 + ) + + urls = await seeder.urls("example.com", config) + + # Scoring based on URL structure, domain, path segments + for url in urls[:5]: + print(f"[{url['relevance_score']:.2f}] {url['url']}") + +# Multi-concept queries +async def complex_queries(): + queries = [ + "data science pandas numpy visualization", + "web scraping automation selenium", + "machine learning tensorflow pytorch", + "api documentation rest graphql" + ] + + async with 
AsyncUrlSeeder() as seeder: + all_results = [] + + for query in queries: + config = SeedingConfig( + source="sitemap", + extract_head=True, + query=query, + scoring_method="bm25", + score_threshold=0.4, + max_urls=10 + ) + + urls = await seeder.urls("learning-site.com", config) + all_results.extend(urls) + + # Remove duplicates while preserving order + seen = set() + unique_results = [] + for url in all_results: + if url['url'] not in seen: + seen.add(url['url']) + unique_results.append(url) + + print(f"Found {len(unique_results)} unique pages across all topics") +``` + +### Live URL Validation + +```python +async def url_validation(): + async with AsyncUrlSeeder() as seeder: + config = SeedingConfig( + source="sitemap", + live_check=True, # Verify URLs are accessible + concurrency=15, # Parallel HEAD requests + hits_per_sec=8, # Rate limiting + max_urls=100 + ) + + urls = await seeder.urls("example.com", config) + + # Analyze results + valid_urls = [u for u in urls if u['status'] == 'valid'] + invalid_urls = [u for u in urls if u['status'] == 'not_valid'] + + print(f"✅ Valid URLs: {len(valid_urls)}") + print(f"❌ Invalid URLs: {len(invalid_urls)}") + print(f"📊 Success rate: {len(valid_urls)/len(urls)*100:.1f}%") + + # Show some invalid URLs for debugging + if invalid_urls: + print("\nSample invalid URLs:") + for url in invalid_urls[:3]: + print(f" - {url['url']}") + +# Combined validation and metadata +async def comprehensive_validation(): + async with AsyncUrlSeeder() as seeder: + config = SeedingConfig( + source="sitemap", + live_check=True, # Verify accessibility + extract_head=True, # Get metadata + query="tutorial guide", # Relevance scoring + scoring_method="bm25", + score_threshold=0.2, + concurrency=10, + max_urls=50 + ) + + urls = await seeder.urls("docs.example.com", config) + + # Filter for valid, relevant tutorials + good_tutorials = [ + url for url in urls + if url['status'] == 'valid' and + url['relevance_score'] > 0.3 and + 'tutorial' in 
url['head_data'].get('title', '').lower() + ] + + print(f"Found {len(good_tutorials)} high-quality tutorials") +``` + +### Multi-Domain Discovery + +```python +async def multi_domain_research(): + async with AsyncUrlSeeder() as seeder: + # Research Python tutorials across multiple sites + domains = [ + "docs.python.org", + "realpython.com", + "python-course.eu", + "tutorialspoint.com" + ] + + config = SeedingConfig( + source="sitemap", + extract_head=True, + query="python beginner tutorial basics", + scoring_method="bm25", + score_threshold=0.3, + max_urls=15 # Per domain + ) + + # Discover across all domains in parallel + results = await seeder.many_urls(domains, config) + + # Collect and rank all tutorials + all_tutorials = [] + for domain, urls in results.items(): + for url in urls: + url['domain'] = domain + all_tutorials.append(url) + + # Sort by relevance across all domains + all_tutorials.sort(key=lambda x: x['relevance_score'], reverse=True) + + print(f"Top 10 Python tutorials across {len(domains)} sites:") + for i, tutorial in enumerate(all_tutorials[:10], 1): + score = tutorial['relevance_score'] + title = tutorial['head_data'].get('title', 'No title')[:60] + domain = tutorial['domain'] + print(f"{i:2d}. 
[{score:.2f}] {title}") + print(f" {domain}") + +# Competitor analysis +async def competitor_analysis(): + competitors = ["competitor1.com", "competitor2.com", "competitor3.com"] + + async with AsyncUrlSeeder() as seeder: + config = SeedingConfig( + source="sitemap", + extract_head=True, + pattern="*/blog/*", + max_urls=50 + ) + + results = await seeder.many_urls(competitors, config) + + # Analyze content strategies + for domain, urls in results.items(): + content_types = {} + + for url in urls: + # Extract content type from metadata + meta = url['head_data'].get('meta', {}) + og_type = meta.get('og:type', 'unknown') + content_types[og_type] = content_types.get(og_type, 0) + 1 + + print(f"\n{domain} content distribution:") + for ctype, count in sorted(content_types.items(), + key=lambda x: x[1], reverse=True): + print(f" {ctype}: {count}") +``` + +### Complete Pipeline: Discovery → Filter → Crawl + +```python +async def smart_research_pipeline(): + """Complete pipeline: discover URLs, filter by relevance, crawl top results""" + + async with AsyncUrlSeeder() as seeder: + # Step 1: Discover relevant URLs + print("🔍 Discovering URLs...") + config = SeedingConfig( + source="sitemap+cc", + extract_head=True, + query="machine learning deep learning tutorial", + scoring_method="bm25", + score_threshold=0.4, + max_urls=100 + ) + + urls = await seeder.urls("example.com", config) + print(f" Found {len(urls)} relevant URLs") + + # Step 2: Select top articles + top_articles = sorted(urls, + key=lambda x: x['relevance_score'], + reverse=True)[:10] + + print(f" Selected top {len(top_articles)} for crawling") + + # Step 3: Show what we're about to crawl + print("\n📋 Articles to crawl:") + for i, article in enumerate(top_articles, 1): + score = article['relevance_score'] + title = article['head_data'].get('title', 'No title')[:60] + print(f" {i}. 
[{score:.2f}] {title}") + + # Step 4: Crawl selected articles + from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + + print(f"\n🕷️ Crawling {len(top_articles)} articles...") + + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + only_text=True, + word_count_threshold=200, + stream=True # Process results as they come + ) + + # Extract URLs and crawl + article_urls = [article['url'] for article in top_articles] + + crawled_count = 0 + async for result in await crawler.arun_many(article_urls, config=config): + if result.success: + crawled_count += 1 + word_count = len(result.markdown.raw_markdown.split()) + print(f" ✅ [{crawled_count}/{len(article_urls)}] " + f"{word_count} words from {result.url[:50]}...") + else: + print(f" ❌ Failed: {result.url[:50]}...") + + print(f"\n✨ Successfully crawled {crawled_count} articles!") + +asyncio.run(smart_research_pipeline()) +``` + +### Advanced Features and Performance + +```python +# Cache management +async def cache_management(): + async with AsyncUrlSeeder() as seeder: + # First run - populate cache + config = SeedingConfig( + source="sitemap", + extract_head=True, + force=True # Bypass cache, fetch fresh + ) + urls = await seeder.urls("example.com", config) + + # Subsequent runs - use cache (much faster) + config = SeedingConfig( + source="sitemap", + extract_head=True, + force=False # Use cache + ) + urls = await seeder.urls("example.com", config) + +# Performance optimization +async def performance_tuning(): + async with AsyncUrlSeeder() as seeder: + # High-performance configuration + config = SeedingConfig( + source="cc", + concurrency=50, # Many parallel workers + hits_per_sec=20, # High rate limit + max_urls=10000, # Large dataset + extract_head=False, # Skip metadata for speed + filter_nonsense_urls=True # Auto-filter utility URLs + ) + + import time + start = time.time() + urls = await seeder.urls("large-site.com", config) + elapsed = time.time() - start + + print(f"Processed {len(urls)} URLs 
in {elapsed:.2f}s") + print(f"Speed: {len(urls)/elapsed:.0f} URLs/second") + +# Memory-safe processing for large domains +async def large_domain_processing(): + async with AsyncUrlSeeder() as seeder: + # Safe for domains with 1M+ URLs + config = SeedingConfig( + source="cc+sitemap", + concurrency=50, # Bounded queue adapts to this + max_urls=100000, # Process in batches + filter_nonsense_urls=True + ) + + # The seeder automatically manages memory by: + # - Using bounded queues (prevents RAM spikes) + # - Applying backpressure when queue is full + # - Processing URLs as they're discovered + urls = await seeder.urls("huge-site.com", config) + +# Configuration cloning and reuse +config_base = SeedingConfig( + source="sitemap", + extract_head=True, + concurrency=20 +) + +# Create variations +blog_config = config_base.clone(pattern="*/blog/*") +docs_config = config_base.clone( + pattern="*/docs/*", + query="API documentation", + scoring_method="bm25" +) +fast_config = config_base.clone( + extract_head=False, + concurrency=100, + hits_per_sec=50 +) +``` + +### Troubleshooting and Best Practices + +```python +# Common issues and solutions +async def troubleshooting_guide(): + async with AsyncUrlSeeder() as seeder: + # Issue: No URLs found + try: + config = SeedingConfig(source="sitemap", pattern="*/nonexistent/*") + urls = await seeder.urls("example.com", config) + if not urls: + # Solution: Try broader pattern or different source + config = SeedingConfig(source="cc+sitemap", pattern="*") + urls = await seeder.urls("example.com", config) + except Exception as e: + print(f"Discovery failed: {e}") + + # Issue: Slow performance + config = SeedingConfig( + source="sitemap", # Faster than CC + concurrency=10, # Reduce if hitting rate limits + hits_per_sec=5, # Add rate limiting + extract_head=False # Skip if metadata not needed + ) + + # Issue: Low relevance scores + config = SeedingConfig( + query="specific detailed query terms", + score_threshold=0.1, # Lower threshold + 
scoring_method="bm25" + ) + + # Issue: Memory issues with large sites + config = SeedingConfig( + max_urls=10000, # Limit results + concurrency=20, # Reduce concurrency + source="sitemap" # Use sitemap only + ) + +# Performance benchmarks +print(""" +Typical performance on standard connection: +- Sitemap discovery: 100-1,000 URLs/second +- Common Crawl discovery: 50-500 URLs/second +- HEAD checking: 10-50 URLs/second +- Head extraction: 5-20 URLs/second +- BM25 scoring: 10,000+ URLs/second +""") + +# Best practices +best_practices = """ +✅ Use context manager: async with AsyncUrlSeeder() as seeder +✅ Start with sitemaps (faster), add CC if needed +✅ Use extract_head=True only when you need metadata +✅ Set reasonable max_urls to limit processing +✅ Add rate limiting for respectful crawling +✅ Cache results with force=False for repeated operations +✅ Filter nonsense URLs (enabled by default) +✅ Use specific patterns to reduce irrelevant results +""" +``` + +**📖 Learn more:** [Complete URL Seeding Guide](https://docs.crawl4ai.com/core/url-seeding/), [SeedingConfig Reference](https://docs.crawl4ai.com/api/parameters/), [Multi-URL Crawling](https://docs.crawl4ai.com/advanced/multi-url-crawling/) +--- + + +## Deep Crawling Filters & Scorers + +Advanced URL filtering and scoring strategies for intelligent deep crawling with performance optimization. 
+ +### URL Filters - Content and Domain Control + +```python +from crawl4ai.deep_crawling.filters import ( + URLPatternFilter, DomainFilter, ContentTypeFilter, + FilterChain, ContentRelevanceFilter, SEOFilter +) + +# Pattern-based filtering +pattern_filter = URLPatternFilter( + patterns=[ + "*.html", # HTML pages only + "*/blog/*", # Blog posts + "*/articles/*", # Article pages + "*2024*", # Recent content + "^https://example.com/docs/.*" # Regex pattern + ], + use_glob=True, + reverse=False # False = include matching, True = exclude matching +) + +# Domain filtering with subdomains +domain_filter = DomainFilter( + allowed_domains=["example.com", "docs.example.com"], + blocked_domains=["ads.example.com", "tracker.com"] +) + +# Content type filtering +content_filter = ContentTypeFilter( + allowed_types=["text/html", "application/pdf"], + check_extension=True +) + +# Apply individual filters +url = "https://example.com/blog/2024/article.html" +print(f"Pattern filter: {pattern_filter.apply(url)}") +print(f"Domain filter: {domain_filter.apply(url)}") +print(f"Content filter: {content_filter.apply(url)}") +``` + +### Filter Chaining - Combine Multiple Filters + +```python +# Create filter chain for comprehensive filtering +filter_chain = FilterChain([ + DomainFilter(allowed_domains=["example.com"]), + URLPatternFilter(patterns=["*/blog/*", "*/docs/*"]), + ContentTypeFilter(allowed_types=["text/html"]) +]) + +# Apply chain to URLs +urls = [ + "https://example.com/blog/post1.html", + "https://spam.com/content.html", + "https://example.com/blog/image.jpg", + "https://example.com/docs/guide.html" +] + +async def filter_urls(urls, filter_chain): + filtered = [] + for url in urls: + if await filter_chain.apply(url): + filtered.append(url) + return filtered + +# Usage +filtered_urls = await filter_urls(urls, filter_chain) +print(f"Filtered URLs: {filtered_urls}") + +# Check filter statistics +for filter_obj in filter_chain.filters: + stats = filter_obj.stats + 
print(f"{filter_obj.name}: {stats.passed_urls}/{stats.total_urls} passed") +``` + +### Advanced Content Filters + +```python +# BM25-based content relevance filtering +relevance_filter = ContentRelevanceFilter( + query="python machine learning tutorial", + threshold=0.5, # Minimum relevance score + k1=1.2, # TF saturation parameter + b=0.75, # Length normalization + avgdl=1000 # Average document length +) + +# SEO quality filtering +seo_filter = SEOFilter( + threshold=0.65, # Minimum SEO score + keywords=["python", "tutorial", "guide"], + weights={ + "title_length": 0.15, + "title_kw": 0.18, + "meta_description": 0.12, + "canonical": 0.10, + "robot_ok": 0.20, + "schema_org": 0.10, + "url_quality": 0.15 + } +) + +# Apply advanced filters +url = "https://example.com/python-ml-tutorial" +relevance_score = await relevance_filter.apply(url) +seo_score = await seo_filter.apply(url) + +print(f"Relevance: {relevance_score}, SEO: {seo_score}") +``` + +### URL Scorers - Quality and Relevance Scoring + +```python +from crawl4ai.deep_crawling.scorers import ( + KeywordRelevanceScorer, PathDepthScorer, ContentTypeScorer, + FreshnessScorer, DomainAuthorityScorer, CompositeScorer +) + +# Keyword relevance scoring +keyword_scorer = KeywordRelevanceScorer( + keywords=["python", "tutorial", "guide", "machine", "learning"], + weight=1.0, + case_sensitive=False +) + +# Path depth scoring (optimal depth = 3) +depth_scorer = PathDepthScorer( + optimal_depth=3, # /category/subcategory/article + weight=0.8 +) + +# Content type scoring +content_type_scorer = ContentTypeScorer( + type_weights={ + "html": 1.0, # Highest priority + "pdf": 0.8, # Medium priority + "txt": 0.6, # Lower priority + "doc": 0.4 # Lowest priority + }, + weight=0.9 +) + +# Freshness scoring +freshness_scorer = FreshnessScorer( + weight=0.7, + current_year=2024 +) + +# Domain authority scoring +domain_scorer = DomainAuthorityScorer( + domain_weights={ + "python.org": 1.0, + "github.com": 0.9, + "stackoverflow.com": 
0.85, + "medium.com": 0.7, + "personal-blog.com": 0.3 + }, + default_weight=0.5, + weight=1.0 +) + +# Score individual URLs +url = "https://python.org/tutorial/2024/machine-learning.html" +scores = { + "keyword": keyword_scorer.score(url), + "depth": depth_scorer.score(url), + "content": content_type_scorer.score(url), + "freshness": freshness_scorer.score(url), + "domain": domain_scorer.score(url) +} + +print(f"Individual scores: {scores}") +``` + +### Composite Scoring - Combine Multiple Scorers + +```python +# Create composite scorer combining all strategies +composite_scorer = CompositeScorer( + scorers=[ + KeywordRelevanceScorer(["python", "tutorial"], weight=1.5), + PathDepthScorer(optimal_depth=3, weight=1.0), + ContentTypeScorer({"html": 1.0, "pdf": 0.8}, weight=1.2), + FreshnessScorer(weight=0.8, current_year=2024), + DomainAuthorityScorer({ + "python.org": 1.0, + "github.com": 0.9 + }, weight=1.3) + ], + normalize=True # Normalize by number of scorers +) + +# Score multiple URLs +urls_to_score = [ + "https://python.org/tutorial/2024/basics.html", + "https://github.com/user/python-guide/blob/main/README.md", + "https://random-blog.com/old/2018/python-stuff.html", + "https://python.org/docs/deep/nested/advanced/guide.html" +] + +scored_urls = [] +for url in urls_to_score: + score = composite_scorer.score(url) + scored_urls.append((url, score)) + +# Sort by score (highest first) +scored_urls.sort(key=lambda x: x[1], reverse=True) + +for url, score in scored_urls: + print(f"Score: {score:.3f} - {url}") + +# Check scorer statistics +print(f"\nScoring statistics:") +print(f"URLs scored: {composite_scorer.stats._urls_scored}") +print(f"Average score: {composite_scorer.stats.get_average():.3f}") +``` + +### Advanced Filter Patterns + +```python +# Complex pattern matching +advanced_patterns = URLPatternFilter( + patterns=[ + r"^https://docs\.python\.org/\d+/", # Python docs with version + r".*/tutorial/.*\.html$", # Tutorial pages + r".*/guide/(?!deprecated).*", 
# Guides but not deprecated + "*/blog/{2020,2021,2022,2023,2024}/*", # Recent blog posts + "**/{api,reference}/**/*.html" # API/reference docs + ], + use_glob=True +) + +# Exclude patterns (reverse=True) +exclude_filter = URLPatternFilter( + patterns=[ + "*/admin/*", + "*/login/*", + "*/private/*", + "**/.*", # Hidden files + "*.{jpg,png,gif,css,js}$" # Media and assets + ], + reverse=True # Exclude matching patterns +) + +# Content type with extension mapping +detailed_content_filter = ContentTypeFilter( + allowed_types=["text", "application"], + check_extension=True, + ext_map={ + "html": "text/html", + "htm": "text/html", + "md": "text/markdown", + "pdf": "application/pdf", + "doc": "application/msword", + "docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document" + } +) +``` + +### Performance-Optimized Filtering + +```python +# High-performance filter chain for large-scale crawling +class OptimizedFilterChain: + def __init__(self): + # Fast filters first (domain, patterns) + self.fast_filters = [ + DomainFilter( + allowed_domains=["example.com", "docs.example.com"], + blocked_domains=["ads.example.com"] + ), + URLPatternFilter([ + "*.html", "*.pdf", "*/blog/*", "*/docs/*" + ]) + ] + + # Slower filters last (content analysis) + self.slow_filters = [ + ContentRelevanceFilter( + query="important content", + threshold=0.3 + ) + ] + + async def apply_optimized(self, url: str) -> bool: + # Apply fast filters first + for filter_obj in self.fast_filters: + if not filter_obj.apply(url): + return False + + # Only apply slow filters if fast filters pass + for filter_obj in self.slow_filters: + if not await filter_obj.apply(url): + return False + + return True + +# Batch filtering with concurrency +async def batch_filter_urls(urls, filter_chain, max_concurrent=50): + import asyncio + semaphore = asyncio.Semaphore(max_concurrent) + + async def filter_single(url): + async with semaphore: + return await filter_chain.apply(url), url + + tasks = 
[filter_single(url) for url in urls] + results = await asyncio.gather(*tasks) + + return [url for passed, url in results if passed] + +# Usage with 1000 URLs +large_url_list = [f"https://example.com/page{i}.html" for i in range(1000)] +optimized_chain = OptimizedFilterChain() +filtered = await batch_filter_urls(large_url_list, optimized_chain) +``` + +### Custom Filter Implementation + +```python +from crawl4ai.deep_crawling.filters import URLFilter +import re + +class CustomLanguageFilter(URLFilter): + """Filter URLs by language indicators""" + + def __init__(self, allowed_languages=["en"], weight=1.0): + super().__init__() + self.allowed_languages = set(allowed_languages) + self.lang_patterns = { + "en": re.compile(r"/en/|/english/|lang=en"), + "es": re.compile(r"/es/|/spanish/|lang=es"), + "fr": re.compile(r"/fr/|/french/|lang=fr"), + "de": re.compile(r"/de/|/german/|lang=de") + } + + def apply(self, url: str) -> bool: + # Default to English if no language indicators + if not any(pattern.search(url) for pattern in self.lang_patterns.values()): + result = "en" in self.allowed_languages + self._update_stats(result) + return result + + # Check for allowed languages + for lang in self.allowed_languages: + if lang in self.lang_patterns: + if self.lang_patterns[lang].search(url): + self._update_stats(True) + return True + + self._update_stats(False) + return False + +# Custom scorer implementation +from crawl4ai.deep_crawling.scorers import URLScorer + +class CustomComplexityScorer(URLScorer): + """Score URLs by content complexity indicators""" + + def __init__(self, weight=1.0): + super().__init__(weight) + self.complexity_indicators = { + "tutorial": 0.9, + "guide": 0.8, + "example": 0.7, + "reference": 0.6, + "api": 0.5 + } + + def _calculate_score(self, url: str) -> float: + url_lower = url.lower() + max_score = 0.0 + + for indicator, score in self.complexity_indicators.items(): + if indicator in url_lower: + max_score = max(max_score, score) + + return max_score 
+ +# Use custom filters and scorers +custom_filter = CustomLanguageFilter(allowed_languages=["en", "es"]) +custom_scorer = CustomComplexityScorer(weight=1.2) + +url = "https://example.com/en/tutorial/advanced-guide.html" +passes_filter = custom_filter.apply(url) +complexity_score = custom_scorer.score(url) + +print(f"Passes language filter: {passes_filter}") +print(f"Complexity score: {complexity_score}") +``` + +### Integration with Deep Crawling + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.deep_crawling import DeepCrawlStrategy + +async def deep_crawl_with_filtering(): + # Create comprehensive filter chain + filter_chain = FilterChain([ + DomainFilter(allowed_domains=["python.org"]), + URLPatternFilter(["*/tutorial/*", "*/guide/*", "*/docs/*"]), + ContentTypeFilter(["text/html"]), + SEOFilter(threshold=0.6, keywords=["python", "programming"]) + ]) + + # Create composite scorer + scorer = CompositeScorer([ + KeywordRelevanceScorer(["python", "tutorial"], weight=1.5), + FreshnessScorer(weight=0.8), + PathDepthScorer(optimal_depth=3, weight=1.0) + ], normalize=True) + + # Configure deep crawl strategy with filters and scorers + deep_strategy = DeepCrawlStrategy( + max_depth=3, + max_pages=100, + url_filter=filter_chain, + url_scorer=scorer, + score_threshold=0.6 # Only crawl URLs scoring above 0.6 + ) + + config = CrawlerRunConfig( + deep_crawl_strategy=deep_strategy, + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://python.org", + config=config + ) + + print(f"Deep crawl completed: {result.success}") + if hasattr(result, 'deep_crawl_results'): + print(f"Pages crawled: {len(result.deep_crawl_results)}") + +# Run the deep crawl +await deep_crawl_with_filtering() +``` + +**📖 Learn more:** [Deep Crawling Strategy](https://docs.crawl4ai.com/core/deep-crawling/), [Custom Filter Development](https://docs.crawl4ai.com/advanced/custom-filters/), [Performance 
Optimization](https://docs.crawl4ai.com/advanced/performance-tuning/) +--- + + +## Summary + +Crawl4AI provides a comprehensive solution for web crawling and data extraction optimized for AI applications. From simple page crawling to complex multi-URL operations with advanced filtering, the library offers the flexibility and performance needed for modern data extraction workflows. + +**Key Takeaways:** +- Start with basic installation and simple crawling patterns +- Use configuration objects for consistent, maintainable code +- Choose appropriate extraction strategies based on your data structure +- Leverage Docker for production deployments +- Implement advanced features like deep crawling and custom filters as needed + +**Next Steps:** +- Explore the [GitHub repository](https://github.com/unclecode/crawl4ai) for latest updates +- Join the [Discord community](https://discord.gg/jP8KfhDhyN) for support +- Check out [example projects](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) for inspiration + +Happy crawling! 🕷️ diff --git a/docs/examples/amazon_product_extraction_direct_url.py b/docs/examples/amazon_product_extraction_direct_url.py index ec734245..c50b9cb5 100644 --- a/docs/examples/amazon_product_extraction_direct_url.py +++ b/docs/examples/amazon_product_extraction_direct_url.py @@ -5,7 +5,7 @@ prices, ratings, and other details using CSS selectors. """ from crawl4ai import AsyncWebCrawler -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig import json diff --git a/docs/examples/amazon_product_extraction_using_hooks.py b/docs/examples/amazon_product_extraction_using_hooks.py index 5118b5d9..70e06311 100644 --- a/docs/examples/amazon_product_extraction_using_hooks.py +++ b/docs/examples/amazon_product_extraction_using_hooks.py @@ -5,7 +5,7 @@ prices, ratings, and other details using CSS selectors. 
""" from crawl4ai import AsyncWebCrawler, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig import json from playwright.async_api import Page, BrowserContext diff --git a/docs/examples/amazon_product_extraction_using_use_javascript.py b/docs/examples/amazon_product_extraction_using_use_javascript.py index e412c931..364c400b 100644 --- a/docs/examples/amazon_product_extraction_using_use_javascript.py +++ b/docs/examples/amazon_product_extraction_using_use_javascript.py @@ -5,7 +5,7 @@ prices, ratings, and other details using CSS selectors. """ from crawl4ai import AsyncWebCrawler, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig import json diff --git a/docs/examples/c4a_script/amazon_example/amazon_r2d2_search.py b/docs/examples/c4a_script/amazon_example/amazon_r2d2_search.py index 66c586b0..5b02c16b 100644 --- a/docs/examples/c4a_script/amazon_example/amazon_r2d2_search.py +++ b/docs/examples/c4a_script/amazon_example/amazon_r2d2_search.py @@ -20,7 +20,7 @@ from pathlib import Path from typing import List, Dict, Any from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai.script.c4a_compile import C4ACompiler diff --git a/docs/examples/c4a_script/github_search/github_search_crawler.py b/docs/examples/c4a_script/github_search/github_search_crawler.py index 71b936e3..71ad7caa 100644 --- a/docs/examples/c4a_script/github_search/github_search_crawler.py +++ b/docs/examples/c4a_script/github_search/github_search_crawler.py @@ -20,7 +20,7 @@ from pathlib import Path from typing import List, Dict, Any from crawl4ai import AsyncWebCrawler, 
BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai.script.c4a_compile import C4ACompiler diff --git a/docs/examples/extraction_strategies_examples.py b/docs/examples/extraction_strategies_examples.py index 84192f97..5f072f88 100644 --- a/docs/examples/extraction_strategies_examples.py +++ b/docs/examples/extraction_strategies_examples.py @@ -12,7 +12,7 @@ import os from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode from crawl4ai import LLMConfig -from crawl4ai.extraction_strategy import ( +from crawl4ai import ( LLMExtractionStrategy, JsonCssExtractionStrategy, JsonXPathExtractionStrategy, diff --git a/docs/examples/quickstart.ipynb b/docs/examples/quickstart.ipynb index 56365cde..cc5be006 100644 --- a/docs/examples/quickstart.ipynb +++ b/docs/examples/quickstart.ipynb @@ -518,7 +518,7 @@ } ], "source": [ - "from crawl4ai.extraction_strategy import LLMExtractionStrategy\n", + "from crawl4ai import LLMExtractionStrategy\n", "from pydantic import BaseModel, Field\n", "import os, json\n", "\n", @@ -594,7 +594,7 @@ } ], "source": [ - "from crawl4ai.extraction_strategy import CosineStrategy\n", + "from crawl4ai import CosineStrategy\n", "\n", "async def cosine_similarity_extraction():\n", " async with AsyncWebCrawler() as crawler:\n", diff --git a/docs/examples/quickstart.py b/docs/examples/quickstart.py index 5efb785d..9992a6bf 100644 --- a/docs/examples/quickstart.py +++ b/docs/examples/quickstart.py @@ -16,7 +16,7 @@ from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import PruningContentFilter -from crawl4ai.extraction_strategy import ( +from crawl4ai import ( JsonCssExtractionStrategy, LLMExtractionStrategy, ) @@ -416,7 +416,7 @@ async 
def crawl_dynamic_content_pages_method_2(): async def cosine_similarity_extraction(): - from crawl4ai.extraction_strategy import CosineStrategy + from crawl4ai import CosineStrategy crawl_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, extraction_strategy=CosineStrategy( diff --git a/docs/examples/quickstart_examples_set_2.py b/docs/examples/quickstart_examples_set_2.py index 3adbfc0d..b12b084c 100644 --- a/docs/examples/quickstart_examples_set_2.py +++ b/docs/examples/quickstart_examples_set_2.py @@ -16,7 +16,7 @@ from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import PruningContentFilter -from crawl4ai.extraction_strategy import ( +from crawl4ai import ( JsonCssExtractionStrategy, LLMExtractionStrategy, ) @@ -416,7 +416,7 @@ async def crawl_dynamic_content_pages_method_2(): async def cosine_similarity_extraction(): - from crawl4ai.extraction_strategy import CosineStrategy + from crawl4ai import CosineStrategy crawl_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, extraction_strategy=CosineStrategy( diff --git a/docs/examples/summarize_page.py b/docs/examples/summarize_page.py index da2bcd21..cd377d80 100644 --- a/docs/examples/summarize_page.py +++ b/docs/examples/summarize_page.py @@ -2,7 +2,7 @@ import os import json from crawl4ai.web_crawler import WebCrawler from crawl4ai.chunking_strategy import * -from crawl4ai.extraction_strategy import * +from crawl4ai import * from crawl4ai.crawler_strategy import * url = r"https://marketplace.visualstudio.com/items?itemName=Unclecode.groqopilot" diff --git a/docs/examples/tutorial_v0.5.py b/docs/examples/tutorial_v0.5.py index fe8e0a2b..1693b1f7 100644 --- a/docs/examples/tutorial_v0.5.py +++ b/docs/examples/tutorial_v0.5.py @@ -18,7 +18,7 @@ from crawl4ai import RoundRobinProxyStrategy from crawl4ai.content_filter_strategy 
import LLMContentFilter from crawl4ai import DefaultMarkdownGenerator from crawl4ai import LLMConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy from pprint import pprint diff --git a/docs/md_v2/advanced/session-management.md b/docs/md_v2/advanced/session-management.md index 180dfc85..d63b1e80 100644 --- a/docs/md_v2/advanced/session-management.md +++ b/docs/md_v2/advanced/session-management.md @@ -45,7 +45,7 @@ Here's an example of crawling GitHub commits across multiple pages while preserv ```python from crawl4ai.async_configs import CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai.cache_context import CacheMode async def crawl_dynamic_content(): diff --git a/docs/md_v2/api/arun.md b/docs/md_v2/api/arun.md index ea0f8176..a3086a8c 100644 --- a/docs/md_v2/api/arun.md +++ b/docs/md_v2/api/arun.md @@ -215,7 +215,7 @@ Below is a snippet combining many parameters: ```python import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def main(): # Example schema diff --git a/docs/md_v2/api/async-webcrawler.md b/docs/md_v2/api/async-webcrawler.md index 50177f3e..b8f105fc 100644 --- a/docs/md_v2/api/async-webcrawler.md +++ b/docs/md_v2/api/async-webcrawler.md @@ -217,7 +217,7 @@ Below is an example hooking it all together: ```python import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy import json async def main(): diff --git a/docs/md_v2/api/strategies.md b/docs/md_v2/api/strategies.md index a44d0fcd..07649ee9 100644 --- 
a/docs/md_v2/api/strategies.md +++ b/docs/md_v2/api/strategies.md @@ -169,7 +169,7 @@ OverlappingWindowChunking( ```python from pydantic import BaseModel -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy from crawl4ai import LLMConfig # Define schema @@ -247,7 +247,7 @@ async with AsyncWebCrawler() as crawler: ### CSS Extraction ```python -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # Define schema schema = { diff --git a/docs/md_v2/apps/crawl4ai-assistant/content/click2crawl.js b/docs/md_v2/apps/crawl4ai-assistant/content/click2crawl.js index 0c3c37b9..883bb020 100644 --- a/docs/md_v2/apps/crawl4ai-assistant/content/click2crawl.js +++ b/docs/md_v2/apps/crawl4ai-assistant/content/click2crawl.js @@ -1701,7 +1701,7 @@ Generated: ${new Date().toISOString()} import asyncio import json from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # The extraction schema generated from your selections EXTRACTION_SCHEMA = ${schemaJson} @@ -1782,7 +1782,7 @@ import asyncio import json from pathlib import Path from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # HTML snippet of the selected container element HTML_SNIPPET = """ diff --git a/docs/md_v2/apps/crawl4ai-assistant/content/scriptBuilder.js b/docs/md_v2/apps/crawl4ai-assistant/content/scriptBuilder.js index de1ecac0..f7ef854f 100644 --- a/docs/md_v2/apps/crawl4ai-assistant/content/scriptBuilder.js +++ b/docs/md_v2/apps/crawl4ai-assistant/content/scriptBuilder.js @@ -2437,7 +2437,7 @@ Generated: ${timestamp} import asyncio from pathlib import Path from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig -from 
crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # C4A Script commands C4A_SCRIPT = """ diff --git a/docs/md_v2/assets/llm.txt/txt/docker.txt b/docs/md_v2/assets/llm.txt/txt/docker.txt index a4c55d8a..65372fb3 100644 --- a/docs/md_v2/assets/llm.txt/txt/docker.txt +++ b/docs/md_v2/assets/llm.txt/txt/docker.txt @@ -476,7 +476,7 @@ services: ```python # Method 1: Create config objects and dump to see expected JSON structure from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy import json # Create browser config and see JSON structure diff --git a/docs/md_v2/assets/llm.txt/txt/extraction-llm.txt b/docs/md_v2/assets/llm.txt/txt/extraction-llm.txt index 4518761a..a9f5c18c 100644 --- a/docs/md_v2/assets/llm.txt/txt/extraction-llm.txt +++ b/docs/md_v2/assets/llm.txt/txt/extraction-llm.txt @@ -37,7 +37,7 @@ import json from pydantic import BaseModel, Field from typing import List from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class SentimentAnalysis(BaseModel): """Use LLM when you need semantic understanding""" diff --git a/docs/md_v2/assets/llm.txt/txt/extraction-no-llm.txt b/docs/md_v2/assets/llm.txt/txt/extraction-no-llm.txt index 1b07df3b..099227e3 100644 --- a/docs/md_v2/assets/llm.txt/txt/extraction-no-llm.txt +++ b/docs/md_v2/assets/llm.txt/txt/extraction-no-llm.txt @@ -39,7 +39,7 @@ import json import asyncio from pathlib import Path from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def smart_extraction_workflow(): """ @@ -176,7 +176,7 @@ social_schema = 
JsonCssExtractionStrategy.generate_schema( import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # Manual schema for consistent product pages simple_schema = { @@ -342,7 +342,7 @@ asyncio.run(extract_complex_ecommerce()) ### XPath Alternative (When CSS Isn't Enough) ```python -from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy +from crawl4ai import JsonXPathExtractionStrategy # XPath for more complex selections xpath_schema = { @@ -387,7 +387,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True) import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import RegexExtractionStrategy +from crawl4ai import RegexExtractionStrategy async def extract_common_patterns(): # Use built-in patterns for common data types diff --git a/docs/md_v2/assets/llm.txt/txt/llms-full-v0.1.1.txt b/docs/md_v2/assets/llm.txt/txt/llms-full-v0.1.1.txt index be0937e8..12dad603 100644 --- a/docs/md_v2/assets/llm.txt/txt/llms-full-v0.1.1.txt +++ b/docs/md_v2/assets/llm.txt/txt/llms-full-v0.1.1.txt @@ -1835,7 +1835,7 @@ import json from pydantic import BaseModel, Field from typing import List from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class SentimentAnalysis(BaseModel): """Use LLM when you need semantic understanding""" @@ -2743,7 +2743,7 @@ import json import asyncio from pathlib import Path from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def smart_extraction_workflow(): """ @@ -2880,7 +2880,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema( import json import asyncio from crawl4ai 
import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # Manual schema for consistent product pages simple_schema = { @@ -3046,7 +3046,7 @@ asyncio.run(extract_complex_ecommerce()) ### XPath Alternative (When CSS Isn't Enough) ```python -from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy +from crawl4ai import JsonXPathExtractionStrategy # XPath for more complex selections xpath_schema = { @@ -3091,7 +3091,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True) import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import RegexExtractionStrategy +from crawl4ai import RegexExtractionStrategy async def extract_common_patterns(): # Use built-in patterns for common data types @@ -4711,7 +4711,7 @@ services: ```python # Method 1: Create config objects and dump to see expected JSON structure from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy import json # Create browser config and see JSON structure @@ -5792,7 +5792,7 @@ import json import asyncio from pathlib import Path from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def smart_extraction_workflow(): """ @@ -5929,7 +5929,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema( import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # Manual schema for consistent product pages simple_schema = { @@ -6095,7 +6095,7 @@ asyncio.run(extract_complex_ecommerce()) ### 
XPath Alternative (When CSS Isn't Enough) ```python -from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy +from crawl4ai import JsonXPathExtractionStrategy # XPath for more complex selections xpath_schema = { @@ -6140,7 +6140,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True) import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import RegexExtractionStrategy +from crawl4ai import RegexExtractionStrategy async def extract_common_patterns(): # Use built-in patterns for common data types diff --git a/docs/md_v2/assets/llm.txt/txt/llms-full.txt b/docs/md_v2/assets/llm.txt/txt/llms-full.txt index be0937e8..12dad603 100644 --- a/docs/md_v2/assets/llm.txt/txt/llms-full.txt +++ b/docs/md_v2/assets/llm.txt/txt/llms-full.txt @@ -1835,7 +1835,7 @@ import json from pydantic import BaseModel, Field from typing import List from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class SentimentAnalysis(BaseModel): """Use LLM when you need semantic understanding""" @@ -2743,7 +2743,7 @@ import json import asyncio from pathlib import Path from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def smart_extraction_workflow(): """ @@ -2880,7 +2880,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema( import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # Manual schema for consistent product pages simple_schema = { @@ -3046,7 +3046,7 @@ asyncio.run(extract_complex_ecommerce()) ### XPath Alternative (When CSS Isn't Enough) ```python -from crawl4ai.extraction_strategy import 
JsonXPathExtractionStrategy +from crawl4ai import JsonXPathExtractionStrategy # XPath for more complex selections xpath_schema = { @@ -3091,7 +3091,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True) import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import RegexExtractionStrategy +from crawl4ai import RegexExtractionStrategy async def extract_common_patterns(): # Use built-in patterns for common data types @@ -4711,7 +4711,7 @@ services: ```python # Method 1: Create config objects and dump to see expected JSON structure from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy import json # Create browser config and see JSON structure @@ -5792,7 +5792,7 @@ import json import asyncio from pathlib import Path from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def smart_extraction_workflow(): """ @@ -5929,7 +5929,7 @@ social_schema = JsonCssExtractionStrategy.generate_schema( import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy # Manual schema for consistent product pages simple_schema = { @@ -6095,7 +6095,7 @@ asyncio.run(extract_complex_ecommerce()) ### XPath Alternative (When CSS Isn't Enough) ```python -from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy +from crawl4ai import JsonXPathExtractionStrategy # XPath for more complex selections xpath_schema = { @@ -6140,7 +6140,7 @@ strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True) import json import asyncio from crawl4ai import AsyncWebCrawler, 
CrawlerRunConfig -from crawl4ai.extraction_strategy import RegexExtractionStrategy +from crawl4ai import RegexExtractionStrategy async def extract_common_patterns(): # Use built-in patterns for common data types diff --git a/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt b/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt deleted file mode 100644 index 846b6914..00000000 --- a/docs/md_v2/assets/llmtxt/crawl4ai_all_reasoning_content.llm.txt +++ /dev/null @@ -1,7708 +0,0 @@ -# Code Concatenation - -Generated on 2025-05-24 - -## File: docs/md_v2/core/browser-crawler-config.md - -```md -# Browser, Crawler & LLM Configuration (Quick Overview) - -Crawl4AI's flexibility stems from two key classes: - -1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent). -2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.). -3. **`LLMConfig`** - Dictates **how** LLM providers are configured. (model, api token, base url, temperature etc.) - -In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md). - ---- - -## 1. BrowserConfig Essentials - -```python -class BrowserConfig: - def __init__( - browser_type="chromium", - headless=True, - proxy_config=None, - viewport_width=1080, - viewport_height=600, - verbose=True, - use_persistent_context=False, - user_data_dir=None, - cookies=None, - headers=None, - user_agent=None, - text_mode=False, - light_mode=False, - extra_args=None, - # ... other advanced parameters omitted here - ): - ... -``` - -### Key Fields to Note - -1. **`browser_type`** -- Options: `"chromium"`, `"firefox"`, or `"webkit"`. 
-- Defaults to `"chromium"`. -- If you need a different engine, specify it here. - -2. **`headless`** - - `True`: Runs the browser in headless mode (invisible browser). - - `False`: Runs the browser in visible mode, which helps with debugging. - -3. **`proxy_config`** - - A dictionary with fields like: -```json -{ - "server": "http://proxy.example.com:8080", - "username": "...", - "password": "..." -} -``` - - Leave as `None` if a proxy is not required. - -4. **`viewport_width` & `viewport_height`**: - - The initial window size. - - Some sites behave differently with smaller or bigger viewports. - -5. **`verbose`**: - - If `True`, prints extra logs. - - Handy for debugging. - -6. **`use_persistent_context`**: - - If `True`, uses a **persistent** browser profile, storing cookies/local storage across runs. - - Typically also set `user_data_dir` to point to a folder. - -7. **`cookies`** & **`headers`**: - - If you want to start with specific cookies or add universal HTTP headers, set them here. - - E.g. `cookies=[{"name": "session", "value": "abc123", "domain": "example.com"}]`. - -8. **`user_agent`**: - - Custom User-Agent string. If `None`, a default is used. - - You can also set `user_agent_mode="random"` for randomization (if you want to fight bot detection). - -9. **`text_mode`** & **`light_mode`**: - - `text_mode=True` disables images, possibly speeding up text-only crawls. - - `light_mode=True` turns off certain background features for performance. - -10. **`extra_args`**: - - Additional flags for the underlying browser. - - E.g. `["--disable-extensions"]`. 
- -### Helper Methods - -Both configuration classes provide a `clone()` method to create modified copies: - -```python -# Create a base browser config -base_browser = BrowserConfig( - browser_type="chromium", - headless=True, - text_mode=True -) - -# Create a visible browser config for debugging -debug_browser = base_browser.clone( - headless=False, - verbose=True -) -``` - -**Minimal Example**: - -```python -from crawl4ai import AsyncWebCrawler, BrowserConfig - -browser_conf = BrowserConfig( - browser_type="firefox", - headless=False, - text_mode=True -) - -async with AsyncWebCrawler(config=browser_conf) as crawler: - result = await crawler.arun("https://example.com") - print(result.markdown[:300]) -``` - ---- - -## 2. CrawlerRunConfig Essentials - -```python -class CrawlerRunConfig: - def __init__( - word_count_threshold=200, - extraction_strategy=None, - markdown_generator=None, - cache_mode=None, - js_code=None, - wait_for=None, - screenshot=False, - pdf=False, - capture_mhtml=False, - # Location and Identity Parameters - locale=None, # e.g. "en-US", "fr-FR" - timezone_id=None, # e.g. "America/New_York" - geolocation=None, # GeolocationConfig object - # Resource Management - enable_rate_limiting=False, - rate_limit_config=None, - memory_threshold_percent=70.0, - check_interval=1.0, - max_session_permit=20, - display_mode=None, - verbose=True, - stream=False, # Enable streaming for arun_many() - # ... other advanced parameters omitted - ): - ... -``` - -### Key Fields to Note - -1. **`word_count_threshold`**: - - The minimum word count before a block is considered. - - If your site has lots of short paragraphs or items, you can lower it. - -2. **`extraction_strategy`**: - - Where you plug in JSON-based extraction (CSS, LLM, etc.). - - If `None`, no structured extraction is done (only raw/cleaned HTML + markdown). - -3. **`markdown_generator`**: - - E.g., `DefaultMarkdownGenerator(...)`, controlling how HTML→Markdown conversion is done. 
- - If `None`, a default approach is used. - -4. **`cache_mode`**: - - Controls caching behavior (`ENABLED`, `BYPASS`, `DISABLED`, etc.). - - If `None`, defaults to some level of caching or you can specify `CacheMode.ENABLED`. - -5. **`js_code`**: - - A string or list of JS strings to execute. - - Great for "Load More" buttons or user interactions. - -6. **`wait_for`**: - - A CSS or JS expression to wait for before extracting content. - - Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`. - -7. **`screenshot`**, **`pdf`**, & **`capture_mhtml`**: - - If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded. - - The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string). - -8. **Location Parameters**: - - **`locale`**: Browser's locale (e.g., `"en-US"`, `"fr-FR"`) for language preferences - - **`timezone_id`**: Browser's timezone (e.g., `"America/New_York"`, `"Europe/Paris"`) - - **`geolocation`**: GPS coordinates via `GeolocationConfig(latitude=48.8566, longitude=2.3522)` - - See [Identity Based Crawling](../advanced/identity-based-crawling.md#7-locale-timezone-and-geolocation-control) - -9. **`verbose`**: - - Logs additional runtime details. - - Overlaps with the browser's verbosity if also set to `True` in `BrowserConfig`. - -10. **`enable_rate_limiting`**: - - If `True`, enables rate limiting for batch processing. - - Requires `rate_limit_config` to be set. - -11. **`memory_threshold_percent`**: - - The memory threshold (as a percentage) to monitor. - - If exceeded, the crawler will pause or slow down. - -12. **`check_interval`**: - - The interval (in seconds) to check system resources. - - Affects how often memory and CPU usage are monitored. - -13. **`max_session_permit`**: - - The maximum number of concurrent crawl sessions. - - Helps prevent overwhelming the system. - -14. 
**`display_mode`**: - - The display mode for progress information (`DETAILED`, `BRIEF`, etc.). - - Affects how much information is printed during the crawl. - - -### Helper Methods - -The `clone()` method is particularly useful for creating variations of your crawler configuration: - -```python -# Create a base configuration -base_config = CrawlerRunConfig( - cache_mode=CacheMode.ENABLED, - word_count_threshold=200, - wait_until="networkidle" -) - -# Create variations for different use cases -stream_config = base_config.clone( - stream=True, # Enable streaming mode - cache_mode=CacheMode.BYPASS -) - -debug_config = base_config.clone( - page_timeout=120000, # Longer timeout for debugging - verbose=True -) -``` - -The `clone()` method: -- Creates a new instance with all the same settings -- Updates only the specified parameters -- Leaves the original configuration unchanged -- Perfect for creating variations without repeating all parameters - ---- - - -## 3. LLMConfig Essentials - -### Key fields to note - -1. **`provider`**: -- Which LLM provoder to use. -- Possible values are `"ollama/llama3","groq/llama3-70b-8192","groq/llama3-8b-8192", "openai/gpt-4o-mini" ,"openai/gpt-4o","openai/o1-mini","openai/o1-preview","openai/o3-mini","openai/o3-mini-high","anthropic/claude-3-haiku-20240307","anthropic/claude-3-opus-20240229","anthropic/claude-3-sonnet-20240229","anthropic/claude-3-5-sonnet-20240620","gemini/gemini-pro","gemini/gemini-1.5-pro","gemini/gemini-2.0-flash","gemini/gemini-2.0-flash-exp","gemini/gemini-2.0-flash-lite-preview-02-05","deepseek/deepseek-chat"`
*(default: `"openai/gpt-4o-mini"`)* - -2. **`api_token`**: - - Optional. When not provided explicitly, api_token will be read from environment variables based on provider. For example: If a gemini model is passed as provider then,`"GEMINI_API_KEY"` will be read from environment variables - - API token of LLM provider
eg: `api_token = "gsk_1ClHGGJ7Lpn4WGybR7vNWGdyb3FY7zXEw3SCiy0BAVM9lL8CQv"` - - Environment variable - use with prefix "env:"
eg:`api_token = "env: GROQ_API_KEY"` - -3. **`base_url`**: - - If your provider has a custom endpoint - -```python -llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")) -``` - -## 4. Putting It All Together - -In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` depending on each call's needs: - -```python -import asyncio -from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy - -async def main(): - # 1) Browser config: headless, bigger viewport, no proxy - browser_conf = BrowserConfig( - headless=True, - viewport_width=1280, - viewport_height=720 - ) - - # 2) Example extraction strategy - schema = { - "name": "Articles", - "baseSelector": "div.article", - "fields": [ - {"name": "title", "selector": "h2", "type": "text"}, - {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"} - ] - } - extraction = JsonCssExtractionStrategy(schema) - - # 3) Example LLM content filtering - - gemini_config = LLMConfig( - provider="gemini/gemini-1.5-pro" - api_token = "env:GEMINI_API_TOKEN" - ) - - # Initialize LLM filter with specific instruction - filter = LLMContentFilter( - llm_config=gemini_config, # or your preferred provider - instruction=""" - Focus on extracting the core educational content. - Include: - - Key concepts and explanations - - Important code examples - - Essential technical details - Exclude: - - Navigation elements - - Sidebars - - Footer content - Format the output as clean markdown with proper code blocks and headers. 
- """, - chunk_token_threshold=500, # Adjust based on your needs - verbose=True - ) - - md_generator = DefaultMarkdownGenerator( - content_filter=filter, - options={"ignore_links": True} - - # 4) Crawler run config: skip cache, use extraction - run_conf = CrawlerRunConfig( - markdown_generator=md_generator, - extraction_strategy=extraction, - cache_mode=CacheMode.BYPASS, - ) - - async with AsyncWebCrawler(config=browser_conf) as crawler: - # 4) Execute the crawl - result = await crawler.arun(url="https://example.com/news", config=run_conf) - - if result.success: - print("Extracted content:", result.extracted_content) - else: - print("Error:", result.error_message) - -if __name__ == "__main__": - asyncio.run(main()) -``` - ---- - -## 5. Next Steps - -For a **detailed list** of available parameters (including advanced ones), see: - -- [BrowserConfig, CrawlerRunConfig & LLMConfig Reference](../api/parameters.md) - -You can explore topics like: - -- **Custom Hooks & Auth** (Inject JavaScript or handle login forms). -- **Session Management** (Re-use pages, preserve state across multiple calls). -- **Magic Mode** or **Identity-based Crawling** (Fight bot detection by simulating user behavior). -- **Advanced Caching** (Fine-tune read/write cache modes). - ---- - -## 6. Conclusion - -**BrowserConfig**, **CrawlerRunConfig** and **LLMConfig** give you straightforward ways to define: - -- **Which** browser to launch, how it should run, and any proxy or user agent needs. -- **How** each crawl should behave—caching, timeouts, JavaScript code, extraction strategies, etc. -- **Which** LLM provider to use, api token, temperature and base url for custom endpoints - -Use them together for **clear, maintainable** code, and when you need more specialized behavior, check out the advanced parameters in the [reference docs](../api/parameters.md). Happy crawling! 
-``` - - -## File: docs/md_v2/core/cache-modes.md - -```md -# Crawl4AI Cache System and Migration Guide - -## Overview -Starting from version 0.5.0, Crawl4AI introduces a new caching system that replaces the old boolean flags with a more intuitive `CacheMode` enum. This change simplifies cache control and makes the behavior more predictable. - -## Old vs New Approach - -### Old Way (Deprecated) -The old system used multiple boolean flags: -- `bypass_cache`: Skip cache entirely -- `disable_cache`: Disable all caching -- `no_cache_read`: Don't read from cache -- `no_cache_write`: Don't write to cache - -### New Way (Recommended) -The new system uses a single `CacheMode` enum: -- `CacheMode.ENABLED`: Normal caching (read/write) -- `CacheMode.DISABLED`: No caching at all -- `CacheMode.READ_ONLY`: Only read from cache -- `CacheMode.WRITE_ONLY`: Only write to cache -- `CacheMode.BYPASS`: Skip cache for this operation - -## Migration Example - -### Old Code (Deprecated) -```python -import asyncio -from crawl4ai import AsyncWebCrawler - -async def use_proxy(): - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun( - url="https://www.nbcnews.com/business", - bypass_cache=True # Old way - ) - print(len(result.markdown)) - -async def main(): - await use_proxy() - -if __name__ == "__main__": - asyncio.run(main()) -``` - -### New Code (Recommended) -```python -import asyncio -from crawl4ai import AsyncWebCrawler, CacheMode -from crawl4ai.async_configs import CrawlerRunConfig - -async def use_proxy(): - # Use CacheMode in CrawlerRunConfig - config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) - async with AsyncWebCrawler(verbose=True) as crawler: - result = await crawler.arun( - url="https://www.nbcnews.com/business", - config=config # Pass the configuration object - ) - print(len(result.markdown)) - -async def main(): - await use_proxy() - -if __name__ == "__main__": - asyncio.run(main()) -``` - -## Common Migration Patterns - -| Old Flag | New 
Mode | -|-----------------------|---------------------------------| -| `bypass_cache=True` | `cache_mode=CacheMode.BYPASS` | -| `disable_cache=True` | `cache_mode=CacheMode.DISABLED`| -| `no_cache_read=True` | `cache_mode=CacheMode.WRITE_ONLY` | -| `no_cache_write=True` | `cache_mode=CacheMode.READ_ONLY` | -``` - - -## File: docs/md_v2/core/content-selection.md - -```md -# Content Selection - -Crawl4AI provides multiple ways to **select**, **filter**, and **refine** the content from your crawls. Whether you need to target a specific CSS region, exclude entire tags, filter out external links, or remove certain domains and images, **`CrawlerRunConfig`** offers a wide range of parameters. - -Below, we show how to configure these parameters and combine them for precise control. - ---- - -## 1. CSS-Based Selection - -There are two ways to select content from a page: using `css_selector` or the more flexible `target_elements`. - -### 1.1 Using `css_selector` - -A straightforward way to **limit** your crawl results to a certain region of the page is **`css_selector`** in **`CrawlerRunConfig`**: - -```python -import asyncio -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig - -async def main(): - config = CrawlerRunConfig( - # e.g., first 30 items from Hacker News - css_selector=".athing:nth-child(-n+30)" - ) - async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - url="https://news.ycombinator.com/newest", - config=config - ) - print("Partial HTML length:", len(result.cleaned_html)) - -if __name__ == "__main__": - asyncio.run(main()) -``` - -**Result**: Only elements matching that selector remain in `result.cleaned_html`. 
- -### 1.2 Using `target_elements` - -The `target_elements` parameter provides more flexibility by allowing you to target **multiple elements** for content extraction while preserving the entire page context for other features: - -```python -import asyncio -from crawl4ai import AsyncWebCrawler, CrawlerRunConfig - -async def main(): - config = CrawlerRunConfig( - # Target article body and sidebar, but not other content - target_elements=["article.main-content", "aside.sidebar"] - ) - async with AsyncWebCrawler() as crawler: - result = await crawler.arun( - url="https://example.com/blog-post", - config=config - ) - print("Markdown focused on target elements") - print("Links from entire page still available:", len(result.links.get("internal", []))) - -if __name__ == "__main__": - asyncio.run(main()) -``` - -**Key difference**: With `target_elements`, the markdown generation and structural data extraction focus on those elements, but other page elements (like links, images, and tables) are still extracted from the entire page. This gives you fine-grained control over what appears in your markdown content while preserving full page context for link analysis and media collection. - ---- - -## 2. Content Filtering & Exclusions - -### 2.1 Basic Overview - -```python -config = CrawlerRunConfig( - # Content thresholds - word_count_threshold=10, # Minimum words per block - - # Tag exclusions - excluded_tags=['form', 'header', 'footer', 'nav'], - - # Link filtering - exclude_external_links=True, - exclude_social_media_links=True, - # Block entire domains - exclude_domains=["adtrackers.com", "spammynews.org"], - exclude_social_media_domains=["facebook.com", "twitter.com"], - - # Media filtering - exclude_external_images=True -) -``` - -**Explanation**: - -- **`word_count_threshold`**: Ignores text blocks under X words. Helps skip trivial blocks like short nav or disclaimers. -- **`excluded_tags`**: Removes entire tags (`
`, `
`, `