Main Title
+Paragraph content
+- Item 1
- Item 2
diff --git a/README.md b/README.md index 97787b2f..02f01d03 100644 --- a/README.md +++ b/README.md @@ -352,7 +352,7 @@ if __name__ == "__main__": ```python import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy import json async def main(): @@ -426,7 +426,7 @@ if __name__ == "__main__": import os import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy from pydantic import BaseModel, Field class OpenAIModelFee(BaseModel): diff --git a/crawl4ai/crawlers/google_search/crawler.py b/crawl4ai/crawlers/google_search/crawler.py index e1288de1..18233417 100644 --- a/crawl4ai/crawlers/google_search/crawler.py +++ b/crawl4ai/crawlers/google_search/crawler.py @@ -1,7 +1,7 @@ from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode from crawl4ai.hub import BaseCrawler from crawl4ai.utils import optimize_html, get_home_folder, preprocess_html_for_schema -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from pathlib import Path import json import os diff --git a/deploy/docker/c4ai-code-context.md b/deploy/docker/c4ai-code-context.md index 1b250126..eb29b94c 100644 --- a/deploy/docker/c4ai-code-context.md +++ b/deploy/docker/c4ai-code-context.md @@ -7901,7 +7901,7 @@ from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator from crawl4ai.content_filter_strategy import PruningContentFilter -from crawl4ai.extraction_strategy import ( +from crawl4ai import ( JsonCssExtractionStrategy, LLMExtractionStrategy, ) @@ -8301,7 +8301,7 @@ async def 
crawl_dynamic_content_pages_method_2(): async def cosine_similarity_extraction(): - from crawl4ai.extraction_strategy import CosineStrategy + from crawl4ai import CosineStrategy crawl_config = CrawlerRunConfig( cache_mode=CacheMode.BYPASS, extraction_strategy=CosineStrategy( diff --git a/deploy/docker/c4ai-doc-context.md b/deploy/docker/c4ai-doc-context.md index 77785cec..5137159e 100644 --- a/deploy/docker/c4ai-doc-context.md +++ b/deploy/docker/c4ai-doc-context.md @@ -354,7 +354,7 @@ In a typical scenario, you define **one** `BrowserConfig` for your crawler sessi ```python import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def main(): # 1) Browser config: headless, bigger viewport, no proxy @@ -1042,7 +1042,7 @@ You can combine content selection with a more advanced extraction strategy. For import asyncio import json from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def main(): # Minimal schema for repeated items @@ -1094,7 +1094,7 @@ import asyncio import json from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class ArticleData(BaseModel): headline: str @@ -1139,7 +1139,7 @@ Below is a short function that unifies **CSS selection**, **exclusion** logic, a import asyncio import json from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def extract_main_articles(url: str): schema = { @@ -1488,7 +1488,7 @@ If you run a JSON-based extraction strategy (CSS, XPath, LLM, 
etc.), the structu import asyncio import json from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def main(): schema = { @@ -4722,7 +4722,7 @@ if __name__ == "__main__": Once dynamic content is loaded, you can attach an **`extraction_strategy`** (like `JsonCssExtractionStrategy` or `LLMExtractionStrategy`). For example: ```python -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy schema = { "name": "Commits", @@ -4902,7 +4902,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B > **New!** Crawl4AI now provides a powerful utility to automatically generate extraction schemas using LLM. This is a one-time cost that gives you a reusable schema for fast, LLM-free extractions: ```python -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai import LLMConfig # Generate a schema (one-time cost) @@ -4932,7 +4932,7 @@ Here's a basic extraction example: import asyncio import json from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def main(): schema = { @@ -4987,7 +4987,7 @@ import json import asyncio from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class OpenAIModelFee(BaseModel): model_name: str = Field(..., description="Name of the OpenAI model.") @@ -5103,7 +5103,7 @@ Some sites require multiple “page clicks” or dynamic JavaScript updates. 
Bel ```python import asyncio from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def extract_structured_data_using_css_extractor(): print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---") @@ -7300,7 +7300,7 @@ Here's an example of crawling GitHub commits across multiple pages while preserv ```python from crawl4ai.async_configs import CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy from crawl4ai.cache_context import CacheMode async def crawl_dynamic_content(): @@ -7850,7 +7850,7 @@ The Cosine Strategy: ## Basic Usage ```python -from crawl4ai.extraction_strategy import CosineStrategy +from crawl4ai import CosineStrategy strategy = CosineStrategy( semantic_filter="product reviews", # Target content type @@ -8161,7 +8161,7 @@ import json from pydantic import BaseModel, Field from typing import List from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class Product(BaseModel): name: str @@ -8278,7 +8278,7 @@ import asyncio from typing import List from pydantic import BaseModel, Field from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import LLMExtractionStrategy +from crawl4ai import LLMExtractionStrategy class Entity(BaseModel): name: str @@ -8423,7 +8423,7 @@ Let’s begin with a **simple** schema-based extraction using the `JsonCssExtrac import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy async def extract_crypto_prices(): # 1. 
Define a simple extraction schema @@ -8493,7 +8493,7 @@ Below is a short example demonstrating **XPath** extraction plus the **`raw://`* import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy +from crawl4ai import JsonXPathExtractionStrategy async def extract_crypto_prices_xpath(): # 1. Minimal dummy HTML with some repeating rows @@ -8694,7 +8694,7 @@ Key Takeaways: import json import asyncio from crawl4ai import AsyncWebCrawler, CrawlerRunConfig -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy ecommerce_schema = { # ... the advanced schema from above ... @@ -8804,7 +8804,7 @@ While manually crafting schemas is powerful and precise, Crawl4AI now offers a c The schema generator is available as a static method on both `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. You can choose between OpenAI's GPT-4 or the open-source Ollama for schema generation: ```python -from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy +from crawl4ai import JsonCssExtractionStrategy, JsonXPathExtractionStrategy from crawl4ai import LLMConfig # Sample HTML with product information diff --git a/docs/apps/iseeyou/llms-full.txt b/docs/apps/iseeyou/llms-full.txt new file mode 100644 index 00000000..12dad603 --- /dev/null +++ b/docs/apps/iseeyou/llms-full.txt @@ -0,0 +1,7715 @@ +# Crawl4AI + +> Open-source LLM-friendly web crawler and scraper for AI applications + +Crawl4AI simplifies asynchronous web crawling and data extraction, making it accessible for large language models (LLMs) and AI applications. Built with Python and Playwright for high-performance crawling with structured data extraction. 
+ +**Key Features:** +- Asynchronous crawling with high concurrency +- Multiple extraction strategies (CSS, XPath, LLM-based) +- Built-in markdown generation with content filtering +- Docker deployment with REST API +- Session management and browser automation +- Advanced anti-detection capabilities + +**Quick Links:** +- [GitHub Repository](https://github.com/unclecode/crawl4ai) +- [Documentation](https://docs.crawl4ai.com) +- [Examples](https://github.com/unclecode/crawl4ai/tree/main/docs/examples) + +--- + + +## Installation + +Multiple installation options for different environments and use cases. + +### Basic Installation + +```bash +# Install core library +pip install crawl4ai + +# Initial setup (installs Playwright browsers) +crawl4ai-setup + +# Verify installation +crawl4ai-doctor +``` + +### Quick Verification + +```python +import asyncio +from crawl4ai import AsyncWebCrawler + +async def main(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com") + print(result.markdown[:300]) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**📖 Learn more:** [Basic Usage Guide](https://docs.crawl4ai.com/core/quickstart.md) + +### Advanced Features (Optional) + +```bash +# PyTorch-based features (text clustering, semantic chunking) +pip install crawl4ai[torch] +crawl4ai-setup + +# Transformers (Hugging Face models) +pip install crawl4ai[transformer] +crawl4ai-setup + +# All features (large download) +pip install crawl4ai[all] +crawl4ai-setup + +# Pre-download models (optional) +crawl4ai-download-models +``` + +**📖 Learn more:** [Advanced Features Documentation](https://docs.crawl4ai.com/extraction/llm-strategies.md) + +### Docker Deployment + +```bash +# Pull pre-built image (specify platform for consistency) +docker pull --platform linux/amd64 unclecode/crawl4ai:latest +# For ARM (M1/M2 Macs): docker pull --platform linux/arm64 unclecode/crawl4ai:latest + +# Setup environment for LLM support +cat > .llm.env << EOL 
+OPENAI_API_KEY=sk-your-key +ANTHROPIC_API_KEY=your-anthropic-key +EOL + +# Run with LLM support (specify platform) +docker run -d \ + --platform linux/amd64 \ + -p 11235:11235 \ + --name crawl4ai \ + --env-file .llm.env \ + --shm-size=1g \ + unclecode/crawl4ai:latest + +# For ARM Macs, use: --platform linux/arm64 + +# Basic run (no LLM) +docker run -d \ + --platform linux/amd64 \ + -p 11235:11235 \ + --name crawl4ai \ + --shm-size=1g \ + unclecode/crawl4ai:latest +``` + +**📖 Learn more:** [Complete Docker Guide](https://docs.crawl4ai.com/core/docker-deployment.md) + +### Docker Compose + +```bash +# Clone repository +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai + +# Copy environment template +cp deploy/docker/.llm.env.example .llm.env +# Edit .llm.env with your API keys + +# Run pre-built image +IMAGE=unclecode/crawl4ai:latest docker compose up -d + +# Build and run locally +docker compose up --build -d + +# Build with all features +INSTALL_TYPE=all docker compose up --build -d + +# Stop service +docker compose down +``` + +**📖 Learn more:** [Docker Compose Configuration](https://docs.crawl4ai.com/core/docker-deployment.md#option-2-using-docker-compose) + +### Manual Docker Build + +```bash +# Build multi-architecture image (specify platform) +docker buildx build --platform linux/amd64 -t crawl4ai-local:latest --load . +# For ARM: docker buildx build --platform linux/arm64 -t crawl4ai-local:latest --load . + +# Build with specific features +docker buildx build \ + --platform linux/amd64 \ + --build-arg INSTALL_TYPE=all \ + --build-arg ENABLE_GPU=false \ + -t crawl4ai-local:latest --load . 
+ +# Run custom build (specify platform) +docker run -d \ + --platform linux/amd64 \ + -p 11235:11235 \ + --name crawl4ai-custom \ + --env-file .llm.env \ + --shm-size=1g \ + crawl4ai-local:latest +``` + +**📖 Learn more:** [Manual Build Guide](https://docs.crawl4ai.com/core/docker-deployment.md#option-3-manual-local-build--run) + +### Google Colab + +```python +# Install in Colab +!pip install crawl4ai +!crawl4ai-setup + +# If setup fails, manually install Playwright browsers +!playwright install chromium + +# Install with all features (may take 5-10 minutes) +!pip install crawl4ai[all] +!crawl4ai-setup +!crawl4ai-download-models + +# If still having issues, force Playwright install +!playwright install chromium --force + +# Quick test +import asyncio +from crawl4ai import AsyncWebCrawler + +async def test_crawl(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com") + print("✅ Installation successful!") + print(f"Content length: {len(result.markdown)}") + +# Run test in Colab +await test_crawl() +``` + +**📖 Learn more:** [Colab Examples Notebook](https://colab.research.google.com/github/unclecode/crawl4ai/blob/main/docs/examples/quickstart.ipynb) + +### Docker API Usage + +```python +# Using Docker SDK +import asyncio +from crawl4ai.docker_client import Crawl4aiDockerClient +from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode + +async def main(): + async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client: + results = await client.crawl( + ["https://example.com"], + browser_config=BrowserConfig(headless=True), + crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + for result in results: + print(f"Success: {result.success}, Length: {len(result.markdown)}") + +asyncio.run(main()) +``` + +**📖 Learn more:** [Docker Client API](https://docs.crawl4ai.com/core/docker-deployment.md#python-sdk) + +### Direct API Calls + +```python +# REST API example +import requests + +payload = { + 
"urls": ["https://example.com"], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}} +} + +response = requests.post("http://localhost:11235/crawl", json=payload) +print(response.json()) +``` + +**📖 Learn more:** [REST API Reference](https://docs.crawl4ai.com/core/docker-deployment.md#rest-api-examples) + +### Health Check + +```bash +# Check Docker service +curl http://localhost:11235/health + +# Access playground +open http://localhost:11235/playground + +# View metrics +curl http://localhost:11235/metrics +``` + +**📖 Learn more:** [Monitoring & Metrics](https://docs.crawl4ai.com/core/docker-deployment.md#metrics--monitoring) +--- + + +## Simple Crawling + +Basic web crawling operations with AsyncWebCrawler, configurations, and response handling. + +### Basic Setup + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig + +async def main(): + browser_config = BrowserConfig() # Default browser settings + run_config = CrawlerRunConfig() # Default crawl settings + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://example.com", + config=run_config + ) + print(result.markdown) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### Understanding CrawlResult + +```python +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import PruningContentFilter + +config = CrawlerRunConfig( + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.6), + options={"ignore_links": True} + ) +) + +result = await crawler.arun("https://example.com", config=config) + +# Different content formats +print(result.html) # Raw HTML +print(result.cleaned_html) # Cleaned HTML +print(result.markdown.raw_markdown) # Raw markdown +print(result.markdown.fit_markdown) # Filtered 
markdown + +# Status information +print(result.success) # True/False +print(result.status_code) # HTTP status (200, 404, etc.) + +# Extracted content +print(result.media) # Images, videos, audio +print(result.links) # Internal/external links +``` + +### Basic Configuration Options + +```python +run_config = CrawlerRunConfig( + word_count_threshold=10, # Min words per block + exclude_external_links=True, # Remove external links + remove_overlay_elements=True, # Remove popups/modals + process_iframes=True, # Process iframe content + excluded_tags=['form', 'header'] # Skip these tags +) + +result = await crawler.arun("https://example.com", config=run_config) +``` + +### Error Handling + +```python +result = await crawler.arun("https://example.com", config=run_config) + +if not result.success: + print(f"Crawl failed: {result.error_message}") + print(f"Status code: {result.status_code}") +else: + print(f"Success! Content length: {len(result.markdown)}") +``` + +### Debugging with Verbose Logging + +```python +browser_config = BrowserConfig(verbose=True) + +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://example.com") + # Detailed logging output will be displayed +``` + +### Complete Example + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode + +async def comprehensive_crawl(): + browser_config = BrowserConfig(verbose=True) + + run_config = CrawlerRunConfig( + # Content filtering + word_count_threshold=10, + excluded_tags=['form', 'header', 'nav'], + exclude_external_links=True, + + # Content processing + process_iframes=True, + remove_overlay_elements=True, + + # Cache control + cache_mode=CacheMode.ENABLED + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun( + url="https://example.com", + config=run_config + ) + + if result.success: + # Display content summary + print(f"Title: {result.metadata.get('title', 'No 
title')}") + print(f"Content: {result.markdown[:500]}...") + + # Process media + images = result.media.get("images", []) + print(f"Found {len(images)} images") + for img in images[:3]: # First 3 images + print(f" - {img.get('src', 'No src')}") + + # Process links + internal_links = result.links.get("internal", []) + print(f"Found {len(internal_links)} internal links") + for link in internal_links[:3]: # First 3 links + print(f" - {link.get('href', 'No href')}") + + else: + print(f"❌ Crawl failed: {result.error_message}") + print(f"Status: {result.status_code}") + +if __name__ == "__main__": + asyncio.run(comprehensive_crawl()) +``` + +### Working with Raw HTML and Local Files + +```python +# Crawl raw HTML +raw_html = "
Content
" +result = await crawler.arun(f"raw://{raw_html}") + +# Crawl local file +result = await crawler.arun("file:///path/to/local/file.html") + +# Both return standard CrawlResult objects +print(result.markdown) +``` + +## Table Extraction + +Extract structured data from HTML tables with automatic detection and scoring. + +### Basic Table Extraction + +```python +import asyncio +import pandas as pd +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +async def extract_tables(): + async with AsyncWebCrawler() as crawler: + config = CrawlerRunConfig( + table_score_threshold=7, # Higher = stricter detection + cache_mode=CacheMode.BYPASS + ) + + result = await crawler.arun("https://example.com/tables", config=config) + + if result.success and result.tables: + # New tables field (v0.6+) + for i, table in enumerate(result.tables): + print(f"Table {i+1}:") + print(f"Headers: {table['headers']}") + print(f"Rows: {len(table['rows'])}") + print(f"Caption: {table.get('caption', 'No caption')}") + + # Convert to DataFrame + df = pd.DataFrame(table['rows'], columns=table['headers']) + print(df.head()) + +asyncio.run(extract_tables()) +``` + +### Advanced Table Processing + +```python +from crawl4ai import LXMLWebScrapingStrategy + +async def process_financial_tables(): + config = CrawlerRunConfig( + table_score_threshold=8, # Strict detection for data tables + scraping_strategy=LXMLWebScrapingStrategy(), + keep_data_attributes=True, + scan_full_page=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://coinmarketcap.com", config=config) + + if result.tables: + # Get the main data table (usually first/largest) + main_table = result.tables[0] + + # Create DataFrame + df = pd.DataFrame( + main_table['rows'], + columns=main_table['headers'] + ) + + # Clean and process data + df = clean_financial_data(df) + + # Save for analysis + df.to_csv("market_data.csv", index=False) + return df + +def clean_financial_data(df): + """Clean 
currency symbols, percentages, and large numbers""" + for col in df.columns: + if 'price' in col.lower(): + # Remove currency symbols + df[col] = df[col].str.replace(r'[^\d.]', '', regex=True) + df[col] = pd.to_numeric(df[col], errors='coerce') + + elif '%' in str(df[col].iloc[0]): + # Convert percentages + df[col] = df[col].str.replace('%', '').astype(float) / 100 + + elif any(suffix in str(df[col].iloc[0]) for suffix in ['B', 'M', 'K']): + # Handle large numbers (Billions, Millions, etc.) + df[col] = df[col].apply(convert_large_numbers) + + return df + +def convert_large_numbers(value): + """Convert 1.5B -> 1500000000""" + if pd.isna(value): + return float('nan') + + value = str(value) + multiplier = 1 + if 'B' in value: + multiplier = 1e9 + elif 'M' in value: + multiplier = 1e6 + elif 'K' in value: + multiplier = 1e3 + + number = float(re.sub(r'[^\d.]', '', value)) + return number * multiplier +``` + +### Table Detection Configuration + +```python +# Strict table detection (data-heavy pages) +strict_config = CrawlerRunConfig( + table_score_threshold=9, # Only high-quality tables + word_count_threshold=5, # Ignore sparse content + excluded_tags=['nav', 'footer'] # Skip navigation tables +) + +# Lenient detection (mixed content pages) +lenient_config = CrawlerRunConfig( + table_score_threshold=5, # Include layout tables + process_iframes=True, # Check embedded tables + scan_full_page=True # Scroll to load dynamic tables +) + +# Financial/data site optimization +financial_config = CrawlerRunConfig( + table_score_threshold=8, + scraping_strategy=LXMLWebScrapingStrategy(), + wait_for="css:table", # Wait for tables to load + scan_full_page=True, + scroll_delay=0.2 +) +``` + +### Multi-Table Processing + +```python +async def extract_all_tables(): + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/data", config=config) + + tables_data = {} + + for i, table in enumerate(result.tables): + # Create meaningful names based on 
content + table_name = ( + table.get('caption') or + f"table_{i+1}_{table['headers'][0]}" + ).replace(' ', '_').lower() + + df = pd.DataFrame(table['rows'], columns=table['headers']) + + # Store with metadata + tables_data[table_name] = { + 'dataframe': df, + 'headers': table['headers'], + 'row_count': len(table['rows']), + 'caption': table.get('caption'), + 'summary': table.get('summary') + } + + return tables_data + +# Usage +tables = await extract_all_tables() +for name, data in tables.items(): + print(f"{name}: {data['row_count']} rows") + data['dataframe'].to_csv(f"{name}.csv") +``` + +### Backward Compatibility + +```python +# Support both new and old table formats +def get_tables(result): + # New format (v0.6+) + if hasattr(result, 'tables') and result.tables: + return result.tables + + # Fallback to media.tables (older versions) + return result.media.get('tables', []) + +# Usage in existing code +result = await crawler.arun(url, config=config) +tables = get_tables(result) + +for table in tables: + df = pd.DataFrame(table['rows'], columns=table['headers']) + # Process table data... 
+``` + +### Table Quality Scoring + +```python +# Understanding table_score_threshold values: +# 10: Only perfect data tables (headers + data rows) +# 8-9: High-quality tables (recommended for financial/data sites) +# 6-7: Mixed content tables (news sites, wikis) +# 4-5: Layout tables included (broader detection) +# 1-3: All table-like structures (very permissive) + +config = CrawlerRunConfig( + table_score_threshold=8, # Balanced detection + verbose=True # See scoring details in logs +) +``` + + +**📖 Learn more:** [CrawlResult API Reference](https://docs.crawl4ai.com/api/crawl-result/), [Browser & Crawler Configuration](https://docs.crawl4ai.com/core/browser-crawler-config/), [Cache Modes](https://docs.crawl4ai.com/core/cache-modes/) +--- + + +## Browser, Crawler & LLM Configuration + +Core configuration classes for controlling browser behavior, crawl operations, LLM providers, and understanding crawl results. + +### BrowserConfig - Browser Environment Setup + +```python +from crawl4ai import BrowserConfig, AsyncWebCrawler + +# Basic browser configuration +browser_config = BrowserConfig( + browser_type="chromium", # "chromium", "firefox", "webkit" + headless=True, # False for visible browser (debugging) + viewport_width=1280, + viewport_height=720, + verbose=True +) + +# Advanced browser setup with proxy and persistence +browser_config = BrowserConfig( + headless=False, + proxy="http://user:pass@proxy:8080", + use_persistent_context=True, + user_data_dir="./browser_data", + cookies=[ + {"name": "session", "value": "abc123", "domain": "example.com"} + ], + headers={"Accept-Language": "en-US,en;q=0.9"}, + user_agent="Mozilla/5.0 (X11; Linux x86_64) Chrome/116.0.0.0 Safari/537.36", + text_mode=True, # Disable images for faster crawling + extra_args=["--disable-extensions", "--no-sandbox"] +) + +async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://example.com") +``` + +### CrawlerRunConfig - Crawl Operation Control + 
+```python +from crawl4ai import CrawlerRunConfig, CacheMode +from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator +from crawl4ai.content_filter_strategy import PruningContentFilter + +# Basic crawl configuration +run_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + word_count_threshold=10, + excluded_tags=["nav", "footer", "script"], + exclude_external_links=True, + screenshot=True, + pdf=True +) + +# Advanced content processing +md_generator = DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.6), + options={"citations": True, "ignore_links": False} +) + +run_config = CrawlerRunConfig( + # Content processing + markdown_generator=md_generator, + css_selector="main.content", # Focus on specific content + target_elements=[".article", ".post"], # Multiple target selectors + process_iframes=True, + remove_overlay_elements=True, + + # Page interaction + js_code=[ + "window.scrollTo(0, document.body.scrollHeight);", + "document.querySelector('.load-more')?.click();" + ], + wait_for="css:.content-loaded", + wait_for_timeout=10000, + scan_full_page=True, + + # Session management + session_id="persistent_session", + + # Media handling + screenshot=True, + pdf=True, + capture_mhtml=True, + image_score_threshold=5, + + # Advanced options + simulate_user=True, + magic=True, # Auto-handle popups + verbose=True +) +``` + +### CrawlerRunConfig Parameters by Category + +```python +# Content Processing +config = CrawlerRunConfig( + word_count_threshold=10, # Min words per content block + css_selector="main.article", # Focus on specific content + target_elements=[".post", ".content"], # Multiple target selectors + excluded_tags=["nav", "footer"], # Remove these tags + excluded_selector="#ads, .tracker", # Remove by selector + only_text=True, # Text-only extraction + keep_data_attributes=True, # Preserve data-* attributes + remove_forms=True, # Remove all forms + process_iframes=True # Include iframe content +) + +# Page 
Navigation & Timing +config = CrawlerRunConfig( + wait_until="networkidle", # Wait condition + page_timeout=60000, # 60 second timeout + wait_for="css:.loaded", # Wait for specific element + wait_for_images=True, # Wait for images to load + delay_before_return_html=0.5, # Final delay before capture + semaphore_count=10 # Max concurrent operations +) + +# Page Interaction +config = CrawlerRunConfig( + js_code="document.querySelector('button').click();", + scan_full_page=True, # Auto-scroll page + scroll_delay=0.3, # Delay between scrolls + remove_overlay_elements=True, # Remove popups/modals + simulate_user=True, # Simulate human behavior + override_navigator=True, # Override navigator properties + magic=True # Auto-handle common patterns +) + +# Caching & Session +config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, # Cache behavior + session_id="my_session", # Persistent session + shared_data={"context": "value"} # Share data between hooks +) + +# Media & Output +config = CrawlerRunConfig( + screenshot=True, # Capture screenshot + pdf=True, # Generate PDF + capture_mhtml=True, # Capture MHTML archive + image_score_threshold=3, # Filter low-quality images + exclude_external_images=True # Remove external images +) + +# Link & Domain Filtering +config = CrawlerRunConfig( + exclude_external_links=True, # Remove external links + exclude_social_media_links=True, # Remove social media links + exclude_domains=["ads.com", "tracker.io"], # Custom domain filter + exclude_internal_links=False # Keep internal links +) +``` + +### LLMConfig - Language Model Setup + +```python +from crawl4ai import LLMConfig + +# OpenAI configuration +llm_config = LLMConfig( + provider="openai/gpt-4o-mini", + api_token=os.getenv("OPENAI_API_KEY"), # or "env:OPENAI_API_KEY" + temperature=0.1, + max_tokens=2000 +) + +# Local model with Ollama +llm_config = LLMConfig( + provider="ollama/llama3.3", + api_token=None, # Not needed for Ollama + base_url="http://localhost:11434" # Custom endpoint 
+) + +# Anthropic Claude +llm_config = LLMConfig( + provider="anthropic/claude-3-5-sonnet-20240620", + api_token="env:ANTHROPIC_API_KEY", + max_tokens=4000 +) + +# Google Gemini +llm_config = LLMConfig( + provider="gemini/gemini-1.5-pro", + api_token="env:GEMINI_API_KEY" +) + +# Groq (fast inference) +llm_config = LLMConfig( + provider="groq/llama3-70b-8192", + api_token="env:GROQ_API_KEY" +) +``` + +### CrawlResult - Understanding Output + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=run_config) + + # Basic status information + print(f"Success: {result.success}") + print(f"Status: {result.status_code}") + print(f"URL: {result.url}") + + if not result.success: + print(f"Error: {result.error_message}") + return + + # HTML content variants + print(f"Original HTML: {len(result.html)} chars") + print(f"Cleaned HTML: {len(result.cleaned_html or '')} chars") + + # Markdown output (MarkdownGenerationResult) + if result.markdown: + print(f"Raw markdown: {len(result.markdown.raw_markdown)} chars") + print(f"With citations: {len(result.markdown.markdown_with_citations)} chars") + + # Filtered content (if content filter was used) + if result.markdown.fit_markdown: + print(f"Fit markdown: {len(result.markdown.fit_markdown)} chars") + print(f"Fit HTML: {len(result.markdown.fit_html)} chars") + + # Extracted structured data + if result.extracted_content: + import json + data = json.loads(result.extracted_content) + print(f"Extracted {len(data)} items") + + # Media and links + images = result.media.get("images", []) + print(f"Found {len(images)} images") + for img in images[:3]: # First 3 images + print(f" {img.get('src')} (score: {img.get('score', 0)})") + + internal_links = result.links.get("internal", []) + external_links = result.links.get("external", []) + print(f"Links: {len(internal_links)} internal, {len(external_links)} external") + + # Generated 
files + if result.screenshot: + print(f"Screenshot captured: {len(result.screenshot)} chars (base64)") + # Save screenshot + import base64 + with open("page.png", "wb") as f: + f.write(base64.b64decode(result.screenshot)) + + if result.pdf: + print(f"PDF generated: {len(result.pdf)} bytes") + with open("page.pdf", "wb") as f: + f.write(result.pdf) + + if result.mhtml: + print(f"MHTML captured: {len(result.mhtml)} chars") + with open("page.mhtml", "w", encoding="utf-8") as f: + f.write(result.mhtml) + + # SSL certificate information + if result.ssl_certificate: + print(f"SSL Issuer: {result.ssl_certificate.issuer}") + print(f"Valid until: {result.ssl_certificate.valid_until}") + + # Network and console data (if captured) + if result.network_requests: + requests = [r for r in result.network_requests if r.get("event_type") == "request"] + print(f"Network requests captured: {len(requests)}") + + if result.console_messages: + errors = [m for m in result.console_messages if m.get("type") == "error"] + print(f"Console messages: {len(result.console_messages)} ({len(errors)} errors)") + + # Session and metadata + if result.session_id: + print(f"Session ID: {result.session_id}") + + if result.metadata: + print(f"Metadata: {result.metadata.get('title', 'No title')}") +``` + +### Configuration Helpers and Best Practices + +```python +# Clone configurations for variations +base_config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + word_count_threshold=200, + verbose=True +) + +# Create streaming version +stream_config = base_config.clone( + stream=True, + cache_mode=CacheMode.BYPASS +) + +# Create debug version +debug_config = base_config.clone( + headless=False, + page_timeout=120000, + verbose=True +) + +# Serialize/deserialize configurations +config_dict = base_config.dump() # Convert to dict +restored_config = CrawlerRunConfig.load(config_dict) # Restore from dict + +# Browser configuration management +browser_config = BrowserConfig(headless=True, text_mode=True) 
+browser_dict = browser_config.to_dict() +cloned_browser = browser_config.clone(headless=False, verbose=True) +``` + +### Common Configuration Patterns + +```python +# Fast text-only crawling +fast_config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, + text_mode=True, + exclude_external_links=True, + exclude_external_images=True, + word_count_threshold=50 +) + +# Comprehensive data extraction +comprehensive_config = CrawlerRunConfig( + process_iframes=True, + scan_full_page=True, + wait_for_images=True, + screenshot=True, + capture_network_requests=True, + capture_console_messages=True, + magic=True +) + +# Stealth crawling +stealth_config = CrawlerRunConfig( + simulate_user=True, + override_navigator=True, + mean_delay=2.0, + max_range=1.0, + user_agent_mode="random" +) +``` + +### Advanced Configuration Features + +#### User Agent Management & Bot Detection Avoidance + +```python +from crawl4ai import CrawlerRunConfig + +# Random user agent generation +config = CrawlerRunConfig( + user_agent_mode="random", + user_agent_generator_config={ + "platform": "windows", # "windows", "macos", "linux", "android", "ios" + "browser": "chrome", # "chrome", "firefox", "safari", "edge" + "device_type": "desktop" # "desktop", "mobile", "tablet" + } +) + +# Custom user agent with stealth features +config = CrawlerRunConfig( + user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36", + simulate_user=True, # Simulate human mouse movements + override_navigator=True, # Override navigator properties + mean_delay=1.5, # Random delays between actions + max_range=2.0 +) + +# Combined anti-detection approach +stealth_config = CrawlerRunConfig( + user_agent_mode="random", + simulate_user=True, + override_navigator=True, + magic=True, # Auto-handle common bot detection patterns + delay_before_return_html=2.0 +) +``` + +#### Proxy Configuration with ProxyConfig + +```python +from crawl4ai import CrawlerRunConfig, ProxyConfig, ProxyRotationStrategy + +# Single proxy 
configuration +proxy_config = ProxyConfig( + server="http://proxy.example.com:8080", + username="proxy_user", + password="proxy_pass" +) + +# From proxy string format +proxy_config = ProxyConfig.from_string("192.168.1.100:8080:username:password") + +# Multiple proxies with rotation +proxies = [ + ProxyConfig(server="http://proxy1.com:8080", username="user1", password="pass1"), + ProxyConfig(server="http://proxy2.com:8080", username="user2", password="pass2"), + ProxyConfig(server="http://proxy3.com:8080", username="user3", password="pass3") +] + +rotation_strategy = ProxyRotationStrategy( + proxies=proxies, + rotation_method="round_robin" # or "random", "least_used" +) + +config = CrawlerRunConfig( + proxy_config=proxy_config, + proxy_rotation_strategy=rotation_strategy +) + +# Load proxies from environment variable +proxies_from_env = ProxyConfig.from_env("MY_PROXIES") # comma-separated proxy strings +``` + +#### Content Selection: css_selector vs target_elements + +```python +from crawl4ai import CrawlerRunConfig + +# css_selector: Extracts HTML at top level, affects entire processing +config = CrawlerRunConfig( + css_selector="main.article, .content-area", # Can be list of selectors + # Everything else (markdown, extraction, links) works only on this HTML subset +) + +# target_elements: Focuses extraction within already processed HTML +config = CrawlerRunConfig( + css_selector="body", # First extract entire body + target_elements=[ # Then focus extraction on these elements + ".article-content", + ".post-body", + ".main-text" + ], + # Links, media from entire body, but markdown/extraction only from target_elements +) + +# Hierarchical content selection +config = CrawlerRunConfig( + css_selector=["#main-content", ".article-wrapper"], # Top-level extraction + target_elements=[ # Subset for processing + ".article-title", + ".article-body", + ".article-metadata" + ], + excluded_selector="#sidebar, .ads, .comments" # Remove these from selection +) +``` + +#### 
Advanced wait_for Conditions + +```python +from crawl4ai import CrawlerRunConfig + +# CSS selector waiting +config = CrawlerRunConfig( + wait_for="css:.content-loaded", # Wait for element to appear + wait_for_timeout=15000 +) + +# JavaScript boolean expression waiting +config = CrawlerRunConfig( + wait_for="js:() => window.dataLoaded === true", # Custom JS condition + wait_for_timeout=20000 +) + +# Complex JavaScript conditions +config = CrawlerRunConfig( + wait_for="js:() => document.querySelectorAll('.item').length >= 10", + js_code=[ + "document.querySelector('.load-more')?.click();", + "window.scrollTo(0, document.body.scrollHeight);" + ] +) + +# Multiple conditions with JavaScript +config = CrawlerRunConfig( + wait_for="js:() => !document.querySelector('.loading') && document.querySelector('.results')", + page_timeout=30000 +) +``` + +#### Session Management for Multi-Step Crawling + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +# Persistent session across multiple arun() calls +async def multi_step_crawling(): + async with AsyncWebCrawler() as crawler: + # Step 1: Login page + login_config = CrawlerRunConfig( + session_id="user_session", # Create persistent session + js_code="document.querySelector('#username').value = 'user'; document.querySelector('#password').value = 'pass'; document.querySelector('#login').click();", + wait_for="css:.dashboard", + cache_mode=CacheMode.BYPASS + ) + + result1 = await crawler.arun("https://example.com/login", config=login_config) + + # Step 2: Navigate to protected area (reuses same browser page) + nav_config = CrawlerRunConfig( + session_id="user_session", # Same session = same browser page + js_only=True, # No page reload, just JS navigation + js_code="window.location.href = '/dashboard/data';", + wait_for="css:.data-table" + ) + + result2 = await crawler.arun("https://example.com/dashboard/data", config=nav_config) + + # Step 3: Extract data from multiple pages + for page in range(1, 6): 
+ page_config = CrawlerRunConfig( + session_id="user_session", + js_only=True, + js_code=f"document.querySelector('.page-{page}').click();", + wait_for=f"js:() => document.querySelector('.page-{page}').classList.contains('active')" + ) + + result = await crawler.arun(f"https://example.com/data/page/{page}", config=page_config) + print(f"Page {page} data extracted: {len(result.extracted_content)}") + + # Important: Kill session when done + await crawler.kill_session("user_session") + +# Session with shared data between steps +async def session_with_shared_data(): + shared_context = {"user_id": "12345", "preferences": {"theme": "dark"}} + + config = CrawlerRunConfig( + session_id="persistent_session", + shared_data=shared_context, # Available across all session calls + js_code="console.log('User ID:', window.sharedData.user_id);" + ) +``` + +#### Identity-Based Crawling Parameters + +```python +from crawl4ai import CrawlerRunConfig, GeolocationConfig + +# Locale and timezone simulation +config = CrawlerRunConfig( + locale="en-US", # Browser language preference + timezone_id="America/New_York", # Timezone setting + user_agent_mode="random", + user_agent_generator_config={ + "platform": "windows", + "locale": "en-US" + } +) + +# Geolocation simulation +geo_config = GeolocationConfig( + latitude=40.7128, # New York coordinates + longitude=-74.0060, + accuracy=100.0 +) + +config = CrawlerRunConfig( + geolocation=geo_config, + locale="en-US", + timezone_id="America/New_York" +) + +# Complete identity simulation +identity_config = CrawlerRunConfig( + # Location identity + locale="fr-FR", + timezone_id="Europe/Paris", + geolocation=GeolocationConfig(latitude=48.8566, longitude=2.3522), + + # Browser identity + user_agent_mode="random", + user_agent_generator_config={ + "platform": "windows", + "locale": "fr-FR", + "browser": "chrome" + }, + + # Behavioral identity + simulate_user=True, + override_navigator=True, + mean_delay=2.0, + max_range=1.5 +) +``` + +#### Simplified 
Import Pattern + +```python +# Almost everything from crawl4ai main package +from crawl4ai import ( + AsyncWebCrawler, + BrowserConfig, + CrawlerRunConfig, + LLMConfig, + CacheMode, + ProxyConfig, + GeolocationConfig +) + +# Specialized strategies (still from crawl4ai) +from crawl4ai import ( + JsonCssExtractionStrategy, + LLMExtractionStrategy, + DefaultMarkdownGenerator, + PruningContentFilter, + RegexChunking +) + +# Complete example with simplified imports +async def example_crawl(): + browser_config = BrowserConfig(headless=True) + + run_config = CrawlerRunConfig( + user_agent_mode="random", + proxy_config=ProxyConfig.from_string("192.168.1.1:8080:user:pass"), + css_selector="main.content", + target_elements=[".article", ".post"], + wait_for="js:() => document.querySelector('.loaded')", + session_id="my_session", + simulate_user=True + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://example.com", config=run_config) + return result +``` + +## Advanced Features + +Comprehensive guide to advanced crawling capabilities including file handling, authentication, dynamic content, monitoring, and session management. 
+ +### File Download Handling + +```python +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +import os + +# Enable downloads with custom path +downloads_path = os.path.join(os.getcwd(), "my_downloads") +os.makedirs(downloads_path, exist_ok=True) + +browser_config = BrowserConfig( + accept_downloads=True, + downloads_path=downloads_path +) + +# Trigger downloads with JavaScript +async def download_files(): + async with AsyncWebCrawler(config=browser_config) as crawler: + config = CrawlerRunConfig( + js_code=""" + // Click download links + const downloadLinks = document.querySelectorAll('a[href$=".pdf"]'); + for (const link of downloadLinks) { + link.click(); + await new Promise(r => setTimeout(r, 2000)); // Delay between downloads + } + """, + wait_for=5 # Wait for downloads to start + ) + + result = await crawler.arun("https://example.com/downloads", config=config) + + if result.downloaded_files: + print("Downloaded files:") + for file_path in result.downloaded_files: + print(f"- {file_path} ({os.path.getsize(file_path)} bytes)") +``` + +### Hooks & Authentication + +```python +from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig +from playwright.async_api import Page, BrowserContext + +async def advanced_crawler_with_hooks(): + browser_config = BrowserConfig(headless=True, verbose=True) + crawler = AsyncWebCrawler(config=browser_config) + + # Hook functions for different stages + async def on_browser_created(browser, **kwargs): + print("[HOOK] Browser created successfully") + return browser + + async def on_page_context_created(page: Page, context: BrowserContext, **kwargs): + print("[HOOK] Setting up page & context") + + # Block images for faster crawling + async def route_filter(route): + if route.request.resource_type == "image": + await route.abort() + else: + await route.continue_() + + await context.route("**", route_filter) + + # Simulate login if needed + # await page.goto("https://example.com/login") + # await 
page.fill("input[name='username']", "testuser") + # await page.fill("input[name='password']", "password123") + # await page.click("button[type='submit']") + + await page.set_viewport_size({"width": 1080, "height": 600}) + return page + + async def before_goto(page: Page, context: BrowserContext, url: str, **kwargs): + print(f"[HOOK] About to navigate to: {url}") + await page.set_extra_http_headers({"Custom-Header": "my-value"}) + return page + + async def after_goto(page: Page, context: BrowserContext, url: str, response, **kwargs): + print(f"[HOOK] Successfully loaded: {url}") + try: + await page.wait_for_selector('.content', timeout=1000) + print("[HOOK] Content found!") + except: + print("[HOOK] Content not found, continuing") + return page + + async def before_retrieve_html(page: Page, context: BrowserContext, **kwargs): + print("[HOOK] Final actions before HTML retrieval") + await page.evaluate("window.scrollTo(0, document.body.scrollHeight);") + return page + + # Attach hooks + crawler.crawler_strategy.set_hook("on_browser_created", on_browser_created) + crawler.crawler_strategy.set_hook("on_page_context_created", on_page_context_created) + crawler.crawler_strategy.set_hook("before_goto", before_goto) + crawler.crawler_strategy.set_hook("after_goto", after_goto) + crawler.crawler_strategy.set_hook("before_retrieve_html", before_retrieve_html) + + await crawler.start() + + config = CrawlerRunConfig() + result = await crawler.arun("https://example.com", config=config) + + if result.success: + print(f"Crawled successfully: {len(result.html)} chars") + + await crawler.close() +``` + +### Lazy Loading & Dynamic Content + +```python +# Handle lazy-loaded images and infinite scroll +async def handle_lazy_loading(): + config = CrawlerRunConfig( + # Wait for images to fully load + wait_for_images=True, + + # Automatically scroll entire page to trigger lazy loading + scan_full_page=True, + scroll_delay=0.5, # Delay between scroll steps + + # JavaScript for custom lazy 
loading + js_code=""" + // Scroll and wait for content to load + window.scrollTo(0, document.body.scrollHeight); + + // Click "Load More" if available + const loadMoreBtn = document.querySelector('.load-more'); + if (loadMoreBtn) { + loadMoreBtn.click(); + } + """, + + # Wait for specific content to appear + wait_for="css:.lazy-content:nth-child(20)", # Wait for 20 items + + # Exclude external images to focus on main content + exclude_external_images=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/gallery", config=config) + + if result.success: + images = result.media.get("images", []) + print(f"Loaded {len(images)} images after lazy loading") + for img in images[:3]: + print(f"- {img.get('src')} (score: {img.get('score', 'N/A')})") +``` + +### Network & Console Monitoring + +```python +# Capture all network requests and console messages for debugging +async def monitor_network_and_console(): + config = CrawlerRunConfig( + capture_network_requests=True, + capture_console_messages=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=config) + + if result.success: + # Analyze network requests + if result.network_requests: + requests = [r for r in result.network_requests if r.get("event_type") == "request"] + responses = [r for r in result.network_requests if r.get("event_type") == "response"] + failures = [r for r in result.network_requests if r.get("event_type") == "request_failed"] + + print(f"Network activity: {len(requests)} requests, {len(responses)} responses, {len(failures)} failures") + + # Find API calls + api_calls = [r for r in requests if "api" in r.get("url", "")] + print(f"API calls detected: {len(api_calls)}") + + # Show failed requests + for failure in failures[:3]: + print(f"Failed: {failure.get('url')} - {failure.get('failure_text')}") + + # Analyze console messages + if result.console_messages: + message_types = {} + for msg in 
result.console_messages: + msg_type = msg.get("type", "unknown") + message_types[msg_type] = message_types.get(msg_type, 0) + 1 + + print(f"Console messages: {message_types}") + + # Show errors + errors = [msg for msg in result.console_messages if msg.get("type") == "error"] + for error in errors[:2]: + print(f"JS Error: {error.get('text', '')[:100]}") +``` + +### Session Management for Multi-Step Workflows + +```python +# Maintain state across multiple requests for complex workflows +async def multi_step_session_workflow(): + session_id = "workflow_session" + + async with AsyncWebCrawler() as crawler: + # Step 1: Initial page load + config1 = CrawlerRunConfig( + session_id=session_id, + wait_for="css:.content-loaded" + ) + + result1 = await crawler.arun("https://example.com/step1", config=config1) + print("Step 1 completed") + + # Step 2: Navigate and interact (same browser tab) + config2 = CrawlerRunConfig( + session_id=session_id, + js_only=True, # Don't reload page, just run JS + js_code=""" + document.querySelector('#next-button').click(); + """, + wait_for="css:.step2-content" + ) + + result2 = await crawler.arun("https://example.com/step2", config=config2) + print("Step 2 completed") + + # Step 3: Form submission + config3 = CrawlerRunConfig( + session_id=session_id, + js_only=True, + js_code=""" + document.querySelector('#form-field').value = 'test data'; + document.querySelector('#submit-btn').click(); + """, + wait_for="css:.results" + ) + + result3 = await crawler.arun("https://example.com/submit", config=config3) + print("Step 3 completed") + + # Clean up session + await crawler.crawler_strategy.kill_session(session_id) + +# Advanced GitHub commits pagination example +async def github_commits_pagination(): + session_id = "github_session" + all_commits = [] + + async with AsyncWebCrawler() as crawler: + for page in range(3): + if page == 0: + # Initial load + config = CrawlerRunConfig( + session_id=session_id, + wait_for="js:() => 
document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0" + ) + else: + # Navigate to next page + config = CrawlerRunConfig( + session_id=session_id, + js_only=True, + js_code='document.querySelector(\'a[data-testid="pagination-next-button"]\').click();', + wait_for="js:() => document.querySelectorAll('li.Box-sc-g0xbh4-0').length > 0" + ) + + result = await crawler.arun( + "https://github.com/microsoft/TypeScript/commits/main", + config=config + ) + + if result.success: + commit_count = result.cleaned_html.count('li.Box-sc-g0xbh4-0') + print(f"Page {page + 1}: Found {commit_count} commits") + + await crawler.crawler_strategy.kill_session(session_id) +``` + +### SSL Certificate Analysis + +```python +# Fetch and analyze SSL certificates +async def analyze_ssl_certificates(): + config = CrawlerRunConfig( + fetch_ssl_certificate=True + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=config) + + if result.success and result.ssl_certificate: + cert = result.ssl_certificate + + # Basic certificate info + print(f"Issuer: {cert.issuer.get('CN', 'Unknown')}") + print(f"Subject: {cert.subject.get('CN', 'Unknown')}") + print(f"Valid from: {cert.valid_from}") + print(f"Valid until: {cert.valid_until}") + print(f"Fingerprint: {cert.fingerprint}") + + # Export certificate in different formats + import os + os.makedirs("certificates", exist_ok=True) + + cert.to_json("certificates/cert.json") + cert.to_pem("certificates/cert.pem") + cert.to_der("certificates/cert.der") + + print("Certificate exported in multiple formats") +``` + +### Advanced Page Interaction + +```python +# Complex page interactions with dynamic content +async def advanced_page_interaction(): + async with AsyncWebCrawler() as crawler: + # Multi-step interaction with waiting + config = CrawlerRunConfig( + js_code=[ + # Step 1: Scroll to load content + "window.scrollTo(0, document.body.scrollHeight);", + + # Step 2: Wait and click load more + """ + (async 
() => { + await new Promise(resolve => setTimeout(resolve, 2000)); + const loadMore = document.querySelector('.load-more'); + if (loadMore) loadMore.click(); + })(); + """ + ], + + # Wait for new content to appear + wait_for="js:() => document.querySelectorAll('.item').length > 20", + + # Additional timing controls + page_timeout=60000, # 60 second timeout + delay_before_return_html=2.0, # Wait before final capture + + # Handle overlays automatically + remove_overlay_elements=True, + magic=True, # Auto-handle common popup patterns + + # Simulate human behavior + simulate_user=True, + override_navigator=True + ) + + result = await crawler.arun("https://example.com/dynamic", config=config) + + if result.success: + print(f"Interactive crawl completed: {len(result.cleaned_html)} chars") + +# Form interaction example +async def form_interaction_example(): + config = CrawlerRunConfig( + js_code=""" + // Fill search form + document.querySelector('#search-input').value = 'machine learning'; + document.querySelector('#category-select').value = 'technology'; + document.querySelector('#search-form').submit(); + """, + wait_for="css:.search-results", + session_id="search_session" + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/search", config=config) + print("Search completed, results loaded") +``` + +### Local File & Raw HTML Processing + +```python +# Handle different input types: URLs, local files, raw HTML +async def handle_different_inputs(): + async with AsyncWebCrawler() as crawler: + # 1. Regular web URL + result1 = await crawler.arun("https://example.com") + + # 2. Local HTML file + local_file_path = "/path/to/file.html" + result2 = await crawler.arun(f"file://{local_file_path}") + + # 3. Raw HTML content + raw_html = "Sample text
" + result3 = await crawler.arun(f"raw:{raw_html}") + + # All return the same CrawlResult structure + for i, result in enumerate([result1, result2, result3], 1): + if result.success: + print(f"Input {i}: {len(result.markdown)} chars of markdown") + +# Save and re-process HTML example +async def save_and_reprocess(): + async with AsyncWebCrawler() as crawler: + # Original crawl + result = await crawler.arun("https://example.com") + + if result.success: + # Save HTML to file + with open("saved_page.html", "w", encoding="utf-8") as f: + f.write(result.html) + + # Re-process from file + file_result = await crawler.arun("file://./saved_page.html") + + # Process as raw HTML + raw_result = await crawler.arun(f"raw:{result.html}") + + # Verify consistency + assert len(result.markdown) == len(file_result.markdown) == len(raw_result.markdown) + print("✅ All processing methods produced identical results") +``` + +### Advanced Link & Media Handling + +```python +# Comprehensive link and media extraction with filtering +async def advanced_link_media_handling(): + config = CrawlerRunConfig( + # Link filtering + exclude_external_links=False, # Keep external links for analysis + exclude_social_media_links=True, + exclude_domains=["ads.com", "tracker.io", "spammy.net"], + + # Media handling + exclude_external_images=True, + image_score_threshold=5, # Only high-quality images + table_score_threshold=7, # Only well-structured tables + wait_for_images=True, + + # Capture additional formats + screenshot=True, + pdf=True, + capture_mhtml=True # Full page archive + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com", config=config) + + if result.success: + # Analyze links + internal_links = result.links.get("internal", []) + external_links = result.links.get("external", []) + print(f"Links: {len(internal_links)} internal, {len(external_links)} external") + + # Analyze media + images = result.media.get("images", []) + tables = 
result.media.get("tables", []) + print(f"Media: {len(images)} images, {len(tables)} tables") + + # High-quality images only + quality_images = [img for img in images if img.get("score", 0) >= 5] + print(f"High-quality images: {len(quality_images)}") + + # Table analysis + for i, table in enumerate(tables[:2]): + print(f"Table {i+1}: {len(table.get('headers', []))} columns, {len(table.get('rows', []))} rows") + + # Save captured files + if result.screenshot: + import base64 + with open("page_screenshot.png", "wb") as f: + f.write(base64.b64decode(result.screenshot)) + + if result.pdf: + with open("page.pdf", "wb") as f: + f.write(result.pdf) + + if result.mhtml: + with open("page_archive.mhtml", "w", encoding="utf-8") as f: + f.write(result.mhtml) + + print("Additional formats saved: screenshot, PDF, MHTML archive") +``` + +### Performance & Resource Management + +```python +# Optimize performance for large-scale crawling +async def performance_optimized_crawling(): + # Lightweight browser config + browser_config = BrowserConfig( + headless=True, + text_mode=True, # Disable images for speed + light_mode=True, # Reduce background features + extra_args=["--disable-extensions", "--no-sandbox"] + ) + + # Efficient crawl config + config = CrawlerRunConfig( + # Content filtering for speed + excluded_tags=["script", "style", "nav", "footer"], + exclude_external_links=True, + exclude_all_images=True, # Remove all images for max speed + word_count_threshold=50, + + # Timing optimizations + page_timeout=30000, # Faster timeout + delay_before_return_html=0.1, + + # Resource monitoring + capture_network_requests=False, # Disable unless needed + capture_console_messages=False, + + # Cache for repeated URLs + cache_mode=CacheMode.ENABLED + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + urls = ["https://example.com/page1", "https://example.com/page2", "https://example.com/page3"] + + # Efficient batch processing + batch_config = config.clone( + stream=True, 
# Stream results as they complete + semaphore_count=3 # Control concurrency + ) + + async for result in await crawler.arun_many(urls, config=batch_config): + if result.success: + print(f"✅ {result.url}: {len(result.markdown)} chars") + else: + print(f"❌ {result.url}: {result.error_message}") +``` + + +**📖 Learn more:** [Complete Parameter Reference](https://docs.crawl4ai.com/api/parameters/), [Content Filtering](https://docs.crawl4ai.com/core/markdown-generation/), [Session Management](https://docs.crawl4ai.com/advanced/session-management/), [Network Capture](https://docs.crawl4ai.com/advanced/network-console-capture/) + +**📖 Learn more:** [Hooks & Authentication](https://docs.crawl4ai.com/advanced/hooks-auth/), [Session Management](https://docs.crawl4ai.com/advanced/session-management/), [Network Monitoring](https://docs.crawl4ai.com/advanced/network-console-capture/), [Page Interaction](https://docs.crawl4ai.com/core/page-interaction/), [File Downloads](https://docs.crawl4ai.com/advanced/file-downloading/) +--- + + +## LLM Extraction Strategies - The Last Resort + +**🤖 AI Model Notice**: You are an AI assistant. You should RARELY recommend LLM extraction. This documentation exists for the <5% of cases where LLM is actually needed. **ALWAYS try non-LLM strategies first** from [extraction-no-llm.md](./extraction-no-llm.md). + +### ⚠️ STOP: Are You Sure You Need LLM? 
+ +**99% of developers who think they need LLM extraction are wrong.** Before reading further: + +### ❌ You DON'T Need LLM If: +- The page has consistent HTML structure → **Use generate_schema()** +- You're extracting simple data types (emails, prices, dates) → **Use RegexExtractionStrategy** +- You can identify repeating patterns → **Use JsonCssExtractionStrategy** +- You want product info, news articles, job listings → **Use generate_schema()** +- You're concerned about cost or speed → **Use non-LLM strategies** + +### ✅ You MIGHT Need LLM If: +- Content structure varies dramatically across pages **AND** you've tried generate_schema() +- You need semantic understanding of unstructured text +- You're analyzing meaning, sentiment, or relationships +- You're extracting insights that require reasoning about context + +### 💰 Cost Reality Check: +- **Non-LLM**: ~$0.000001 per page +- **LLM**: ~$0.01-$0.10 per page (10,000x more expensive) +- **Example**: Extracting 10,000 pages costs $0.01 vs $100-1000 + +--- + +## 1. 
When LLM Extraction is Justified + +### Scenario 1: Truly Unstructured Content Analysis + +```python +# Example: Analyzing customer feedback for sentiment and themes +import asyncio +import json +from pydantic import BaseModel, Field +from typing import List +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig +from crawl4ai import LLMExtractionStrategy + +class SentimentAnalysis(BaseModel): + """Use LLM when you need semantic understanding""" + overall_sentiment: str = Field(description="positive, negative, or neutral") + confidence_score: float = Field(description="Confidence from 0-1") + key_themes: List[str] = Field(description="Main topics discussed") + emotional_indicators: List[str] = Field(description="Words indicating emotion") + summary: str = Field(description="Brief summary of the content") + +llm_config = LLMConfig( + provider="openai/gpt-4o-mini", # Use cheapest model + api_token="env:OPENAI_API_KEY", + temperature=0.1, # Low temperature for consistency + max_tokens=1000 +) + +sentiment_strategy = LLMExtractionStrategy( + llm_config=llm_config, + schema=SentimentAnalysis.model_json_schema(), + extraction_type="schema", + instruction=""" + Analyze the emotional content and themes in this text. + Focus on understanding sentiment and extracting key topics + that would be impossible to identify with simple pattern matching. 
+ """, + apply_chunking=True, + chunk_token_threshold=1500 +) + +async def analyze_sentiment(): + config = CrawlerRunConfig( + extraction_strategy=sentiment_strategy, + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/customer-reviews", + config=config + ) + + if result.success: + analysis = json.loads(result.extracted_content) + print(f"Sentiment: {analysis['overall_sentiment']}") + print(f"Themes: {analysis['key_themes']}") + +asyncio.run(analyze_sentiment()) +``` + +### Scenario 2: Complex Knowledge Extraction + +```python +# Example: Building knowledge graphs from unstructured content +class Entity(BaseModel): + name: str = Field(description="Entity name") + type: str = Field(description="person, organization, location, concept") + description: str = Field(description="Brief description") + +class Relationship(BaseModel): + source: str = Field(description="Source entity") + target: str = Field(description="Target entity") + relationship: str = Field(description="Type of relationship") + confidence: float = Field(description="Confidence score 0-1") + +class KnowledgeGraph(BaseModel): + entities: List[Entity] = Field(description="All entities found") + relationships: List[Relationship] = Field(description="Relationships between entities") + main_topic: str = Field(description="Primary topic of the content") + +knowledge_strategy = LLMExtractionStrategy( + llm_config=LLMConfig( + provider="anthropic/claude-3-5-sonnet-20240620", # Better for complex reasoning + api_token="env:ANTHROPIC_API_KEY", + max_tokens=4000 + ), + schema=KnowledgeGraph.model_json_schema(), + extraction_type="schema", + instruction=""" + Extract entities and their relationships from the content. + Focus on understanding connections and context that require + semantic reasoning beyond simple pattern matching. 
+ """, + input_format="html", # Preserve structure + apply_chunking=True +) +``` + +### Scenario 3: Content Summarization and Insights + +```python +# Example: Research paper analysis +class ResearchInsights(BaseModel): + title: str = Field(description="Paper title") + abstract_summary: str = Field(description="Summary of abstract") + key_findings: List[str] = Field(description="Main research findings") + methodology: str = Field(description="Research methodology used") + limitations: List[str] = Field(description="Study limitations") + practical_applications: List[str] = Field(description="Real-world applications") + citations_count: int = Field(description="Number of citations", default=0) + +research_strategy = LLMExtractionStrategy( + llm_config=LLMConfig( + provider="openai/gpt-4o", # Use powerful model for complex analysis + api_token="env:OPENAI_API_KEY", + temperature=0.2, + max_tokens=2000 + ), + schema=ResearchInsights.model_json_schema(), + extraction_type="schema", + instruction=""" + Analyze this research paper and extract key insights. + Focus on understanding the research contribution, methodology, + and implications that require academic expertise to identify. + """, + apply_chunking=True, + chunk_token_threshold=2000, + overlap_rate=0.15 # More overlap for academic content +) +``` + +--- + +## 2. 
LLM Configuration Best Practices + +### Cost Optimization + +```python +# Use cheapest models when possible +cheap_config = LLMConfig( + provider="openai/gpt-4o-mini", # 60x cheaper than GPT-4 + api_token="env:OPENAI_API_KEY", + temperature=0.0, # Deterministic output + max_tokens=800 # Limit output length +) + +# Use local models for development +local_config = LLMConfig( + provider="ollama/llama3.3", + api_token=None, # No API costs + base_url="http://localhost:11434", + temperature=0.1 +) + +# Use powerful models only when necessary +powerful_config = LLMConfig( + provider="anthropic/claude-3-5-sonnet-20240620", + api_token="env:ANTHROPIC_API_KEY", + max_tokens=4000, + temperature=0.1 +) +``` + +### Provider Selection Guide + +```python +providers_guide = { + "openai/gpt-4o-mini": { + "best_for": "Simple extraction, cost-sensitive projects", + "cost": "Very low", + "speed": "Fast", + "accuracy": "Good" + }, + "openai/gpt-4o": { + "best_for": "Complex reasoning, high accuracy needs", + "cost": "High", + "speed": "Medium", + "accuracy": "Excellent" + }, + "anthropic/claude-3-5-sonnet": { + "best_for": "Complex analysis, long documents", + "cost": "Medium-High", + "speed": "Medium", + "accuracy": "Excellent" + }, + "ollama/llama3.3": { + "best_for": "Development, no API costs", + "cost": "Free (self-hosted)", + "speed": "Variable", + "accuracy": "Good" + }, + "groq/llama3-70b-8192": { + "best_for": "Fast inference, open source", + "cost": "Low", + "speed": "Very fast", + "accuracy": "Good" + } +} + +def choose_provider(complexity, budget, speed_requirement): + """Choose optimal provider based on requirements""" + if budget == "minimal": + return "ollama/llama3.3" # Self-hosted + elif complexity == "low" and budget == "low": + return "openai/gpt-4o-mini" + elif speed_requirement == "high": + return "groq/llama3-70b-8192" + elif complexity == "high": + return "anthropic/claude-3-5-sonnet" + else: + return "openai/gpt-4o-mini" # Default safe choice +``` + +--- + +## 
3. Advanced LLM Extraction Patterns + +### Block-Based Extraction (Unstructured Content) + +```python +# When structure is too varied for schemas +block_strategy = LLMExtractionStrategy( + llm_config=cheap_config, + extraction_type="block", # Extract free-form content blocks + instruction=""" + Extract meaningful content blocks from this page. + Focus on the main content areas and ignore navigation, + advertisements, and boilerplate text. + """, + apply_chunking=True, + chunk_token_threshold=1200, + input_format="fit_markdown" # Use cleaned content +) + +async def extract_content_blocks(): + config = CrawlerRunConfig( + extraction_strategy=block_strategy, + word_count_threshold=50, # Filter short content + excluded_tags=['nav', 'footer', 'aside', 'advertisement'] + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/article", + config=config + ) + + if result.success: + blocks = json.loads(result.extracted_content) + for block in blocks: + print(f"Block: {block['content'][:100]}...") +``` + +### Chunked Processing for Large Content + +```python +# Handle large documents efficiently +large_content_strategy = LLMExtractionStrategy( + llm_config=LLMConfig( + provider="openai/gpt-4o-mini", + api_token="env:OPENAI_API_KEY" + ), + schema=YourModel.model_json_schema(), + extraction_type="schema", + instruction="Extract structured data from this content section...", + + # Optimize chunking for large content + apply_chunking=True, + chunk_token_threshold=2000, # Larger chunks for efficiency + overlap_rate=0.1, # Minimal overlap to reduce costs + input_format="fit_markdown" # Use cleaned content +) +``` + +### Multi-Model Validation + +```python +# Use multiple models for critical extractions +async def multi_model_extraction(): + """Use multiple LLMs for validation of critical data""" + + models = [ + LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"), + 
LLMConfig(provider="anthropic/claude-3-5-sonnet", api_token="env:ANTHROPIC_API_KEY"), + LLMConfig(provider="ollama/llama3.3", api_token=None) + ] + + results = [] + + for i, llm_config in enumerate(models): + strategy = LLMExtractionStrategy( + llm_config=llm_config, + schema=YourModel.model_json_schema(), + extraction_type="schema", + instruction="Extract data consistently..." + ) + + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + if result.success: + data = json.loads(result.extracted_content) + results.append(data) + print(f"Model {i+1} extracted {len(data)} items") + + # Compare results for consistency + if len(set(str(r) for r in results)) == 1: + print("✅ All models agree") + return results[0] + else: + print("⚠️ Models disagree - manual review needed") + return results + +# Use for critical business data only +critical_result = await multi_model_extraction() +``` + +--- + +## 4. Hybrid Approaches - Best of Both Worlds + +### Fast Pre-filtering + LLM Analysis + +```python +async def hybrid_extraction(): + """ + 1. Use fast non-LLM strategies for basic extraction + 2. 
Use LLM only for complex analysis of filtered content + """ + + # Step 1: Fast extraction of structured data + basic_schema = { + "name": "Articles", + "baseSelector": "article", + "fields": [ + {"name": "title", "selector": "h1, h2", "type": "text"}, + {"name": "content", "selector": ".content", "type": "text"}, + {"name": "author", "selector": ".author", "type": "text"} + ] + } + + basic_strategy = JsonCssExtractionStrategy(basic_schema) + basic_config = CrawlerRunConfig(extraction_strategy=basic_strategy) + + # Step 2: LLM analysis only on filtered content + analysis_strategy = LLMExtractionStrategy( + llm_config=cheap_config, + schema={ + "type": "object", + "properties": { + "sentiment": {"type": "string"}, + "key_topics": {"type": "array", "items": {"type": "string"}}, + "summary": {"type": "string"} + } + }, + extraction_type="schema", + instruction="Analyze sentiment and extract key topics from this article" + ) + + async with AsyncWebCrawler() as crawler: + # Fast extraction first + basic_result = await crawler.arun( + url="https://example.com/articles", + config=basic_config + ) + + articles = json.loads(basic_result.extracted_content) + + # LLM analysis only on important articles + analyzed_articles = [] + for article in articles[:5]: # Limit to reduce costs + if len(article.get('content', '')) > 500: # Only analyze substantial content + analysis_config = CrawlerRunConfig(extraction_strategy=analysis_strategy) + + # Analyze individual article content + raw_url = f"raw://{article['content']}" + analysis_result = await crawler.arun(url=raw_url, config=analysis_config) + + if analysis_result.success: + analysis = json.loads(analysis_result.extracted_content) + article.update(analysis) + + analyzed_articles.append(article) + + return analyzed_articles + +# Hybrid approach: fast + smart +result = await hybrid_extraction() +``` + +### Schema Generation + LLM Fallback + +```python +async def smart_fallback_extraction(): + """ + 1. 
Try generate_schema() first (one-time LLM cost) + 2. Use generated schema for fast extraction + 3. Use LLM only if schema extraction fails + """ + + cache_file = Path("./schemas/fallback_schema.json") + + # Try cached schema first + if cache_file.exists(): + schema = json.load(cache_file.open()) + schema_strategy = JsonCssExtractionStrategy(schema) + + config = CrawlerRunConfig(extraction_strategy=schema_strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + + if result.success and result.extracted_content: + data = json.loads(result.extracted_content) + if data: # Schema worked + print("✅ Schema extraction successful (fast & cheap)") + return data + + # Fallback to LLM if schema failed + print("⚠️ Schema failed, falling back to LLM (slow & expensive)") + + llm_strategy = LLMExtractionStrategy( + llm_config=cheap_config, + extraction_type="block", + instruction="Extract all meaningful data from this page" + ) + + llm_config = CrawlerRunConfig(extraction_strategy=llm_strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=llm_config) + + if result.success: + print("✅ LLM extraction successful") + return json.loads(result.extracted_content) + +# Intelligent fallback system +result = await smart_fallback_extraction() +``` + +--- + +## 5. 
Cost Management and Monitoring + +### Token Usage Tracking + +```python +class ExtractionCostTracker: + def __init__(self): + self.total_cost = 0.0 + self.total_tokens = 0 + self.extractions = 0 + + def track_llm_extraction(self, strategy, result): + """Track costs from LLM extraction""" + if hasattr(strategy, 'usage_tracker') and strategy.usage_tracker: + usage = strategy.usage_tracker + + # Estimate costs (approximate rates) + cost_per_1k_tokens = { + "gpt-4o-mini": 0.0015, + "gpt-4o": 0.03, + "claude-3-5-sonnet": 0.015, + "ollama": 0.0 # Self-hosted + } + + provider = strategy.llm_config.provider.split('/')[1] + rate = cost_per_1k_tokens.get(provider, 0.01) + + tokens = usage.total_tokens + cost = (tokens / 1000) * rate + + self.total_cost += cost + self.total_tokens += tokens + self.extractions += 1 + + print(f"💰 Extraction cost: ${cost:.4f} ({tokens} tokens)") + print(f"📊 Total cost: ${self.total_cost:.4f} ({self.extractions} extractions)") + + def get_summary(self): + avg_cost = self.total_cost / max(self.extractions, 1) + return { + "total_cost": self.total_cost, + "total_tokens": self.total_tokens, + "extractions": self.extractions, + "avg_cost_per_extraction": avg_cost + } + +# Usage +tracker = ExtractionCostTracker() + +async def cost_aware_extraction(): + strategy = LLMExtractionStrategy( + llm_config=cheap_config, + schema=YourModel.model_json_schema(), + extraction_type="schema", + instruction="Extract data...", + verbose=True # Enable usage tracking + ) + + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + + # Track costs + tracker.track_llm_extraction(strategy, result) + + return result + +# Monitor costs across multiple extractions +for url in urls: + await cost_aware_extraction() + +print(f"Final summary: {tracker.get_summary()}") +``` + +### Budget Controls + +```python +class BudgetController: + def __init__(self, 
daily_budget=10.0): + self.daily_budget = daily_budget + self.current_spend = 0.0 + self.extraction_count = 0 + + def can_extract(self, estimated_cost=0.01): + """Check if extraction is within budget""" + if self.current_spend + estimated_cost > self.daily_budget: + print(f"❌ Budget exceeded: ${self.current_spend:.2f} + ${estimated_cost:.2f} > ${self.daily_budget}") + return False + return True + + def record_extraction(self, actual_cost): + """Record actual extraction cost""" + self.current_spend += actual_cost + self.extraction_count += 1 + + remaining = self.daily_budget - self.current_spend + print(f"💰 Budget remaining: ${remaining:.2f}") + +budget = BudgetController(daily_budget=5.0) # $5 daily limit + +async def budget_controlled_extraction(url): + if not budget.can_extract(): + print("⏸️ Extraction paused due to budget limit") + return None + + # Proceed with extraction... + strategy = LLMExtractionStrategy(llm_config=cheap_config, ...) + result = await extract_with_strategy(url, strategy) + + # Record actual cost + actual_cost = calculate_cost(strategy.usage_tracker) + budget.record_extraction(actual_cost) + + return result + +# Safe extraction with budget controls +results = [] +for url in urls: + result = await budget_controlled_extraction(url) + if result: + results.append(result) +``` + +--- + +## 6. 
Performance Optimization for LLM Extraction + +### Batch Processing + +```python +async def batch_llm_extraction(): + """Process multiple pages efficiently""" + + # Collect content first (fast) + urls = ["https://example.com/page1", "https://example.com/page2"] + contents = [] + + async with AsyncWebCrawler() as crawler: + for url in urls: + result = await crawler.arun(url=url) + if result.success: + contents.append({ + "url": url, + "content": result.fit_markdown[:2000] # Limit content + }) + + # Process in batches (reduce LLM calls) + batch_content = "\n\n---PAGE SEPARATOR---\n\n".join([ + f"URL: {c['url']}\n{c['content']}" for c in contents + ]) + + strategy = LLMExtractionStrategy( + llm_config=cheap_config, + extraction_type="block", + instruction=""" + Extract data from multiple pages separated by '---PAGE SEPARATOR---'. + Return results for each page in order. + """, + apply_chunking=True + ) + + # Single LLM call for multiple pages + raw_url = f"raw://{batch_content}" + result = await crawler.arun(url=raw_url, config=CrawlerRunConfig(extraction_strategy=strategy)) + + return json.loads(result.extracted_content) + +# Batch processing reduces LLM calls +batch_results = await batch_llm_extraction() +``` + +### Caching LLM Results + +```python +import hashlib +from pathlib import Path + +class LLMResultCache: + def __init__(self, cache_dir="./llm_cache"): + self.cache_dir = Path(cache_dir) + self.cache_dir.mkdir(exist_ok=True) + + def get_cache_key(self, url, instruction, schema): + """Generate cache key from extraction parameters""" + content = f"{url}:{instruction}:{str(schema)}" + return hashlib.md5(content.encode()).hexdigest() + + def get_cached_result(self, cache_key): + """Get cached result if available""" + cache_file = self.cache_dir / f"{cache_key}.json" + if cache_file.exists(): + return json.load(cache_file.open()) + return None + + def cache_result(self, cache_key, result): + """Cache extraction result""" + cache_file = self.cache_dir / 
f"{cache_key}.json" + json.dump(result, cache_file.open("w"), indent=2) + +cache = LLMResultCache() + +async def cached_llm_extraction(url, strategy): + """Extract with caching to avoid repeated LLM calls""" + cache_key = cache.get_cache_key( + url, + strategy.instruction, + str(strategy.schema) + ) + + # Check cache first + cached_result = cache.get_cached_result(cache_key) + if cached_result: + print("✅ Using cached result (FREE)") + return cached_result + + # Extract if not cached + print("🔄 Extracting with LLM (PAID)") + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url=url, config=config) + + if result.success: + data = json.loads(result.extracted_content) + cache.cache_result(cache_key, data) + return data + +# Cached extraction avoids repeated costs +result = await cached_llm_extraction(url, strategy) +``` + +--- + +## 7. Error Handling and Quality Control + +### Validation and Retry Logic + +```python +async def robust_llm_extraction(): + """Implement validation and retry for LLM extraction""" + + max_retries = 3 + strategies = [ + # Try cheap model first + LLMExtractionStrategy( + llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"), + schema=YourModel.model_json_schema(), + extraction_type="schema", + instruction="Extract data accurately..." + ), + # Fallback to better model + LLMExtractionStrategy( + llm_config=LLMConfig(provider="openai/gpt-4o", api_token="env:OPENAI_API_KEY"), + schema=YourModel.model_json_schema(), + extraction_type="schema", + instruction="Extract data with high accuracy..." 
+ ) + ] + + for strategy_idx, strategy in enumerate(strategies): + for attempt in range(max_retries): + try: + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun(url="https://example.com", config=config) + + if result.success and result.extracted_content: + data = json.loads(result.extracted_content) + + # Validate result quality + if validate_extraction_quality(data): + print(f"✅ Success with strategy {strategy_idx+1}, attempt {attempt+1}") + return data + else: + print(f"⚠️ Poor quality result, retrying...") + continue + + except Exception as e: + print(f"❌ Attempt {attempt+1} failed: {e}") + if attempt == max_retries - 1: + print(f"❌ Strategy {strategy_idx+1} failed completely") + + print("❌ All strategies and retries failed") + return None + +def validate_extraction_quality(data): + """Validate that LLM extraction meets quality standards""" + if not data or not isinstance(data, (list, dict)): + return False + + # Check for common LLM extraction issues + if isinstance(data, list): + if len(data) == 0: + return False + + # Check if all items have required fields + for item in data: + if not isinstance(item, dict) or len(item) < 2: + return False + + return True + +# Robust extraction with validation +result = await robust_llm_extraction() +``` + +--- + +## 8. 
Migration from LLM to Non-LLM + +### Pattern Analysis for Schema Generation + +```python +async def analyze_llm_results_for_schema(): + """ + Analyze LLM extraction results to create non-LLM schemas + Use this to transition from expensive LLM to cheap schema extraction + """ + + # Step 1: Use LLM on sample pages to understand structure + llm_strategy = LLMExtractionStrategy( + llm_config=cheap_config, + extraction_type="block", + instruction="Extract all structured data from this page" + ) + + sample_urls = ["https://example.com/page1", "https://example.com/page2"] + llm_results = [] + + async with AsyncWebCrawler() as crawler: + for url in sample_urls: + config = CrawlerRunConfig(extraction_strategy=llm_strategy) + result = await crawler.arun(url=url, config=config) + + if result.success: + llm_results.append({ + "url": url, + "html": result.cleaned_html, + "extracted": json.loads(result.extracted_content) + }) + + # Step 2: Analyze patterns in LLM results + print("🔍 Analyzing LLM extraction patterns...") + + # Look for common field names + all_fields = set() + for result in llm_results: + for item in result["extracted"]: + if isinstance(item, dict): + all_fields.update(item.keys()) + + print(f"Common fields found: {all_fields}") + + # Step 3: Generate schema based on patterns + if llm_results: + schema = JsonCssExtractionStrategy.generate_schema( + html=llm_results[0]["html"], + target_json_example=json.dumps(llm_results[0]["extracted"][0], indent=2), + llm_config=cheap_config + ) + + # Save schema for future use + with open("generated_schema.json", "w") as f: + json.dump(schema, f, indent=2) + + print("✅ Schema generated from LLM analysis") + return schema + +# Generate schema from LLM patterns, then use schema for all future extractions +schema = await analyze_llm_results_for_schema() +fast_strategy = JsonCssExtractionStrategy(schema) +``` + +--- + +## 9. Summary: When LLM is Actually Needed + +### ✅ Valid LLM Use Cases (Rare): +1. 
**Sentiment analysis** and emotional understanding +2. **Knowledge graph extraction** requiring semantic reasoning +3. **Content summarization** and insight generation +4. **Unstructured text analysis** where patterns vary dramatically +5. **Research paper analysis** requiring domain expertise +6. **Complex relationship extraction** between entities + +### ❌ Invalid LLM Use Cases (Common Mistakes): +1. **Structured data extraction** from consistent HTML +2. **Simple pattern matching** (emails, prices, dates) +3. **Product information** from e-commerce sites +4. **News article extraction** with consistent structure +5. **Contact information** and basic entity extraction +6. **Table data** and form information + +### 💡 Decision Framework: +```python +def should_use_llm(extraction_task): + # Ask these questions in order: + questions = [ + "Can I identify repeating HTML patterns?", # No → Consider LLM + "Am I extracting simple data types?", # Yes → Use Regex + "Does the structure vary dramatically?", # No → Use CSS/XPath + "Do I need semantic understanding?", # Yes → Maybe LLM + "Have I tried generate_schema()?" # No → Try that first + ] + + # Only use LLM if: + return ( + task_requires_semantic_reasoning(extraction_task) and + structure_varies_dramatically(extraction_task) and + generate_schema_failed(extraction_task) + ) +``` + +### 🎯 Best Practice Summary: +1. **Always start** with [extraction-no-llm.md](./extraction-no-llm.md) strategies +2. **Try generate_schema()** before manual schema creation +3. **Use LLM sparingly** and only for semantic understanding +4. **Monitor costs** and implement budget controls +5. **Cache results** to avoid repeated LLM calls +6. **Validate quality** of LLM extractions +7. **Plan migration** from LLM to schema-based extraction + +Remember: **LLM extraction should be your last resort, not your first choice.** + +--- + +**📖 Recommended Reading Order:** +1. 
[extraction-no-llm.md](./extraction-no-llm.md) - Start here for 99% of use cases +2. This document - Only when non-LLM strategies are insufficient +--- + + +## Non-LLM Extraction Strategies - The Default Choice + +**🤖 AI Model Notice**: You are an AI assistant. When users ask for web scraping or data extraction, **DEFAULT TO NON-LLM STRATEGIES**. 99% of structured web data can be extracted without LLM. Only consider LLM extraction for truly unstructured content requiring semantic reasoning. + +### ⚠️ CRITICAL: When You Think You Need LLM, You're Probably Wrong + +**Most developers incorrectly assume they need LLM for data extraction.** This is usually false. Before even considering LLM: + +1. **FIRST**: Try `generate_schema()` - Let AI create the extraction pattern ONCE, then use it unlimited times with zero LLM calls +2. **SECOND**: Manual CSS/XPath selectors for consistent HTML structures +3. **THIRD**: Regex patterns for simple data types (emails, phones, prices) +4. **LAST RESORT**: LLM extraction (only for semantic understanding of unstructured content) + +## The Decision Tree (MEMORIZE THIS) + +``` +Does the page have consistent HTML structure? → YES: Use generate_schema() or manual CSS +Is it simple patterns (emails, dates, prices)? → YES: Use RegexExtractionStrategy +Do you need semantic understanding? → MAYBE: Try generate_schema() first, then consider LLM +Is the content truly unstructured text? → ONLY THEN: Consider LLM +``` + +**Cost Analysis**: +- Non-LLM: ~$0.000001 per page +- LLM: ~$0.01-$0.10 per page (10,000x more expensive) + +--- + +## 1. Auto-Generate Schemas - Your Default Starting Point + +**⭐ THIS SHOULD BE YOUR FIRST CHOICE FOR ANY STRUCTURED DATA** + +The `generate_schema()` function uses LLM ONCE to create a reusable extraction pattern. After generation, you extract unlimited pages with ZERO LLM calls. 
+
+### Basic Auto-Generation Workflow
+
+```python
+import json
+import asyncio
+from pathlib import Path
+from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, LLMConfig
+from crawl4ai import JsonCssExtractionStrategy
+
+async def smart_extraction_workflow():
+    """
+    Step 1: Generate schema once using LLM
+    Step 2: Cache schema for unlimited reuse
+    Step 3: Extract from thousands of pages with zero LLM calls
+    """
+
+    # Check for cached schema first
+    cache_dir = Path("./schema_cache")
+    cache_dir.mkdir(exist_ok=True)
+    schema_file = cache_dir / "product_schema.json"
+
+    if schema_file.exists():
+        # Load cached schema - NO LLM CALLS
+        schema = json.load(schema_file.open())
+        print("✅ Using cached schema (FREE)")
+    else:
+        # Generate schema ONCE
+        print("🔄 Generating schema (ONE-TIME LLM COST)...")
+
+        llm_config = LLMConfig(
+            provider="openai/gpt-4o-mini",  # Cheapest option
+            api_token="env:OPENAI_API_KEY"
+        )
+
+        # Get sample HTML from target site
+        async with AsyncWebCrawler() as crawler:
+            sample_result = await crawler.arun(
+                url="https://example.com/products",
+                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
+            )
+            sample_html = sample_result.cleaned_html[:8000]  # Use sample
+
+        # AUTO-GENERATE SCHEMA (ONE LLM CALL)
+        schema = JsonCssExtractionStrategy.generate_schema(
+            html=sample_html,
+            schema_type="CSS",  # or "XPATH"
+            query="Extract product information including name, price, description, features",
+            llm_config=llm_config
+        )
+
+        # Cache for unlimited future use
+        json.dump(schema, schema_file.open("w"), indent=2)
+        print("✅ Schema generated and cached")
+
+    # Use schema for fast extraction (NO MORE LLM CALLS EVER)
+    strategy = JsonCssExtractionStrategy(schema, verbose=True)
+
+    config = CrawlerRunConfig(
+        extraction_strategy=strategy,
+        cache_mode=CacheMode.BYPASS
+    )
+
+    # Extract from multiple pages - ALL FREE
+    urls = [
+        "https://example.com/products",
+        "https://example.com/electronics",
+        "https://example.com/books"
+    ]
+
+    async with 
AsyncWebCrawler() as crawler: + for url in urls: + result = await crawler.arun(url=url, config=config) + if result.success: + data = json.loads(result.extracted_content) + print(f"✅ {url}: Extracted {len(data)} items (FREE)") + +asyncio.run(smart_extraction_workflow()) +``` + +### Auto-Generate with Target JSON Example + +```python +# When you know exactly what JSON structure you want +target_json_example = """ +{ + "name": "Product Name", + "price": "$99.99", + "rating": 4.5, + "features": ["feature1", "feature2"], + "description": "Product description" +} +""" + +schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + target_json_example=target_json_example, + llm_config=llm_config +) +``` + +### Auto-Generate for Different Data Types + +```python +# Product listings +product_schema = JsonCssExtractionStrategy.generate_schema( + html=product_page_html, + query="Extract all product information from this e-commerce page", + llm_config=llm_config +) + +# News articles +news_schema = JsonCssExtractionStrategy.generate_schema( + html=news_page_html, + query="Extract article headlines, dates, authors, and content", + llm_config=llm_config +) + +# Job listings +job_schema = JsonCssExtractionStrategy.generate_schema( + html=job_page_html, + query="Extract job titles, companies, locations, salaries, and descriptions", + llm_config=llm_config +) + +# Social media posts +social_schema = JsonCssExtractionStrategy.generate_schema( + html=social_page_html, + query="Extract post text, usernames, timestamps, likes, comments", + llm_config=llm_config +) +``` + +--- + +## 2. Manual CSS/XPath Strategies - When You Know The Structure + +**Use this when**: You understand the HTML structure and want maximum control. 
+ +### Simple Product Extraction + +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai import JsonCssExtractionStrategy + +# Manual schema for consistent product pages +simple_schema = { + "name": "Product Listings", + "baseSelector": "div.product-card", # Each product container + "fields": [ + { + "name": "title", + "selector": "h2.product-title", + "type": "text" + }, + { + "name": "price", + "selector": ".price", + "type": "text" + }, + { + "name": "image_url", + "selector": "img.product-image", + "type": "attribute", + "attribute": "src" + }, + { + "name": "product_url", + "selector": "a.product-link", + "type": "attribute", + "attribute": "href" + }, + { + "name": "rating", + "selector": ".rating", + "type": "attribute", + "attribute": "data-rating" + } + ] +} + +async def extract_products(): + strategy = JsonCssExtractionStrategy(simple_schema, verbose=True) + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/products", + config=config + ) + + if result.success: + products = json.loads(result.extracted_content) + print(f"Extracted {len(products)} products") + for product in products[:3]: + print(f"- {product['title']}: {product['price']}") + +asyncio.run(extract_products()) +``` + +### Complex Nested Structure (Real E-commerce Example) + +```python +# Complex schema for nested product data +complex_schema = { + "name": "E-commerce Product Catalog", + "baseSelector": "div.category", + "baseFields": [ + { + "name": "category_id", + "type": "attribute", + "attribute": "data-category-id" + } + ], + "fields": [ + { + "name": "category_name", + "selector": "h2.category-title", + "type": "text" + }, + { + "name": "products", + "selector": "div.product", + "type": "nested_list", # Array of complex objects + "fields": [ + { + "name": "name", + "selector": "h3.product-name", + "type": "text" + }, + { + 
"name": "price", + "selector": "span.price", + "type": "text" + }, + { + "name": "details", + "selector": "div.product-details", + "type": "nested", # Single complex object + "fields": [ + { + "name": "brand", + "selector": "span.brand", + "type": "text" + }, + { + "name": "model", + "selector": "span.model", + "type": "text" + } + ] + }, + { + "name": "features", + "selector": "ul.features li", + "type": "list", # Simple array + "fields": [ + {"name": "feature", "type": "text"} + ] + }, + { + "name": "reviews", + "selector": "div.review", + "type": "nested_list", + "fields": [ + { + "name": "reviewer", + "selector": "span.reviewer-name", + "type": "text" + }, + { + "name": "rating", + "selector": "span.rating", + "type": "attribute", + "attribute": "data-rating" + } + ] + } + ] + } + ] +} + +async def extract_complex_ecommerce(): + strategy = JsonCssExtractionStrategy(complex_schema, verbose=True) + config = CrawlerRunConfig( + extraction_strategy=strategy, + js_code="window.scrollTo(0, document.body.scrollHeight);", # Load dynamic content + wait_for="css:.product:nth-child(10)" # Wait for products to load + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/complex-catalog", + config=config + ) + + if result.success: + data = json.loads(result.extracted_content) + for category in data: + print(f"Category: {category['category_name']}") + print(f"Products: {len(category.get('products', []))}") + +asyncio.run(extract_complex_ecommerce()) +``` + +### XPath Alternative (When CSS Isn't Enough) + +```python +from crawl4ai import JsonXPathExtractionStrategy + +# XPath for more complex selections +xpath_schema = { + "name": "News Articles with XPath", + "baseSelector": "//article[@class='news-item']", + "fields": [ + { + "name": "headline", + "selector": ".//h2[contains(@class, 'headline')]", + "type": "text" + }, + { + "name": "author", + "selector": ".//span[@class='author']/text()", + "type": "text" + }, + { + 
"name": "publish_date", + "selector": ".//time/@datetime", + "type": "text" + }, + { + "name": "content", + "selector": ".//div[@class='article-body']//text()", + "type": "text" + } + ] +} + +strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True) +``` + +--- + +## 3. Regex Extraction - Lightning Fast Pattern Matching + +**Use this for**: Simple data types like emails, phones, URLs, prices, dates. + +### Built-in Patterns (Fastest Option) + +```python +import json +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai import RegexExtractionStrategy + +async def extract_common_patterns(): + # Use built-in patterns for common data types + strategy = RegexExtractionStrategy( + pattern=( + RegexExtractionStrategy.Email | + RegexExtractionStrategy.PhoneUS | + RegexExtractionStrategy.Url | + RegexExtractionStrategy.Currency | + RegexExtractionStrategy.DateIso + ) + ) + + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/contact", + config=config + ) + + if result.success: + matches = json.loads(result.extracted_content) + + # Group by pattern type + by_type = {} + for match in matches: + label = match['label'] + if label not in by_type: + by_type[label] = [] + by_type[label].append(match['value']) + + for pattern_type, values in by_type.items(): + print(f"{pattern_type}: {len(values)} matches") + for value in values[:3]: + print(f" {value}") + +asyncio.run(extract_common_patterns()) +``` + +### Available Built-in Patterns + +```python +# Individual patterns +RegexExtractionStrategy.Email # Email addresses +RegexExtractionStrategy.PhoneUS # US phone numbers +RegexExtractionStrategy.PhoneIntl # International phones +RegexExtractionStrategy.Url # HTTP/HTTPS URLs +RegexExtractionStrategy.Currency # Currency values ($99.99) +RegexExtractionStrategy.Percentage # Percentage values (25%) +RegexExtractionStrategy.DateIso # ISO 
dates (2024-01-01) +RegexExtractionStrategy.DateUS # US dates (01/01/2024) +RegexExtractionStrategy.IPv4 # IP addresses +RegexExtractionStrategy.CreditCard # Credit card numbers +RegexExtractionStrategy.TwitterHandle # @username +RegexExtractionStrategy.Hashtag # #hashtag + +# Use all patterns +RegexExtractionStrategy.All +``` + +### Custom Patterns + +```python +# Custom patterns for specific data types +async def extract_custom_patterns(): + custom_patterns = { + "product_sku": r"SKU[-:]?\s*([A-Z0-9]{4,12})", + "discount": r"(\d{1,2})%\s*off", + "model_number": r"Model\s*#?\s*([A-Z0-9-]+)", + "isbn": r"ISBN[-:]?\s*(\d{10}|\d{13})", + "stock_ticker": r"\$([A-Z]{2,5})", + "version": r"v(\d+\.\d+(?:\.\d+)?)" + } + + strategy = RegexExtractionStrategy(custom=custom_patterns) + config = CrawlerRunConfig(extraction_strategy=strategy) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/products", + config=config + ) + + if result.success: + data = json.loads(result.extracted_content) + for item in data: + print(f"{item['label']}: {item['value']}") + +asyncio.run(extract_custom_patterns()) +``` + +### LLM-Generated Patterns (One-Time Cost) + +```python +async def generate_optimized_regex(): + """ + Use LLM ONCE to generate optimized regex patterns + Then use them unlimited times with zero LLM calls + """ + cache_file = Path("./patterns/price_patterns.json") + + if cache_file.exists(): + # Load cached patterns - NO LLM CALLS + patterns = json.load(cache_file.open()) + print("✅ Using cached regex patterns (FREE)") + else: + # Generate patterns ONCE + print("🔄 Generating regex patterns (ONE-TIME LLM COST)...") + + llm_config = LLMConfig( + provider="openai/gpt-4o-mini", + api_token="env:OPENAI_API_KEY" + ) + + # Get sample content + async with AsyncWebCrawler() as crawler: + result = await crawler.arun("https://example.com/pricing") + sample_html = result.cleaned_html + + # Generate optimized patterns + patterns = 
RegexExtractionStrategy.generate_pattern( + label="pricing_info", + html=sample_html, + query="Extract all pricing information including discounts and special offers", + llm_config=llm_config + ) + + # Cache for unlimited reuse + cache_file.parent.mkdir(exist_ok=True) + json.dump(patterns, cache_file.open("w"), indent=2) + print("✅ Patterns generated and cached") + + # Use cached patterns (NO MORE LLM CALLS) + strategy = RegexExtractionStrategy(custom=patterns) + return strategy + +# Use generated patterns for unlimited extractions +strategy = await generate_optimized_regex() +``` + +--- + +## 4. Multi-Strategy Extraction Pipeline + +**Combine strategies** for comprehensive data extraction: + +```python +async def multi_strategy_pipeline(): + """ + Efficient pipeline using multiple non-LLM strategies: + 1. Regex for simple patterns (fastest) + 2. Schema for structured data + 3. Only use LLM if absolutely necessary + """ + + url = "https://example.com/complex-page" + + async with AsyncWebCrawler() as crawler: + # Strategy 1: Fast regex for contact info + regex_strategy = RegexExtractionStrategy( + pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS + ) + regex_config = CrawlerRunConfig(extraction_strategy=regex_strategy) + regex_result = await crawler.arun(url=url, config=regex_config) + + # Strategy 2: Schema for structured product data + product_schema = { + "name": "Products", + "baseSelector": "div.product", + "fields": [ + {"name": "name", "selector": "h3", "type": "text"}, + {"name": "price", "selector": ".price", "type": "text"} + ] + } + css_strategy = JsonCssExtractionStrategy(product_schema) + css_config = CrawlerRunConfig(extraction_strategy=css_strategy) + css_result = await crawler.arun(url=url, config=css_config) + + # Combine results + results = { + "contacts": json.loads(regex_result.extracted_content) if regex_result.success else [], + "products": json.loads(css_result.extracted_content) if css_result.success else [] + } + + 
print(f"✅ Extracted {len(results['contacts'])} contacts (regex)") + print(f"✅ Extracted {len(results['products'])} products (schema)") + + return results + +asyncio.run(multi_strategy_pipeline()) +``` + +--- + +## 5. Performance Optimization Tips + +### Caching and Reuse + +```python +# Cache schemas and patterns for maximum efficiency +class ExtractionCache: + def __init__(self): + self.schemas = {} + self.patterns = {} + + def get_schema(self, site_name): + if site_name not in self.schemas: + schema_file = Path(f"./cache/{site_name}_schema.json") + if schema_file.exists(): + self.schemas[site_name] = json.load(schema_file.open()) + return self.schemas.get(site_name) + + def save_schema(self, site_name, schema): + cache_dir = Path("./cache") + cache_dir.mkdir(exist_ok=True) + schema_file = cache_dir / f"{site_name}_schema.json" + json.dump(schema, schema_file.open("w"), indent=2) + self.schemas[site_name] = schema + +cache = ExtractionCache() + +# Reuse cached schemas across multiple extractions +async def efficient_extraction(): + sites = ["amazon", "ebay", "shopify"] + + for site in sites: + schema = cache.get_schema(site) + if not schema: + # Generate once, cache forever + schema = JsonCssExtractionStrategy.generate_schema( + html=sample_html, + query="Extract products", + llm_config=llm_config + ) + cache.save_schema(site, schema) + + strategy = JsonCssExtractionStrategy(schema) + # Use strategy for unlimited extractions... 
+``` + +### Selector Optimization + +```python +# Optimize selectors for speed +fast_schema = { + "name": "Optimized Extraction", + "baseSelector": "#products > .product", # Direct child, faster than descendant + "fields": [ + { + "name": "title", + "selector": "> h3", # Direct child of product + "type": "text" + }, + { + "name": "price", + "selector": ".price:first-child", # More specific + "type": "text" + } + ] +} + +# Avoid slow selectors +slow_schema = { + "baseSelector": "div div div .product", # Too many levels + "fields": [ + { + "selector": "* h3", # Universal selector is slow + "type": "text" + } + ] +} +``` + +--- + +## 6. Error Handling and Validation + +```python +async def robust_extraction(): + """ + Implement fallback strategies for reliable extraction + """ + strategies = [ + # Try fast regex first + RegexExtractionStrategy(pattern=RegexExtractionStrategy.Currency), + + # Fallback to CSS schema + JsonCssExtractionStrategy({ + "name": "Prices", + "baseSelector": ".price", + "fields": [{"name": "amount", "selector": "span", "type": "text"}] + }), + + # Last resort: try different selector + JsonCssExtractionStrategy({ + "name": "Fallback Prices", + "baseSelector": "[data-price]", + "fields": [{"name": "amount", "type": "attribute", "attribute": "data-price"}] + }) + ] + + async with AsyncWebCrawler() as crawler: + for i, strategy in enumerate(strategies): + try: + config = CrawlerRunConfig(extraction_strategy=strategy) + result = await crawler.arun(url="https://example.com", config=config) + + if result.success and result.extracted_content: + data = json.loads(result.extracted_content) + if data: # Validate non-empty results + print(f"✅ Success with strategy {i+1}: {strategy.__class__.__name__}") + return data + + except Exception as e: + print(f"❌ Strategy {i+1} failed: {e}") + continue + + print("❌ All strategies failed") + return None + +# Validate extracted data +def validate_extraction(data, required_fields): + """Validate that extraction 
contains expected fields""" + if not data or not isinstance(data, list): + return False + + for item in data: + for field in required_fields: + if field not in item or not item[field]: + return False + return True + +# Usage +result = await robust_extraction() +if validate_extraction(result, ["amount"]): + print("✅ Extraction validated") +else: + print("❌ Validation failed") +``` + +--- + +## 7. Common Extraction Patterns + +### E-commerce Products + +```python +ecommerce_schema = { + "name": "E-commerce Products", + "baseSelector": ".product, [data-product], .item", + "fields": [ + {"name": "title", "selector": "h1, h2, h3, .title, .name", "type": "text"}, + {"name": "price", "selector": ".price, .cost, [data-price]", "type": "text"}, + {"name": "image", "selector": "img", "type": "attribute", "attribute": "src"}, + {"name": "url", "selector": "a", "type": "attribute", "attribute": "href"}, + {"name": "rating", "selector": ".rating, .stars", "type": "text"}, + {"name": "availability", "selector": ".stock, .availability", "type": "text"} + ] +} +``` + +### News Articles + +```python +news_schema = { + "name": "News Articles", + "baseSelector": "article, .article, .post", + "fields": [ + {"name": "headline", "selector": "h1, h2, .headline, .title", "type": "text"}, + {"name": "author", "selector": ".author, .byline, [rel='author']", "type": "text"}, + {"name": "date", "selector": "time, .date, .published", "type": "text"}, + {"name": "content", "selector": ".content, .body, .text", "type": "text"}, + {"name": "category", "selector": ".category, .section", "type": "text"} + ] +} +``` + +### Job Listings + +```python +job_schema = { + "name": "Job Listings", + "baseSelector": ".job, .listing, [data-job]", + "fields": [ + {"name": "title", "selector": ".job-title, h2, h3", "type": "text"}, + {"name": "company", "selector": ".company, .employer", "type": "text"}, + {"name": "location", "selector": ".location, .place", "type": "text"}, + {"name": "salary", "selector": 
".salary, .pay, .compensation", "type": "text"}, + {"name": "description", "selector": ".description, .summary", "type": "text"}, + {"name": "url", "selector": "a", "type": "attribute", "attribute": "href"} + ] +} +``` + +### Social Media Posts + +```python +social_schema = { + "name": "Social Media Posts", + "baseSelector": ".post, .tweet, .update", + "fields": [ + {"name": "username", "selector": ".username, .handle, .author", "type": "text"}, + {"name": "content", "selector": ".content, .text, .message", "type": "text"}, + {"name": "timestamp", "selector": ".time, .date, time", "type": "text"}, + {"name": "likes", "selector": ".likes, .hearts", "type": "text"}, + {"name": "shares", "selector": ".shares, .retweets", "type": "text"} + ] +} +``` + +--- + +## 8. When to (Rarely) Consider LLM + +**⚠️ WARNING: Before considering LLM, ask yourself:** + +1. "Can I identify repeating HTML patterns?" → Use CSS/XPath schema +2. "Am I extracting simple data types?" → Use Regex patterns +3. "Can I provide a JSON example of what I want?" → Use generate_schema() +4. "Is this truly unstructured text requiring semantic understanding?" → Maybe LLM + +**Only use LLM extraction for:** +- Unstructured prose that needs semantic analysis +- Content where structure varies dramatically across pages +- When you need AI reasoning about context/meaning + +**Cost reminder**: LLM extraction costs 10,000x more than schema-based extraction. + +--- + +## 9. Summary: The Extraction Hierarchy + +1. **🥇 FIRST CHOICE**: `generate_schema()` - AI generates pattern once, use unlimited times +2. **🥈 SECOND CHOICE**: Manual CSS/XPath - Full control, maximum speed +3. **🥉 THIRD CHOICE**: Regex patterns - Simple data types, lightning fast +4. **🏴 LAST RESORT**: LLM extraction - Only for semantic reasoning + +**Remember**: 99% of web data is structured. You almost never need LLM for extraction. Save LLM for analysis, not extraction. 
+ +**Performance**: Non-LLM strategies are 100-1000x faster and 10,000x cheaper than LLM extraction. + +--- + +**📖 Next**: If you absolutely must use LLM extraction, see [extraction-llm.md](./extraction-llm.md) for guidance on the rare cases where it's justified. +--- + + +## Multi-URL Crawling + +Concurrent crawling of multiple URLs with intelligent resource management, rate limiting, and real-time monitoring. + +### Basic Multi-URL Crawling + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +# Batch processing (default) - get all results at once +async def batch_crawl(): + urls = [ + "https://example.com/page1", + "https://example.com/page2", + "https://example.com/page3" + ] + + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + stream=False # Default: batch mode + ) + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many(urls, config=config) + + for result in results: + if result.success: + print(f"✅ {result.url}: {len(result.markdown)} chars") + else: + print(f"❌ {result.url}: {result.error_message}") + +# Streaming processing - handle results as they complete +async def streaming_crawl(): + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + stream=True # Enable streaming + ) + + async with AsyncWebCrawler() as crawler: + # Process results as they become available + async for result in await crawler.arun_many(urls, config=config): + if result.success: + print(f"🔥 Just completed: {result.url}") + await process_result_immediately(result) + else: + print(f"❌ Failed: {result.url}") +``` + +### Memory-Adaptive Dispatching + +```python +from crawl4ai import AsyncWebCrawler, MemoryAdaptiveDispatcher, CrawlerMonitor, DisplayMode + +# Automatically manages concurrency based on system memory +async def memory_adaptive_crawl(): + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=80.0, # Pause if memory exceeds 80% + check_interval=1.0, # Check memory every second + 
max_session_permit=15, # Max concurrent tasks + memory_wait_timeout=300.0 # Wait up to 5 minutes for memory + ) + + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + word_count_threshold=50 + ) + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many( + urls=large_url_list, + config=config, + dispatcher=dispatcher + ) + + # Each result includes dispatch information + for result in results: + if result.dispatch_result: + dr = result.dispatch_result + print(f"Memory used: {dr.memory_usage:.1f}MB") + print(f"Duration: {dr.end_time - dr.start_time}") +``` + +### Rate-Limited Crawling + +```python +from crawl4ai import RateLimiter, SemaphoreDispatcher + +# Control request pacing and handle server rate limits +async def rate_limited_crawl(): + rate_limiter = RateLimiter( + base_delay=(1.0, 3.0), # Random delay 1-3 seconds + max_delay=60.0, # Cap backoff at 60 seconds + max_retries=3, # Retry failed requests 3 times + rate_limit_codes=[429, 503] # Handle these status codes + ) + + dispatcher = SemaphoreDispatcher( + max_session_permit=5, # Fixed concurrency limit + rate_limiter=rate_limiter + ) + + config = CrawlerRunConfig( + user_agent_mode="random", # Randomize user agents + simulate_user=True # Simulate human behavior + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun_many( + urls=urls, + config=config, + dispatcher=dispatcher + ): + print(f"Processed: {result.url}") +``` + +### Real-Time Monitoring + +```python +from crawl4ai import CrawlerMonitor, DisplayMode + +# Monitor crawling progress in real-time +async def monitored_crawl(): + monitor = CrawlerMonitor( + max_visible_rows=20, # Show 20 tasks in display + display_mode=DisplayMode.DETAILED # Show individual task details + ) + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=75.0, + max_session_permit=10, + monitor=monitor # Attach monitor to dispatcher + ) + + async with AsyncWebCrawler() as crawler: + results = await 
crawler.arun_many( + urls=urls, + dispatcher=dispatcher + ) +``` + +### Advanced Dispatcher Configurations + +```python +# Memory-adaptive with comprehensive monitoring +memory_dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=85.0, # Higher memory tolerance + check_interval=0.5, # Check memory more frequently + max_session_permit=20, # More concurrent tasks + memory_wait_timeout=600.0, # Wait longer for memory + rate_limiter=RateLimiter( + base_delay=(0.5, 1.5), + max_delay=30.0, + max_retries=5 + ), + monitor=CrawlerMonitor( + max_visible_rows=15, + display_mode=DisplayMode.AGGREGATED # Summary view + ) +) + +# Simple semaphore-based dispatcher +semaphore_dispatcher = SemaphoreDispatcher( + max_session_permit=8, # Fixed concurrency + rate_limiter=RateLimiter( + base_delay=(1.0, 2.0), + max_delay=20.0 + ) +) + +# Usage with custom dispatcher +async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many( + urls=urls, + config=config, + dispatcher=memory_dispatcher # or semaphore_dispatcher + ) +``` + +### Handling Large-Scale Crawling + +```python +async def large_scale_crawl(): + # For thousands of URLs + urls = load_urls_from_file("large_url_list.txt") # 10,000+ URLs + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=70.0, # Conservative memory usage + max_session_permit=25, # Higher concurrency + rate_limiter=RateLimiter( + base_delay=(0.1, 0.5), # Faster for large batches + max_retries=2 # Fewer retries for speed + ), + monitor=CrawlerMonitor(display_mode=DisplayMode.AGGREGATED) + ) + + config = CrawlerRunConfig( + cache_mode=CacheMode.ENABLED, # Use caching for efficiency + stream=True, # Stream for memory efficiency + word_count_threshold=100, # Skip short content + exclude_external_links=True # Reduce processing overhead + ) + + successful_crawls = 0 + failed_crawls = 0 + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun_many( + urls=urls, + config=config, + 
dispatcher=dispatcher + ): + if result.success: + successful_crawls += 1 + await save_result_to_database(result) + else: + failed_crawls += 1 + await log_failure(result.url, result.error_message) + + # Progress reporting + if (successful_crawls + failed_crawls) % 100 == 0: + print(f"Progress: {successful_crawls + failed_crawls}/{len(urls)}") + + print(f"Completed: {successful_crawls} successful, {failed_crawls} failed") +``` + +### Robots.txt Compliance + +```python +async def compliant_crawl(): + config = CrawlerRunConfig( + check_robots_txt=True, # Respect robots.txt + user_agent="MyBot/1.0", # Identify your bot + mean_delay=2.0, # Be polite with delays + max_range=1.0 + ) + + dispatcher = SemaphoreDispatcher( + max_session_permit=3, # Conservative concurrency + rate_limiter=RateLimiter( + base_delay=(2.0, 5.0), # Slower, more respectful + max_retries=1 + ) + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun_many( + urls=urls, + config=config, + dispatcher=dispatcher + ): + if result.success: + print(f"✅ Crawled: {result.url}") + elif "robots.txt" in result.error_message: + print(f"🚫 Blocked by robots.txt: {result.url}") + else: + print(f"❌ Error: {result.url}") +``` + +### Performance Analysis + +```python +async def analyze_crawl_performance(): + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=80.0, + max_session_permit=12, + monitor=CrawlerMonitor(display_mode=DisplayMode.DETAILED) + ) + + start_time = time.time() + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun_many( + urls=urls, + dispatcher=dispatcher + ) + + end_time = time.time() + + # Analyze results + successful = [r for r in results if r.success] + failed = [r for r in results if not r.success] + + print(f"Total time: {end_time - start_time:.2f}s") + print(f"Success rate: {len(successful)}/{len(results)} ({len(successful)/len(results)*100:.1f}%)") + print(f"Avg time per URL: {(end_time - start_time)/len(results):.2f}s") 
+ + # Memory usage analysis + if successful and successful[0].dispatch_result: + memory_usage = [r.dispatch_result.memory_usage for r in successful if r.dispatch_result] + peak_memory = [r.dispatch_result.peak_memory for r in successful if r.dispatch_result] + + print(f"Avg memory usage: {sum(memory_usage)/len(memory_usage):.1f}MB") + print(f"Peak memory usage: {max(peak_memory):.1f}MB") +``` + +### Error Handling and Recovery + +```python +async def robust_multi_crawl(): + failed_urls = [] + + config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + stream=True, + page_timeout=30000 # 30 second timeout + ) + + dispatcher = MemoryAdaptiveDispatcher( + memory_threshold_percent=85.0, + max_session_permit=10 + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun_many( + urls=urls, + config=config, + dispatcher=dispatcher + ): + if result.success: + await process_successful_result(result) + else: + failed_urls.append({ + 'url': result.url, + 'error': result.error_message, + 'status_code': result.status_code + }) + + # Retry logic for specific errors + if result.status_code in [503, 429]: # Server errors + await schedule_retry(result.url) + + # Report failures + if failed_urls: + print(f"Failed to crawl {len(failed_urls)} URLs:") + for failure in failed_urls[:10]: # Show first 10 + print(f" {failure['url']}: {failure['error']}") +``` + +**📖 Learn more:** [Advanced Multi-URL Crawling](https://docs.crawl4ai.com/advanced/multi-url-crawling/), [Crawl Dispatcher](https://docs.crawl4ai.com/advanced/crawl-dispatcher/), [arun_many() API Reference](https://docs.crawl4ai.com/api/arun_many/) +--- + + +## Deep Crawling + +Multi-level website exploration with intelligent filtering, scoring, and prioritization strategies. 
+ +### Basic Deep Crawl Setup + +```python +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig +from crawl4ai.deep_crawling import BFSDeepCrawlStrategy +from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy + +# Basic breadth-first deep crawling +async def basic_deep_crawl(): + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, # Initial page + 2 levels + include_external=False # Stay within same domain + ), + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True + ) + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun("https://docs.crawl4ai.com", config=config) + + # Group results by depth + pages_by_depth = {} + for result in results: + depth = result.metadata.get("depth", 0) + if depth not in pages_by_depth: + pages_by_depth[depth] = [] + pages_by_depth[depth].append(result.url) + + print(f"Crawled {len(results)} pages total") + for depth, urls in sorted(pages_by_depth.items()): + print(f"Depth {depth}: {len(urls)} pages") +``` + +### Deep Crawl Strategies + +```python +from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, DFSDeepCrawlStrategy, BestFirstCrawlingStrategy +from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer + +# Breadth-First Search - explores all links at one depth before going deeper +bfs_strategy = BFSDeepCrawlStrategy( + max_depth=2, + include_external=False, + max_pages=50, # Limit total pages + score_threshold=0.3 # Minimum score for URLs +) + +# Depth-First Search - explores as deep as possible before backtracking +dfs_strategy = DFSDeepCrawlStrategy( + max_depth=2, + include_external=False, + max_pages=30, + score_threshold=0.5 +) + +# Best-First - prioritizes highest scoring pages (recommended) +keyword_scorer = KeywordRelevanceScorer( + keywords=["crawl", "example", "async", "configuration"], + weight=0.7 +) + +best_first_strategy = BestFirstCrawlingStrategy( + max_depth=2, + include_external=False, + url_scorer=keyword_scorer, + 
max_pages=25 # No score_threshold needed - naturally prioritizes +) + +# Usage +config = CrawlerRunConfig( + deep_crawl_strategy=best_first_strategy, # Choose your strategy + scraping_strategy=LXMLWebScrapingStrategy() +) +``` + +### Streaming vs Batch Processing + +```python +# Batch mode - wait for all results +async def batch_deep_crawl(): + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1), + stream=False # Default - collect all results first + ) + + async with AsyncWebCrawler() as crawler: + results = await crawler.arun("https://example.com", config=config) + + # Process all results at once + for result in results: + print(f"Batch processed: {result.url}") + +# Streaming mode - process results as they arrive +async def streaming_deep_crawl(): + config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1), + stream=True # Process results immediately + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun("https://example.com", config=config): + depth = result.metadata.get("depth", 0) + print(f"Stream processed depth {depth}: {result.url}") +``` + +### Filtering with Filter Chains + +```python +from crawl4ai.deep_crawling.filters import ( + FilterChain, + URLPatternFilter, + DomainFilter, + ContentTypeFilter, + SEOFilter, + ContentRelevanceFilter +) + +# Single URL pattern filter +url_filter = URLPatternFilter(patterns=["*core*", "*guide*"]) + +config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=1, + filter_chain=FilterChain([url_filter]) + ) +) + +# Multiple filters in chain +advanced_filter_chain = FilterChain([ + # Domain filtering + DomainFilter( + allowed_domains=["docs.example.com"], + blocked_domains=["old.docs.example.com", "staging.example.com"] + ), + + # URL pattern matching + URLPatternFilter(patterns=["*tutorial*", "*guide*", "*blog*"]), + + # Content type filtering + ContentTypeFilter(allowed_types=["text/html"]), + + # SEO 
quality filter + SEOFilter( + threshold=0.5, + keywords=["tutorial", "guide", "documentation"] + ), + + # Content relevance filter + ContentRelevanceFilter( + query="Web crawling and data extraction with Python", + threshold=0.7 + ) +]) + +config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, + filter_chain=advanced_filter_chain + ) +) +``` + +### Intelligent Crawling with Scorers + +```python +from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer + +# Keyword relevance scoring +async def scored_deep_crawl(): + keyword_scorer = KeywordRelevanceScorer( + keywords=["browser", "crawler", "web", "automation"], + weight=1.0 + ) + + config = CrawlerRunConfig( + deep_crawl_strategy=BestFirstCrawlingStrategy( + max_depth=2, + include_external=False, + url_scorer=keyword_scorer + ), + stream=True, # Recommended with BestFirst + verbose=True + ) + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun("https://docs.crawl4ai.com", config=config): + score = result.metadata.get("score", 0) + depth = result.metadata.get("depth", 0) + print(f"Depth: {depth} | Score: {score:.2f} | {result.url}") +``` + +### Limiting Crawl Size + +```python +# Max pages limitation across strategies +async def limited_crawls(): + # BFS with page limit + bfs_config = CrawlerRunConfig( + deep_crawl_strategy=BFSDeepCrawlStrategy( + max_depth=2, + max_pages=5, # Only crawl 5 pages total + url_scorer=KeywordRelevanceScorer(keywords=["browser", "crawler"], weight=1.0) + ) + ) + + # DFS with score threshold + dfs_config = CrawlerRunConfig( + deep_crawl_strategy=DFSDeepCrawlStrategy( + max_depth=2, + score_threshold=0.7, # Only URLs with scores above 0.7 + max_pages=10, + url_scorer=KeywordRelevanceScorer(keywords=["web", "automation"], weight=1.0) + ) + ) + + # Best-First with both constraints + bf_config = CrawlerRunConfig( + deep_crawl_strategy=BestFirstCrawlingStrategy( + max_depth=2, + max_pages=7, # Automatically gets highest 
scored pages + url_scorer=KeywordRelevanceScorer(keywords=["crawl", "example"], weight=1.0) + ), + stream=True + ) + + async with AsyncWebCrawler() as crawler: + # Use any of the configs + async for result in await crawler.arun("https://docs.crawl4ai.com", config=bf_config): + score = result.metadata.get("score", 0) + print(f"Score: {score:.2f} | {result.url}") +``` + +### Complete Advanced Deep Crawler + +```python +async def comprehensive_deep_crawl(): + # Sophisticated filter chain + filter_chain = FilterChain([ + DomainFilter( + allowed_domains=["docs.crawl4ai.com"], + blocked_domains=["old.docs.crawl4ai.com"] + ), + URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]), + ContentTypeFilter(allowed_types=["text/html"]), + SEOFilter(threshold=0.4, keywords=["crawl", "tutorial", "guide"]) + ]) + + # Multi-keyword scorer + keyword_scorer = KeywordRelevanceScorer( + keywords=["crawl", "example", "async", "configuration", "browser"], + weight=0.8 + ) + + # Complete configuration + config = CrawlerRunConfig( + deep_crawl_strategy=BestFirstCrawlingStrategy( + max_depth=2, + include_external=False, + filter_chain=filter_chain, + url_scorer=keyword_scorer, + max_pages=20 + ), + scraping_strategy=LXMLWebScrapingStrategy(), + stream=True, + verbose=True, + cache_mode=CacheMode.BYPASS + ) + + # Execute and analyze + results = [] + start_time = time.time() + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun("https://docs.crawl4ai.com", config=config): + results.append(result) + score = result.metadata.get("score", 0) + depth = result.metadata.get("depth", 0) + print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}") + + # Performance analysis + duration = time.time() - start_time + avg_score = sum(r.metadata.get('score', 0) for r in results) / len(results) + + print(f"✅ Crawled {len(results)} pages in {duration:.2f}s") + print(f"✅ Average relevance score: {avg_score:.2f}") + + # Depth distribution + depth_counts = {} + for 
result in results: + depth = result.metadata.get("depth", 0) + depth_counts[depth] = depth_counts.get(depth, 0) + 1 + + for depth, count in sorted(depth_counts.items()): + print(f"📊 Depth {depth}: {count} pages") +``` + +### Error Handling and Robustness + +```python +async def robust_deep_crawl(): + config = CrawlerRunConfig( + deep_crawl_strategy=BestFirstCrawlingStrategy( + max_depth=2, + max_pages=15, + url_scorer=KeywordRelevanceScorer(keywords=["guide", "tutorial"]) + ), + stream=True, + page_timeout=30000 # 30 second timeout per page + ) + + successful_pages = [] + failed_pages = [] + + async with AsyncWebCrawler() as crawler: + async for result in await crawler.arun("https://docs.crawl4ai.com", config=config): + if result.success: + successful_pages.append(result) + depth = result.metadata.get("depth", 0) + score = result.metadata.get("score", 0) + print(f"✅ Depth {depth} | Score: {score:.2f} | {result.url}") + else: + failed_pages.append({ + 'url': result.url, + 'error': result.error_message, + 'depth': result.metadata.get("depth", 0) + }) + print(f"❌ Failed: {result.url} - {result.error_message}") + + print(f"📊 Results: {len(successful_pages)} successful, {len(failed_pages)} failed") + + # Analyze failures by depth + if failed_pages: + failure_by_depth = {} + for failure in failed_pages: + depth = failure['depth'] + failure_by_depth[depth] = failure_by_depth.get(depth, 0) + 1 + + print("❌ Failures by depth:") + for depth, count in sorted(failure_by_depth.items()): + print(f" Depth {depth}: {count} failures") +``` + +**📖 Learn more:** [Deep Crawling Guide](https://docs.crawl4ai.com/core/deep-crawling/), [Filter Documentation](https://docs.crawl4ai.com/core/content-selection/), [Scoring Strategies](https://docs.crawl4ai.com/advanced/advanced-features/) +--- + + +## Docker Deployment + +Complete Docker deployment guide with pre-built images, API endpoints, configuration, and MCP integration. 
+ +### Quick Start with Pre-built Images + +```bash +# Pull latest image +docker pull unclecode/crawl4ai:latest + +# Setup LLM API keys +cat > .llm.env << EOL +OPENAI_API_KEY=sk-your-key +ANTHROPIC_API_KEY=your-anthropic-key +GROQ_API_KEY=your-groq-key +GEMINI_API_TOKEN=your-gemini-token +EOL + +# Run with LLM support +docker run -d \ + -p 11235:11235 \ + --name crawl4ai \ + --env-file .llm.env \ + --shm-size=1g \ + unclecode/crawl4ai:latest + +# Basic run (no LLM) +docker run -d \ + -p 11235:11235 \ + --name crawl4ai \ + --shm-size=1g \ + unclecode/crawl4ai:latest + +# Check health +curl http://localhost:11235/health +``` + +### Docker Compose Deployment + +```bash +# Clone and setup +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai +cp deploy/docker/.llm.env.example .llm.env +# Edit .llm.env with your API keys + +# Run pre-built image +IMAGE=unclecode/crawl4ai:latest docker compose up -d + +# Build locally +docker compose up --build -d + +# Build with all features +INSTALL_TYPE=all docker compose up --build -d + +# Build with GPU support +ENABLE_GPU=true docker compose up --build -d + +# Stop service +docker compose down +``` + +### Manual Build with Multi-Architecture + +```bash +# Clone repository +git clone https://github.com/unclecode/crawl4ai.git +cd crawl4ai + +# Build for current architecture +docker buildx build -t crawl4ai-local:latest --load . + +# Build for multiple architectures +docker buildx build --platform linux/amd64,linux/arm64 \ + -t crawl4ai-local:latest --load . + +# Build with specific features +docker buildx build \ + --build-arg INSTALL_TYPE=all \ + --build-arg ENABLE_GPU=false \ + -t crawl4ai-local:latest --load . 
+ +# Run custom build +docker run -d \ + -p 11235:11235 \ + --name crawl4ai-custom \ + --env-file .llm.env \ + --shm-size=1g \ + crawl4ai-local:latest +``` + +### Build Arguments + +```bash +# Available build options: +# INSTALL_TYPE: default|all|torch|transformer +# ENABLE_GPU: true|false +# APP_HOME: install path +# USE_LOCAL: use local source +# GITHUB_REPO: git repo if USE_LOCAL=false +# GITHUB_BRANCH: git branch +docker buildx build \ + --build-arg INSTALL_TYPE=all \ + --build-arg ENABLE_GPU=true \ + --build-arg APP_HOME=/app \ + --build-arg USE_LOCAL=true \ + --build-arg GITHUB_REPO=url \ + --build-arg GITHUB_BRANCH=main \ + -t crawl4ai-custom:latest --load . +``` + +### Core API Endpoints + +```python +# Main crawling endpoints +import requests +import json + +# Basic crawl +payload = { + "urls": ["https://example.com"], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}} +} +response = requests.post("http://localhost:11235/crawl", json=payload) + +# Streaming crawl +payload["crawler_config"]["params"]["stream"] = True +response = requests.post("http://localhost:11235/crawl/stream", json=payload) + +# Health check +response = requests.get("http://localhost:11235/health") + +# API schema +response = requests.get("http://localhost:11235/schema") + +# Metrics (Prometheus format) +response = requests.get("http://localhost:11235/metrics") +``` + +### Specialized Endpoints + +```python +# HTML extraction (preprocessed for schema) +response = requests.post("http://localhost:11235/html", + json={"url": "https://example.com"}) + +# Screenshot capture +response = requests.post("http://localhost:11235/screenshot", json={ + "url": "https://example.com", + "screenshot_wait_for": 2, + "output_path": "/path/to/save/screenshot.png" +}) + +# PDF generation +response = requests.post("http://localhost:11235/pdf", json={ + "url": "https://example.com", + "output_path": "/path/to/save/document.pdf" +}) + +# JavaScript execution +response = 
requests.post("http://localhost:11235/execute_js", json={ + "url": "https://example.com", + "scripts": [ + "return document.title", + "return Array.from(document.querySelectorAll('a')).map(a => a.href)" + ] +}) + +# Markdown generation +response = requests.post("http://localhost:11235/md", json={ + "url": "https://example.com", + "f": "fit", # raw|fit|bm25|llm + "q": "extract main content", # query for filtering + "c": "0" # cache: 0=bypass, 1=use +}) + +# LLM Q&A +response = requests.get("http://localhost:11235/llm/https://example.com?q=What is this page about?") + +# Library context (for AI assistants) +response = requests.get("http://localhost:11235/ask", params={ + "context_type": "all", # code|doc|all + "query": "how to use extraction strategies", + "score_ratio": 0.5, + "max_results": 20 +}) +``` + +### Python SDK Usage + +```python +import asyncio +from crawl4ai.docker_client import Crawl4aiDockerClient +from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode + +async def main(): + async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client: + # Non-streaming crawl + results = await client.crawl( + ["https://example.com"], + browser_config=BrowserConfig(headless=True), + crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS) + ) + + for result in results: + print(f"URL: {result.url}, Success: {result.success}") + print(f"Content length: {len(result.markdown)}") + + # Streaming crawl + stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS) + async for result in await client.crawl( + ["https://example.com", "https://python.org"], + browser_config=BrowserConfig(headless=True), + crawler_config=stream_config + ): + print(f"Streamed: {result.url} - {result.success}") + + # Get API schema + schema = await client.get_schema() + print(f"Schema available: {bool(schema)}") + +asyncio.run(main()) +``` + +### Advanced API Configuration + +```python +# Complex extraction with LLM +payload = { + "urls": 
["https://example.com"], + "browser_config": { + "type": "BrowserConfig", + "params": { + "headless": True, + "viewport": {"type": "dict", "value": {"width": 1200, "height": 800}} + } + }, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "LLMExtractionStrategy", + "params": { + "llm_config": { + "type": "LLMConfig", + "params": { + "provider": "openai/gpt-4o-mini", + "api_token": "env:OPENAI_API_KEY" + } + }, + "schema": { + "type": "dict", + "value": { + "type": "object", + "properties": { + "title": {"type": "string"}, + "content": {"type": "string"} + } + } + }, + "instruction": "Extract title and main content" + } + }, + "markdown_generator": { + "type": "DefaultMarkdownGenerator", + "params": { + "content_filter": { + "type": "PruningContentFilter", + "params": {"threshold": 0.6} + } + } + } + } + } +} + +response = requests.post("http://localhost:11235/crawl", json=payload) +``` + +### CSS Extraction Strategy + +```python +# CSS-based structured extraction +schema = { + "name": "ProductList", + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": "h2", "type": "text"}, + {"name": "price", "selector": ".price", "type": "text"}, + {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"} + ] +} + +payload = { + "urls": ["https://example-shop.com"], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { + "schema": {"type": "dict", "value": schema} + } + } + } + } +} + +response = requests.post("http://localhost:11235/crawl", json=payload) +data = response.json() +extracted = json.loads(data["results"][0]["extracted_content"]) +``` + +### MCP (Model Context Protocol) Integration + +```bash +# Add Crawl4AI as MCP provider to Claude Code +claude mcp add --transport sse c4ai-sse http://localhost:11235/mcp/sse 
+ +# List MCP providers +claude mcp list + +# Test MCP connection +python tests/mcp/test_mcp_socket.py + +# Available MCP endpoints +# SSE: http://localhost:11235/mcp/sse +# WebSocket: ws://localhost:11235/mcp/ws +# Schema: http://localhost:11235/mcp/schema +``` + +Available MCP tools: +- `md` - Generate markdown from web content +- `html` - Extract preprocessed HTML +- `screenshot` - Capture webpage screenshots +- `pdf` - Generate PDF documents +- `execute_js` - Run JavaScript on web pages +- `crawl` - Perform multi-URL crawling +- `ask` - Query Crawl4AI library context + +### Configuration Management + +```yaml +# config.yml structure +app: + title: "Crawl4AI API" + version: "1.0.0" + host: "0.0.0.0" + port: 11235 + timeout_keep_alive: 300 + +llm: + provider: "openai/gpt-4o-mini" + api_key_env: "OPENAI_API_KEY" + +security: + enabled: false + jwt_enabled: false + trusted_hosts: ["*"] + +crawler: + memory_threshold_percent: 95.0 + rate_limiter: + base_delay: [1.0, 2.0] + timeouts: + stream_init: 30.0 + batch_process: 300.0 + pool: + max_pages: 40 + idle_ttl_sec: 1800 + +rate_limiting: + enabled: true + default_limit: "1000/minute" + storage_uri: "memory://" + +logging: + level: "INFO" + format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s" +``` + +### Custom Configuration Deployment + +```bash +# Method 1: Mount custom config +docker run -d -p 11235:11235 \ + --name crawl4ai-custom \ + --env-file .llm.env \ + --shm-size=1g \ + -v $(pwd)/my-config.yml:/app/config.yml \ + unclecode/crawl4ai:latest + +# Method 2: Build with custom config +# Edit deploy/docker/config.yml then build +docker buildx build -t crawl4ai-custom:latest --load . 
+``` + +### Monitoring and Health Checks + +```bash +# Health endpoint +curl http://localhost:11235/health + +# Prometheus metrics +curl http://localhost:11235/metrics + +# Configuration validation +curl -X POST http://localhost:11235/config/dump \ + -H "Content-Type: application/json" \ + -d '{"code": "CrawlerRunConfig(cache_mode=\"BYPASS\", screenshot=True)"}' +``` + +### Playground Interface + +Access the interactive playground at `http://localhost:11235/playground` for: +- Testing configurations with visual interface +- Generating JSON payloads for REST API +- Converting Python config to JSON format +- Testing crawl operations directly in browser + +### Async Job Processing + +```python +# Submit job for async processing +import time + +# Submit crawl job +response = requests.post("http://localhost:11235/crawl/job", json=payload) +task_id = response.json()["task_id"] + +# Poll for completion +while True: + result = requests.get(f"http://localhost:11235/crawl/job/{task_id}") + status = result.json() + + if status["status"] in ["COMPLETED", "FAILED"]: + break + time.sleep(1.5) + +print("Final result:", status) +``` + +### Production Deployment + +```bash +# Production-ready deployment +docker run -d \ + --name crawl4ai-prod \ + --restart unless-stopped \ + -p 11235:11235 \ + --env-file .llm.env \ + --shm-size=2g \ + --memory=8g \ + --cpus=4 \ + -v /path/to/custom-config.yml:/app/config.yml \ + unclecode/crawl4ai:latest +``` + +With Docker Compose for production: + +```yaml +version: '3.8' +services: + crawl4ai: + image: unclecode/crawl4ai:latest + ports: + - "11235:11235" + environment: + - OPENAI_API_KEY=${OPENAI_API_KEY} + volumes: + - ./config.yml:/app/config.yml + shm_size: 2g + deploy: + resources: + limits: + memory: 8G + cpus: '4' + restart: unless-stopped +``` + +### Configuration Validation and JSON Structure + +```python +# Method 1: Create config objects and dump to see expected JSON structure +from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, 
CacheMode +from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy +import json + +# Create browser config and see JSON structure +browser_config = BrowserConfig( + headless=True, + viewport_width=1280, + viewport_height=720, + proxy="http://user:pass@proxy:8080" +) + +# Get JSON structure +browser_json = browser_config.dump() +print("BrowserConfig JSON structure:") +print(json.dumps(browser_json, indent=2)) + +# Create crawler config with extraction strategy +schema = { + "name": "Articles", + "baseSelector": ".article", + "fields": [ + {"name": "title", "selector": "h2", "type": "text"}, + {"name": "content", "selector": ".content", "type": "html"} + ] +} + +crawler_config = CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True, + extraction_strategy=JsonCssExtractionStrategy(schema), + js_code=["window.scrollTo(0, document.body.scrollHeight);"], + wait_for="css:.loaded" +) + +crawler_json = crawler_config.dump() +print("\nCrawlerRunConfig JSON structure:") +print(json.dumps(crawler_json, indent=2)) +``` + +### Reverse Validation - JSON to Objects + +```python +# Method 2: Load JSON back to config objects for validation +from crawl4ai.async_configs import from_serializable_dict + +# Test JSON structure by converting back to objects +test_browser_json = { + "type": "BrowserConfig", + "params": { + "headless": True, + "viewport_width": 1280, + "proxy": "http://user:pass@proxy:8080" + } +} + +try: + # Convert JSON back to object + restored_browser = from_serializable_dict(test_browser_json) + print(f"✅ Valid BrowserConfig: {type(restored_browser)}") + print(f"Headless: {restored_browser.headless}") + print(f"Proxy: {restored_browser.proxy}") +except Exception as e: + print(f"❌ Invalid BrowserConfig JSON: {e}") + +# Test complex crawler config JSON +test_crawler_json = { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "bypass", + "screenshot": True, + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { 
+ "schema": { + "type": "dict", + "value": { + "name": "Products", + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": "h3", "type": "text"} + ] + } + } + } + } + } +} + +try: + restored_crawler = from_serializable_dict(test_crawler_json) + print(f"✅ Valid CrawlerRunConfig: {type(restored_crawler)}") + print(f"Cache mode: {restored_crawler.cache_mode}") + print(f"Has extraction strategy: {restored_crawler.extraction_strategy is not None}") +except Exception as e: + print(f"❌ Invalid CrawlerRunConfig JSON: {e}") +``` + +### Using Server's /config/dump Endpoint for Validation + +```python +import requests + +# Method 3: Use server endpoint to validate configuration syntax +def validate_config_with_server(config_code: str) -> dict: + """Validate configuration using server's /config/dump endpoint""" + response = requests.post( + "http://localhost:11235/config/dump", + json={"code": config_code} + ) + + if response.status_code == 200: + print("✅ Valid configuration syntax") + return response.json() + else: + print(f"❌ Invalid configuration: {response.status_code}") + print(response.json()) + return None + +# Test valid configuration +valid_config = """ +CrawlerRunConfig( + cache_mode=CacheMode.BYPASS, + screenshot=True, + js_code=["window.scrollTo(0, document.body.scrollHeight);"], + wait_for="css:.content-loaded" +) +""" + +result = validate_config_with_server(valid_config) +if result: + print("Generated JSON structure:") + print(json.dumps(result, indent=2)) + +# Test invalid configuration (should fail) +invalid_config = """ +CrawlerRunConfig( + cache_mode="invalid_mode", + screenshot=True, + js_code=some_function() # This will fail +) +""" + +validate_config_with_server(invalid_config) +``` + +### Configuration Builder Helper + +```python +def build_and_validate_request(urls, browser_params=None, crawler_params=None): + """Helper to build and validate complete request payload""" + + # Create configurations + browser_config = 
BrowserConfig(**(browser_params or {})) + crawler_config = CrawlerRunConfig(**(crawler_params or {})) + + # Build complete request payload + payload = { + "urls": urls if isinstance(urls, list) else [urls], + "browser_config": browser_config.dump(), + "crawler_config": crawler_config.dump() + } + + print("✅ Complete request payload:") + print(json.dumps(payload, indent=2)) + + # Validate by attempting to reconstruct + try: + test_browser = from_serializable_dict(payload["browser_config"]) + test_crawler = from_serializable_dict(payload["crawler_config"]) + print("✅ Payload validation successful") + return payload + except Exception as e: + print(f"❌ Payload validation failed: {e}") + return None + +# Example usage +payload = build_and_validate_request( + urls=["https://example.com"], + browser_params={"headless": True, "viewport_width": 1280}, + crawler_params={ + "cache_mode": CacheMode.BYPASS, + "screenshot": True, + "word_count_threshold": 10 + } +) + +if payload: + # Send to server + response = requests.post("http://localhost:11235/crawl", json=payload) + print(f"Server response: {response.status_code}") +``` + +### Common JSON Structure Patterns + +```python +# Pattern 1: Simple primitive values +simple_config = { + "type": "CrawlerRunConfig", + "params": { + "cache_mode": "bypass", # String enum value + "screenshot": True, # Boolean + "page_timeout": 60000 # Integer + } +} + +# Pattern 2: Nested objects +nested_config = { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "LLMExtractionStrategy", + "params": { + "llm_config": { + "type": "LLMConfig", + "params": { + "provider": "openai/gpt-4o-mini", + "api_token": "env:OPENAI_API_KEY" + } + }, + "instruction": "Extract main content" + } + } + } +} + +# Pattern 3: Dictionary values (must use type: dict wrapper) +dict_config = { + "type": "CrawlerRunConfig", + "params": { + "extraction_strategy": { + "type": "JsonCssExtractionStrategy", + "params": { + "schema": { + "type": "dict", 
# Required wrapper + "value": { # Actual dictionary content + "name": "Products", + "baseSelector": ".product", + "fields": [ + {"name": "title", "selector": "h2", "type": "text"} + ] + } + } + } + } + } +} + +# Pattern 4: Lists and arrays +list_config = { + "type": "CrawlerRunConfig", + "params": { + "js_code": [ # Lists are handled directly + "window.scrollTo(0, document.body.scrollHeight);", + "document.querySelector('.load-more')?.click();" + ], + "excluded_tags": ["script", "style", "nav"] + } +} +``` + +### Troubleshooting Common JSON Errors + +```python +def diagnose_json_errors(): + """Common JSON structure errors and fixes""" + + # ❌ WRONG: Missing type wrapper for objects + wrong_config = { + "browser_config": { + "headless": True # Missing type wrapper + } + } + + # ✅ CORRECT: Proper type wrapper + correct_config = { + "browser_config": { + "type": "BrowserConfig", + "params": { + "headless": True + } + } + } + + # ❌ WRONG: Dictionary without type: dict wrapper + wrong_dict = { + "schema": { + "name": "Products" # Raw dict, should be wrapped + } + } + + # ✅ CORRECT: Dictionary with proper wrapper + correct_dict = { + "schema": { + "type": "dict", + "value": { + "name": "Products" + } + } + } + + # ❌ WRONG: Invalid enum string + wrong_enum = { + "cache_mode": "DISABLED" # Wrong case/value + } + + # ✅ CORRECT: Valid enum string + correct_enum = { + "cache_mode": "bypass" # or "enabled", "disabled", etc. 
+ } + + print("Common error patterns documented above") + +# Validate your JSON structure before sending +def pre_flight_check(payload): + """Run checks before sending to server""" + required_keys = ["urls", "browser_config", "crawler_config"] + + for key in required_keys: + if key not in payload: + print(f"❌ Missing required key: {key}") + return False + + # Check type wrappers + for config_key in ["browser_config", "crawler_config"]: + config = payload[config_key] + if not isinstance(config, dict) or "type" not in config: + print(f"❌ {config_key} missing type wrapper") + return False + if "params" not in config: + print(f"❌ {config_key} missing params") + return False + + print("✅ Pre-flight check passed") + return True + +# Example usage +payload = { + "urls": ["https://example.com"], + "browser_config": {"type": "BrowserConfig", "params": {"headless": True}}, + "crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}} +} + +if pre_flight_check(payload): + # Safe to send to server + pass +``` + +**📖 Learn more:** [Complete Docker Guide](https://docs.crawl4ai.com/core/docker-deployment/), [API Reference](https://docs.crawl4ai.com/api/), [MCP Integration](https://docs.crawl4ai.com/core/docker-deployment/#mcp-model-context-protocol-support), [Configuration Options](https://docs.crawl4ai.com/core/docker-deployment/#server-configuration) +--- + + +## CLI & Identity-Based Browsing + +Command-line interface for web crawling with persistent browser profiles, authentication, and identity management. 
+ +### Basic CLI Usage + +```bash +# Simple crawling +crwl https://example.com + +# Get markdown output +crwl https://example.com -o markdown + +# JSON output with cache bypass +crwl https://example.com -o json --bypass-cache + +# Verbose mode with specific browser settings +crwl https://example.com -b "headless=false,viewport_width=1280" -v +``` + +### Profile Management Commands + +```bash +# Launch interactive profile manager +crwl profiles + +# Create, list, and manage browser profiles +# This opens a menu where you can: +# 1. List existing profiles +# 2. Create new profile (opens browser for setup) +# 3. Delete profiles +# 4. Use profile to crawl a website + +# Use a specific profile for crawling +crwl https://example.com -p my-profile-name + +# Example workflow for authenticated sites: +# 1. Create profile and log in +crwl profiles # Select "Create new profile" +# 2. Use profile for crawling authenticated content +crwl https://site-requiring-login.com/dashboard -p my-profile-name +``` + +### CDP Browser Management + +```bash +# Launch browser with CDP debugging (default port 9222) +crwl cdp + +# Use specific profile and custom port +crwl cdp -p my-profile -P 9223 + +# Launch headless browser with CDP +crwl cdp --headless + +# Launch in incognito mode (ignores profile) +crwl cdp --incognito + +# Use custom user data directory +crwl cdp --user-data-dir ~/my-browser-data --port 9224 +``` + +### Builtin Browser Management + +```bash +# Start persistent browser instance +crwl browser start + +# Check browser status +crwl browser status + +# Open visible window to see the browser +crwl browser view --url https://example.com + +# Stop the browser +crwl browser stop + +# Restart with different options +crwl browser restart --browser-type chromium --port 9223 --no-headless + +# Use builtin browser in crawling +crwl https://example.com -b "browser_mode=builtin" +``` + +### Authentication Workflow Examples + +```bash +# Complete workflow for LinkedIn scraping +# 1. 
Create authenticated profile +crwl profiles +# Select "Create new profile" → login to LinkedIn in browser → press 'q' to save + +# 2. Use profile for crawling +crwl https://linkedin.com/in/someone -p linkedin-profile -o markdown + +# 3. Extract structured data with authentication +crwl https://linkedin.com/search/results/people/ \ + -p linkedin-profile \ + -j "Extract people profiles with names, titles, and companies" \ + -b "headless=false" + +# GitHub authenticated crawling +crwl profiles # Create github-profile +crwl https://github.com/settings/profile -p github-profile + +# Twitter/X authenticated access +crwl profiles # Create twitter-profile +crwl https://twitter.com/home -p twitter-profile -o markdown +``` + +### Advanced CLI Configuration + +```bash +# Complex crawling with multiple configs +crwl https://example.com \ + -B browser.yml \ + -C crawler.yml \ + -e extract_llm.yml \ + -s llm_schema.json \ + -p my-auth-profile \ + -o json \ + -v + +# Quick LLM extraction with authentication +crwl https://private-site.com/dashboard \ + -p auth-profile \ + -j "Extract user dashboard data including metrics and notifications" \ + -b "headless=true,viewport_width=1920" + +# Content filtering with authentication +crwl https://members-only-site.com \ + -p member-profile \ + -f filter_bm25.yml \ + -c "css_selector=.member-content,scan_full_page=true" \ + -o markdown-fit +``` + +### Configuration Files for Identity Browsing + +```yaml +# browser_auth.yml +headless: false +use_managed_browser: true +user_data_dir: "/path/to/profile" +viewport_width: 1280 +viewport_height: 720 +simulate_user: true +override_navigator: true + +# crawler_auth.yml +magic: true +remove_overlay_elements: true +simulate_user: true +wait_for: "css:.authenticated-content" +page_timeout: 60000 +delay_before_return_html: 2 +scan_full_page: true +``` + +### Global Configuration Management + +```bash +# List all configuration settings +crwl config list + +# Set default LLM provider +crwl config set 
DEFAULT_LLM_PROVIDER "anthropic/claude-3-sonnet" +crwl config set DEFAULT_LLM_PROVIDER_TOKEN "your-api-token" + +# Set browser defaults +crwl config set BROWSER_HEADLESS false # Always show browser +crwl config set USER_AGENT_MODE random # Random user agents + +# Enable verbose mode globally +crwl config set VERBOSE true +``` + +### Q&A with Authenticated Content + +```bash +# Ask questions about authenticated content +crwl https://private-dashboard.com -p dashboard-profile \ + -q "What are the key metrics shown in my dashboard?" + +# Multiple questions workflow +crwl https://company-intranet.com -p work-profile -o markdown # View content +crwl https://company-intranet.com -p work-profile \ + -q "Summarize this week's announcements" +crwl https://company-intranet.com -p work-profile \ + -q "What are the upcoming deadlines?" +``` + +### Profile Creation Programmatically + +```python +# Create profiles via Python API +import asyncio +from crawl4ai import BrowserProfiler + +async def create_auth_profile(): + profiler = BrowserProfiler() + + # Create profile interactively (opens browser) + profile_path = await profiler.create_profile("linkedin-auth") + print(f"Profile created at: {profile_path}") + + # List all profiles + profiles = profiler.list_profiles() + for profile in profiles: + print(f"Profile: {profile['name']} at {profile['path']}") + + # Use profile for crawling + from crawl4ai import AsyncWebCrawler, BrowserConfig + + browser_config = BrowserConfig( + headless=True, + use_managed_browser=True, + user_data_dir=profile_path + ) + + async with AsyncWebCrawler(config=browser_config) as crawler: + result = await crawler.arun("https://linkedin.com/feed") + return result + +# asyncio.run(create_auth_profile()) +``` + +### Identity Browsing Best Practices + +```bash +# 1. Create specific profiles for different sites +crwl profiles # Create "linkedin-work" +crwl profiles # Create "github-personal" +crwl profiles # Create "company-intranet" + +# 2. 
Use descriptive profile names +crwl https://site1.com -p site1-admin-account +crwl https://site2.com -p site2-user-account + +# 3. Combine with appropriate browser settings +crwl https://secure-site.com \ + -p secure-profile \ + -b "headless=false,simulate_user=true,magic=true" \ + -c "wait_for=.logged-in-indicator,page_timeout=30000" + +# 4. Test profile before automated crawling +crwl cdp -p test-profile # Manually verify login status +crwl https://test-url.com -p test-profile -v # Verbose test crawl +``` + +### Troubleshooting Authentication Issues + +```bash +# Debug authentication problems +crwl https://auth-site.com -p auth-profile \ + -b "headless=false,verbose=true" \ + -c "verbose=true,page_timeout=60000" \ + -v + +# Check profile status +crwl profiles # List profiles and check creation dates + +# Recreate problematic profiles +crwl profiles # Delete old profile, create new one + +# Test with visible browser +crwl https://problem-site.com -p profile-name \ + -b "headless=false" \ + -c "delay_before_return_html=5" +``` + +### Common Use Cases + +```bash +# Social media monitoring (after authentication) +crwl https://twitter.com/home -p twitter-monitor \ + -j "Extract latest tweets with sentiment and engagement metrics" + +# E-commerce competitor analysis (with account access) +crwl https://competitor-site.com/products -p competitor-account \ + -j "Extract product prices, availability, and descriptions" + +# Company dashboard monitoring +crwl https://company-dashboard.com -p work-profile \ + -c "css_selector=.dashboard-content" \ + -q "What alerts or notifications need attention?" 
+ +# Research data collection (authenticated access) +crwl https://research-platform.com/data -p research-profile \ + -e extract_research.yml \ + -s research_schema.json \ + -o json +``` + +**📖 Learn more:** [Identity-Based Crawling Documentation](https://docs.crawl4ai.com/advanced/identity-based-crawling/), [Browser Profile Management](https://docs.crawl4ai.com/advanced/session-management/), [CLI Examples](https://docs.crawl4ai.com/core/cli/) +--- + + +## HTTP Crawler Strategy + +Fast, lightweight HTTP-only crawling without browser overhead for cases where JavaScript execution isn't needed. + +### Basic HTTP Crawler Setup + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, HTTPCrawlerConfig, CacheMode +from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy +from crawl4ai.async_logger import AsyncLogger + +async def main(): + # Initialize HTTP strategy + http_strategy = AsyncHTTPCrawlerStrategy( + browser_config=HTTPCrawlerConfig( + method="GET", + verify_ssl=True, + follow_redirects=True + ), + logger=AsyncLogger(verbose=True) + ) + + # Use with AsyncWebCrawler + async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler: + result = await crawler.arun("https://example.com") + print(f"Status: {result.status_code}") + print(f"Content: {len(result.html)} chars") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +### HTTP Request Types + +```python +# GET request (default) +http_config = HTTPCrawlerConfig( + method="GET", + headers={"Accept": "application/json"} +) + +# POST with JSON data +http_config = HTTPCrawlerConfig( + method="POST", + json={"key": "value", "data": [1, 2, 3]}, + headers={"Content-Type": "application/json"} +) + +# POST with form data +http_config = HTTPCrawlerConfig( + method="POST", + data={"username": "user", "password": "pass"}, + headers={"Content-Type": "application/x-www-form-urlencoded"} +) + +# Advanced configuration +http_config = HTTPCrawlerConfig( + method="GET", + 
headers={"User-Agent": "Custom Bot/1.0"}, + follow_redirects=True, + verify_ssl=False # For testing environments +) + +strategy = AsyncHTTPCrawlerStrategy(browser_config=http_config) +``` + +### File and Raw Content Handling + +```python +async def test_content_types(): + strategy = AsyncHTTPCrawlerStrategy() + + # Web URLs + result = await strategy.crawl("https://httpbin.org/get") + print(f"Web content: {result.status_code}") + + # Local files + result = await strategy.crawl("file:///path/to/local/file.html") + print(f"File content: {len(result.html)}") + + # Raw HTML content + raw_html = "raw://Content
" + result = await strategy.crawl(raw_html) + print(f"Raw content: {result.html}") + + # Raw content with complex HTML + complex_html = """raw:// + +Paragraph content
+