## Extraction Strategies
Extract structured data from web pages using LLM-based intelligent parsing or fast schema- and pattern-based approaches.
### LLM-Based Extraction - Intelligent Content Understanding
```python
import os
import asyncio
import json
from pydantic import BaseModel, Field
from typing import List
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy

# Define structured data model
class Product(BaseModel):
    name: str = Field(description="Product name")
    price: str = Field(description="Product price")
    description: str = Field(description="Product description")
    features: List[str] = Field(description="List of product features")
    rating: float = Field(description="Product rating out of 5")

# Configure LLM provider
llm_config = LLMConfig(
    provider="openai/gpt-4o-mini",  # or "ollama/llama3.3", "anthropic/claude-3-5-sonnet"
    api_token=os.getenv("OPENAI_API_KEY"),  # or "env:OPENAI_API_KEY"
    temperature=0.1,
    max_tokens=2000
)

# Create LLM extraction strategy
llm_strategy = LLMExtractionStrategy(
    llm_config=llm_config,
    schema=Product.model_json_schema(),
    extraction_type="schema",  # or "block" for freeform text
    instruction="""
    Extract product information from the webpage content.
    Focus on finding complete product details including:
    - Product name and price
    - Detailed description
    - All listed features
    - Customer rating if available
    Return a valid JSON array of products.
    """,
    chunk_token_threshold=1200,  # Split content if too large
    overlap_rate=0.1,  # 10% overlap between chunks
    apply_chunking=True,  # Enable automatic chunking
    input_format="markdown",  # "html", "fit_markdown", or "markdown"
    extra_args={"temperature": 0.0, "max_tokens": 800},
    verbose=True
)

async def extract_with_llm():
    browser_config = BrowserConfig(headless=True)
    crawl_config = CrawlerRunConfig(
        extraction_strategy=llm_strategy,
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=10
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://example.com/products",
            config=crawl_config
        )

        if result.success:
            # Parse extracted JSON
            products = json.loads(result.extracted_content)
            print(f"Extracted {len(products)} products")

            for product in products[:3]:  # Show first 3
                print(f"Product: {product['name']}")
                print(f"Price: {product['price']}")
                print(f"Rating: {product.get('rating', 'N/A')}")

            # Show token usage and cost
            llm_strategy.show_usage()
        else:
            print(f"Extraction failed: {result.error_message}")

asyncio.run(extract_with_llm())
```
### LLM Strategy Advanced Configuration
```python
# Multiple provider configurations
providers = {
    "openai": LLMConfig(
        provider="openai/gpt-4o",
        api_token="env:OPENAI_API_KEY",
        temperature=0.1
    ),
    "anthropic": LLMConfig(
        provider="anthropic/claude-3-5-sonnet-20240620",
        api_token="env:ANTHROPIC_API_KEY",
        max_tokens=4000
    ),
    "ollama": LLMConfig(
        provider="ollama/llama3.3",
        api_token=None,  # Not needed for Ollama
        base_url="http://localhost:11434"
    ),
    "groq": LLMConfig(
        provider="groq/llama3-70b-8192",
        api_token="env:GROQ_API_KEY"
    )
}

# Advanced chunking for large content
large_content_strategy = LLMExtractionStrategy(
    llm_config=providers["openai"],
    schema=YourModel.model_json_schema(),  # YourModel: your own Pydantic model
    extraction_type="schema",
    instruction="Extract detailed information...",
    # Chunking parameters
    chunk_token_threshold=2000,  # Larger chunks for complex content
    overlap_rate=0.15,  # More overlap for context preservation
    apply_chunking=True,
    # Input format selection
    input_format="fit_markdown",  # Use filtered content if available
    # LLM parameters
    extra_args={
        "temperature": 0.0,  # Deterministic output
        "top_p": 0.9,
        "frequency_penalty": 0.1,
        "presence_penalty": 0.1,
        "max_tokens": 1500
    },
    verbose=True
)

# Knowledge graph extraction
class Entity(BaseModel):
    name: str
    type: str  # "person", "organization", "location", etc.
    description: str

class Relationship(BaseModel):
    source: str
    target: str
    relationship: str
    confidence: float

class KnowledgeGraph(BaseModel):
    entities: List[Entity]
    relationships: List[Relationship]
    summary: str

knowledge_strategy = LLMExtractionStrategy(
    llm_config=providers["anthropic"],
    schema=KnowledgeGraph.model_json_schema(),
    extraction_type="schema",
    instruction="""
    Create a knowledge graph from the content by:
    1. Identifying key entities (people, organizations, locations, concepts)
    2. Finding relationships between entities
    3. Providing confidence scores for relationships
    4. Summarizing the main topics
    """,
    input_format="html",  # Use HTML for better structure preservation
    apply_chunking=True,
    chunk_token_threshold=1500
)
```
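The `knowledge_strategy` above is configured but never executed. As a minimal sketch (assuming the imports from the first example and a placeholder URL), it plugs into `CrawlerRunConfig` like any other strategy; note that with chunking enabled the extracted JSON may be a list of partial graphs rather than a single object:
```python
# Minimal sketch: run the knowledge-graph strategy defined above.
# Assumes the imports from the first example; the URL is a placeholder.
async def extract_knowledge_graph():
    config = CrawlerRunConfig(
        extraction_strategy=knowledge_strategy,
        cache_mode=CacheMode.BYPASS
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com/article", config=config)
        if result.success:
            data = json.loads(result.extracted_content)
            # With chunking, the result may be a list of partial graphs
            graphs = data if isinstance(data, list) else [data]
            for graph in graphs:
                print(f"Entities: {len(graph.get('entities', []))}, "
                      f"relationships: {len(graph.get('relationships', []))}")

asyncio.run(extract_knowledge_graph())
```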
### JSON CSS Extraction - Fast Schema-Based Extraction
```python
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

# Basic CSS extraction schema
simple_schema = {
    "name": "Product Listings",
    "baseSelector": "div.product-card",
    "fields": [
        {
            "name": "title",
            "selector": "h2.product-title",
            "type": "text"
        },
        {
            "name": "price",
            "selector": ".price",
            "type": "text"
        },
        {
            "name": "image_url",
            "selector": "img.product-image",
            "type": "attribute",
            "attribute": "src"
        },
        {
            "name": "product_url",
            "selector": "a.product-link",
            "type": "attribute",
            "attribute": "href"
        }
    ]
}

# Complex nested schema with multiple data types
complex_schema = {
    "name": "E-commerce Product Catalog",
    "baseSelector": "div.category",
    "baseFields": [
        {
            "name": "category_id",
            "type": "attribute",
            "attribute": "data-category-id"
        },
        {
            "name": "category_url",
            "type": "attribute",
            "attribute": "data-url"
        }
    ],
    "fields": [
        {
            "name": "category_name",
            "selector": "h2.category-title",
            "type": "text"
        },
        {
            "name": "products",
            "selector": "div.product",
            "type": "nested_list",  # Array of complex objects
            "fields": [
                {
                    "name": "name",
                    "selector": "h3.product-name",
                    "type": "text",
                    "default": "Unknown Product"
                },
                {
                    "name": "price",
                    "selector": "span.price",
                    "type": "text"
                },
                {
                    "name": "details",
                    "selector": "div.product-details",
                    "type": "nested",  # Single complex object
                    "fields": [
                        {
                            "name": "brand",
                            "selector": "span.brand",
                            "type": "text"
                        },
                        {
                            "name": "model",
                            "selector": "span.model",
                            "type": "text"
                        },
                        {
                            "name": "specs",
                            "selector": "div.specifications",
                            "type": "html"  # Preserve HTML structure
                        }
                    ]
                },
                {
                    "name": "features",
                    "selector": "ul.features li",
                    "type": "list",  # Simple array of strings
                    "fields": [
                        {"name": "feature", "type": "text"}
                    ]
                },
                {
                    "name": "reviews",
                    "selector": "div.review",
                    "type": "nested_list",
                    "fields": [
                        {
                            "name": "reviewer",
                            "selector": "span.reviewer-name",
                            "type": "text"
                        },
                        {
                            "name": "rating",
                            "selector": "span.rating",
                            "type": "attribute",
                            "attribute": "data-rating"
                        },
                        {
                            "name": "comment",
                            "selector": "p.review-text",
                            "type": "text"
                        },
                        {
                            "name": "date",
                            "selector": "time.review-date",
                            "type": "attribute",
                            "attribute": "datetime"
                        }
                    ]
                }
            ]
        }
    ]
}

async def extract_with_css_schema():
    strategy = JsonCssExtractionStrategy(complex_schema, verbose=True)

    config = CrawlerRunConfig(
        extraction_strategy=strategy,
        cache_mode=CacheMode.BYPASS,
        # Enable dynamic content loading if needed
        js_code="window.scrollTo(0, document.body.scrollHeight);",
        wait_for="css:.product:nth-child(10)",  # Wait for products to load
        process_iframes=True
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com/catalog",
            config=config
        )

        if result.success:
            data = json.loads(result.extracted_content)
            print(f"Extracted {len(data)} categories")

            for category in data:
                print(f"Category: {category['category_name']}")
                print(f"Products: {len(category.get('products', []))}")

                # Show first product details
                if category.get('products'):
                    product = category['products'][0]
                    print(f"  First product: {product.get('name')}")
                    print(f"  Features: {len(product.get('features', []))}")
                    print(f"  Reviews: {len(product.get('reviews', []))}")

asyncio.run(extract_with_css_schema())
```
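The flat `simple_schema` defined above is never exercised in the example; a minimal sketch of applying it to a listing page (same imports, placeholder URL) could look like:
```python
# Sketch: use the flat `simple_schema` from above on a listing page.
# Assumes the imports from the block above; the URL is a placeholder.
async def extract_with_simple_schema():
    config = CrawlerRunConfig(
        extraction_strategy=JsonCssExtractionStrategy(simple_schema),
        cache_mode=CacheMode.BYPASS
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com/products", config=config)
        if result.success:
            for item in json.loads(result.extracted_content)[:5]:
                print(item["title"], item["price"])

asyncio.run(extract_with_simple_schema())
```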
### Automatic Schema Generation - One-Time LLM Cost, Unlimited Reuse
```python
import json
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, LLMConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy

async def generate_and_use_schema():
    """
    1. Use LLM once to generate a schema from sample HTML
    2. Cache the schema for reuse
    3. Use the cached schema for fast extraction without LLM calls
    """
    cache_dir = Path("./schema_cache")
    cache_dir.mkdir(exist_ok=True)
    schema_file = cache_dir / "ecommerce_schema.json"

    # Step 1: Generate or load cached schema
    if schema_file.exists():
        schema = json.loads(schema_file.read_text())
        print("Using cached schema")
    else:
        print("Generating schema using LLM...")

        # Configure LLM for schema generation
        llm_config = LLMConfig(
            provider="openai/gpt-4o",  # or "ollama/llama3.3" for local
            api_token="env:OPENAI_API_KEY"
        )

        # Get sample HTML from target site
        async with AsyncWebCrawler() as crawler:
            sample_result = await crawler.arun(
                url="https://example.com/products",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
            )
            sample_html = sample_result.cleaned_html[:5000]  # Use first 5k chars

        # Generate schema automatically (ONE-TIME LLM COST)
        schema = JsonCssExtractionStrategy.generate_schema(
            html=sample_html,
            schema_type="css",
            llm_config=llm_config,
            instruction="Extract product information including name, price, description, and features"
        )

        # Cache schema for future use (NO MORE LLM CALLS)
        schema_file.write_text(json.dumps(schema, indent=2))
        print("Schema generated and cached")

    # Step 2: Use schema for fast extraction (NO LLM CALLS)
    strategy = JsonCssExtractionStrategy(schema, verbose=True)
    config = CrawlerRunConfig(
        extraction_strategy=strategy,
        cache_mode=CacheMode.BYPASS
    )

    # Step 3: Extract from multiple pages using the same schema
    urls = [
        "https://example.com/products",
        "https://example.com/electronics",
        "https://example.com/books"
    ]

    async with AsyncWebCrawler() as crawler:
        for url in urls:
            result = await crawler.arun(url=url, config=config)
            if result.success:
                data = json.loads(result.extracted_content)
                print(f"{url}: Extracted {len(data)} items")
            else:
                print(f"{url}: Failed - {result.error_message}")

asyncio.run(generate_and_use_schema())
```
### XPath Extraction Strategy
```python
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy

# XPath-based schema (alternative to CSS)
xpath_schema = {
    "name": "News Articles",
    "baseSelector": "//article[@class='news-item']",
    "baseFields": [
        {
            "name": "article_id",
            "type": "attribute",
            "attribute": "data-id"
        }
    ],
    "fields": [
        {
            "name": "headline",
            "selector": ".//h2[@class='headline']",
            "type": "text"
        },
        {
            "name": "author",
            "selector": ".//span[@class='author']/text()",
            "type": "text"
        },
        {
            "name": "publish_date",
            "selector": ".//time/@datetime",
            "type": "text"
        },
        {
            "name": "content",
            "selector": ".//div[@class='article-body']",
            "type": "html"
        },
        {
            "name": "tags",
            "selector": ".//div[@class='tags']/span[@class='tag']",
            "type": "list",
            "fields": [
                {"name": "tag", "type": "text"}
            ]
        }
    ]
}

# Generate an XPath schema automatically
async def generate_xpath_schema():
    llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None)

    sample_html = """
    <article class="news-item" data-id="123">
        <h2 class="headline">Breaking News</h2>
        <span class="author">John Doe</span>
        <time datetime="2024-01-01">Today</time>
        <div class="article-body"><p>Content here...</p></div>
    </article>
    """

    schema = JsonXPathExtractionStrategy.generate_schema(
        html=sample_html,
        schema_type="xpath",
        llm_config=llm_config
    )
    return schema

# Use the XPath strategy
xpath_strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
```
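As with the CSS examples, the XPath strategy plugs into `CrawlerRunConfig`; a minimal sketch of running `xpath_strategy` (assuming `AsyncWebCrawler`, `CrawlerRunConfig`, `json`, and `asyncio` are imported as in the earlier blocks, with a placeholder URL) could be:
```python
# Sketch: run the XPath strategy defined above.
# Assumes imports from the earlier examples; the URL is a placeholder.
async def extract_with_xpath():
    config = CrawlerRunConfig(extraction_strategy=xpath_strategy)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com/news", config=config)
        if result.success:
            articles = json.loads(result.extracted_content)
            print(f"Extracted {len(articles)} articles")

asyncio.run(extract_with_xpath())
```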
### Regex Extraction Strategy - Pattern-Based Fast Extraction
```python
import json
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import RegexExtractionStrategy

# Built-in patterns for common data types
async def extract_with_builtin_patterns():
    # Use multiple built-in patterns
    strategy = RegexExtractionStrategy(
        pattern=(
            RegexExtractionStrategy.Email |
            RegexExtractionStrategy.PhoneUS |
            RegexExtractionStrategy.Url |
            RegexExtractionStrategy.Currency |
            RegexExtractionStrategy.DateIso
        )
    )

    config = CrawlerRunConfig(extraction_strategy=strategy)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com/contact",
            config=config
        )

        if result.success:
            matches = json.loads(result.extracted_content)

            # Group by pattern type
            by_type = {}
            for match in matches:
                label = match['label']
                if label not in by_type:
                    by_type[label] = []
                by_type[label].append(match['value'])

            for pattern_type, values in by_type.items():
                print(f"{pattern_type}: {len(values)} matches")
                for value in values[:3]:  # Show first 3
                    print(f"  {value}")

# Custom regex patterns
custom_patterns = {
    "product_code": r"SKU-\d{4,6}",
    "discount": r"\d{1,2}%\s*off",
    "model_number": r"Model:\s*([A-Z0-9-]+)"
}

async def extract_with_custom_patterns():
    strategy = RegexExtractionStrategy(custom=custom_patterns)
    config = CrawlerRunConfig(extraction_strategy=strategy)

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com/products",
            config=config
        )

        if result.success:
            data = json.loads(result.extracted_content)
            for item in data:
                print(f"{item['label']}: {item['value']}")

# LLM-generated patterns (one-time cost)
async def generate_custom_patterns():
    cache_file = Path("./patterns/price_patterns.json")

    if cache_file.exists():
        patterns = json.loads(cache_file.read_text())
    else:
        llm_config = LLMConfig(
            provider="openai/gpt-4o-mini",
            api_token="env:OPENAI_API_KEY"
        )

        # Get sample content
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://example.com/pricing")
            sample_html = result.cleaned_html

        # Generate optimized patterns
        patterns = RegexExtractionStrategy.generate_pattern(
            label="pricing_info",
            html=sample_html,
            query="Extract all pricing information including discounts and special offers",
            llm_config=llm_config
        )

        # Cache for reuse
        cache_file.parent.mkdir(exist_ok=True)
        cache_file.write_text(json.dumps(patterns, indent=2))

    # Use cached patterns (no more LLM calls)
    strategy = RegexExtractionStrategy(custom=patterns)
    return strategy

asyncio.run(extract_with_builtin_patterns())
asyncio.run(extract_with_custom_patterns())
```
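`generate_custom_patterns()` returns a ready-to-use strategy but is never wired into a crawl above; a minimal sketch (same imports, placeholder URL) could be:
```python
# Sketch: crawl with the LLM-generated (then cached) pattern strategy.
# Assumes the imports from the block above; the URL is a placeholder.
async def extract_with_generated_patterns():
    strategy = await generate_custom_patterns()
    config = CrawlerRunConfig(extraction_strategy=strategy)
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com/pricing", config=config)
        if result.success:
            for item in json.loads(result.extracted_content):
                print(f"{item['label']}: {item['value']}")

asyncio.run(extract_with_generated_patterns())
```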
### Complete Extraction Workflow - Combining Strategies
```python
import asyncio
import json
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import (
    JsonCssExtractionStrategy,
    RegexExtractionStrategy,
    LLMExtractionStrategy
)

async def multi_strategy_extraction():
    """
    Demonstrate using multiple extraction strategies in sequence:
    1. Fast regex for common patterns
    2. Schema-based for structured data
    3. LLM for complex reasoning
    """
    browser_config = BrowserConfig(headless=True)

    # Strategy 1: Fast regex extraction
    regex_strategy = RegexExtractionStrategy(
        pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS
    )

    # Strategy 2: Schema-based structured extraction
    product_schema = {
        "name": "Products",
        "baseSelector": "div.product",
        "fields": [
            {"name": "name", "selector": "h3", "type": "text"},
            {"name": "price", "selector": ".price", "type": "text"},
            {"name": "rating", "selector": ".rating", "type": "attribute", "attribute": "data-rating"}
        ]
    }
    css_strategy = JsonCssExtractionStrategy(product_schema)

    # Strategy 3: LLM for complex analysis
    llm_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"),
        schema={
            "type": "object",
            "properties": {
                "sentiment": {"type": "string"},
                "key_topics": {"type": "array", "items": {"type": "string"}},
                "summary": {"type": "string"}
            }
        },
        extraction_type="schema",
        instruction="Analyze the content sentiment, extract key topics, and provide a summary"
    )

    url = "https://example.com/product-reviews"

    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Extract contact info with regex
        regex_config = CrawlerRunConfig(extraction_strategy=regex_strategy)
        regex_result = await crawler.arun(url=url, config=regex_config)

        # Extract structured product data
        css_config = CrawlerRunConfig(extraction_strategy=css_strategy)
        css_result = await crawler.arun(url=url, config=css_config)

        # Extract insights with LLM
        llm_run_config = CrawlerRunConfig(extraction_strategy=llm_strategy)
        llm_result = await crawler.arun(url=url, config=llm_run_config)

        # Combine results
        results = {
            "contacts": json.loads(regex_result.extracted_content) if regex_result.success else [],
            "products": json.loads(css_result.extracted_content) if css_result.success else [],
            "analysis": json.loads(llm_result.extracted_content) if llm_result.success else {}
        }

        print(f"Found {len(results['contacts'])} contact entries")
        print(f"Found {len(results['products'])} products")
        print(f"Sentiment: {results['analysis'].get('sentiment', 'N/A')}")

        return results

# Performance comparison
async def compare_extraction_performance():
    """Compare speed and accuracy of different strategies"""
    import time

    url = "https://example.com/large-catalog"

    strategies = {
        "regex": RegexExtractionStrategy(pattern=RegexExtractionStrategy.Currency),
        "css": JsonCssExtractionStrategy({
            "name": "Prices",
            "baseSelector": ".price",
            "fields": [{"name": "amount", "selector": "span", "type": "text"}]
        }),
        "llm": LLMExtractionStrategy(
            llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"),
            instruction="Extract all prices from the content",
            extraction_type="block"
        )
    }

    async with AsyncWebCrawler() as crawler:
        for name, strategy in strategies.items():
            start_time = time.time()

            config = CrawlerRunConfig(extraction_strategy=strategy)
            result = await crawler.arun(url=url, config=config)

            duration = time.time() - start_time

            if result.success:
                data = json.loads(result.extracted_content)
                print(f"{name}: {len(data)} items in {duration:.2f}s")
            else:
                print(f"{name}: Failed in {duration:.2f}s")

asyncio.run(multi_strategy_extraction())
asyncio.run(compare_extraction_performance())
```
### Best Practices and Strategy Selection
```python
# Strategy selection guide
# Reuses imports and simple_schema from the earlier examples
def choose_extraction_strategy(use_case):
    """
    Guide for selecting the right extraction strategy
    """
    strategies = {
        # Fast pattern matching for common data types
        "contact_info": RegexExtractionStrategy(
            pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS
        ),
        # Structured data from consistent HTML
        "product_catalogs": JsonCssExtractionStrategy,
        # Complex reasoning and semantic understanding
        "content_analysis": LLMExtractionStrategy,
        # Mixed approach for comprehensive extraction
        "complete_site_analysis": "multi_strategy"
    }

    recommendations = {
        "speed_priority": "Use RegexExtractionStrategy for simple patterns, JsonCssExtractionStrategy for structured data",
        "accuracy_priority": "Use LLMExtractionStrategy for complex content, JsonCssExtractionStrategy for predictable structure",
        "cost_priority": "Avoid LLM strategies; use schema generation once, then JsonCssExtractionStrategy",
        "scale_priority": "Cache schemas, use regex for simple patterns, avoid LLM for high-volume extraction"
    }

    return recommendations.get(use_case, "Combine strategies based on content complexity")

# Error handling and validation
async def robust_extraction():
    strategies = [
        RegexExtractionStrategy(pattern=RegexExtractionStrategy.Email),
        JsonCssExtractionStrategy(simple_schema),  # simple_schema from the CSS example above
        # LLM as a fallback for complex cases
    ]

    async with AsyncWebCrawler() as crawler:
        for strategy in strategies:
            try:
                config = CrawlerRunConfig(extraction_strategy=strategy)
                result = await crawler.arun(url="https://example.com", config=config)

                if result.success and result.extracted_content:
                    data = json.loads(result.extracted_content)
                    if data:  # Validate non-empty results
                        print(f"Success with {strategy.__class__.__name__}")
                        return data
            except Exception as e:
                print(f"Strategy {strategy.__class__.__name__} failed: {e}")
                continue

    print("All strategies failed")
    return None
```
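Neither helper above is invoked; a minimal hedged usage sketch (assuming the imports and `simple_schema` from the earlier blocks) might be:
```python
# Sketch: exercise the helpers above. Assumes prior imports and definitions.
print(choose_extraction_strategy("cost_priority"))
data = asyncio.run(robust_extraction())
print(f"Recovered {len(data or [])} items")
```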
**📖 Learn more:** [LLM Strategies Deep Dive](https://docs.crawl4ai.com/extraction/llm-strategies/), [Schema-Based Extraction](https://docs.crawl4ai.com/extraction/no-llm-strategies/), [Regex Patterns](https://docs.crawl4ai.com/extraction/no-llm-strategies/#regexextractionstrategy), [Performance Optimization](https://docs.crawl4ai.com/advanced/multi-url-crawling/)