Files
crawl4ai/docs/details/extraction.md
2024-10-27 19:24:46 +08:00

4.6 KiB

# Extraction Strategies

## 1. LLMExtractionStrategy

LLMExtractionStrategy(
    # Core Parameters
    provider: str = DEFAULT_PROVIDER,  # LLM provider (e.g., "openai/gpt-4", "huggingface/...", "ollama/...")
    api_token: Optional[str] = None,  # API token for the provider
    instruction: Optional[str] = None,  # Custom instruction for extraction
    schema: Optional[Dict] = None,  # Pydantic model schema for structured extraction
    extraction_type: str = "block",  # Type of extraction: "block" or "schema"
    
    # Chunking Parameters
    chunk_token_threshold: int = CHUNK_TOKEN_THRESHOLD,  # Maximum tokens per chunk
    overlap_rate: float = OVERLAP_RATE,  # Overlap between chunks
    word_token_rate: float = WORD_TOKEN_RATE,  # Conversion rate from words to tokens
    apply_chunking: bool = True,  # Whether to apply text chunking
    
    # API Configuration
    base_url: Optional[str] = None,  # Base URL for API calls
    api_base: Optional[str] = None,  # Alternative base URL
    extra_args: Dict = {},  # Additional provider-specific arguments
    
    verbose: bool = False  # Enable verbose logging
)

Usage Example:

class NewsArticle(BaseModel):
    title: str
    content: str

strategy = LLMExtractionStrategy(
    provider="ollama/nemotron",
    api_token="your-token",
    schema=NewsArticle.schema(),
    instruction="Extract news article content with title and main text"
)

result = await crawler.arun(url="https://example.com", extraction_strategy=strategy)

## 2. JsonCssExtractionStrategy

JsonCssExtractionStrategy(
    schema: Dict[str, Any],  # Schema defining extraction rules
    verbose: bool = False  # Enable verbose logging
)

# Schema Structure
schema = {
    "name": str,  # Name of the extraction schema
    "baseSelector": str,  # CSS selector for base elements
    "fields": [
        {
            "name": str,  # Field name
            "selector": str,  # CSS selector
            "type": str,  # Field type: "text", "attribute", "html", "regex", "nested", "list", "nested_list"
            "attribute": str,  # For type="attribute"
            "pattern": str,  # For type="regex"
            "transform": str,  # Optional: "lowercase", "uppercase", "strip"
            "default": Any,  # Default value if extraction fails
            "fields": List[Dict],  # For nested/list types
        }
    ]
}

Usage Example:

schema = {
    "name": "News Articles",
    "baseSelector": "article.news-item",
    "fields": [
        {
            "name": "title",
            "selector": "h1",
            "type": "text",
            "transform": "strip"
        },
        {
            "name": "date",
            "selector": ".date",
            "type": "attribute",
            "attribute": "datetime"
        }
    ]
}

strategy = JsonCssExtractionStrategy(schema)
result = await crawler.arun(url="https://example.com", extraction_strategy=strategy)

## 3. CosineStrategy

CosineStrategy(
    # Content Filtering
    semantic_filter: Optional[str] = None,  # Keyword filter for document filtering
    word_count_threshold: int = 10,  # Minimum words per cluster
    sim_threshold: float = 0.3,  # Similarity threshold for filtering
    
    # Clustering Parameters
    max_dist: float = 0.2,  # Maximum distance for clustering
    linkage_method: str = 'ward',  # Clustering linkage method
    top_k: int = 3,  # Number of top categories to extract
    
    # Model Configuration
    model_name: str = 'sentence-transformers/all-MiniLM-L6-v2',  # Embedding model
    
    verbose: bool = False  # Enable verbose logging
)

# Chunking Strategies

## 1. RegexChunking

RegexChunking(
    patterns: Optional[List[str]] = None  # List of regex patterns for splitting text
    # Default pattern: [r'\n\n']
)

Usage Example:

chunker = RegexChunking(patterns=[r'\n\n', r'\.\s+'])  # Split on double newlines and sentences
chunks = chunker.chunk(text)

## 2. SlidingWindowChunking

SlidingWindowChunking(
    window_size: int = 100,  # Size of the window in words
    step: int = 50,  # Number of words to slide the window
)

Usage Example:

chunker = SlidingWindowChunking(window_size=200, step=100)
chunks = chunker.chunk(text)  # Creates overlapping chunks of 200 words, moving 100 words at a time

## 3. OverlappingWindowChunking

OverlappingWindowChunking(
    window_size: int = 1000,  # Size of each chunk in words
    overlap: int = 100  # Number of words to overlap between chunks
)

Usage Example:

chunker = OverlappingWindowChunking(window_size=500, overlap=50)
chunks = chunker.chunk(text)  # Creates 500-word chunks with 50-word overlap