feat(extraction): add RegexExtractionStrategy for pattern-based extraction
Add new RegexExtractionStrategy for fast, zero-LLM extraction of common data types: - Built-in patterns for emails, URLs, phones, dates, and more - Support for custom regex patterns - LLM-assisted pattern generation utility - Optimized HTML preprocessing with fit_html field - Enhanced network response body capture Breaking changes: None
This commit is contained in:
@@ -36,6 +36,45 @@ LLMExtractionStrategy(
|
||||
)
|
||||
```
|
||||
|
||||
### RegexExtractionStrategy
|
||||
|
||||
Used for fast pattern-based extraction of common entities using regular expressions.
|
||||
|
||||
```python
|
||||
RegexExtractionStrategy(
|
||||
# Pattern Configuration
|
||||
pattern: IntFlag = RegexExtractionStrategy.Nothing, # Bit flags of built-in patterns to use
|
||||
custom: Optional[Dict[str, str]] = None, # Custom pattern dictionary {label: regex}
|
||||
|
||||
# Input Format
|
||||
input_format: str = "fit_html", # "html", "markdown", "text" or "fit_html"
|
||||
)
|
||||
|
||||
# Built-in Patterns as Bit Flags
|
||||
RegexExtractionStrategy.Email # Email addresses
|
||||
RegexExtractionStrategy.PhoneIntl # International phone numbers
|
||||
RegexExtractionStrategy.PhoneUS # US-format phone numbers
|
||||
RegexExtractionStrategy.Url # HTTP/HTTPS URLs
|
||||
RegexExtractionStrategy.IPv4 # IPv4 addresses
|
||||
RegexExtractionStrategy.IPv6 # IPv6 addresses
|
||||
RegexExtractionStrategy.Uuid # UUIDs
|
||||
RegexExtractionStrategy.Currency # Currency values (USD, EUR, etc)
|
||||
RegexExtractionStrategy.Percentage # Percentage values
|
||||
RegexExtractionStrategy.Number # Numeric values
|
||||
RegexExtractionStrategy.DateIso # ISO format dates
|
||||
RegexExtractionStrategy.DateUS # US format dates
|
||||
RegexExtractionStrategy.Time24h # 24-hour format times
|
||||
RegexExtractionStrategy.PostalUS # US postal codes
|
||||
RegexExtractionStrategy.PostalUK # UK postal codes
|
||||
RegexExtractionStrategy.HexColor # HTML hex color codes
|
||||
RegexExtractionStrategy.TwitterHandle # Twitter handles
|
||||
RegexExtractionStrategy.Hashtag # Hashtags
|
||||
RegexExtractionStrategy.MacAddr # MAC addresses
|
||||
RegexExtractionStrategy.Iban # International bank account numbers
|
||||
RegexExtractionStrategy.CreditCard # Credit card numbers
|
||||
RegexExtractionStrategy.All # All available patterns
|
||||
```
|
||||
|
||||
### CosineStrategy
|
||||
|
||||
Used for content similarity-based extraction and clustering.
|
||||
@@ -156,6 +195,55 @@ result = await crawler.arun(
|
||||
data = json.loads(result.extracted_content)
|
||||
```
|
||||
|
||||
### Regex Extraction
|
||||
|
||||
```python
|
||||
import json
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, RegexExtractionStrategy
|
||||
|
||||
# Method 1: Use built-in patterns
|
||||
strategy = RegexExtractionStrategy(
|
||||
pattern = RegexExtractionStrategy.Email | RegexExtractionStrategy.Url
|
||||
)
|
||||
|
||||
# Method 2: Use custom patterns
|
||||
price_pattern = {"usd_price": r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"}
|
||||
strategy = RegexExtractionStrategy(custom=price_pattern)
|
||||
|
||||
# Method 3: Generate pattern with LLM assistance (one-time)
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Get sample HTML first
|
||||
sample_result = await crawler.arun("https://example.com/products")
|
||||
html = sample_result.fit_html
|
||||
|
||||
# Generate regex pattern once
|
||||
pattern = RegexExtractionStrategy.generate_pattern(
|
||||
label="price",
|
||||
html=html,
|
||||
query="Product prices in USD format",
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o-mini")
|
||||
)
|
||||
|
||||
# Save pattern for reuse
|
||||
import json
|
||||
with open("price_pattern.json", "w") as f:
|
||||
json.dump(pattern, f)
|
||||
|
||||
# Use pattern for extraction (no LLM calls)
|
||||
strategy = RegexExtractionStrategy(custom=pattern)
|
||||
result = await crawler.arun(
|
||||
url="https://example.com/products",
|
||||
config=CrawlerRunConfig(extraction_strategy=strategy)
|
||||
)
|
||||
|
||||
# Process results
|
||||
data = json.loads(result.extracted_content)
|
||||
for item in data:
|
||||
print(f"{item['label']}: {item['value']}")
|
||||
```
|
||||
|
||||
### CSS Extraction
|
||||
|
||||
```python
|
||||
@@ -220,12 +308,28 @@ result = await crawler.arun(
|
||||
|
||||
## Best Practices
|
||||
|
||||
1. **Choose the Right Strategy**
|
||||
- Use `LLMExtractionStrategy` for complex, unstructured content
|
||||
- Use `JsonCssExtractionStrategy` for well-structured HTML
|
||||
1. **Choose the Right Strategy**
|
||||
- Use `RegexExtractionStrategy` for common data types like emails, phones, URLs, dates
|
||||
- Use `JsonCssExtractionStrategy` for well-structured HTML with consistent patterns
|
||||
- Use `LLMExtractionStrategy` for complex, unstructured content requiring reasoning
|
||||
- Use `CosineStrategy` for content similarity and clustering
|
||||
|
||||
2. **Optimize Chunking**
|
||||
2. **Strategy Selection Guide**
|
||||
```
|
||||
Is the target data a common type (email/phone/date/URL)?
|
||||
→ RegexExtractionStrategy
|
||||
|
||||
Does the page have consistent HTML structure?
|
||||
→ JsonCssExtractionStrategy or JsonXPathExtractionStrategy
|
||||
|
||||
Is the data semantically complex or unstructured?
|
||||
→ LLMExtractionStrategy
|
||||
|
||||
Need to find content similar to a specific topic?
|
||||
→ CosineStrategy
|
||||
```
|
||||
|
||||
3. **Optimize Chunking**
|
||||
```python
|
||||
# For long documents
|
||||
strategy = LLMExtractionStrategy(
|
||||
@@ -234,7 +338,26 @@ result = await crawler.arun(
|
||||
)
|
||||
```
|
||||
|
||||
3. **Handle Errors**
|
||||
4. **Combine Strategies for Best Performance**
|
||||
```python
|
||||
# First pass: Extract structure with CSS
|
||||
css_strategy = JsonCssExtractionStrategy(product_schema)
|
||||
css_result = await crawler.arun(url, config=CrawlerRunConfig(extraction_strategy=css_strategy))
|
||||
product_data = json.loads(css_result.extracted_content)
|
||||
|
||||
# Second pass: Extract specific fields with regex
|
||||
descriptions = [product["description"] for product in product_data]
|
||||
regex_strategy = RegexExtractionStrategy(
|
||||
pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS,
|
||||
custom={"dimension": r"\d+x\d+x\d+ (?:cm|in)"}
|
||||
)
|
||||
|
||||
# Process descriptions with regex
|
||||
for text in descriptions:
|
||||
matches = regex_strategy.extract("", text) # Direct extraction
|
||||
```
|
||||
|
||||
5. **Handle Errors**
|
||||
```python
|
||||
try:
|
||||
result = await crawler.arun(
|
||||
@@ -247,11 +370,31 @@ result = await crawler.arun(
|
||||
print(f"Extraction failed: {e}")
|
||||
```
|
||||
|
||||
4. **Monitor Performance**
|
||||
6. **Monitor Performance**
|
||||
```python
|
||||
strategy = CosineStrategy(
|
||||
verbose=True, # Enable logging
|
||||
word_count_threshold=20, # Filter short content
|
||||
top_k=5 # Limit results
|
||||
)
|
||||
```
|
||||
|
||||
7. **Cache Generated Patterns**
|
||||
```python
|
||||
# For RegexExtractionStrategy pattern generation
|
||||
import json
|
||||
from pathlib import Path
|
||||
|
||||
cache_dir = Path("./pattern_cache")
|
||||
cache_dir.mkdir(exist_ok=True)
|
||||
pattern_file = cache_dir / "product_pattern.json"
|
||||
|
||||
if pattern_file.exists():
|
||||
with open(pattern_file) as f:
|
||||
pattern = json.load(f)
|
||||
else:
|
||||
# Generate once with LLM
|
||||
pattern = RegexExtractionStrategy.generate_pattern(...)
|
||||
with open(pattern_file, "w") as f:
|
||||
json.dump(pattern, f)
|
||||
```
|
||||
Reference in New Issue
Block a user