feat(extraction): add RegexExtractionStrategy for pattern-based extraction
Add new RegexExtractionStrategy for fast, zero-LLM extraction of common data types: - Built-in patterns for emails, URLs, phones, dates, and more - Support for custom regex patterns - LLM-assisted pattern generation utility - Optimized HTML preprocessing with fit_html field - Enhanced network response body capture Breaking changes: None
This commit is contained in:
@@ -23,7 +23,8 @@ from .extraction_strategy import (
|
||||
CosineStrategy,
|
||||
JsonCssExtractionStrategy,
|
||||
JsonXPathExtractionStrategy,
|
||||
JsonLxmlExtractionStrategy
|
||||
JsonLxmlExtractionStrategy,
|
||||
RegexExtractionStrategy
|
||||
)
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
@@ -105,6 +106,7 @@ __all__ = [
|
||||
"JsonCssExtractionStrategy",
|
||||
"JsonXPathExtractionStrategy",
|
||||
"JsonLxmlExtractionStrategy",
|
||||
"RegexExtractionStrategy",
|
||||
"ChunkingStrategy",
|
||||
"RegexChunking",
|
||||
"DefaultMarkdownGenerator",
|
||||
|
||||
Reference in New Issue
Block a user