feat(extraction): add RegexExtractionStrategy for pattern-based extraction

Add new RegexExtractionStrategy for fast, zero-LLM extraction of common data types:

- Built-in patterns for emails, URLs, phones, dates, and more
- Support for custom regex patterns
- LLM-assisted pattern generation utility
- Optimized HTML preprocessing with fit_html field
- Enhanced network response body capture

Breaking changes: None
2025-05-02 21:15:24 +08:00
parent 94e9959fe0
commit 9b5ccac76e
13 changed files with 984 additions and 124 deletions
--- a/docs/examples/hello_world.py
+++ b/docs/examples/hello_world.py
@@ -3,42 +3,19 @@ from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
-    CacheMode,
    DefaultMarkdownGenerator,
    PruningContentFilter,
    CrawlResult
 )

-async def example_cdp():
-    browser_conf = BrowserConfig(
-        headless=False,
-        cdp_url="http://localhost:9223"
-    )
-    crawler_config = CrawlerRunConfig(
-        session_id="test",
-        js_code = """(() => { return {"result": "Hello World!"} })()""",
-        js_only=True
-    )
-    async with AsyncWebCrawler(
-        config=browser_conf,
-        verbose=True,
-    ) as crawler:
-        result : CrawlResult = await crawler.arun(
-            url="https://www.helloworld.org",
-            config=crawler_config,
-        )
-        print(result.js_execution_result)
                   

-async def main():
-    browser_config = BrowserConfig(headless=False, verbose=True)
+async def main():    
+    browser_config = BrowserConfig(headless=True, verbose=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        crawler_config = CrawlerRunConfig(
-            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
-                content_filter=PruningContentFilter(
-                     threshold=0.48, threshold_type="fixed", min_word_threshold=0
-                )
+                content_filter=PruningContentFilter()
            ),
        )
        result : CrawlResult = await crawler.arun(