feat(extraction): add RegexExtractionStrategy for pattern-based extraction

Add new RegexExtractionStrategy for fast, zero-LLM extraction of common data types:

- Built-in patterns for emails, URLs, phones, dates, and more
- Support for custom regex patterns
- LLM-assisted pattern generation utility
- Optimized HTML preprocessing with fit_html field
- Enhanced network response body capture

Breaking changes: None
2025-05-02 21:15:24 +08:00
parent 94e9959fe0
commit 9b5ccac76e
13 changed files with 984 additions and 124 deletions
--- a/crawl4ai/async_webcrawler.py
+++ b/crawl4ai/async_webcrawler.py
@@ -503,6 +503,8 @@ class AsyncWebCrawler:
            tables = media.pop("tables", [])
            links = result.links.model_dump()
            metadata = result.metadata
+            
+        fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)

        ################################
        # Generate Markdown            #
@@ -519,7 +521,7 @@ class AsyncWebCrawler:
        html_source_selector = {
            "raw_html": lambda: html,  # The original raw HTML
            "cleaned_html": lambda: cleaned_html,  # The HTML after scraping strategy
-            "fit_html": lambda: preprocess_html_for_schema(html_content=html),  # Preprocessed raw HTML
+            "fit_html": lambda: fit_html,  # The HTML after preprocessing for schema
        }

        markdown_input_html = cleaned_html  # Default to cleaned_html
@@ -593,6 +595,7 @@ class AsyncWebCrawler:
            content = {
                "markdown": markdown_result.raw_markdown,
                "html": html,
+                "fit_html": fit_html,
                "cleaned_html": cleaned_html,
                "fit_markdown": markdown_result.fit_markdown,
            }.get(content_format, markdown_result.raw_markdown)
@@ -600,7 +603,7 @@ class AsyncWebCrawler:
            # Use IdentityChunking for HTML input, otherwise use provided chunking strategy
            chunking = (
                IdentityChunking()
-                if content_format in ["html", "cleaned_html"]
+                if content_format in ["html", "cleaned_html", "fit_html"]
                else config.chunking_strategy
            )
            sections = chunking.chunk(content)
@@ -624,6 +627,7 @@ class AsyncWebCrawler:
        return CrawlResult(
            url=url,
            html=html,
+            fit_html=fit_html,
            cleaned_html=cleaned_html,
            markdown=markdown_result,
            media=media,