feat(extraction): add RegexExtractionStrategy for pattern-based extraction

Add new RegexExtractionStrategy for fast, zero-LLM extraction of common data types: - Built-in patterns for emails, URLs, phones, dates, and more - Support for custom regex patterns - LLM-assisted pattern generation utility - Optimized HTML preprocessing with fit_html field - Enhanced network response body capture Breaking changes: None
2025-05-02 21:15:24 +08:00
parent 94e9959fe0
commit 9b5ccac76e
13 changed files with 984 additions and 124 deletions
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -571,6 +571,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):

            async def handle_response_capture(response):
                try:
+                    try:
+                        # body = await response.body()
+                        # json_body = await response.json()
+                        text_body = await response.text()
+                    except Exception as e:
+                        body = None
+                        # json_body = None
+                        # text_body = None
                    captured_requests.append({
                        "event_type": "response",
                        "url": response.url,
@@ -579,7 +587,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
                        "headers": dict(response.headers), # Convert Header dict
                        "from_service_worker": response.from_service_worker,
                        "request_timing": response.request.timing, # Detailed timing info
-                        "timestamp": time.time()
+                        "timestamp": time.time(),
+                        "body" : {
+                            # "raw": body,
+                            # "json": json_body,
+                            "text": text_body
+                        }
                    })
                except Exception as e:
                    if self.logger: