feat(extraction): add RegexExtractionStrategy for pattern-based extraction
Add new RegexExtractionStrategy for fast, zero-LLM extraction of common data types:
- Built-in patterns for emails, URLs, phone numbers, dates, and more
- Support for custom regex patterns
- LLM-assisted pattern generation utility
- Optimized HTML preprocessing with fit_html field
- Enhanced network response body capture

Breaking changes: None
This commit is contained in:
@@ -571,6 +571,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
|
||||
async def handle_response_capture(response):
|
||||
try:
|
||||
try:
|
||||
# body = await response.body()
|
||||
# json_body = await response.json()
|
||||
text_body = await response.text()
|
||||
except Exception as e:
|
||||
body = None
|
||||
# json_body = None
|
||||
# text_body = None
|
||||
captured_requests.append({
|
||||
"event_type": "response",
|
||||
"url": response.url,
|
||||
@@ -579,7 +587,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
"headers": dict(response.headers), # Convert Header dict
|
||||
"from_service_worker": response.from_service_worker,
|
||||
"request_timing": response.request.timing, # Detailed timing info
|
||||
"timestamp": time.time()
|
||||
"timestamp": time.time(),
|
||||
"body" : {
|
||||
# "raw": body,
|
||||
# "json": json_body,
|
||||
"text": text_body
|
||||
}
|
||||
})
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
|
||||
Reference in New Issue
Block a user