Add new RegexExtractionStrategy for fast, zero-LLM extraction of common data types:

- Built-in patterns for emails, URLs, phones, dates, and more
- Support for custom regex patterns
- LLM-assisted pattern generation utility
- Optimized HTML preprocessing with fit_html field
- Enhanced network response body capture

Breaking changes: None
# == File: regex_extraction_quickstart.py ==
"""
Mini quick-start for RegexExtractionStrategy
────────────────────────────────────────────
Three bite-sized demos that parallel the style of *quickstart_examples_set_1.py*:

1. **Default catalog** – scrape a page and pull out e-mails / phones / URLs, etc.
2. **Custom pattern** – add your own regex at instantiation time.
3. **LLM-assisted schema** – ask the model to write a pattern, cache it, then
   run extraction _without_ further LLM calls.

Run the whole thing with::

    python regex_extraction_quickstart.py
"""

import asyncio
import json
from pathlib import Path

from crawl4ai import (
    AsyncWebCrawler,
    CrawlerRunConfig,
    CrawlResult,
    RegexExtractionStrategy,
    LLMConfig,
)
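
# NOTE: in these demos `result.extracted_content` is a JSON string holding a
# list of match dicts; each match carries at least a "label" and a "value" key
# (additional keys may be present depending on the crawl4ai version).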

# ────────────────────────────────────────────────────────────────────────────
# 1. Default-catalog extraction
# ────────────────────────────────────────────────────────────────────────────
async def demo_regex_default() -> None:
    print("\n=== 1. Regex extraction – default patterns ===")

    url = "https://www.iana.org/domains/example"  # page contains e-mail + URLs
    strategy = RegexExtractionStrategy(
        pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.Url
    )
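    # The built-in catalog entries behave as bit-flags, so any subset can be
    # OR-ed together this way; the exact set of built-ins depends on the
    # installed crawl4ai version.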
    config = CrawlerRunConfig(extraction_strategy=strategy)

    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(url, config=config)

    print(f"Fetched {url} - success={result.success}")
    if result.success:
        data = json.loads(result.extracted_content)
        for d in data[:10]:
            print(f"  {d['label']:<12} {d['value']}")
        print(f"... total matches: {len(data)}")
    else:
        print("  !!! crawl failed")


# ────────────────────────────────────────────────────────────────────────────
# 2. Custom pattern override / extension
# ────────────────────────────────────────────────────────────────────────────
async def demo_regex_custom() -> None:
    print("\n=== 2. Regex extraction – custom price pattern ===")

    url = "https://www.apple.com/shop/buy-mac/macbook-pro"
    price_pattern = {"usd_price": r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"}
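    # `custom` takes a {label: regex} mapping; several label/regex pairs can
    # presumably be supplied at once, with each match tagged by its label.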

    strategy = RegexExtractionStrategy(custom=price_pattern)
    config = CrawlerRunConfig(extraction_strategy=strategy)

    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(url, config=config)

    if result.success:
        data = json.loads(result.extracted_content)
        for d in data:
            print(f"  {d['value']}")
        if not data:
            print("  (No prices found - page layout may have changed)")
    else:
        print("  !!! crawl failed")


# ────────────────────────────────────────────────────────────────────────────
# 3. One-shot LLM pattern generation, then fast extraction
# ────────────────────────────────────────────────────────────────────────────
async def demo_regex_generate_pattern() -> None:
    print("\n=== 3. generate_pattern → regex extraction ===")

    cache_dir = Path(__file__).parent / "tmp"
    cache_dir.mkdir(exist_ok=True)
    pattern_file = cache_dir / "price_pattern.json"

    url = "https://www.lazada.sg/tag/smartphone/"

    # ── 3-A. build or load the cached pattern
    if pattern_file.exists():
        pattern = json.loads(pattern_file.read_text(encoding="utf-8"))
        print("Loaded cached pattern:", pattern)
    else:
        print("Generating pattern via LLM…")

        llm_cfg = LLMConfig(
            provider="openai/gpt-4o-mini",
            api_token="env:OPENAI_API_KEY",
        )
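        # The "env:" prefix tells crawl4ai to read the token from the
        # OPENAI_API_KEY environment variable instead of hard-coding it here.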

        # pull one sample page as HTML context
        async with AsyncWebCrawler() as crawler:
            html = (await crawler.arun(url)).fit_html

        pattern = RegexExtractionStrategy.generate_pattern(
            label="price",
            html=html,
            query="Prices in Singapore dollars (e.g. $1,299.00 or $200)",
            llm_config=llm_cfg,
        )
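        # `pattern` is a JSON-serializable mapping: it is cached to disk below
        # and later passed as `custom=...`, i.e. the same {label: regex} shape
        # used in demo 2.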

        pattern_file.write_text(json.dumps(pattern, indent=2), encoding="utf-8")
        print("Saved pattern:", pattern_file)

    # ── 3-B. extraction pass – zero LLM calls
    strategy = RegexExtractionStrategy(custom=pattern)
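    # delay_before_return_html gives the JS-heavy listing a few seconds to
    # render prices before the HTML snapshot is captured.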
    config = CrawlerRunConfig(extraction_strategy=strategy, delay_before_return_html=3)

    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(url, config=config)

    if result.success:
        data = json.loads(result.extracted_content)
        for d in data[:15]:
            print(f"  {d['value']}")
        print(f"... total matches: {len(data)}")
    else:
        print("  !!! crawl failed")


# ────────────────────────────────────────────────────────────────────────────
# Entrypoint
# ────────────────────────────────────────────────────────────────────────────
async def main() -> None:
    await demo_regex_default()
    await demo_regex_custom()
    await demo_regex_generate_pattern()


if __name__ == "__main__":
    asyncio.run(main())