Files
crawl4ai/docs/examples/regex_extraction_quickstart.py
UncleCode 9b5ccac76e feat(extraction): add RegexExtractionStrategy for pattern-based extraction
Add new RegexExtractionStrategy for fast, zero-LLM extraction of common data types:
- Built-in patterns for emails, URLs, phones, dates, and more
- Support for custom regex patterns
- LLM-assisted pattern generation utility
- Optimized HTML preprocessing with fit_html field
- Enhanced network response body capture

Breaking changes: None
2025-05-02 21:15:24 +08:00

144 lines
6.2 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
# == File: regex_extraction_quickstart.py ==
"""
Miniquick-start for RegexExtractionStrategy
────────────────────────────────────────────
3 bite-sized demos that parallel the style of *quickstart_examples_set_1.py*:
1. **Default catalog** scrape a page and pull out e-mails / phones / URLs, etc.
2. **Custom pattern** add your own regex at instantiation time.
3. **LLM-assisted schema** ask the model to write a pattern, cache it, then
run extraction _without_ further LLM calls.
Run the whole thing with::
python regex_extraction_quickstart.py
"""
import os, json, asyncio
from pathlib import Path
from typing import List
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
CrawlResult,
RegexExtractionStrategy,
LLMConfig,
)
# ────────────────────────────────────────────────────────────────────────────
# 1. Default-catalog extraction
# ────────────────────────────────────────────────────────────────────────────
async def demo_regex_default() -> None:
print("\n=== 1. Regex extraction default patterns ===")
url = "https://www.iana.org/domains/example" # has e-mail + URLs
strategy = RegexExtractionStrategy(
pattern = RegexExtractionStrategy.Url | RegexExtractionStrategy.Currency
)
config = CrawlerRunConfig(extraction_strategy=strategy)
async with AsyncWebCrawler() as crawler:
result: CrawlResult = await crawler.arun(url, config=config)
print(f"Fetched {url} - success={result.success}")
if result.success:
data = json.loads(result.extracted_content)
for d in data[:10]:
print(f" {d['label']:<12} {d['value']}")
print(f"... total matches: {len(data)}")
else:
print(" !!! crawl failed")
# ────────────────────────────────────────────────────────────────────────────
# 2. Custom pattern override / extension
# ────────────────────────────────────────────────────────────────────────────
async def demo_regex_custom() -> None:
print("\n=== 2. Regex extraction custom price pattern ===")
url = "https://www.apple.com/shop/buy-mac/macbook-pro"
price_pattern = {"usd_price": r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"}
strategy = RegexExtractionStrategy(custom = price_pattern)
config = CrawlerRunConfig(extraction_strategy=strategy)
async with AsyncWebCrawler() as crawler:
result: CrawlResult = await crawler.arun(url, config=config)
if result.success:
data = json.loads(result.extracted_content)
for d in data:
print(f" {d['value']}")
if not data:
print(" (No prices found - page layout may have changed)")
else:
print(" !!! crawl failed")
# ────────────────────────────────────────────────────────────────────────────
# 3. One-shot LLM pattern generation, then fast extraction
# ────────────────────────────────────────────────────────────────────────────
async def demo_regex_generate_pattern() -> None:
print("\n=== 3. generate_pattern → regex extraction ===")
cache_dir = Path(__file__).parent / "tmp"
cache_dir.mkdir(exist_ok=True)
pattern_file = cache_dir / "price_pattern.json"
url = "https://www.lazada.sg/tag/smartphone/"
# ── 3-A. build or load the cached pattern
if pattern_file.exists():
pattern = json.load(pattern_file.open(encoding="utf-8"))
print("Loaded cached pattern:", pattern)
else:
print("Generating pattern via LLM…")
llm_cfg = LLMConfig(
provider="openai/gpt-4o-mini",
api_token="env:OPENAI_API_KEY",
)
# pull one sample page as HTML context
async with AsyncWebCrawler() as crawler:
html = (await crawler.arun(url)).fit_html
pattern = RegexExtractionStrategy.generate_pattern(
label="price",
html=html,
query="Prices in Malaysian Ringgit (e.g. RM1,299.00 or RM200)",
llm_config=llm_cfg,
)
json.dump(pattern, pattern_file.open("w", encoding="utf-8"), indent=2)
print("Saved pattern:", pattern_file)
# ── 3-B. extraction pass zero LLM calls
strategy = RegexExtractionStrategy(custom=pattern)
config = CrawlerRunConfig(extraction_strategy=strategy, delay_before_return_html=3)
async with AsyncWebCrawler() as crawler:
result: CrawlResult = await crawler.arun(url, config=config)
if result.success:
data = json.loads(result.extracted_content)
for d in data[:15]:
print(f" {d['value']}")
print(f"... total matches: {len(data)}")
else:
print(" !!! crawl failed")
# ────────────────────────────────────────────────────────────────────────────
# Entrypoint
# ────────────────────────────────────────────────────────────────────────────
async def main() -> None:
# await demo_regex_default()
# await demo_regex_custom()
await demo_regex_generate_pattern()
if __name__ == "__main__":
asyncio.run(main())