feat(extraction): add RegexExtractionStrategy for pattern-based extraction

Add new RegexExtractionStrategy for fast, zero-LLM extraction of common data types:
- Built-in patterns for emails, URLs, phones, dates, and more
- Support for custom regex patterns
- LLM-assisted pattern generation utility
- Optimized HTML preprocessing with fit_html field
- Enhanced network response body capture

Breaking changes: None
This commit is contained in:
UncleCode
2025-05-02 21:15:24 +08:00
parent 94e9959fe0
commit 9b5ccac76e
13 changed files with 984 additions and 124 deletions

View File

@@ -5,6 +5,21 @@ All notable changes to Crawl4AI will be documented in this file.
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
## [0.6.2] - 2025-05-02
### Added
- New `RegexExtractionStrategy` for fast pattern-based extraction without requiring LLM
- Built-in patterns for emails, URLs, phone numbers, dates, and more
- Support for custom regex patterns
- `generate_pattern` utility for LLM-assisted pattern creation (one-time use)
- Added `fit_html` as a top-level field in `CrawlResult` for optimized HTML extraction
- Added support for network response body capture in network request tracking
### Changed
- Updated documentation for no-LLM extraction strategies
- Enhanced API reference to include RegexExtractionStrategy examples and usage
- Improved HTML preprocessing with optimized performance for extraction strategies
## [0.6.1] - 2025-04-24
### Added

View File

@@ -23,7 +23,8 @@ from .extraction_strategy import (
CosineStrategy,
JsonCssExtractionStrategy,
JsonXPathExtractionStrategy,
JsonLxmlExtractionStrategy
JsonLxmlExtractionStrategy,
RegexExtractionStrategy
)
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator
@@ -105,6 +106,7 @@ __all__ = [
"JsonCssExtractionStrategy",
"JsonXPathExtractionStrategy",
"JsonLxmlExtractionStrategy",
"RegexExtractionStrategy",
"ChunkingStrategy",
"RegexChunking",
"DefaultMarkdownGenerator",

View File

@@ -571,6 +571,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
async def handle_response_capture(response):
try:
try:
# body = await response.body()
# json_body = await response.json()
text_body = await response.text()
except Exception as e:
body = None
# json_body = None
# text_body = None
captured_requests.append({
"event_type": "response",
"url": response.url,
@@ -579,7 +587,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"headers": dict(response.headers), # Convert Header dict
"from_service_worker": response.from_service_worker,
"request_timing": response.request.timing, # Detailed timing info
"timestamp": time.time()
"timestamp": time.time(),
"body" : {
# "raw": body,
# "json": json_body,
"text": text_body
}
})
except Exception as e:
if self.logger:

View File

@@ -503,6 +503,8 @@ class AsyncWebCrawler:
tables = media.pop("tables", [])
links = result.links.model_dump()
metadata = result.metadata
fit_html = preprocess_html_for_schema(html_content=html, text_threshold= 500, max_size= 300_000)
################################
# Generate Markdown #
@@ -519,7 +521,7 @@ class AsyncWebCrawler:
html_source_selector = {
"raw_html": lambda: html, # The original raw HTML
"cleaned_html": lambda: cleaned_html, # The HTML after scraping strategy
"fit_html": lambda: preprocess_html_for_schema(html_content=html), # Preprocessed raw HTML
"fit_html": lambda: fit_html, # The HTML after preprocessing for schema
}
markdown_input_html = cleaned_html # Default to cleaned_html
@@ -593,6 +595,7 @@ class AsyncWebCrawler:
content = {
"markdown": markdown_result.raw_markdown,
"html": html,
"fit_html": fit_html,
"cleaned_html": cleaned_html,
"fit_markdown": markdown_result.fit_markdown,
}.get(content_format, markdown_result.raw_markdown)
@@ -600,7 +603,7 @@ class AsyncWebCrawler:
# Use IdentityChunking for HTML input, otherwise use provided chunking strategy
chunking = (
IdentityChunking()
if content_format in ["html", "cleaned_html"]
if content_format in ["html", "cleaned_html", "fit_html"]
else config.chunking_strategy
)
sections = chunking.chunk(content)
@@ -624,6 +627,7 @@ class AsyncWebCrawler:
return CrawlResult(
url=url,
html=html,
fit_html=fit_html,
cleaned_html=cleaned_html,
markdown=markdown_result,
media=media,

View File

@@ -475,7 +475,7 @@ class BrowserProfiler:
self.logger.warning(" No profiles found. Create one first with option 1.", tag="PROFILES")
continue
# Print profile information with colorama formatting
# Print profile information
self.logger.info("\nAvailable profiles:", tag="PROFILES")
for i, profile in enumerate(profiles):
self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")

View File

@@ -1,9 +1,10 @@
from abc import ABC, abstractmethod
import inspect
from typing import Any, List, Dict, Optional
from typing import Any, List, Dict, Optional, Tuple, Pattern, Union
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
import time
from enum import IntFlag, auto
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH, PROMPT_EXTRACT_INFERRED_SCHEMA
from .config import (
@@ -1668,3 +1669,303 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
def _get_element_attribute(self, element, attribute: str):
return element.get(attribute)
"""
RegexExtractionStrategy
Fast, zero-LLM extraction of common entities via regular expressions.
"""
# Map every C0 control character except tab / newline / carriage-return to its
# textual regex escape (chr(1) -> r"\x01").  The keys are *ordinals* because
# ``str.translate`` looks characters up by code point; a table keyed by
# one-character strings is silently ignored.
_CTRL = {
    code: f"\\x{code:02x}"
    for code in range(32)
    if chr(code) not in "\t\n\r"
}
_WB_FIX = re.compile(r"\x08")  # stray back-space → word-boundary
# Lone backslash (not preceded by a backslash, not followed by a backslash or "u").
# This is only meaningful on raw, not-yet-decoded JSON text, so
# ``_sanitize_schema`` does not apply it; the name is kept in case other code
# refers to it.
_NEEDS_ESCAPE = re.compile(r"(?<!\\)\\(?![\\u])")  # lone backslash


def _sanitize_schema(schema: Dict[str, str]) -> Dict[str, str]:
    """Repair control characters that JSON decoding left inside regex patterns.

    A pattern written as ``"\\b"`` in JSON with one backslash too few decodes
    to a back-space character (``chr(8)``) instead of the regex word boundary.
    This helper turns such characters back into their textual regex escapes
    and verifies that each repaired pattern compiles.

    Backslashes are left untouched: in an already-decoded pattern a single
    backslash is a valid regex escape (``\\d``, ``\\s``, ...), and doubling it
    would make the pattern match a literal backslash instead.

    Args:
        schema: Mapping of label -> regex pattern (already decoded strings).

    Returns:
        A new dict with the same labels and the repaired patterns.

    Raises:
        ValueError: If a repaired pattern still fails to compile.
    """
    safe: Dict[str, str] = {}
    for label, pat in schema.items():
        # Back-space first (it means r"\b"), then every other control
        # character via the ordinal-keyed translation table.
        pat = _WB_FIX.sub(r"\\b", pat).translate(_CTRL)
        # Quick sanity compile so a broken pattern is reported with its label.
        try:
            re.compile(pat)
        except re.error as e:
            raise ValueError(f"Regex for '{label}' wont compile after fix: {e}") from None
        safe[label] = pat
    return safe
class RegexExtractionStrategy(ExtractionStrategy):
    """
    A lean strategy that finds e-mails, phones, URLs, dates, money, etc.,
    using nothing but pre-compiled regular expressions.

    Extraction returns a list of dicts shaped like::

        {
            "url": "<page-url>",
            "label": "<pattern-label>",
            "value": "<matched-string>",
            "span": [start, end]
        }

    Only `generate_pattern()` touches an LLM; extraction itself is pure Python.
    """

    # -------------------------------------------------------------- #
    # Built-in patterns exposed as IntFlag so callers can bit-OR them
    # -------------------------------------------------------------- #
    class _B(IntFlag):
        EMAIL = auto()
        PHONE_INTL = auto()
        PHONE_US = auto()
        URL = auto()
        IPV4 = auto()
        IPV6 = auto()
        UUID = auto()
        CURRENCY = auto()
        PERCENTAGE = auto()
        NUMBER = auto()
        DATE_ISO = auto()
        DATE_US = auto()
        TIME_24H = auto()
        POSTAL_US = auto()
        POSTAL_UK = auto()
        HTML_COLOR_HEX = auto()
        TWITTER_HANDLE = auto()
        HASHTAG = auto()
        MAC_ADDR = auto()
        IBAN = auto()
        CREDIT_CARD = auto()
        # NOTHING takes its own bit; no DEFAULT_PATTERNS key maps to it, so
        # selecting it enables no built-in pattern.
        NOTHING = auto()
        ALL = (
            EMAIL | PHONE_INTL | PHONE_US | URL | IPV4 | IPV6 | UUID
            | CURRENCY | PERCENTAGE | NUMBER | DATE_ISO | DATE_US | TIME_24H
            | POSTAL_US | POSTAL_UK | HTML_COLOR_HEX | TWITTER_HANDLE
            | HASHTAG | MAC_ADDR | IBAN | CREDIT_CARD
        )

    # user-friendly aliases (RegexExtractionStrategy.Email, .IPv4, …)
    Email = _B.EMAIL
    PhoneIntl = _B.PHONE_INTL
    PhoneUS = _B.PHONE_US
    Url = _B.URL
    IPv4 = _B.IPV4
    IPv6 = _B.IPV6
    Uuid = _B.UUID
    Currency = _B.CURRENCY
    Percentage = _B.PERCENTAGE
    Number = _B.NUMBER
    DateIso = _B.DATE_ISO
    DateUS = _B.DATE_US
    Time24h = _B.TIME_24H
    PostalUS = _B.POSTAL_US
    PostalUK = _B.POSTAL_UK
    HexColor = _B.HTML_COLOR_HEX
    TwitterHandle = _B.TWITTER_HANDLE
    Hashtag = _B.HASHTAG
    MacAddr = _B.MAC_ADDR
    Iban = _B.IBAN
    CreditCard = _B.CREDIT_CARD
    All = _B.ALL
    Nothing = _B(0)  # no patterns

    # ------------------------------------------------------------------ #
    # Built-in pattern catalog
    # ------------------------------------------------------------------ #
    # Each key, upper-cased, must name a member of ``_B`` (see __init__).
    DEFAULT_PATTERNS: Dict[str, str] = {
        # Communication
        "email": r"[\w.+-]+@[\w-]+\.[\w.-]+",
        "phone_intl": r"\+?\d[\d .()-]{7,}\d",
        # Separator class lists space, dot and dash explicitly; "-" is last so
        # it cannot form a character range.
        "phone_us": r"\(?\d{3}\)?[ .-]?\d{3}[ .-]?\d{4}",
        # Web
        "url": r"https?://[^\s\"'<>]+",
        "ipv4": r"(?:\d{1,3}\.){3}\d{1,3}",
        "ipv6": r"[A-F0-9]{1,4}(?::[A-F0-9]{1,4}){7}",
        # IDs
        "uuid": r"[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}",
        # Money / numbers
        "currency": r"(?:USD|EUR|RM|\$|€|£)\s?\d+(?:[.,]\d{2})?",
        "percentage": r"\d+(?:\.\d+)?%",
        "number": r"\b\d{1,3}(?:[,.\s]\d{3})*(?:\.\d+)?\b",
        # Dates / Times
        "date_iso": r"\d{4}-\d{2}-\d{2}",
        "date_us": r"\d{1,2}/\d{1,2}/\d{2,4}",
        "time_24h": r"\b(?:[01]?\d|2[0-3]):[0-5]\d(?:[:.][0-5]\d)?\b",
        # Misc
        "postal_us": r"\b\d{5}(?:-\d{4})?\b",
        "postal_uk": r"\b[A-Z]{1,2}\d[A-Z\d]? ?\d[A-Z]{2}\b",
        "html_color_hex": r"#[0-9A-Fa-f]{6}\b",
        "twitter_handle": r"@[\w]{1,15}",
        "hashtag": r"#[\w-]+",
        "mac_addr": r"(?:[0-9A-Fa-f]{2}:){5}[0-9A-Fa-f]{2}",
        "iban": r"[A-Z]{2}\d{2}[A-Z0-9]{11,30}",
        "credit_card": r"\b(?:4\d{12}(?:\d{3})?|5[1-5]\d{14}|3[47]\d{13}|6(?:011|5\d{2})\d{12})\b",
    }

    # Flags applied to every compiled pattern, built-in and custom alike.
    _FLAGS = re.IGNORECASE | re.MULTILINE

    # Keyword arguments rejected by generate_pattern(), with the hint to show.
    _UNWANTED_PROPS = {
        "provider": "Use llm_config instead",
        "api_token": "Use llm_config instead",
    }

    # ------------------------------------------------------------------ #
    # Construction
    # ------------------------------------------------------------------ #
    def __init__(
        self,
        pattern: "_B" = _B.NOTHING,
        *,
        custom: Optional[Union[Dict[str, str], List[Tuple[str, str]]]] = None,
        input_format: str = "fit_html",
        **kwargs,
    ) -> None:
        """
        Args:
            pattern: Bit-OR of built-in pattern flags to enable, e.g.
                ``RegexExtractionStrategy.Email | RegexExtractionStrategy.Url``.
                The default enables no built-in pattern.
            custom: Custom patterns overriding or extending the selected
                built-ins. Dict[label, regex] or list[tuple(label, regex)].
            input_format: Content variant to extract from; passed to
                ExtractionStrategy (defaults to "fit_html").
            **kwargs: Forwarded to ExtractionStrategy.

        Raises:
            re.error: If a selected or custom pattern does not compile.
        """
        super().__init__(input_format=input_format, **kwargs)

        # 1. take only the requested built-ins
        merged: Dict[str, str] = {
            key: rx
            for key, rx in self.DEFAULT_PATTERNS.items()
            if getattr(self._B, key.upper()).value & pattern
        }

        # 2. apply user overrides / additions; dict.update accepts both a
        #    mapping and an iterable of (label, regex) pairs
        if custom:
            merged.update(custom)

        self._compiled: Dict[str, Pattern] = {
            lbl: re.compile(rx, self._FLAGS) for lbl, rx in merged.items()
        }

    # ------------------------------------------------------------------ #
    # Extraction
    # ------------------------------------------------------------------ #
    def extract(self, url: str, content: str, *q, **kw) -> List[Dict[str, Any]]:
        """Run every compiled pattern over *content* and collect the matches.

        Args:
            url: Page URL, copied into each result record.
            content: Text to search; it is scanned as given.
            *q, **kw: Accepted and ignored.

        Returns:
            One dict per match with keys "url", "label", "value" and "span"
            (``[start, end]`` offsets into *content*), grouped by pattern in
            the order the patterns were registered.
        """
        return [
            {
                "url": url,
                "label": label,
                "value": m.group(0),
                "span": [m.start(), m.end()],
            }
            for label, cre in self._compiled.items()
            for m in cre.finditer(content)
        ]

    # ------------------------------------------------------------------ #
    # Helpers
    # ------------------------------------------------------------------ #
    def _plain_text(self, content: str) -> str:
        """Return *content* as plain text; not called by ``extract`` itself.

        Content is returned unchanged when ``input_format`` is "text";
        otherwise it is parsed as HTML and its text nodes are joined with
        single spaces.
        """
        if self.input_format == "text":
            return content
        return BeautifulSoup(content, "lxml").get_text(" ", strip=True)

    # ------------------------------------------------------------------ #
    # LLM-assisted one-off pattern builder
    # ------------------------------------------------------------------ #
    @staticmethod
    def generate_pattern(
        label: str,
        html: str,
        *,
        query: Optional[str] = None,
        examples: Optional[List[str]] = None,
        llm_config: Optional[LLMConfig] = None,
        **kwargs,
    ) -> Dict[str, str]:
        """
        Ask an LLM for a single page-specific regex and return
        {label: pattern} ── ready for RegexExtractionStrategy(custom=…)

        Args:
            label: Key the model is told to use for the returned pattern.
            html: Sample HTML; only the first 5000 characters are sent.
                ``None`` is treated as an empty sample.
            query: Optional description of what the pattern should capture.
            examples: Optional example strings; at most 20 are sent.
            llm_config: LLM settings; a default config is created when omitted.
            **kwargs: Passed to the completion call as ``extra_args``.

        Returns:
            The JSON object returned by the model, as a dict of
            label -> regex string.

        Raises:
            AttributeError: If a deprecated keyword (``provider``,
                ``api_token``) is passed.
            ValueError: If the reply is not a JSON object or holds a value
                that is not a compilable regex.
        """
        # ── guard deprecated kwargs
        for k, hint in RegexExtractionStrategy._UNWANTED_PROPS.items():
            if k in kwargs:
                raise AttributeError(f"{k} is deprecated, {hint}")

        # ── default LLM config
        if llm_config is None:
            llm_config = create_llm_config()

        # ── system prompt hardened
        system_msg = (
            "You are an expert Python-regex engineer.\n"
            f"Return **one** JSON object whose single key is exactly \"{label}\", "
            "and whose value is a raw-string regex pattern that works with "
            "the standard `re` module in Python.\n\n"
            "Strict rules (obey every bullet):\n"
            "• If a *user query* is supplied, treat it as the precise semantic target and optimise the "
            " pattern to capture ONLY text that answers that query. If the query conflicts with the "
            " sample HTML, the HTML wins.\n"
            "• Tailor the pattern to the *sample HTML* reproduce its exact punctuation, spacing, "
            " symbols, capitalisation, etc. Do **NOT** invent a generic form.\n"
            "• Keep it minimal and fast: avoid unnecessary capturing, prefer non-capturing `(?: … )`, "
            " and guard against catastrophic backtracking.\n"
            "• Anchor with `^`, `$`, or `\\b` only when it genuinely improves precision.\n"
            "• Use inline flags like `(?i)` when needed; no verbose flag comments.\n"
            "• Output must be valid JSON no markdown, code fences, comments, or extra keys.\n"
            "• The regex value must be a Python string literal: **double every backslash** "
            "(e.g. `\\\\b`, `\\\\d`, `\\\\\\\\`).\n\n"
            "Example valid output:\n"
            f"{{\"{label}\": \"(?:RM|rm)\\\\s?\\\\d{{1,3}}(?:,\\\\d{{3}})*(?:\\\\.\\\\d{{2}})?\"}}"
        )

        # ── user message: cropped HTML + optional hints
        # ``html or ""`` keeps a missing sample from raising on the slice.
        user_parts = ["```html", (html or "")[:5000], "```"]  # protect token budget
        if query:
            user_parts.append(f"\n\n## Query\n{query.strip()}")
        if examples:
            user_parts.append("## Examples\n" + "\n".join(examples[:20]))
        user_msg = "\n\n".join(user_parts)

        # ── LLM call (with retry/backoff)
        resp = perform_completion_with_backoff(
            provider=llm_config.provider,
            prompt_with_variables="\n\n".join([system_msg, user_msg]),
            json_response=True,
            api_token=llm_config.api_token,
            base_url=llm_config.base_url,
            extra_args=kwargs,
        )

        # ── clean & load JSON (fix common escape mistakes *before* json.loads)
        raw = resp.choices[0].message.content
        raw = raw.replace("\x08", "\\b")  # stray back-space → \b
        raw = re.sub(r'(?<!\\)\\(?![\\u"])', r"\\\\", raw)  # lone \ → \\
        try:
            pattern_dict = json.loads(raw)
        except json.JSONDecodeError as exc:
            raise ValueError(f"LLM did not return valid JSON: {raw}") from exc
        if not isinstance(pattern_dict, dict):
            # A JSON array or scalar has no label -> pattern mapping to return.
            raise ValueError(f"LLM did not return valid JSON: {raw}")

        # quick sanity-compile (TypeError covers non-string values)
        for lbl, pat in pattern_dict.items():
            try:
                re.compile(pat)
            except (re.error, TypeError) as e:
                raise ValueError(f"Invalid regex for '{lbl}': {e}") from None
        return pattern_dict

View File

@@ -129,6 +129,7 @@ class MarkdownGenerationResult(BaseModel):
class CrawlResult(BaseModel):
url: str
html: str
fit_html: Optional[str] = None
success: bool
cleaned_html: Optional[str] = None
media: Dict[str, List[Dict]] = {}

View File

@@ -2735,33 +2735,67 @@ def preprocess_html_for_schema(html_content, text_threshold=100, attr_value_thre
# Also truncate tail text if present
if element.tail and len(element.tail.strip()) > text_threshold:
element.tail = element.tail.strip()[:text_threshold] + '...'
# 4. Find repeated patterns and keep only a few examples
# This is a simplistic approach - more sophisticated pattern detection could be implemented
pattern_elements = {}
for element in tree.xpath('//*[contains(@class, "")]'):
parent = element.getparent()
# 4. Detect duplicates and drop them in a single pass
seen: dict[tuple, None] = {}
for el in list(tree.xpath('//*[@class]')): # snapshot once, XPath is fast
parent = el.getparent()
if parent is None:
continue
# Create a signature based on tag and classes
classes = element.get('class', '')
if not classes:
cls = el.get('class')
if not cls:
continue
signature = f"{element.tag}.{classes}"
if signature in pattern_elements:
pattern_elements[signature].append(element)
# ── build signature ───────────────────────────────────────────
h = xxhash.xxh64() # stream, no big join()
for txt in el.itertext():
h.update(txt)
sig = (el.tag, cls, h.intdigest()) # tuple cheaper & hashable
# ── first seen? keep else drop ─────────────
if sig in seen and parent is not None:
parent.remove(el) # duplicate
else:
pattern_elements[signature] = [element]
seen[sig] = None
# Keep only 3 examples of each repeating pattern
for signature, elements in pattern_elements.items():
if len(elements) > 3:
# Keep the first 2 and last elements
for element in elements[2:-1]:
if element.getparent() is not None:
element.getparent().remove(element)
# # 4. Find repeated patterns and keep only a few examples
# # This is a simplistic approach - more sophisticated pattern detection could be implemented
# pattern_elements = {}
# for element in tree.xpath('//*[contains(@class, "")]'):
# parent = element.getparent()
# if parent is None:
# continue
# # Create a signature based on tag and classes
# classes = element.get('class', '')
# if not classes:
# continue
# innert_text = ''.join(element.xpath('.//text()'))
# innert_text_hash = xxhash.xxh64(innert_text.encode()).hexdigest()
# signature = f"{element.tag}.{classes}.{innert_text_hash}"
# if signature in pattern_elements:
# pattern_elements[signature].append(element)
# else:
# pattern_elements[signature] = [element]
# # Keep only first examples of each repeating pattern
# for signature, elements in pattern_elements.items():
# if len(elements) > 1:
# # Keep the first element and remove the rest
# for element in elements[1:]:
# if element.getparent() is not None:
# element.getparent().remove(element)
# # Keep only 3 examples of each repeating pattern
# for signature, elements in pattern_elements.items():
# if len(elements) > 3:
# # Keep the first 2 and last elements
# for element in elements[2:-1]:
# if element.getparent() is not None:
# element.getparent().remove(element)
# 5. Convert back to string
result = etree.tostring(tree, encoding='unicode', method='html')

View File

@@ -3,42 +3,19 @@ from crawl4ai import (
AsyncWebCrawler,
BrowserConfig,
CrawlerRunConfig,
CacheMode,
DefaultMarkdownGenerator,
PruningContentFilter,
CrawlResult
)
async def example_cdp():
browser_conf = BrowserConfig(
headless=False,
cdp_url="http://localhost:9223"
)
crawler_config = CrawlerRunConfig(
session_id="test",
js_code = """(() => { return {"result": "Hello World!"} })()""",
js_only=True
)
async with AsyncWebCrawler(
config=browser_conf,
verbose=True,
) as crawler:
result : CrawlResult = await crawler.arun(
url="https://www.helloworld.org",
config=crawler_config,
)
print(result.js_execution_result)
async def main():
browser_config = BrowserConfig(headless=False, verbose=True)
async def main():
browser_config = BrowserConfig(headless=True, verbose=True)
async with AsyncWebCrawler(config=browser_config) as crawler:
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48, threshold_type="fixed", min_word_threshold=0
)
content_filter=PruningContentFilter()
),
)
result : CrawlResult = await crawler.arun(

View File

@@ -0,0 +1,143 @@
# == File: regex_extraction_quickstart.py ==
"""
Mini quick-start for RegexExtractionStrategy
────────────────────────────────────────────
3 bite-sized demos that parallel the style of *quickstart_examples_set_1.py*:
1. **Default catalog** - scrape a page and pull out e-mails / phones / URLs, etc.
2. **Custom pattern** - add your own regex at instantiation time.
3. **LLM-assisted schema** - ask the model to write a pattern, cache it, then
run extraction _without_ further LLM calls.
Run the whole thing with::
python regex_extraction_quickstart.py
"""
import os, json, asyncio
from pathlib import Path
from typing import List
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
CrawlResult,
RegexExtractionStrategy,
LLMConfig,
)
# ────────────────────────────────────────────────────────────────────────────
# 1. Default-catalog extraction
# ────────────────────────────────────────────────────────────────────────────
async def demo_regex_default() -> None:
    """Demo 1: crawl one page with two built-in patterns (URL and currency).

    Prints the first ten matches as ``label value`` rows followed by the
    total match count, or a failure notice when the crawl did not succeed.
    """
    print("\n=== 1. Regex extraction default patterns ===")
    target = "https://www.iana.org/domains/example"
    wanted = RegexExtractionStrategy.Url | RegexExtractionStrategy.Currency
    run_cfg = CrawlerRunConfig(
        extraction_strategy=RegexExtractionStrategy(pattern=wanted)
    )
    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(target, config=run_cfg)
        print(f"Fetched {target} - success={result.success}")
        if not result.success:
            print(" !!! crawl failed")
            return
        # extracted_content is parsed as a JSON list of match records
        matches = json.loads(result.extracted_content)
        for hit in matches[:10]:
            print(f" {hit['label']:<12} {hit['value']}")
        print(f"... total matches: {len(matches)}")
# ────────────────────────────────────────────────────────────────────────────
# 2. Custom pattern override / extension
# ────────────────────────────────────────────────────────────────────────────
async def demo_regex_custom() -> None:
    """Demo 2: crawl one page using only a user-supplied price regex.

    Prints every matched value, a hint when nothing matched, or a failure
    notice when the crawl did not succeed.
    """
    print("\n=== 2. Regex extraction custom price pattern ===")
    target = "https://www.apple.com/shop/buy-mac/macbook-pro"
    # No built-in flag is passed, so this is the only active pattern.
    extractor = RegexExtractionStrategy(
        custom={"usd_price": r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"}
    )
    run_cfg = CrawlerRunConfig(extraction_strategy=extractor)
    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(target, config=run_cfg)
        if not result.success:
            print(" !!! crawl failed")
            return
        prices = json.loads(result.extracted_content)
        for hit in prices:
            print(f" {hit['value']}")
        if not prices:
            print(" (No prices found - page layout may have changed)")
# ────────────────────────────────────────────────────────────────────────────
# 3. One-shot LLM pattern generation, then fast extraction
# ────────────────────────────────────────────────────────────────────────────
async def demo_regex_generate_pattern() -> None:
    """Demo 3: build a price regex once via the LLM, cache it, then extract.

    The generated ``{label: pattern}`` dict is stored as JSON in a ``tmp``
    directory next to this script; later runs load it from there, so the LLM
    is only contacted when no cache file exists.
    """
    print("\n=== 3. generate_pattern → regex extraction ===")
    cache_dir = Path(__file__).parent / "tmp"
    cache_dir.mkdir(exist_ok=True)
    pattern_file = cache_dir / "price_pattern.json"
    url = "https://www.lazada.sg/tag/smartphone/"

    # ── 3-A. build or load the cached pattern
    if pattern_file.exists():
        # Context manager closes the handle instead of leaking it.
        with pattern_file.open(encoding="utf-8") as fh:
            pattern = json.load(fh)
        print("Loaded cached pattern:", pattern)
    else:
        print("Generating pattern via LLM…")
        llm_cfg = LLMConfig(
            provider="openai/gpt-4o-mini",
            api_token="env:OPENAI_API_KEY",
        )
        # pull one sample page as HTML context
        async with AsyncWebCrawler() as crawler:
            html = (await crawler.arun(url)).fit_html
        pattern = RegexExtractionStrategy.generate_pattern(
            label="price",
            html=html,
            query="Prices in Malaysian Ringgit (e.g. RM1,299.00 or RM200)",
            llm_config=llm_cfg,
        )
        # Closing the file here guarantees the cache is flushed to disk
        # before the next run tries to read it.
        with pattern_file.open("w", encoding="utf-8") as fh:
            json.dump(pattern, fh, indent=2)
        print("Saved pattern:", pattern_file)

    # ── 3-B. extraction pass zero LLM calls
    strategy = RegexExtractionStrategy(custom=pattern)
    config = CrawlerRunConfig(extraction_strategy=strategy, delay_before_return_html=3)
    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(url, config=config)
        if result.success:
            data = json.loads(result.extracted_content)
            for d in data[:15]:
                print(f" {d['value']}")
            print(f"... total matches: {len(data)}")
        else:
            print(" !!! crawl failed")
# ────────────────────────────────────────────────────────────────────────────
# Entrypoint
# ────────────────────────────────────────────────────────────────────────────
async def main() -> None:
    """Entry coroutine: runs demo 3 only; demos 1 and 2 are left disabled."""
    # Uncomment to run the first two demos as well:
    # await demo_regex_default()
    # await demo_regex_custom()
    await demo_regex_generate_pattern()


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -10,6 +10,7 @@ class CrawlResult(BaseModel):
html: str
success: bool
cleaned_html: Optional[str] = None
fit_html: Optional[str] = None # Preprocessed HTML optimized for extraction
media: Dict[str, List[Dict]] = {}
links: Dict[str, List[Dict]] = {}
downloaded_files: Optional[List[str]] = None
@@ -50,7 +51,7 @@ if not result.success:
```
### 1.3 **`status_code`** *(Optional[int])*
**What**: The pages HTTP status code (e.g., 200, 404).
**What**: The page's HTTP status code (e.g., 200, 404).
**Usage**:
```python
if result.status_code == 404:
@@ -82,7 +83,7 @@ if result.response_headers:
```
### 1.7 **`ssl_certificate`** *(Optional[SSLCertificate])*
**What**: If `fetch_ssl_certificate=True` in your CrawlerRunConfig, **`result.ssl_certificate`** contains a [**`SSLCertificate`**](../advanced/ssl-certificate.md) object describing the sites certificate. You can export the cert in multiple formats (PEM/DER/JSON) or access its properties like `issuer`,
**What**: If `fetch_ssl_certificate=True` in your CrawlerRunConfig, **`result.ssl_certificate`** contains a [**`SSLCertificate`**](../advanced/ssl-certificate.md) object describing the site's certificate. You can export the cert in multiple formats (PEM/DER/JSON) or access its properties like `issuer`,
`subject`, `valid_from`, `valid_until`, etc.
**Usage**:
```python
@@ -109,14 +110,6 @@ print(len(result.html))
print(result.cleaned_html[:500]) # Show a snippet
```
### 2.3 **`fit_html`** *(Optional[str])*
**What**: If a **content filter** or heuristic (e.g., Pruning/BM25) modifies the HTML, the “fit” or post-filter version.
**When**: This is **only** present if your `markdown_generator` or `content_filter` produces it.
**Usage**:
```python
if result.markdown.fit_html:
print("High-value HTML content:", result.markdown.fit_html[:300])
```
---
@@ -135,7 +128,7 @@ Crawl4AI can convert HTML→Markdown, optionally including:
- **`raw_markdown`** *(str)*: The full HTML→Markdown conversion.
- **`markdown_with_citations`** *(str)*: Same markdown, but with link references as academic-style citations.
- **`references_markdown`** *(str)*: The reference list or footnotes at the end.
- **`fit_markdown`** *(Optional[str])*: If content filtering (Pruning/BM25) was applied, the filtered fit text.
- **`fit_markdown`** *(Optional[str])*: If content filtering (Pruning/BM25) was applied, the filtered "fit" text.
- **`fit_html`** *(Optional[str])*: The HTML that led to `fit_markdown`.
**Usage**:
@@ -157,7 +150,7 @@ print(result.markdown.raw_markdown[:200])
print(result.markdown.fit_markdown)
print(result.markdown.fit_html)
```
**Important**: Fit content (in `fit_markdown`/`fit_html`) exists in result.markdown, only if you used a **filter** (like **PruningContentFilter** or **BM25ContentFilter**) within a `MarkdownGenerationStrategy`.
**Important**: "Fit" content (in `fit_markdown`/`fit_html`) exists in result.markdown, only if you used a **filter** (like **PruningContentFilter** or **BM25ContentFilter**) within a `MarkdownGenerationStrategy`.
---
@@ -169,7 +162,7 @@ print(result.markdown.fit_html)
- `src` *(str)*: Media URL
- `alt` or `title` *(str)*: Descriptive text
- `score` *(float)*: Relevance score if the crawlers heuristic found it important
- `score` *(float)*: Relevance score if the crawler's heuristic found it "important"
- `desc` or `description` *(Optional[str])*: Additional context extracted from surrounding text
**Usage**:
@@ -263,7 +256,7 @@ A `DispatchResult` object providing additional concurrency and resource usage in
- **`task_id`**: A unique identifier for the parallel task.
- **`memory_usage`** (float): The memory (in MB) used at the time of completion.
- **`peak_memory`** (float): The peak memory usage (in MB) recorded during the tasks execution.
- **`peak_memory`** (float): The peak memory usage (in MB) recorded during the task's execution.
- **`start_time`** / **`end_time`** (datetime): Time range for this crawling task.
- **`error_message`** (str): Any dispatcher- or concurrency-related error encountered.
@@ -358,7 +351,7 @@ async def handle_result(result: CrawlResult):
# HTML
print("Original HTML size:", len(result.html))
print("Cleaned HTML size:", len(result.cleaned_html or ""))
# Markdown output
if result.markdown:
print("Raw Markdown:", result.markdown.raw_markdown[:300])

View File

@@ -36,6 +36,45 @@ LLMExtractionStrategy(
)
```
### RegexExtractionStrategy
Used for fast pattern-based extraction of common entities using regular expressions.
```python
RegexExtractionStrategy(
# Pattern Configuration
pattern: IntFlag = RegexExtractionStrategy.Nothing, # Bit flags of built-in patterns to use
custom: Optional[Dict[str, str]] = None, # Custom pattern dictionary {label: regex}
# Input Format
input_format: str = "fit_html", # "html", "markdown", "text" or "fit_html"
)
# Built-in Patterns as Bit Flags
RegexExtractionStrategy.Email # Email addresses
RegexExtractionStrategy.PhoneIntl # International phone numbers
RegexExtractionStrategy.PhoneUS # US-format phone numbers
RegexExtractionStrategy.Url # HTTP/HTTPS URLs
RegexExtractionStrategy.IPv4 # IPv4 addresses
RegexExtractionStrategy.IPv6 # IPv6 addresses
RegexExtractionStrategy.Uuid # UUIDs
RegexExtractionStrategy.Currency # Currency values (USD, EUR, etc)
RegexExtractionStrategy.Percentage # Percentage values
RegexExtractionStrategy.Number # Numeric values
RegexExtractionStrategy.DateIso # ISO format dates
RegexExtractionStrategy.DateUS # US format dates
RegexExtractionStrategy.Time24h # 24-hour format times
RegexExtractionStrategy.PostalUS # US postal codes
RegexExtractionStrategy.PostalUK # UK postal codes
RegexExtractionStrategy.HexColor # HTML hex color codes
RegexExtractionStrategy.TwitterHandle # Twitter handles
RegexExtractionStrategy.Hashtag # Hashtags
RegexExtractionStrategy.MacAddr # MAC addresses
RegexExtractionStrategy.Iban # International bank account numbers
RegexExtractionStrategy.CreditCard # Credit card numbers
RegexExtractionStrategy.All # All available patterns
```
### CosineStrategy
Used for content similarity-based extraction and clustering.
@@ -156,6 +195,55 @@ result = await crawler.arun(
data = json.loads(result.extracted_content)
```
### Regex Extraction
```python
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, RegexExtractionStrategy
# Method 1: Use built-in patterns
strategy = RegexExtractionStrategy(
pattern = RegexExtractionStrategy.Email | RegexExtractionStrategy.Url
)
# Method 2: Use custom patterns
price_pattern = {"usd_price": r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"}
strategy = RegexExtractionStrategy(custom=price_pattern)
# Method 3: Generate pattern with LLM assistance (one-time)
from crawl4ai import LLMConfig
async with AsyncWebCrawler() as crawler:
# Get sample HTML first
sample_result = await crawler.arun("https://example.com/products")
html = sample_result.fit_html
# Generate regex pattern once
pattern = RegexExtractionStrategy.generate_pattern(
label="price",
html=html,
query="Product prices in USD format",
llm_config=LLMConfig(provider="openai/gpt-4o-mini")
)
# Save pattern for reuse
import json
with open("price_pattern.json", "w") as f:
json.dump(pattern, f)
# Use pattern for extraction (no LLM calls)
strategy = RegexExtractionStrategy(custom=pattern)
result = await crawler.arun(
url="https://example.com/products",
config=CrawlerRunConfig(extraction_strategy=strategy)
)
# Process results
data = json.loads(result.extracted_content)
for item in data:
print(f"{item['label']}: {item['value']}")
```
### CSS Extraction
```python
@@ -220,12 +308,28 @@ result = await crawler.arun(
## Best Practices
1. **Choose the Right Strategy**
- Use `LLMExtractionStrategy` for complex, unstructured content
- Use `JsonCssExtractionStrategy` for well-structured HTML
1. **Choose the Right Strategy**
- Use `RegexExtractionStrategy` for common data types like emails, phones, URLs, dates
- Use `JsonCssExtractionStrategy` for well-structured HTML with consistent patterns
- Use `LLMExtractionStrategy` for complex, unstructured content requiring reasoning
- Use `CosineStrategy` for content similarity and clustering
2. **Optimize Chunking**
2. **Strategy Selection Guide**
```
Is the target data a common type (email/phone/date/URL)?
→ RegexExtractionStrategy
Does the page have consistent HTML structure?
→ JsonCssExtractionStrategy or JsonXPathExtractionStrategy
Is the data semantically complex or unstructured?
→ LLMExtractionStrategy
Need to find content similar to a specific topic?
→ CosineStrategy
```
3. **Optimize Chunking**
```python
# For long documents
strategy = LLMExtractionStrategy(
@@ -234,7 +338,26 @@ result = await crawler.arun(
)
```
3. **Handle Errors**
4. **Combine Strategies for Best Performance**
```python
# First pass: Extract structure with CSS
css_strategy = JsonCssExtractionStrategy(product_schema)
css_result = await crawler.arun(url, config=CrawlerRunConfig(extraction_strategy=css_strategy))
product_data = json.loads(css_result.extracted_content)
# Second pass: Extract specific fields with regex
descriptions = [product["description"] for product in product_data]
regex_strategy = RegexExtractionStrategy(
pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS,
custom={"dimension": r"\d+x\d+x\d+ (?:cm|in)"}
)
# Process descriptions with regex
for text in descriptions:
matches = regex_strategy.extract("", text) # Direct extraction
```
5. **Handle Errors**
```python
try:
result = await crawler.arun(
@@ -247,11 +370,31 @@ result = await crawler.arun(
print(f"Extraction failed: {e}")
```
4. **Monitor Performance**
6. **Monitor Performance**
```python
strategy = CosineStrategy(
verbose=True, # Enable logging
word_count_threshold=20, # Filter short content
top_k=5 # Limit results
)
```
7. **Cache Generated Patterns**
```python
# For RegexExtractionStrategy pattern generation
import json
from pathlib import Path
cache_dir = Path("./pattern_cache")
cache_dir.mkdir(exist_ok=True)
pattern_file = cache_dir / "product_pattern.json"
if pattern_file.exists():
with open(pattern_file) as f:
pattern = json.load(f)
else:
# Generate once with LLM
pattern = RegexExtractionStrategy.generate_pattern(...)
with open(pattern_file, "w") as f:
json.dump(pattern, f)
```

View File

@@ -1,15 +1,20 @@
# Extracting JSON (No LLM)
One of Crawl4AI's **most powerful** features is extracting **structured JSON** from websites **without** relying on large language models. By defining a **schema** with CSS or XPath selectors, you can extract data instantly—even from complex or nested HTML structures—without the cost, latency, or environmental impact of an LLM.
One of Crawl4AI's **most powerful** features is extracting **structured JSON** from websites **without** relying on large language models. Crawl4AI offers several strategies for LLM-free extraction:
1. **Schema-based extraction** with CSS or XPath selectors via `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`
2. **Regular expression extraction** with `RegexExtractionStrategy` for fast pattern matching
These approaches let you extract data instantly—even from complex or nested HTML structures—without the cost, latency, or environmental impact of an LLM.
**Why avoid LLM for basic extractions?**
1. **Faster & Cheaper**: No API calls or GPU overhead.
2. **Lower Carbon Footprint**: LLM inference can be energy-intensive. A well-defined schema is practically carbon-free.
3. **Precise & Repeatable**: CSS/XPath selectors do exactly what you specify. LLM outputs can vary or hallucinate.
4. **Scales Readily**: For thousands of pages, schema-based extraction runs quickly and in parallel.
1. **Faster & Cheaper**: No API calls or GPU overhead.
2. **Lower Carbon Footprint**: LLM inference can be energy-intensive. Pattern-based extraction is practically carbon-free.
3. **Precise & Repeatable**: CSS/XPath selectors and regex patterns do exactly what you specify. LLM outputs can vary or hallucinate.
4. **Scales Readily**: For thousands of pages, pattern-based extraction runs quickly and in parallel.
Below, we'll explore how to craft these schemas and use them with **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy** if you prefer XPath). We'll also highlight advanced features like **nested fields** and **base element attributes**.
Below, we'll explore how to craft these schemas and use them with **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy** if you prefer XPath). We'll also highlight advanced features like **nested fields** and **base element attributes**.
---
@@ -17,17 +22,17 @@ Below, well explore how to craft these schemas and use them with **JsonCssExt
A schema defines:
1. A **base selector** that identifies each container element on the page (e.g., a product row, a blog post card).
2. **Fields** describing which CSS/XPath selectors to use for each piece of data you want to capture (text, attribute, HTML block, etc.).
3. **Nested** or **list** types for repeated or hierarchical structures.
1. A **base selector** that identifies each "container" element on the page (e.g., a product row, a blog post card).
2. **Fields** describing which CSS/XPath selectors to use for each piece of data you want to capture (text, attribute, HTML block, etc.).
3. **Nested** or **list** types for repeated or hierarchical structures.
For example, if you have a list of products, each one might have a name, price, reviews, and related products. This approach is faster and more reliable than an LLM for consistent, structured pages.
For example, if you have a list of products, each one might have a name, price, reviews, and "related products." This approach is faster and more reliable than an LLM for consistent, structured pages.
---
## 2. Simple Example: Crypto Prices
Let's begin with a **simple** schema-based extraction using the `JsonCssExtractionStrategy`. Below is a snippet that extracts cryptocurrency prices from a site (similar to the legacy Coinbase example). Notice we **don't** call any LLM:
Let's begin with a **simple** schema-based extraction using the `JsonCssExtractionStrategy`. Below is a snippet that extracts cryptocurrency prices from a site (similar to the legacy Coinbase example). Notice we **don't** call any LLM:
```python
import json
@@ -87,7 +92,7 @@ asyncio.run(extract_crypto_prices())
**Highlights**:
- **`baseSelector`**: Tells us where each item (crypto row) is.
- **`baseSelector`**: Tells us where each "item" (crypto row) is.
- **`fields`**: Two fields (`coin_name`, `price`) using simple CSS selectors.
- Each field defines a **`type`** (e.g., `text`, `attribute`, `html`, `regex`, etc.).
@@ -97,7 +102,7 @@ No LLM is needed, and the performance is **near-instant** for hundreds or thousa
### **XPath Example with `raw://` HTML**
Below is a short example demonstrating **XPath** extraction plus the **`raw://`** scheme. We'll pass a **dummy HTML** directly (no network request) and define the extraction strategy in `CrawlerRunConfig`.
Below is a short example demonstrating **XPath** extraction plus the **`raw://`** scheme. We'll pass a **dummy HTML** directly (no network request) and define the extraction strategy in `CrawlerRunConfig`.
```python
import json
@@ -168,12 +173,12 @@ asyncio.run(extract_crypto_prices_xpath())
**Key Points**:
1. **`JsonXPathExtractionStrategy`** is used instead of `JsonCssExtractionStrategy`.
2. **`baseSelector`** and each field's `"selector"` use **XPath** instead of CSS.
3. **`raw://`** lets us pass `dummy_html` with no real network request—handy for local testing.
1. **`JsonXPathExtractionStrategy`** is used instead of `JsonCssExtractionStrategy`.
2. **`baseSelector`** and each field's `"selector"` use **XPath** instead of CSS.
3. **`raw://`** lets us pass `dummy_html` with no real network request—handy for local testing.
4. Everything (including the extraction strategy) is in **`CrawlerRunConfig`**.
That's how you keep the config self-contained, illustrate **XPath** usage, and demonstrate the **raw** scheme for direct HTML input—all while avoiding the old approach of passing `extraction_strategy` directly to `arun()`.
That's how you keep the config self-contained, illustrate **XPath** usage, and demonstrate the **raw** scheme for direct HTML input—all while avoiding the old approach of passing `extraction_strategy` directly to `arun()`.
---
@@ -187,7 +192,7 @@ We have a **sample e-commerce** HTML file on GitHub (example):
```
https://gist.githubusercontent.com/githubusercontent/2d7b8ba3cd8ab6cf3c8da771ddb36878/raw/1ae2f90c6861ce7dd84cc50d3df9920dee5e1fd2/sample_ecommerce.html
```
This snippet includes categories, products, features, reviews, and related items. Let's see how to define a schema that fully captures that structure **without LLM**.
This snippet includes categories, products, features, reviews, and related items. Let's see how to define a schema that fully captures that structure **without LLM**.
```python
schema = {
@@ -333,24 +338,253 @@ async def extract_ecommerce_data():
asyncio.run(extract_ecommerce_data())
```
If all goes well, you get a **structured** JSON array with each category, containing an array of `products`. Each product includes `details`, `features`, `reviews`, etc. All of that **without** an LLM.
If all goes well, you get a **structured** JSON array with each "category," containing an array of `products`. Each product includes `details`, `features`, `reviews`, etc. All of that **without** an LLM.
---
## 4. Why “No LLM” Is Often Better
## 4. RegexExtractionStrategy - Fast Pattern-Based Extraction
1. **Zero Hallucination**: Schema-based extraction doesn't guess text. It either finds it or not.
2. **Guaranteed Structure**: The same schema yields consistent JSON across many pages, so your downstream pipeline can rely on stable keys.
3. **Speed**: LLM-based extraction can be 10–1000x slower for large-scale crawling.
4. **Scalable**: Adding or updating a field is a matter of adjusting the schema, not re-tuning a model.
Crawl4AI now offers a powerful new zero-LLM extraction strategy: `RegexExtractionStrategy`. This strategy provides lightning-fast extraction of common data types like emails, phone numbers, URLs, dates, and more using pre-compiled regular expressions.
**When might you consider an LLM?** Possibly if the site is extremely unstructured or you want AI summarization. But always try a schema approach first for repeated or consistent data patterns.
### Key Features
- **Zero LLM Dependency**: Extracts data without any AI model calls
- **Blazing Fast**: Uses pre-compiled regex patterns for maximum performance
- **Built-in Patterns**: Includes ready-to-use patterns for common data types
- **Custom Patterns**: Add your own regex patterns for domain-specific extraction
- **LLM-Assisted Pattern Generation**: Optionally use an LLM once to generate optimized patterns, then reuse them without further LLM calls
### Simple Example: Extracting Common Entities
The easiest way to start is by using the built-in pattern catalog:
```python
import json
import asyncio
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
RegexExtractionStrategy
)
async def extract_with_regex():
# Create a strategy using built-in patterns for URLs and currencies
strategy = RegexExtractionStrategy(
pattern = RegexExtractionStrategy.Url | RegexExtractionStrategy.Currency
)
config = CrawlerRunConfig(extraction_strategy=strategy)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com",
config=config
)
if result.success:
data = json.loads(result.extracted_content)
for item in data[:5]: # Show first 5 matches
print(f"{item['label']}: {item['value']}")
print(f"Total matches: {len(data)}")
asyncio.run(extract_with_regex())
```
### Available Built-in Patterns
`RegexExtractionStrategy` provides these common patterns as IntFlag attributes for easy combining:
```python
# Use individual patterns
strategy = RegexExtractionStrategy(pattern=RegexExtractionStrategy.Email)
# Combine multiple patterns
strategy = RegexExtractionStrategy(
pattern = (
RegexExtractionStrategy.Email |
RegexExtractionStrategy.PhoneUS |
RegexExtractionStrategy.Url
)
)
# Use all available patterns
strategy = RegexExtractionStrategy(pattern=RegexExtractionStrategy.All)
```
Available patterns include:
- `Email` - Email addresses
- `PhoneIntl` - International phone numbers
- `PhoneUS` - US-format phone numbers
- `Url` - HTTP/HTTPS URLs
- `IPv4` - IPv4 addresses
- `IPv6` - IPv6 addresses
- `Uuid` - UUIDs
- `Currency` - Currency values (USD, EUR, etc.)
- `Percentage` - Percentage values
- `Number` - Numeric values
- `DateIso` - ISO format dates
- `DateUS` - US format dates
- `Time24h` - 24-hour format times
- `PostalUS` - US postal codes
- `PostalUK` - UK postal codes
- `HexColor` - HTML hex color codes
- `TwitterHandle` - Twitter handles
- `Hashtag` - Hashtags
- `MacAddr` - MAC addresses
- `Iban` - International bank account numbers
- `CreditCard` - Credit card numbers
### Custom Pattern Example
For more targeted extraction, you can provide custom patterns:
```python
import json
import asyncio
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
RegexExtractionStrategy
)
async def extract_prices():
# Define a custom pattern for US Dollar prices
price_pattern = {"usd_price": r"\$\s?\d{1,3}(?:,\d{3})*(?:\.\d{2})?"}
# Create strategy with custom pattern
strategy = RegexExtractionStrategy(custom=price_pattern)
config = CrawlerRunConfig(extraction_strategy=strategy)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://www.example.com/products",
config=config
)
if result.success:
data = json.loads(result.extracted_content)
for item in data:
print(f"Found price: {item['value']}")
asyncio.run(extract_prices())
```
### LLM-Assisted Pattern Generation
For complex or site-specific patterns, you can use an LLM once to generate an optimized pattern, then save and reuse it without further LLM calls:
```python
import json
import asyncio
from pathlib import Path
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
RegexExtractionStrategy,
LLMConfig
)
async def extract_with_generated_pattern():
cache_dir = Path("./pattern_cache")
cache_dir.mkdir(exist_ok=True)
pattern_file = cache_dir / "price_pattern.json"
# 1. Generate or load pattern
if pattern_file.exists():
pattern = json.load(pattern_file.open())
print(f"Using cached pattern: {pattern}")
else:
print("Generating pattern via LLM...")
# Configure LLM
llm_config = LLMConfig(
provider="openai/gpt-4o-mini",
api_token="env:OPENAI_API_KEY",
)
# Get sample HTML for context
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://example.com/products")
html = result.fit_html
# Generate pattern (one-time LLM usage)
pattern = RegexExtractionStrategy.generate_pattern(
label="price",
html=html,
query="Product prices in USD format",
llm_config=llm_config,
)
# Cache pattern for future use
json.dump(pattern, pattern_file.open("w"), indent=2)
# 2. Use pattern for extraction (no LLM calls)
strategy = RegexExtractionStrategy(custom=pattern)
config = CrawlerRunConfig(extraction_strategy=strategy)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com/products",
config=config
)
if result.success:
data = json.loads(result.extracted_content)
for item in data[:10]:
print(f"Extracted: {item['value']}")
print(f"Total matches: {len(data)}")
asyncio.run(extract_with_generated_pattern())
```
This workflow allows you to:
1. Use an LLM once to generate a highly optimized regex for your specific site
2. Save the pattern to disk for reuse
3. Extract data using only regex (no further LLM calls) in production
### Extraction Results Format
The `RegexExtractionStrategy` returns results in a consistent format:
```json
[
{
"url": "https://example.com",
"label": "email",
"value": "contact@example.com",
"span": [145, 163]
},
{
"url": "https://example.com",
"label": "url",
"value": "https://support.example.com",
"span": [210, 235]
}
]
```
Each match includes:
- `url`: The source URL
- `label`: The pattern name that matched (e.g., "email", "phone_us")
- `value`: The extracted text
- `span`: The start and end positions in the source content
---
## 5. Base Element Attributes & Additional Fields
## 5. Why "No LLM" Is Often Better
It's easy to **extract attributes** (like `href`, `src`, or `data-xxx`) from your base or nested elements using:
1. **Zero Hallucination**: Pattern-based extraction doesn't guess text. It either finds it or not.
2. **Guaranteed Structure**: The same schema or regex yields consistent JSON across many pages, so your downstream pipeline can rely on stable keys.
3. **Speed**: LLM-based extraction can be 10–1000x slower for large-scale crawling.
4. **Scalable**: Adding or updating a field is a matter of adjusting the schema or regex, not re-tuning a model.
**When might you consider an LLM?** Possibly if the site is extremely unstructured or you want AI summarization. But always try a schema or regex approach first for repeated or consistent data patterns.
---
## 6. Base Element Attributes & Additional Fields
It's easy to **extract attributes** (like `href`, `src`, or `data-xxx`) from your base or nested elements using:
```json
{
@@ -361,11 +595,11 @@ Its easy to **extract attributes** (like `href`, `src`, or `data-xxx`) from y
}
```
You can define them in **`baseFields`** (extracted from the main container element) or in each field's sub-lists. This is especially helpful if you need an item's link or ID stored in the parent `<div>`.
You can define them in **`baseFields`** (extracted from the main container element) or in each field's sub-lists. This is especially helpful if you need an item's link or ID stored in the parent `<div>`.
---
## 6. Putting It All Together: Larger Example
## 7. Putting It All Together: Larger Example
Consider a blog site. We have a schema that extracts the **URL** from each post card (via `baseFields` with an `"attribute": "href"`), plus the title, date, summary, and author:
@@ -389,19 +623,20 @@ Then run with `JsonCssExtractionStrategy(schema)` to get an array of blog post o
---
## 7. Tips & Best Practices
## 8. Tips & Best Practices
1. **Inspect the DOM** in Chrome DevTools or Firefox's Inspector to find stable selectors.
2. **Start Simple**: Verify you can extract a single field. Then add complexity like nested objects or lists.
3. **Test** your schema on partial HTML or a test page before a big crawl.
4. **Combine with JS Execution** if the site loads content dynamically. You can pass `js_code` or `wait_for` in `CrawlerRunConfig`.
5. **Look at Logs** when `verbose=True`: if your selectors are off or your schema is malformed, it'll often show warnings.
6. **Use baseFields** if you need attributes from the container element (e.g., `href`, `data-id`), especially for the parent item.
7. **Performance**: For large pages, make sure your selectors are as narrow as possible.
1. **Inspect the DOM** in Chrome DevTools or Firefox's Inspector to find stable selectors.
2. **Start Simple**: Verify you can extract a single field. Then add complexity like nested objects or lists.
3. **Test** your schema on partial HTML or a test page before a big crawl.
4. **Combine with JS Execution** if the site loads content dynamically. You can pass `js_code` or `wait_for` in `CrawlerRunConfig`.
5. **Look at Logs** when `verbose=True`: if your selectors are off or your schema is malformed, it'll often show warnings.
6. **Use baseFields** if you need attributes from the container element (e.g., `href`, `data-id`), especially for the "parent" item.
7. **Performance**: For large pages, make sure your selectors are as narrow as possible.
8. **Consider Using Regex First**: For simple data types like emails, URLs, and dates, `RegexExtractionStrategy` is often the fastest approach.
---
## 8. Schema Generation Utility
## 9. Schema Generation Utility
While manually crafting schemas is powerful and precise, Crawl4AI now offers a convenient utility to **automatically generate** extraction schemas using LLM. This is particularly useful when:
@@ -481,27 +716,26 @@ strategy = JsonCssExtractionStrategy(css_schema)
- Use OpenAI for production-quality schemas
- Use Ollama for development, testing, or when you need a self-hosted solution
That's it for **Extracting JSON (No LLM)**! You've seen how schema-based approaches (either CSS or XPath) can handle everything from simple lists to deeply nested product catalogs—instantly, with minimal overhead. Enjoy building robust scrapers that produce consistent, structured JSON for your data pipelines!
---
## 9. Conclusion
## 10. Conclusion
With **JsonCssExtractionStrategy** (or **JsonXPathExtractionStrategy**), you can build powerful, **LLM-free** pipelines that:
With Crawl4AI's LLM-free extraction strategies - `JsonCssExtractionStrategy`, `JsonXPathExtractionStrategy`, and now `RegexExtractionStrategy` - you can build powerful pipelines that:
- Scrape any consistent site for structured data.
- Support nested objects, repeating lists, or advanced transformations.
- Support nested objects, repeating lists, or pattern-based extraction.
- Scale to thousands of pages quickly and reliably.
**Next Steps**:
**Choosing the Right Strategy**:
- Combine your extracted JSON with advanced filtering or summarization in a second pass if needed.
- For dynamic pages, combine strategies with `js_code` or infinite scroll hooking to ensure all content is loaded.
- Use **`RegexExtractionStrategy`** for fast extraction of common data types like emails, phones, URLs, dates, etc.
- Use **`JsonCssExtractionStrategy`** or **`JsonXPathExtractionStrategy`** for structured data with clear HTML patterns
- If you need both: first extract structured data with JSON strategies, then use regex on specific fields
**Remember**: For repeated, structured data, you don't need to pay for or wait on an LLM. A well-crafted schema plus CSS or XPath gets you the data faster, cleaner, and cheaper—**the real power** of Crawl4AI.
**Remember**: For repeated, structured data, you don't need to pay for or wait on an LLM. Well-crafted schemas and regex patterns get you the data faster, cleaner, and cheaper—**the real power** of Crawl4AI.
**Last Updated**: 2025-01-01
**Last Updated**: 2025-05-02
---
That's it for **Extracting JSON (No LLM)**! You've seen how schema-based approaches (either CSS or XPath) can handle everything from simple lists to deeply nested product catalogs—instantly, with minimal overhead. Enjoy building robust scrapers that produce consistent, structured JSON for your data pipelines!
That's it for **Extracting JSON (No LLM)**! You've seen how schema-based approaches (either CSS or XPath) and regex patterns can handle everything from simple lists to deeply nested product catalogs—instantly, with minimal overhead. Enjoy building robust scrapers that produce consistent, structured JSON for your data pipelines!