Add source (sibling selector) support to JSON extraction strategies
Many sites (e.g. Hacker News) split a single item's data across sibling
elements. Field selectors only search descendants, making sibling data
unreachable. The new "source" field key navigates to a sibling element
before running the selector: {"source": "+ tr"} finds the next sibling
<tr>, then extracts from there.
- Add _resolve_source abstract method to JsonElementExtractionStrategy
- Implement in all 4 subclasses (CSS/BS4, XPath/lxml, two lxml/CSS)
- Modify _extract_field to resolve source before type dispatch
- Update CSS and XPath LLM prompts with source docs and HN example
- Default generate_schema validate=True so schemas are checked on creation
- Add schema validation with feedback loop for auto-refinement
- Add messages param to completion helpers for multi-turn refinement
- Document source field and schema validation in docs
- Add 14 unit tests covering CSS, XPath, backward compat, edge cases
This commit is contained in:
@@ -56,6 +56,34 @@ def _strip_markdown_fences(text: str) -> str:
|
|||||||
).strip()
|
).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _get_top_level_structure(html_content: str, max_depth: int = 3) -> str:
    """Return a compact tag outline of the HTML body up to a given depth.

    Used in schema validation feedback when baseSelector matches 0 elements,
    so the LLM can see what top-level tags actually exist.

    Args:
        html_content: Raw HTML to outline.
        max_depth: Deepest nesting level (0-based) to include in the outline.

    Returns:
        Newline-joined outline lines (at most 60), or "" if parsing fails.
    """
    try:
        document = html.fromstring(html_content)
    except Exception:
        # Unparseable input yields an empty outline rather than an error.
        return ""

    body_nodes = document.xpath("//body")
    start = body_nodes[0] if body_nodes else document

    outline = []
    # Explicit-stack pre-order traversal; children pushed in reverse so they
    # pop in document order, matching a recursive walk.
    pending = [(start, 0)]
    while pending:
        node, level = pending.pop()
        # Skip non-element nodes (comments, PIs) and anything too deep;
        # their subtrees are skipped as well since children are never pushed.
        if level > max_depth or not isinstance(node.tag, str):
            continue
        class_tokens = node.get("class", "").split()
        class_suffix = "." + ".".join(class_tokens) if class_tokens else ""
        id_suffix = f"#{node.get('id')}" if node.get("id") else ""
        outline.append(" " * level + f"<{node.tag}{id_suffix}{class_suffix}>")
        for child in reversed(list(node)):
            pending.append((child, level + 1))

    # Cap the outline so feedback messages stay small.
    return "\n".join(outline[:60])
|
||||||
class ExtractionStrategy(ABC):
|
class ExtractionStrategy(ABC):
|
||||||
"""
|
"""
|
||||||
Abstract base class for all extraction strategies.
|
Abstract base class for all extraction strategies.
|
||||||
@@ -1172,6 +1200,11 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
|||||||
|
|
||||||
def _extract_field(self, element, field):
|
def _extract_field(self, element, field):
|
||||||
try:
|
try:
|
||||||
|
if "source" in field:
|
||||||
|
element = self._resolve_source(element, field["source"])
|
||||||
|
if element is None:
|
||||||
|
return field.get("default")
|
||||||
|
|
||||||
if field["type"] == "nested":
|
if field["type"] == "nested":
|
||||||
nested_elements = self._get_elements(element, field["selector"])
|
nested_elements = self._get_elements(element, field["selector"])
|
||||||
nested_element = nested_elements[0] if nested_elements else None
|
nested_element = nested_elements[0] if nested_elements else None
|
||||||
@@ -1344,6 +1377,274 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
|||||||
"""Get attribute value from element"""
|
"""Get attribute value from element"""
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
@abstractmethod
def _resolve_source(self, element, source: str):
    """Navigate to a sibling element relative to the base element.

    Used when a field's data lives in a sibling of the base element
    rather than a descendant. For example, Hacker News splits each
    submission across two sibling <tr> rows.

    Args:
        element: The current base element.
        source: A sibling selector string. Currently supports the
            ``"+ <selector>"`` syntax which navigates to the next
            sibling matching ``<selector>``.

    Returns:
        The resolved sibling element, or ``None`` if not found.
    """
    # Abstract: each subclass resolves siblings with its own element API
    # (BeautifulSoup vs. lxml), so no shared implementation exists here.
    pass
|
@staticmethod
def _validate_schema(
    schema: dict,
    html_content: str,
    schema_type: str = "CSS",
    expected_fields: Optional[List[str]] = None,
) -> dict:
    """Run the generated schema against HTML and return a diagnostic result.

    Args:
        schema: The extraction schema to validate.
        html_content: The HTML to validate against.
        schema_type: "CSS" or "XPATH".
        expected_fields: When provided, enables strict mode — success
            requires ALL expected fields to be present and populated.
            When None, uses fuzzy mode (populated_fields > 0).

    Returns a dict with keys: success, base_elements_found, total_fields,
    populated_fields, field_coverage, field_details, issues,
    sample_base_html, top_level_structure.
    """
    # Start from an all-failure result; each stage fills in what it can.
    result = {
        "success": False,
        "base_elements_found": 0,
        "total_fields": 0,
        "populated_fields": 0,
        "field_coverage": 0.0,
        "field_details": [],
        "issues": [],
        "sample_base_html": "",
        "top_level_structure": "",
    }

    try:
        # Pick the concrete strategy matching the schema dialect.
        StrategyClass = (
            JsonCssExtractionStrategy
            if schema_type.upper() == "CSS"
            else JsonXPathExtractionStrategy
        )
        strategy = StrategyClass(schema=schema)
        items = strategy.extract(url="", html_content=html_content)
    except Exception as e:
        # A crashing schema is itself a diagnostic — report, don't raise.
        result["issues"].append(f"Extraction crashed: {e}")
        return result

    # Count base elements directly
    try:
        parsed = strategy._parse_html(html_content)
        base_elements = strategy._get_base_elements(parsed, schema["baseSelector"])
        result["base_elements_found"] = len(base_elements)

        # Grab sample innerHTML of first base element (truncated)
        if base_elements:
            sample = strategy._get_element_html(base_elements[0])
            result["sample_base_html"] = sample[:2000]
    except Exception:
        # Base-element counting is best-effort; extraction results still stand.
        pass

    if result["base_elements_found"] == 0:
        result["issues"].append(
            f"baseSelector '{schema.get('baseSelector', '')}' matched 0 elements"
        )
        # Include a tag outline so the LLM can choose a selector that exists.
        result["top_level_structure"] = _get_top_level_structure(html_content)
        return result

    # Analyze field coverage
    all_fields = schema.get("fields", [])
    field_names = [f["name"] for f in all_fields]
    result["total_fields"] = len(field_names)

    for fname in field_names:
        values = [item.get(fname) for item in items]
        # "Populated" means non-None AND non-empty-string.
        populated_count = sum(1 for v in values if v is not None and v != "")
        sample_val = next((v for v in values if v is not None and v != ""), None)
        if sample_val is not None:
            # Truncate so the feedback table stays readable.
            sample_val = str(sample_val)[:120]
        result["field_details"].append({
            "name": fname,
            "populated_count": populated_count,
            "total_count": len(items),
            "sample_value": sample_val,
        })

    result["populated_fields"] = sum(
        1 for fd in result["field_details"] if fd["populated_count"] > 0
    )
    if result["total_fields"] > 0:
        result["field_coverage"] = result["populated_fields"] / result["total_fields"]

    # Build issues
    if result["populated_fields"] == 0:
        result["issues"].append(
            "All fields returned None/empty — selectors likely wrong"
        )
    else:
        empty_fields = [
            fd["name"]
            for fd in result["field_details"]
            if fd["populated_count"] == 0
        ]
        if empty_fields:
            result["issues"].append(
                f"Fields always empty: {', '.join(empty_fields)}"
            )

    # Check for missing expected fields (strict mode)
    if expected_fields:
        schema_field_names = {f["name"] for f in schema.get("fields", [])}
        missing = [f for f in expected_fields if f not in schema_field_names]
        if missing:
            result["issues"].append(
                f"Expected fields missing from schema: {', '.join(missing)}"
            )

    # Success criteria
    if expected_fields:
        # Strict: all expected fields must exist in schema AND be populated
        schema_field_names = {f["name"] for f in schema.get("fields", [])}
        populated_names = {
            fd["name"] for fd in result["field_details"] if fd["populated_count"] > 0
        }
        result["success"] = (
            result["base_elements_found"] > 0
            and all(f in populated_names for f in expected_fields)
        )
    else:
        # Fuzzy: at least something extracted
        result["success"] = (
            result["base_elements_found"] > 0 and result["populated_fields"] > 0
        )
    return result
||||||
|
@staticmethod
|
||||||
|
def _build_feedback_message(
|
||||||
|
validation_result: dict,
|
||||||
|
schema: dict,
|
||||||
|
attempt: int,
|
||||||
|
is_repeated: bool,
|
||||||
|
) -> str:
|
||||||
|
"""Build a structured feedback message from a validation result."""
|
||||||
|
vr = validation_result
|
||||||
|
parts = []
|
||||||
|
|
||||||
|
parts.append(f"## Schema Validation — Attempt {attempt}")
|
||||||
|
|
||||||
|
# Base selector
|
||||||
|
if vr["base_elements_found"] == 0:
|
||||||
|
parts.append(
|
||||||
|
f"**CRITICAL:** baseSelector `{schema.get('baseSelector', '')}` "
|
||||||
|
f"matched **0 elements**. The schema cannot extract anything."
|
||||||
|
)
|
||||||
|
if vr["top_level_structure"]:
|
||||||
|
parts.append(
|
||||||
|
"Here is the top-level HTML structure so you can pick a valid selector:\n```\n"
|
||||||
|
+ vr["top_level_structure"]
|
||||||
|
+ "\n```"
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
parts.append(
|
||||||
|
f"baseSelector matched **{vr['base_elements_found']}** element(s)."
|
||||||
|
)
|
||||||
|
|
||||||
|
# Field coverage table
|
||||||
|
if vr["field_details"]:
|
||||||
|
parts.append(
|
||||||
|
f"\n**Field coverage:** {vr['populated_fields']}/{vr['total_fields']} fields have data\n"
|
||||||
|
)
|
||||||
|
parts.append("| Field | Populated | Sample |")
|
||||||
|
parts.append("|-------|-----------|--------|")
|
||||||
|
for fd in vr["field_details"]:
|
||||||
|
sample = fd["sample_value"] or "*(empty)*"
|
||||||
|
parts.append(
|
||||||
|
f"| {fd['name']} | {fd['populated_count']}/{fd['total_count']} | {sample} |"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Issues
|
||||||
|
if vr["issues"]:
|
||||||
|
parts.append("\n**Issues:**")
|
||||||
|
for issue in vr["issues"]:
|
||||||
|
parts.append(f"- {issue}")
|
||||||
|
|
||||||
|
# Sample base HTML when all fields empty
|
||||||
|
if vr["populated_fields"] == 0 and vr["sample_base_html"]:
|
||||||
|
parts.append(
|
||||||
|
"\nHere is the innerHTML of the first base element — "
|
||||||
|
"use it to find correct child selectors:\n```html\n"
|
||||||
|
+ vr["sample_base_html"]
|
||||||
|
+ "\n```"
|
||||||
|
)
|
||||||
|
|
||||||
|
# Repeated schema warning
|
||||||
|
if is_repeated:
|
||||||
|
parts.append(
|
||||||
|
"\n**WARNING:** You returned the exact same schema as before. "
|
||||||
|
"You MUST change the selectors to fix the issues above."
|
||||||
|
)
|
||||||
|
|
||||||
|
parts.append(
|
||||||
|
"\nPlease fix the schema and return ONLY valid JSON, nothing else."
|
||||||
|
)
|
||||||
|
return "\n".join(parts)
|
||||||
|
|
||||||
|
@staticmethod
async def _infer_target_json(query: str, html_snippet: str, llm_config, url: str = None) -> Optional[dict]:
    """Infer a target JSON example from a query and HTML snippet via a quick LLM call.

    Args:
        query: Natural-language description of what to extract.
        html_snippet: HTML sample; only the first 2000 chars are sent.
        llm_config: Config object providing provider, api_token, base_url.
        url: Optional page URL, included in the prompt for context.

    Returns the parsed dict, or None if inference fails.
    """
    # Imported locally — presumably to avoid a circular import at module
    # load time (TODO confirm against the package layout).
    from .utils import aperform_completion_with_backoff

    url_line = f"URL: {url}\n" if url else ""
    prompt = (
        "You are given a data extraction request and a snippet of HTML from a webpage.\n"
        "Your job is to produce a single example JSON object representing ONE item "
        "that the user wants to extract.\n\n"
        "Rules:\n"
        "- Return ONLY a valid JSON object — one flat object, NOT wrapped in an array or outer key.\n"
        "- The object represents a single repeated item (e.g., one product, one article, one row).\n"
        "- Use clean snake_case field names matching the user's description.\n"
        "- If the item has nested repeated sub-items, represent those as an array with one example inside.\n"
        "- Fill values with realistic examples from the HTML so the meaning is clear.\n\n"
        'Example — if the request is "extract product name, price, and reviews":\n'
        '{"name": "Widget Pro", "price": "$29.99", "reviews": [{"author": "Jane", "text": "Great product"}]}\n\n'
        f"{url_line}"
        f"Extraction request: {query}\n\n"
        # Snippet is truncated so this auxiliary call stays cheap.
        f"HTML snippet:\n```html\n{html_snippet[:2000]}\n```\n\n"
        "Return ONLY the JSON object for ONE item:"
    )

    try:
        response = await aperform_completion_with_backoff(
            provider=llm_config.provider,
            prompt_with_variables=prompt,
            json_response=True,
            api_token=llm_config.api_token,
            base_url=llm_config.base_url,
        )
        raw = response.choices[0].message.content
        if not raw or not raw.strip():
            return None
        # Strip ```json fences before parsing, mirroring the schema path.
        return json.loads(_strip_markdown_fences(raw))
    except Exception:
        # Inference is best-effort; any failure degrades to "no target JSON".
        return None
||||||
|
@staticmethod
|
||||||
|
def _extract_expected_fields(target_json: dict) -> List[str]:
|
||||||
|
"""Extract top-level field names from a target JSON example."""
|
||||||
|
return list(target_json.keys())
|
||||||
|
|
||||||
_GENERATE_SCHEMA_UNWANTED_PROPS = {
|
_GENERATE_SCHEMA_UNWANTED_PROPS = {
|
||||||
'provider': 'Instead, use llm_config=LLMConfig(provider="...")',
|
'provider': 'Instead, use llm_config=LLMConfig(provider="...")',
|
||||||
'api_token': 'Instead, use llm_config=LlMConfig(api_token="...")',
|
'api_token': 'Instead, use llm_config=LlMConfig(api_token="...")',
|
||||||
@@ -1423,6 +1724,8 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
provider: str = None,
|
provider: str = None,
|
||||||
api_token: str = None,
|
api_token: str = None,
|
||||||
url: Union[str, List[str]] = None,
|
url: Union[str, List[str]] = None,
|
||||||
|
validate: bool = True,
|
||||||
|
max_refinements: int = 3,
|
||||||
**kwargs
|
**kwargs
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""
|
"""
|
||||||
@@ -1438,6 +1741,9 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
api_token (str): Legacy Parameter. API token for LLM provider.
|
api_token (str): Legacy Parameter. API token for LLM provider.
|
||||||
url (str or List[str], optional): URL(s) to fetch HTML from. If provided, html parameter is ignored.
|
url (str or List[str], optional): URL(s) to fetch HTML from. If provided, html parameter is ignored.
|
||||||
When multiple URLs are provided, HTMLs are fetched in parallel and concatenated.
|
When multiple URLs are provided, HTMLs are fetched in parallel and concatenated.
|
||||||
|
validate (bool): If True, validate the schema against the HTML and
|
||||||
|
refine via LLM feedback loop. Defaults to True.
|
||||||
|
max_refinements (int): Max refinement rounds when validate=True. Defaults to 3.
|
||||||
**kwargs: Additional args passed to LLM processor.
|
**kwargs: Additional args passed to LLM processor.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
@@ -1462,6 +1768,8 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
provider=provider,
|
provider=provider,
|
||||||
api_token=api_token,
|
api_token=api_token,
|
||||||
url=url,
|
url=url,
|
||||||
|
validate=validate,
|
||||||
|
max_refinements=max_refinements,
|
||||||
**kwargs
|
**kwargs
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -1483,6 +1791,8 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
provider: str = None,
|
provider: str = None,
|
||||||
api_token: str = None,
|
api_token: str = None,
|
||||||
url: Union[str, List[str]] = None,
|
url: Union[str, List[str]] = None,
|
||||||
|
validate: bool = True,
|
||||||
|
max_refinements: int = 3,
|
||||||
**kwargs
|
**kwargs
|
||||||
) -> dict:
|
) -> dict:
|
||||||
"""
|
"""
|
||||||
@@ -1502,6 +1812,9 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
api_token (str): Legacy Parameter. API token for LLM provider.
|
api_token (str): Legacy Parameter. API token for LLM provider.
|
||||||
url (str or List[str], optional): URL(s) to fetch HTML from. If provided, html parameter is ignored.
|
url (str or List[str], optional): URL(s) to fetch HTML from. If provided, html parameter is ignored.
|
||||||
When multiple URLs are provided, HTMLs are fetched in parallel and concatenated.
|
When multiple URLs are provided, HTMLs are fetched in parallel and concatenated.
|
||||||
|
validate (bool): If True, validate the schema against the HTML and
|
||||||
|
refine via LLM feedback loop. Defaults to True.
|
||||||
|
max_refinements (int): Max refinement rounds when validate=True. Defaults to 3.
|
||||||
**kwargs: Additional args passed to LLM processor.
|
**kwargs: Additional args passed to LLM processor.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
@@ -1524,6 +1837,9 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
if llm_config is None:
|
if llm_config is None:
|
||||||
llm_config = create_llm_config()
|
llm_config = create_llm_config()
|
||||||
|
|
||||||
|
# Save original HTML(s) before preprocessing (for validation against real HTML)
|
||||||
|
original_htmls = []
|
||||||
|
|
||||||
# Fetch HTML from URL(s) if provided
|
# Fetch HTML from URL(s) if provided
|
||||||
if url is not None:
|
if url is not None:
|
||||||
from .async_webcrawler import AsyncWebCrawler
|
from .async_webcrawler import AsyncWebCrawler
|
||||||
@@ -1547,6 +1863,7 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
if result.status_code >= 400:
|
if result.status_code >= 400:
|
||||||
raise Exception(f"HTTP {result.status_code} error for URL '{urls[0]}'")
|
raise Exception(f"HTTP {result.status_code} error for URL '{urls[0]}'")
|
||||||
html = result.html
|
html = result.html
|
||||||
|
original_htmls = [result.html]
|
||||||
else:
|
else:
|
||||||
results = await crawler.arun_many(urls=urls, config=crawler_config)
|
results = await crawler.arun_many(urls=urls, config=crawler_config)
|
||||||
html_parts = []
|
html_parts = []
|
||||||
@@ -1555,6 +1872,7 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
raise Exception(f"Failed to fetch URL '{result.url}': {result.error_message}")
|
raise Exception(f"Failed to fetch URL '{result.url}': {result.error_message}")
|
||||||
if result.status_code >= 400:
|
if result.status_code >= 400:
|
||||||
raise Exception(f"HTTP {result.status_code} error for URL '{result.url}'")
|
raise Exception(f"HTTP {result.status_code} error for URL '{result.url}'")
|
||||||
|
original_htmls.append(result.html)
|
||||||
cleaned = preprocess_html_for_schema(
|
cleaned = preprocess_html_for_schema(
|
||||||
html_content=result.html,
|
html_content=result.html,
|
||||||
text_threshold=2000,
|
text_threshold=2000,
|
||||||
@@ -1564,6 +1882,8 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
header = HTML_EXAMPLE_DELIMITER.format(index=i)
|
header = HTML_EXAMPLE_DELIMITER.format(index=i)
|
||||||
html_parts.append(f"{header}\n{cleaned}")
|
html_parts.append(f"{header}\n{cleaned}")
|
||||||
html = "\n\n".join(html_parts)
|
html = "\n\n".join(html_parts)
|
||||||
|
else:
|
||||||
|
original_htmls = [html]
|
||||||
|
|
||||||
# Preprocess HTML for schema generation (skip if already preprocessed from multiple URLs)
|
# Preprocess HTML for schema generation (skip if already preprocessed from multiple URLs)
|
||||||
if url is None or isinstance(url, str):
|
if url is None or isinstance(url, str):
|
||||||
@@ -1574,8 +1894,41 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
max_size=500_000
|
max_size=500_000
|
||||||
)
|
)
|
||||||
|
|
||||||
prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
|
# --- Resolve expected fields for strict validation ---
|
||||||
|
expected_fields = None
|
||||||
|
if validate:
|
||||||
|
if target_json_example:
|
||||||
|
# User provided target JSON — extract field names from it
|
||||||
|
try:
|
||||||
|
if isinstance(target_json_example, str):
|
||||||
|
target_obj = json.loads(target_json_example)
|
||||||
|
else:
|
||||||
|
target_obj = target_json_example
|
||||||
|
expected_fields = JsonElementExtractionStrategy._extract_expected_fields(target_obj)
|
||||||
|
except (json.JSONDecodeError, TypeError):
|
||||||
|
pass
|
||||||
|
elif query:
|
||||||
|
# No target JSON but query describes fields — infer via quick LLM call
|
||||||
|
first_url = None
|
||||||
|
if url is not None:
|
||||||
|
first_url = url if isinstance(url, str) else url[0]
|
||||||
|
inferred = await JsonElementExtractionStrategy._infer_target_json(
|
||||||
|
query=query, html_snippet=html, llm_config=llm_config, url=first_url
|
||||||
|
)
|
||||||
|
if inferred:
|
||||||
|
expected_fields = JsonElementExtractionStrategy._extract_expected_fields(inferred)
|
||||||
|
# Also inject as target_json_example for the schema prompt
|
||||||
|
if not target_json_example:
|
||||||
|
target_json_example = json.dumps(inferred, indent=2)
|
||||||
|
|
||||||
|
prompt = JsonElementExtractionStrategy._build_schema_prompt(html, schema_type, query, target_json_example)
|
||||||
|
messages = [{"role": "user", "content": prompt}]
|
||||||
|
|
||||||
|
prev_schema_json = None
|
||||||
|
last_schema = None
|
||||||
|
max_attempts = 1 + (max_refinements if validate else 0)
|
||||||
|
|
||||||
|
for attempt in range(max_attempts):
|
||||||
try:
|
try:
|
||||||
response = await aperform_completion_with_backoff(
|
response = await aperform_completion_with_backoff(
|
||||||
provider=llm_config.provider,
|
provider=llm_config.provider,
|
||||||
@@ -1583,17 +1936,69 @@ In this scenario, use your best judgment to generate the schema. You need to exa
|
|||||||
json_response=True,
|
json_response=True,
|
||||||
api_token=llm_config.api_token,
|
api_token=llm_config.api_token,
|
||||||
base_url=llm_config.base_url,
|
base_url=llm_config.base_url,
|
||||||
extra_args=kwargs
|
messages=messages,
|
||||||
|
extra_args=kwargs,
|
||||||
)
|
)
|
||||||
raw = response.choices[0].message.content
|
raw = response.choices[0].message.content
|
||||||
if not raw or not raw.strip():
|
if not raw or not raw.strip():
|
||||||
raise ValueError("LLM returned an empty response")
|
raise ValueError("LLM returned an empty response")
|
||||||
return json.loads(_strip_markdown_fences(raw))
|
|
||||||
|
schema = json.loads(_strip_markdown_fences(raw))
|
||||||
|
last_schema = schema
|
||||||
except json.JSONDecodeError as e:
|
except json.JSONDecodeError as e:
|
||||||
|
# JSON parse failure — ask LLM to fix it
|
||||||
|
if not validate or attempt >= max_attempts - 1:
|
||||||
raise Exception(f"Failed to parse schema JSON: {str(e)}")
|
raise Exception(f"Failed to parse schema JSON: {str(e)}")
|
||||||
|
messages.append({"role": "assistant", "content": raw})
|
||||||
|
messages.append({"role": "user", "content": (
|
||||||
|
f"Your response was not valid JSON. Parse error: {e}\n"
|
||||||
|
"Please return ONLY valid JSON, nothing else."
|
||||||
|
)})
|
||||||
|
continue
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
raise Exception(f"Failed to generate schema: {str(e)}")
|
raise Exception(f"Failed to generate schema: {str(e)}")
|
||||||
|
|
||||||
|
# If validation is off, return immediately (zero overhead path)
|
||||||
|
if not validate:
|
||||||
|
return schema
|
||||||
|
|
||||||
|
# --- Validation feedback loop ---
|
||||||
|
# Validate against original HTML(s); success if works on at least one
|
||||||
|
best_result = None
|
||||||
|
for orig_html in original_htmls:
|
||||||
|
vr = JsonElementExtractionStrategy._validate_schema(
|
||||||
|
schema, orig_html, schema_type,
|
||||||
|
expected_fields=expected_fields,
|
||||||
|
)
|
||||||
|
if best_result is None or vr["populated_fields"] > best_result["populated_fields"]:
|
||||||
|
best_result = vr
|
||||||
|
if vr["success"]:
|
||||||
|
break
|
||||||
|
|
||||||
|
if best_result["success"]:
|
||||||
|
return schema
|
||||||
|
|
||||||
|
# Last attempt — return best-effort
|
||||||
|
if attempt >= max_attempts - 1:
|
||||||
|
return schema
|
||||||
|
|
||||||
|
# Detect repeated schema
|
||||||
|
current_json = json.dumps(schema, sort_keys=True)
|
||||||
|
is_repeated = current_json == prev_schema_json
|
||||||
|
prev_schema_json = current_json
|
||||||
|
|
||||||
|
# Build feedback and extend conversation
|
||||||
|
feedback = JsonElementExtractionStrategy._build_feedback_message(
|
||||||
|
best_result, schema, attempt + 1, is_repeated
|
||||||
|
)
|
||||||
|
messages.append({"role": "assistant", "content": raw})
|
||||||
|
messages.append({"role": "user", "content": feedback})
|
||||||
|
|
||||||
|
# Should not reach here, but return last schema as safety net
|
||||||
|
if last_schema is not None:
|
||||||
|
return last_schema
|
||||||
|
raise Exception("Failed to generate schema: no attempts succeeded")
|
||||||
|
|
||||||
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
|
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
|
||||||
"""
|
"""
|
||||||
Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.
|
Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.
|
||||||
@@ -1641,6 +2046,21 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
|
|||||||
def _get_element_attribute(self, element, attribute: str):
|
def _get_element_attribute(self, element, attribute: str):
|
||||||
return element.get(attribute)
|
return element.get(attribute)
|
||||||
|
|
||||||
|
def _resolve_source(self, element, source: str):
|
||||||
|
source = source.strip()
|
||||||
|
if not source.startswith("+"):
|
||||||
|
return None
|
||||||
|
sel = source[1:].strip() # e.g. "tr", "tr.subtext", ".classname"
|
||||||
|
parts = sel.split(".")
|
||||||
|
tag = parts[0].strip() or None
|
||||||
|
classes = [p.strip() for p in parts[1:] if p.strip()]
|
||||||
|
kwargs = {}
|
||||||
|
if classes:
|
||||||
|
kwargs["class_"] = lambda c, _cls=classes: c and all(
|
||||||
|
cl in c for cl in _cls
|
||||||
|
)
|
||||||
|
return element.find_next_sibling(tag, **kwargs)
|
||||||
|
|
||||||
class JsonLxmlExtractionStrategy(JsonElementExtractionStrategy):
|
class JsonLxmlExtractionStrategy(JsonElementExtractionStrategy):
|
||||||
def __init__(self, schema: Dict[str, Any], **kwargs):
|
def __init__(self, schema: Dict[str, Any], **kwargs):
|
||||||
kwargs["input_format"] = "html"
|
kwargs["input_format"] = "html"
|
||||||
@@ -1907,6 +2327,21 @@ class JsonLxmlExtractionStrategy(JsonElementExtractionStrategy):
|
|||||||
print(f"Error getting attribute '{attribute}': {e}")
|
print(f"Error getting attribute '{attribute}': {e}")
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
def _resolve_source(self, element, source: str):
|
||||||
|
source = source.strip()
|
||||||
|
if not source.startswith("+"):
|
||||||
|
return None
|
||||||
|
sel = source[1:].strip()
|
||||||
|
parts = sel.split(".")
|
||||||
|
tag = parts[0].strip() or "*"
|
||||||
|
classes = [p.strip() for p in parts[1:] if p.strip()]
|
||||||
|
xpath = f"./following-sibling::{tag}"
|
||||||
|
for cls in classes:
|
||||||
|
xpath += f"[contains(concat(' ',normalize-space(@class),' '),' {cls} ')]"
|
||||||
|
xpath += "[1]"
|
||||||
|
results = element.xpath(xpath)
|
||||||
|
return results[0] if results else None
|
||||||
|
|
||||||
def _clear_caches(self):
|
def _clear_caches(self):
|
||||||
"""Clear caches to free memory"""
|
"""Clear caches to free memory"""
|
||||||
if self.use_caching:
|
if self.use_caching:
|
||||||
@@ -2009,6 +2444,21 @@ class JsonLxmlExtractionStrategy_naive(JsonElementExtractionStrategy):
|
|||||||
def _get_element_attribute(self, element, attribute: str):
|
def _get_element_attribute(self, element, attribute: str):
|
||||||
return element.get(attribute)
|
return element.get(attribute)
|
||||||
|
|
||||||
|
def _resolve_source(self, element, source: str):
|
||||||
|
source = source.strip()
|
||||||
|
if not source.startswith("+"):
|
||||||
|
return None
|
||||||
|
sel = source[1:].strip()
|
||||||
|
parts = sel.split(".")
|
||||||
|
tag = parts[0].strip() or "*"
|
||||||
|
classes = [p.strip() for p in parts[1:] if p.strip()]
|
||||||
|
xpath = f"./following-sibling::{tag}"
|
||||||
|
for cls in classes:
|
||||||
|
xpath += f"[contains(concat(' ',normalize-space(@class),' '),' {cls} ')]"
|
||||||
|
xpath += "[1]"
|
||||||
|
results = element.xpath(xpath)
|
||||||
|
return results[0] if results else None
|
||||||
|
|
||||||
class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
|
class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
|
||||||
"""
|
"""
|
||||||
Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors.
|
Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors.
|
||||||
@@ -2073,6 +2523,21 @@ class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
|
|||||||
def _get_element_attribute(self, element, attribute: str):
|
def _get_element_attribute(self, element, attribute: str):
|
||||||
return element.get(attribute)
|
return element.get(attribute)
|
||||||
|
|
||||||
|
def _resolve_source(self, element, source: str):
|
||||||
|
source = source.strip()
|
||||||
|
if not source.startswith("+"):
|
||||||
|
return None
|
||||||
|
sel = source[1:].strip()
|
||||||
|
parts = sel.split(".")
|
||||||
|
tag = parts[0].strip() or "*"
|
||||||
|
classes = [p.strip() for p in parts[1:] if p.strip()]
|
||||||
|
xpath = f"./following-sibling::{tag}"
|
||||||
|
for cls in classes:
|
||||||
|
xpath += f"[contains(concat(' ',normalize-space(@class),' '),' {cls} ')]"
|
||||||
|
xpath += "[1]"
|
||||||
|
results = element.xpath(xpath)
|
||||||
|
return results[0] if results else None
|
||||||
|
|
||||||
"""
|
"""
|
||||||
RegexExtractionStrategy
|
RegexExtractionStrategy
|
||||||
Fast, zero-LLM extraction of common entities via regular expressions.
|
Fast, zero-LLM extraction of common entities via regular expressions.
|
||||||
|
|||||||
@@ -298,6 +298,7 @@ Your output must always be a JSON object with this structure:
|
|||||||
"attribute": "attribute_name", // Optional
|
"attribute": "attribute_name", // Optional
|
||||||
"transform": "transformation_type", // Optional
|
"transform": "transformation_type", // Optional
|
||||||
"pattern": "regex_pattern", // Optional
|
"pattern": "regex_pattern", // Optional
|
||||||
|
"source": "+ sibling_selector", // Optional — navigate to sibling element first
|
||||||
"fields": [] // For nested/list types
|
"fields": [] // For nested/list types
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@@ -312,16 +313,26 @@ Available field types:
|
|||||||
- list: Array of similar items
|
- list: Array of similar items
|
||||||
- regex: Pattern-based extraction
|
- regex: Pattern-based extraction
|
||||||
|
|
||||||
|
Optional field keys:
|
||||||
|
- source: Navigate to a sibling element before running the selector.
|
||||||
|
Syntax: "+ <css_selector>" — finds the next sibling matching the selector.
|
||||||
|
Example: "source": "+ tr" finds the next sibling <tr> of the base element.
|
||||||
|
Example: "source": "+ tr.subtext" finds the next sibling <tr> with class "subtext".
|
||||||
|
The field's selector then runs inside the resolved sibling element.
|
||||||
|
Use this when a logical item's data is split across sibling elements (e.g. table rows).
|
||||||
|
|
||||||
CRITICAL - How selectors work at each level:
|
CRITICAL - How selectors work at each level:
|
||||||
- baseSelector runs against the FULL document and returns all matching elements.
|
- baseSelector runs against the FULL document and returns all matching elements.
|
||||||
- Field selectors run INSIDE each base element (descendants only, not siblings).
|
- Field selectors run INSIDE each base element (descendants only, not siblings).
|
||||||
- This means a field selector will NEVER match sibling elements of the base element.
|
- This means a field selector will NEVER match sibling elements of the base element.
|
||||||
|
- To reach sibling data, use the "source" key to navigate to the sibling first.
|
||||||
- Therefore: NEVER use the same (or equivalent) selector as baseSelector in a field.
|
- Therefore: NEVER use the same (or equivalent) selector as baseSelector in a field.
|
||||||
It would search for the element inside itself, which returns nothing for flat/sibling layouts.
|
It would search for the element inside itself, which returns nothing for flat/sibling layouts.
|
||||||
|
|
||||||
When repeating items are siblings (e.g. table rows, flat divs):
|
When repeating items are siblings (e.g. table rows, flat divs):
|
||||||
- CORRECT: Use baseSelector to match each item, then use flat fields (text/attribute) to extract data directly from within each item.
|
- CORRECT: Use baseSelector to match each item, then use flat fields (text/attribute) to extract data directly from within each item.
|
||||||
- WRONG: Using baseSelector as a "list" field selector inside itself — this produces empty arrays.
|
- WRONG: Using baseSelector as a "list" field selector inside itself — this produces empty arrays.
|
||||||
|
- For data in sibling elements: Use "source" to navigate to the sibling, then extract from there.
|
||||||
</type_definitions>
|
</type_definitions>
|
||||||
|
|
||||||
<behavior_rules>
|
<behavior_rules>
|
||||||
@@ -651,6 +662,37 @@ CORRECT Schema (flat fields directly on base element):
|
|||||||
{"name": "link", "selector": ".title a", "type": "attribute", "attribute": "href"}
|
{"name": "link", "selector": ".title a", "type": "attribute", "attribute": "href"}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
8. Sibling Data Example (data split across sibling elements):
|
||||||
|
<html>
|
||||||
|
<table>
|
||||||
|
<tr class="athing submission">
|
||||||
|
<td class="title"><span class="rank">1.</span></td>
|
||||||
|
<td><span class="titleline"><a href="https://example.com">Example Title</a></span></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="2"></td>
|
||||||
|
<td class="subtext">
|
||||||
|
<span class="score">100 points</span>
|
||||||
|
<a class="hnuser">johndoe</a>
|
||||||
|
<a>50 comments</a>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
</html>
|
||||||
|
|
||||||
|
Generated Schema (using "source" to reach sibling row):
|
||||||
|
{
|
||||||
|
"name": "HN Submissions",
|
||||||
|
"baseSelector": "tr.athing.submission",
|
||||||
|
"fields": [
|
||||||
|
{"name": "rank", "selector": "span.rank", "type": "text"},
|
||||||
|
{"name": "title", "selector": "span.titleline a", "type": "text"},
|
||||||
|
{"name": "url", "selector": "span.titleline a", "type": "attribute", "attribute": "href"},
|
||||||
|
{"name": "score", "selector": "span.score", "type": "text", "source": "+ tr"},
|
||||||
|
{"name": "author", "selector": "a.hnuser", "type": "text", "source": "+ tr"}
|
||||||
|
]
|
||||||
|
}
|
||||||
</examples>
|
</examples>
|
||||||
|
|
||||||
|
|
||||||
@@ -719,6 +761,7 @@ Your output must always be a JSON object with this structure:
|
|||||||
"attribute": "attribute_name", // Optional
|
"attribute": "attribute_name", // Optional
|
||||||
"transform": "transformation_type", // Optional
|
"transform": "transformation_type", // Optional
|
||||||
"pattern": "regex_pattern", // Optional
|
"pattern": "regex_pattern", // Optional
|
||||||
|
"source": "+ sibling_selector", // Optional — navigate to sibling element first
|
||||||
"fields": [] // For nested/list types
|
"fields": [] // For nested/list types
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
@@ -733,16 +776,26 @@ Available field types:
|
|||||||
- list: Array of similar items
|
- list: Array of similar items
|
||||||
- regex: Pattern-based extraction
|
- regex: Pattern-based extraction
|
||||||
|
|
||||||
|
Optional field keys:
|
||||||
|
- source: Navigate to a sibling element before running the selector.
|
||||||
|
Syntax: "+ <selector>" — finds the next sibling matching the selector.
|
||||||
|
Example: "source": "+ tr" finds the next sibling <tr> of the base element.
|
||||||
|
Example: "source": "+ tr.subtext" finds the next sibling <tr> with class "subtext".
|
||||||
|
The field's selector then runs inside the resolved sibling element.
|
||||||
|
Use this when a logical item's data is split across sibling elements (e.g. table rows).
|
||||||
|
|
||||||
CRITICAL - How selectors work at each level:
|
CRITICAL - How selectors work at each level:
|
||||||
- baseSelector runs against the FULL document and returns all matching elements.
|
- baseSelector runs against the FULL document and returns all matching elements.
|
||||||
- Field selectors run INSIDE each base element (descendants only, not siblings).
|
- Field selectors run INSIDE each base element (descendants only, not siblings).
|
||||||
- This means a field selector will NEVER match sibling elements of the base element.
|
- This means a field selector will NEVER match sibling elements of the base element.
|
||||||
|
- To reach sibling data, use the "source" key to navigate to the sibling first.
|
||||||
- Therefore: NEVER use the same (or equivalent) selector as baseSelector in a field.
|
- Therefore: NEVER use the same (or equivalent) selector as baseSelector in a field.
|
||||||
It would search for the element inside itself, which returns nothing for flat/sibling layouts.
|
It would search for the element inside itself, which returns nothing for flat/sibling layouts.
|
||||||
|
|
||||||
When repeating items are siblings (e.g. table rows, flat divs):
|
When repeating items are siblings (e.g. table rows, flat divs):
|
||||||
- CORRECT: Use baseSelector to match each item, then use flat fields (text/attribute) to extract data directly from within each item.
|
- CORRECT: Use baseSelector to match each item, then use flat fields (text/attribute) to extract data directly from within each item.
|
||||||
- WRONG: Using baseSelector as a "list" field selector inside itself — this produces empty arrays.
|
- WRONG: Using baseSelector as a "list" field selector inside itself — this produces empty arrays.
|
||||||
|
- For data in sibling elements: Use "source" to navigate to the sibling, then extract from there.
|
||||||
</type_definitions>
|
</type_definitions>
|
||||||
|
|
||||||
<behavior_rules>
|
<behavior_rules>
|
||||||
@@ -1072,6 +1125,37 @@ CORRECT Schema (flat fields directly on base element):
|
|||||||
{"name": "link", "selector": ".//td[@class='title']/a", "type": "attribute", "attribute": "href"}
|
{"name": "link", "selector": ".//td[@class='title']/a", "type": "attribute", "attribute": "href"}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|
||||||
|
8. Sibling Data Example (data split across sibling elements):
|
||||||
|
<html>
|
||||||
|
<table>
|
||||||
|
<tr class="athing submission">
|
||||||
|
<td class="title"><span class="rank">1.</span></td>
|
||||||
|
<td><span class="titleline"><a href="https://example.com">Example Title</a></span></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="2"></td>
|
||||||
|
<td class="subtext">
|
||||||
|
<span class="score">100 points</span>
|
||||||
|
<a class="hnuser">johndoe</a>
|
||||||
|
<a>50 comments</a>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
</table>
|
||||||
|
</html>
|
||||||
|
|
||||||
|
Generated Schema (using "source" to reach sibling row):
|
||||||
|
{
|
||||||
|
"name": "HN Submissions",
|
||||||
|
"baseSelector": "//tr[contains(@class, 'athing') and contains(@class, 'submission')]",
|
||||||
|
"fields": [
|
||||||
|
{"name": "rank", "selector": ".//span[@class='rank']", "type": "text"},
|
||||||
|
{"name": "title", "selector": ".//span[@class='titleline']/a", "type": "text"},
|
||||||
|
{"name": "url", "selector": ".//span[@class='titleline']/a", "type": "attribute", "attribute": "href"},
|
||||||
|
{"name": "score", "selector": ".//span[@class='score']", "type": "text", "source": "+ tr"},
|
||||||
|
{"name": "author", "selector": ".//a[@class='hnuser']", "type": "text", "source": "+ tr"}
|
||||||
|
]
|
||||||
|
}
|
||||||
</examples>
|
</examples>
|
||||||
|
|
||||||
<output_requirements>
|
<output_requirements>
|
||||||
|
|||||||
@@ -1748,6 +1748,7 @@ def perform_completion_with_backoff(
|
|||||||
base_delay=2,
|
base_delay=2,
|
||||||
max_attempts=3,
|
max_attempts=3,
|
||||||
exponential_factor=2,
|
exponential_factor=2,
|
||||||
|
messages=None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
@@ -1789,7 +1790,7 @@ def perform_completion_with_backoff(
|
|||||||
try:
|
try:
|
||||||
response = completion(
|
response = completion(
|
||||||
model=provider,
|
model=provider,
|
||||||
messages=[{"role": "user", "content": prompt_with_variables}],
|
messages=messages if messages is not None else [{"role": "user", "content": prompt_with_variables}],
|
||||||
**extra_args,
|
**extra_args,
|
||||||
)
|
)
|
||||||
return response # Return the successful response
|
return response # Return the successful response
|
||||||
@@ -1839,6 +1840,7 @@ async def aperform_completion_with_backoff(
|
|||||||
base_delay=2,
|
base_delay=2,
|
||||||
max_attempts=3,
|
max_attempts=3,
|
||||||
exponential_factor=2,
|
exponential_factor=2,
|
||||||
|
messages=None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
@@ -1881,7 +1883,7 @@ async def aperform_completion_with_backoff(
|
|||||||
try:
|
try:
|
||||||
response = await acompletion(
|
response = await acompletion(
|
||||||
model=provider,
|
model=provider,
|
||||||
messages=[{"role": "user", "content": prompt_with_variables}],
|
messages=messages if messages is not None else [{"role": "user", "content": prompt_with_variables}],
|
||||||
**extra_args,
|
**extra_args,
|
||||||
)
|
)
|
||||||
return response # Return the successful response
|
return response # Return the successful response
|
||||||
|
|||||||
@@ -120,7 +120,8 @@ schema = {
|
|||||||
"attribute": str, # For type="attribute"
|
"attribute": str, # For type="attribute"
|
||||||
"pattern": str, # For type="regex"
|
"pattern": str, # For type="regex"
|
||||||
"transform": str, # Optional: "lowercase", "uppercase", "strip"
|
"transform": str, # Optional: "lowercase", "uppercase", "strip"
|
||||||
"default": Any # Default value if extraction fails
|
"default": Any, # Default value if extraction fails
|
||||||
|
"source": str, # Optional: navigate to sibling first, e.g. "+ tr"
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -232,6 +232,7 @@ if __name__ == "__main__":
|
|||||||
- Great for repetitive page structures (e.g., item listings, articles).
|
- Great for repetitive page structures (e.g., item listings, articles).
|
||||||
- No AI usage or costs.
|
- No AI usage or costs.
|
||||||
- The crawler returns a JSON string you can parse or store.
|
- The crawler returns a JSON string you can parse or store.
|
||||||
|
- For sites where data is split across sibling elements (e.g. Hacker News), use the `"source"` field key to navigate to a sibling before extracting: `{"name": "score", "selector": "span.score", "type": "text", "source": "+ tr"}`.
|
||||||
> Tips: You can pass raw HTML to the crawler instead of a URL. To do so, prefix the HTML with `raw://`.
|
> Tips: You can pass raw HTML to the crawler instead of a URL. To do so, prefix the HTML with `raw://`.
|
||||||
## 6. Simple Data Extraction (LLM-based)
|
## 6. Simple Data Extraction (LLM-based)
|
||||||
- **Open-Source Models** (e.g., `ollama/llama3.3`, `no_token`)
|
- **Open-Source Models** (e.g., `ollama/llama3.3`, `no_token`)
|
||||||
|
|||||||
@@ -95,6 +95,7 @@ asyncio.run(extract_crypto_prices())
|
|||||||
- **`baseSelector`**: Tells us where each "item" (crypto row) is.
|
- **`baseSelector`**: Tells us where each "item" (crypto row) is.
|
||||||
- **`fields`**: Two fields (`coin_name`, `price`) using simple CSS selectors.
|
- **`fields`**: Two fields (`coin_name`, `price`) using simple CSS selectors.
|
||||||
- Each field defines a **`type`** (e.g., `text`, `attribute`, `html`, `regex`, etc.).
|
- Each field defines a **`type`** (e.g., `text`, `attribute`, `html`, `regex`, etc.).
|
||||||
|
- Optional keys: **`transform`**, **`default`**, **`attribute`**, **`pattern`**, and **`source`** (for sibling data — see [Extracting Sibling Data](#sibling-data)).
|
||||||
|
|
||||||
No LLM is needed, and the performance is **near-instant** for hundreds or thousands of items.
|
No LLM is needed, and the performance is **near-instant** for hundreds or thousands of items.
|
||||||
|
|
||||||
@@ -623,7 +624,60 @@ Then run with `JsonCssExtractionStrategy(schema)` to get an array of blog post o
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 8. Tips & Best Practices
|
## 8. Extracting Sibling Data with `source` {#sibling-data}
|
||||||
|
|
||||||
|
Some websites split a single logical item across **sibling elements** rather than nesting everything inside one container. A classic example is Hacker News, where each submission spans two adjacent `<tr>` rows:
|
||||||
|
|
||||||
|
```html
|
||||||
|
<tr class="athing submission"> <!-- rank, title, url -->
|
||||||
|
<td><span class="rank">1.</span></td>
|
||||||
|
<td><span class="titleline"><a href="https://example.com">Example Title</a></span></td>
|
||||||
|
</tr>
|
||||||
|
<tr> <!-- score, author, comments (sibling!) -->
|
||||||
|
<td class="subtext">
|
||||||
|
<span class="score">100 points</span>
|
||||||
|
<a class="hnuser">johndoe</a>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
```
|
||||||
|
|
||||||
|
Normally, field selectors only search **descendants** of the base element — siblings are unreachable. The `source` field key solves this by navigating to a sibling element before running the selector.
|
||||||
|
|
||||||
|
### Syntax
|
||||||
|
|
||||||
|
```
|
||||||
|
"source": "+ <selector>"
|
||||||
|
```
|
||||||
|
|
||||||
|
- **`+ tr`** — next sibling `<tr>`
|
||||||
|
- **`+ div.details`** — next sibling `<div>` with class `details`
|
||||||
|
- **`+ .subtext`** — next sibling with class `subtext`
|
||||||
|
|
||||||
|
### Example: Hacker News
|
||||||
|
|
||||||
|
```python
|
||||||
|
schema = {
|
||||||
|
"name": "HN Submissions",
|
||||||
|
"baseSelector": "tr.athing.submission",
|
||||||
|
"fields": [
|
||||||
|
{"name": "rank", "selector": "span.rank", "type": "text"},
|
||||||
|
{"name": "title", "selector": "span.titleline a", "type": "text"},
|
||||||
|
{"name": "url", "selector": "span.titleline a", "type": "attribute", "attribute": "href"},
|
||||||
|
{"name": "score", "selector": "span.score", "type": "text", "source": "+ tr"},
|
||||||
|
{"name": "author", "selector": "a.hnuser", "type": "text", "source": "+ tr"},
|
||||||
|
],
|
||||||
|
}
|
||||||
|
|
||||||
|
strategy = JsonCssExtractionStrategy(schema)
|
||||||
|
```
|
||||||
|
|
||||||
|
The `score` and `author` fields first navigate to the next sibling `<tr>`, then run their selectors inside that element. Fields without `source` work as before — searching descendants of the base element.
|
||||||
|
|
||||||
|
`source` works with all field types (`text`, `attribute`, `nested`, `list`, etc.) and with both `JsonCssExtractionStrategy` and `JsonXPathExtractionStrategy`. If the sibling isn't found, the field returns its `default` value.
|
||||||
|
|
||||||
|
---
|
||||||
|
|
||||||
|
## 9. Tips & Best Practices
|
||||||
|
|
||||||
1. **Inspect the DOM** in Chrome DevTools or Firefox's Inspector to find stable selectors.
|
1. **Inspect the DOM** in Chrome DevTools or Firefox's Inspector to find stable selectors.
|
||||||
2. **Start Simple**: Verify you can extract a single field. Then add complexity like nested objects or lists.
|
2. **Start Simple**: Verify you can extract a single field. Then add complexity like nested objects or lists.
|
||||||
@@ -636,7 +690,7 @@ Then run with `JsonCssExtractionStrategy(schema)` to get an array of blog post o
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 9. Schema Generation Utility
|
## 10. Schema Generation Utility
|
||||||
|
|
||||||
While manually crafting schemas is powerful and precise, Crawl4AI now offers a convenient utility to **automatically generate** extraction schemas using LLM. This is particularly useful when:
|
While manually crafting schemas is powerful and precise, Crawl4AI now offers a convenient utility to **automatically generate** extraction schemas using LLM. This is particularly useful when:
|
||||||
|
|
||||||
@@ -684,6 +738,29 @@ xpath_schema = JsonXPathExtractionStrategy.generate_schema(
|
|||||||
strategy = JsonCssExtractionStrategy(css_schema)
|
strategy = JsonCssExtractionStrategy(css_schema)
|
||||||
```
|
```
|
||||||
|
|
||||||
|
### Schema Validation
|
||||||
|
|
||||||
|
By default, `generate_schema` **validates** the generated schema against the HTML to ensure that it actually extracts the data you expect. If the schema doesn't produce results, it automatically refines the selectors before returning.
|
||||||
|
|
||||||
|
You can control this with the `validate` parameter:
|
||||||
|
|
||||||
|
```python
|
||||||
|
# Default: validated (recommended)
|
||||||
|
schema = JsonCssExtractionStrategy.generate_schema(
|
||||||
|
url="https://news.ycombinator.com",
|
||||||
|
query="Extract each story: title, url, score, author",
|
||||||
|
)
|
||||||
|
|
||||||
|
# Skip validation if you want raw LLM output
|
||||||
|
schema = JsonCssExtractionStrategy.generate_schema(
|
||||||
|
url="https://news.ycombinator.com",
|
||||||
|
query="Extract each story: title, url, score, author",
|
||||||
|
validate=False,
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
|
The generator also understands sibling layouts — for sites like Hacker News where data is split across sibling elements, it will automatically use the [`source` field](#sibling-data) to reach sibling data.
|
||||||
|
|
||||||
### LLM Provider Options
|
### LLM Provider Options
|
||||||
|
|
||||||
1. **OpenAI GPT-4 (`openai/gpt4o`)**
|
1. **OpenAI GPT-4 (`openai/gpt4o`)**
|
||||||
@@ -814,7 +891,7 @@ This approach lets you generate schemas once that work reliably across hundreds
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## 10. Conclusion
|
## 11. Conclusion
|
||||||
|
|
||||||
With Crawl4AI's LLM-free extraction strategies - `JsonCssExtractionStrategy`, `JsonXPathExtractionStrategy`, and now `RegexExtractionStrategy` - you can build powerful pipelines that:
|
With Crawl4AI's LLM-free extraction strategies - `JsonCssExtractionStrategy`, `JsonXPathExtractionStrategy`, and now `RegexExtractionStrategy` - you can build powerful pipelines that:
|
||||||
|
|
||||||
|
|||||||
396
tests/test_source_sibling_selector.py
Normal file
396
tests/test_source_sibling_selector.py
Normal file
@@ -0,0 +1,396 @@
|
|||||||
|
"""Tests for the `source` (sibling selector) support in JSON extraction strategies."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
from crawl4ai.extraction_strategy import (
|
||||||
|
JsonCssExtractionStrategy,
|
||||||
|
JsonXPathExtractionStrategy,
|
||||||
|
)
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Shared HTML fixture — mimics Hacker News sibling-row layout
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
HN_HTML = """\
|
||||||
|
<html><body><table>
|
||||||
|
<tr class="athing submission" id="1">
|
||||||
|
<td class="title"><span class="rank">1.</span></td>
|
||||||
|
<td><span class="titleline"><a href="https://example.com/a">Alpha</a></span></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="2"></td>
|
||||||
|
<td class="subtext">
|
||||||
|
<span class="score">100 points</span>
|
||||||
|
<a class="hnuser">alice</a>
|
||||||
|
<span class="age">2 hours ago</span>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr class="spacer"></tr>
|
||||||
|
|
||||||
|
<tr class="athing submission" id="2">
|
||||||
|
<td class="title"><span class="rank">2.</span></td>
|
||||||
|
<td><span class="titleline"><a href="https://example.com/b">Beta</a></span></td>
|
||||||
|
</tr>
|
||||||
|
<tr>
|
||||||
|
<td colspan="2"></td>
|
||||||
|
<td class="subtext">
|
||||||
|
<span class="score">42 points</span>
|
||||||
|
<a class="hnuser">bob</a>
|
||||||
|
<span class="age">5 hours ago</span>
|
||||||
|
</td>
|
||||||
|
</tr>
|
||||||
|
<tr class="spacer"></tr>
|
||||||
|
</table></body></html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# CSS Strategy Tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
class TestCssSourceField:
    """JsonCssExtractionStrategy with source field."""

    def _extract(self, schema):
        # Run extraction directly against the shared HN-style fixture.
        return JsonCssExtractionStrategy(schema).extract(None, HN_HTML)

    def test_basic_source_extraction(self):
        """Fields with source='+ tr' should extract data from the next sibling row."""
        schema = {
            "name": "HN",
            "baseSelector": "tr.athing.submission",
            "fields": [
                {"name": "rank", "selector": "span.rank", "type": "text"},
                {"name": "title", "selector": "span.titleline a", "type": "text"},
                {"name": "url", "selector": "span.titleline a", "type": "attribute", "attribute": "href"},
                {"name": "score", "selector": "span.score", "type": "text", "source": "+ tr"},
                {"name": "author", "selector": "a.hnuser", "type": "text", "source": "+ tr"},
            ],
        }
        rows = self._extract(schema)
        assert len(rows) == 2

        first, second = rows
        # Descendant fields come from the base row itself.
        assert first["rank"] == "1."
        assert first["title"] == "Alpha"
        assert first["url"] == "https://example.com/a"
        # Sibling fields come from the next <tr>.
        assert first["score"] == "100 points"
        assert first["author"] == "alice"

        assert second["rank"] == "2."
        assert second["title"] == "Beta"
        assert second["score"] == "42 points"
        assert second["author"] == "bob"

    def test_backward_compat_no_source(self):
        """Schema without source key should work exactly as before."""
        schema = {
            "name": "HN titles only",
            "baseSelector": "tr.athing.submission",
            "fields": [
                {"name": "title", "selector": "span.titleline a", "type": "text"},
            ],
        }
        rows = self._extract(schema)
        assert len(rows) == 2
        assert rows[0]["title"] == "Alpha"
        assert rows[1]["title"] == "Beta"

    def test_source_missing_sibling_returns_default(self):
        """When source points to a non-existent sibling, field returns its default."""
        schema = {
            "name": "HN",
            "baseSelector": "tr.athing.submission",
            "fields": [
                {"name": "title", "selector": "span.titleline a", "type": "text"},
                {
                    "name": "missing",
                    "selector": "span.nope",
                    "type": "text",
                    "source": "+ div.nonexistent",
                    "default": "N/A",
                },
            ],
        }
        rows = self._extract(schema)
        assert len(rows) == 2
        assert rows[0]["missing"] == "N/A"

    def test_source_with_class_filter(self):
        """source='+ tr.spacer' should skip the subtext row and match the spacer."""
        schema = {
            "name": "HN spacer",
            "baseSelector": "tr.athing.submission",
            "fields": [
                {"name": "title", "selector": "span.titleline a", "type": "text"},
                # "+ tr.spacer" skips the immediate (classless) subtext <tr>
                # and resolves the first sibling with class "spacer".  The
                # spacer row has no span.score, so the default applies.
                {
                    "name": "score_from_spacer",
                    "selector": "span.score",
                    "type": "text",
                    "source": "+ tr.spacer",
                    "default": "none",
                },
            ],
        }
        rows = self._extract(schema)
        assert rows[0]["score_from_spacer"] == "none"

    def test_source_on_attribute_field(self):
        """source should work with attribute field type."""
        schema = {
            "name": "HN",
            "baseSelector": "tr.athing.submission",
            "fields": [
                {
                    "name": "author_href",
                    "selector": "a.hnuser",
                    "type": "attribute",
                    "attribute": "href",
                    "source": "+ tr",
                    "default": "no-href",
                },
            ],
        }
        rows = self._extract(schema)
        assert len(rows) == 2
        # The <a class="hnuser"> in the fixture carries no href, so the
        # attribute lookup falls back to the declared default.
        assert rows[0]["author_href"] == "no-href"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# XPath Strategy Tests
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
class TestXPathSourceField:
    """JsonXPathExtractionStrategy with source field."""

    # Shared base selector matching each HN submission row.
    BASE = "//tr[contains(@class, 'athing') and contains(@class, 'submission')]"

    def _extract(self, schema):
        # Run extraction directly against the shared HN-style fixture.
        return JsonXPathExtractionStrategy(schema).extract(None, HN_HTML)

    def test_basic_source_extraction(self):
        """Fields with source='+ tr' should extract data from the next sibling row."""
        schema = {
            "name": "HN",
            "baseSelector": self.BASE,
            "fields": [
                {"name": "rank", "selector": ".//span[@class='rank']", "type": "text"},
                {"name": "title", "selector": ".//span[@class='titleline']/a", "type": "text"},
                {"name": "url", "selector": ".//span[@class='titleline']/a", "type": "attribute", "attribute": "href"},
                {"name": "score", "selector": ".//span[@class='score']", "type": "text", "source": "+ tr"},
                {"name": "author", "selector": ".//a[@class='hnuser']", "type": "text", "source": "+ tr"},
            ],
        }
        rows = self._extract(schema)
        assert len(rows) == 2

        first, second = rows
        # Descendant fields come from the base row itself.
        assert first["rank"] == "1."
        assert first["title"] == "Alpha"
        assert first["url"] == "https://example.com/a"
        # Sibling fields come from the next <tr>.
        assert first["score"] == "100 points"
        assert first["author"] == "alice"

        assert second["rank"] == "2."
        assert second["title"] == "Beta"
        assert second["score"] == "42 points"
        assert second["author"] == "bob"

    def test_backward_compat_no_source(self):
        """Schema without source key should work exactly as before."""
        schema = {
            "name": "HN titles only",
            "baseSelector": self.BASE,
            "fields": [
                {"name": "title", "selector": ".//span[@class='titleline']/a", "type": "text"},
            ],
        }
        rows = self._extract(schema)
        assert len(rows) == 2
        assert rows[0]["title"] == "Alpha"
        assert rows[1]["title"] == "Beta"

    def test_source_missing_sibling_returns_default(self):
        """When source points to a non-existent sibling, field returns its default."""
        schema = {
            "name": "HN",
            "baseSelector": self.BASE,
            "fields": [
                {"name": "title", "selector": ".//span[@class='titleline']/a", "type": "text"},
                {
                    "name": "missing",
                    "selector": ".//span",
                    "type": "text",
                    "source": "+ div",
                    "default": "N/A",
                },
            ],
        }
        rows = self._extract(schema)
        assert len(rows) == 2
        assert rows[0]["missing"] == "N/A"

    def test_source_with_class_filter(self):
        """source='+ tr.spacer' should find the sibling with class 'spacer'."""
        schema = {
            "name": "HN spacer",
            "baseSelector": self.BASE,
            "fields": [
                {"name": "title", "selector": ".//span[@class='titleline']/a", "type": "text"},
                # The spacer row carries no score span, so the default applies.
                {
                    "name": "score_from_spacer",
                    "selector": ".//span[@class='score']",
                    "type": "text",
                    "source": "+ tr.spacer",
                    "default": "none",
                },
            ],
        }
        rows = self._extract(schema)
        assert rows[0]["score_from_spacer"] == "none"
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
# Edge case: source on nested/list field types
|
||||||
|
# ---------------------------------------------------------------------------
|
||||||
|
NESTED_SIBLING_HTML = """\
|
||||||
|
<html><body>
|
||||||
|
<div class="item">
|
||||||
|
<span class="name">Item A</span>
|
||||||
|
</div>
|
||||||
|
<div class="details">
|
||||||
|
<span class="price">$10</span>
|
||||||
|
<span class="stock">In Stock</span>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<div class="item">
|
||||||
|
<span class="name">Item B</span>
|
||||||
|
</div>
|
||||||
|
<div class="details">
|
||||||
|
<span class="price">$20</span>
|
||||||
|
<span class="stock">Out of Stock</span>
|
||||||
|
</div>
|
||||||
|
</body></html>
|
||||||
|
"""
|
||||||
|
|
||||||
|
|
||||||
|
class TestCssSourceNested:
    """Test source with nested field types (CSS)."""

    def test_source_on_nested_field(self):
        """source should work with nested field type — element swap before dispatch."""
        schema = {
            "name": "Items",
            "baseSelector": "div.item",
            "fields": [
                {"name": "name", "selector": "span.name", "type": "text"},
                {
                    "name": "info",
                    "type": "nested",
                    "selector": "div.details",
                    "source": "+ div.details",
                    "fields": [
                        {"name": "price", "selector": "span.price", "type": "text"},
                        {"name": "stock", "selector": "span.stock", "type": "text"},
                    ],
                },
            ],
        }
        strategy = JsonCssExtractionStrategy(schema)
        results = strategy.extract(None, NESTED_SIBLING_HTML)
        assert len(results) == 2
        # NOTE(review): after source resolution the current element *is* the
        # sibling div.details, and a nested selector of "div.details" searches
        # descendants only — so the inner lookup presumably finds nothing here.
        # We deliberately assert only the row count; the flat-field variant in
        # the next test is the supported way to pull data out of a sibling.

    def test_source_on_flat_fields_from_sibling(self):
        """source on individual fields targeting data in sibling div."""
        schema = {
            "name": "Items",
            "baseSelector": "div.item",
            "fields": [
                {"name": "name", "selector": "span.name", "type": "text"},
                {"name": "price", "selector": "span.price", "type": "text", "source": "+ div.details"},
                {"name": "stock", "selector": "span.stock", "type": "text", "source": "+ div.details"},
            ],
        }
        strategy = JsonCssExtractionStrategy(schema)
        results = strategy.extract(None, NESTED_SIBLING_HTML)
        assert len(results) == 2
        assert results[0]["name"] == "Item A"
        assert results[0]["price"] == "$10"
        assert results[0]["stock"] == "In Stock"
        assert results[1]["name"] == "Item B"
        assert results[1]["price"] == "$20"
        assert results[1]["stock"] == "Out of Stock"
|
||||||
|
|
||||||
|
|
||||||
|
class TestXPathSourceNested:
    """XPath variant: sibling-source resolution feeding flat fields."""

    def test_source_on_flat_fields_from_sibling(self):
        """Each field's XPath selector runs inside the '+ div.details' sibling."""
        sibling = "+ div.details"
        schema = {
            "name": "Items",
            "baseSelector": "//div[@class='item']",
            "fields": [
                {"name": "name", "selector": ".//span[@class='name']", "type": "text"},
                {"name": "price", "selector": ".//span[@class='price']", "type": "text", "source": sibling},
                {"name": "stock", "selector": ".//span[@class='stock']", "type": "text", "source": sibling},
            ],
        }
        results = JsonXPathExtractionStrategy(schema).extract(None, NESTED_SIBLING_HTML)
        expected = [
            {"name": "Item A", "price": "$10", "stock": "In Stock"},
            {"name": "Item B", "price": "$20", "stock": "Out of Stock"},
        ]
        assert len(results) == len(expected)
        # Compare every extracted row against the expected record for that item.
        for row, want in zip(results, expected):
            for key, value in want.items():
                assert row[key] == value
|
||||||
|
|
||||||
|
|
||||||
|
# ---------------------------------------------------------------------------
# Test invalid source syntax (no "+") returns None gracefully
# ---------------------------------------------------------------------------
|
||||||
|
class TestInvalidSourceSyntax:
    """A source value without the leading '+' cannot be resolved as a sibling,
    so the field must silently fall back to its default value."""

    @staticmethod
    def _schema(base_selector, field_selector):
        # Shared schema shape for both strategies; only the selectors differ.
        return {
            "name": "test",
            "baseSelector": base_selector,
            "fields": [
                {
                    "name": "bad",
                    "selector": field_selector,
                    "type": "text",
                    "source": "tr",  # Missing "+" prefix
                    "default": "fallback",
                },
            ],
        }

    def test_css_invalid_source_returns_default(self):
        strategy = JsonCssExtractionStrategy(
            self._schema("tr.athing.submission", "span.score")
        )
        results = strategy.extract(None, HN_HTML)
        assert results[0]["bad"] == "fallback"

    def test_xpath_invalid_source_returns_default(self):
        strategy = JsonXPathExtractionStrategy(
            self._schema("//tr[contains(@class, 'athing')]", ".//span[@class='score']")
        )
        results = strategy.extract(None, HN_HTML)
        assert results[0]["bad"] == "fallback"
|
||||||
Reference in New Issue
Block a user