feat(content): add target_elements parameter for selective content extraction

Adds a new target_elements parameter to CrawlerRunConfig that allows more flexible content selection than css_selector. This makes it possible to focus Markdown generation and data extraction on specific elements while still processing the entire page for links and media.

Key changes:
- Added target_elements list parameter to CrawlerRunConfig
- Modified WebScrapingStrategy and LXMLWebScrapingStrategy to handle target_elements
- Updated documentation with examples and comparison between css_selector and target_elements
- Fixed table extraction in content_scraping_strategy.py

BREAKING CHANGE: Table extraction logic has been modified to better handle thead/tbody structures
This commit is contained in:
UncleCode
2025-03-10 18:54:51 +08:00
parent 9d69fce834
commit 9547bada3a
7 changed files with 188 additions and 47 deletions

View File

@@ -11,7 +11,7 @@ from .config import (
)
from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator
from .extraction_strategy import ExtractionStrategy
from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import MarkdownGenerationStrategy
@@ -501,6 +501,15 @@ class CrawlerRunConfig():
Default: False.
css_selector (str or None): CSS selector to extract a specific portion of the page.
Default: None.
target_elements (list of str or None): List of CSS selectors for specific elements for Markdown generation
and structured data extraction. When you set this, only the contents
of these elements are processed for extraction and Markdown generation.
If you do not set any value, the entire page is processed.
The difference between this and css_selector is that css_selector will shrink
the initial raw HTML to the selected element, while target_elements will only
affect the extraction and Markdown generation.
Default: None
excluded_tags (list of str or None): List of HTML tags to exclude from processing.
Default: None.
excluded_selector (str or None): CSS selector to exclude from processing.
@@ -652,6 +661,7 @@ class CrawlerRunConfig():
markdown_generator: MarkdownGenerationStrategy = None,
only_text: bool = False,
css_selector: str = None,
target_elements: List[str] = None,
excluded_tags: list = None,
excluded_selector: str = None,
keep_data_attributes: bool = False,
@@ -732,6 +742,7 @@ class CrawlerRunConfig():
self.markdown_generator = markdown_generator
self.only_text = only_text
self.css_selector = css_selector
self.target_elements = target_elements or []
self.excluded_tags = excluded_tags or []
self.excluded_selector = excluded_selector or ""
self.keep_data_attributes = keep_data_attributes
@@ -862,6 +873,7 @@ class CrawlerRunConfig():
markdown_generator=kwargs.get("markdown_generator"),
only_text=kwargs.get("only_text", False),
css_selector=kwargs.get("css_selector"),
target_elements=kwargs.get("target_elements", []),
excluded_tags=kwargs.get("excluded_tags", []),
excluded_selector=kwargs.get("excluded_selector", ""),
keep_data_attributes=kwargs.get("keep_data_attributes", False),
@@ -963,6 +975,7 @@ class CrawlerRunConfig():
"markdown_generator": self.markdown_generator,
"only_text": self.only_text,
"css_selector": self.css_selector,
"target_elements": self.target_elements,
"excluded_tags": self.excluded_tags,
"excluded_selector": self.excluded_selector,
"keep_data_attributes": self.keep_data_attributes,
@@ -1099,3 +1112,5 @@ class LLMConfig:
config_dict = self.to_dict()
config_dict.update(kwargs)
return LLMConfig.from_kwargs(config_dict)

View File

@@ -514,7 +514,8 @@ class AsyncWebCrawler:
scraping_strategy.logger = self.logger
# Process HTML content
params = {k: v for k, v in config.to_dict().items() if k not in ["url"]}
params = config.__dict__.copy()
params.pop("url", None)
# Add keys from kwargs that do not already exist in params
params.update({k: v for k, v in kwargs.items() if k not in params.keys()})

View File

@@ -301,7 +301,21 @@ class WebScrapingStrategy(ContentScrapingStrategy):
# Extract rows with colspan handling
rows = []
for row in table.select('tr:not(:has(ancestor::thead))'):
all_rows = table.select('tr')
thead = table.select_one('thead')
tbody_rows = []
if thead:
thead_rows = thead.select('tr')
tbody_rows = [row for row in all_rows if row not in thead_rows]
else:
if all_rows and all_rows[0].select('th'):
tbody_rows = all_rows[1:]
else:
tbody_rows = all_rows
for row in tbody_rows:
# for row in table.select('tr:not(:has(ancestor::thead))'):
row_data = []
for cell in row.select('td'):
text = cell.get_text().strip()
@@ -822,6 +836,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
html: str,
word_count_threshold: int = MIN_WORD_THRESHOLD,
css_selector: str = None,
target_elements: List[str] = None,
**kwargs,
) -> Dict[str, Any]:
"""
@@ -876,22 +891,37 @@ class WebScrapingStrategy(ContentScrapingStrategy):
for element in body.select(excluded_selector):
element.extract()
if False and css_selector:
selected_elements = body.select(css_selector)
if not selected_elements:
return {
"markdown": "",
"cleaned_html": "",
"success": True,
"media": {"images": [], "videos": [], "audios": []},
"links": {"internal": [], "external": []},
"metadata": {},
"message": f"No elements found for CSS selector: {css_selector}",
}
# raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
body = soup.new_tag("div")
for el in selected_elements:
body.append(el)
# if False and css_selector:
# selected_elements = body.select(css_selector)
# if not selected_elements:
# return {
# "markdown": "",
# "cleaned_html": "",
# "success": True,
# "media": {"images": [], "videos": [], "audios": []},
# "links": {"internal": [], "external": []},
# "metadata": {},
# "message": f"No elements found for CSS selector: {css_selector}",
# }
# # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
# body = soup.new_tag("div")
# for el in selected_elements:
# body.append(el)
content_element = None
if target_elements:
try:
for_content_targeted_element = []
for target_element in target_elements:
for_content_targeted_element.extend(body.select(target_element))
content_element = soup.new_tag("div")
for el in for_content_targeted_element:
content_element.append(el)
except Exception as e:
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
return None
else:
content_element = body
kwargs["exclude_social_media_domains"] = set(
kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
@@ -951,7 +981,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
str_body = ""
try:
str_body = body.encode_contents().decode("utf-8")
str_body = content_element.encode_contents().decode("utf-8")
except Exception:
# Reset body to the original HTML
success = False
@@ -1447,6 +1477,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
html: str,
word_count_threshold: int = MIN_WORD_THRESHOLD,
css_selector: str = None,
target_elements: List[str] = None,
**kwargs,
) -> Dict[str, Any]:
if not html:
@@ -1497,24 +1528,38 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
meta = {}
# Handle CSS selector targeting
if css_selector:
# if css_selector:
# try:
# selected_elements = body.cssselect(css_selector)
# if not selected_elements:
# return {
# "markdown": "",
# "cleaned_html": "",
# "success": True,
# "media": {"images": [], "videos": [], "audios": []},
# "links": {"internal": [], "external": []},
# "metadata": meta,
# "message": f"No elements found for CSS selector: {css_selector}",
# }
# body = lhtml.Element("div")
# body.extend(selected_elements)
# except Exception as e:
# self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE")
# return None
content_element = None
if target_elements:
try:
selected_elements = body.cssselect(css_selector)
if not selected_elements:
return {
"markdown": "",
"cleaned_html": "",
"success": True,
"media": {"images": [], "videos": [], "audios": []},
"links": {"internal": [], "external": []},
"metadata": meta,
"message": f"No elements found for CSS selector: {css_selector}",
}
body = lhtml.Element("div")
body.extend(selected_elements)
for_content_targeted_element = []
for target_element in target_elements:
for_content_targeted_element.extend(body.cssselect(target_element))
content_element = lhtml.Element("div")
content_element.extend(for_content_targeted_element)
except Exception as e:
self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE")
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
return None
else:
content_element = body
# Remove script and style tags
for tag in ["script", "style", "link", "meta", "noscript"]:
@@ -1585,7 +1630,8 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
# Generate output HTML
cleaned_html = lhtml.tostring(
body,
# body,
content_element,
encoding="unicode",
pretty_print=True,
method="html",