feat(content): add target_elements parameter for selective content extraction

Adds new target_elements parameter to CrawlerRunConfig that allows more flexible content selection than css_selector. This enables focusing markdown generation and data extraction on specific elements while still processing the entire page for links and media.

Key changes:
- Added target_elements list parameter to CrawlerRunConfig
- Modified WebScrapingStrategy and LXMLWebScrapingStrategy to handle target_elements
- Updated documentation with examples and comparison between css_selector and target_elements
- Fixed table extraction in content_scraping_strategy.py

BREAKING CHANGE: Table extraction logic has been modified to better handle thead/tbody structures
This commit is contained in:
UncleCode
2025-03-10 18:54:51 +08:00
parent 9d69fce834
commit 9547bada3a
7 changed files with 188 additions and 47 deletions

View File

@@ -11,7 +11,7 @@ from .config import (
) )
from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator
from .extraction_strategy import ExtractionStrategy from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import MarkdownGenerationStrategy from .markdown_generation_strategy import MarkdownGenerationStrategy
@@ -501,6 +501,15 @@ class CrawlerRunConfig():
Default: False. Default: False.
css_selector (str or None): CSS selector to extract a specific portion of the page. css_selector (str or None): CSS selector to extract a specific portion of the page.
Default: None. Default: None.
target_elements (list of str or None): List of CSS selectors for specific elements for Markdown generation
and structured data extraction. When you set this, only the contents
of these elements are processed for extraction and Markdown generation.
If you do not set any value, the entire page is processed.
The difference between this and css_selector is that css_selector will shrink
the initial raw HTML down to the selected element, while target_elements will
only affect the extraction and Markdown generation.
Default: None
excluded_tags (list of str or None): List of HTML tags to exclude from processing. excluded_tags (list of str or None): List of HTML tags to exclude from processing.
Default: None. Default: None.
excluded_selector (str or None): CSS selector to exclude from processing. excluded_selector (str or None): CSS selector to exclude from processing.
@@ -652,6 +661,7 @@ class CrawlerRunConfig():
markdown_generator: MarkdownGenerationStrategy = None, markdown_generator: MarkdownGenerationStrategy = None,
only_text: bool = False, only_text: bool = False,
css_selector: str = None, css_selector: str = None,
target_elements: List[str] = None,
excluded_tags: list = None, excluded_tags: list = None,
excluded_selector: str = None, excluded_selector: str = None,
keep_data_attributes: bool = False, keep_data_attributes: bool = False,
@@ -732,6 +742,7 @@ class CrawlerRunConfig():
self.markdown_generator = markdown_generator self.markdown_generator = markdown_generator
self.only_text = only_text self.only_text = only_text
self.css_selector = css_selector self.css_selector = css_selector
self.target_elements = target_elements or []
self.excluded_tags = excluded_tags or [] self.excluded_tags = excluded_tags or []
self.excluded_selector = excluded_selector or "" self.excluded_selector = excluded_selector or ""
self.keep_data_attributes = keep_data_attributes self.keep_data_attributes = keep_data_attributes
@@ -862,6 +873,7 @@ class CrawlerRunConfig():
markdown_generator=kwargs.get("markdown_generator"), markdown_generator=kwargs.get("markdown_generator"),
only_text=kwargs.get("only_text", False), only_text=kwargs.get("only_text", False),
css_selector=kwargs.get("css_selector"), css_selector=kwargs.get("css_selector"),
target_elements=kwargs.get("target_elements", []),
excluded_tags=kwargs.get("excluded_tags", []), excluded_tags=kwargs.get("excluded_tags", []),
excluded_selector=kwargs.get("excluded_selector", ""), excluded_selector=kwargs.get("excluded_selector", ""),
keep_data_attributes=kwargs.get("keep_data_attributes", False), keep_data_attributes=kwargs.get("keep_data_attributes", False),
@@ -963,6 +975,7 @@ class CrawlerRunConfig():
"markdown_generator": self.markdown_generator, "markdown_generator": self.markdown_generator,
"only_text": self.only_text, "only_text": self.only_text,
"css_selector": self.css_selector, "css_selector": self.css_selector,
"target_elements": self.target_elements,
"excluded_tags": self.excluded_tags, "excluded_tags": self.excluded_tags,
"excluded_selector": self.excluded_selector, "excluded_selector": self.excluded_selector,
"keep_data_attributes": self.keep_data_attributes, "keep_data_attributes": self.keep_data_attributes,
@@ -1099,3 +1112,5 @@ class LLMConfig:
config_dict = self.to_dict() config_dict = self.to_dict()
config_dict.update(kwargs) config_dict.update(kwargs)
return LLMConfig.from_kwargs(config_dict) return LLMConfig.from_kwargs(config_dict)

View File

@@ -514,7 +514,8 @@ class AsyncWebCrawler:
scraping_strategy.logger = self.logger scraping_strategy.logger = self.logger
# Process HTML content # Process HTML content
params = {k: v for k, v in config.to_dict().items() if k not in ["url"]} params = config.__dict__.copy()
params.pop("url", None)
# add keys from kwargs to params that doesn't exist in params # add keys from kwargs to params that doesn't exist in params
params.update({k: v for k, v in kwargs.items() if k not in params.keys()}) params.update({k: v for k, v in kwargs.items() if k not in params.keys()})

View File

@@ -301,7 +301,21 @@ class WebScrapingStrategy(ContentScrapingStrategy):
# Extract rows with colspan handling # Extract rows with colspan handling
rows = [] rows = []
for row in table.select('tr:not(:has(ancestor::thead))'): all_rows = table.select('tr')
thead = table.select_one('thead')
tbody_rows = []
if thead:
thead_rows = thead.select('tr')
tbody_rows = [row for row in all_rows if row not in thead_rows]
else:
if all_rows and all_rows[0].select('th'):
tbody_rows = all_rows[1:]
else:
tbody_rows = all_rows
for row in tbody_rows:
# for row in table.select('tr:not(:has(ancestor::thead))'):
row_data = [] row_data = []
for cell in row.select('td'): for cell in row.select('td'):
text = cell.get_text().strip() text = cell.get_text().strip()
@@ -822,6 +836,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
html: str, html: str,
word_count_threshold: int = MIN_WORD_THRESHOLD, word_count_threshold: int = MIN_WORD_THRESHOLD,
css_selector: str = None, css_selector: str = None,
target_elements: List[str] = None,
**kwargs, **kwargs,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
""" """
@@ -876,22 +891,37 @@ class WebScrapingStrategy(ContentScrapingStrategy):
for element in body.select(excluded_selector): for element in body.select(excluded_selector):
element.extract() element.extract()
if False and css_selector: # if False and css_selector:
selected_elements = body.select(css_selector) # selected_elements = body.select(css_selector)
if not selected_elements: # if not selected_elements:
return { # return {
"markdown": "", # "markdown": "",
"cleaned_html": "", # "cleaned_html": "",
"success": True, # "success": True,
"media": {"images": [], "videos": [], "audios": []}, # "media": {"images": [], "videos": [], "audios": []},
"links": {"internal": [], "external": []}, # "links": {"internal": [], "external": []},
"metadata": {}, # "metadata": {},
"message": f"No elements found for CSS selector: {css_selector}", # "message": f"No elements found for CSS selector: {css_selector}",
} # }
# raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}") # # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
body = soup.new_tag("div") # body = soup.new_tag("div")
for el in selected_elements: # for el in selected_elements:
body.append(el) # body.append(el)
content_element = None
if target_elements:
try:
for_content_targeted_element = []
for target_element in target_elements:
for_content_targeted_element.extend(body.select(target_element))
content_element = soup.new_tag("div")
for el in for_content_targeted_element:
content_element.append(el)
except Exception as e:
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
return None
else:
content_element = body
kwargs["exclude_social_media_domains"] = set( kwargs["exclude_social_media_domains"] = set(
kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
@@ -951,7 +981,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
str_body = "" str_body = ""
try: try:
str_body = body.encode_contents().decode("utf-8") str_body = content_element.encode_contents().decode("utf-8")
except Exception: except Exception:
# Reset body to the original HTML # Reset body to the original HTML
success = False success = False
@@ -1447,6 +1477,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
html: str, html: str,
word_count_threshold: int = MIN_WORD_THRESHOLD, word_count_threshold: int = MIN_WORD_THRESHOLD,
css_selector: str = None, css_selector: str = None,
target_elements: List[str] = None,
**kwargs, **kwargs,
) -> Dict[str, Any]: ) -> Dict[str, Any]:
if not html: if not html:
@@ -1497,24 +1528,38 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
meta = {} meta = {}
# Handle CSS selector targeting # Handle CSS selector targeting
if css_selector: # if css_selector:
# try:
# selected_elements = body.cssselect(css_selector)
# if not selected_elements:
# return {
# "markdown": "",
# "cleaned_html": "",
# "success": True,
# "media": {"images": [], "videos": [], "audios": []},
# "links": {"internal": [], "external": []},
# "metadata": meta,
# "message": f"No elements found for CSS selector: {css_selector}",
# }
# body = lhtml.Element("div")
# body.extend(selected_elements)
# except Exception as e:
# self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE")
# return None
content_element = None
if target_elements:
try: try:
selected_elements = body.cssselect(css_selector) for_content_targeted_element = []
if not selected_elements: for target_element in target_elements:
return { for_content_targeted_element.extend(body.cssselect(target_element))
"markdown": "", content_element = lhtml.Element("div")
"cleaned_html": "", content_element.extend(for_content_targeted_element)
"success": True,
"media": {"images": [], "videos": [], "audios": []},
"links": {"internal": [], "external": []},
"metadata": meta,
"message": f"No elements found for CSS selector: {css_selector}",
}
body = lhtml.Element("div")
body.extend(selected_elements)
except Exception as e: except Exception as e:
self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE") self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
return None return None
else:
content_element = body
# Remove script and style tags # Remove script and style tags
for tag in ["script", "style", "link", "meta", "noscript"]: for tag in ["script", "style", "link", "meta", "noscript"]:
@@ -1585,7 +1630,8 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
# Generate output HTML # Generate output HTML
cleaned_html = lhtml.tostring( cleaned_html = lhtml.tostring(
body, # body,
content_element,
encoding="unicode", encoding="unicode",
pretty_print=True, pretty_print=True,
method="html", method="html",

View File

@@ -352,7 +352,10 @@ Example:
from crawl4ai import CrawlerRunConfig, PruningContentFilter from crawl4ai import CrawlerRunConfig, PruningContentFilter
config = CrawlerRunConfig( config = CrawlerRunConfig(
content_filter=PruningContentFilter(threshold=0.48) markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed")
),
cache_mode= CacheMode.BYPASS
) )
print(config.dump()) # Use this JSON in your API calls print(config.dump()) # Use this JSON in your API calls
``` ```

View File

@@ -39,7 +39,7 @@ async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
start = time.perf_counter() start = time.perf_counter()
async with AsyncWebCrawler(config=browser_config) as crawler: async with AsyncWebCrawler(config=browser_config) as crawler:
dispatcher = MemoryAdaptiveDispatcher( dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=70.0, memory_threshold_percent=95.0,
max_session_permit=10, max_session_permit=10,
rate_limiter=RateLimiter( rate_limiter=RateLimiter(
base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2 base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2

View File

@@ -71,7 +71,8 @@ We group them by category.
| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. | | **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). | | **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). | | **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). |
| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. | | **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. |
| **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. |
| **`excluded_tags`** | `list` (None) | Removes entire tags (e.g. `["script", "style"]`). | | **`excluded_tags`** | `list` (None) | Removes entire tags (e.g. `["script", "style"]`). |
| **`excluded_selector`** | `str` (None) | Like `css_selector` but to exclude. E.g. `"#ads, .tracker"`. | | **`excluded_selector`** | `str` (None) | Like `css_selector` but to exclude. E.g. `"#ads, .tracker"`. |
| **`only_text`** | `bool` (False) | If `True`, tries to extract text-only content. | | **`only_text`** | `bool` (False) | If `True`, tries to extract text-only content. |

View File

@@ -8,6 +8,10 @@ Below, we show how to configure these parameters and combine them for precise co
## 1. CSS-Based Selection ## 1. CSS-Based Selection
There are two ways to select content from a page: using `css_selector` or the more flexible `target_elements`.
### 1.1 Using `css_selector`
A straightforward way to **limit** your crawl results to a certain region of the page is **`css_selector`** in **`CrawlerRunConfig`**: A straightforward way to **limit** your crawl results to a certain region of the page is **`css_selector`** in **`CrawlerRunConfig`**:
```python ```python
@@ -32,6 +36,33 @@ if __name__ == "__main__":
**Result**: Only elements matching that selector remain in `result.cleaned_html`. **Result**: Only elements matching that selector remain in `result.cleaned_html`.
### 1.2 Using `target_elements`
The `target_elements` parameter provides more flexibility by allowing you to target **multiple elements** for content extraction while preserving the entire page context for other features:
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
async def main():
config = CrawlerRunConfig(
# Target article body and sidebar, but not other content
target_elements=["article.main-content", "aside.sidebar"]
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com/blog-post",
config=config
)
print("Markdown focused on target elements")
print("Links from entire page still available:", len(result.links.get("internal", [])))
if __name__ == "__main__":
asyncio.run(main())
```
**Key difference**: With `target_elements`, the markdown generation and structured data extraction focus on those elements, but other page elements (like links, images, and tables) are still extracted from the entire page. This gives you fine-grained control over what appears in your markdown content while preserving full page context for link analysis and media collection.
--- ---
## 2. Content Filtering & Exclusions ## 2. Content Filtering & Exclusions
@@ -404,15 +435,59 @@ Stick to BeautifulSoup strategy (default) when:
--- ---
## 7. Conclusion ## 7. Combining CSS Selection Methods
By mixing **css_selector** scoping, **content filtering** parameters, and advanced **extraction strategies**, you can precisely **choose** which data to keep. Key parameters in **`CrawlerRunConfig`** for content selection include: You can combine `css_selector` and `target_elements` in powerful ways to achieve fine-grained control over your output:
1. **`css_selector`** Basic scoping to an element or region. ```python
2. **`word_count_threshold`** Skip short blocks. import asyncio
3. **`excluded_tags`** Remove entire HTML tags. from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
4. **`exclude_external_links`**, **`exclude_social_media_links`**, **`exclude_domains`** Filter out unwanted links or domains.
5. **`exclude_external_images`** Remove images from external sources. async def main():
6. **`process_iframes`** Merge iframe content if needed. # Target specific content but preserve page context
config = CrawlerRunConfig(
# Focus markdown on main content and sidebar
target_elements=["#main-content", ".sidebar"],
# Global filters applied to entire page
excluded_tags=["nav", "footer", "header"],
exclude_external_links=True,
# Use basic content thresholds
word_count_threshold=15,
cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com/article",
config=config
)
print(f"Content focuses on specific elements, but all links still analyzed")
print(f"Internal links: {len(result.links.get('internal', []))}")
print(f"External links: {len(result.links.get('external', []))}")
if __name__ == "__main__":
asyncio.run(main())
```
This approach gives you the best of both worlds:
- Markdown generation and content extraction focus on the elements you care about
- Links, images and other page data still give you the full context of the page
- Content filtering still applies globally
## 8. Conclusion
By mixing **target_elements** or **css_selector** scoping, **content filtering** parameters, and advanced **extraction strategies**, you can precisely **choose** which data to keep. Key parameters in **`CrawlerRunConfig`** for content selection include:
1. **`target_elements`** Array of CSS selectors to focus markdown generation and data extraction, while preserving full page context for links and media.
2. **`css_selector`** Basic scoping to an element or region for all extraction processes.
3. **`word_count_threshold`** Skip short blocks.
4. **`excluded_tags`** Remove entire HTML tags.
5. **`exclude_external_links`**, **`exclude_social_media_links`**, **`exclude_domains`** Filter out unwanted links or domains.
6. **`exclude_external_images`** Remove images from external sources.
7. **`process_iframes`** Merge iframe content if needed.
Combine these with structured extraction (CSS, LLM-based, or others) to build powerful crawls that yield exactly the content you want, from raw or cleaned HTML up to sophisticated JSON structures. For more detail, see [Configuration Reference](../api/parameters.md). Enjoy curating your data to the max! Combine these with structured extraction (CSS, LLM-based, or others) to build powerful crawls that yield exactly the content you want, from raw or cleaned HTML up to sophisticated JSON structures. For more detail, see [Configuration Reference](../api/parameters.md). Enjoy curating your data to the max!