From 9547bada3a36dfc64eaf146090c164a3babf1496 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Mon, 10 Mar 2025 18:54:51 +0800 Subject: [PATCH] feat(content): add target_elements parameter for selective content extraction Adds new target_elements parameter to CrawlerRunConfig that allows more flexible content selection than css_selector. This enables focusing markdown generation and data extraction on specific elements while still processing the entire page for links and media. Key changes: - Added target_elements list parameter to CrawlerRunConfig - Modified WebScrapingStrategy and LXMLWebScrapingStrategy to handle target_elements - Updated documentation with examples and comparison between css_selector and target_elements - Fixed table extraction in content_scraping_strategy.py BREAKING CHANGE: Table extraction logic has been modified to better handle thead/tbody structures --- crawl4ai/async_configs.py | 17 +++- crawl4ai/async_webcrawler.py | 3 +- crawl4ai/content_scraping_strategy.py | 114 ++++++++++++++++++-------- deploy/docker/README.md | 5 +- docs/examples/dispatcher_example.py | 2 +- docs/md_v2/api/parameters.md | 3 +- docs/md_v2/core/content-selection.md | 91 ++++++++++++++++++-- 7 files changed, 188 insertions(+), 47 deletions(-) diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index e652057b..937ae4eb 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -11,7 +11,7 @@ from .config import ( ) from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator -from .extraction_strategy import ExtractionStrategy +from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import MarkdownGenerationStrategy @@ -501,6 +501,15 @@ class CrawlerRunConfig(): Default: False. css_selector (str or None): CSS selector to extract a specific portion of the page. Default: None. 
+ + target_elements (list of str or None): List of CSS selectors for specific elements for Markdown generation + and structured data extraction. When you set this, only the contents + of these elements are processed for extraction and Markdown generation. + If you do not set any value, the entire page is processed. + The difference between this and css_selector is that css_selector will shrink + the initial raw HTML to the selected element, while this will only affect + the extraction and Markdown generation. + Default: None excluded_tags (list of str or None): List of HTML tags to exclude from processing. Default: None. excluded_selector (str or None): CSS selector to exclude from processing. @@ -652,6 +661,7 @@ class CrawlerRunConfig(): markdown_generator: MarkdownGenerationStrategy = None, only_text: bool = False, css_selector: str = None, + target_elements: List[str] = None, excluded_tags: list = None, excluded_selector: str = None, keep_data_attributes: bool = False, @@ -732,6 +742,7 @@ class CrawlerRunConfig(): self.markdown_generator = markdown_generator self.only_text = only_text self.css_selector = css_selector + self.target_elements = target_elements or [] self.excluded_tags = excluded_tags or [] self.excluded_selector = excluded_selector or "" self.keep_data_attributes = keep_data_attributes @@ -862,6 +873,7 @@ class CrawlerRunConfig(): markdown_generator=kwargs.get("markdown_generator"), only_text=kwargs.get("only_text", False), css_selector=kwargs.get("css_selector"), + target_elements=kwargs.get("target_elements", []), excluded_tags=kwargs.get("excluded_tags", []), excluded_selector=kwargs.get("excluded_selector", ""), keep_data_attributes=kwargs.get("keep_data_attributes", False), @@ -963,6 +975,7 @@ class CrawlerRunConfig(): "markdown_generator": self.markdown_generator, "only_text": self.only_text, "css_selector": self.css_selector, + "target_elements": self.target_elements, "excluded_tags": self.excluded_tags, "excluded_selector": self.excluded_selector, 
"keep_data_attributes": self.keep_data_attributes, @@ -1099,3 +1112,5 @@ class LLMConfig: config_dict = self.to_dict() config_dict.update(kwargs) return LLMConfig.from_kwargs(config_dict) + + diff --git a/crawl4ai/async_webcrawler.py b/crawl4ai/async_webcrawler.py index dd777a36..430e26a0 100644 --- a/crawl4ai/async_webcrawler.py +++ b/crawl4ai/async_webcrawler.py @@ -514,7 +514,8 @@ class AsyncWebCrawler: scraping_strategy.logger = self.logger # Process HTML content - params = {k: v for k, v in config.to_dict().items() if k not in ["url"]} + params = config.__dict__.copy() + params.pop("url", None) # add keys from kwargs to params that doesn't exist in params params.update({k: v for k, v in kwargs.items() if k not in params.keys()}) diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py index 5ebf8394..a806b045 100644 --- a/crawl4ai/content_scraping_strategy.py +++ b/crawl4ai/content_scraping_strategy.py @@ -301,7 +301,21 @@ class WebScrapingStrategy(ContentScrapingStrategy): # Extract rows with colspan handling rows = [] - for row in table.select('tr:not(:has(ancestor::thead))'): + all_rows = table.select('tr') + thead = table.select_one('thead') + tbody_rows = [] + + if thead: + thead_rows = thead.select('tr') + tbody_rows = [row for row in all_rows if row not in thead_rows] + else: + if all_rows and all_rows[0].select('th'): + tbody_rows = all_rows[1:] + else: + tbody_rows = all_rows + + for row in tbody_rows: + # for row in table.select('tr:not(:has(ancestor::thead))'): row_data = [] for cell in row.select('td'): text = cell.get_text().strip() @@ -822,6 +836,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, + target_elements: List[str] = None, **kwargs, ) -> Dict[str, Any]: """ @@ -876,22 +891,37 @@ class WebScrapingStrategy(ContentScrapingStrategy): for element in body.select(excluded_selector): element.extract() - if False and 
css_selector: - selected_elements = body.select(css_selector) - if not selected_elements: - return { - "markdown": "", - "cleaned_html": "", - "success": True, - "media": {"images": [], "videos": [], "audios": []}, - "links": {"internal": [], "external": []}, - "metadata": {}, - "message": f"No elements found for CSS selector: {css_selector}", - } - # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}") - body = soup.new_tag("div") - for el in selected_elements: - body.append(el) + # if False and css_selector: + # selected_elements = body.select(css_selector) + # if not selected_elements: + # return { + # "markdown": "", + # "cleaned_html": "", + # "success": True, + # "media": {"images": [], "videos": [], "audios": []}, + # "links": {"internal": [], "external": []}, + # "metadata": {}, + # "message": f"No elements found for CSS selector: {css_selector}", + # } + # # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}") + # body = soup.new_tag("div") + # for el in selected_elements: + # body.append(el) + + content_element = None + if target_elements: + try: + for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(body.select(target_element)) + content_element = soup.new_tag("div") + for el in for_content_targeted_element: + content_element.append(el) + except Exception as e: + self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") + return None + else: + content_element = body kwargs["exclude_social_media_domains"] = set( kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS @@ -951,7 +981,7 @@ class WebScrapingStrategy(ContentScrapingStrategy): str_body = "" try: - str_body = body.encode_contents().decode("utf-8") + str_body = content_element.encode_contents().decode("utf-8") except Exception: # Reset body to the original HTML success = False @@ -1447,6 +1477,7 
@@ class LXMLWebScrapingStrategy(WebScrapingStrategy): html: str, word_count_threshold: int = MIN_WORD_THRESHOLD, css_selector: str = None, + target_elements: List[str] = None, **kwargs, ) -> Dict[str, Any]: if not html: @@ -1497,24 +1528,38 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy): meta = {} # Handle CSS selector targeting - if css_selector: + # if css_selector: + # try: + # selected_elements = body.cssselect(css_selector) + # if not selected_elements: + # return { + # "markdown": "", + # "cleaned_html": "", + # "success": True, + # "media": {"images": [], "videos": [], "audios": []}, + # "links": {"internal": [], "external": []}, + # "metadata": meta, + # "message": f"No elements found for CSS selector: {css_selector}", + # } + # body = lhtml.Element("div") + # body.extend(selected_elements) + # except Exception as e: + # self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE") + # return None + + content_element = None + if target_elements: try: - selected_elements = body.cssselect(css_selector) - if not selected_elements: - return { - "markdown": "", - "cleaned_html": "", - "success": True, - "media": {"images": [], "videos": [], "audios": []}, - "links": {"internal": [], "external": []}, - "metadata": meta, - "message": f"No elements found for CSS selector: {css_selector}", - } - body = lhtml.Element("div") - body.extend(selected_elements) + for_content_targeted_element = [] + for target_element in target_elements: + for_content_targeted_element.extend(body.cssselect(target_element)) + content_element = lhtml.Element("div") + content_element.extend(for_content_targeted_element) except Exception as e: - self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE") + self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE") return None + else: + content_element = body # Remove script and style tags for tag in ["script", "style", "link", "meta", "noscript"]: @@ -1585,7 +1630,8 @@ class 
LXMLWebScrapingStrategy(WebScrapingStrategy): # Generate output HTML cleaned_html = lhtml.tostring( - body, + # body, + content_element, encoding="unicode", pretty_print=True, method="html", diff --git a/deploy/docker/README.md b/deploy/docker/README.md index fdcb9744..c4582031 100644 --- a/deploy/docker/README.md +++ b/deploy/docker/README.md @@ -352,7 +352,10 @@ Example: from crawl4ai import CrawlerRunConfig, PruningContentFilter config = CrawlerRunConfig( - content_filter=PruningContentFilter(threshold=0.48) + markdown_generator=DefaultMarkdownGenerator( + content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed") + ), + cache_mode= CacheMode.BYPASS ) print(config.dump()) # Use this JSON in your API calls ``` diff --git a/docs/examples/dispatcher_example.py b/docs/examples/dispatcher_example.py index cac08186..8ac24d3b 100644 --- a/docs/examples/dispatcher_example.py +++ b/docs/examples/dispatcher_example.py @@ -39,7 +39,7 @@ async def memory_adaptive_with_rate_limit(urls, browser_config, run_config): start = time.perf_counter() async with AsyncWebCrawler(config=browser_config) as crawler: dispatcher = MemoryAdaptiveDispatcher( - memory_threshold_percent=70.0, + memory_threshold_percent=95.0, max_session_permit=10, rate_limiter=RateLimiter( base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2 diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index 626f7e92..d352e162 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -71,7 +71,8 @@ We group them by category. | **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. | | **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). | | **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). 
| -| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. | +| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. | +| **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. | | **`excluded_tags`** | `list` (None) | Removes entire tags (e.g. `["script", "style"]`). | | **`excluded_selector`** | `str` (None) | Like `css_selector` but to exclude. E.g. `"#ads, .tracker"`. | | **`only_text`** | `bool` (False) | If `True`, tries to extract text-only content. | diff --git a/docs/md_v2/core/content-selection.md b/docs/md_v2/core/content-selection.md index 21546d09..07c8861b 100644 --- a/docs/md_v2/core/content-selection.md +++ b/docs/md_v2/core/content-selection.md @@ -8,6 +8,10 @@ Below, we show how to configure these parameters and combine them for precise co ## 1. CSS-Based Selection +There are two ways to select content from a page: using `css_selector` or the more flexible `target_elements`. + +### 1.1 Using `css_selector` + A straightforward way to **limit** your crawl results to a certain region of the page is **`css_selector`** in **`CrawlerRunConfig`**: ```python @@ -32,6 +36,33 @@ if __name__ == "__main__": **Result**: Only elements matching that selector remain in `result.cleaned_html`. 
+### 1.2 Using `target_elements` + +The `target_elements` parameter provides more flexibility by allowing you to target **multiple elements** for content extraction while preserving the entire page context for other features: + +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig + +async def main(): + config = CrawlerRunConfig( + # Target article body and sidebar, but not other content + target_elements=["article.main-content", "aside.sidebar"] + ) + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/blog-post", + config=config + ) + print("Markdown focused on target elements") + print("Links from entire page still available:", len(result.links.get("internal", []))) + +if __name__ == "__main__": + asyncio.run(main()) +``` + +**Key difference**: With `target_elements`, the markdown generation and structured data extraction focus on those elements, but other page elements (like links, images, and tables) are still extracted from the entire page. This gives you fine-grained control over what appears in your markdown content while preserving full page context for link analysis and media collection. + --- ## 2. Content Filtering & Exclusions @@ -404,15 +435,59 @@ Stick to BeautifulSoup strategy (default) when: --- -## 7. Conclusion +## 7. Combining CSS Selection Methods -By mixing **css_selector** scoping, **content filtering** parameters, and advanced **extraction strategies**, you can precisely **choose** which data to keep. Key parameters in **`CrawlerRunConfig`** for content selection include: +You can combine `css_selector` and `target_elements` in powerful ways to achieve fine-grained control over your output: -1. **`css_selector`** – Basic scoping to an element or region. -2. **`word_count_threshold`** – Skip short blocks. -3. **`excluded_tags`** – Remove entire HTML tags. -4. 
**`exclude_external_links`**, **`exclude_social_media_links`**, **`exclude_domains`** – Filter out unwanted links or domains. -5. **`exclude_external_images`** – Remove images from external sources. -6. **`process_iframes`** – Merge iframe content if needed. +```python +import asyncio +from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode + +async def main(): + # Target specific content but preserve page context + config = CrawlerRunConfig( + # Focus markdown on main content and sidebar + target_elements=["#main-content", ".sidebar"], + + # Global filters applied to entire page + excluded_tags=["nav", "footer", "header"], + exclude_external_links=True, + + # Use basic content thresholds + word_count_threshold=15, + + cache_mode=CacheMode.BYPASS + ) + + async with AsyncWebCrawler() as crawler: + result = await crawler.arun( + url="https://example.com/article", + config=config + ) + + print(f"Content focuses on specific elements, but all links still analyzed") + print(f"Internal links: {len(result.links.get('internal', []))}") + print(f"External links: {len(result.links.get('external', []))}") + +if __name__ == "__main__": + asyncio.run(main()) +``` + +This approach gives you the best of both worlds: +- Markdown generation and content extraction focus on the elements you care about +- Links, images and other page data still give you the full context of the page +- Content filtering still applies globally + +## 8. Conclusion + +By mixing **target_elements** or **css_selector** scoping, **content filtering** parameters, and advanced **extraction strategies**, you can precisely **choose** which data to keep. Key parameters in **`CrawlerRunConfig`** for content selection include: + +1. **`target_elements`** – Array of CSS selectors to focus markdown generation and data extraction, while preserving full page context for links and media. +2. **`css_selector`** – Basic scoping to an element or region for all extraction processes. +3. 
**`word_count_threshold`** – Skip short blocks. +4. **`excluded_tags`** – Remove entire HTML tags. +5. **`exclude_external_links`**, **`exclude_social_media_links`**, **`exclude_domains`** – Filter out unwanted links or domains. +6. **`exclude_external_images`** – Remove images from external sources. +7. **`process_iframes`** – Merge iframe content if needed. Combine these with structured extraction (CSS, LLM-based, or others) to build powerful crawls that yield exactly the content you want, from raw or cleaned HTML up to sophisticated JSON structures. For more detail, see [Configuration Reference](../api/parameters.md). Enjoy curating your data to the max! \ No newline at end of file