From 43e09da694d5ce9797dfc6c15185e76990ab813d Mon Sep 17 00:00:00 2001 From: UncleCode Date: Wed, 12 Feb 2025 21:59:19 +0800 Subject: [PATCH] refactor(crawler): remove content filter functionality Remove content filter related code and parameters as part of simplifying the crawler configuration. This includes: - Removing ContentFilter import and related classes - Removing content_filter parameter from CrawlerRunConfig - Cleaning up LLMExtractionStrategy constructor parameters BREAKING CHANGE: Removed content_filter parameter from CrawlerRunConfig. Users should migrate to using extraction strategies for content filtering. --- crawl4ai/async_configs.py | 7 ------- crawl4ai/extraction_strategy.py | 22 ++++++++++++---------- docs/md_v2/api/parameters.md | 1 - 3 files changed, 12 insertions(+), 18 deletions(-) diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index b60cc468..0981c1d2 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -11,7 +11,6 @@ from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator from .extraction_strategy import ExtractionStrategy from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import MarkdownGenerationStrategy -from .content_filter_strategy import RelevantContentFilter # , BM25ContentFilter, LLMContentFilter, PruningContentFilter from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy from .deep_crawling import DeepCrawlStrategy from typing import Union, List @@ -387,8 +386,6 @@ class CrawlerRunConfig(): Default: RegexChunking(). markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown. Default: None. - content_filter (RelevantContentFilter or None): Optional filter to prune irrelevant content. - Default: None. only_text (bool): If True, attempt to extract text-only content where applicable. Default: False. css_selector (str or None): CSS selector to extract a specific portion of the page. @@ -532,7 +529,6 @@ class CrawlerRunConfig(): extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), markdown_generator: MarkdownGenerationStrategy = None, - content_filter : RelevantContentFilter = None, only_text: bool = False, css_selector: str = None, excluded_tags: list = None, @@ -611,7 +607,6 @@ class CrawlerRunConfig(): self.extraction_strategy = extraction_strategy self.chunking_strategy = chunking_strategy self.markdown_generator = markdown_generator - self.content_filter = content_filter self.only_text = only_text self.css_selector = css_selector self.excluded_tags = excluded_tags or [] @@ -723,7 +718,6 @@ class CrawlerRunConfig(): extraction_strategy=kwargs.get("extraction_strategy"), chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()), markdown_generator=kwargs.get("markdown_generator"), - content_filter=kwargs.get("content_filter"), only_text=kwargs.get("only_text", False), css_selector=kwargs.get("css_selector"), excluded_tags=kwargs.get("excluded_tags", []), @@ -821,7 +815,6 @@ class CrawlerRunConfig(): "extraction_strategy": self.extraction_strategy, "chunking_strategy": self.chunking_strategy, "markdown_generator": self.markdown_generator, - "content_filter": self.content_filter, "only_text": self.only_text, "css_selector": self.css_selector, "excluded_tags": self.excluded_tags, diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py index 90b726bc..f7abab17 100644 --- a/crawl4ai/extraction_strategy.py +++ b/crawl4ai/extraction_strategy.py @@ -508,6 +508,10 @@ class LLMExtractionStrategy(ExtractionStrategy): overlap_rate=OVERLAP_RATE, word_token_rate=WORD_TOKEN_RATE, apply_chunking=True, + api_base: str =None, + base_url: str =None, + input_format: str = "markdown", + verbose=False, **kwargs, ): """ @@ -531,7 +535,7 @@ class LLMExtractionStrategy(ExtractionStrategy): total_usage: Accumulated token usage. """ - super().__init__(**kwargs) + super().__init__( input_format=input_format, **kwargs) self.provider = provider if api_token and not api_token.startswith("env:"): self.api_token = api_token @@ -548,19 +552,17 @@ class LLMExtractionStrategy(ExtractionStrategy): if schema: self.extract_type = "schema" - self.chunk_token_threshold = kwargs.get( - "chunk_token_threshold", CHUNK_TOKEN_THRESHOLD - ) - self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE) - self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE) - self.apply_chunking = kwargs.get("apply_chunking", True) - self.base_url = kwargs.get("base_url", None) - self.api_base = kwargs.get("api_base", kwargs.get("base_url", None)) + self.chunk_token_threshold = chunk_token_threshold or CHUNK_TOKEN_THRESHOLD + self.overlap_rate = overlap_rate + self.word_token_rate = word_token_rate + self.apply_chunking = apply_chunking + self.base_url = base_url + self.api_base = api_base or base_url self.extra_args = kwargs.get("extra_args", {}) if not self.apply_chunking: self.chunk_token_threshold = 1e9 - self.verbose = kwargs.get("verbose", False) + self.verbose = verbose self.usages = [] # Store individual usages self.total_usage = TokenUsage() # Accumulated usage diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md index 0b994fd6..9db3d767 100644 --- a/docs/md_v2/api/parameters.md +++ b/docs/md_v2/api/parameters.md @@ -71,7 +71,6 @@ We group them by category. | **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. | | **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). | | **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). | -| **`content_filter`** | `RelevantContentFilter` (None) | Filters out irrelevant text blocks. E.g., `PruningContentFilter` or `BM25ContentFilter`. | | **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. | | **`excluded_tags`** | `list` (None) | Removes entire tags (e.g. `["script", "style"]`). | | **`excluded_selector`** | `str` (None) | Like `css_selector` but to exclude. E.g. `"#ads, .tracker"`. |