From 43e09da694d5ce9797dfc6c15185e76990ab813d Mon Sep 17 00:00:00 2001
From: UncleCode <unclecode@kidocode.com>
Date: Wed, 12 Feb 2025 21:59:19 +0800
Subject: [PATCH] refactor(crawler): remove content filter functionality

Remove content-filter-related code and parameters to simplify the crawler configuration. This includes:
- Removing the RelevantContentFilter import from async_configs.py
- Removing the content_filter parameter from CrawlerRunConfig and from the parameter docs
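- Cleaning up LLMExtractionStrategy constructor parameters: api_base, base_url, input_format, and verbose are now explicit keyword parameters instead of being read from **kwargs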

BREAKING CHANGE: Removed the content_filter parameter from CrawlerRunConfig. Users should migrate to extraction strategies for content filtering.
---
 crawl4ai/async_configs.py       |  7 -------
 crawl4ai/extraction_strategy.py | 22 ++++++++++++----------
 docs/md_v2/api/parameters.md    |  1 -
 3 files changed, 12 insertions(+), 18 deletions(-)

diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index b60cc468..0981c1d2 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -11,7 +11,6 @@ from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator
 from .extraction_strategy import ExtractionStrategy
 from .chunking_strategy import ChunkingStrategy, RegexChunking
 from .markdown_generation_strategy import MarkdownGenerationStrategy
-from .content_filter_strategy import RelevantContentFilter # , BM25ContentFilter, LLMContentFilter, PruningContentFilter
 from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
 from .deep_crawling import DeepCrawlStrategy
 from typing import Union, List
@@ -387,8 +386,6 @@ class CrawlerRunConfig():
                                               Default: RegexChunking().
         markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown.
                                                          Default: None.
-        content_filter (RelevantContentFilter or None): Optional filter to prune irrelevant content.
-                                                        Default: None.
         only_text (bool): If True, attempt to extract text-only content where applicable.
                           Default: False.
         css_selector (str or None): CSS selector to extract a specific portion of the page.
@@ -532,7 +529,6 @@ class CrawlerRunConfig():
         extraction_strategy: ExtractionStrategy = None,
         chunking_strategy: ChunkingStrategy = RegexChunking(),
         markdown_generator: MarkdownGenerationStrategy = None,
-        content_filter : RelevantContentFilter = None,
         only_text: bool = False,
         css_selector: str = None,
         excluded_tags: list = None,
@@ -611,7 +607,6 @@ class CrawlerRunConfig():
         self.extraction_strategy = extraction_strategy
         self.chunking_strategy = chunking_strategy
         self.markdown_generator = markdown_generator
-        self.content_filter = content_filter
         self.only_text = only_text
         self.css_selector = css_selector
         self.excluded_tags = excluded_tags or []
@@ -723,7 +718,6 @@ class CrawlerRunConfig():
             extraction_strategy=kwargs.get("extraction_strategy"),
             chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
             markdown_generator=kwargs.get("markdown_generator"),
-            content_filter=kwargs.get("content_filter"),
             only_text=kwargs.get("only_text", False),
             css_selector=kwargs.get("css_selector"),
             excluded_tags=kwargs.get("excluded_tags", []),
@@ -821,7 +815,6 @@ class CrawlerRunConfig():
             "extraction_strategy": self.extraction_strategy,
             "chunking_strategy": self.chunking_strategy,
             "markdown_generator": self.markdown_generator,
-            "content_filter": self.content_filter,
             "only_text": self.only_text,
             "css_selector": self.css_selector,
             "excluded_tags": self.excluded_tags,
diff --git a/crawl4ai/extraction_strategy.py b/crawl4ai/extraction_strategy.py
index 90b726bc..f7abab17 100644
--- a/crawl4ai/extraction_strategy.py
+++ b/crawl4ai/extraction_strategy.py
@@ -508,6 +508,10 @@ class LLMExtractionStrategy(ExtractionStrategy):
         overlap_rate=OVERLAP_RATE,
         word_token_rate=WORD_TOKEN_RATE,
         apply_chunking=True,
+        api_base: str =None,
+        base_url: str =None,
+        input_format: str = "markdown",
+        verbose=False,
         **kwargs,
     ):
         """
@@ -531,7 +535,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
             total_usage: Accumulated token usage.
 
         """
-        super().__init__(**kwargs)
+        super().__init__( input_format=input_format, **kwargs)
         self.provider = provider
         if api_token and not api_token.startswith("env:"):
             self.api_token = api_token
@@ -548,19 +552,17 @@ class LLMExtractionStrategy(ExtractionStrategy):
         if schema:
             self.extract_type = "schema"
 
-        self.chunk_token_threshold = kwargs.get(
-            "chunk_token_threshold", CHUNK_TOKEN_THRESHOLD
-        )
-        self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
-        self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
-        self.apply_chunking = kwargs.get("apply_chunking", True)
-        self.base_url = kwargs.get("base_url", None)
-        self.api_base = kwargs.get("api_base", kwargs.get("base_url", None))
+        self.chunk_token_threshold = chunk_token_threshold or CHUNK_TOKEN_THRESHOLD
+        self.overlap_rate = overlap_rate
+        self.word_token_rate = word_token_rate
+        self.apply_chunking = apply_chunking
+        self.base_url = base_url
+        self.api_base = api_base or base_url
         self.extra_args = kwargs.get("extra_args", {})
         if not self.apply_chunking:
             self.chunk_token_threshold = 1e9
 
-        self.verbose = kwargs.get("verbose", False)
+        self.verbose = verbose
         self.usages = []  # Store individual usages
         self.total_usage = TokenUsage()  # Accumulated usage
 
diff --git a/docs/md_v2/api/parameters.md b/docs/md_v2/api/parameters.md
index 0b994fd6..9db3d767 100644
--- a/docs/md_v2/api/parameters.md
+++ b/docs/md_v2/api/parameters.md
@@ -71,7 +71,6 @@ We group them by category.
 | **`word_count_threshold`**   | `int` (default: ~200)                | Skips text blocks below X words. Helps ignore trivial sections.                                 |
 | **`extraction_strategy`**    | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.).                                  |
 | **`markdown_generator`**     | `MarkdownGenerationStrategy` (None)  | If you want specialized markdown output (citations, filtering, chunking, etc.).                 |
-| **`content_filter`**         | `RelevantContentFilter` (None)       | Filters out irrelevant text blocks. E.g., `PruningContentFilter` or `BM25ContentFilter`.        |
 | **`css_selector`**           | `str` (None)                         | Retains only the part of the page matching this selector.                                       |
 | **`excluded_tags`**          | `list` (None)                        | Removes entire tags (e.g. `["script", "style"]`).                                               |
 | **`excluded_selector`**      | `str` (None)                         | Like `css_selector` but to exclude. E.g. `"#ads, .tracker"`.                                    |