refactor(crawler): remove content filter functionality

Remove content-filter-related code and parameters to simplify the crawler configuration. This includes:
- Removing the RelevantContentFilter import and related classes
- Removing content_filter parameter from CrawlerRunConfig
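- Cleaning up LLMExtractionStrategy constructor parameters (api_base, base_url, input_format, and verbose are now explicit keyword parameters instead of being read from **kwargs)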

BREAKING CHANGE: Removed content_filter parameter from CrawlerRunConfig. Users should migrate to using extraction strategies for content filtering.
This commit is contained in:
UncleCode
2025-02-12 21:59:19 +08:00
parent 69705df0b3
commit 43e09da694
3 changed files with 12 additions and 18 deletions

View File

@@ -11,7 +11,6 @@ from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator
from .extraction_strategy import ExtractionStrategy from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import MarkdownGenerationStrategy from .markdown_generation_strategy import MarkdownGenerationStrategy
from .content_filter_strategy import RelevantContentFilter # , BM25ContentFilter, LLMContentFilter, PruningContentFilter
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
from .deep_crawling import DeepCrawlStrategy from .deep_crawling import DeepCrawlStrategy
from typing import Union, List from typing import Union, List
@@ -387,8 +386,6 @@ class CrawlerRunConfig():
Default: RegexChunking(). Default: RegexChunking().
markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown. markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown.
Default: None. Default: None.
content_filter (RelevantContentFilter or None): Optional filter to prune irrelevant content.
Default: None.
only_text (bool): If True, attempt to extract text-only content where applicable. only_text (bool): If True, attempt to extract text-only content where applicable.
Default: False. Default: False.
css_selector (str or None): CSS selector to extract a specific portion of the page. css_selector (str or None): CSS selector to extract a specific portion of the page.
@@ -532,7 +529,6 @@ class CrawlerRunConfig():
extraction_strategy: ExtractionStrategy = None, extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(), chunking_strategy: ChunkingStrategy = RegexChunking(),
markdown_generator: MarkdownGenerationStrategy = None, markdown_generator: MarkdownGenerationStrategy = None,
content_filter : RelevantContentFilter = None,
only_text: bool = False, only_text: bool = False,
css_selector: str = None, css_selector: str = None,
excluded_tags: list = None, excluded_tags: list = None,
@@ -611,7 +607,6 @@ class CrawlerRunConfig():
self.extraction_strategy = extraction_strategy self.extraction_strategy = extraction_strategy
self.chunking_strategy = chunking_strategy self.chunking_strategy = chunking_strategy
self.markdown_generator = markdown_generator self.markdown_generator = markdown_generator
self.content_filter = content_filter
self.only_text = only_text self.only_text = only_text
self.css_selector = css_selector self.css_selector = css_selector
self.excluded_tags = excluded_tags or [] self.excluded_tags = excluded_tags or []
@@ -723,7 +718,6 @@ class CrawlerRunConfig():
extraction_strategy=kwargs.get("extraction_strategy"), extraction_strategy=kwargs.get("extraction_strategy"),
chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()), chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
markdown_generator=kwargs.get("markdown_generator"), markdown_generator=kwargs.get("markdown_generator"),
content_filter=kwargs.get("content_filter"),
only_text=kwargs.get("only_text", False), only_text=kwargs.get("only_text", False),
css_selector=kwargs.get("css_selector"), css_selector=kwargs.get("css_selector"),
excluded_tags=kwargs.get("excluded_tags", []), excluded_tags=kwargs.get("excluded_tags", []),
@@ -821,7 +815,6 @@ class CrawlerRunConfig():
"extraction_strategy": self.extraction_strategy, "extraction_strategy": self.extraction_strategy,
"chunking_strategy": self.chunking_strategy, "chunking_strategy": self.chunking_strategy,
"markdown_generator": self.markdown_generator, "markdown_generator": self.markdown_generator,
"content_filter": self.content_filter,
"only_text": self.only_text, "only_text": self.only_text,
"css_selector": self.css_selector, "css_selector": self.css_selector,
"excluded_tags": self.excluded_tags, "excluded_tags": self.excluded_tags,

View File

@@ -508,6 +508,10 @@ class LLMExtractionStrategy(ExtractionStrategy):
overlap_rate=OVERLAP_RATE, overlap_rate=OVERLAP_RATE,
word_token_rate=WORD_TOKEN_RATE, word_token_rate=WORD_TOKEN_RATE,
apply_chunking=True, apply_chunking=True,
api_base: str =None,
base_url: str =None,
input_format: str = "markdown",
verbose=False,
**kwargs, **kwargs,
): ):
""" """
@@ -531,7 +535,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
total_usage: Accumulated token usage. total_usage: Accumulated token usage.
""" """
super().__init__(**kwargs) super().__init__( input_format=input_format, **kwargs)
self.provider = provider self.provider = provider
if api_token and not api_token.startswith("env:"): if api_token and not api_token.startswith("env:"):
self.api_token = api_token self.api_token = api_token
@@ -548,19 +552,17 @@ class LLMExtractionStrategy(ExtractionStrategy):
if schema: if schema:
self.extract_type = "schema" self.extract_type = "schema"
self.chunk_token_threshold = kwargs.get( self.chunk_token_threshold = chunk_token_threshold or CHUNK_TOKEN_THRESHOLD
"chunk_token_threshold", CHUNK_TOKEN_THRESHOLD self.overlap_rate = overlap_rate
) self.word_token_rate = word_token_rate
self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE) self.apply_chunking = apply_chunking
self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE) self.base_url = base_url
self.apply_chunking = kwargs.get("apply_chunking", True) self.api_base = api_base or base_url
self.base_url = kwargs.get("base_url", None)
self.api_base = kwargs.get("api_base", kwargs.get("base_url", None))
self.extra_args = kwargs.get("extra_args", {}) self.extra_args = kwargs.get("extra_args", {})
if not self.apply_chunking: if not self.apply_chunking:
self.chunk_token_threshold = 1e9 self.chunk_token_threshold = 1e9
self.verbose = kwargs.get("verbose", False) self.verbose = verbose
self.usages = [] # Store individual usages self.usages = [] # Store individual usages
self.total_usage = TokenUsage() # Accumulated usage self.total_usage = TokenUsage() # Accumulated usage

View File

@@ -71,7 +71,6 @@ We group them by category.
| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. | | **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). | | **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). | | **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). |
| **`content_filter`** | `RelevantContentFilter` (None) | Filters out irrelevant text blocks. E.g., `PruningContentFilter` or `BM25ContentFilter`. |
| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. | | **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. |
| **`excluded_tags`** | `list` (None) | Removes entire tags (e.g. `["script", "style"]`). | | **`excluded_tags`** | `list` (None) | Removes entire tags (e.g. `["script", "style"]`). |
| **`excluded_selector`** | `str` (None) | Like `css_selector` but to exclude. E.g. `"#ads, .tracker"`. | | **`excluded_selector`** | `str` (None) | Like `css_selector` but to exclude. E.g. `"#ads, .tracker"`. |