refactor(crawler): remove content filter functionality
Remove content filter related code and parameters as part of simplifying the crawler configuration. This includes: - Removing ContentFilter import and related classes - Removing content_filter parameter from CrawlerRunConfig - Cleaning up LLMExtractionStrategy constructor parameters BREAKING CHANGE: Removed content_filter parameter from CrawlerRunConfig. Users should migrate to using extraction strategies for content filtering.
This commit is contained in:
@@ -11,7 +11,6 @@ from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator
|
||||
from .extraction_strategy import ExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||
from .content_filter_strategy import RelevantContentFilter # , BM25ContentFilter, LLMContentFilter, PruningContentFilter
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
|
||||
from .deep_crawling import DeepCrawlStrategy
|
||||
from typing import Union, List
|
||||
@@ -387,8 +386,6 @@ class CrawlerRunConfig():
|
||||
Default: RegexChunking().
|
||||
markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown.
|
||||
Default: None.
|
||||
content_filter (RelevantContentFilter or None): Optional filter to prune irrelevant content.
|
||||
Default: None.
|
||||
only_text (bool): If True, attempt to extract text-only content where applicable.
|
||||
Default: False.
|
||||
css_selector (str or None): CSS selector to extract a specific portion of the page.
|
||||
@@ -532,7 +529,6 @@ class CrawlerRunConfig():
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
markdown_generator: MarkdownGenerationStrategy = None,
|
||||
content_filter : RelevantContentFilter = None,
|
||||
only_text: bool = False,
|
||||
css_selector: str = None,
|
||||
excluded_tags: list = None,
|
||||
@@ -611,7 +607,6 @@ class CrawlerRunConfig():
|
||||
self.extraction_strategy = extraction_strategy
|
||||
self.chunking_strategy = chunking_strategy
|
||||
self.markdown_generator = markdown_generator
|
||||
self.content_filter = content_filter
|
||||
self.only_text = only_text
|
||||
self.css_selector = css_selector
|
||||
self.excluded_tags = excluded_tags or []
|
||||
@@ -723,7 +718,6 @@ class CrawlerRunConfig():
|
||||
extraction_strategy=kwargs.get("extraction_strategy"),
|
||||
chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
|
||||
markdown_generator=kwargs.get("markdown_generator"),
|
||||
content_filter=kwargs.get("content_filter"),
|
||||
only_text=kwargs.get("only_text", False),
|
||||
css_selector=kwargs.get("css_selector"),
|
||||
excluded_tags=kwargs.get("excluded_tags", []),
|
||||
@@ -821,7 +815,6 @@ class CrawlerRunConfig():
|
||||
"extraction_strategy": self.extraction_strategy,
|
||||
"chunking_strategy": self.chunking_strategy,
|
||||
"markdown_generator": self.markdown_generator,
|
||||
"content_filter": self.content_filter,
|
||||
"only_text": self.only_text,
|
||||
"css_selector": self.css_selector,
|
||||
"excluded_tags": self.excluded_tags,
|
||||
|
||||
Reference in New Issue
Block a user