refactor(crawler): remove content filter functionality
Remove content-filter-related code and parameters as part of simplifying the crawler configuration. This includes:
- Removing the ContentFilter import and related classes
- Removing the content_filter parameter from CrawlerRunConfig
- Cleaning up LLMExtractionStrategy constructor parameters

BREAKING CHANGE: Removed the content_filter parameter from CrawlerRunConfig. Users should migrate to using extraction strategies for content filtering.
This commit is contained in:
@@ -11,7 +11,6 @@ from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator
|
|||||||
from .extraction_strategy import ExtractionStrategy
|
from .extraction_strategy import ExtractionStrategy
|
||||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||||
from .content_filter_strategy import RelevantContentFilter # , BM25ContentFilter, LLMContentFilter, PruningContentFilter
|
|
||||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
|
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
|
||||||
from .deep_crawling import DeepCrawlStrategy
|
from .deep_crawling import DeepCrawlStrategy
|
||||||
from typing import Union, List
|
from typing import Union, List
|
||||||
@@ -387,8 +386,6 @@ class CrawlerRunConfig():
|
|||||||
Default: RegexChunking().
|
Default: RegexChunking().
|
||||||
markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown.
|
markdown_generator (MarkdownGenerationStrategy): Strategy for generating markdown.
|
||||||
Default: None.
|
Default: None.
|
||||||
content_filter (RelevantContentFilter or None): Optional filter to prune irrelevant content.
|
|
||||||
Default: None.
|
|
||||||
only_text (bool): If True, attempt to extract text-only content where applicable.
|
only_text (bool): If True, attempt to extract text-only content where applicable.
|
||||||
Default: False.
|
Default: False.
|
||||||
css_selector (str or None): CSS selector to extract a specific portion of the page.
|
css_selector (str or None): CSS selector to extract a specific portion of the page.
|
||||||
@@ -532,7 +529,6 @@ class CrawlerRunConfig():
|
|||||||
extraction_strategy: ExtractionStrategy = None,
|
extraction_strategy: ExtractionStrategy = None,
|
||||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||||
markdown_generator: MarkdownGenerationStrategy = None,
|
markdown_generator: MarkdownGenerationStrategy = None,
|
||||||
content_filter : RelevantContentFilter = None,
|
|
||||||
only_text: bool = False,
|
only_text: bool = False,
|
||||||
css_selector: str = None,
|
css_selector: str = None,
|
||||||
excluded_tags: list = None,
|
excluded_tags: list = None,
|
||||||
@@ -611,7 +607,6 @@ class CrawlerRunConfig():
|
|||||||
self.extraction_strategy = extraction_strategy
|
self.extraction_strategy = extraction_strategy
|
||||||
self.chunking_strategy = chunking_strategy
|
self.chunking_strategy = chunking_strategy
|
||||||
self.markdown_generator = markdown_generator
|
self.markdown_generator = markdown_generator
|
||||||
self.content_filter = content_filter
|
|
||||||
self.only_text = only_text
|
self.only_text = only_text
|
||||||
self.css_selector = css_selector
|
self.css_selector = css_selector
|
||||||
self.excluded_tags = excluded_tags or []
|
self.excluded_tags = excluded_tags or []
|
||||||
@@ -723,7 +718,6 @@ class CrawlerRunConfig():
|
|||||||
extraction_strategy=kwargs.get("extraction_strategy"),
|
extraction_strategy=kwargs.get("extraction_strategy"),
|
||||||
chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
|
chunking_strategy=kwargs.get("chunking_strategy", RegexChunking()),
|
||||||
markdown_generator=kwargs.get("markdown_generator"),
|
markdown_generator=kwargs.get("markdown_generator"),
|
||||||
content_filter=kwargs.get("content_filter"),
|
|
||||||
only_text=kwargs.get("only_text", False),
|
only_text=kwargs.get("only_text", False),
|
||||||
css_selector=kwargs.get("css_selector"),
|
css_selector=kwargs.get("css_selector"),
|
||||||
excluded_tags=kwargs.get("excluded_tags", []),
|
excluded_tags=kwargs.get("excluded_tags", []),
|
||||||
@@ -821,7 +815,6 @@ class CrawlerRunConfig():
|
|||||||
"extraction_strategy": self.extraction_strategy,
|
"extraction_strategy": self.extraction_strategy,
|
||||||
"chunking_strategy": self.chunking_strategy,
|
"chunking_strategy": self.chunking_strategy,
|
||||||
"markdown_generator": self.markdown_generator,
|
"markdown_generator": self.markdown_generator,
|
||||||
"content_filter": self.content_filter,
|
|
||||||
"only_text": self.only_text,
|
"only_text": self.only_text,
|
||||||
"css_selector": self.css_selector,
|
"css_selector": self.css_selector,
|
||||||
"excluded_tags": self.excluded_tags,
|
"excluded_tags": self.excluded_tags,
|
||||||
|
|||||||
@@ -508,6 +508,10 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
overlap_rate=OVERLAP_RATE,
|
overlap_rate=OVERLAP_RATE,
|
||||||
word_token_rate=WORD_TOKEN_RATE,
|
word_token_rate=WORD_TOKEN_RATE,
|
||||||
apply_chunking=True,
|
apply_chunking=True,
|
||||||
|
api_base: str =None,
|
||||||
|
base_url: str =None,
|
||||||
|
input_format: str = "markdown",
|
||||||
|
verbose=False,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
@@ -531,7 +535,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
total_usage: Accumulated token usage.
|
total_usage: Accumulated token usage.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
super().__init__(**kwargs)
|
super().__init__( input_format=input_format, **kwargs)
|
||||||
self.provider = provider
|
self.provider = provider
|
||||||
if api_token and not api_token.startswith("env:"):
|
if api_token and not api_token.startswith("env:"):
|
||||||
self.api_token = api_token
|
self.api_token = api_token
|
||||||
@@ -548,19 +552,17 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
|||||||
if schema:
|
if schema:
|
||||||
self.extract_type = "schema"
|
self.extract_type = "schema"
|
||||||
|
|
||||||
self.chunk_token_threshold = kwargs.get(
|
self.chunk_token_threshold = chunk_token_threshold or CHUNK_TOKEN_THRESHOLD
|
||||||
"chunk_token_threshold", CHUNK_TOKEN_THRESHOLD
|
self.overlap_rate = overlap_rate
|
||||||
)
|
self.word_token_rate = word_token_rate
|
||||||
self.overlap_rate = kwargs.get("overlap_rate", OVERLAP_RATE)
|
self.apply_chunking = apply_chunking
|
||||||
self.word_token_rate = kwargs.get("word_token_rate", WORD_TOKEN_RATE)
|
self.base_url = base_url
|
||||||
self.apply_chunking = kwargs.get("apply_chunking", True)
|
self.api_base = api_base or base_url
|
||||||
self.base_url = kwargs.get("base_url", None)
|
|
||||||
self.api_base = kwargs.get("api_base", kwargs.get("base_url", None))
|
|
||||||
self.extra_args = kwargs.get("extra_args", {})
|
self.extra_args = kwargs.get("extra_args", {})
|
||||||
if not self.apply_chunking:
|
if not self.apply_chunking:
|
||||||
self.chunk_token_threshold = 1e9
|
self.chunk_token_threshold = 1e9
|
||||||
|
|
||||||
self.verbose = kwargs.get("verbose", False)
|
self.verbose = verbose
|
||||||
self.usages = [] # Store individual usages
|
self.usages = [] # Store individual usages
|
||||||
self.total_usage = TokenUsage() # Accumulated usage
|
self.total_usage = TokenUsage() # Accumulated usage
|
||||||
|
|
||||||
|
|||||||
@@ -71,7 +71,6 @@ We group them by category.
|
|||||||
| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
|
| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
|
||||||
| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
|
| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
|
||||||
| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). |
|
| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). |
|
||||||
| **`content_filter`** | `RelevantContentFilter` (None) | Filters out irrelevant text blocks. E.g., `PruningContentFilter` or `BM25ContentFilter`. |
|
|
||||||
| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. |
|
| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. |
|
||||||
| **`excluded_tags`** | `list` (None) | Removes entire tags (e.g. `["script", "style"]`). |
|
| **`excluded_tags`** | `list` (None) | Removes entire tags (e.g. `["script", "style"]`). |
|
||||||
| **`excluded_selector`** | `str` (None) | Like `css_selector` but to exclude. E.g. `"#ads, .tracker"`. |
|
| **`excluded_selector`** | `str` (None) | Like `css_selector` but to exclude. E.g. `"#ads, .tracker"`. |
|
||||||
|
|||||||
Reference in New Issue
Block a user