Add PruningContentFilter with unit tests and update documentation

- Introduced the PruningContentFilter for better content relevance.
  - Implemented comprehensive unit tests for verification of functionality.
  - Enhanced existing BM25ContentFilter tests for edge case coverage.
  - Updated documentation to include usage examples for new filter.
This commit is contained in:
unclecode
2024-12-01 19:17:33 +08:00
parent 80d58ad24c
commit 293f299c08
9 changed files with 499 additions and 135 deletions

View File

@@ -1,5 +1,55 @@
# Changelog
## [0.3.75] December 1, 2024
### PruningContentFilter
#### 1. Introduced PruningContentFilter (Dec 01, 2024) (Dec 01, 2024)
A new content filtering strategy that removes less relevant nodes based on metrics like text and link density.
**Affected Files:**
- `crawl4ai/content_filter_strategy.py`: Enhancement of content filtering capabilities.
```diff
Implemented effective pruning algorithm with comprehensive scoring.
```
- `README.md`: Improved documentation regarding new features.
```diff
Updated to include usage and explanation for the PruningContentFilter.
```
- `docs/md_v2/basic/content_filtering.md`: Expanded documentation for users.
```diff
Added detailed section explaining the PruningContentFilter.
```
#### 2. Added Unit Tests for PruningContentFilter (Dec 01, 2024) (Dec 01, 2024)
Comprehensive tests added to ensure correct functionality of PruningContentFilter
**Affected Files:**
- `tests/async/test_content_filter_prune.py`: Increased test coverage for content filtering strategies.
```diff
Created test cases for various scenarios using the PruningContentFilter.
```
### Development Updates
#### 3. Enhanced BM25ContentFilter tests (Dec 01, 2024) (Dec 01, 2024)
Extended testing to cover additional edge cases and performance metrics.
**Affected Files:**
- `tests/async/test_content_filter_bm25.py`: Improved reliability and performance assurance.
```diff
Added tests for new extraction scenarios including malformed HTML.
```
### Infrastructure & Documentation
#### 4. Updated Examples (Dec 01, 2024) (Dec 01, 2024)
Altered examples in documentation to promote the use of PruningContentFilter alongside existing strategies.
**Affected Files:**
- `docs/examples/quickstart_async.py`: Enhanced usability and clarity for new users.
- Revised example to illustrate usage of PruningContentFilter.
## [0.3.746] November 29, 2024
### Major Features

View File

@@ -422,7 +422,7 @@ You can check the project structure in the directory [https://github.com/uncleco
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.content_filter_strategy import BM25ContentFilter
from crawl4ai.content_filter_strategy import PruningContentFilter, BM25ContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
async def main():
@@ -434,8 +434,11 @@ async def main():
url="https://docs.micronaut.io/4.7.6/guide/",
cache_mode=CacheMode.ENABLED,
markdown_generator=DefaultMarkdownGenerator(
content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
),
# markdown_generator=DefaultMarkdownGenerator(
# content_filter=BM25ContentFilter(user_query="WHEN_WE_FOCUS_BASED_ON_A_USER_QUERY", bm25_threshold=1.0)
# ),
)
print(len(result.markdown))
print(len(result.fit_markdown))

View File

@@ -4,10 +4,10 @@ from typing import List, Tuple, Dict
from rank_bm25 import BM25Okapi
from time import perf_counter
from collections import deque
from bs4 import BeautifulSoup, NavigableString, Tag
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
from .utils import clean_tokens
from abc import ABC, abstractmethod
import math
from snowballstemmer import stemmer
@@ -358,145 +358,186 @@ class BM25ContentFilter(RelevantContentFilter):
return [self.clean_element(tag) for _, _, tag in selected_candidates]
class HeuristicContentFilter(RelevantContentFilter):
def __init__(self):
super().__init__()
# Weights for different heuristics
self.tag_weights = {
'article': 10,
'main': 8,
'section': 5,
'div': 3,
'p': 2,
'pre': 2,
'code': 2,
'blockquote': 2,
'li': 1,
'span': 1,
}
self.max_depth = 5 # Maximum depth from body to consider
def filter_content(self, html: str) -> List[str]:
"""Implements heuristic content filtering without relying on a query."""
class PruningContentFilter(RelevantContentFilter):
def __init__(self, user_query: str = None, min_word_threshold: int = None,
threshold_type: str = 'fixed', threshold: float = 0.48):
super().__init__(user_query)
self.min_word_threshold = min_word_threshold
self.threshold_type = threshold_type
self.threshold = threshold
# Add tag importance for dynamic threshold
self.tag_importance = {
'article': 1.5,
'main': 1.4,
'section': 1.3,
'p': 1.2,
'h1': 1.4,
'h2': 1.3,
'h3': 1.2,
'div': 0.7,
'span': 0.6
}
# Metric configuration
self.metric_config = {
'text_density': True,
'link_density': True,
'tag_weight': True,
'class_id_weight': True,
'text_length': True,
}
self.metric_weights = {
'text_density': 0.4,
'link_density': 0.2,
'tag_weight': 0.2,
'class_id_weight': 0.1,
'text_length': 0.1,
}
self.tag_weights = {
'div': 0.5,
'p': 1.0,
'article': 1.5,
'section': 1.0,
'span': 0.3,
'li': 0.5,
'ul': 0.5,
'ol': 0.5,
'h1': 1.2,
'h2': 1.1,
'h3': 1.0,
'h4': 0.9,
'h5': 0.8,
'h6': 0.7,
}
def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
if not html or not isinstance(html, str):
return []
soup = BeautifulSoup(html, 'lxml')
# Ensure there is a body tag
if not soup.body:
soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')
body = soup.body
# Remove comments and unwanted tags
self._remove_comments(soup)
self._remove_unwanted_tags(soup)
# Prune tree starting from body
body = soup.find('body')
self._prune_tree(body)
# Extract remaining content as list of HTML strings
content_blocks = []
for element in body.children:
if isinstance(element, str) or not hasattr(element, 'name'):
continue
if len(element.get_text(strip=True)) > 0:
content_blocks.append(str(element))
return content_blocks
# Extract candidate text chunks
candidates = self.extract_text_chunks(body)
def _remove_comments(self, soup):
for element in soup(text=lambda text: isinstance(text, Comment)):
element.extract()
if not candidates:
return []
def _remove_unwanted_tags(self, soup):
for tag in self.excluded_tags:
for element in soup.find_all(tag):
element.decompose()
# Score each candidate
scored_candidates = []
for index, text, tag_type, tag in candidates:
score = self.score_element(tag, text)
if score > 0:
scored_candidates.append((score, index, text, tag))
def _prune_tree(self, node):
if not node or not hasattr(node, 'name') or node.name is None:
return
# Sort candidates by score and then by document order
scored_candidates.sort(key=lambda x: (-x[0], x[1]))
text_len = len(node.get_text(strip=True))
tag_len = len(node.encode_contents().decode('utf-8'))
link_text_len = sum(len(s.strip()) for s in (a.string for a in node.find_all('a', recursive=False)) if s)
# Extract the top candidates (e.g., top 5)
top_candidates = scored_candidates[:5] # Adjust the number as needed
metrics = {
'node': node,
'tag_name': node.name,
'text_len': text_len,
'tag_len': tag_len,
'link_text_len': link_text_len
}
# Sort the top candidates back to their original document order
top_candidates.sort(key=lambda x: x[1])
score = self._compute_composite_score(metrics)
# Clean and return the content
return [self.clean_element(tag) for _, _, _, tag in top_candidates]
if self.threshold_type == 'fixed':
should_remove = score < self.threshold
else: # dynamic
tag_importance = self.tag_importance.get(node.name, 0.7)
text_ratio = text_len / tag_len if tag_len > 0 else 0
link_ratio = link_text_len / text_len if text_len > 0 else 1
threshold = self.threshold # base threshold
if tag_importance > 1:
threshold *= 0.8
if text_ratio > 0.4:
threshold *= 0.9
if link_ratio > 0.6:
threshold *= 1.2
should_remove = score < threshold
def score_element(self, tag: Tag, text: str) -> float:
"""Compute a score for an element based on heuristics."""
if not text or not tag:
return 0
if should_remove:
node.decompose()
else:
children = [child for child in node.children if hasattr(child, 'name')]
for child in children:
self._prune_tree(child)
# Exclude unwanted tags
if self.is_excluded(tag):
return 0
def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len):
if self.min_word_threshold:
# Get raw text from metrics node - avoid extra processing
text = metrics['node'].get_text(strip=True)
word_count = text.count(' ') + 1
if word_count < self.min_word_threshold:
return -1.0 # Guaranteed removal
score = 0.0
total_weight = 0.0
# Text density
text_length = len(text.strip())
html_length = len(str(tag))
text_density = text_length / html_length if html_length > 0 else 0
if self.metric_config['text_density']:
density = text_len / tag_len if tag_len > 0 else 0
score += self.metric_weights['text_density'] * density
total_weight += self.metric_weights['text_density']
# Link density
link_text_length = sum(len(a.get_text().strip()) for a in tag.find_all('a'))
link_density = link_text_length / text_length if text_length > 0 else 0
if self.metric_config['link_density']:
density = 1 - (link_text_len / text_len if text_len > 0 else 0)
score += self.metric_weights['link_density'] * density
total_weight += self.metric_weights['link_density']
# Tag weight
tag_weight = self.tag_weights.get(tag.name, 1)
if self.metric_config['tag_weight']:
tag_score = self.tag_weights.get(metrics['tag_name'], 0.5)
score += self.metric_weights['tag_weight'] * tag_score
total_weight += self.metric_weights['tag_weight']
# Depth factor (prefer elements closer to the body tag)
depth = self.get_depth(tag)
depth_weight = max(self.max_depth - depth, 1) / self.max_depth
if self.metric_config['class_id_weight']:
class_score = self._compute_class_id_weight(metrics['node'])
score += self.metric_weights['class_id_weight'] * max(0, class_score)
total_weight += self.metric_weights['class_id_weight']
# Compute the final score
score = (text_density * tag_weight * depth_weight) / (1 + link_density)
if self.metric_config['text_length']:
score += self.metric_weights['text_length'] * math.log(text_len + 1)
total_weight += self.metric_weights['text_length']
return score
return score / total_weight if total_weight > 0 else 0
def get_depth(self, tag: Tag) -> int:
"""Compute the depth of the tag from the body tag."""
depth = 0
current = tag
while current and current != current.parent and current.name != 'body':
current = current.parent
depth += 1
return depth
def extract_text_chunks(self, body: Tag) -> List[Tuple[int, str, str, Tag]]:
"""
Extracts text chunks from the body element while preserving order.
Returns list of tuples (index, text, tag_type, tag) for scoring.
"""
chunks = []
index = 0
def traverse(element):
nonlocal index
if isinstance(element, NavigableString):
return
if not isinstance(element, Tag):
return
if self.is_excluded(element):
return
# Only consider included tags
if element.name in self.included_tags:
text = element.get_text(separator=' ', strip=True)
if len(text.split()) >= self.min_word_count:
tag_type = 'header' if element.name in self.header_tags else 'content'
chunks.append((index, text, tag_type, element))
index += 1
# Do not traverse children of this element to prevent duplication
return
for child in element.children:
traverse(child)
traverse(body)
return chunks
def is_excluded(self, tag: Tag) -> bool:
"""Determine if a tag should be excluded based on heuristics."""
if tag.name in self.excluded_tags:
return True
class_id = ' '.join(filter(None, [
' '.join(tag.get('class', [])),
tag.get('id', '')
]))
if self.negative_patterns.search(class_id):
return True
# Exclude tags with high link density (e.g., navigation menus)
text = tag.get_text(separator=' ', strip=True)
link_text_length = sum(len(a.get_text(strip=True)) for a in tag.find_all('a'))
text_length = len(text)
if text_length > 0 and (link_text_length / text_length) > 0.5:
return True
return False
def _compute_class_id_weight(self, node):
class_id_score = 0
if 'class' in node.attrs:
classes = ' '.join(node['class'])
if self.negative_patterns.match(classes):
class_id_score -= 0.5
if 'id' in node.attrs:
element_id = node['id']
if self.negative_patterns.match(element_id):
class_id_score -= 0.5
return class_id_score

View File

@@ -9,7 +9,7 @@ from bs4 import element, NavigableString, Comment
from urllib.parse import urljoin
from requests.exceptions import InvalidSchema
# from .content_cleaning_strategy import ContentCleaningStrategy
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, PruningContentFilter
from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
from .models import MarkdownGenerationResult
from .utils import (
@@ -110,10 +110,15 @@ class WebScrapingStrategy(ContentScrapingStrategy):
if markdown_generator:
try:
if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter:
markdown_generator.content_filter = BM25ContentFilter(
user_query=kwargs.get('fit_markdown_user_query', None),
bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
markdown_generator.content_filter = PruningContentFilter(
threshold_type=kwargs.get('fit_markdown_treshold_type', 'fixed'),
threshold=kwargs.get('fit_markdown_treshold', 0.48),
min_word_threshold=kwargs.get('fit_markdown_min_word_threshold', ),
)
# markdown_generator.content_filter = BM25ContentFilter(
# user_query=kwargs.get('fit_markdown_user_query', None),
# bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
# )
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
cleaned_html=cleaned_html,

View File

@@ -15,7 +15,7 @@ from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import BM25ContentFilter
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
from crawl4ai.extraction_strategy import (
JsonCssExtractionStrategy,
LLMExtractionStrategy,
@@ -466,7 +466,8 @@ async def speed_comparison():
url="https://www.nbcnews.com/business",
word_count_threshold=0,
markdown_generator=DefaultMarkdownGenerator(
content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
# content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
),
cache_mode=CacheMode.BYPASS,
verbose=False,
@@ -489,7 +490,8 @@ async def speed_comparison():
word_count_threshold=0,
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
content_filter = PruningContentFilter(threshold=0.48, threshold_type="fixed", min_word_threshold=0)
# content_filter=BM25ContentFilter(user_query=None, bm25_threshold=1.0)
),
verbose=False,
)

View File

@@ -4,7 +4,59 @@ This guide explains how to use content filtering strategies in Crawl4AI to extra
## Relevance Content Filter
The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks.
The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `PruningContentFilter` or `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks.
## Pruning Content Filter
The `PruningContentFilter` is a tree-shaking algorithm that analyzes the HTML DOM structure and removes less relevant nodes based on various metrics like text density, link density, and tag importance. It evaluates each node using a composite scoring system and "prunes" nodes that fall below a certain threshold.
### Usage
```python
from crawl4ai import AsyncWebCrawler
from crawl4ai.content_filter_strategy import PruningContentFilter
async def filter_content(url):
async with AsyncWebCrawler() as crawler:
content_filter = PruningContentFilter(
min_word_threshold=5,
threshold_type='dynamic',
threshold=0.45
)
result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True)
if result.success:
print(f"Cleaned Markdown:\n{result.fit_markdown}")
```
### Parameters
- **`min_word_threshold`**: (Optional) Minimum number of words a node must contain to be considered relevant. Nodes with fewer words are automatically pruned.
- **`threshold_type`**: (Optional, default 'fixed') Controls how pruning thresholds are calculated:
- `'fixed'`: Uses a constant threshold value for all nodes
- `'dynamic'`: Adjusts threshold based on node characteristics like tag importance and text/link ratios
- **`threshold`**: (Optional, default 0.48) Base threshold value for node pruning:
- For fixed threshold: Nodes scoring below this value are removed
- For dynamic threshold: This value is adjusted based on node properties
### How It Works
The pruning algorithm evaluates each node using multiple metrics:
- Text density: Ratio of actual text to overall node content
- Link density: Proportion of text within links
- Tag importance: Weight based on HTML tag type (e.g., article, p, div)
- Content quality: Metrics like text length and structural importance
Nodes scoring below the threshold are removed, effectively "shaking" less relevant content from the DOM tree. This results in a cleaner document containing only the most relevant content blocks.
The algorithm is particularly effective for:
- Removing boilerplate content
- Eliminating navigation menus and sidebars
- Preserving main article content
- Maintaining document structure while removing noise
## BM25 Algorithm

View File

@@ -4,7 +4,59 @@ This guide explains how to use content filtering strategies in Crawl4AI to extra
## Relevance Content Filter
The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks.
The `RelevanceContentFilter` is an abstract class that provides a common interface for content filtering strategies. Specific filtering algorithms, like `PruningContentFilter` or `BM25ContentFilter`, inherit from this class and implement the `filter_content` method. This method takes the HTML content as input and returns a list of filtered text blocks.
## Pruning Content Filter
The `PruningContentFilter` is a tree-shaking algorithm that analyzes the HTML DOM structure and removes less relevant nodes based on various metrics like text density, link density, and tag importance. It evaluates each node using a composite scoring system and "prunes" nodes that fall below a certain threshold.
### Usage
```python
from crawl4ai import AsyncWebCrawler
from crawl4ai.content_filter_strategy import PruningContentFilter
async def filter_content(url):
async with AsyncWebCrawler() as crawler:
content_filter = PruningContentFilter(
min_word_threshold=5,
threshold_type='dynamic',
threshold=0.45
)
result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True)
if result.success:
print(f"Cleaned Markdown:\n{result.fit_markdown}")
```
### Parameters
- **`min_word_threshold`**: (Optional) Minimum number of words a node must contain to be considered relevant. Nodes with fewer words are automatically pruned.
- **`threshold_type`**: (Optional, default 'fixed') Controls how pruning thresholds are calculated:
- `'fixed'`: Uses a constant threshold value for all nodes
- `'dynamic'`: Adjusts threshold based on node characteristics like tag importance and text/link ratios
- **`threshold`**: (Optional, default 0.48) Base threshold value for node pruning:
- For fixed threshold: Nodes scoring below this value are removed
- For dynamic threshold: This value is adjusted based on node properties
### How It Works
The pruning algorithm evaluates each node using multiple metrics:
- Text density: Ratio of actual text to overall node content
- Link density: Proportion of text within links
- Tag importance: Weight based on HTML tag type (e.g., article, p, div)
- Content quality: Metrics like text length and structural importance
Nodes scoring below the threshold are removed, effectively "shaking" less relevant content from the DOM tree. This results in a cleaner document containing only the most relevant content blocks.
The algorithm is particularly effective for:
- Removing boilerplate content
- Eliminating navigation menus and sidebars
- Preserving main article content
- Maintaining document structure while removing noise
## BM25 Algorithm
@@ -21,7 +73,7 @@ from crawl4ai.content_filter_strategy import BM25ContentFilter
async def filter_content(url, query=None):
async with AsyncWebCrawler() as crawler:
content_filter = BM25ContentFilter(user_query=query)
result = await crawler.arun(url=url, content_filter=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering
result = await crawler.arun(url=url, extraction_strategy=content_filter, fit_markdown=True) # Set fit_markdown flag to True to trigger BM25 filtering
if result.success:
print(f"Filtered Content (JSON):\n{result.extracted_content}")
print(f"\nFiltered Markdown:\n{result.fit_markdown}") # New field in CrawlResult object
@@ -71,7 +123,7 @@ class MyCustomFilter(RelevantContentFilter):
async def custom_filter_demo(url: str):
async with AsyncWebCrawler() as crawler:
custom_filter = MyCustomFilter()
result = await crawler.arun(url, content_filter=custom_filter)
result = await crawler.arun(url, extraction_strategy=custom_filter)
if result.success:
print(result.extracted_content)

View File

@@ -0,0 +1,159 @@
import os, sys
import pytest
from bs4 import BeautifulSoup
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(parent_dir)
from crawl4ai.content_filter_strategy import PruningContentFilter
@pytest.fixture
def basic_html():
return """
<html>
<body>
<article>
<h1>Main Article</h1>
<p>This is a high-quality paragraph with substantial text content. It contains enough words to pass the threshold and has good text density without too many links. This kind of content should survive the pruning process.</p>
<div class="sidebar">Low quality sidebar content</div>
<div class="social-share">Share buttons</div>
</article>
</body>
</html>
"""
@pytest.fixture
def link_heavy_html():
return """
<html>
<body>
<div class="content">
<p>Good content paragraph that should remain.</p>
<div class="links">
<a href="#">Link 1</a>
<a href="#">Link 2</a>
<a href="#">Link 3</a>
<a href="#">Link 4</a>
</div>
</div>
</body>
</html>
"""
@pytest.fixture
def mixed_content_html():
return """
<html>
<body>
<article>
<h1>Article Title</h1>
<p class="summary">Short summary.</p>
<div class="content">
<p>Long high-quality paragraph with substantial content that should definitely survive the pruning process. This content has good text density and proper formatting which makes it valuable for retention.</p>
</div>
<div class="comments">
<p>Short comment 1</p>
<p>Short comment 2</p>
</div>
</article>
</body>
</html>
"""
class TestPruningContentFilter:
def test_basic_pruning(self, basic_html):
"""Test basic content pruning functionality"""
filter = PruningContentFilter(min_word_threshold=5)
contents = filter.filter_content(basic_html)
combined_content = ' '.join(contents).lower()
assert "high-quality paragraph" in combined_content
assert "sidebar content" not in combined_content
assert "share buttons" not in combined_content
def test_min_word_threshold(self, mixed_content_html):
"""Test minimum word threshold filtering"""
filter = PruningContentFilter(min_word_threshold=10)
contents = filter.filter_content(mixed_content_html)
combined_content = ' '.join(contents).lower()
assert "short summary" not in combined_content
assert "long high-quality paragraph" in combined_content
assert "short comment" not in combined_content
def test_threshold_types(self, basic_html):
"""Test fixed vs dynamic thresholds"""
fixed_filter = PruningContentFilter(threshold_type='fixed', threshold=0.48)
dynamic_filter = PruningContentFilter(threshold_type='dynamic', threshold=0.45)
fixed_contents = fixed_filter.filter_content(basic_html)
dynamic_contents = dynamic_filter.filter_content(basic_html)
assert len(fixed_contents) != len(dynamic_contents), \
"Fixed and dynamic thresholds should yield different results"
def test_link_density_impact(self, link_heavy_html):
"""Test handling of link-heavy content"""
filter = PruningContentFilter(threshold_type='dynamic')
contents = filter.filter_content(link_heavy_html)
combined_content = ' '.join(contents).lower()
assert "good content paragraph" in combined_content
assert len([c for c in contents if 'href' in c]) < 2, \
"Should prune link-heavy sections"
def test_tag_importance(self, mixed_content_html):
"""Test tag importance in scoring"""
filter = PruningContentFilter(threshold_type='dynamic')
contents = filter.filter_content(mixed_content_html)
has_article = any('article' in c.lower() for c in contents)
has_h1 = any('h1' in c.lower() for c in contents)
assert has_article or has_h1, "Should retain important tags"
def test_empty_input(self):
"""Test handling of empty input"""
filter = PruningContentFilter()
assert filter.filter_content("") == []
assert filter.filter_content(None) == []
def test_malformed_html(self):
"""Test handling of malformed HTML"""
malformed_html = "<div>Unclosed div<p>Nested<span>content</div>"
filter = PruningContentFilter()
contents = filter.filter_content(malformed_html)
assert isinstance(contents, list)
def test_performance(self, basic_html):
"""Test performance with timer"""
filter = PruningContentFilter()
import time
start = time.perf_counter()
filter.filter_content(basic_html)
duration = time.perf_counter() - start
# Extra strict on performance since you mentioned milliseconds matter
assert duration < 0.1, f"Processing took too long: {duration:.3f} seconds"
@pytest.mark.parametrize("threshold,expected_count", [
(0.3, 4), # Very lenient
(0.48, 2), # Default
(0.7, 1), # Very strict
])
def test_threshold_levels(self, mixed_content_html, threshold, expected_count):
"""Test different threshold levels"""
filter = PruningContentFilter(threshold_type='fixed', threshold=threshold)
contents = filter.filter_content(mixed_content_html)
assert len(contents) <= expected_count, \
f"Expected {expected_count} or fewer elements with threshold {threshold}"
def test_consistent_output(self, basic_html):
"""Test output consistency across multiple runs"""
filter = PruningContentFilter()
first_run = filter.filter_content(basic_html)
second_run = filter.filter_content(basic_html)
assert first_run == second_run, "Output should be consistent"
if __name__ == "__main__":
pytest.main([__file__])