Add PruningContentFilter with unit tests and update documentation

- Introduced the PruningContentFilter for better content relevance.
  - Implemented comprehensive unit tests for verification of functionality.
  - Enhanced existing BM25ContentFilter tests for edge case coverage.
  - Updated documentation to include usage examples for the new filter.
This commit is contained in:
unclecode
2024-12-01 19:17:33 +08:00
parent 80d58ad24c
commit 293f299c08
9 changed files with 499 additions and 135 deletions

View File

@@ -4,10 +4,10 @@ from typing import List, Tuple, Dict
from rank_bm25 import BM25Okapi
from time import perf_counter
from collections import deque
from bs4 import BeautifulSoup, NavigableString, Tag
from bs4 import BeautifulSoup, NavigableString, Tag, Comment
from .utils import clean_tokens
from abc import ABC, abstractmethod
import math
from snowballstemmer import stemmer
@@ -358,145 +358,186 @@ class BM25ContentFilter(RelevantContentFilter):
return [self.clean_element(tag) for _, _, tag in selected_candidates]
class HeuristicContentFilter(RelevantContentFilter):
def __init__(self):
    """Initialise the base filter, then the heuristic scoring settings."""
    super().__init__()

    # Maximum depth from body to consider
    self.max_depth = 5

    # Weights for different heuristics, keyed by tag name.
    self.tag_weights = {
        'article': 10, 'main': 8, 'section': 5, 'div': 3,
        'p': 2, 'pre': 2, 'code': 2, 'blockquote': 2,
        'li': 1, 'span': 1,
    }
def filter_content(self, html: str) -> List[str]:
"""Implements heuristic content filtering without relying on a query."""
class PruningContentFilter(RelevantContentFilter):
def __init__(
    self,
    user_query: str = None,
    min_word_threshold: int = None,
    threshold_type: str = 'fixed',
    threshold: float = 0.48,
):
    """Store the pruning thresholds and build the scoring tables.

    Args:
        user_query: Forwarded unchanged to the base-class constructor.
        min_word_threshold: Stored on the instance; a falsy value leaves the
            word-count check in ``_compute_composite_score`` switched off.
        threshold_type: ``'fixed'`` makes ``_prune_tree`` compare scores with
            ``threshold`` directly; any other value selects the adjusted
            ("dynamic") threshold.
        threshold: Base score below which a node is pruned.
    """
    super().__init__(user_query)
    self.min_word_threshold = min_word_threshold
    self.threshold_type = threshold_type
    self.threshold = threshold

    # Tag importance, consulted only when the threshold is dynamic.
    self.tag_importance = {
        'article': 1.5, 'main': 1.4, 'section': 1.3, 'p': 1.2,
        'h1': 1.4, 'h2': 1.3, 'h3': 1.2,
        'div': 0.7, 'span': 0.6,
    }

    # Relative weight of every metric in the composite score.
    self.metric_weights = {
        'text_density': 0.4,
        'link_density': 0.2,
        'tag_weight': 0.2,
        'class_id_weight': 0.1,
        'text_length': 0.1,
    }
    # Every metric is switched on by default.
    self.metric_config = {metric: True for metric in self.metric_weights}

    # Per-tag score used by the 'tag_weight' metric.
    self.tag_weights = {
        'div': 0.5, 'p': 1.0, 'article': 1.5, 'section': 1.0, 'span': 0.3,
        'li': 0.5, 'ul': 0.5, 'ol': 0.5,
        'h1': 1.2, 'h2': 1.1, 'h3': 1.0, 'h4': 0.9, 'h5': 0.8, 'h6': 0.7,
    }
def filter_content(self, html: str, min_word_threshold: int = None) -> List[str]:
    """Prune low-scoring nodes from ``html`` and return what is left.

    Args:
        html: Markup to filter. Anything falsy or not a ``str`` yields ``[]``.
        min_word_threshold: NOTE(review): accepted but never referenced in
            this method.

    Returns:
        The serialized HTML of each element child of ``<body>`` that still
        contains text once pruning has finished.
    """
    if not (html and isinstance(html, str)):
        return []

    soup = BeautifulSoup(html, 'lxml')
    if not soup.body:
        # No body in the parsed result: wrap the input and parse it again.
        soup = BeautifulSoup(f'<body>{html}</body>', 'lxml')

    # Strip comments and excluded tags before any scoring happens.
    self._remove_comments(soup)
    self._remove_unwanted_tags(soup)

    # Pruning starts at <body> and works downwards.
    body = soup.find('body')
    self._prune_tree(body)

    # Keep only element children that still hold some text.
    return [
        str(child)
        for child in body.children
        if not isinstance(child, str)
        and hasattr(child, 'name')
        and len(child.get_text(strip=True)) > 0
    ]
# Extract candidate text chunks
candidates = self.extract_text_chunks(body)
def _remove_comments(self, soup):
    """Detach every HTML comment node from ``soup`` in place.

    Args:
        soup: Parsed document (or subtree) to clean; it is modified in place.
    """
    # ``string=`` is the current name of the filter that older Beautiful Soup
    # releases called ``text=``; calling ``find_all`` explicitly also reads
    # more clearly than the ``soup(...)`` shorthand.
    comment_nodes = soup.find_all(string=lambda node: isinstance(node, Comment))
    for comment_node in comment_nodes:
        comment_node.extract()
if not candidates:
return []
def _remove_unwanted_tags(self, soup):
    """Decompose every element whose tag name is in ``self.excluded_tags``."""
    for tag_name in self.excluded_tags:
        matches = soup.find_all(tag_name)
        for match in matches:
            match.decompose()
# Score each candidate
scored_candidates = []
for index, text, tag_type, tag in candidates:
score = self.score_element(tag, text)
if score > 0:
scored_candidates.append((score, index, text, tag))
# Score `node` and either remove it from the tree or recurse into its children.
#
# NOTE(review): in this rendering, statements that do not use anything defined
# in `_prune_tree` (the `scored_candidates` / `top_candidates` lines and the
# `score_element` header) are interleaved with it; they appear to belong to
# other definitions and are left untouched.
def _prune_tree(self, node):
# Skip missing nodes and nodes that carry no tag name.
if not node or not hasattr(node, 'name') or node.name is None:
return
# Sort candidates by score and then by document order
scored_candidates.sort(key=lambda x: (-x[0], x[1]))
# Length of the node's stripped text.
text_len = len(node.get_text(strip=True))
# Length of the node's inner markup, decoded as UTF-8.
tag_len = len(node.encode_contents().decode('utf-8'))
# Only direct-child <a> elements are inspected (recursive=False); anchors whose
# `.string` is falsy contribute nothing to the total.
link_text_len = sum(len(s.strip()) for s in (a.string for a in node.find_all('a', recursive=False)) if s)
# Extract the top candidates (e.g., top 5)
top_candidates = scored_candidates[:5] # Adjust the number as needed
# Measurements handed to the scoring routine.
metrics = {
'node': node,
'tag_name': node.name,
'text_len': text_len,
'tag_len': tag_len,
'link_text_len': link_text_len
}
# Sort the top candidates back to their original document order
top_candidates.sort(key=lambda x: x[1])
# NOTE(review): only `metrics` is passed here, but `_compute_composite_score`
# is declared as (self, metrics, text_len, tag_len, link_text_len) with no
# defaults, so this call would raise TypeError as written.
score = self._compute_composite_score(metrics)
# Clean and return the content
return [self.clean_element(tag) for _, _, _, tag in top_candidates]
# 'fixed' compares the score with the configured threshold unchanged.
if self.threshold_type == 'fixed':
should_remove = score < self.threshold
else: # dynamic
# Tags missing from `self.tag_importance` default to 0.7.
tag_importance = self.tag_importance.get(node.name, 0.7)
text_ratio = text_len / tag_len if tag_len > 0 else 0
# A node with no text at all gets a link ratio of 1.
link_ratio = link_text_len / text_len if text_len > 0 else 1
threshold = self.threshold # base threshold
# Tags with importance above 1 get a 20% lower threshold.
if tag_importance > 1:
threshold *= 0.8
# Nodes whose text/markup ratio exceeds 0.4 get a further 10% reduction.
if text_ratio > 0.4:
threshold *= 0.9
# Nodes whose link ratio exceeds 0.6 get a 20% higher threshold.
if link_ratio > 0.6:
threshold *= 1.2
should_remove = score < threshold
def score_element(self, tag: Tag, text: str) -> float:
"""Compute a score for an element based on heuristics."""
if not text or not tag:
return 0
if should_remove:
node.decompose()
else:
# The children are collected into a list before recursing, because the
# recursive call may decompose a child.
children = [child for child in node.children if hasattr(child, 'name')]
for child in children:
self._prune_tree(child)
# Exclude unwanted tags
if self.is_excluded(tag):
return 0
# Combine the enabled metrics into one weighted-average score for a node.
# Returns -1.0 when a word threshold is set and the node has too few words;
# otherwise the weighted sum divided by the total weight of enabled metrics
# (0 when nothing is enabled).
#
# NOTE(review): `_prune_tree` calls this with `metrics` only, while the
# signature requires `text_len`, `tag_len` and `link_text_len` as well; the
# same values are also stored inside `metrics`.
# NOTE(review): in this rendering, statements using `tag`, `text_length`,
# `html_length`, `text_density`, `link_density`, `tag_weight` and
# `depth_weight` are interleaved with this function; they appear to belong to
# another definition and are left untouched.
def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len):
if self.min_word_threshold:
# Get raw text from metrics node - avoid extra processing
text = metrics['node'].get_text(strip=True)
# Word count is approximated as the number of space characters plus one.
word_count = text.count(' ') + 1
if word_count < self.min_word_threshold:
return -1.0 # Guaranteed removal
score = 0.0
total_weight = 0.0
# Text density
text_length = len(text.strip())
html_length = len(str(tag))
text_density = text_length / html_length if html_length > 0 else 0
# Text density metric: text length relative to markup length.
if self.metric_config['text_density']:
density = text_len / tag_len if tag_len > 0 else 0
score += self.metric_weights['text_density'] * density
total_weight += self.metric_weights['text_density']
# Link density
link_text_length = sum(len(a.get_text().strip()) for a in tag.find_all('a'))
link_density = link_text_length / text_length if text_length > 0 else 0
# Link density metric: inverted, so less link text gives a higher value.
if self.metric_config['link_density']:
density = 1 - (link_text_len / text_len if text_len > 0 else 0)
score += self.metric_weights['link_density'] * density
total_weight += self.metric_weights['link_density']
# Tag weight
tag_weight = self.tag_weights.get(tag.name, 1)
# Tag weight metric: tags missing from `self.tag_weights` score 0.5.
if self.metric_config['tag_weight']:
tag_score = self.tag_weights.get(metrics['tag_name'], 0.5)
score += self.metric_weights['tag_weight'] * tag_score
total_weight += self.metric_weights['tag_weight']
# Depth factor (prefer elements closer to the body tag)
depth = self.get_depth(tag)
depth_weight = max(self.max_depth - depth, 1) / self.max_depth
# NOTE(review): `_compute_class_id_weight` only ever subtracts from 0, so
# `max(0, class_score)` is always 0 here: this metric adds nothing to `score`
# while its weight still increases `total_weight`.
if self.metric_config['class_id_weight']:
class_score = self._compute_class_id_weight(metrics['node'])
score += self.metric_weights['class_id_weight'] * max(0, class_score)
total_weight += self.metric_weights['class_id_weight']
# Compute the final score
score = (text_density * tag_weight * depth_weight) / (1 + link_density)
# Text length metric: natural log of (text length + 1); not capped at 1.
if self.metric_config['text_length']:
score += self.metric_weights['text_length'] * math.log(text_len + 1)
total_weight += self.metric_weights['text_length']
return score
# Normalise by the total weight of the metrics that were enabled.
return score / total_weight if total_weight > 0 else 0
def get_depth(self, tag: Tag) -> int:
    """Count the parent hops from ``tag`` up to the enclosing body.

    The walk stops at the first node named ``'body'``, or earlier when the
    chain of parents runs out.
    """
    hops = 0
    node = tag
    while node and node != node.parent:
        if node.name == 'body':
            break
        node = node.parent
        hops += 1
    return hops
def extract_text_chunks(self, body: Tag) -> List[Tuple[int, str, str, Tag]]:
"""
Extracts text chunks from the body element while preserving order.
Returns list of tuples (index, text, tag_type, tag) for scoring.
"""
# Collected chunks, in the order they are visited.
chunks = []
# Running position assigned to each collected chunk.
index = 0
# Depth-first walk that appends qualifying elements to `chunks`.
def traverse(element):
nonlocal index
# Plain text nodes and anything that is not a Tag are ignored.
if isinstance(element, NavigableString):
return
if not isinstance(element, Tag):
return
# Excluded elements are skipped together with their whole subtree.
if self.is_excluded(element):
return
# Only consider included tags
if element.name in self.included_tags:
text = element.get_text(separator=' ', strip=True)
# Keep the element only if it has at least `self.min_word_count` words.
if len(text.split()) >= self.min_word_count:
tag_type = 'header' if element.name in self.header_tags else 'content'
chunks.append((index, text, tag_type, element))
index += 1
# Do not traverse children of this element to prevent duplication
# NOTE(review): indentation is not visible here, so it cannot be told whether
# this `return` belongs to the included-tag branch or to the word-count
# branch — confirm against the original file.
return
# Otherwise continue the walk through the element's children.
for child in element.children:
traverse(child)
traverse(body)
return chunks
def is_excluded(self, tag: Tag) -> bool:
    """Report whether ``tag`` should be left out of the candidates.

    A tag is excluded when its name is in ``self.excluded_tags``, when its
    class/id text matches ``self.negative_patterns``, or when more than half
    of its text length comes from links.
    """
    if tag.name in self.excluded_tags:
        return True

    # Join the class list and the id into one string, dropping empty parts.
    class_names = ' '.join(tag.get('class', []))
    element_id = tag.get('id', '')
    identifiers = ' '.join(part for part in (class_names, element_id) if part)
    if self.negative_patterns.search(identifiers):
        return True

    # Exclude tags with high link density (e.g., navigation menus)
    visible_text = tag.get_text(separator=' ', strip=True)
    anchor_chars = sum(len(anchor.get_text(strip=True)) for anchor in tag.find_all('a'))
    total_chars = len(visible_text)
    return total_chars > 0 and (anchor_chars / total_chars) > 0.5
def _compute_class_id_weight(self, node):
    """Return a penalty for class/id values that look like boilerplate.

    The ``class`` and ``id`` attributes each contribute -0.5 when
    ``self.negative_patterns`` is found anywhere in their value, so the
    result is 0, -0.5 or -1.0.

    Args:
        node: Element exposing an ``attrs`` mapping (e.g. a bs4 ``Tag``).

    Returns:
        The accumulated penalty; 0 when neither attribute matches.
    """
    attrs = node.attrs
    penalty = 0
    for attr_name in ('class', 'id'):
        if attr_name not in attrs:
            continue
        value = attrs[attr_name]
        # ``class`` normally arrives as a list of names; accept a plain
        # string too so it is not split into single characters.
        text = value if isinstance(value, str) else ' '.join(value)
        # ``search`` rather than ``match``: ``match`` only tests the start of
        # the string and would miss values such as "main sidebar".
        if self.negative_patterns.search(text):
            penalty -= 0.5
    return penalty

View File

@@ -9,7 +9,7 @@ from bs4 import element, NavigableString, Comment
from urllib.parse import urljoin
from requests.exceptions import InvalidSchema
# from .content_cleaning_strategy import ContentCleaningStrategy
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, PruningContentFilter
from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
from .models import MarkdownGenerationResult
from .utils import (
@@ -110,10 +110,15 @@ class WebScrapingStrategy(ContentScrapingStrategy):
if markdown_generator:
try:
if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter:
markdown_generator.content_filter = BM25ContentFilter(
user_query=kwargs.get('fit_markdown_user_query', None),
bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
markdown_generator.content_filter = PruningContentFilter(
threshold_type=kwargs.get('fit_markdown_treshold_type', 'fixed'),
threshold=kwargs.get('fit_markdown_treshold', 0.48),
min_word_threshold=kwargs.get('fit_markdown_min_word_threshold', ),
)
# markdown_generator.content_filter = BM25ContentFilter(
# user_query=kwargs.get('fit_markdown_user_query', None),
# bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
# )
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
cleaned_html=cleaned_html,