Add PruningContentFilter with unit tests and update documentation

- Introduced the PruningContentFilter for better content relevance.
  - Implemented comprehensive unit tests for verification of functionality.
  - Enhanced existing BM25ContentFilter tests for edge case coverage.
  - Updated documentation to include usage examples for new filter.
This commit is contained in:
unclecode
2024-12-01 19:17:33 +08:00
parent 80d58ad24c
commit 293f299c08
9 changed files with 499 additions and 135 deletions

View File

@@ -9,7 +9,7 @@ from bs4 import element, NavigableString, Comment
from urllib.parse import urljoin
from requests.exceptions import InvalidSchema
# from .content_cleaning_strategy import ContentCleaningStrategy
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter#, HeuristicContentFilter
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, PruningContentFilter
from .markdown_generation_strategy import MarkdownGenerationStrategy, DefaultMarkdownGenerator
from .models import MarkdownGenerationResult
from .utils import (
@@ -110,10 +110,15 @@ class WebScrapingStrategy(ContentScrapingStrategy):
if markdown_generator:
try:
if kwargs.get('fit_markdown', False) and not markdown_generator.content_filter:
markdown_generator.content_filter = BM25ContentFilter(
user_query=kwargs.get('fit_markdown_user_query', None),
bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
markdown_generator.content_filter = PruningContentFilter(
threshold_type=kwargs.get('fit_markdown_treshold_type', 'fixed'),
threshold=kwargs.get('fit_markdown_treshold', 0.48),
min_word_threshold=kwargs.get('fit_markdown_min_word_threshold', ),
)
# markdown_generator.content_filter = BM25ContentFilter(
# user_query=kwargs.get('fit_markdown_user_query', None),
# bm25_threshold=kwargs.get('fit_markdown_bm25_threshold', 1.0)
# )
markdown_result: MarkdownGenerationResult = markdown_generator.generate_markdown(
cleaned_html=cleaned_html,