feat(docs): update examples and documentation to replace bypass_cache with cache_mode for improved clarity

This commit is contained in:
UncleCode
2024-11-17 19:44:45 +08:00
parent a59c107b23
commit df63a40606
17 changed files with 422 additions and 80 deletions

View File

@@ -532,14 +532,13 @@ class WebScrapingStrategy(ContentScrapingStrategy):
fit_markdown = "Set flag 'fit_markdown' to True to get cleaned HTML content."
fit_html = "Set flag 'fit_markdown' to True to get cleaned HTML content."
if kwargs.get('fit_markdown', False):
# cleaner = ContentCleaningStrategy()
# fit_html = cleaner.clean(cleaned_html)
# fit_markdown = h.handle(fit_html)
content_filter = BM25ContentFilter(
user_query= kwargs.get('fit_markdown_user_query', None),
bm25_threshold= kwargs.get('fit_markdown_bm25_threshold', 1.0)
)
if kwargs.get('content_filter', None) or kwargs.get('fit_markdown', False):
content_filter = kwargs.get('content_filter', None)
if not content_filter:
content_filter = BM25ContentFilter(
user_query= kwargs.get('fit_markdown_user_query', None),
bm25_threshold= kwargs.get('fit_markdown_bm25_threshold', 1.0)
)
fit_html = content_filter.filter_content(html)
fit_html = '\n'.join('<div>{}</div>'.format(s) for s in fit_html)
fit_markdown = h.handle(fit_html)