"""Exercise crawl4ai content filters and markdown generators on small samples.

Running this module as a script applies each filter / generator to every entry
in ``TEST_HTML_SAMPLES`` and prints length and timing metrics for comparison.
"""

import asyncio
import time
from typing import Any, Callable, Dict, List, Tuple

from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

# Test HTML samples
TEST_HTML_SAMPLES = {
    "basic": """

Test Title

This is a test paragraph with a link.

Section 1

Main Article

Important content paragraph with useful link.

Key Section

Detailed explanation with multiple sentences. This should be kept in the final output. Very important information here.

""",
    "edge_cases": """

Ad content to remove

!!Special>> Characters## Title!!

def test(): pass

""",
    "links_citations": """

Document with Links

First link to Example 1

Second link to Test 2

Image link: test image

Repeated link to Example 1 again

""",
}


def _timed_call(func: Callable[..., Any], *args: Any, **kwargs: Any) -> Tuple[Any, float]:
    """Call ``func`` and return ``(result, elapsed_seconds)``."""
    # perf_counter is the clock intended for measuring short intervals;
    # time.time() follows the wall clock and may jump while we are measuring.
    started = time.perf_counter()
    result = func(*args, **kwargs)
    return result, time.perf_counter() - started


def test_content_filters() -> Dict[str, Dict[str, float]]:
    """Test various content filtering strategies and return length comparisons.

    Returns:
        A mapping from sample name to a metrics dict with the keys
        ``original_length``, ``pruned_length``, ``bm25_length`` (character
        counts) and ``pruning_time``, ``bm25_time`` (seconds).
    """
    results: Dict[str, Dict[str, float]] = {}

    # Initialize filters
    pruning_filter = PruningContentFilter(
        threshold=0.48, threshold_type="fixed", min_word_threshold=2
    )
    bm25_filter = BM25ContentFilter(
        bm25_threshold=1.0, user_query="test article content important"
    )

    # Test each HTML sample
    for test_name, html in TEST_HTML_SAMPLES.items():
        pruned_content, pruning_time = _timed_call(pruning_filter.filter_content, html)
        bm25_content, bm25_time = _timed_call(bm25_filter.filter_content, html)

        # Each filter result is iterated as a collection of chunks, so the
        # reported length is the sum of the individual chunk lengths.
        results[test_name] = {
            "original_length": len(html),
            "pruned_length": sum(len(chunk) for chunk in pruned_content),
            "bm25_length": sum(len(chunk) for chunk in bm25_content),
            "pruning_time": pruning_time,
            "bm25_time": bm25_time,
        }

    return results


def test_markdown_generation() -> List[Dict[str, Any]]:
    """Test markdown generation with different configurations.

    Returns:
        One dict per (sample, generator) pair with the keys ``test_case``,
        ``generator``, ``time`` (seconds), ``raw_length``, ``fit_length`` and
        ``citations`` (length of the references markdown).
    """
    results: List[Dict[str, Any]] = []

    # Initialize generators with different configurations
    generators = {
        "no_filter": DefaultMarkdownGenerator(),
        "pruning": DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(threshold=0.48)
        ),
        "bm25": DefaultMarkdownGenerator(
            content_filter=BM25ContentFilter(
                user_query="test article content important"
            )
        ),
    }

    # Test each generator with each HTML sample
    for test_name, html in TEST_HTML_SAMPLES.items():
        for gen_name, generator in generators.items():
            result, elapsed = _timed_call(
                generator.generate_markdown,
                html,
                base_url="http://example.com",
                citations=True,
            )
            fit_markdown = result.fit_markdown
            references = result.references_markdown
            results.append({
                "test_case": test_name,
                "generator": gen_name,
                "time": elapsed,
                "raw_length": len(result.raw_markdown),
                "fit_length": len(fit_markdown) if fit_markdown else 0,
                # Guarded the same way as fit_markdown so an empty/None
                # references value reports 0 instead of raising.
                "citations": len(references) if references else 0,
            })

    return results


def main():
    """Run all tests and print results."""
    print("Starting content filter tests...")
    filter_results = test_content_filters()

    print("\nContent Filter Results:")
    print("-" * 50)
    for test_name, metrics in filter_results.items():
        print(f"\nTest case: {test_name}")
        print(f"Original length: {metrics['original_length']}")
        print(f"Pruned length: {metrics['pruned_length']} ({metrics['pruning_time']:.3f}s)")
        print(f"BM25 length: {metrics['bm25_length']} ({metrics['bm25_time']:.3f}s)")

    print("\nStarting markdown generation tests...")
    markdown_results = test_markdown_generation()

    print("\nMarkdown Generation Results:")
    print("-" * 50)
    for result in markdown_results:
        print(f"\nTest: {result['test_case']} - Generator: {result['generator']}")
        print(f"Time: {result['time']:.3f}s")
        print(f"Raw length: {result['raw_length']}")
        print(f"Fit length: {result['fit_length']}")
        print(f"Citations: {result['citations']}")


if __name__ == "__main__":
    main()