"""Compare crawl4ai content filters and markdown generators on small samples.

Running this module as a script feeds every entry of ``TEST_HTML_SAMPLES``
through the content filters and the markdown generators, then prints the
resulting lengths and timings.
"""

import asyncio
import time
from typing import Any, Callable, Dict, List, Tuple

from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator

# Test HTML samples
TEST_HTML_SAMPLES = {
    "basic": """

Test Title

This is a test paragraph with a link.

Section 1

More content here with bold text.

""",
    "complex": """
Header content to remove

Main Article

Important content paragraph with useful link.

Key Section

Detailed explanation with multiple sentences. This should be kept in the final output. Very important information here.

""",
    "edge_cases": """

Share buttons to remove

!!Special>> Characters## Title!!

def test(): pass
""",
    "links_citations": """

Document with Links

First link to Example 1

Second link to Test 2

Image link: test image

Repeated link to Example 1 again

""",
}


def _timed(func: Callable[..., Any], *args: Any, **kwargs: Any) -> Tuple[Any, float]:
    """Call ``func(*args, **kwargs)`` and return ``(result, elapsed_seconds)``."""
    # perf_counter() is monotonic and high resolution, so the measured interval
    # cannot be skewed by system clock adjustments the way time.time() can.
    started = time.perf_counter()
    result = func(*args, **kwargs)
    return result, time.perf_counter() - started


def test_content_filters() -> Dict[str, Dict[str, float]]:
    """Test various content filtering strategies and return length comparisons.

    Returns:
        A mapping from sample name to a metrics dict with the keys
        ``original_length``, ``pruned_length``, ``bm25_length`` (ints) and
        ``pruning_time``, ``bm25_time`` (elapsed seconds as floats).
    """
    # Initialize filters once; the same instances are reused for every sample.
    pruning_filter = PruningContentFilter(
        threshold=0.48,
        threshold_type="fixed",
        min_word_threshold=2,
    )
    bm25_filter = BM25ContentFilter(
        bm25_threshold=1.0,
        user_query="test article content important",
    )

    results: Dict[str, Dict[str, float]] = {}

    # Test each HTML sample
    for test_name, html in TEST_HTML_SAMPLES.items():
        pruned_content, pruning_time = _timed(pruning_filter.filter_content, html)
        bm25_content, bm25_time = _timed(bm25_filter.filter_content, html)

        # The filtered lengths are summed over the items each filter returns.
        results[test_name] = {
            "original_length": len(html),
            "pruned_length": sum(len(c) for c in pruned_content),
            "bm25_length": sum(len(c) for c in bm25_content),
            "pruning_time": pruning_time,
            "bm25_time": bm25_time,
        }

    return results


def test_markdown_generation() -> List[Dict[str, Any]]:
    """Test markdown generation with different configurations.

    Returns:
        One dict per (sample, generator) pair with the keys ``test_case``,
        ``generator``, ``time``, ``raw_length``, ``fit_length`` and
        ``citations``.
    """
    # Initialize generators with different configurations
    generators = {
        "no_filter": DefaultMarkdownGenerator(),
        "pruning": DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(threshold=0.48)
        ),
        "bm25": DefaultMarkdownGenerator(
            content_filter=BM25ContentFilter(
                user_query="test article content important"
            )
        ),
    }

    results: List[Dict[str, Any]] = []

    # Test each generator with each HTML sample
    for test_name, html in TEST_HTML_SAMPLES.items():
        for gen_name, generator in generators.items():
            result, elapsed = _timed(
                generator.generate_markdown,
                html,
                base_url="http://example.com",
                citations=True,
            )
            results.append({
                "test_case": test_name,
                "generator": gen_name,
                "time": elapsed,
                "raw_length": len(result.raw_markdown),
                # fit_markdown may be empty/None, in which case 0 is reported.
                "fit_length": len(result.fit_markdown) if result.fit_markdown else 0,
                # NOTE(review): this is len() of references_markdown, which is
                # presumably a character count rather than a number of
                # citations -- confirm against the generator's result type.
                "citations": len(result.references_markdown),
            })

    return results


def main() -> None:
    """Run all tests and print results."""
    print("Starting content filter tests...")
    filter_results = test_content_filters()

    print("\nContent Filter Results:")
    print("-" * 50)
    for test_name, metrics in filter_results.items():
        print(f"\nTest case: {test_name}")
        print(f"Original length: {metrics['original_length']}")
        print(f"Pruned length: {metrics['pruned_length']} ({metrics['pruning_time']:.3f}s)")
        print(f"BM25 length: {metrics['bm25_length']} ({metrics['bm25_time']:.3f}s)")

    print("\nStarting markdown generation tests...")
    markdown_results = test_markdown_generation()

    print("\nMarkdown Generation Results:")
    print("-" * 50)
    for result in markdown_results:
        print(f"\nTest: {result['test_case']} - Generator: {result['generator']}")
        print(f"Time: {result['time']:.3f}s")
        print(f"Raw length: {result['raw_length']}")
        print(f"Fit length: {result['fit_length']}")
        print(f"Citations: {result['citations']}")


if __name__ == "__main__":
    main()