Make fields in the MediaItem and Link models optional, with default values, to prevent validation errors when data is incomplete. Also expose BaseDispatcher in __init__ and fix markdown field handling in the database manager. BREAKING CHANGE: MediaItem and Link model fields are now optional with default values, which may affect existing code that expects required fields.
171 lines
5.9 KiB
Python
171 lines
5.9 KiB
Python
import asyncio
|
|
from typing import Dict
|
|
from crawl4ai.content_filter_strategy import BM25ContentFilter, PruningContentFilter
|
|
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
|
import time
|
|
|
|
# HTML fixtures shared by the filter and markdown-generation checks below.
# Keys are sample names; values are the raw HTML strings fed to each strategy.
TEST_HTML_SAMPLES = {
    # Small document: a heading, one link, one nested section with bold text.
    "basic": """
        <body>
            <h1>Test Title</h1>
            <p>This is a test paragraph with <a href="http://example.com">a link</a>.</p>
            <div class="content">
                <h2>Section 1</h2>
                <p>More content here with <b>bold text</b>.</p>
            </div>
        </body>
    """,

    # Article wrapped in page chrome (nav, header, aside, footer).
    "complex": """
        <body>
            <nav>Navigation menu that should be removed</nav>
            <header>Header content to remove</header>
            <main>
                <article>
                    <h1>Main Article</h1>
                    <p>Important content paragraph with <a href="http://test.com">useful link</a>.</p>
                    <section>
                        <h2>Key Section</h2>
                        <p>Detailed explanation with multiple sentences. This should be kept
                        in the final output. Very important information here.</p>
                    </section>
                </article>
                <aside>Sidebar content to remove</aside>
            </main>
            <footer>Footer content to remove</footer>
        </body>
    """,

    # Empty/whitespace paragraphs, a script tag, ad/social markup,
    # punctuation-heavy heading text and a code block.
    "edge_cases": """
        <body>
            <div>
                <p></p>
                <p>  </p>
                <script>alert('remove me');</script>
                <div class="advertisement">Ad content to remove</div>
                <p class="social-share">Share buttons to remove</p>
                <h1>!!Special>> Characters## Title!!</h1>
                <pre><code>def test(): pass</code></pre>
            </div>
        </body>
    """,

    # Links with and without a title attribute, an image, and a repeated URL.
    "links_citations": """
        <body>
            <h1>Document with Links</h1>
            <p>First link to <a href="http://example.com/1">Example 1</a></p>
            <p>Second link to <a href="http://example.com/2" title="Example 2">Test 2</a></p>
            <p>Image link: <img src="test.jpg" alt="test image"></p>
            <p>Repeated link to <a href="http://example.com/1">Example 1 again</a></p>
        </body>
    """,
}
|
|
|
|
def test_content_filters() -> Dict[str, Dict[str, float]]:
    """Run both content filters over every HTML sample and collect metrics.

    Each sample in ``TEST_HTML_SAMPLES`` is passed to a ``PruningContentFilter``
    and a ``BM25ContentFilter``; the call to ``filter_content`` is timed for
    each.

    Returns:
        A mapping from sample name to a dict with the keys
        ``original_length`` (length of the input HTML), ``pruned_length`` and
        ``bm25_length`` (sum of the lengths of the items each filter
        returned), and ``pruning_time`` / ``bm25_time`` (elapsed seconds).
    """
    pruning_filter = PruningContentFilter(
        threshold=0.48,
        threshold_type="fixed",
        min_word_threshold=2,
    )
    bm25_filter = BM25ContentFilter(
        bm25_threshold=1.0,
        user_query="test article content important",
    )

    results = {}
    for test_name, html in TEST_HTML_SAMPLES.items():
        # perf_counter is monotonic and high-resolution, so the measured
        # interval cannot be skewed by system clock adjustments.
        start_time = time.perf_counter()
        pruned_content = pruning_filter.filter_content(html)
        pruning_time = time.perf_counter() - start_time

        start_time = time.perf_counter()
        bm25_content = bm25_filter.filter_content(html)
        bm25_time = time.perf_counter() - start_time

        results[test_name] = {
            "original_length": len(html),
            "pruned_length": sum(len(c) for c in pruned_content),
            "bm25_length": sum(len(c) for c in bm25_content),
            "pruning_time": pruning_time,
            "bm25_time": bm25_time,
        }

    return results
|
|
|
|
def test_markdown_generation():
    """Generate markdown for every HTML sample with each generator setup.

    Three ``DefaultMarkdownGenerator`` instances are exercised: one without a
    content filter, one with a ``PruningContentFilter`` and one with a
    ``BM25ContentFilter``. Every generator is run against every sample in
    ``TEST_HTML_SAMPLES`` with ``citations=True``.

    Returns:
        A list with one dict per (sample, generator) pair holding the keys
        ``test_case``, ``generator``, ``time`` (elapsed seconds of the
        ``generate_markdown`` call), ``raw_length`` (length of
        ``raw_markdown``), ``fit_length`` (length of ``fit_markdown``, or 0
        when it is empty or ``None``) and ``citations`` (length of
        ``references_markdown``).
    """
    generators = {
        "no_filter": DefaultMarkdownGenerator(),
        "pruning": DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(threshold=0.48)
        ),
        "bm25": DefaultMarkdownGenerator(
            content_filter=BM25ContentFilter(
                user_query="test article content important"
            )
        ),
    }

    results = []
    for test_name, html in TEST_HTML_SAMPLES.items():
        for gen_name, generator in generators.items():
            # Monotonic clock: the interval is immune to wall-clock changes.
            start_time = time.perf_counter()
            result = generator.generate_markdown(
                html,
                base_url="http://example.com",
                citations=True,
            )
            # Stop the clock right after generation so that building the
            # result record is not included in the measurement.
            elapsed = time.perf_counter() - start_time

            results.append({
                "test_case": test_name,
                "generator": gen_name,
                "time": elapsed,
                "raw_length": len(result.raw_markdown),
                "fit_length": len(result.fit_markdown) if result.fit_markdown else 0,
                "citations": len(result.references_markdown),
            })

    return results
|
|
|
|
def main():
    """Execute both test routines and print a plain-text report of each."""
    divider = "-" * 50

    # --- Content filters -------------------------------------------------
    print("Starting content filter tests...")
    filter_report = test_content_filters()

    print("\nContent Filter Results:")
    print(divider)
    for case_name in filter_report:
        stats = filter_report[case_name]
        print(f"\nTest case: {case_name}")
        print(f"Original length: {stats['original_length']}")
        print(f"Pruned length: {stats['pruned_length']} ({stats['pruning_time']:.3f}s)")
        print(f"BM25 length: {stats['bm25_length']} ({stats['bm25_time']:.3f}s)")

    # --- Markdown generation ---------------------------------------------
    print("\nStarting markdown generation tests...")
    markdown_report = test_markdown_generation()

    print("\nMarkdown Generation Results:")
    print(divider)
    for entry in markdown_report:
        print(f"\nTest: {entry['test_case']} - Generator: {entry['generator']}")
        print(f"Time: {entry['time']:.3f}s")
        print(f"Raw length: {entry['raw_length']}")
        print(f"Fit length: {entry['fit_length']}")
        print(f"Citations: {entry['citations']}")


# Only run the report when this file is executed directly, not on import.
if __name__ == "__main__":
    main()