Files
crawl4ai/tests/adaptive/test_confidence_debug.py
UncleCode 843457a9cb Refactor adaptive crawling state management
- Renamed `CrawlState` to `AdaptiveCrawlResult` to better reflect its purpose.
- Updated all references to `CrawlState` in the codebase, including method signatures and documentation.
- Modified the `AdaptiveCrawler` class to initialize and manage the new `AdaptiveCrawlResult` state.
- Adjusted example strategies and documentation to align with the new state class.
- Ensured all tests are updated to use `AdaptiveCrawlResult` instead of `CrawlState`.
2025-07-24 20:11:43 +08:00

182 lines
7.6 KiB
Python

"""
Test script for debugging confidence calculation in adaptive crawler
Focus: Testing why confidence decreases when crawling relevant URLs
"""
import asyncio
import sys
from pathlib import Path
from typing import List, Dict
import math
# Add parent directory to path for imports
sys.path.append(str(Path(__file__).parent.parent))
from crawl4ai import AsyncWebCrawler
from crawl4ai.adaptive_crawler import AdaptiveCrawlResult, StatisticalStrategy
from crawl4ai.models import CrawlResult
class ConfidenceTestHarness:
    """Drives the adaptive crawler over a fixed URL list and reports, step by
    step, how the statistical strategy's confidence metrics evolve — used to
    debug why confidence can drop even when crawling relevant pages."""

    def __init__(self):
        # Strategy under inspection and a fixed, ordered set of asyncio docs
        # pages that are all relevant to the query below.
        self.strategy = StatisticalStrategy()
        self.test_urls = [
            'https://docs.python.org/3/library/asyncio.html',
            'https://docs.python.org/3/library/asyncio-runner.html',
            'https://docs.python.org/3/library/asyncio-api-index.html',
            'https://docs.python.org/3/library/contextvars.html',
            'https://docs.python.org/3/library/asyncio-stream.html'
        ]
        self.query = "async await context manager"

    @staticmethod
    def _make_stub(url, raw_markdown):
        """Build a minimal stand-in exposing the `.markdown.raw_markdown` and
        `.url` attributes the strategy reads from a CrawlResult."""
        md_obj = type('Markdown', (), {'raw_markdown': raw_markdown})()
        return type('CrawlResult', (), {'markdown': md_obj, 'url': url})()

    async def test_confidence_progression(self):
        """Test confidence calculation as we crawl each URL"""
        print(f"Testing confidence for query: '{self.query}'")
        print("=" * 80)

        # Fresh state accumulated across the whole crawl session.
        state = AdaptiveCrawlResult(query=self.query)

        async with AsyncWebCrawler() as crawler:
            for step, target in enumerate(self.test_urls, 1):
                print(f"\n{step}. Crawling: {target}")
                print("-" * 80)

                fetched = await crawler.arun(url=target)
                # arun may hand back a container; unwrap the first real result.
                if hasattr(fetched, '_results') and fetched._results:
                    fetched = fetched._results[0]

                raw = fetched.markdown.raw_markdown if hasattr(fetched, 'markdown') else ''
                stub = self._make_stub(target, raw)

                # Feed the page into the strategy's running statistics.
                state.knowledge_base.append(stub)
                await self.strategy.update_state(state, [stub])
                confidence = await self.strategy.calculate_confidence(state)

                # Individual components written back into state.metrics.
                coverage = state.metrics.get('coverage', 0)
                consistency = state.metrics.get('consistency', 0)
                saturation = state.metrics.get('saturation', 0)

                # Per-term tf/df snapshot for every token of the query.
                query_terms = self.strategy._tokenize(self.query.lower())
                term_stats = {
                    term: {
                        'tf': state.term_frequencies.get(term, 0),
                        'df': state.document_frequencies.get(term, 0),
                    }
                    for term in query_terms
                }

                print(f"State after crawl {step}:")
                print(f" Total documents: {state.total_documents}")
                print(f" Unique terms: {len(state.term_frequencies)}")
                print(f" New terms added: {state.new_terms_history[-1] if state.new_terms_history else 0}")
                print(f"\nQuery term statistics:")
                for term, stats in term_stats.items():
                    print(f" '{term}': tf={stats['tf']}, df={stats['df']}")
                print(f"\nMetrics:")
                print(f" Coverage: {coverage:.3f}")
                print(f" Consistency: {consistency:.3f}")
                print(f" Saturation: {saturation:.3f}")
                print(f" → Confidence: {confidence:.3f}")

                print(f"\nCoverage calculation details:")
                self._debug_coverage_calculation(state, query_terms)

                # Flag the pathology this harness exists to catch: confidence
                # going DOWN after ingesting a relevant page.
                previous = state.metrics.get('prev_confidence', 0)
                if step > 1 and confidence < previous:
                    print(f"\n⚠️ WARNING: Confidence decreased from {previous:.3f} to {confidence:.3f}")
                state.metrics['prev_confidence'] = confidence

    def _debug_coverage_calculation(self, state: AdaptiveCrawlResult, query_terms: List[str]):
        """Debug coverage calculation step by step"""
        coverage_score = 0.0
        max_possible_score = 0.0
        for term in query_terms:
            tf = state.term_frequencies.get(term, 0)
            df = state.document_frequencies.get(term, 0)
            if df > 0:
                # BM25-style IDF plus a capped tf boost for the found term.
                idf = math.log((state.total_documents - df + 0.5) / (df + 0.5) + 1)
                doc_coverage = df / state.total_documents
                tf_boost = min(tf / df, 3.0)
                boost = 1 + 0.1 * math.log1p(tf_boost)
                term_score = doc_coverage * idf * boost
                print(f" '{term}': doc_cov={doc_coverage:.2f}, idf={idf:.2f}, boost={boost:.2f} → score={term_score:.3f}")
                coverage_score += term_score
            else:
                print(f" '{term}': not found → score=0.000")
            # Every query term contributes the same theoretical maximum
            # (doc_cov=1.0, idf≈1.0, boost cap ≈1.1) to the denominator.
            # NOTE(review): placement reconstructed — source indentation was
            # lost; per-term accumulation is the only reading that keeps the
            # Total ratio meaningful. Confirm against upstream.
            max_possible_score += 1.0 * 1.0 * 1.1
        ratio = coverage_score / max_possible_score if max_possible_score > 0 else 0
        print(f" Total: {coverage_score:.3f} / {max_possible_score:.3f} = {ratio:.3f}")

        # Side-by-side comparison with the IDF-free alternative formula.
        print(f"\n NEW Coverage calculation (without IDF):")
        new_coverage = self._calculate_coverage_new(state, query_terms)
        print(f" → New Coverage: {new_coverage:.3f}")

    def _calculate_coverage_new(self, state: AdaptiveCrawlResult, query_terms: List[str]) -> float:
        """New coverage calculation without IDF"""
        if not query_terms or state.total_documents == 0:
            return 0.0

        # Normalizer for the frequency signal: the most frequent term overall.
        top_tf = max(state.term_frequencies.values()) if state.term_frequencies else 1

        score_sum = 0.0
        for term in query_terms:
            tf = state.term_frequencies.get(term, 0)
            df = state.document_frequencies.get(term, 0)
            if df <= 0:
                # Missing terms contribute zero but still count in the average.
                print(f" '{term}': not found → score=0.000")
                continue
            # Document coverage: what fraction of docs contain this term.
            doc_coverage = df / state.total_documents
            # Frequency signal: log-frequency normalized by the corpus maximum.
            freq_signal = math.log(1 + tf) / math.log(1 + top_tf) if top_tf > 0 else 0
            # Combined score: document coverage with frequency boost.
            term_score = doc_coverage * (1 + 0.5 * freq_signal)
            print(f" '{term}': doc_cov={doc_coverage:.2f}, freq_signal={freq_signal:.2f} → score={term_score:.3f}")
            score_sum += term_score

        # Average across all query terms (zeros included).
        return score_sum / len(query_terms)
async def main():
    """Run the confidence test"""
    # Build the harness and let it walk the whole URL list.
    harness = ConfidenceTestHarness()
    await harness.test_confidence_progression()

    print("\n" + "=" * 80)
    print("Test complete!")


if __name__ == "__main__":
    asyncio.run(main())