crawl4ai/docs/examples/adaptive_crawling/custom_strategies.py

"""
Custom Adaptive Crawling Strategies

This example demonstrates how to implement custom scoring strategies
for domain-specific crawling needs.
"""

import asyncio
import re
from typing import List, Dict, Set
from crawl4ai import AsyncWebCrawler, AdaptiveCrawler, AdaptiveConfig
from crawl4ai.adaptive_crawler import CrawlState, Link
import math


class APIDocumentationStrategy:
    """
    Custom strategy optimized for API documentation crawling.
    Prioritizes endpoint references, code examples, and parameter descriptions.
    """

    def __init__(self):
        # Keywords that indicate high-value API documentation
        self.api_keywords = {
            'endpoint', 'request', 'response', 'parameter', 'authentication',
            'header', 'body', 'query', 'path', 'method', 'get', 'post', 'put',
            'delete', 'patch', 'status', 'code', 'example', 'curl', 'python'
        }

        # URL patterns that typically contain API documentation
        self.valuable_patterns = [
            r'/api/',
            r'/reference/',
            r'/endpoints?/',
            r'/methods?/',
            r'/resources?/'
        ]

        # Patterns to avoid
        self.avoid_patterns = [
            r'/blog/',
            r'/news/',
            r'/about/',
            r'/contact/',
            r'/legal/'
        ]

    def score_link(self, link: Link, query: str, state: CrawlState) -> float:
        """Custom link scoring for API documentation"""
        score = 1.0
        url = link.href.lower()

        # Boost API-related URLs
        for pattern in self.valuable_patterns:
            if re.search(pattern, url):
                score *= 2.0
                break

        # Reduce score for non-API content
        for pattern in self.avoid_patterns:
            if re.search(pattern, url):
                score *= 0.1
                break

        # Boost if preview contains API keywords
        if link.text:
            preview_lower = link.text.lower()
            keyword_count = sum(1 for kw in self.api_keywords if kw in preview_lower)
            score *= (1 + keyword_count * 0.2)

        # Prioritize shallow URLs (likely overview pages)
        depth = url.count('/') - 2  # Subtract protocol slashes
        if depth <= 3:
            score *= 1.5
        elif depth > 6:
            score *= 0.5

        return score

    def calculate_api_coverage(self, state: CrawlState, query: str) -> Dict[str, float]:
        """Calculate specialized coverage metrics for API documentation"""
        metrics = {
            'endpoint_coverage': 0.0,
            'example_coverage': 0.0,
            'parameter_coverage': 0.0
        }

        # Analyze knowledge base for API-specific content
        endpoint_patterns = [r'GET\s+/', r'POST\s+/', r'PUT\s+/', r'DELETE\s+/']
        example_patterns = [r'```\w+', r'curl\s+-', r'import\s+requests']
        param_patterns = [r'param(?:eter)?s?\s*:', r'required\s*:', r'optional\s*:']

        total_docs = len(state.knowledge_base)
        if total_docs == 0:
            return metrics

        docs_with_endpoints = 0
        docs_with_examples = 0
        docs_with_params = 0

        for doc in state.knowledge_base:
            content = doc.markdown.raw_markdown if hasattr(doc, 'markdown') else str(doc)

            # Check for endpoints
            if any(re.search(pattern, content, re.IGNORECASE) for pattern in endpoint_patterns):
                docs_with_endpoints += 1

            # Check for examples
            if any(re.search(pattern, content, re.IGNORECASE) for pattern in example_patterns):
                docs_with_examples += 1

            # Check for parameters
            if any(re.search(pattern, content, re.IGNORECASE) for pattern in param_patterns):
                docs_with_params += 1

        metrics['endpoint_coverage'] = docs_with_endpoints / total_docs
        metrics['example_coverage'] = docs_with_examples / total_docs
        metrics['parameter_coverage'] = docs_with_params / total_docs

        return metrics


class ResearchPaperStrategy:
    """
    Strategy optimized for crawling research papers and academic content.
    Prioritizes citations, abstracts, and methodology sections.
    """

    def __init__(self):
        self.academic_keywords = {
            'abstract', 'introduction', 'methodology', 'results', 'conclusion',
            'references', 'citation', 'paper', 'study', 'research', 'analysis',
            'hypothesis', 'experiment', 'findings', 'doi'
        }

        self.citation_patterns = [
            r'\[\d+\]',  # [1] style citations
            r'\(\w+\s+\d{4}\)',  # (Author 2024) style
            r'doi:\s*\S+',  # DOI references
        ]

    def calculate_academic_relevance(self, content: str, query: str) -> float:
        """Calculate relevance score for academic content"""
        score = 0.0
        content_lower = content.lower()

        # Check for academic keywords
        keyword_matches = sum(1 for kw in self.academic_keywords if kw in content_lower)
        score += keyword_matches * 0.1

        # Check for citations
        citation_count = sum(
            len(re.findall(pattern, content))
            for pattern in self.citation_patterns
        )
        score += min(citation_count * 0.05, 1.0)  # Cap at 1.0

        # Check for query terms in academic context
        query_terms = query.lower().split()
        for term in query_terms:
            # Boost if term appears near academic keywords
            for keyword in ['abstract', 'conclusion', 'results']:
                if keyword in content_lower:
                    section = content_lower[content_lower.find(keyword):content_lower.find(keyword) + 500]
                    if term in section:
                        score += 0.2

        return min(score, 2.0)  # Cap total score


async def demo_custom_strategies():
    """Demonstrate custom strategy usage"""

    # Example 1: API Documentation Strategy
    print("="*60)
    print("EXAMPLE 1: Custom API Documentation Strategy")
    print("="*60)

    api_strategy = APIDocumentationStrategy()

    async with AsyncWebCrawler() as crawler:
        # Standard adaptive crawler
        config = AdaptiveConfig(
            confidence_threshold=0.8,
            max_pages=15
        )

        adaptive = AdaptiveCrawler(crawler, config)

        # Override link scoring with custom strategy
        original_rank_links = adaptive._rank_links

        def custom_rank_links(links, query, state):
            # Apply custom scoring
            scored_links = []
            for link in links:
                base_score = api_strategy.score_link(link, query, state)
                scored_links.append((link, base_score))

            # Sort by score
            scored_links.sort(key=lambda x: x[1], reverse=True)
            return [link for link, _ in scored_links[:config.top_k_links]]

        adaptive._rank_links = custom_rank_links

        # Crawl API documentation
        print("\nCrawling API documentation with custom strategy...")
        state = await adaptive.digest(
            start_url="https://httpbin.org",
            query="api endpoints authentication headers"
        )

        # Calculate custom metrics
        api_metrics = api_strategy.calculate_api_coverage(state, "api endpoints")

        print(f"\nResults:")
        print(f"Pages crawled: {len(state.crawled_urls)}")
        print(f"Confidence: {adaptive.confidence:.2%}")
        print(f"\nAPI-Specific Metrics:")
        print(f"  - Endpoint coverage: {api_metrics['endpoint_coverage']:.2%}")
        print(f"  - Example coverage: {api_metrics['example_coverage']:.2%}")
        print(f"  - Parameter coverage: {api_metrics['parameter_coverage']:.2%}")

    # Example 2: Combined Strategy
    print("\n" + "="*60)
    print("EXAMPLE 2: Hybrid Strategy Combining Multiple Approaches")
    print("="*60)

    class HybridStrategy:
        """Combines multiple strategies with weights"""

        def __init__(self):
            self.api_strategy = APIDocumentationStrategy()
            self.research_strategy = ResearchPaperStrategy()
            self.weights = {
                'api': 0.7,
                'research': 0.3
            }

        def score_content(self, content: str, query: str) -> float:
            # Get scores from each strategy
            api_score = self._calculate_api_score(content, query)
            research_score = self.research_strategy.calculate_academic_relevance(content, query)

            # Weighted combination
            total_score = (
                api_score * self.weights['api'] +
                research_score * self.weights['research']
            )

            return total_score

        def _calculate_api_score(self, content: str, query: str) -> float:
            # Simplified API scoring based on keyword presence
            content_lower = content.lower()
            api_keywords = self.api_strategy.api_keywords

            keyword_count = sum(1 for kw in api_keywords if kw in content_lower)
            return min(keyword_count * 0.1, 2.0)

    hybrid_strategy = HybridStrategy()

    async with AsyncWebCrawler() as crawler:
        adaptive = AdaptiveCrawler(crawler)

        # Crawl with hybrid scoring
        print("\nTesting hybrid strategy on technical documentation...")
        state = await adaptive.digest(
            start_url="https://docs.python.org/3/library/asyncio.html",
            query="async await coroutines api"
        )

        # Analyze results with hybrid strategy
        print(f"\nHybrid Strategy Analysis:")
        total_score = 0
        for doc in adaptive.get_relevant_content(top_k=5):
            content = doc['content'] or ""
            score = hybrid_strategy.score_content(content, "async await api")
            total_score += score
            print(f"  - {doc['url'][:50]}... Score: {score:.2f}")

        print(f"\nAverage hybrid score: {total_score/5:.2f}")


async def demo_performance_optimization():
    """Demonstrate performance optimization with custom strategies"""

    print("\n" + "="*60)
    print("EXAMPLE 3: Performance-Optimized Strategy")
    print("="*60)

    class PerformanceOptimizedStrategy:
        """Strategy that balances thoroughness with speed"""

        def __init__(self):
            self.url_cache: Set[str] = set()
            self.domain_scores: Dict[str, float] = {}

        def should_crawl_domain(self, url: str) -> bool:
            """Implement domain-level filtering"""
            domain = url.split('/')[2] if url.startswith('http') else url

            # Skip if we've already crawled many pages from this domain
            domain_count = sum(1 for cached in self.url_cache if domain in cached)
            if domain_count > 5:
                return False

            # Skip low-scoring domains
            if domain in self.domain_scores and self.domain_scores[domain] < 0.3:
                return False

            return True

        def update_domain_score(self, url: str, relevance: float):
            """Track domain-level performance"""
            domain = url.split('/')[2] if url.startswith('http') else url

            if domain not in self.domain_scores:
                self.domain_scores[domain] = relevance
            else:
                # Moving average
                self.domain_scores[domain] = (
                    0.7 * self.domain_scores[domain] + 0.3 * relevance
                )

    perf_strategy = PerformanceOptimizedStrategy()

    async with AsyncWebCrawler() as crawler:
        config = AdaptiveConfig(
            confidence_threshold=0.7,
            max_pages=10,
            top_k_links=2  # Fewer links for speed
        )

        adaptive = AdaptiveCrawler(crawler, config)

        # Track performance
        import time
        start_time = time.time()

        state = await adaptive.digest(
            start_url="https://httpbin.org",
            query="http methods headers"
        )

        elapsed = time.time() - start_time

        print(f"\nPerformance Results:")
        print(f"  - Time elapsed: {elapsed:.2f} seconds")
        print(f"  - Pages crawled: {len(state.crawled_urls)}")
        print(f"  - Pages per second: {len(state.crawled_urls)/elapsed:.2f}")
        print(f"  - Final confidence: {adaptive.confidence:.2%}")
        print(f"  - Efficiency: {adaptive.confidence/len(state.crawled_urls):.2%} confidence per page")


async def main():
    """Run all demonstrations"""
    try:
        await demo_custom_strategies()
        await demo_performance_optimization()

        print("\n" + "="*60)
        print("All custom strategy examples completed!")
        print("="*60)

    except Exception as e:
        print(f"Error: {e}")
        import traceback
        traceback.print_exc()


if __name__ == "__main__":
    asyncio.run(main())