# crawl4ai/crawl4ai/pipeline/middlewares.py

import time
import sys
from typing import Dict, Any, List
import json

from crawl4ai.models import (
    CrawlResult,
    MarkdownGenerationResult,
    ScrapingResult,
    CrawlResultContainer,
)
from crawl4ai.async_database import async_db_manager
from crawl4ai.cache_context import CacheMode, CacheContext
from crawl4ai.utils import (
    sanitize_input_encode,
    InvalidCSSSelectorError,
    fast_format_html,
    create_box_message,
    get_error_context,
)


async def initialize_context_middleware(context: Dict[str, Any]) -> int:
    """Validate the pipeline inputs and seed the context for later middlewares.

    Reads ``url`` and ``config`` from the context. On success it stores a
    ``CacheContext`` under ``"cache_context"`` and the value of
    ``time.perf_counter()`` under ``"start_time"``. If ``config.cache_mode``
    is ``None`` it is set to ``CacheMode.ENABLED`` on the config object itself.

    Args:
        context: The pipeline context dictionary

    Returns:
        int: 1 for success, 0 for failure (``context["error_message"]`` is set)
    """
    url = context.get("url")
    config = context.get("config")

    if not isinstance(url, str) or not url:
        context["error_message"] = "Invalid URL, make sure the URL is a non-empty string"
        return 0

    # Without a config object the attribute access below would raise an
    # AttributeError; report the problem through the normal failure channel
    # (error_message + return 0) like the URL validation above.
    if config is None:
        context["error_message"] = (
            "Missing configuration, make sure context['config'] is provided"
        )
        return 0

    # Default to ENABLED if no cache mode specified
    if config.cache_mode is None:
        config.cache_mode = CacheMode.ENABLED

    # Create cache context and remember when the pipeline started
    context["cache_context"] = CacheContext(url, config.cache_mode, False)
    context["start_time"] = time.perf_counter()

    return 1

# Browser-hub based middlewares

async def browser_hub_middleware(context: Dict[str, Any]) -> int:
    """
    Make sure a browser hub is available under ``context["browser_hub"]``.

    A hub that is already present in the context is kept untouched. Otherwise
    one is requested from ``BrowserHub.get_browser_manager`` using the
    ``browser_config``, ``browser_hub_id``, ``browser_hub_connection`` and
    ``logger`` entries of the context (each is ``None`` when absent).

    Args:
        context: The pipeline context dictionary

    Returns:
        int: 1 for success, 0 for failure (``context["error_message"]`` is set)
    """
    from crawl4ai.browser.browser_hub import BrowserHub

    try:
        # Only ask for a hub when the context does not carry one yet
        if not context.get("browser_hub"):
            context["browser_hub"] = await BrowserHub.get_browser_manager(
                config=context.get("browser_config"),
                hub_id=context.get("browser_hub_id"),
                connection_info=context.get("browser_hub_connection"),
                logger=context.get("logger"),
            )
        return 1
    except Exception as exc:
        context["error_message"] = f"Failed to initialize browser hub: {str(exc)}"
        return 0


async def fetch_content_middleware(context: Dict[str, Any]) -> int:
    """
    Crawl the URL through the shared browser hub and store the response.

    Nothing is fetched when the context already holds both a cached result and
    HTML. Otherwise an ``AsyncPlaywrightCrawlerStrategy`` is created, its
    ``browser_manager`` is replaced by ``context["browser_hub"]`` and the URL
    is crawled. The response and its ``html``, ``screenshot``, ``pdf_data``
    and ``js_execution_result`` fields are written back to the context.

    Args:
        context: The pipeline context dictionary

    Returns:
        int: 1 for success, 0 for failure (``context["error_message"]`` is set)
    """
    # A usable cache hit means there is nothing to fetch
    if context.get("cached_result") and context.get("html"):
        return 1

    hub = context.get("browser_hub")

    try:
        from crawl4ai.async_crawler_strategy import AsyncPlaywrightCrawlerStrategy

        strategy = AsyncPlaywrightCrawlerStrategy(
            browser_config=hub.config if hub else None,
            logger=context.get("logger"),
        )
        # Swap in the shared hub so the strategy takes its pages from it
        strategy.browser_manager = hub

        response = await strategy.crawl(
            context.get("url"), config=context.get("config")
        )

        # Publish the response fields one by one, then the response itself
        context["html"] = response.html
        context["screenshot_data"] = response.screenshot
        context["pdf_data"] = response.pdf_data
        context["js_execution_result"] = response.js_execution_result
        context["async_response"] = response
        return 1
    except Exception as exc:
        context["error_message"] = f"Error fetching content: {str(exc)}"
        return 0


async def check_cache_middleware(context: Dict[str, Any]) -> int:
    """Look up the URL in the cache and load the cached fields into the context.

    The keys ``cached_result``, ``html``, ``extracted_content``,
    ``screenshot_data`` and ``pdf_data`` are always reset to ``None`` first.
    When the cache context allows reading and an entry exists, its fields are
    copied into the context. If a screenshot or PDF is requested by the config
    but missing from the entry, ``cached_result`` is cleared again while the
    other loaded fields stay in place.

    Args:
        context: The pipeline context dictionary

    Returns:
        int: always 1
    """
    url = context.get("url")
    config = context.get("config")
    cache_context = context.get("cache_context")
    logger = context.get("logger")

    # Start from a clean slate
    for key in ("cached_result", "html", "extracted_content", "screenshot_data", "pdf_data"):
        context[key] = None

    if not cache_context.should_read():
        return 1

    cached = await async_db_manager.aget_cached_url(url)
    context["cached_result"] = cached
    if not cached:
        return 1

    html = sanitize_input_encode(cached.html)
    extracted = sanitize_input_encode(cached.extracted_content or "")
    # An empty string or an empty JSON list counts as "nothing extracted"
    if not extracted or extracted == "[]":
        extracted = None

    screenshot = cached.screenshot
    pdf = cached.pdf

    # A requested artifact that the cache entry lacks invalidates the hit
    screenshot_missing = config.screenshot and not screenshot
    pdf_missing = config.pdf and not pdf
    if screenshot_missing or pdf_missing:
        context["cached_result"] = None

    context["html"] = html
    context["extracted_content"] = extracted
    context["screenshot_data"] = screenshot
    context["pdf_data"] = pdf

    logger.url_status(
        url=cache_context.display_url,
        success=bool(html),
        timing=time.perf_counter() - context["start_time"],
        tag="FETCH",
    )
    return 1


async def configure_proxy_middleware(context: Dict[str, Any]) -> int:
    """Pick the next proxy from the config's rotation strategy, if there is one.

    Does nothing on a cache hit, when there is no config, or when the config
    has no ``proxy_rotation_strategy``. Otherwise the next proxy is requested;
    a truthy proxy is logged and stored in ``config.proxy_config``.

    Args:
        context: The pipeline context dictionary

    Returns:
        int: always 1
    """
    config = context.get("config")
    logger = context.get("logger")

    # Cached content needs no network access, hence no proxy
    if context.get("cached_result") and context.get("html"):
        return 1

    if not (config and config.proxy_rotation_strategy):
        return 1

    proxy = await config.proxy_rotation_strategy.get_next_proxy()
    if not proxy:
        return 1

    logger.info(
        message="Switch proxy: {proxy}",
        tag="PROXY",
        params={"proxy": proxy.server},
    )
    config.proxy_config = proxy
    return 1


async def check_robots_txt_middleware(context: Dict[str, Any]) -> int:
    """Stop the pipeline when robots.txt forbids fetching the URL.

    The check only runs when there is no cache hit and the config has
    ``check_robots_txt`` enabled. It asks ``context["robots_parser"]`` whether
    the browser config's user agent may fetch the URL. On denial a failed
    ``CrawlResult`` with status code 403 is stored under
    ``context["crawl_result"]``.

    Args:
        context: The pipeline context dictionary

    Returns:
        int: 1 to continue, 0 when the URL is blocked by robots.txt
    """
    url = context.get("url")
    config = context.get("config")
    browser_config = context.get("browser_config")
    robots_parser = context.get("robots_parser")

    # Cached content is served without touching the site
    if context.get("cached_result") and context.get("html"):
        return 1

    # The check is opt-in
    if not (config and config.check_robots_txt):
        return 1

    allowed = await robots_parser.can_fetch(url, browser_config.user_agent)
    if allowed:
        return 1

    context["crawl_result"] = CrawlResult(
        url=url,
        html="",
        success=False,
        status_code=403,
        error_message="Access denied by robots.txt",
        response_headers={"X-Robots-Status": "Blocked by robots.txt"},
    )
    return 0


async def fetch_content_middleware_(context: Dict[str, Any]) -> int:
    """Fetch the page with the crawler strategy stored in the context.

    Variant of the fetch step that uses ``context["crawler_strategy"]``; it is
    not part of the list returned by ``create_default_middleware_list``.
    Nothing is fetched on a cache hit. Otherwise the strategy's user agent is
    updated when the config provides one, the URL is crawled, the HTML is
    passed through ``sanitize_input_encode``, the fetch is logged and the
    response fields are written to the context.

    Args:
        context: The pipeline context dictionary

    Returns:
        int: 1 for success, 0 for failure (``context["error_message"]`` is set)
    """
    # Cached content makes the fetch unnecessary
    if context.get("cached_result") and context.get("html"):
        return 1

    url = context.get("url")
    config = context.get("config")
    strategy = context.get("crawler_strategy")
    logger = context.get("logger")

    try:
        fetch_started = time.perf_counter()

        if config.user_agent:
            strategy.update_user_agent(config.user_agent)

        response = await strategy.crawl(url, config=config)

        html = sanitize_input_encode(response.html)
        screenshot = response.screenshot
        pdf = response.pdf_data
        js_result = response.js_execution_result

        fetch_finished = time.perf_counter()
        logger.url_status(
            url=context["cache_context"].display_url,
            success=bool(html),
            timing=fetch_finished - fetch_started,
            tag="FETCH",
        )

        # Publish only after the fetch has been logged successfully
        context["html"] = html
        context["screenshot_data"] = screenshot
        context["pdf_data"] = pdf
        context["js_execution_result"] = js_result
        context["async_response"] = response
        return 1
    except Exception as exc:
        context["error_message"] = f"Error fetching content: {str(exc)}"
        return 0


async def scrape_content_middleware(context: Dict[str, Any]) -> int:
    """Run the config's scraping strategy over the fetched HTML.

    Skipped when ``context["crawl_result"]`` is already set. The strategy is
    called with the URL, the HTML and keyword arguments built from the
    config's attributes (without ``url``), extended by entries of
    ``context["kwargs"]`` whose keys are not config attributes. The resulting
    ``cleaned_html``, ``media``, ``links`` and ``metadata`` are stored in the
    context; both a plain dict result and an object result are accepted.

    Args:
        context: The pipeline context dictionary

    Returns:
        int: 1 for success, 0 for failure (``context["error_message"]`` is set)
    """
    url = context.get("url")
    html = context.get("html")
    config = context.get("config")
    logger = context.get("logger")

    # An earlier middleware already produced the final result
    if context.get("crawl_result"):
        return 1

    try:
        display_url = "Raw HTML" if context.get("is_raw_html", False) else url
        started = time.perf_counter()

        # The strategy logs through the pipeline logger unless it has its own
        strategy = config.scraping_strategy
        if not strategy.logger:
            strategy.logger = logger

        # Config attributes take precedence over same-named context kwargs
        scrape_kwargs = config.__dict__.copy()
        scrape_kwargs.pop("url", None)
        for key, value in context.get("kwargs", {}).items():
            scrape_kwargs.setdefault(key, value)

        result = strategy.scrap(url, html, **scrape_kwargs)
        if result is None:
            raise ValueError(f"Process HTML, Failed to extract content from the website: {url}")

        # Normalise the two supported result shapes
        if isinstance(result, dict):
            cleaned_html = sanitize_input_encode(result.get("cleaned_html", ""))
            media = result.get("media", {})
            links = result.get("links", {})
            metadata = result.get("metadata", {})
        else:
            cleaned_html = sanitize_input_encode(result.cleaned_html)
            media = result.media.model_dump()
            links = result.links.model_dump()
            metadata = result.metadata

        context["cleaned_html"] = cleaned_html
        context["media"] = media
        context["links"] = links
        context["metadata"] = metadata

        # Elapsed time truncated to milliseconds
        elapsed = int((time.perf_counter() - started) * 1000) / 1000
        logger.info(
            message="{url:.50}... | Time: {timing}s",
            tag="SCRAPE",
            params={"url": display_url, "timing": elapsed},
        )
        return 1
    except InvalidCSSSelectorError as exc:
        context["error_message"] = str(exc)
        return 0
    except Exception as exc:
        context["error_message"] = f"Process HTML, Failed to extract content from the website: {url}, error: {str(exc)}"
        return 0


async def generate_markdown_middleware(context: Dict[str, Any]) -> int:
    """Convert the cleaned HTML to markdown with the config's generator.

    Skipped when ``context["crawl_result"]`` is already set. Otherwise
    ``config.markdown_generator.generate_markdown`` is called with the cleaned
    HTML and the URL as base URL, and its return value is stored under
    ``context["markdown_result"]``. Exceptions from the generator propagate.

    Args:
        context: The pipeline context dictionary

    Returns:
        int: always 1
    """
    # An earlier middleware already produced the final result
    if context.get("crawl_result"):
        return 1

    generator = context.get("config").markdown_generator
    context["markdown_result"] = generator.generate_markdown(
        cleaned_html=context.get("cleaned_html"),
        base_url=context.get("url"),
    )
    return 1


async def extract_structured_content_middleware(context: Dict[str, Any]) -> int:
    """Run the config's extraction strategy and store its JSON output.

    Skipped when a crawl result or extracted content is already present, and
    when no extraction strategy (or a ``NoExtractionStrategy``) is configured.
    The strategy's ``input_format`` selects the content to extract from; an
    unavailable ``fit_markdown`` falls back to raw markdown with a warning.
    HTML inputs are chunked with ``IdentityChunking``, everything else with
    the config's chunking strategy. The result is serialised with
    ``json.dumps`` and stored under ``context["extracted_content"]``.

    Args:
        context: The pipeline context dictionary

    Returns:
        int: always 1
    """
    url = context.get("url")
    config = context.get("config")
    markdown_result = context.get("markdown_result")
    cleaned_html = context.get("cleaned_html")
    logger = context.get("logger")

    # Nothing to do when a result or extracted content already exists
    if context.get("crawl_result") or bool(context.get("extracted_content")):
        return 1

    from crawl4ai.chunking_strategy import IdentityChunking
    from crawl4ai.extraction_strategy import NoExtractionStrategy

    strategy = config.extraction_strategy
    if not strategy or isinstance(strategy, NoExtractionStrategy):
        return 1

    started = time.perf_counter()
    display_url = "Raw HTML" if context.get("is_raw_html", False) else url

    # Resolve the input format, degrading fit_markdown when it is empty
    content_format = config.extraction_strategy.input_format
    if content_format == "fit_markdown" and not markdown_result.fit_markdown:
        logger.warning(
            message="Fit markdown requested but not available. Falling back to raw markdown.",
            tag="EXTRACT",
            params={"url": display_url},
        )
        content_format = "markdown"

    # Unknown formats fall back to raw markdown
    sources = {
        "markdown": markdown_result.raw_markdown,
        "html": context.get("html"),
        "cleaned_html": cleaned_html,
        "fit_markdown": markdown_result.fit_markdown,
    }
    content = sources.get(content_format, markdown_result.raw_markdown)

    # HTML is passed through unchunked; other formats use the configured chunker
    if content_format in ["html", "cleaned_html"]:
        chunker = IdentityChunking()
    else:
        chunker = config.chunking_strategy

    sections = chunker.chunk(content)
    extracted = config.extraction_strategy.run(url, sections)
    context["extracted_content"] = json.dumps(
        extracted, indent=4, default=str, ensure_ascii=False
    )

    logger.info(
        message="Completed for {url:.50}... | Time: {timing}s",
        tag="EXTRACT",
        params={"url": display_url, "timing": time.perf_counter() - started},
    )
    return 1


async def format_html_middleware(context: Dict[str, Any]) -> int:
    """Pretty-print the cleaned HTML when the config asks for it.

    Skipped when ``context["crawl_result"]`` is already set. When
    ``config.prettiify`` is truthy and cleaned HTML is present, it is replaced
    by the output of ``fast_format_html``.

    Args:
        context: The pipeline context dictionary

    Returns:
        int: always 1
    """
    # An earlier middleware already produced the final result
    if context.get("crawl_result"):
        return 1

    config = context.get("config")
    html = context.get("cleaned_html")

    # `prettiify` (sic) is the attribute name read from the config
    wants_formatting = config.prettiify and html
    if wants_formatting:
        context["cleaned_html"] = fast_format_html(html)

    return 1


async def write_cache_middleware(context: Dict[str, Any]) -> int:
    """Placeholder for the cache-write step; it never writes anything itself.

    The cache context's ``should_write()`` is consulted when no crawl result
    is present, but the outcome does not change what this middleware does:
    the actual write happens in ``build_result_middleware``, so the
    ``CrawlResult`` is only created once.

    Args:
        context: The pipeline context dictionary

    Returns:
        int: always 1
    """
    cache_context = context.get("cache_context")
    cached_result = context.get("cached_result")

    nothing_to_write = bool(
        context.get("crawl_result")
        or not cache_context.should_write()
        or cached_result
    )
    if not nothing_to_write:
        # Intentionally empty: build_result_middleware creates the
        # CrawlResult and caches it there to avoid creating it twice.
        pass

    return 1


async def build_result_middleware(context: Dict[str, Any]) -> int:
    """Assemble the final result and store it under ``context["final_result"]``.

    Three cases are handled, in this order:

    1. ``context["crawl_result"]`` was set by an earlier middleware (such as
       the robots.txt check): it is wrapped as-is.
    2. A cached result and HTML are both present: completion is logged, the
       cached result's ``success``, ``session_id`` and ``redirected_url`` are
       updated, and it is wrapped.
    3. Otherwise a new ``CrawlResult`` is built from the values collected in
       the context, enriched with the response details when a response is
       available, logged, written to the cache when the cache context allows
       it, and wrapped.

    An exception raised in case 3 is logged and turned into a failed
    ``CrawlResult``; the middleware still returns 1 then.

    Args:
        context: The pipeline context dictionary

    Returns:
        int: always 1
    """
    url = context.get("url")
    html = context.get("html", "")
    cache_context = context.get("cache_context")
    cached_result = context.get("cached_result")
    config = context.get("config")
    logger = context.get("logger")

    # Case 1: a result produced earlier in the pipeline wins
    early_result = context.get("crawl_result")
    if early_result:
        context["final_result"] = CrawlResultContainer(early_result)
        return 1

    # Case 2: serve the cache hit
    if cached_result and html:
        logger.success(
            message="{url:.50}... | Status: {status} | Total: {timing}",
            tag="COMPLETE",
            params={
                "url": cache_context.display_url,
                "status": True,
                "timing": f"{time.perf_counter() - context['start_time']:.2f}s",
            },
            colors={"status": "green", "timing": "yellow"},
        )

        cached_result.success = bool(html)
        cached_result.session_id = getattr(config, "session_id", None)
        # Keep a stored redirect target, otherwise fall back to the URL
        cached_result.redirected_url = cached_result.redirected_url or url
        context["final_result"] = CrawlResultContainer(cached_result)
        return 1

    # Case 3: build a fresh result from the context
    try:
        response = context.get("async_response")

        crawl_result = CrawlResult(
            url=url,
            html=html,
            cleaned_html=context.get("cleaned_html", ""),
            markdown=context.get("markdown_result"),
            media=context.get("media", {}),
            links=context.get("links", {}),
            metadata=context.get("metadata", {}),
            screenshot=context.get("screenshot_data"),
            pdf=context.get("pdf_data"),
            extracted_content=context.get("extracted_content"),
            success=bool(html),
            error_message="",
        )

        # Response details exist only when a fetch actually happened
        if response:
            crawl_result.status_code = response.status_code
            crawl_result.redirected_url = response.redirected_url or url
            crawl_result.response_headers = response.response_headers
            crawl_result.downloaded_files = response.downloaded_files
            crawl_result.js_execution_result = context.get("js_execution_result")
            crawl_result.ssl_certificate = response.ssl_certificate

        crawl_result.session_id = getattr(config, "session_id", None)

        succeeded = crawl_result.success
        logger.success(
            message="{url:.50}... | Status: {status} | Total: {timing}",
            tag="COMPLETE",
            params={
                "url": cache_context.display_url,
                "status": succeeded,
                "timing": f"{time.perf_counter() - context['start_time']:.2f}s",
            },
            colors={
                "status": "green" if succeeded else "red",
                "timing": "yellow",
            },
        )

        # Persist the fresh result when the cache policy permits it
        if cache_context.should_write() and not cached_result:
            await async_db_manager.acache_url(crawl_result)

        context["final_result"] = CrawlResultContainer(crawl_result)
        return 1
    except Exception as exc:
        details = get_error_context(sys.exc_info())

        error_message = (
            f"Unexpected error in build_result at line {details['line_no']} "
            f"in {details['function']} ({details['filename']}):\n"
            f"Error: {str(exc)}\n\n"
            f"Code context:\n{details['code_context']}"
        )

        logger.error_status(
            url=url,
            error=create_box_message(error_message, type="error"),
            tag="ERROR",
        )

        # Report the failure as a result instead of failing the pipeline
        failure = CrawlResult(
            url=url, html="", success=False, error_message=error_message
        )
        context["final_result"] = CrawlResultContainer(failure)
        return 1


async def handle_error_middleware(context: Dict[str, Any]) -> Dict[str, Any]:
    """Turn the error recorded in the context into a failed final result.

    Reads ``url`` (default ``""``) and ``error_message`` (default
    ``"Unknown error"``) from the context, logs the error when a logger is
    available and stores a failed ``CrawlResult`` wrapped in a
    ``CrawlResultContainer`` under ``context["final_result"]``.

    Args:
        context: The pipeline context dictionary

    Returns:
        Dict[str, Any]: the same context object that was passed in
    """
    url = context.get("url", "")
    error_message = context.get("error_message", "Unknown error")

    # Logging is optional; the failure result is always produced
    logger = context.get("logger")
    if logger:
        boxed = create_box_message(error_message, type="error")
        logger.error_status(url=url, error=boxed, tag="ERROR")

    failure = CrawlResult(
        url=url, html="", success=False, error_message=error_message
    )
    context["final_result"] = CrawlResultContainer(failure)
    return context


# Optional custom middlewares (not included in the default middleware list)

async def sentiment_analysis_middleware(context: Dict[str, Any]) -> int:
    """Analyze sentiment of generated markdown using TextBlob.

    Skipped when there is no ``markdown_result`` in the context or when
    ``context["success"]`` is present and falsy. On success a dictionary with
    ``polarity``, ``subjectivity`` and ``classification`` is stored under
    ``context["sentiment_analysis"]``. Any failure, including TextBlob not
    being importable, is recorded under ``context["sentiment_analysis_error"]``
    and never fails the pipeline.

    Args:
        context: The pipeline context dictionary

    Returns:
        int: always 1
    """
    markdown_result = context.get("markdown_result")

    # Skip if no markdown or already failed
    if not markdown_result or not context.get("success", True):
        return 1

    try:
        # Imported lazily and inside the try block: TextBlob is an optional
        # dependency, and its absence must not abort the pipeline.
        from textblob import TextBlob

        sentiment = TextBlob(markdown_result.raw_markdown).sentiment
        polarity = sentiment.polarity

        # A small dead zone around zero is reported as neutral
        if polarity > 0.1:
            classification = "positive"
        elif polarity < -0.1:
            classification = "negative"
        else:
            classification = "neutral"

        context["sentiment_analysis"] = {
            "polarity": polarity,  # -1.0 to 1.0 (negative to positive)
            "subjectivity": sentiment.subjectivity,  # 0.0 to 1.0 (objective to subjective)
            "classification": classification,
        }
    except Exception as e:
        # Don't fail the pipeline on sentiment analysis failure
        context["sentiment_analysis_error"] = str(e)

    return 1


async def log_timing_middleware(context: Dict[str, Any], name: str) -> int:
    """Record a timing mark for ``name`` and report the elapsed time.

    The current ``time.perf_counter()`` value is stored under
    ``_timing_mark_<name>``. When the context also holds
    ``_timing_start_<name>``, the difference is stored under
    ``_timing_duration_<name>`` and logged if a logger is available.

    Args:
        context: The pipeline context dictionary
        name: Label used to build the timing keys and the log message

    Returns:
        int: always 1
    """
    mark_key = f"_timing_mark_{name}"
    start_key = f"_timing_start_{name}"

    context[mark_key] = time.perf_counter()

    # Without a start time there is nothing to measure
    if start_key not in context:
        return 1

    elapsed = context[mark_key] - context[start_key]
    context[f"_timing_duration_{name}"] = elapsed

    logger = context.get("logger")
    if logger:
        logger.info(
            message="{name} completed in {duration:.2f}s",
            tag="TIMING",
            params={"name": name, "duration": elapsed},
        )

    return 1


async def validate_url_middleware(context: Dict[str, Any], patterns: List[str]) -> int:
    """Accept the URL only when it matches one of the glob patterns.

    Matching is done with ``fnmatch.fnmatch``. An empty or missing pattern
    list accepts every URL.

    Args:
        context: The pipeline context dictionary
        patterns: Glob patterns the URL is allowed to match

    Returns:
        int: 1 when the URL is accepted, 0 otherwise
        (``context["error_message"]`` is set)
    """
    import fnmatch

    url = context.get("url", "")

    # No patterns means no restriction; otherwise one match is enough
    if not patterns or any(fnmatch.fnmatch(url, glob) for glob in patterns):
        return 1

    context["error_message"] = f"URL '{url}' does not match any allowed patterns"
    return 0


# Default middleware list
def create_default_middleware_list():
    """Build the default, ordered list of pipeline middleware functions.

    A new list is created on every call. ``write_cache_middleware`` and the
    custom middlewares defined in this module are not part of it.

    Returns:
        list: the middleware functions in the order they are listed here
    """
    default_middlewares = [
        # Setup and cache lookup
        initialize_context_middleware,
        check_cache_middleware,
        # The browser hub is placed before the fetch step
        browser_hub_middleware,
        configure_proxy_middleware,
        check_robots_txt_middleware,
        fetch_content_middleware,
        # Content processing
        scrape_content_middleware,
        generate_markdown_middleware,
        extract_structured_content_middleware,
        format_html_middleware,
        # Final result assembly
        build_result_middleware,
    ]
    return default_middlewares