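"""
AWS Lambda handler that crawls a URL with crawl4ai and returns the result as JSON.

Expects an API Gateway-style event whose JSON body contains a required "url"
plus optional "browser_config" and "crawler_config" dicts.
"""
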
import json
import asyncio
import os

# Ensure environment variables and directories are set
os.environ['CRAWL4_AI_BASE_DIRECTORY'] = '/tmp/.crawl4ai'
os.environ['HOME'] = '/tmp'

# Create directory if it doesn't exist
os.makedirs('/tmp/.crawl4ai', exist_ok=True)
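
# Note: /tmp is the only writable path in the Lambda execution environment,
# which is why crawl4ai's base directory and HOME are redirected there before
# crawl4ai is imported below.
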
from crawl4ai import (
    AsyncWebCrawler,
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode
)


def handler(event, context):
    # Parse the incoming event (API Gateway request)
    try:
        body = json.loads(event.get('body', '{}'))

        url = body.get('url')
        if not url:
            return {
                'statusCode': 400,
                'body': json.dumps({'error': 'URL is required'})
            }

        # Get optional configurations or use defaults
        browser_config_dict = body.get('browser_config', {})
        crawler_config_dict = body.get('crawler_config', {})

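        # asyncio.run() creates a fresh event loop for each invocation, the
        # simplest way to drive crawl4ai's async API from this sync handler.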
        # Run the crawler
        result = asyncio.run(crawl(url, browser_config_dict, crawler_config_dict))

        # Return successful response
        return {
            'statusCode': 200,
            'headers': {
                'Content-Type': 'application/json'
            },
            'body': json.dumps(result)
        }

    except Exception as e:
        # Handle errors
        import traceback
        return {
            'statusCode': 500,
            'body': json.dumps({
                'error': str(e),
                'traceback': traceback.format_exc()
            })
        }


async def crawl(url, browser_config_dict, crawler_config_dict):
    """
    Run the crawler with the provided configurations, applying Lambda-specific settings.
    """
    # Start with user-provided config but override with Lambda-required settings
    base_browser_config = BrowserConfig.load(browser_config_dict) if browser_config_dict else BrowserConfig()

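    # Note: load() expects the serialized dict format produced by the configs'
    # dump() method (an assumption about this crawl4ai version's config API;
    # adjust if your version uses a different serialization helper).
    # base_browser_config is only consumed by the commented-out merge below.
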
    # Apply Lambda-specific browser configurations
    browser_config = BrowserConfig(
        verbose=True,
        browser_type="chromium",
        headless=True,
        user_agent_mode="random",
        light_mode=True,
        use_managed_browser=False,
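        # The flags below are the usual set for Chromium in a constrained
        # container: Lambda lacks the privileges Chromium's sandbox needs
        # (--no-sandbox, --disable-setuid-sandbox), provides no sizable
        # /dev/shm (--disable-dev-shm-usage), and is memory-constrained
        # enough that a single browser process (--single-process) helps.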
        extra_args=[
            "--headless=new",
            "--no-sandbox",
            "--disable-dev-shm-usage",
            "--disable-setuid-sandbox",
            "--remote-allow-origins=*",
            "--autoplay-policy=user-gesture-required",
            "--single-process",
        ],
        # Carry over any other settings from user config that aren't overridden
        # **{k: v for k, v in base_browser_config.model_dump().items()
        #    if k not in ['verbose', 'browser_type', 'headless', 'user_agent_mode',
        #                 'light_mode', 'use_managed_browser', 'extra_args']}
    )

    # Start with user-provided crawler config but ensure cache is bypassed
    base_crawler_config = CrawlerRunConfig.load(crawler_config_dict) if crawler_config_dict else CrawlerRunConfig()

    # Apply Lambda-specific crawler configurations
    crawler_config = CrawlerRunConfig(
        exclude_external_links=base_crawler_config.exclude_external_links,
        remove_overlay_elements=True,
        magic=True,
        cache_mode=CacheMode.BYPASS,
        # Carry over markdown generator and other settings
        markdown_generator=base_crawler_config.markdown_generator
    )

    # Perform the crawl with Lambda-optimized settings
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=crawler_config)

    # Return serializable results
    return result.model_dump()
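

# Minimal local smoke test (a sketch; not part of the deployed handler). It
# fakes an API Gateway proxy event so the module can be exercised without
# deploying: `python <this file> https://example.com`.
if __name__ == "__main__":
    import sys

    test_event = {
        'body': json.dumps({
            'url': sys.argv[1] if len(sys.argv) > 1 else 'https://example.com'
        })
    }
    response = handler(test_event, None)
    print(response['statusCode'])
    print(response['body'][:500])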