Files
crawl4ai/deploy/modal/entry.py
2025-03-10 18:57:14 +08:00

128 lines
3.8 KiB
Python

import modal
from typing import Optional, Dict, Any
# Create a custom image with Crawl4ai and its dependencies
# "pip install crawl4ai",
image = modal.Image.debian_slim(python_version="3.10").pip_install(["fastapi[standard]"]).run_commands(
"apt-get update",
"apt-get install -y software-properties-common",
"apt-get install -y git",
"apt-add-repository non-free",
"apt-add-repository contrib",
"pip install -U git+https://github.com/unclecode/crawl4ai.git@next",
"pip install -U fastapi[standard]",
"pip install -U pydantic",
"crawl4ai-setup", # This installs playwright and downloads chromium
# Print fastpi version
"python -m fastapi --version",
)
# Define the app
app = modal.App("crawl4ai", image=image)
# Define default configurations
DEFAULT_BROWSER_CONFIG = {
"headless": True,
"verbose": False,
}
DEFAULT_CRAWLER_CONFIG = {
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "PruningContentFilter",
"params": {
"threshold": 0.48,
"threshold_type": "fixed"
}
}
}
}
}
}
}
@app.function(timeout=300) # 5 minute timeout
async def crawl(
url: str,
browser_config: Optional[Dict[str, Any]] = None,
crawler_config: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
"""
Crawl a given URL using Crawl4ai.
Args:
url: The URL to crawl
browser_config: Optional browser configuration to override defaults
crawler_config: Optional crawler configuration to override defaults
Returns:
A dictionary containing the crawl results
"""
from crawl4ai import (
AsyncWebCrawler,
BrowserConfig,
CrawlerRunConfig,
CrawlResult
)
# Prepare browser config using the loader method
if browser_config is None:
browser_config = DEFAULT_BROWSER_CONFIG
browser_config_obj = BrowserConfig.load(browser_config)
# Prepare crawler config using the loader method
if crawler_config is None:
crawler_config = DEFAULT_CRAWLER_CONFIG
crawler_config_obj = CrawlerRunConfig.load(crawler_config)
# Perform the crawl
async with AsyncWebCrawler(config=browser_config_obj) as crawler:
result: CrawlResult = await crawler.arun(url=url, config=crawler_config_obj)
# Return serializable results
try:
# Try newer Pydantic v2 method
return result.model_dump()
except AttributeError:
try:
# Try older Pydantic v1 method
return result.__dict__
except AttributeError:
# Fallback to returning the raw result
return result
@app.function()
@modal.web_endpoint(method="POST")
def crawl_endpoint(data: Dict[str, Any]) -> Dict[str, Any]:
"""
Web endpoint that accepts POST requests with JSON data containing:
- url: The URL to crawl
- browser_config: Optional browser configuration
- crawler_config: Optional crawler configuration
Returns the crawl results.
"""
url = data.get("url")
if not url:
return {"error": "URL is required"}
browser_config = data.get("browser_config")
crawler_config = data.get("crawler_config")
return crawl.remote(url, browser_config, crawler_config)
@app.local_entrypoint()
def main(url: str = "https://www.modal.com"):
"""
Command line entrypoint for local testing.
"""
result = crawl.remote(url)
print(result)