import modal

from typing import Optional, Dict, Any
# Container image with Crawl4ai and its dependencies.
# NOTE(review): fastapi[standard] is installed twice (pip_install below, then
# upgraded by the `pip install -U` run command) — presumably intentional; confirm.
image = (
    modal.Image.debian_slim(python_version="3.10")
    .pip_install(["fastapi[standard]"])
    .run_commands(
        "apt-get update",
        "apt-get install -y software-properties-common",
        "apt-get install -y git",
        "apt-add-repository non-free",
        "apt-add-repository contrib",
        # Install the `next` branch of crawl4ai directly from GitHub.
        "pip install -U git+https://github.com/unclecode/crawl4ai.git@next",
        # Quote the extra so the shell does not treat the brackets as a glob.
        "pip install -U 'fastapi[standard]'",
        "pip install -U pydantic",
        "crawl4ai-setup",  # Installs playwright and downloads chromium.
        # Print fastapi version as a build-time sanity check.
        "python -m fastapi --version",
    )
)
# Define the Modal app; every function below executes inside `image`.
app = modal.App("crawl4ai", image=image)
# Define default configurations used by crawl() when the caller supplies none.

# Browser defaults: headless and quiet.
DEFAULT_BROWSER_CONFIG = {
    "headless": True,
    "verbose": False,
}

# Serialized crawler config in crawl4ai's {"type": ..., "params": ...} loader
# format: a CrawlerRunConfig whose markdown generator applies pruning-based
# content filtering.
# NOTE(review): threshold 0.48 with "fixed" thresholding looks like the
# library's documented example values — confirm against the pinned crawl4ai
# version before tuning.
DEFAULT_CRAWLER_CONFIG = {
    "crawler_config": {
        "type": "CrawlerRunConfig",
        "params": {
            "markdown_generator": {
                "type": "DefaultMarkdownGenerator",
                "params": {
                    "content_filter": {
                        "type": "PruningContentFilter",
                        "params": {
                            "threshold": 0.48,
                            "threshold_type": "fixed"
                        }
                    }
                }
            }
        }
    }
}
@app.function(timeout=300)  # 5 minute timeout
async def crawl(
    url: str,
    browser_config: Optional[Dict[str, Any]] = None,
    crawler_config: Optional[Dict[str, Any]] = None,
) -> Dict[str, Any]:
    """
    Crawl a given URL using Crawl4ai.

    Args:
        url: The URL to crawl
        browser_config: Optional browser configuration to override defaults
        crawler_config: Optional crawler configuration to override defaults

    Returns:
        A dictionary containing the crawl results
    """
    # Imported inside the function: crawl4ai only exists in the remote image.
    from crawl4ai import (
        AsyncWebCrawler,
        BrowserConfig,
        CrawlerRunConfig,
        CrawlResult,
    )

    # Fall back to module-level defaults when the caller provides no config,
    # then hydrate the dicts via the library's loader methods.
    if browser_config is None:
        browser_config = DEFAULT_BROWSER_CONFIG
    browser_config_obj = BrowserConfig.load(browser_config)

    if crawler_config is None:
        crawler_config = DEFAULT_CRAWLER_CONFIG
    crawler_config_obj = CrawlerRunConfig.load(crawler_config)

    # Perform the crawl; the context manager handles browser startup/teardown.
    async with AsyncWebCrawler(config=browser_config_obj) as crawler:
        result: CrawlResult = await crawler.arun(url=url, config=crawler_config_obj)

    # Return serializable results.
    try:
        # Pydantic v2 serializer.
        return result.model_dump()
    except AttributeError:
        pass
    try:
        # Pydantic v1 serializer is .dict() — the original tried result.__dict__
        # here, which is not the v1 API and (almost) never raises AttributeError,
        # leaving the raw-result fallback dead.
        return result.dict()
    except AttributeError:
        pass
    # Last resort: plain attribute dict, or the raw object if even that is
    # missing (may not serialize cleanly across the Modal boundary).
    return getattr(result, "__dict__", result)
@app.function()
@modal.web_endpoint(method="POST")
def crawl_endpoint(data: Dict[str, Any]) -> Dict[str, Any]:
    """
    Web endpoint that accepts POST requests with JSON data containing:
    - url: The URL to crawl
    - browser_config: Optional browser configuration
    - crawler_config: Optional crawler configuration

    Returns the crawl results.
    """
    # Guard clause: reject requests that carry no URL.
    target = data.get("url")
    if not target:
        return {"error": "URL is required"}

    # Delegate the actual work to the remote crawl function.
    return crawl.remote(
        target,
        data.get("browser_config"),
        data.get("crawler_config"),
    )
@app.local_entrypoint()
def main(url: str = "https://www.modal.com"):
    """Command line entrypoint for local testing: crawl one URL and print it."""
    print(crawl.remote(url))