chore: remove old Dockerfile and server script
Dockerfile_old (136 lines deleted)
@@ -1,136 +0,0 @@
# syntax=docker/dockerfile:1.4

ARG TARGETPLATFORM
ARG BUILDPLATFORM

# Other build arguments
ARG PYTHON_VERSION=3.10

# Base stage with system dependencies
FROM python:${PYTHON_VERSION}-slim as base

# Declare ARG variables again within the build stage
ARG INSTALL_TYPE=all
ARG ENABLE_GPU=false

# Platform-specific labels
LABEL maintainer="unclecode"
LABEL description="🔥🕷️ Crawl4AI: Open-source LLM Friendly Web Crawler & scraper"
LABEL version="1.0"

# Environment setup
ENV PYTHONUNBUFFERED=1 \
    PYTHONDONTWRITEBYTECODE=1 \
    PIP_NO_CACHE_DIR=1 \
    PIP_DISABLE_PIP_VERSION_CHECK=1 \
    PIP_DEFAULT_TIMEOUT=100 \
    DEBIAN_FRONTEND=noninteractive

# Install system dependencies
RUN apt-get update && apt-get install -y --no-install-recommends \
    build-essential \
    curl \
    wget \
    gnupg \
    git \
    cmake \
    pkg-config \
    python3-dev \
    libjpeg-dev \
    libpng-dev \
    && rm -rf /var/lib/apt/lists/*

# Playwright system dependencies for Linux
RUN apt-get update && apt-get install -y --no-install-recommends \
    libglib2.0-0 \
    libnss3 \
    libnspr4 \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    libcups2 \
    libdrm2 \
    libdbus-1-3 \
    libxcb1 \
    libxkbcommon0 \
    libx11-6 \
    libxcomposite1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxrandr2 \
    libgbm1 \
    libpango-1.0-0 \
    libcairo2 \
    libasound2 \
    libatspi2.0-0 \
    && rm -rf /var/lib/apt/lists/*

# GPU support if enabled and architecture is supported
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETPLATFORM" = "linux/amd64" ] ; then \
    apt-get update && apt-get install -y --no-install-recommends \
    nvidia-cuda-toolkit \
    && rm -rf /var/lib/apt/lists/* ; \
    else \
    echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
    fi

# Create and set working directory
WORKDIR /app

# Copy the entire project
COPY . .

# Install base requirements
RUN pip install --no-cache-dir -r requirements.txt

# Install required library for FastAPI
RUN pip install fastapi uvicorn psutil

# Install ML dependencies first for better layer caching
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
    pip install --no-cache-dir \
    torch \
    torchvision \
    torchaudio \
    scikit-learn \
    nltk \
    transformers \
    tokenizers && \
    python -m nltk.downloader punkt stopwords ; \
    fi

# Install the package
RUN if [ "$INSTALL_TYPE" = "all" ] ; then \
    pip install ".[all]" && \
    python -m crawl4ai.model_loader ; \
    elif [ "$INSTALL_TYPE" = "torch" ] ; then \
    pip install ".[torch]" ; \
    elif [ "$INSTALL_TYPE" = "transformer" ] ; then \
    pip install ".[transformer]" && \
    python -m crawl4ai.model_loader ; \
    else \
    pip install "." ; \
    fi

# Install MkDocs and required plugins
RUN pip install --no-cache-dir \
    mkdocs \
    mkdocs-material \
    mkdocs-terminal \
    pymdown-extensions

# Build MkDocs documentation
RUN mkdocs build

# Install Playwright and browsers
RUN if [ "$TARGETPLATFORM" = "linux/amd64" ]; then \
    playwright install chromium; \
    elif [ "$TARGETPLATFORM" = "linux/arm64" ]; then \
    playwright install chromium; \
    fi

# Expose port
EXPOSE 8000 11235 9222 8080

# Start the FastAPI server
CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "11235"]
main.py (526 lines deleted)
@@ -1,526 +0,0 @@
import asyncio, os
from fastapi import FastAPI, HTTPException
from fastapi.staticfiles import StaticFiles
from fastapi.middleware.cors import CORSMiddleware
from fastapi.templating import Jinja2Templates
from fastapi.responses import RedirectResponse
from fastapi.security import HTTPBearer, HTTPAuthorizationCredentials
from fastapi import Depends, Security

from pydantic import BaseModel, HttpUrl, Field
from typing import Optional, List, Dict, Any, Union
import psutil
import time
import uuid
import math
import logging
from enum import Enum
from dataclasses import dataclass
from crawl4ai import AsyncWebCrawler, CrawlResult, CacheMode
from crawl4ai.config import MIN_WORD_THRESHOLD
from crawl4ai.extraction_strategy import (
    LLMExtractionStrategy,
    CosineStrategy,
    JsonCssExtractionStrategy,
)

__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))


logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

class TaskStatus(str, Enum):
    PENDING = "pending"
    PROCESSING = "processing"
    COMPLETED = "completed"
    FAILED = "failed"


class CrawlerType(str, Enum):
    BASIC = "basic"
    LLM = "llm"
    COSINE = "cosine"
    JSON_CSS = "json_css"


class ExtractionConfig(BaseModel):
    type: CrawlerType
    params: Dict[str, Any] = {}


class ChunkingStrategy(BaseModel):
    type: str
    params: Dict[str, Any] = {}


class ContentFilter(BaseModel):
    type: str = "bm25"
    params: Dict[str, Any] = {}


class CrawlRequest(BaseModel):
    urls: Union[HttpUrl, List[HttpUrl]]
    word_count_threshold: int = MIN_WORD_THRESHOLD
    extraction_config: Optional[ExtractionConfig] = None
    chunking_strategy: Optional[ChunkingStrategy] = None
    content_filter: Optional[ContentFilter] = None
    js_code: Optional[List[str]] = None
    wait_for: Optional[str] = None
    css_selector: Optional[str] = None
    screenshot: bool = False
    magic: bool = False
    extra: Optional[Dict[str, Any]] = {}
    session_id: Optional[str] = None
    cache_mode: Optional[CacheMode] = CacheMode.ENABLED
    priority: int = Field(default=5, ge=1, le=10)
    ttl: Optional[int] = 3600
    crawler_params: Dict[str, Any] = {}


@dataclass
class TaskInfo:
    id: str
    status: TaskStatus
    result: Optional[Union[CrawlResult, List[CrawlResult]]] = None
    error: Optional[str] = None
    created_at: float = time.time()
    ttl: int = 3600


class ResourceMonitor:
    def __init__(self, max_concurrent_tasks: int = 10):
        self.max_concurrent_tasks = max_concurrent_tasks
        self.memory_threshold = 0.85
        self.cpu_threshold = 0.90
        self._last_check = 0
        self._check_interval = 1  # seconds
        self._last_available_slots = max_concurrent_tasks

    async def get_available_slots(self) -> int:
        current_time = time.time()
        if current_time - self._last_check < self._check_interval:
            return self._last_available_slots

        mem_usage = psutil.virtual_memory().percent / 100
        cpu_usage = psutil.cpu_percent() / 100

        memory_factor = max(
            0, (self.memory_threshold - mem_usage) / self.memory_threshold
        )
        cpu_factor = max(0, (self.cpu_threshold - cpu_usage) / self.cpu_threshold)

        self._last_available_slots = math.floor(
            self.max_concurrent_tasks * min(memory_factor, cpu_factor)
        )
        self._last_check = current_time

        return self._last_available_slots

class TaskManager:
    def __init__(self, cleanup_interval: int = 300):
        self.tasks: Dict[str, TaskInfo] = {}
        self.high_priority = asyncio.PriorityQueue()
        self.low_priority = asyncio.PriorityQueue()
        self.cleanup_interval = cleanup_interval
        self.cleanup_task = None

    async def start(self):
        self.cleanup_task = asyncio.create_task(self._cleanup_loop())

    async def stop(self):
        if self.cleanup_task:
            self.cleanup_task.cancel()
            try:
                await self.cleanup_task
            except asyncio.CancelledError:
                pass

    async def add_task(self, task_id: str, priority: int, ttl: int) -> None:
        task_info = TaskInfo(id=task_id, status=TaskStatus.PENDING, ttl=ttl)
        self.tasks[task_id] = task_info
        queue = self.high_priority if priority > 5 else self.low_priority
        await queue.put((-priority, task_id))  # Negative for proper priority ordering

    async def get_next_task(self) -> Optional[str]:
        try:
            # Try high priority first
            _, task_id = await asyncio.wait_for(self.high_priority.get(), timeout=0.1)
            return task_id
        except asyncio.TimeoutError:
            try:
                # Then try low priority
                _, task_id = await asyncio.wait_for(
                    self.low_priority.get(), timeout=0.1
                )
                return task_id
            except asyncio.TimeoutError:
                return None

    def update_task(
        self, task_id: str, status: TaskStatus, result: Any = None, error: str = None
    ):
        if task_id in self.tasks:
            task_info = self.tasks[task_id]
            task_info.status = status
            task_info.result = result
            task_info.error = error

    def get_task(self, task_id: str) -> Optional[TaskInfo]:
        return self.tasks.get(task_id)

    async def _cleanup_loop(self):
        while True:
            try:
                await asyncio.sleep(self.cleanup_interval)
                current_time = time.time()
                expired_tasks = [
                    task_id
                    for task_id, task in self.tasks.items()
                    if current_time - task.created_at > task.ttl
                    and task.status in [TaskStatus.COMPLETED, TaskStatus.FAILED]
                ]
                for task_id in expired_tasks:
                    del self.tasks[task_id]
            except Exception as e:
                logger.error(f"Error in cleanup loop: {e}")

class CrawlerPool:
    def __init__(self, max_size: int = 10):
        self.max_size = max_size
        self.active_crawlers: Dict[AsyncWebCrawler, float] = {}
        self._lock = asyncio.Lock()

    async def acquire(self, **kwargs) -> AsyncWebCrawler:
        async with self._lock:
            # Clean up inactive crawlers
            current_time = time.time()
            inactive = [
                crawler
                for crawler, last_used in self.active_crawlers.items()
                if current_time - last_used > 600  # 10 minutes timeout
            ]
            for crawler in inactive:
                await crawler.__aexit__(None, None, None)
                del self.active_crawlers[crawler]

            # Create new crawler if needed
            if len(self.active_crawlers) < self.max_size:
                crawler = AsyncWebCrawler(**kwargs)
                await crawler.__aenter__()
                self.active_crawlers[crawler] = current_time
                return crawler

            # Reuse least recently used crawler
            crawler = min(self.active_crawlers.items(), key=lambda x: x[1])[0]
            self.active_crawlers[crawler] = current_time
            return crawler

    async def release(self, crawler: AsyncWebCrawler):
        async with self._lock:
            if crawler in self.active_crawlers:
                self.active_crawlers[crawler] = time.time()

    async def cleanup(self):
        async with self._lock:
            for crawler in list(self.active_crawlers.keys()):
                await crawler.__aexit__(None, None, None)
            self.active_crawlers.clear()

class CrawlerService:
    def __init__(self, max_concurrent_tasks: int = 10):
        self.resource_monitor = ResourceMonitor(max_concurrent_tasks)
        self.task_manager = TaskManager()
        self.crawler_pool = CrawlerPool(max_concurrent_tasks)
        self._processing_task = None

    async def start(self):
        await self.task_manager.start()
        self._processing_task = asyncio.create_task(self._process_queue())

    async def stop(self):
        if self._processing_task:
            self._processing_task.cancel()
            try:
                await self._processing_task
            except asyncio.CancelledError:
                pass
        await self.task_manager.stop()
        await self.crawler_pool.cleanup()

    def _create_extraction_strategy(self, config: ExtractionConfig):
        if not config:
            return None

        if config.type == CrawlerType.LLM:
            return LLMExtractionStrategy(**config.params)
        elif config.type == CrawlerType.COSINE:
            return CosineStrategy(**config.params)
        elif config.type == CrawlerType.JSON_CSS:
            return JsonCssExtractionStrategy(**config.params)
        return None

    async def submit_task(self, request: CrawlRequest) -> str:
        task_id = str(uuid.uuid4())
        await self.task_manager.add_task(task_id, request.priority, request.ttl or 3600)

        # Store request data with task
        self.task_manager.tasks[task_id].request = request

        return task_id

    async def _process_queue(self):
        while True:
            try:
                available_slots = await self.resource_monitor.get_available_slots()
                if False and available_slots <= 0:
                    await asyncio.sleep(1)
                    continue

                task_id = await self.task_manager.get_next_task()
                if not task_id:
                    await asyncio.sleep(1)
                    continue

                task_info = self.task_manager.get_task(task_id)
                if not task_info:
                    continue

                request = task_info.request
                self.task_manager.update_task(task_id, TaskStatus.PROCESSING)

                try:
                    crawler = await self.crawler_pool.acquire(**request.crawler_params)

                    extraction_strategy = self._create_extraction_strategy(
                        request.extraction_config
                    )

                    if isinstance(request.urls, list):
                        results = await crawler.arun_many(
                            urls=[str(url) for url in request.urls],
                            word_count_threshold=MIN_WORD_THRESHOLD,
                            extraction_strategy=extraction_strategy,
                            js_code=request.js_code,
                            wait_for=request.wait_for,
                            css_selector=request.css_selector,
                            screenshot=request.screenshot,
                            magic=request.magic,
                            session_id=request.session_id,
                            cache_mode=request.cache_mode,
                            **request.extra,
                        )
                    else:
                        results = await crawler.arun(
                            url=str(request.urls),
                            extraction_strategy=extraction_strategy,
                            js_code=request.js_code,
                            wait_for=request.wait_for,
                            css_selector=request.css_selector,
                            screenshot=request.screenshot,
                            magic=request.magic,
                            session_id=request.session_id,
                            cache_mode=request.cache_mode,
                            **request.extra,
                        )

                    await self.crawler_pool.release(crawler)
                    self.task_manager.update_task(
                        task_id, TaskStatus.COMPLETED, results
                    )

                except Exception as e:
                    logger.error(f"Error processing task {task_id}: {str(e)}")
                    self.task_manager.update_task(
                        task_id, TaskStatus.FAILED, error=str(e)
                    )

            except Exception as e:
                logger.error(f"Error in queue processing: {str(e)}")
                await asyncio.sleep(1)

app = FastAPI(title="Crawl4AI API")

# CORS configuration
origins = ["*"]  # Allow all origins
app.add_middleware(
    CORSMiddleware,
    allow_origins=origins,  # List of origins that are allowed to make requests
    allow_credentials=True,
    allow_methods=["*"],  # Allows all methods
    allow_headers=["*"],  # Allows all headers
)

# API token security
security = HTTPBearer()
CRAWL4AI_API_TOKEN = os.getenv("CRAWL4AI_API_TOKEN")


async def verify_token(credentials: HTTPAuthorizationCredentials = Security(security)):
    if not CRAWL4AI_API_TOKEN:
        return credentials  # No token verification if CRAWL4AI_API_TOKEN is not set
    if credentials.credentials != CRAWL4AI_API_TOKEN:
        raise HTTPException(status_code=401, detail="Invalid token")
    return credentials


def secure_endpoint():
    """Returns security dependency only if CRAWL4AI_API_TOKEN is set"""
    return Depends(verify_token) if CRAWL4AI_API_TOKEN else None


# Check if site directory exists
if os.path.exists(__location__ + "/site"):
    # Mount the site directory as a static directory
    app.mount("/mkdocs", StaticFiles(directory="site", html=True), name="mkdocs")

    site_templates = Jinja2Templates(directory=__location__ + "/site")

crawler_service = CrawlerService()


@app.on_event("startup")
async def startup_event():
    await crawler_service.start()


@app.on_event("shutdown")
async def shutdown_event():
    await crawler_service.stop()


@app.get("/")
def read_root():
    if os.path.exists(__location__ + "/site"):
        return RedirectResponse(url="/mkdocs")
    # Return a json response
    return {"message": "Crawl4AI API service is running"}

@app.post("/crawl", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else [])
async def crawl(request: CrawlRequest) -> Dict[str, str]:
    task_id = await crawler_service.submit_task(request)
    return {"task_id": task_id}


@app.get(
    "/task/{task_id}", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else []
)
async def get_task_status(task_id: str):
    task_info = crawler_service.task_manager.get_task(task_id)
    if not task_info:
        raise HTTPException(status_code=404, detail="Task not found")

    response = {
        "status": task_info.status,
        "created_at": task_info.created_at,
    }

    if task_info.status == TaskStatus.COMPLETED:
        # Convert CrawlResult to dict for JSON response
        if isinstance(task_info.result, list):
            response["results"] = [result.dict() for result in task_info.result]
        else:
            response["result"] = task_info.result.dict()
    elif task_info.status == TaskStatus.FAILED:
        response["error"] = task_info.error

    return response


@app.post("/crawl_sync", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else [])
async def crawl_sync(request: CrawlRequest) -> Dict[str, Any]:
    task_id = await crawler_service.submit_task(request)

    # Wait up to 60 seconds for task completion
    for _ in range(60):
        task_info = crawler_service.task_manager.get_task(task_id)
        if not task_info:
            raise HTTPException(status_code=404, detail="Task not found")

        if task_info.status == TaskStatus.COMPLETED:
            # Return same format as /task/{task_id} endpoint
            if isinstance(task_info.result, list):
                return {
                    "status": task_info.status,
                    "results": [result.dict() for result in task_info.result],
                }
            return {"status": task_info.status, "result": task_info.result.dict()}

        if task_info.status == TaskStatus.FAILED:
            raise HTTPException(status_code=500, detail=task_info.error)

        await asyncio.sleep(1)

    # If we get here, task didn't complete within timeout
    raise HTTPException(status_code=408, detail="Task timed out")


@app.post(
    "/crawl_direct", dependencies=[secure_endpoint()] if CRAWL4AI_API_TOKEN else []
)
async def crawl_direct(request: CrawlRequest) -> Dict[str, Any]:
    try:
        crawler = await crawler_service.crawler_pool.acquire(**request.crawler_params)
        extraction_strategy = crawler_service._create_extraction_strategy(
            request.extraction_config
        )

        try:
            if isinstance(request.urls, list):
                results = await crawler.arun_many(
                    urls=[str(url) for url in request.urls],
                    extraction_strategy=extraction_strategy,
                    js_code=request.js_code,
                    wait_for=request.wait_for,
                    css_selector=request.css_selector,
                    screenshot=request.screenshot,
                    magic=request.magic,
                    cache_mode=request.cache_mode,
                    session_id=request.session_id,
                    **request.extra,
                )
                return {"results": [result.dict() for result in results]}
            else:
                result = await crawler.arun(
                    url=str(request.urls),
                    extraction_strategy=extraction_strategy,
                    js_code=request.js_code,
                    wait_for=request.wait_for,
                    css_selector=request.css_selector,
                    screenshot=request.screenshot,
                    magic=request.magic,
                    cache_mode=request.cache_mode,
                    session_id=request.session_id,
                    **request.extra,
                )
                return {"result": result.dict()}
        finally:
            await crawler_service.crawler_pool.release(crawler)
    except Exception as e:
        logger.error(f"Error in direct crawl: {str(e)}")
        raise HTTPException(status_code=500, detail=str(e))


@app.get("/health")
async def health_check():
    available_slots = await crawler_service.resource_monitor.get_available_slots()
    memory = psutil.virtual_memory()
    return {
        "status": "healthy",
        "available_slots": available_slots,
        "memory_usage": memory.percent,
        "cpu_usage": psutil.cpu_percent(),
    }


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=11235)