Fixed capturing of console messages for the case where the URL is a local file. Updated Docker configuration (work in progress)

This commit is contained in:
UncleCode
2025-04-10 23:22:38 +08:00
parent 66ac07b4f3
commit 108b2a8bfb
9 changed files with 898 additions and 664 deletions

View File

@@ -38,6 +38,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libjpeg-dev \ libjpeg-dev \
redis-server \ redis-server \
supervisor \ supervisor \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -62,11 +63,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libcairo2 \ libcairo2 \
libasound2 \ libasound2 \
libatspi2.0-0 \ libatspi2.0-0 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \ RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
apt-get update && apt-get install -y --no-install-recommends \ apt-get update && apt-get install -y --no-install-recommends \
nvidia-cuda-toolkit \ nvidia-cuda-toolkit \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* ; \ && rm -rf /var/lib/apt/lists/* ; \
else \ else \
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \ echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
@@ -76,16 +79,24 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
echo "🦾 Installing ARM-specific optimizations"; \ echo "🦾 Installing ARM-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \ apt-get update && apt-get install -y --no-install-recommends \
libopenblas-dev \ libopenblas-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*; \ && rm -rf /var/lib/apt/lists/*; \
elif [ "$TARGETARCH" = "amd64" ]; then \ elif [ "$TARGETARCH" = "amd64" ]; then \
echo "🖥️ Installing AMD64-specific optimizations"; \ echo "🖥️ Installing AMD64-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \ apt-get update && apt-get install -y --no-install-recommends \
libomp-dev \ libomp-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*; \ && rm -rf /var/lib/apt/lists/*; \
else \ else \
echo "Skipping platform-specific optimizations (unsupported platform)"; \ echo "Skipping platform-specific optimizations (unsupported platform)"; \
fi fi
# Create a non-root user and group
RUN groupadd -r appuser && useradd --no-log-init -r -g appuser appuser
# Create and set permissions for appuser home directory
RUN mkdir -p /home/appuser && chown -R appuser:appuser /home/appuser
WORKDIR ${APP_HOME} WORKDIR ${APP_HOME}
RUN echo '#!/bin/bash\n\ RUN echo '#!/bin/bash\n\
@@ -103,6 +114,7 @@ fi' > /tmp/install.sh && chmod +x /tmp/install.sh
COPY . /tmp/project/ COPY . /tmp/project/
# Copy supervisor config first (might need root later, but okay for now)
COPY deploy/docker/supervisord.conf . COPY deploy/docker/supervisord.conf .
COPY deploy/docker/requirements.txt . COPY deploy/docker/requirements.txt .
@@ -139,8 +151,15 @@ RUN pip install --no-cache-dir --upgrade pip && \
RUN playwright install --with-deps chromium RUN playwright install --with-deps chromium
# Copy application code
COPY deploy/docker/* ${APP_HOME}/ COPY deploy/docker/* ${APP_HOME}/
# Change ownership of the application directory to the non-root user
RUN chown -R appuser:appuser ${APP_HOME}
# give permissions to redis persistence dirs if used
RUN mkdir -p /var/lib/redis /var/log/redis && chown -R appuser:appuser /var/lib/redis /var/log/redis
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD bash -c '\ CMD bash -c '\
MEM=$(free -m | awk "/^Mem:/{print \$2}"); \ MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
@@ -149,8 +168,10 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
exit 1; \ exit 1; \
fi && \ fi && \
redis-cli ping > /dev/null && \ redis-cli ping > /dev/null && \
curl -f http://localhost:8000/health || exit 1' curl -f http://localhost:11235/health || exit 1'
EXPOSE 6379 EXPOSE 6379
CMD ["supervisord", "-c", "supervisord.conf"] # Switch to the non-root user before starting the application
USER appuser
CMD ["supervisord", "-c", "supervisord.conf"]

View File

@@ -409,7 +409,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
user_agent = kwargs.get("user_agent", self.user_agent) user_agent = kwargs.get("user_agent", self.user_agent)
# Use browser_manager to get a fresh page & context assigned to this session_id # Use browser_manager to get a fresh page & context assigned to this session_id
page, context = await self.browser_manager.get_page(session_id, user_agent) page, context = await self.browser_manager.get_page(CrawlerRunConfig(
session_id=session_id,
user_agent=user_agent,
**kwargs,
))
return session_id return session_id
async def crawl( async def crawl(
@@ -447,12 +451,17 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
html = f.read() html = f.read()
if config.screenshot: if config.screenshot:
screenshot_data = await self._generate_screenshot_from_html(html) screenshot_data = await self._generate_screenshot_from_html(html)
if config.capture_console_messages:
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
captured_console = await self._capture_console_messages(page, url)
return AsyncCrawlResponse( return AsyncCrawlResponse(
html=html, html=html,
response_headers=response_headers, response_headers=response_headers,
status_code=status_code, status_code=status_code,
screenshot=screenshot_data, screenshot=screenshot_data,
get_delayed_content=None, get_delayed_content=None,
console_messages=captured_console,
) )
elif url.startswith("raw:") or url.startswith("raw://"): elif url.startswith("raw:") or url.startswith("raw://"):
@@ -582,7 +591,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"url": request.url, "url": request.url,
"method": request.method, "method": request.method,
"resource_type": request.resource_type, "resource_type": request.resource_type,
"failure_text": request.failure.error_text if request.failure else "Unknown failure", "failure_text": str(request.failure) if request.failure else "Unknown failure",
"timestamp": time.time() "timestamp": time.time()
}) })
except Exception as e: except Exception as e:
@@ -1274,6 +1283,42 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
) )
return None return None
async def _capture_console_messages(
self, page: Page, file_path: str
) -> List[Dict[str, Union[str, float]]]:
"""
Captures console messages from the page.
Args:
page (Page): The Playwright page object
Returns:
List[Dict[str, Union[str, float]]]: A list of captured console messages
"""
captured_console = []
def handle_console_message(msg):
try:
message_type = msg.type
message_text = msg.text
entry = {
"type": message_type,
"text": message_text,
"timestamp": time.time(),
}
captured_console.append(entry)
except Exception as e:
if self.logger:
self.logger.warning(
f"Error capturing console message: {e}", tag="CAPTURE"
)
page.on("console", handle_console_message)
await page.goto(file_path)
return captured_console
async def take_screenshot(self, page, **kwargs) -> str: async def take_screenshot(self, page, **kwargs) -> str:
""" """
Take a screenshot of the current page. Take a screenshot of the current page.

View File

@@ -658,7 +658,7 @@ class BrowserManager:
"name": "cookiesEnabled", "name": "cookiesEnabled",
"value": "true", "value": "true",
"url": crawlerRunConfig.url "url": crawlerRunConfig.url
if crawlerRunConfig if crawlerRunConfig and crawlerRunConfig.url
else "https://crawl4ai.com/", else "https://crawl4ai.com/",
} }
] ]

View File

@@ -1,4 +1,3 @@
crawl4ai
fastapi fastapi
uvicorn uvicorn
gunicorn>=23.0.0 gunicorn>=23.0.0

View File

@@ -1,12 +1,28 @@
[supervisord] [supervisord]
nodaemon=true nodaemon=true ; Run supervisord in the foreground
logfile=/dev/null ; Log supervisord output to stdout/stderr
logfile_maxbytes=0
[program:redis] [program:redis]
command=redis-server command=/usr/bin/redis-server --loglevel notice ; Path to redis-server on Alpine
user=appuser ; Run redis as our non-root user
autorestart=true autorestart=true
priority=10 priority=10
stdout_logfile=/dev/stdout ; Redirect redis stdout to container stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr ; Redirect redis stderr to container stderr
stderr_logfile_maxbytes=0
[program:gunicorn] [program:gunicorn]
command=gunicorn --bind 0.0.0.0:8000 --workers 4 --threads 2 --timeout 300 --graceful-timeout 60 --keep-alive 65 --log-level debug --worker-class uvicorn.workers.UvicornWorker --max-requests 1000 --max-requests-jitter 50 server:app command=/usr/local/bin/gunicorn --bind 0.0.0.0:11235 --workers 2 --threads 2 --timeout 120 --graceful-timeout 30 --keep-alive 60 --log-level info --worker-class uvicorn.workers.UvicornWorker server:app
directory=/app ; Working directory for the app
user=appuser ; Run gunicorn as our non-root user
autorestart=true autorestart=true
priority=20 priority=20
environment=PYTHONUNBUFFERED=1 ; Ensure Python output is sent straight to logs
stdout_logfile=/dev/stdout ; Redirect gunicorn stdout to container stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr ; Redirect gunicorn stderr to container stderr
stderr_logfile_maxbytes=0
# Optional: Add filebeat or other logging agents here if needed

View File

@@ -1,15 +1,31 @@
# Base configuration (not a service, just a reusable config block) # docker-compose.yml
# This file is in the root directory alongside Dockerfile
# Base configuration anchor for reusability
x-base-config: &base-config x-base-config: &base-config
ports: ports:
# Map host port 11235 to container port 11235 (where Gunicorn will listen)
- "11235:11235" - "11235:11235"
- "8000:8000" # - "8080:8080" # Uncomment if needed
- "9222:9222"
- "8080:8080" # Load API keys primarily from .llm.env file
# Create .llm.env in the root directory from deploy/docker/.llm.env.example
env_file:
- .llm.env
# Define environment variables, allowing overrides from host environment
# Syntax ${VAR:-} uses host env var 'VAR' if set, otherwise uses value from .llm.env
environment: environment:
- CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
- OPENAI_API_KEY=${OPENAI_API_KEY:-} - OPENAI_API_KEY=${OPENAI_API_KEY:-}
- CLAUDE_API_KEY=${CLAUDE_API_KEY:-} - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
- GROQ_API_KEY=${GROQ_API_KEY:-}
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
volumes: volumes:
# Mount /dev/shm for Chromium/Playwright performance
- /dev/shm:/dev/shm - /dev/shm:/dev/shm
deploy: deploy:
resources: resources:
@@ -19,47 +35,47 @@ x-base-config: &base-config
memory: 1G memory: 1G
restart: unless-stopped restart: unless-stopped
healthcheck: healthcheck:
# IMPORTANT: Ensure Gunicorn binds to 11235 in supervisord.conf
test: ["CMD", "curl", "-f", "http://localhost:11235/health"] test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 3 retries: 3
start_period: 40s start_period: 40s # Give the server time to start
# Run the container as the non-root user defined in the Dockerfile
user: "appuser"
services: services:
# Local build services for different platforms # --- Local Build Services ---
crawl4ai-amd64: crawl4ai-local-amd64:
build: build:
context: . context: . # Build context is the root directory
dockerfile: Dockerfile dockerfile: Dockerfile # Dockerfile is in the root directory
args: args:
PYTHON_VERSION: "3.10" INSTALL_TYPE: ${INSTALL_TYPE:-default}
INSTALL_TYPE: ${INSTALL_TYPE:-basic} ENABLE_GPU: ${ENABLE_GPU:-false}
ENABLE_GPU: false # PYTHON_VERSION arg is omitted as it's fixed by 'FROM python:3.10-slim' in Dockerfile
platforms: platform: linux/amd64
- linux/amd64
profiles: ["local-amd64"] profiles: ["local-amd64"]
<<: *base-config # extends yerine doğrudan yapılandırmayı dahil ettik <<: *base-config # Inherit base configuration
crawl4ai-arm64: crawl4ai-local-arm64:
build: build:
context: . context: . # Build context is the root directory
dockerfile: Dockerfile dockerfile: Dockerfile # Dockerfile is in the root directory
args: args:
PYTHON_VERSION: "3.10" INSTALL_TYPE: ${INSTALL_TYPE:-default}
INSTALL_TYPE: ${INSTALL_TYPE:-basic} ENABLE_GPU: ${ENABLE_GPU:-false}
ENABLE_GPU: false platform: linux/arm64
platforms:
- linux/arm64
profiles: ["local-arm64"] profiles: ["local-arm64"]
<<: *base-config <<: *base-config
# Hub services for different platforms and versions # --- Docker Hub Image Services ---
crawl4ai-hub-amd64: crawl4ai-hub-amd64:
image: unclecode/crawl4ai:${VERSION:-basic}-amd64 image: unclecode/crawl4ai:${VERSION:-latest}-amd64
profiles: ["hub-amd64"] profiles: ["hub-amd64"]
<<: *base-config <<: *base-config
crawl4ai-hub-arm64: crawl4ai-hub-arm64:
image: unclecode/crawl4ai:${VERSION:-basic}-arm64 image: unclecode/crawl4ai:${VERSION:-latest}-arm64
profiles: ["hub-arm64"] profiles: ["hub-arm64"]
<<: *base-config <<: *base-config

View File

@@ -357,8 +357,7 @@ async def demo_performance_analysis():
async with AsyncWebCrawler() as crawler: async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig( config = CrawlerRunConfig(
capture_network_requests=True, capture_network_requests=True,
wait_until="networkidle", page_timeout=60 * 2 * 1000 # 120 seconds
page_timeout=60000 # 60 seconds
) )
result = await crawler.arun( result = await crawler.arun(
@@ -406,6 +405,13 @@ async def demo_performance_analysis():
"url": url, "url": url,
"duration_ms": duration "duration_ms": duration
}) })
if isinstance(timing, dict) and "requestStart" in timing and "responseStart" in timing and "startTime" in timing:
# Convert to milliseconds
duration = (timing["responseStart"] - timing["requestStart"]) * 1000
resource_timings[resource_type].append({
"url": url,
"duration_ms": duration
})
# Calculate statistics for each resource type # Calculate statistics for each resource type
print("\nPerformance by resource type:") print("\nPerformance by resource type:")
@@ -455,14 +461,14 @@ async def main():
os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True) os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True)
# Run basic examples # Run basic examples
await demo_basic_network_capture() # await demo_basic_network_capture()
await demo_basic_console_capture() await demo_basic_console_capture()
await demo_combined_capture() # await demo_combined_capture()
# Run advanced examples # Run advanced examples
await analyze_spa_network_traffic() # await analyze_spa_network_traffic()
await demo_security_analysis() # await demo_security_analysis()
await demo_performance_analysis() # await demo_performance_analysis()
print("\n=== Examples Complete ===") print("\n=== Examples Complete ===")
print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}") print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}")

File diff suppressed because it is too large Load Diff

View File