Fixed capturing of console messages for the case where the URL is a local file. Updated Docker configuration (work in progress)

This commit is contained in:
UncleCode
2025-04-10 23:22:38 +08:00
parent 66ac07b4f3
commit 108b2a8bfb
9 changed files with 898 additions and 664 deletions

View File

@@ -38,6 +38,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libjpeg-dev \ libjpeg-dev \
redis-server \ redis-server \
supervisor \ supervisor \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
RUN apt-get update && apt-get install -y --no-install-recommends \ RUN apt-get update && apt-get install -y --no-install-recommends \
@@ -62,11 +63,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
libcairo2 \ libcairo2 \
libasound2 \ libasound2 \
libatspi2.0-0 \ libatspi2.0-0 \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* && rm -rf /var/lib/apt/lists/*
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \ RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
apt-get update && apt-get install -y --no-install-recommends \ apt-get update && apt-get install -y --no-install-recommends \
nvidia-cuda-toolkit \ nvidia-cuda-toolkit \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/* ; \ && rm -rf /var/lib/apt/lists/* ; \
else \ else \
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \ echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
@@ -76,16 +79,24 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
echo "🦾 Installing ARM-specific optimizations"; \ echo "🦾 Installing ARM-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \ apt-get update && apt-get install -y --no-install-recommends \
libopenblas-dev \ libopenblas-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*; \ && rm -rf /var/lib/apt/lists/*; \
elif [ "$TARGETARCH" = "amd64" ]; then \ elif [ "$TARGETARCH" = "amd64" ]; then \
echo "🖥️ Installing AMD64-specific optimizations"; \ echo "🖥️ Installing AMD64-specific optimizations"; \
apt-get update && apt-get install -y --no-install-recommends \ apt-get update && apt-get install -y --no-install-recommends \
libomp-dev \ libomp-dev \
&& apt-get clean \
&& rm -rf /var/lib/apt/lists/*; \ && rm -rf /var/lib/apt/lists/*; \
else \ else \
echo "Skipping platform-specific optimizations (unsupported platform)"; \ echo "Skipping platform-specific optimizations (unsupported platform)"; \
fi fi
# Create a non-root user and group
RUN groupadd -r appuser && useradd --no-log-init -r -g appuser appuser
# Create and set permissions for appuser home directory
RUN mkdir -p /home/appuser && chown -R appuser:appuser /home/appuser
WORKDIR ${APP_HOME} WORKDIR ${APP_HOME}
RUN echo '#!/bin/bash\n\ RUN echo '#!/bin/bash\n\
@@ -103,6 +114,7 @@ fi' > /tmp/install.sh && chmod +x /tmp/install.sh
COPY . /tmp/project/ COPY . /tmp/project/
# Copy supervisor config first (might need root later, but okay for now)
COPY deploy/docker/supervisord.conf . COPY deploy/docker/supervisord.conf .
COPY deploy/docker/requirements.txt . COPY deploy/docker/requirements.txt .
@@ -139,8 +151,15 @@ RUN pip install --no-cache-dir --upgrade pip && \
RUN playwright install --with-deps chromium RUN playwright install --with-deps chromium
# Copy application code
COPY deploy/docker/* ${APP_HOME}/ COPY deploy/docker/* ${APP_HOME}/
# Change ownership of the application directory to the non-root user
RUN chown -R appuser:appuser ${APP_HOME}
# give permissions to redis persistence dirs if used
RUN mkdir -p /var/lib/redis /var/log/redis && chown -R appuser:appuser /var/lib/redis /var/log/redis
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
CMD bash -c '\ CMD bash -c '\
MEM=$(free -m | awk "/^Mem:/{print \$2}"); \ MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
@@ -149,8 +168,10 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
exit 1; \ exit 1; \
fi && \ fi && \
redis-cli ping > /dev/null && \ redis-cli ping > /dev/null && \
curl -f http://localhost:8000/health || exit 1' curl -f http://localhost:11235/health || exit 1'
EXPOSE 6379 EXPOSE 6379
CMD ["supervisord", "-c", "supervisord.conf"] # Switch to the non-root user before starting the application
USER appuser
CMD ["supervisord", "-c", "supervisord.conf"]

View File

@@ -409,7 +409,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
user_agent = kwargs.get("user_agent", self.user_agent) user_agent = kwargs.get("user_agent", self.user_agent)
# Use browser_manager to get a fresh page & context assigned to this session_id # Use browser_manager to get a fresh page & context assigned to this session_id
page, context = await self.browser_manager.get_page(session_id, user_agent) page, context = await self.browser_manager.get_page(CrawlerRunConfig(
session_id=session_id,
user_agent=user_agent,
**kwargs,
))
return session_id return session_id
async def crawl( async def crawl(
@@ -447,12 +451,17 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
html = f.read() html = f.read()
if config.screenshot: if config.screenshot:
screenshot_data = await self._generate_screenshot_from_html(html) screenshot_data = await self._generate_screenshot_from_html(html)
if config.capture_console_messages:
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
captured_console = await self._capture_console_messages(page, url)
return AsyncCrawlResponse( return AsyncCrawlResponse(
html=html, html=html,
response_headers=response_headers, response_headers=response_headers,
status_code=status_code, status_code=status_code,
screenshot=screenshot_data, screenshot=screenshot_data,
get_delayed_content=None, get_delayed_content=None,
console_messages=captured_console,
) )
elif url.startswith("raw:") or url.startswith("raw://"): elif url.startswith("raw:") or url.startswith("raw://"):
@@ -582,7 +591,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
"url": request.url, "url": request.url,
"method": request.method, "method": request.method,
"resource_type": request.resource_type, "resource_type": request.resource_type,
"failure_text": request.failure.error_text if request.failure else "Unknown failure", "failure_text": str(request.failure) if request.failure else "Unknown failure",
"timestamp": time.time() "timestamp": time.time()
}) })
except Exception as e: except Exception as e:
@@ -1274,6 +1283,42 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
) )
return None return None
async def _capture_console_messages(
self, page: Page, file_path: str
) -> List[Dict[str, Union[str, float]]]:
"""
Captures console messages from the page.
Args:
page (Page): The Playwright page object
Returns:
List[Dict[str, Union[str, float]]]: A list of captured console messages
"""
captured_console = []
def handle_console_message(msg):
try:
message_type = msg.type
message_text = msg.text
entry = {
"type": message_type,
"text": message_text,
"timestamp": time.time(),
}
captured_console.append(entry)
except Exception as e:
if self.logger:
self.logger.warning(
f"Error capturing console message: {e}", tag="CAPTURE"
)
page.on("console", handle_console_message)
await page.goto(file_path)
return captured_console
async def take_screenshot(self, page, **kwargs) -> str: async def take_screenshot(self, page, **kwargs) -> str:
""" """
Take a screenshot of the current page. Take a screenshot of the current page.

View File

@@ -658,7 +658,7 @@ class BrowserManager:
"name": "cookiesEnabled", "name": "cookiesEnabled",
"value": "true", "value": "true",
"url": crawlerRunConfig.url "url": crawlerRunConfig.url
if crawlerRunConfig if crawlerRunConfig and crawlerRunConfig.url
else "https://crawl4ai.com/", else "https://crawl4ai.com/",
} }
] ]

View File

@@ -1,4 +1,3 @@
crawl4ai
fastapi fastapi
uvicorn uvicorn
gunicorn>=23.0.0 gunicorn>=23.0.0

View File

@@ -1,12 +1,28 @@
[supervisord] [supervisord]
nodaemon=true nodaemon=true ; Run supervisord in the foreground
logfile=/dev/null ; Log supervisord output to stdout/stderr
logfile_maxbytes=0
[program:redis] [program:redis]
command=redis-server command=/usr/bin/redis-server --loglevel notice ; Path to redis-server on Alpine
user=appuser ; Run redis as our non-root user
autorestart=true autorestart=true
priority=10 priority=10
stdout_logfile=/dev/stdout ; Redirect redis stdout to container stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr ; Redirect redis stderr to container stderr
stderr_logfile_maxbytes=0
[program:gunicorn] [program:gunicorn]
command=gunicorn --bind 0.0.0.0:8000 --workers 4 --threads 2 --timeout 300 --graceful-timeout 60 --keep-alive 65 --log-level debug --worker-class uvicorn.workers.UvicornWorker --max-requests 1000 --max-requests-jitter 50 server:app command=/usr/local/bin/gunicorn --bind 0.0.0.0:11235 --workers 2 --threads 2 --timeout 120 --graceful-timeout 30 --keep-alive 60 --log-level info --worker-class uvicorn.workers.UvicornWorker server:app
directory=/app ; Working directory for the app
user=appuser ; Run gunicorn as our non-root user
autorestart=true autorestart=true
priority=20 priority=20
environment=PYTHONUNBUFFERED=1 ; Ensure Python output is sent straight to logs
stdout_logfile=/dev/stdout ; Redirect gunicorn stdout to container stdout
stdout_logfile_maxbytes=0
stderr_logfile=/dev/stderr ; Redirect gunicorn stderr to container stderr
stderr_logfile_maxbytes=0
# Optional: Add filebeat or other logging agents here if needed

View File

@@ -1,15 +1,31 @@
# Base configuration (not a service, just a reusable config block) # docker-compose.yml
# This file is in the root directory alongside Dockerfile
# Base configuration anchor for reusability
x-base-config: &base-config x-base-config: &base-config
ports: ports:
# Map host port 11235 to container port 11235 (where Gunicorn will listen)
- "11235:11235" - "11235:11235"
- "8000:8000" # - "8080:8080" # Uncomment if needed
- "9222:9222"
- "8080:8080" # Load API keys primarily from .llm.env file
# Create .llm.env in the root directory from deploy/docker/.llm.env.example
env_file:
- .llm.env
# Define environment variables, allowing overrides from host environment
# Syntax ${VAR:-} uses host env var 'VAR' if set, otherwise uses value from .llm.env
environment: environment:
- CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
- OPENAI_API_KEY=${OPENAI_API_KEY:-} - OPENAI_API_KEY=${OPENAI_API_KEY:-}
- CLAUDE_API_KEY=${CLAUDE_API_KEY:-} - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
- GROQ_API_KEY=${GROQ_API_KEY:-}
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
volumes: volumes:
# Mount /dev/shm for Chromium/Playwright performance
- /dev/shm:/dev/shm - /dev/shm:/dev/shm
deploy: deploy:
resources: resources:
@@ -19,47 +35,47 @@ x-base-config: &base-config
memory: 1G memory: 1G
restart: unless-stopped restart: unless-stopped
healthcheck: healthcheck:
# IMPORTANT: Ensure Gunicorn binds to 11235 in supervisord.conf
test: ["CMD", "curl", "-f", "http://localhost:11235/health"] test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
interval: 30s interval: 30s
timeout: 10s timeout: 10s
retries: 3 retries: 3
start_period: 40s start_period: 40s # Give the server time to start
# Run the container as the non-root user defined in the Dockerfile
user: "appuser"
services: services:
# Local build services for different platforms # --- Local Build Services ---
crawl4ai-amd64: crawl4ai-local-amd64:
build: build:
context: . context: . # Build context is the root directory
dockerfile: Dockerfile dockerfile: Dockerfile # Dockerfile is in the root directory
args: args:
PYTHON_VERSION: "3.10" INSTALL_TYPE: ${INSTALL_TYPE:-default}
INSTALL_TYPE: ${INSTALL_TYPE:-basic} ENABLE_GPU: ${ENABLE_GPU:-false}
ENABLE_GPU: false # PYTHON_VERSION arg is omitted as it's fixed by 'FROM python:3.10-slim' in Dockerfile
platforms: platform: linux/amd64
- linux/amd64
profiles: ["local-amd64"] profiles: ["local-amd64"]
<<: *base-config # extends yerine doğrudan yapılandırmayı dahil ettik <<: *base-config # Inherit base configuration
crawl4ai-arm64: crawl4ai-local-arm64:
build: build:
context: . context: . # Build context is the root directory
dockerfile: Dockerfile dockerfile: Dockerfile # Dockerfile is in the root directory
args: args:
PYTHON_VERSION: "3.10" INSTALL_TYPE: ${INSTALL_TYPE:-default}
INSTALL_TYPE: ${INSTALL_TYPE:-basic} ENABLE_GPU: ${ENABLE_GPU:-false}
ENABLE_GPU: false platform: linux/arm64
platforms:
- linux/arm64
profiles: ["local-arm64"] profiles: ["local-arm64"]
<<: *base-config <<: *base-config
# Hub services for different platforms and versions # --- Docker Hub Image Services ---
crawl4ai-hub-amd64: crawl4ai-hub-amd64:
image: unclecode/crawl4ai:${VERSION:-basic}-amd64 image: unclecode/crawl4ai:${VERSION:-latest}-amd64
profiles: ["hub-amd64"] profiles: ["hub-amd64"]
<<: *base-config <<: *base-config
crawl4ai-hub-arm64: crawl4ai-hub-arm64:
image: unclecode/crawl4ai:${VERSION:-basic}-arm64 image: unclecode/crawl4ai:${VERSION:-latest}-arm64
profiles: ["hub-arm64"] profiles: ["hub-arm64"]
<<: *base-config <<: *base-config

View File

@@ -357,8 +357,7 @@ async def demo_performance_analysis():
async with AsyncWebCrawler() as crawler: async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig( config = CrawlerRunConfig(
capture_network_requests=True, capture_network_requests=True,
wait_until="networkidle", page_timeout=60 * 2 * 1000 # 120 seconds
page_timeout=60000 # 60 seconds
) )
result = await crawler.arun( result = await crawler.arun(
@@ -406,6 +405,13 @@ async def demo_performance_analysis():
"url": url, "url": url,
"duration_ms": duration "duration_ms": duration
}) })
if isinstance(timing, dict) and "requestStart" in timing and "responseStart" in timing and "startTime" in timing:
# Convert to milliseconds
duration = (timing["responseStart"] - timing["requestStart"]) * 1000
resource_timings[resource_type].append({
"url": url,
"duration_ms": duration
})
# Calculate statistics for each resource type # Calculate statistics for each resource type
print("\nPerformance by resource type:") print("\nPerformance by resource type:")
@@ -455,14 +461,14 @@ async def main():
os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True) os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True)
# Run basic examples # Run basic examples
await demo_basic_network_capture() # await demo_basic_network_capture()
await demo_basic_console_capture() await demo_basic_console_capture()
await demo_combined_capture() # await demo_combined_capture()
# Run advanced examples # Run advanced examples
await analyze_spa_network_traffic() # await analyze_spa_network_traffic()
await demo_security_analysis() # await demo_security_analysis()
await demo_performance_analysis() # await demo_performance_analysis()
print("\n=== Examples Complete ===") print("\n=== Examples Complete ===")
print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}") print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}")

File diff suppressed because it is too large Load Diff

View File