Fixed capturing of console messages when the URL is a local file. Updated Docker configuration (work in progress)
This commit is contained in:
25
Dockerfile
25
Dockerfile
@@ -38,6 +38,7 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||||||
libjpeg-dev \
|
libjpeg-dev \
|
||||||
redis-server \
|
redis-server \
|
||||||
supervisor \
|
supervisor \
|
||||||
|
&& apt-get clean \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
@@ -62,11 +63,13 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||||||
libcairo2 \
|
libcairo2 \
|
||||||
libasound2 \
|
libasound2 \
|
||||||
libatspi2.0-0 \
|
libatspi2.0-0 \
|
||||||
|
&& apt-get clean \
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
|
RUN if [ "$ENABLE_GPU" = "true" ] && [ "$TARGETARCH" = "amd64" ] ; then \
|
||||||
apt-get update && apt-get install -y --no-install-recommends \
|
apt-get update && apt-get install -y --no-install-recommends \
|
||||||
nvidia-cuda-toolkit \
|
nvidia-cuda-toolkit \
|
||||||
|
&& apt-get clean \
|
||||||
&& rm -rf /var/lib/apt/lists/* ; \
|
&& rm -rf /var/lib/apt/lists/* ; \
|
||||||
else \
|
else \
|
||||||
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
|
echo "Skipping NVIDIA CUDA Toolkit installation (unsupported platform or GPU disabled)"; \
|
||||||
@@ -76,16 +79,24 @@ RUN if [ "$TARGETARCH" = "arm64" ]; then \
|
|||||||
echo "🦾 Installing ARM-specific optimizations"; \
|
echo "🦾 Installing ARM-specific optimizations"; \
|
||||||
apt-get update && apt-get install -y --no-install-recommends \
|
apt-get update && apt-get install -y --no-install-recommends \
|
||||||
libopenblas-dev \
|
libopenblas-dev \
|
||||||
|
&& apt-get clean \
|
||||||
&& rm -rf /var/lib/apt/lists/*; \
|
&& rm -rf /var/lib/apt/lists/*; \
|
||||||
elif [ "$TARGETARCH" = "amd64" ]; then \
|
elif [ "$TARGETARCH" = "amd64" ]; then \
|
||||||
echo "🖥️ Installing AMD64-specific optimizations"; \
|
echo "🖥️ Installing AMD64-specific optimizations"; \
|
||||||
apt-get update && apt-get install -y --no-install-recommends \
|
apt-get update && apt-get install -y --no-install-recommends \
|
||||||
libomp-dev \
|
libomp-dev \
|
||||||
|
&& apt-get clean \
|
||||||
&& rm -rf /var/lib/apt/lists/*; \
|
&& rm -rf /var/lib/apt/lists/*; \
|
||||||
else \
|
else \
|
||||||
echo "Skipping platform-specific optimizations (unsupported platform)"; \
|
echo "Skipping platform-specific optimizations (unsupported platform)"; \
|
||||||
fi
|
fi
|
||||||
|
|
||||||
|
# Create a non-root user and group
|
||||||
|
RUN groupadd -r appuser && useradd --no-log-init -r -g appuser appuser
|
||||||
|
|
||||||
|
# Create and set permissions for appuser home directory
|
||||||
|
RUN mkdir -p /home/appuser && chown -R appuser:appuser /home/appuser
|
||||||
|
|
||||||
WORKDIR ${APP_HOME}
|
WORKDIR ${APP_HOME}
|
||||||
|
|
||||||
RUN echo '#!/bin/bash\n\
|
RUN echo '#!/bin/bash\n\
|
||||||
@@ -103,6 +114,7 @@ fi' > /tmp/install.sh && chmod +x /tmp/install.sh
|
|||||||
|
|
||||||
COPY . /tmp/project/
|
COPY . /tmp/project/
|
||||||
|
|
||||||
|
# Copy supervisor config first (might need root later, but okay for now)
|
||||||
COPY deploy/docker/supervisord.conf .
|
COPY deploy/docker/supervisord.conf .
|
||||||
|
|
||||||
COPY deploy/docker/requirements.txt .
|
COPY deploy/docker/requirements.txt .
|
||||||
@@ -139,8 +151,15 @@ RUN pip install --no-cache-dir --upgrade pip && \
|
|||||||
|
|
||||||
RUN playwright install --with-deps chromium
|
RUN playwright install --with-deps chromium
|
||||||
|
|
||||||
|
# Copy application code
|
||||||
COPY deploy/docker/* ${APP_HOME}/
|
COPY deploy/docker/* ${APP_HOME}/
|
||||||
|
|
||||||
|
# Change ownership of the application directory to the non-root user
|
||||||
|
RUN chown -R appuser:appuser ${APP_HOME}
|
||||||
|
|
||||||
|
# give permissions to redis persistence dirs if used
|
||||||
|
RUN mkdir -p /var/lib/redis /var/log/redis && chown -R appuser:appuser /var/lib/redis /var/log/redis
|
||||||
|
|
||||||
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
||||||
CMD bash -c '\
|
CMD bash -c '\
|
||||||
MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
|
MEM=$(free -m | awk "/^Mem:/{print \$2}"); \
|
||||||
@@ -149,8 +168,10 @@ HEALTHCHECK --interval=30s --timeout=10s --start-period=5s --retries=3 \
|
|||||||
exit 1; \
|
exit 1; \
|
||||||
fi && \
|
fi && \
|
||||||
redis-cli ping > /dev/null && \
|
redis-cli ping > /dev/null && \
|
||||||
curl -f http://localhost:8000/health || exit 1'
|
curl -f http://localhost:11235/health || exit 1'
|
||||||
|
|
||||||
EXPOSE 6379
|
EXPOSE 6379
|
||||||
CMD ["supervisord", "-c", "supervisord.conf"]
|
# Switch to the non-root user before starting the application
|
||||||
|
USER appuser
|
||||||
|
|
||||||
|
CMD ["supervisord", "-c", "supervisord.conf"]
|
||||||
@@ -409,7 +409,11 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
|
|
||||||
user_agent = kwargs.get("user_agent", self.user_agent)
|
user_agent = kwargs.get("user_agent", self.user_agent)
|
||||||
# Use browser_manager to get a fresh page & context assigned to this session_id
|
# Use browser_manager to get a fresh page & context assigned to this session_id
|
||||||
page, context = await self.browser_manager.get_page(session_id, user_agent)
|
page, context = await self.browser_manager.get_page(CrawlerRunConfig(
|
||||||
|
session_id=session_id,
|
||||||
|
user_agent=user_agent,
|
||||||
|
**kwargs,
|
||||||
|
))
|
||||||
return session_id
|
return session_id
|
||||||
|
|
||||||
async def crawl(
|
async def crawl(
|
||||||
@@ -447,12 +451,17 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
html = f.read()
|
html = f.read()
|
||||||
if config.screenshot:
|
if config.screenshot:
|
||||||
screenshot_data = await self._generate_screenshot_from_html(html)
|
screenshot_data = await self._generate_screenshot_from_html(html)
|
||||||
|
if config.capture_console_messages:
|
||||||
|
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
|
||||||
|
captured_console = await self._capture_console_messages(page, url)
|
||||||
|
|
||||||
return AsyncCrawlResponse(
|
return AsyncCrawlResponse(
|
||||||
html=html,
|
html=html,
|
||||||
response_headers=response_headers,
|
response_headers=response_headers,
|
||||||
status_code=status_code,
|
status_code=status_code,
|
||||||
screenshot=screenshot_data,
|
screenshot=screenshot_data,
|
||||||
get_delayed_content=None,
|
get_delayed_content=None,
|
||||||
|
console_messages=captured_console,
|
||||||
)
|
)
|
||||||
|
|
||||||
elif url.startswith("raw:") or url.startswith("raw://"):
|
elif url.startswith("raw:") or url.startswith("raw://"):
|
||||||
@@ -582,7 +591,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
"url": request.url,
|
"url": request.url,
|
||||||
"method": request.method,
|
"method": request.method,
|
||||||
"resource_type": request.resource_type,
|
"resource_type": request.resource_type,
|
||||||
"failure_text": request.failure.error_text if request.failure else "Unknown failure",
|
"failure_text": str(request.failure) if request.failure else "Unknown failure",
|
||||||
"timestamp": time.time()
|
"timestamp": time.time()
|
||||||
})
|
})
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
@@ -1274,6 +1283,42 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
)
|
)
|
||||||
return None
|
return None
|
||||||
|
|
||||||
|
async def _capture_console_messages(
|
||||||
|
self, page: Page, file_path: str
|
||||||
|
) -> List[Dict[str, Union[str, float]]]:
|
||||||
|
"""
|
||||||
|
Captures console messages from the page.
|
||||||
|
Args:
|
||||||
|
|
||||||
|
page (Page): The Playwright page object
|
||||||
|
Returns:
|
||||||
|
List[Dict[str, Union[str, float]]]: A list of captured console messages
|
||||||
|
"""
|
||||||
|
captured_console = []
|
||||||
|
|
||||||
|
def handle_console_message(msg):
|
||||||
|
try:
|
||||||
|
message_type = msg.type
|
||||||
|
message_text = msg.text
|
||||||
|
|
||||||
|
entry = {
|
||||||
|
"type": message_type,
|
||||||
|
"text": message_text,
|
||||||
|
"timestamp": time.time(),
|
||||||
|
}
|
||||||
|
captured_console.append(entry)
|
||||||
|
except Exception as e:
|
||||||
|
if self.logger:
|
||||||
|
self.logger.warning(
|
||||||
|
f"Error capturing console message: {e}", tag="CAPTURE"
|
||||||
|
)
|
||||||
|
|
||||||
|
page.on("console", handle_console_message)
|
||||||
|
|
||||||
|
await page.goto(file_path)
|
||||||
|
|
||||||
|
return captured_console
|
||||||
|
|
||||||
async def take_screenshot(self, page, **kwargs) -> str:
|
async def take_screenshot(self, page, **kwargs) -> str:
|
||||||
"""
|
"""
|
||||||
Take a screenshot of the current page.
|
Take a screenshot of the current page.
|
||||||
|
|||||||
@@ -658,7 +658,7 @@ class BrowserManager:
|
|||||||
"name": "cookiesEnabled",
|
"name": "cookiesEnabled",
|
||||||
"value": "true",
|
"value": "true",
|
||||||
"url": crawlerRunConfig.url
|
"url": crawlerRunConfig.url
|
||||||
if crawlerRunConfig
|
if crawlerRunConfig and crawlerRunConfig.url
|
||||||
else "https://crawl4ai.com/",
|
else "https://crawl4ai.com/",
|
||||||
}
|
}
|
||||||
]
|
]
|
||||||
|
|||||||
@@ -1,4 +1,3 @@
|
|||||||
crawl4ai
|
|
||||||
fastapi
|
fastapi
|
||||||
uvicorn
|
uvicorn
|
||||||
gunicorn>=23.0.0
|
gunicorn>=23.0.0
|
||||||
|
|||||||
@@ -1,12 +1,28 @@
|
|||||||
[supervisord]
|
[supervisord]
|
||||||
nodaemon=true
|
nodaemon=true ; Run supervisord in the foreground
|
||||||
|
logfile=/dev/null ; Do not write a supervisord log file (with nodaemon=true supervisord also logs to stdout)
|
||||||
|
logfile_maxbytes=0
|
||||||
|
|
||||||
[program:redis]
|
[program:redis]
|
||||||
command=redis-server
|
command=/usr/bin/redis-server --loglevel notice ; Absolute path to redis-server (image is Debian-based and installs it via apt-get, not Alpine)
|
||||||
|
user=appuser ; Run redis as our non-root user
|
||||||
autorestart=true
|
autorestart=true
|
||||||
priority=10
|
priority=10
|
||||||
|
stdout_logfile=/dev/stdout ; Redirect redis stdout to container stdout
|
||||||
|
stdout_logfile_maxbytes=0
|
||||||
|
stderr_logfile=/dev/stderr ; Redirect redis stderr to container stderr
|
||||||
|
stderr_logfile_maxbytes=0
|
||||||
|
|
||||||
[program:gunicorn]
|
[program:gunicorn]
|
||||||
command=gunicorn --bind 0.0.0.0:8000 --workers 4 --threads 2 --timeout 300 --graceful-timeout 60 --keep-alive 65 --log-level debug --worker-class uvicorn.workers.UvicornWorker --max-requests 1000 --max-requests-jitter 50 server:app
|
command=/usr/local/bin/gunicorn --bind 0.0.0.0:11235 --workers 2 --threads 2 --timeout 120 --graceful-timeout 30 --keep-alive 60 --log-level info --worker-class uvicorn.workers.UvicornWorker server:app
|
||||||
|
directory=/app ; Working directory for the app
|
||||||
|
user=appuser ; Run gunicorn as our non-root user
|
||||||
autorestart=true
|
autorestart=true
|
||||||
priority=20
|
priority=20
|
||||||
|
environment=PYTHONUNBUFFERED=1 ; Ensure Python output is sent straight to logs
|
||||||
|
stdout_logfile=/dev/stdout ; Redirect gunicorn stdout to container stdout
|
||||||
|
stdout_logfile_maxbytes=0
|
||||||
|
stderr_logfile=/dev/stderr ; Redirect gunicorn stderr to container stderr
|
||||||
|
stderr_logfile_maxbytes=0
|
||||||
|
|
||||||
|
# Optional: Add filebeat or other logging agents here if needed
|
||||||
@@ -1,15 +1,31 @@
|
|||||||
# Base configuration (not a service, just a reusable config block)
|
# docker-compose.yml
|
||||||
|
# This file is in the root directory alongside Dockerfile
|
||||||
|
|
||||||
|
# Base configuration anchor for reusability
|
||||||
x-base-config: &base-config
|
x-base-config: &base-config
|
||||||
ports:
|
ports:
|
||||||
|
# Map host port 11235 to container port 11235 (where Gunicorn will listen)
|
||||||
- "11235:11235"
|
- "11235:11235"
|
||||||
- "8000:8000"
|
# - "8080:8080" # Uncomment if needed
|
||||||
- "9222:9222"
|
|
||||||
- "8080:8080"
|
# Load API keys primarily from .llm.env file
|
||||||
|
# Create .llm.env in the root directory from deploy/docker/.llm.env.example
|
||||||
|
env_file:
|
||||||
|
- .llm.env
|
||||||
|
|
||||||
|
# Define environment variables, allowing overrides from host environment
|
||||||
|
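# Syntax ${VAR:-} uses host env var 'VAR' if set; if unset it expands to an empty string, and because 'environment' takes precedence over 'env_file' that empty value overrides the one from .llm.env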
|
||||||
environment:
|
environment:
|
||||||
- CRAWL4AI_API_TOKEN=${CRAWL4AI_API_TOKEN:-}
|
|
||||||
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
|
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
|
||||||
- CLAUDE_API_KEY=${CLAUDE_API_KEY:-}
|
- DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
|
||||||
|
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
|
||||||
|
- GROQ_API_KEY=${GROQ_API_KEY:-}
|
||||||
|
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
|
||||||
|
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
|
||||||
|
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
|
||||||
|
|
||||||
volumes:
|
volumes:
|
||||||
|
# Mount /dev/shm for Chromium/Playwright performance
|
||||||
- /dev/shm:/dev/shm
|
- /dev/shm:/dev/shm
|
||||||
deploy:
|
deploy:
|
||||||
resources:
|
resources:
|
||||||
@@ -19,47 +35,47 @@ x-base-config: &base-config
|
|||||||
memory: 1G
|
memory: 1G
|
||||||
restart: unless-stopped
|
restart: unless-stopped
|
||||||
healthcheck:
|
healthcheck:
|
||||||
|
# IMPORTANT: Ensure Gunicorn binds to 11235 in supervisord.conf
|
||||||
test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
|
test: ["CMD", "curl", "-f", "http://localhost:11235/health"]
|
||||||
interval: 30s
|
interval: 30s
|
||||||
timeout: 10s
|
timeout: 10s
|
||||||
retries: 3
|
retries: 3
|
||||||
start_period: 40s
|
start_period: 40s # Give the server time to start
|
||||||
|
# Run the container as the non-root user defined in the Dockerfile
|
||||||
|
user: "appuser"
|
||||||
|
|
||||||
services:
|
services:
|
||||||
# Local build services for different platforms
|
# --- Local Build Services ---
|
||||||
crawl4ai-amd64:
|
crawl4ai-local-amd64:
|
||||||
build:
|
build:
|
||||||
context: .
|
context: . # Build context is the root directory
|
||||||
dockerfile: Dockerfile
|
dockerfile: Dockerfile # Dockerfile is in the root directory
|
||||||
args:
|
args:
|
||||||
PYTHON_VERSION: "3.10"
|
INSTALL_TYPE: ${INSTALL_TYPE:-default}
|
||||||
INSTALL_TYPE: ${INSTALL_TYPE:-basic}
|
ENABLE_GPU: ${ENABLE_GPU:-false}
|
||||||
ENABLE_GPU: false
|
# PYTHON_VERSION arg is omitted as it's fixed by 'FROM python:3.10-slim' in Dockerfile
|
||||||
platforms:
|
platform: linux/amd64
|
||||||
- linux/amd64
|
|
||||||
profiles: ["local-amd64"]
|
profiles: ["local-amd64"]
|
||||||
<<: *base-config # extends yerine doğrudan yapılandırmayı dahil ettik
|
<<: *base-config # Inherit base configuration
|
||||||
|
|
||||||
crawl4ai-arm64:
|
crawl4ai-local-arm64:
|
||||||
build:
|
build:
|
||||||
context: .
|
context: . # Build context is the root directory
|
||||||
dockerfile: Dockerfile
|
dockerfile: Dockerfile # Dockerfile is in the root directory
|
||||||
args:
|
args:
|
||||||
PYTHON_VERSION: "3.10"
|
INSTALL_TYPE: ${INSTALL_TYPE:-default}
|
||||||
INSTALL_TYPE: ${INSTALL_TYPE:-basic}
|
ENABLE_GPU: ${ENABLE_GPU:-false}
|
||||||
ENABLE_GPU: false
|
platform: linux/arm64
|
||||||
platforms:
|
|
||||||
- linux/arm64
|
|
||||||
profiles: ["local-arm64"]
|
profiles: ["local-arm64"]
|
||||||
<<: *base-config
|
<<: *base-config
|
||||||
|
|
||||||
# Hub services for different platforms and versions
|
# --- Docker Hub Image Services ---
|
||||||
crawl4ai-hub-amd64:
|
crawl4ai-hub-amd64:
|
||||||
image: unclecode/crawl4ai:${VERSION:-basic}-amd64
|
image: unclecode/crawl4ai:${VERSION:-latest}-amd64
|
||||||
profiles: ["hub-amd64"]
|
profiles: ["hub-amd64"]
|
||||||
<<: *base-config
|
<<: *base-config
|
||||||
|
|
||||||
crawl4ai-hub-arm64:
|
crawl4ai-hub-arm64:
|
||||||
image: unclecode/crawl4ai:${VERSION:-basic}-arm64
|
image: unclecode/crawl4ai:${VERSION:-latest}-arm64
|
||||||
profiles: ["hub-arm64"]
|
profiles: ["hub-arm64"]
|
||||||
<<: *base-config
|
<<: *base-config
|
||||||
@@ -357,8 +357,7 @@ async def demo_performance_analysis():
|
|||||||
async with AsyncWebCrawler() as crawler:
|
async with AsyncWebCrawler() as crawler:
|
||||||
config = CrawlerRunConfig(
|
config = CrawlerRunConfig(
|
||||||
capture_network_requests=True,
|
capture_network_requests=True,
|
||||||
wait_until="networkidle",
|
page_timeout=60 * 2 * 1000 # 120 seconds
|
||||||
page_timeout=60000 # 60 seconds
|
|
||||||
)
|
)
|
||||||
|
|
||||||
result = await crawler.arun(
|
result = await crawler.arun(
|
||||||
@@ -406,6 +405,13 @@ async def demo_performance_analysis():
|
|||||||
"url": url,
|
"url": url,
|
||||||
"duration_ms": duration
|
"duration_ms": duration
|
||||||
})
|
})
|
||||||
|
if isinstance(timing, dict) and "requestStart" in timing and "responseStart" in timing and "startTime" in timing:
|
||||||
|
# Convert to milliseconds
|
||||||
|
duration = (timing["responseStart"] - timing["requestStart"]) * 1000
|
||||||
|
resource_timings[resource_type].append({
|
||||||
|
"url": url,
|
||||||
|
"duration_ms": duration
|
||||||
|
})
|
||||||
|
|
||||||
# Calculate statistics for each resource type
|
# Calculate statistics for each resource type
|
||||||
print("\nPerformance by resource type:")
|
print("\nPerformance by resource type:")
|
||||||
@@ -455,14 +461,14 @@ async def main():
|
|||||||
os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True)
|
os.makedirs(os.path.join(__cur_dir__, "tmp"), exist_ok=True)
|
||||||
|
|
||||||
# Run basic examples
|
# Run basic examples
|
||||||
await demo_basic_network_capture()
|
# await demo_basic_network_capture()
|
||||||
await demo_basic_console_capture()
|
await demo_basic_console_capture()
|
||||||
await demo_combined_capture()
|
# await demo_combined_capture()
|
||||||
|
|
||||||
# Run advanced examples
|
# Run advanced examples
|
||||||
await analyze_spa_network_traffic()
|
# await analyze_spa_network_traffic()
|
||||||
await demo_security_analysis()
|
# await demo_security_analysis()
|
||||||
await demo_performance_analysis()
|
# await demo_performance_analysis()
|
||||||
|
|
||||||
print("\n=== Examples Complete ===")
|
print("\n=== Examples Complete ===")
|
||||||
print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}")
|
print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}")
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
0
docs/tutorials/coming_soon.md
Normal file
0
docs/tutorials/coming_soon.md
Normal file
Reference in New Issue
Block a user