Compare commits
2 Commits
codex/add-
...
coderabbit
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
32fcacafa6 | ||
|
|
45f1652d98 |
10
Dockerfile
10
Dockerfile
@@ -43,17 +43,9 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
|
|||||||
libjpeg-dev \
|
libjpeg-dev \
|
||||||
redis-server \
|
redis-server \
|
||||||
supervisor \
|
supervisor \
|
||||||
xvfb \
|
&& apt-get clean \
|
||||||
x11vnc \
|
|
||||||
fluxbox \
|
|
||||||
websockify \
|
|
||||||
&& apt-get clean \
|
|
||||||
&& rm -rf /var/lib/apt/lists/*
|
&& rm -rf /var/lib/apt/lists/*
|
||||||
|
|
||||||
# Install noVNC for browser-based VNC access
|
|
||||||
RUN git clone --depth 1 https://github.com/novnc/noVNC /opt/novnc \
|
|
||||||
&& git clone --depth 1 https://github.com/novnc/websockify /opt/novnc/utils/websockify
|
|
||||||
|
|
||||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||||
libglib2.0-0 \
|
libglib2.0-0 \
|
||||||
libnss3 \
|
libnss3 \
|
||||||
|
|||||||
@@ -135,13 +135,20 @@ def merge_chunks(
|
|||||||
word_token_ratio: float = 1.0,
|
word_token_ratio: float = 1.0,
|
||||||
splitter: Callable = None
|
splitter: Callable = None
|
||||||
) -> List[str]:
|
) -> List[str]:
|
||||||
"""Merges documents into chunks of specified token size.
|
"""
|
||||||
|
Merges a sequence of documents into chunks based on a target token count, with optional overlap.
|
||||||
|
|
||||||
|
Each document is split into tokens using the provided splitter function (defaults to str.split). Tokens are distributed into chunks aiming for the specified target size, with optional overlapping tokens between consecutive chunks. Returns a list of non-empty merged chunks as strings.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
docs: Input documents
|
docs: Sequence of input document strings to be merged.
|
||||||
target_size: Desired token count per chunk
|
target_size: Target number of tokens per chunk.
|
||||||
overlap: Number of tokens to overlap between chunks
|
overlap: Number of tokens to overlap between consecutive chunks.
|
||||||
word_token_ratio: Multiplier for word->token conversion
|
word_token_ratio: Multiplier to estimate token count from word count.
|
||||||
|
splitter: Callable used to split each document into tokens.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
List of merged document chunks as strings, each not exceeding the target token size.
|
||||||
"""
|
"""
|
||||||
# Pre-tokenize all docs and store token counts
|
# Pre-tokenize all docs and store token counts
|
||||||
splitter = splitter or str.split
|
splitter = splitter or str.split
|
||||||
@@ -150,7 +157,7 @@ def merge_chunks(
|
|||||||
total_tokens = 0
|
total_tokens = 0
|
||||||
|
|
||||||
for doc in docs:
|
for doc in docs:
|
||||||
tokens = doc.split()
|
tokens = splitter(doc)
|
||||||
count = int(len(tokens) * word_token_ratio)
|
count = int(len(tokens) * word_token_ratio)
|
||||||
if count: # Skip empty docs
|
if count: # Skip empty docs
|
||||||
token_counts.append(count)
|
token_counts.append(count)
|
||||||
@@ -1109,6 +1116,23 @@ def get_content_of_website_optimized(
|
|||||||
css_selector: str = None,
|
css_selector: str = None,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
) -> Dict[str, Any]:
|
) -> Dict[str, Any]:
|
||||||
|
"""
|
||||||
|
Extracts and cleans content from website HTML, optimizing for useful media and contextual information.
|
||||||
|
|
||||||
|
Parses the provided HTML to extract internal and external links, filters and scores images for usefulness, gathers contextual descriptions for media, removes unwanted or low-value elements, and converts the cleaned HTML to Markdown. Also extracts metadata and returns all structured content in a dictionary.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
url: The URL of the website being processed.
|
||||||
|
html: The raw HTML content to extract from.
|
||||||
|
word_count_threshold: Minimum word count for elements to be retained.
|
||||||
|
css_selector: Optional CSS selector to restrict extraction to specific elements.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A dictionary containing Markdown content, cleaned HTML, extraction success status, media and link lists, and metadata.
|
||||||
|
|
||||||
|
Raises:
|
||||||
|
InvalidCSSSelectorError: If a provided CSS selector does not match any elements.
|
||||||
|
"""
|
||||||
if not html:
|
if not html:
|
||||||
return None
|
return None
|
||||||
|
|
||||||
@@ -1151,6 +1175,20 @@ def get_content_of_website_optimized(
|
|||||||
|
|
||||||
def process_image(img, url, index, total_images):
|
def process_image(img, url, index, total_images):
|
||||||
# Check if an image has valid display and inside undesired html elements
|
# Check if an image has valid display and inside undesired html elements
|
||||||
|
"""
|
||||||
|
Processes an HTML image element to determine its relevance and extract metadata.
|
||||||
|
|
||||||
|
Evaluates an image's visibility, context, and usefulness based on its attributes and parent elements. If the image passes validation and exceeds a usefulness score threshold, returns a dictionary with its source, alt text, contextual description, score, and type. Otherwise, returns None.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
img: The BeautifulSoup image tag to process.
|
||||||
|
url: The base URL of the page containing the image.
|
||||||
|
index: The index of the image in the list of images on the page.
|
||||||
|
total_images: The total number of images on the page.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
A dictionary with image metadata if the image is considered useful, or None otherwise.
|
||||||
|
"""
|
||||||
def is_valid_image(img, parent, parent_classes):
|
def is_valid_image(img, parent, parent_classes):
|
||||||
style = img.get("style", "")
|
style = img.get("style", "")
|
||||||
src = img.get("src", "")
|
src = img.get("src", "")
|
||||||
@@ -1172,6 +1210,20 @@ def get_content_of_website_optimized(
|
|||||||
# Score an image for its usefulness
|
# Score an image for its usefulness
|
||||||
def score_image_for_usefulness(img, base_url, index, images_count):
|
def score_image_for_usefulness(img, base_url, index, images_count):
|
||||||
# Function to parse image height/width value and units
|
# Function to parse image height/width value and units
|
||||||
|
"""
|
||||||
|
Scores an HTML image element for usefulness based on size, format, attributes, and position.
|
||||||
|
|
||||||
|
The function evaluates an image's dimensions, file format, alt text, and its position among all images on the page to assign a usefulness score. Higher scores indicate images that are likely more relevant or informative for content extraction or summarization.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
img: The HTML image element to score.
|
||||||
|
base_url: The base URL used to resolve relative image sources.
|
||||||
|
index: The position of the image in the list of images on the page (zero-based).
|
||||||
|
images_count: The total number of images on the page.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
An integer usefulness score for the image.
|
||||||
|
"""
|
||||||
def parse_dimension(dimension):
|
def parse_dimension(dimension):
|
||||||
if dimension:
|
if dimension:
|
||||||
match = re.match(r"(\d+)(\D*)", dimension)
|
match = re.match(r"(\d+)(\D*)", dimension)
|
||||||
@@ -1186,6 +1238,16 @@ def get_content_of_website_optimized(
|
|||||||
# Fetch image file metadata to extract size and extension
|
# Fetch image file metadata to extract size and extension
|
||||||
def fetch_image_file_size(img, base_url):
|
def fetch_image_file_size(img, base_url):
|
||||||
# If src is relative path construct full URL, if not it may be CDN URL
|
# If src is relative path construct full URL, if not it may be CDN URL
|
||||||
|
"""
|
||||||
|
Fetches the file size of an image by sending a HEAD request to its URL.
|
||||||
|
|
||||||
|
Args:
|
||||||
|
img: A BeautifulSoup tag representing the image element.
|
||||||
|
base_url: The base URL to resolve relative image sources.
|
||||||
|
|
||||||
|
Returns:
|
||||||
|
The value of the "Content-Length" header as a string if available, otherwise None.
|
||||||
|
"""
|
||||||
img_url = urljoin(base_url, img.get("src"))
|
img_url = urljoin(base_url, img.get("src"))
|
||||||
try:
|
try:
|
||||||
response = requests.head(img_url)
|
response = requests.head(img_url)
|
||||||
@@ -1196,8 +1258,6 @@ def get_content_of_website_optimized(
|
|||||||
return None
|
return None
|
||||||
except InvalidSchema:
|
except InvalidSchema:
|
||||||
return None
|
return None
|
||||||
finally:
|
|
||||||
return
|
|
||||||
|
|
||||||
image_height = img.get("height")
|
image_height = img.get("height")
|
||||||
height_value, height_unit = parse_dimension(image_height)
|
height_value, height_unit = parse_dimension(image_height)
|
||||||
|
|||||||
@@ -17,7 +17,6 @@
|
|||||||
- [Screenshot Endpoint](#screenshot-endpoint)
|
- [Screenshot Endpoint](#screenshot-endpoint)
|
||||||
- [PDF Export Endpoint](#pdf-export-endpoint)
|
- [PDF Export Endpoint](#pdf-export-endpoint)
|
||||||
- [JavaScript Execution Endpoint](#javascript-execution-endpoint)
|
- [JavaScript Execution Endpoint](#javascript-execution-endpoint)
|
||||||
- [Browser VNC Endpoint](#browser-vnc-endpoint)
|
|
||||||
- [Library Context Endpoint](#library-context-endpoint)
|
- [Library Context Endpoint](#library-context-endpoint)
|
||||||
- [MCP (Model Context Protocol) Support](#mcp-model-context-protocol-support)
|
- [MCP (Model Context Protocol) Support](#mcp-model-context-protocol-support)
|
||||||
- [What is MCP?](#what-is-mcp)
|
- [What is MCP?](#what-is-mcp)
|
||||||
@@ -378,20 +377,6 @@ Executes JavaScript snippets on the specified URL and returns the full crawl res
|
|||||||
|
|
||||||
- `scripts`: List of JavaScript snippets to execute sequentially
|
- `scripts`: List of JavaScript snippets to execute sequentially
|
||||||
|
|
||||||
### Browser VNC Endpoint
|
|
||||||
|
|
||||||
```
|
|
||||||
GET /vnc
|
|
||||||
```
|
|
||||||
|
|
||||||
Opens a browser-based VNC session for interacting with the container's desktop environment. Use `/vnc/url` to retrieve only the iframe URL.
|
|
||||||
|
|
||||||
```
|
|
||||||
GET /vnc/url
|
|
||||||
```
|
|
||||||
|
|
||||||
Returns a JSON object containing the URL of the embedded noVNC client.
|
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Dockerfile Parameters
|
## Dockerfile Parameters
|
||||||
|
|||||||
@@ -33,11 +33,7 @@ from schemas import (
|
|||||||
)
|
)
|
||||||
|
|
||||||
from utils import (
|
from utils import (
|
||||||
FilterType,
|
FilterType, load_config, setup_logging, verify_email_domain
|
||||||
load_config,
|
|
||||||
setup_logging,
|
|
||||||
verify_email_domain,
|
|
||||||
get_base_url,
|
|
||||||
)
|
)
|
||||||
import os
|
import os
|
||||||
import sys
|
import sys
|
||||||
@@ -52,11 +48,7 @@ from fastapi import (
|
|||||||
)
|
)
|
||||||
from rank_bm25 import BM25Okapi
|
from rank_bm25 import BM25Okapi
|
||||||
from fastapi.responses import (
|
from fastapi.responses import (
|
||||||
StreamingResponse,
|
StreamingResponse, RedirectResponse, PlainTextResponse, JSONResponse
|
||||||
RedirectResponse,
|
|
||||||
PlainTextResponse,
|
|
||||||
JSONResponse,
|
|
||||||
HTMLResponse,
|
|
||||||
)
|
)
|
||||||
from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
|
from fastapi.middleware.httpsredirect import HTTPSRedirectMiddleware
|
||||||
from fastapi.middleware.trustedhost import TrustedHostMiddleware
|
from fastapi.middleware.trustedhost import TrustedHostMiddleware
|
||||||
@@ -137,31 +129,11 @@ app.mount(
|
|||||||
name="play",
|
name="play",
|
||||||
)
|
)
|
||||||
|
|
||||||
# Serve noVNC static files if available
|
|
||||||
VNC_DIR = pathlib.Path("/opt/novnc")
|
|
||||||
if VNC_DIR.exists():
|
|
||||||
app.mount("/novnc", StaticFiles(directory=VNC_DIR, html=True), name="novnc")
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/")
|
@app.get("/")
|
||||||
async def root():
|
async def root():
|
||||||
return RedirectResponse("/playground")
|
return RedirectResponse("/playground")
|
||||||
|
|
||||||
|
|
||||||
@app.get("/vnc")
|
|
||||||
async def vnc_page(request: Request):
|
|
||||||
"""Return a simple page embedding the noVNC client."""
|
|
||||||
url = f"{get_base_url(request)}/novnc/vnc.html?autoconnect=true&resize=scale"
|
|
||||||
html = f"<iframe src='{url}' width='1024' height='768' style='border:none'></iframe>"
|
|
||||||
return HTMLResponse(f"<html><body>{html}</body></html>")
|
|
||||||
|
|
||||||
|
|
||||||
@app.get("/vnc/url")
|
|
||||||
async def vnc_url(request: Request):
|
|
||||||
"""Return the direct URL to the noVNC client."""
|
|
||||||
url = f"{get_base_url(request)}/novnc/vnc.html?autoconnect=true&resize=scale"
|
|
||||||
return {"url": url}
|
|
||||||
|
|
||||||
# ─────────────────── infra / middleware ─────────────────────
|
# ─────────────────── infra / middleware ─────────────────────
|
||||||
redis = aioredis.from_url(config["redis"].get("uri", "redis://localhost"))
|
redis = aioredis.from_url(config["redis"].get("uri", "redis://localhost"))
|
||||||
|
|
||||||
|
|||||||
@@ -20,53 +20,9 @@ user=appuser ; Run gunicorn as our non-root user
|
|||||||
autorestart=true
|
autorestart=true
|
||||||
priority=20
|
priority=20
|
||||||
environment=PYTHONUNBUFFERED=1 ; Ensure Python output is sent straight to logs
|
environment=PYTHONUNBUFFERED=1 ; Ensure Python output is sent straight to logs
|
||||||
environment=DISPLAY=:99
|
|
||||||
stdout_logfile=/dev/stdout ; Redirect gunicorn stdout to container stdout
|
stdout_logfile=/dev/stdout ; Redirect gunicorn stdout to container stdout
|
||||||
stdout_logfile_maxbytes=0
|
stdout_logfile_maxbytes=0
|
||||||
stderr_logfile=/dev/stderr ; Redirect gunicorn stderr to container stderr
|
stderr_logfile=/dev/stderr ; Redirect gunicorn stderr to container stderr
|
||||||
stderr_logfile_maxbytes=0
|
stderr_logfile_maxbytes=0
|
||||||
|
|
||||||
[program:xvfb]
|
|
||||||
command=/usr/bin/Xvfb :99 -screen 0 1280x720x24
|
|
||||||
user=appuser
|
|
||||||
autorestart=true
|
|
||||||
priority=5
|
|
||||||
stdout_logfile=/dev/stdout
|
|
||||||
stdout_logfile_maxbytes=0
|
|
||||||
stderr_logfile=/dev/stderr
|
|
||||||
stderr_logfile_maxbytes=0
|
|
||||||
|
|
||||||
[program:fluxbox]
|
|
||||||
command=/usr/bin/fluxbox
|
|
||||||
user=appuser
|
|
||||||
autorestart=true
|
|
||||||
priority=6
|
|
||||||
environment=DISPLAY=:99
|
|
||||||
stdout_logfile=/dev/stdout
|
|
||||||
stdout_logfile_maxbytes=0
|
|
||||||
stderr_logfile=/dev/stderr
|
|
||||||
stderr_logfile_maxbytes=0
|
|
||||||
|
|
||||||
[program:x11vnc]
|
|
||||||
command=/usr/bin/x11vnc -display :99 -nopw -forever -shared -rfbport 5900 -quiet
|
|
||||||
user=appuser
|
|
||||||
autorestart=true
|
|
||||||
priority=7
|
|
||||||
environment=DISPLAY=:99
|
|
||||||
stdout_logfile=/dev/stdout
|
|
||||||
stdout_logfile_maxbytes=0
|
|
||||||
stderr_logfile=/dev/stderr
|
|
||||||
stderr_logfile_maxbytes=0
|
|
||||||
|
|
||||||
[program:websockify]
|
|
||||||
command=/usr/bin/websockify 6080 localhost:5900 --web /opt/novnc
|
|
||||||
user=appuser
|
|
||||||
autorestart=true
|
|
||||||
priority=8
|
|
||||||
environment=DISPLAY=:99
|
|
||||||
stdout_logfile=/dev/stdout
|
|
||||||
stdout_logfile_maxbytes=0
|
|
||||||
stderr_logfile=/dev/stderr
|
|
||||||
stderr_logfile_maxbytes=0
|
|
||||||
|
|
||||||
# Optional: Add filebeat or other logging agents here if needed
|
# Optional: Add filebeat or other logging agents here if needed
|
||||||
Reference in New Issue
Block a user