Compare commits

..

1 Commits

Author SHA1 Message Date
ntohidi
ca100c6518 Release v0.7.6: The 0.7.6 Update
- Updated version to 0.7.6
- Added comprehensive demo and release notes
- Updated all documentation
- Updated the version in Dockerfile to 0.7.6
2025-10-22 13:46:54 +02:00
15 changed files with 707 additions and 1979 deletions

Submodule .yoyo/snapshot deleted from 5e783b71e7

View File

@@ -1,23 +1,22 @@
import asyncio import asyncio
import hashlib import time
from typing import List, Optional
import os import os
import shlex import sys
import shutil import shutil
import tempfile
import psutil
import signal import signal
import subprocess import subprocess
import sys import shlex
import tempfile
import time
import warnings
from typing import List, Optional
import psutil
from playwright.async_api import BrowserContext from playwright.async_api import BrowserContext
import hashlib
from .async_configs import BrowserConfig, CrawlerRunConfig
from .config import DOWNLOAD_PAGE_TIMEOUT
from .js_snippet import load_js_script from .js_snippet import load_js_script
from .config import DOWNLOAD_PAGE_TIMEOUT
from .async_configs import BrowserConfig, CrawlerRunConfig
from .utils import get_chromium_path from .utils import get_chromium_path
import warnings
BROWSER_DISABLE_OPTIONS = [ BROWSER_DISABLE_OPTIONS = [
"--disable-background-networking", "--disable-background-networking",
@@ -66,7 +65,7 @@ class ManagedBrowser:
_cleanup(): Terminates the browser process and removes the temporary directory. _cleanup(): Terminates the browser process and removes the temporary directory.
create_profile(): Static method to create a user profile by launching a browser for user interaction. create_profile(): Static method to create a user profile by launching a browser for user interaction.
""" """
@staticmethod @staticmethod
def build_browser_flags(config: BrowserConfig) -> List[str]: def build_browser_flags(config: BrowserConfig) -> List[str]:
"""Common CLI flags for launching Chromium""" """Common CLI flags for launching Chromium"""
@@ -93,25 +92,21 @@ class ManagedBrowser:
if config.light_mode: if config.light_mode:
flags.extend(BROWSER_DISABLE_OPTIONS) flags.extend(BROWSER_DISABLE_OPTIONS)
if config.text_mode: if config.text_mode:
flags.extend( flags.extend([
[ "--blink-settings=imagesEnabled=false",
"--blink-settings=imagesEnabled=false", "--disable-remote-fonts",
"--disable-remote-fonts", "--disable-images",
"--disable-images", "--disable-javascript",
"--disable-javascript", "--disable-software-rasterizer",
"--disable-software-rasterizer", "--disable-dev-shm-usage",
"--disable-dev-shm-usage", ])
]
)
# proxy support # proxy support
if config.proxy: if config.proxy:
flags.append(f"--proxy-server={config.proxy}") flags.append(f"--proxy-server={config.proxy}")
elif config.proxy_config: elif config.proxy_config:
creds = "" creds = ""
if config.proxy_config.username and config.proxy_config.password: if config.proxy_config.username and config.proxy_config.password:
creds = ( creds = f"{config.proxy_config.username}:{config.proxy_config.password}@"
f"{config.proxy_config.username}:{config.proxy_config.password}@"
)
flags.append(f"--proxy-server={creds}{config.proxy_config.server}") flags.append(f"--proxy-server={creds}{config.proxy_config.server}")
# dedupe # dedupe
return list(dict.fromkeys(flags)) return list(dict.fromkeys(flags))
@@ -132,7 +127,7 @@ class ManagedBrowser:
logger=None, logger=None,
host: str = "localhost", host: str = "localhost",
debugging_port: int = 9222, debugging_port: int = 9222,
cdp_url: Optional[str] = None, cdp_url: Optional[str] = None,
browser_config: Optional[BrowserConfig] = None, browser_config: Optional[BrowserConfig] = None,
): ):
""" """
@@ -168,7 +163,7 @@ class ManagedBrowser:
Starts the browser process or returns CDP endpoint URL. Starts the browser process or returns CDP endpoint URL.
If cdp_url is provided, returns it directly. If cdp_url is provided, returns it directly.
If user_data_dir is not provided for local browser, creates a temporary directory. If user_data_dir is not provided for local browser, creates a temporary directory.
Returns: Returns:
str: CDP endpoint URL str: CDP endpoint URL
""" """
@@ -184,9 +179,10 @@ class ManagedBrowser:
# Get browser path and args based on OS and browser type # Get browser path and args based on OS and browser type
# browser_path = self._get_browser_path() # browser_path = self._get_browser_path()
args = await self._get_browser_args() args = await self._get_browser_args()
if self.browser_config.extra_args: if self.browser_config.extra_args:
args.extend(self.browser_config.extra_args) args.extend(self.browser_config.extra_args)
# ── make sure no old Chromium instance is owning the same port/profile ── # ── make sure no old Chromium instance is owning the same port/profile ──
try: try:
@@ -204,9 +200,7 @@ class ManagedBrowser:
else: # macOS / Linux else: # macOS / Linux
# kill any process listening on the same debugging port # kill any process listening on the same debugging port
pids = ( pids = (
subprocess.check_output( subprocess.check_output(shlex.split(f"lsof -t -i:{self.debugging_port}"))
shlex.split(f"lsof -t -i:{self.debugging_port}")
)
.decode() .decode()
.strip() .strip()
.splitlines() .splitlines()
@@ -225,7 +219,8 @@ class ManagedBrowser:
os.remove(fp) os.remove(fp)
except Exception as _e: except Exception as _e:
# non-fatal — we'll try to start anyway, but log what happened # non-fatal — we'll try to start anyway, but log what happened
self.logger.warning(f"pre-launch cleanup failed: {_e}", tag="BROWSER") self.logger.warning(f"pre-launch cleanup failed: {_e}", tag="BROWSER")
# Start browser process # Start browser process
try: try:
@@ -233,26 +228,26 @@ class ManagedBrowser:
# On Unix, we'll use preexec_fn=os.setpgrp to start the process in a new process group # On Unix, we'll use preexec_fn=os.setpgrp to start the process in a new process group
if sys.platform == "win32": if sys.platform == "win32":
self.browser_process = subprocess.Popen( self.browser_process = subprocess.Popen(
args, args,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
creationflags=subprocess.DETACHED_PROCESS creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP
| subprocess.CREATE_NEW_PROCESS_GROUP,
) )
else: else:
self.browser_process = subprocess.Popen( self.browser_process = subprocess.Popen(
args, args,
stdout=subprocess.PIPE, stdout=subprocess.PIPE,
stderr=subprocess.PIPE, stderr=subprocess.PIPE,
preexec_fn=os.setpgrp, # Start in a new process group preexec_fn=os.setpgrp # Start in a new process group
) )
# If verbose is True print args used to run the process # If verbose is True print args used to run the process
if self.logger and self.browser_config.verbose: if self.logger and self.browser_config.verbose:
self.logger.debug( self.logger.debug(
f"Starting browser with args: {' '.join(args)}", tag="BROWSER" f"Starting browser with args: {' '.join(args)}",
) tag="BROWSER"
)
# We'll monitor for a short time to make sure it starts properly, but won't keep monitoring # We'll monitor for a short time to make sure it starts properly, but won't keep monitoring
await asyncio.sleep(0.5) # Give browser time to start await asyncio.sleep(0.5) # Give browser time to start
await self._initial_startup_check() await self._initial_startup_check()
@@ -269,7 +264,7 @@ class ManagedBrowser:
""" """
if not self.browser_process: if not self.browser_process:
return return
# Check that process started without immediate termination # Check that process started without immediate termination
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
if self.browser_process.poll() is not None: if self.browser_process.poll() is not None:
@@ -279,7 +274,7 @@ class ManagedBrowser:
stdout, stderr = self.browser_process.communicate(timeout=0.5) stdout, stderr = self.browser_process.communicate(timeout=0.5)
except subprocess.TimeoutExpired: except subprocess.TimeoutExpired:
pass pass
self.logger.error( self.logger.error(
message="Browser process terminated during startup | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}", message="Browser process terminated during startup | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
tag="ERROR", tag="ERROR",
@@ -289,7 +284,7 @@ class ManagedBrowser:
"stderr": stderr.decode() if stderr else "", "stderr": stderr.decode() if stderr else "",
}, },
) )
async def _monitor_browser_process(self): async def _monitor_browser_process(self):
""" """
Monitor the browser process for unexpected termination. Monitor the browser process for unexpected termination.
@@ -412,14 +407,7 @@ class ManagedBrowser:
if sys.platform == "win32": if sys.platform == "win32":
# On Windows we might need taskkill for detached processes # On Windows we might need taskkill for detached processes
try: try:
subprocess.run( subprocess.run(["taskkill", "/F", "/PID", str(self.browser_process.pid)])
[
"taskkill",
"/F",
"/PID",
str(self.browser_process.pid),
]
)
except Exception: except Exception:
self.browser_process.kill() self.browser_process.kill()
else: else:
@@ -429,7 +417,7 @@ class ManagedBrowser:
except Exception as e: except Exception as e:
self.logger.error( self.logger.error(
message="Error terminating browser: {error}", message="Error terminating browser: {error}",
tag="ERROR", tag="ERROR",
params={"error": str(e)}, params={"error": str(e)},
) )
@@ -442,77 +430,75 @@ class ManagedBrowser:
tag="ERROR", tag="ERROR",
params={"error": str(e)}, params={"error": str(e)},
) )
# These methods have been moved to BrowserProfiler class # These methods have been moved to BrowserProfiler class
@staticmethod @staticmethod
async def create_profile(browser_config=None, profile_name=None, logger=None): async def create_profile(browser_config=None, profile_name=None, logger=None):
""" """
This method has been moved to the BrowserProfiler class. This method has been moved to the BrowserProfiler class.
Creates a browser profile by launching a browser for interactive user setup Creates a browser profile by launching a browser for interactive user setup
and waits until the user closes it. The profile is stored in a directory that and waits until the user closes it. The profile is stored in a directory that
can be used later with BrowserConfig.user_data_dir. can be used later with BrowserConfig.user_data_dir.
Please use BrowserProfiler.create_profile() instead. Please use BrowserProfiler.create_profile() instead.
Example: Example:
```python ```python
from crawl4ai.browser_profiler import BrowserProfiler from crawl4ai.browser_profiler import BrowserProfiler
profiler = BrowserProfiler() profiler = BrowserProfiler()
profile_path = await profiler.create_profile(profile_name="my-login-profile") profile_path = await profiler.create_profile(profile_name="my-login-profile")
``` ```
""" """
from .browser_profiler import BrowserProfiler from .browser_profiler import BrowserProfiler
# Create a BrowserProfiler instance and delegate to it # Create a BrowserProfiler instance and delegate to it
profiler = BrowserProfiler(logger=logger) profiler = BrowserProfiler(logger=logger)
return await profiler.create_profile( return await profiler.create_profile(profile_name=profile_name, browser_config=browser_config)
profile_name=profile_name, browser_config=browser_config
)
@staticmethod @staticmethod
def list_profiles(): def list_profiles():
""" """
This method has been moved to the BrowserProfiler class. This method has been moved to the BrowserProfiler class.
Lists all available browser profiles in the Crawl4AI profiles directory. Lists all available browser profiles in the Crawl4AI profiles directory.
Please use BrowserProfiler.list_profiles() instead. Please use BrowserProfiler.list_profiles() instead.
Example: Example:
```python ```python
from crawl4ai.browser_profiler import BrowserProfiler from crawl4ai.browser_profiler import BrowserProfiler
profiler = BrowserProfiler() profiler = BrowserProfiler()
profiles = profiler.list_profiles() profiles = profiler.list_profiles()
``` ```
""" """
from .browser_profiler import BrowserProfiler from .browser_profiler import BrowserProfiler
# Create a BrowserProfiler instance and delegate to it # Create a BrowserProfiler instance and delegate to it
profiler = BrowserProfiler() profiler = BrowserProfiler()
return profiler.list_profiles() return profiler.list_profiles()
@staticmethod @staticmethod
def delete_profile(profile_name_or_path): def delete_profile(profile_name_or_path):
""" """
This method has been moved to the BrowserProfiler class. This method has been moved to the BrowserProfiler class.
Delete a browser profile by name or path. Delete a browser profile by name or path.
Please use BrowserProfiler.delete_profile() instead. Please use BrowserProfiler.delete_profile() instead.
Example: Example:
```python ```python
from crawl4ai.browser_profiler import BrowserProfiler from crawl4ai.browser_profiler import BrowserProfiler
profiler = BrowserProfiler() profiler = BrowserProfiler()
success = profiler.delete_profile("my-profile") success = profiler.delete_profile("my-profile")
``` ```
""" """
from .browser_profiler import BrowserProfiler from .browser_profiler import BrowserProfiler
# Create a BrowserProfiler instance and delegate to it # Create a BrowserProfiler instance and delegate to it
profiler = BrowserProfiler() profiler = BrowserProfiler()
return profiler.delete_profile(profile_name_or_path) return profiler.delete_profile(profile_name_or_path)
@@ -565,10 +551,11 @@ async def clone_runtime_state(
"accuracy": crawlerRunConfig.geolocation.accuracy, "accuracy": crawlerRunConfig.geolocation.accuracy,
} }
) )
return dst return dst
class BrowserManager: class BrowserManager:
""" """
Manages the browser instance and context. Manages the browser instance and context.
@@ -585,7 +572,7 @@ class BrowserManager:
""" """
_playwright_instance = None _playwright_instance = None
@classmethod @classmethod
async def get_playwright(cls, use_undetected: bool = False): async def get_playwright(cls, use_undetected: bool = False):
if use_undetected: if use_undetected:
@@ -593,11 +580,9 @@ class BrowserManager:
else: else:
from playwright.async_api import async_playwright from playwright.async_api import async_playwright
cls._playwright_instance = await async_playwright().start() cls._playwright_instance = await async_playwright().start()
return cls._playwright_instance return cls._playwright_instance
def __init__( def __init__(self, browser_config: BrowserConfig, logger=None, use_undetected: bool = False):
self, browser_config: BrowserConfig, logger=None, use_undetected: bool = False
):
""" """
Initialize the BrowserManager with a browser configuration. Initialize the BrowserManager with a browser configuration.
@@ -623,17 +608,16 @@ class BrowserManager:
# Keep track of contexts by a "config signature," so each unique config reuses a single context # Keep track of contexts by a "config signature," so each unique config reuses a single context
self.contexts_by_config = {} self.contexts_by_config = {}
self._contexts_lock = asyncio.Lock() self._contexts_lock = asyncio.Lock()
# Serialize context.new_page() across concurrent tasks to avoid races # Serialize context.new_page() across concurrent tasks to avoid races
# when using a shared persistent context (context.pages may be empty # when using a shared persistent context (context.pages may be empty
# for all racers). Prevents 'Target page/context closed' errors. # for all racers). Prevents 'Target page/context closed' errors.
self._page_lock = asyncio.Lock() self._page_lock = asyncio.Lock()
# Stealth adapter for stealth mode # Stealth adapter for stealth mode
self._stealth_adapter = None self._stealth_adapter = None
if self.config.enable_stealth and not self.use_undetected: if self.config.enable_stealth and not self.use_undetected:
from .browser_adapter import StealthAdapter from .browser_adapter import StealthAdapter
self._stealth_adapter = StealthAdapter() self._stealth_adapter = StealthAdapter()
# Initialize ManagedBrowser if needed # Initialize ManagedBrowser if needed
@@ -662,7 +646,7 @@ class BrowserManager:
""" """
if self.playwright is not None: if self.playwright is not None:
await self.close() await self.close()
if self.use_undetected: if self.use_undetected:
from patchright.async_api import async_playwright from patchright.async_api import async_playwright
else: else:
@@ -673,11 +657,7 @@ class BrowserManager:
if self.config.cdp_url or self.config.use_managed_browser: if self.config.cdp_url or self.config.use_managed_browser:
self.config.use_managed_browser = True self.config.use_managed_browser = True
cdp_url = ( cdp_url = await self.managed_browser.start() if not self.config.cdp_url else self.config.cdp_url
await self.managed_browser.start()
if not self.config.cdp_url
else self.config.cdp_url
)
self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url) self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
contexts = self.browser.contexts contexts = self.browser.contexts
if contexts: if contexts:
@@ -698,6 +678,7 @@ class BrowserManager:
self.default_context = self.browser self.default_context = self.browser
def _build_browser_args(self) -> dict: def _build_browser_args(self) -> dict:
"""Build browser launch arguments from config.""" """Build browser launch arguments from config."""
args = [ args = [
@@ -743,7 +724,7 @@ class BrowserManager:
# Deduplicate args # Deduplicate args
args = list(dict.fromkeys(args)) args = list(dict.fromkeys(args))
browser_args = {"headless": self.config.headless, "args": args} browser_args = {"headless": self.config.headless, "args": args}
if self.config.chrome_channel: if self.config.chrome_channel:
@@ -820,9 +801,9 @@ class BrowserManager:
context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT) context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT)
if self.config.downloads_path: if self.config.downloads_path:
context._impl_obj._options["accept_downloads"] = True context._impl_obj._options["accept_downloads"] = True
context._impl_obj._options["downloads_path"] = ( context._impl_obj._options[
self.config.downloads_path "downloads_path"
) ] = self.config.downloads_path
# Handle user agent and browser hints # Handle user agent and browser hints
if self.config.user_agent: if self.config.user_agent:
@@ -853,7 +834,7 @@ class BrowserManager:
or crawlerRunConfig.simulate_user or crawlerRunConfig.simulate_user
or crawlerRunConfig.magic or crawlerRunConfig.magic
): ):
await context.add_init_script(load_js_script("navigator_overrider")) await context.add_init_script(load_js_script("navigator_overrider"))
async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None): async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None):
""" """
@@ -864,7 +845,7 @@ class BrowserManager:
Context: Browser context object with the specified configurations Context: Browser context object with the specified configurations
""" """
# Base settings # Base settings
user_agent = self.config.headers.get("User-Agent", self.config.user_agent) user_agent = self.config.headers.get("User-Agent", self.config.user_agent)
viewport_settings = { viewport_settings = {
"width": self.config.viewport_width, "width": self.config.viewport_width,
"height": self.config.viewport_height, "height": self.config.viewport_height,
@@ -937,7 +918,7 @@ class BrowserManager:
"device_scale_factor": 1.0, "device_scale_factor": 1.0,
"java_script_enabled": self.config.java_script_enabled, "java_script_enabled": self.config.java_script_enabled,
} }
if crawlerRunConfig: if crawlerRunConfig:
# Check if there is value for crawlerRunConfig.proxy_config set add that to context # Check if there is value for crawlerRunConfig.proxy_config set add that to context
if crawlerRunConfig.proxy_config: if crawlerRunConfig.proxy_config:
@@ -945,12 +926,10 @@ class BrowserManager:
"server": crawlerRunConfig.proxy_config.server, "server": crawlerRunConfig.proxy_config.server,
} }
if crawlerRunConfig.proxy_config.username: if crawlerRunConfig.proxy_config.username:
proxy_settings.update( proxy_settings.update({
{ "username": crawlerRunConfig.proxy_config.username,
"username": crawlerRunConfig.proxy_config.username, "password": crawlerRunConfig.proxy_config.password,
"password": crawlerRunConfig.proxy_config.password, })
}
)
context_settings["proxy"] = proxy_settings context_settings["proxy"] = proxy_settings
if self.config.text_mode: if self.config.text_mode:
@@ -1008,12 +987,12 @@ class BrowserManager:
"cache_mode", "cache_mode",
"content_filter", "content_filter",
"semaphore_count", "semaphore_count",
"url", "url"
] ]
# Do NOT exclude locale, timezone_id, or geolocation as these DO affect browser context # Do NOT exclude locale, timezone_id, or geolocation as these DO affect browser context
# and should cause a new context to be created if they change # and should cause a new context to be created if they change
for key in ephemeral_keys: for key in ephemeral_keys:
if key in config_dict: if key in config_dict:
del config_dict[key] del config_dict[key]
@@ -1034,7 +1013,7 @@ class BrowserManager:
self.logger.warning( self.logger.warning(
message="Failed to apply stealth to page: {error}", message="Failed to apply stealth to page: {error}",
tag="STEALTH", tag="STEALTH",
params={"error": str(e)}, params={"error": str(e)}
) )
async def get_page(self, crawlerRunConfig: CrawlerRunConfig): async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
@@ -1060,10 +1039,8 @@ class BrowserManager:
if self.config.use_managed_browser: if self.config.use_managed_browser:
if self.config.storage_state: if self.config.storage_state:
context = await self.create_browser_context(crawlerRunConfig) context = await self.create_browser_context(crawlerRunConfig)
ctx = self.default_context # default context, one window only ctx = self.default_context # default context, one window only
ctx = await clone_runtime_state( ctx = await clone_runtime_state(context, ctx, crawlerRunConfig, self.config)
context, ctx, crawlerRunConfig, self.config
)
# Avoid concurrent new_page on shared persistent context # Avoid concurrent new_page on shared persistent context
# See GH-1198: context.pages can be empty under races # See GH-1198: context.pages can be empty under races
async with self._page_lock: async with self._page_lock:
@@ -1075,21 +1052,14 @@ class BrowserManager:
page = next((p for p in pages if p.url == crawlerRunConfig.url), None) page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
if not page: if not page:
if pages: if pages:
# FIX: Always create a new page for managed browsers to support concurrent crawling page = pages[0]
# Previously: page = pages[0]
async with self._page_lock:
page = await context.new_page()
await self._apply_stealth_to_page(page)
else: else:
# Double-check under lock to avoid TOCTOU and ensure only # Double-check under lock to avoid TOCTOU and ensure only
# one task calls new_page when pages=[] concurrently # one task calls new_page when pages=[] concurrently
async with self._page_lock: async with self._page_lock:
pages = context.pages pages = context.pages
if pages: if pages:
# FIX: Always create a new page for managed browsers to support concurrent crawling page = pages[0]
# Previously: page = pages[0]
page = await context.new_page()
await self._apply_stealth_to_page(page)
else: else:
page = await context.new_page() page = await context.new_page()
await self._apply_stealth_to_page(page) await self._apply_stealth_to_page(page)
@@ -1145,7 +1115,7 @@ class BrowserManager:
"""Close all browser resources and clean up.""" """Close all browser resources and clean up."""
if self.config.cdp_url: if self.config.cdp_url:
return return
if self.config.sleep_on_close: if self.config.sleep_on_close:
await asyncio.sleep(0.5) await asyncio.sleep(0.5)
@@ -1161,7 +1131,7 @@ class BrowserManager:
self.logger.error( self.logger.error(
message="Error closing context: {error}", message="Error closing context: {error}",
tag="ERROR", tag="ERROR",
params={"error": str(e)}, params={"error": str(e)}
) )
self.contexts_by_config.clear() self.contexts_by_config.clear()

View File

@@ -785,54 +785,6 @@ curl http://localhost:11235/crawl/job/crawl_xyz
The response includes `status` field: `"processing"`, `"completed"`, or `"failed"`. The response includes `status` field: `"processing"`, `"completed"`, or `"failed"`.
#### LLM Extraction Jobs with Webhooks
The same webhook system works for LLM extraction jobs via `/llm/job`:
```bash
# Submit LLM extraction job with webhook
curl -X POST http://localhost:11235/llm/job \
-H "Content-Type: application/json" \
-d '{
"url": "https://example.com/article",
"q": "Extract the article title, author, and main points",
"provider": "openai/gpt-4o-mini",
"webhook_config": {
"webhook_url": "https://myapp.com/webhooks/llm-complete",
"webhook_data_in_payload": true,
"webhook_headers": {
"X-Webhook-Secret": "your-secret-token"
}
}
}'
# Response: {"task_id": "llm_1234567890"}
```
**Your webhook receives:**
```json
{
"task_id": "llm_1234567890",
"task_type": "llm_extraction",
"status": "completed",
"timestamp": "2025-10-22T12:30:00.000000+00:00",
"urls": ["https://example.com/article"],
"data": {
"extracted_content": {
"title": "Understanding Web Scraping",
"author": "John Doe",
"main_points": ["Point 1", "Point 2", "Point 3"]
}
}
}
```
**Key Differences for LLM Jobs:**
- Task type is `"llm_extraction"` instead of `"crawl"`
- Extracted data is in `data.extracted_content`
- Single URL only (not an array)
- Supports schema-based extraction with `schema` parameter
> 💡 **Pro tip**: See [WEBHOOK_EXAMPLES.md](./WEBHOOK_EXAMPLES.md) for detailed examples including TypeScript client code, Flask webhook handlers, and failure handling. > 💡 **Pro tip**: See [WEBHOOK_EXAMPLES.md](./WEBHOOK_EXAMPLES.md) for detailed examples including TypeScript client code, Flask webhook handlers, and failure handling.
--- ---

File diff suppressed because it is too large Load Diff

View File

@@ -27,14 +27,6 @@
- [Hook Response Information](#hook-response-information) - [Hook Response Information](#hook-response-information)
- [Error Handling](#error-handling) - [Error Handling](#error-handling)
- [Hooks Utility: Function-Based Approach (Python)](#hooks-utility-function-based-approach-python) - [Hooks Utility: Function-Based Approach (Python)](#hooks-utility-function-based-approach-python)
- [Job Queue & Webhook API](#job-queue-webhook-api)
- [Why Use the Job Queue API?](#why-use-the-job-queue-api)
- [Available Endpoints](#available-endpoints)
- [Webhook Configuration](#webhook-configuration)
- [Usage Examples](#usage-examples)
- [Webhook Best Practices](#webhook-best-practices)
- [Use Cases](#use-cases)
- [Troubleshooting](#troubleshooting)
- [Dockerfile Parameters](#dockerfile-parameters) - [Dockerfile Parameters](#dockerfile-parameters)
- [Using the API](#using-the-api) - [Using the API](#using-the-api)
- [Playground Interface](#playground-interface) - [Playground Interface](#playground-interface)
@@ -1118,464 +1110,6 @@ if __name__ == "__main__":
--- ---
## Job Queue & Webhook API
The Docker deployment includes a powerful asynchronous job queue system with webhook support for both crawling and LLM extraction tasks. Instead of waiting for long-running operations to complete, submit jobs and receive real-time notifications via webhooks when they finish.
### Why Use the Job Queue API?
**Traditional Synchronous API (`/crawl`):**
- Client waits for entire crawl to complete
- Timeout issues with long-running crawls
- Resource blocking during execution
- Constant polling required for status updates
**Asynchronous Job Queue API (`/crawl/job`, `/llm/job`):**
- ✅ Submit job and continue immediately
- ✅ No timeout concerns for long operations
- ✅ Real-time webhook notifications on completion
- ✅ Better resource utilization
- ✅ Perfect for batch processing
- ✅ Ideal for microservice architectures
### Available Endpoints
#### 1. Crawl Job Endpoint
```
POST /crawl/job
```
Submit an asynchronous crawl job with optional webhook notification.
**Request Body:**
```json
{
"urls": ["https://example.com"],
"cache_mode": "bypass",
"extraction_strategy": {
"type": "JsonCssExtractionStrategy",
"schema": {
"title": "h1",
"content": ".article-body"
}
},
"webhook_config": {
"webhook_url": "https://your-app.com/webhook/crawl-complete",
"webhook_data_in_payload": true,
"webhook_headers": {
"X-Webhook-Secret": "your-secret-token",
"X-Custom-Header": "value"
}
}
}
```
**Response:**
```json
{
"task_id": "crawl_1698765432",
"message": "Crawl job submitted"
}
```
#### 2. LLM Extraction Job Endpoint
```
POST /llm/job
```
Submit an asynchronous LLM extraction job with optional webhook notification.
**Request Body:**
```json
{
"url": "https://example.com/article",
"q": "Extract the article title, author, publication date, and main points",
"provider": "openai/gpt-4o-mini",
"schema": "{\"title\": \"string\", \"author\": \"string\", \"date\": \"string\", \"points\": [\"string\"]}",
"cache": false,
"webhook_config": {
"webhook_url": "https://your-app.com/webhook/llm-complete",
"webhook_data_in_payload": true,
"webhook_headers": {
"X-Webhook-Secret": "your-secret-token"
}
}
}
```
**Response:**
```json
{
"task_id": "llm_1698765432",
"message": "LLM job submitted"
}
```
#### 3. Job Status Endpoint
```
GET /job/{task_id}
```
Check the status and retrieve results of a submitted job.
**Response (In Progress):**
```json
{
"task_id": "crawl_1698765432",
"status": "processing",
"message": "Job is being processed"
}
```
**Response (Completed):**
```json
{
"task_id": "crawl_1698765432",
"status": "completed",
"result": {
"markdown": "# Page Title\n\nContent...",
"extracted_content": {...},
"links": {...}
}
}
```
### Webhook Configuration
Webhooks provide real-time notifications when your jobs complete, eliminating the need for constant polling.
#### Webhook Config Parameters
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `webhook_url` | string | Yes | Your HTTP(S) endpoint to receive notifications |
| `webhook_data_in_payload` | boolean | No | Include full result data in webhook payload (default: false) |
| `webhook_headers` | object | No | Custom headers for authentication/identification |
#### Webhook Payload Format
**Success Notification (Crawl Job):**
```json
{
"task_id": "crawl_1698765432",
"task_type": "crawl",
"status": "completed",
"timestamp": "2025-10-22T12:30:00.000000+00:00",
"urls": ["https://example.com"],
"data": {
"markdown": "# Page content...",
"extracted_content": {...},
"links": {...}
}
}
```
**Success Notification (LLM Job):**
```json
{
"task_id": "llm_1698765432",
"task_type": "llm_extraction",
"status": "completed",
"timestamp": "2025-10-22T12:30:00.000000+00:00",
"urls": ["https://example.com/article"],
"data": {
"extracted_content": {
"title": "Understanding Web Scraping",
"author": "John Doe",
"date": "2025-10-22",
"points": ["Point 1", "Point 2"]
}
}
}
```
**Failure Notification:**
```json
{
"task_id": "crawl_1698765432",
"task_type": "crawl",
"status": "failed",
"timestamp": "2025-10-22T12:30:00.000000+00:00",
"urls": ["https://example.com"],
"error": "Connection timeout after 30 seconds"
}
```
#### Webhook Delivery & Retry
- **Delivery Method:** HTTP POST to your `webhook_url`
- **Content-Type:** `application/json`
- **Retry Policy:** Exponential backoff with 5 attempts
- Attempt 1: Immediate
- Attempt 2: 1 second delay
- Attempt 3: 2 seconds delay
- Attempt 4: 4 seconds delay
- Attempt 5: 8 seconds delay
- **Success Status Codes:** 200-299
- **Custom Headers:** Your `webhook_headers` are included in every request
### Usage Examples
#### Example 1: Python with Webhook Handler (Flask)
```python
from flask import Flask, request, jsonify
import requests
app = Flask(__name__)
# Webhook handler
@app.route('/webhook/crawl-complete', methods=['POST'])
def handle_crawl_webhook():
payload = request.json
if payload['status'] == 'completed':
print(f"✅ Job {payload['task_id']} completed!")
print(f"Task type: {payload['task_type']}")
# Access the crawl results
if 'data' in payload:
markdown = payload['data'].get('markdown', '')
extracted = payload['data'].get('extracted_content', {})
print(f"Extracted {len(markdown)} characters")
print(f"Structured data: {extracted}")
else:
print(f"❌ Job {payload['task_id']} failed: {payload.get('error')}")
return jsonify({"status": "received"}), 200
# Submit a crawl job with webhook
def submit_crawl_job():
response = requests.post(
"http://localhost:11235/crawl/job",
json={
"urls": ["https://example.com"],
"extraction_strategy": {
"type": "JsonCssExtractionStrategy",
"schema": {
"name": "Example Schema",
"baseSelector": "body",
"fields": [
{"name": "title", "selector": "h1", "type": "text"},
{"name": "description", "selector": "meta[name='description']", "type": "attribute", "attribute": "content"}
]
}
},
"webhook_config": {
"webhook_url": "https://your-app.com/webhook/crawl-complete",
"webhook_data_in_payload": True,
"webhook_headers": {
"X-Webhook-Secret": "your-secret-token"
}
}
}
)
task_id = response.json()['task_id']
print(f"Job submitted: {task_id}")
return task_id
if __name__ == '__main__':
app.run(port=5000)
```
#### Example 2: LLM Extraction with Webhooks
```python
import requests
def submit_llm_job_with_webhook():
response = requests.post(
"http://localhost:11235/llm/job",
json={
"url": "https://example.com/article",
"q": "Extract the article title, author, and main points",
"provider": "openai/gpt-4o-mini",
"webhook_config": {
"webhook_url": "https://your-app.com/webhook/llm-complete",
"webhook_data_in_payload": True,
"webhook_headers": {
"X-Webhook-Secret": "your-secret-token"
}
}
}
)
task_id = response.json()['task_id']
print(f"LLM job submitted: {task_id}")
return task_id
# Webhook handler for LLM jobs
@app.route('/webhook/llm-complete', methods=['POST'])
def handle_llm_webhook():
payload = request.json
if payload['status'] == 'completed':
extracted = payload['data']['extracted_content']
print(f"✅ LLM extraction completed!")
print(f"Results: {extracted}")
else:
print(f"❌ LLM extraction failed: {payload.get('error')}")
return jsonify({"status": "received"}), 200
```
#### Example 3: Without Webhooks (Polling)
If you don't use webhooks, you can poll for results:
```python
import requests
import time
# Submit job
response = requests.post(
"http://localhost:11235/crawl/job",
json={"urls": ["https://example.com"]}
)
task_id = response.json()['task_id']
# Poll for results
while True:
result = requests.get(f"http://localhost:11235/job/{task_id}")
data = result.json()
if data['status'] == 'completed':
print("Job completed!")
print(data['result'])
break
elif data['status'] == 'failed':
print(f"Job failed: {data.get('error')}")
break
print("Still processing...")
time.sleep(2)
```
#### Example 4: Global Webhook Configuration
Set a default webhook URL in your `config.yml` to avoid repeating it in every request:
```yaml
# config.yml
api:
crawler:
# ... other settings ...
webhook:
default_url: "https://your-app.com/webhook/default"
default_headers:
X-Webhook-Secret: "your-secret-token"
```
Then submit jobs without webhook config:
```python
# Uses the global webhook configuration
response = requests.post(
"http://localhost:11235/crawl/job",
json={"urls": ["https://example.com"]}
)
```
### Webhook Best Practices
1. **Authentication:** Always use custom headers for webhook authentication
```json
"webhook_headers": {
"X-Webhook-Secret": "your-secret-token"
}
```
2. **Idempotency:** Design your webhook handler to be idempotent (safe to receive duplicate notifications)
3. **Fast Response:** Return HTTP 200 quickly; process data asynchronously if needed
```python
@app.route('/webhook', methods=['POST'])
def webhook():
payload = request.json
# Queue for background processing
queue.enqueue(process_webhook, payload)
return jsonify({"status": "received"}), 200
```
4. **Error Handling:** Handle both success and failure notifications
```python
   if payload['status'] == 'completed':
       process_success(payload)  # handle the crawl/extraction results
   elif payload['status'] == 'failed':
       log_failure(payload)  # log the error, retry, or alert
```
5. **Validation:** Verify webhook authenticity using custom headers
```python
secret = request.headers.get('X-Webhook-Secret')
if secret != os.environ['EXPECTED_SECRET']:
return jsonify({"error": "Unauthorized"}), 401
```
6. **Logging:** Log webhook deliveries for debugging
```python
logger.info(f"Webhook received: {payload['task_id']} - {payload['status']}")
```
### Use Cases
**1. Batch Processing**
Submit hundreds of URLs and get notified as each completes:
```python
urls = ["https://site1.com", "https://site2.com", ...]
for url in urls:
submit_crawl_job(url, webhook_url="https://app.com/webhook")
```
**2. Microservice Integration**
Integrate with event-driven architectures:
```python
# Service A submits job
task_id = submit_crawl_job(url)
# Service B receives webhook and triggers next step
@app.route('/webhook')
def webhook():
process_result(request.json)
trigger_next_service()
return "OK", 200
```
**3. Long-Running Extractions**
Handle complex LLM extractions without timeouts:
```python
submit_llm_job(
url="https://long-article.com",
q="Comprehensive summary with key points and analysis",
webhook_url="https://app.com/webhook/llm"
)
```
### Troubleshooting
**Webhook not receiving notifications?**
- Check your webhook URL is publicly accessible
- Verify firewall/security group settings
- Use webhook testing tools like webhook.site for debugging
- Check server logs for delivery attempts
- Ensure your handler returns 200-299 status code
**Job stuck in processing?**
- Check Redis connection: `docker logs <container_name> | grep redis`
- Verify worker processes: `docker exec <container_name> ps aux | grep worker`
- Check server logs: `docker logs <container_name>`
**Need to cancel a job?**
Jobs are processed asynchronously. If you need to cancel:
- Delete the task from Redis (requires Redis CLI access)
- Or implement a cancellation endpoint in your webhook handler
---
## Dockerfile Parameters ## Dockerfile Parameters
You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file. You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file.

View File

@@ -529,19 +529,8 @@ class AdminDashboard {
</label> </label>
</div> </div>
<div class="form-group full-width"> <div class="form-group full-width">
<label>Long Description (Markdown - Overview tab)</label> <label>Integration Guide</label>
<textarea id="form-long-description" rows="10" placeholder="Enter detailed description with markdown formatting...">${app?.long_description || ''}</textarea> <textarea id="form-integration" rows="10">${app?.integration_guide || ''}</textarea>
<small>Markdown support: **bold**, *italic*, [links](url), # headers, code blocks, lists</small>
</div>
<div class="form-group full-width">
<label>Integration Guide (Markdown - Integration tab)</label>
<textarea id="form-integration" rows="20" placeholder="Enter integration guide with installation, examples, and code snippets using markdown...">${app?.integration_guide || ''}</textarea>
<small>Single markdown field with installation, examples, and complete guide. Code blocks get auto copy buttons.</small>
</div>
<div class="form-group full-width">
<label>Documentation (Markdown - Documentation tab)</label>
<textarea id="form-documentation" rows="20" placeholder="Enter documentation with API reference, examples, and best practices using markdown...">${app?.documentation || ''}</textarea>
<small>Full documentation with API reference, examples, best practices, etc.</small>
</div> </div>
</div> </div>
`; `;
@@ -723,9 +712,7 @@ class AdminDashboard {
data.contact_email = document.getElementById('form-email').value; data.contact_email = document.getElementById('form-email').value;
data.featured = document.getElementById('form-featured').checked ? 1 : 0; data.featured = document.getElementById('form-featured').checked ? 1 : 0;
data.sponsored = document.getElementById('form-sponsored').checked ? 1 : 0; data.sponsored = document.getElementById('form-sponsored').checked ? 1 : 0;
data.long_description = document.getElementById('form-long-description').value;
data.integration_guide = document.getElementById('form-integration').value; data.integration_guide = document.getElementById('form-integration').value;
data.documentation = document.getElementById('form-documentation').value;
} else if (type === 'articles') { } else if (type === 'articles') {
data.title = document.getElementById('form-title').value; data.title = document.getElementById('form-title').value;
data.slug = this.generateSlug(data.title); data.slug = this.generateSlug(data.title);

View File

@@ -278,12 +278,12 @@
} }
.tab-content { .tab-content {
display: none !important; display: none;
padding: 2rem; padding: 2rem;
} }
.tab-content.active { .tab-content.active {
display: block !important; display: block;
} }
/* Overview Layout */ /* Overview Layout */
@@ -510,31 +510,6 @@
line-height: 1.5; line-height: 1.5;
} }
/* Markdown rendered code blocks */
.integration-content pre,
.docs-content pre {
background: var(--bg-dark);
border: 1px solid var(--border-color);
margin: 1rem 0;
padding: 1rem;
padding-top: 2.5rem; /* Space for copy button */
overflow-x: auto;
position: relative;
max-height: none; /* Remove any height restrictions */
height: auto; /* Allow content to expand */
}
.integration-content pre code,
.docs-content pre code {
background: transparent;
padding: 0;
color: var(--text-secondary);
font-size: 0.875rem;
line-height: 1.5;
white-space: pre; /* Preserve whitespace and line breaks */
display: block;
}
/* Feature Grid */ /* Feature Grid */
.feature-grid { .feature-grid {
display: grid; display: grid;

View File

@@ -73,14 +73,27 @@
<div class="tabs"> <div class="tabs">
<button class="tab-btn active" data-tab="overview">Overview</button> <button class="tab-btn active" data-tab="overview">Overview</button>
<button class="tab-btn" data-tab="integration">Integration</button> <button class="tab-btn" data-tab="integration">Integration</button>
<!-- <button class="tab-btn" data-tab="docs">Documentation</button> <button class="tab-btn" data-tab="docs">Documentation</button>
<button class="tab-btn" data-tab="support">Support</button> --> <button class="tab-btn" data-tab="support">Support</button>
</div> </div>
<section id="overview-tab" class="tab-content active"> <section id="overview-tab" class="tab-content active">
<div class="overview-columns"> <div class="overview-columns">
<div class="overview-main"> <div class="overview-main">
<h2>Overview</h2>
<div id="app-overview">Overview content goes here.</div> <div id="app-overview">Overview content goes here.</div>
<h3>Key Features</h3>
<ul id="app-features" class="features-list">
<li>Feature 1</li>
<li>Feature 2</li>
<li>Feature 3</li>
</ul>
<h3>Use Cases</h3>
<div id="app-use-cases" class="use-cases">
<p>Describe how this app can help your workflow.</p>
</div>
</div> </div>
<aside class="sidebar"> <aside class="sidebar">
@@ -129,16 +142,37 @@
</section> </section>
<section id="integration-tab" class="tab-content"> <section id="integration-tab" class="tab-content">
<div class="integration-content" id="app-integration"> <div class="integration-content">
<h2>Integration Guide</h2>
<h3>Installation</h3>
<div class="code-block">
<pre><code id="install-code"># Installation instructions will appear here</code></pre>
</div>
<h3>Basic Usage</h3>
<div class="code-block">
<pre><code id="usage-code"># Usage example will appear here</code></pre>
</div>
<h3>Complete Integration Example</h3>
<div class="code-block">
<button class="copy-btn" id="copy-integration">Copy</button>
<pre><code id="integration-code"># Complete integration guide will appear here</code></pre>
</div>
</div> </div>
</section> </section>
<!-- <section id="docs-tab" class="tab-content"> <section id="docs-tab" class="tab-content">
<div class="docs-content" id="app-docs"> <div class="docs-content">
<h2>Documentation</h2>
<div id="app-docs" class="doc-sections">
<p>Documentation coming soon.</p>
</div>
</div> </div>
</section> --> </section>
<!-- <section id="support-tab" class="tab-content"> <section id="support-tab" class="tab-content">
<div class="docs-content"> <div class="docs-content">
<h2>Support</h2> <h2>Support</h2>
<div class="support-grid"> <div class="support-grid">
@@ -156,7 +190,7 @@
</div> </div>
</div> </div>
</div> </div>
</section> --> </section>
</div> </div>
</main> </main>

View File

@@ -112,7 +112,7 @@ class AppDetailPage {
} }
// Contact // Contact
document.getElementById('app-contact') && (document.getElementById('app-contact').textContent = this.appData.contact_email || 'Not available'); document.getElementById('app-contact').textContent = this.appData.contact_email || 'Not available';
// Sidebar info // Sidebar info
document.getElementById('sidebar-downloads').textContent = this.formatNumber(this.appData.downloads || 0); document.getElementById('sidebar-downloads').textContent = this.formatNumber(this.appData.downloads || 0);
@@ -123,134 +123,146 @@ class AppDetailPage {
document.getElementById('sidebar-pricing').textContent = this.appData.pricing || 'Free'; document.getElementById('sidebar-pricing').textContent = this.appData.pricing || 'Free';
document.getElementById('sidebar-contact').textContent = this.appData.contact_email || 'contact@example.com'; document.getElementById('sidebar-contact').textContent = this.appData.contact_email || 'contact@example.com';
// Render tab contents from database fields // Integration guide
this.renderTabContents(); this.renderIntegrationGuide();
} }
renderTabContents() { renderIntegrationGuide() {
// Overview tab - use long_description from database // Installation code
const overviewDiv = document.getElementById('app-overview'); const installCode = document.getElementById('install-code');
if (overviewDiv) { if (installCode) {
if (this.appData.long_description) { if (this.appData.type === 'Open Source' && this.appData.github_url) {
overviewDiv.innerHTML = this.renderMarkdown(this.appData.long_description); installCode.textContent = `# Clone from GitHub
} else { git clone ${this.appData.github_url}
overviewDiv.innerHTML = `<p>${this.appData.description || 'No overview available.'}</p>`;
# Install dependencies
pip install -r requirements.txt`;
} else if (this.appData.name.toLowerCase().includes('api')) {
installCode.textContent = `# Install via pip
pip install ${this.appData.slug}
# Or install from source
pip install git+${this.appData.github_url || 'https://github.com/example/repo'}`;
} }
} }
// Integration tab - use integration_guide field from database // Usage code - customize based on category
const integrationDiv = document.getElementById('app-integration'); const usageCode = document.getElementById('usage-code');
if (integrationDiv) { if (usageCode) {
if (this.appData.integration_guide) { if (this.appData.category === 'Browser Automation') {
integrationDiv.innerHTML = this.renderMarkdown(this.appData.integration_guide); usageCode.textContent = `from crawl4ai import AsyncWebCrawler
// Add copy buttons to all code blocks from ${this.appData.slug.replace(/-/g, '_')} import ${this.appData.name.replace(/\s+/g, '')}
this.addCopyButtonsToCodeBlocks(integrationDiv);
} else { async def main():
integrationDiv.innerHTML = '<p>Integration guide not yet available. Please check the official website for details.</p>'; # Initialize ${this.appData.name}
automation = ${this.appData.name.replace(/\s+/g, '')}()
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com",
browser_config=automation.config,
wait_for="css:body"
)
print(result.markdown)`;
} else if (this.appData.category === 'Proxy Services') {
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
import ${this.appData.slug.replace(/-/g, '_')}
# Configure proxy
proxy_config = {
"server": "${this.appData.website_url || 'https://proxy.example.com'}",
"username": "your_username",
"password": "your_password"
}
async with AsyncWebCrawler(proxy=proxy_config) as crawler:
result = await crawler.arun(
url="https://example.com",
bypass_cache=True
)
print(result.status_code)`;
} else if (this.appData.category === 'LLM Integration') {
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy
# Configure LLM extraction
strategy = LLMExtractionStrategy(
provider="${this.appData.name.toLowerCase().includes('gpt') ? 'openai' : 'anthropic'}",
api_key="your-api-key",
model="${this.appData.name.toLowerCase().includes('gpt') ? 'gpt-4' : 'claude-3'}",
instruction="Extract structured data"
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com",
extraction_strategy=strategy
)
print(result.extracted_content)`;
} }
} }
// Documentation tab - use documentation field from database // Integration example
const docsDiv = document.getElementById('app-docs'); const integrationCode = document.getElementById('integration-code');
if (docsDiv) { if (integrationCode) {
if (this.appData.documentation) { integrationCode.textContent = this.appData.integration_guide ||
docsDiv.innerHTML = this.renderMarkdown(this.appData.documentation); `# Complete ${this.appData.name} Integration Example
// Add copy buttons to all code blocks
this.addCopyButtonsToCodeBlocks(docsDiv); from crawl4ai import AsyncWebCrawler
} else { from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
docsDiv.innerHTML = '<p>Documentation coming soon.</p>'; import json
}
async def crawl_with_${this.appData.slug.replace(/-/g, '_')}():
"""
Complete example showing how to use ${this.appData.name}
with Crawl4AI for production web scraping
"""
# Define extraction schema
schema = {
"name": "ProductList",
"baseSelector": "div.product",
"fields": [
{"name": "title", "selector": "h2", "type": "text"},
{"name": "price", "selector": ".price", "type": "text"},
{"name": "image", "selector": "img", "type": "attribute", "attribute": "src"},
{"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
]
}
# Initialize crawler with ${this.appData.name}
async with AsyncWebCrawler(
browser_type="chromium",
headless=True,
verbose=True
) as crawler:
# Crawl with extraction
result = await crawler.arun(
url="https://example.com/products",
extraction_strategy=JsonCssExtractionStrategy(schema),
cache_mode="bypass",
wait_for="css:.product",
screenshot=True
)
# Process results
if result.success:
products = json.loads(result.extracted_content)
print(f"Found {len(products)} products")
for product in products[:5]:
print(f"- {product['title']}: {product['price']}")
return products
# Run the crawler
if __name__ == "__main__":
import asyncio
asyncio.run(crawl_with_${this.appData.slug.replace(/-/g, '_')}())`;
} }
} }
addCopyButtonsToCodeBlocks(container) {
// Find all code blocks and add copy buttons
const codeBlocks = container.querySelectorAll('pre code');
codeBlocks.forEach(codeBlock => {
const pre = codeBlock.parentElement;
// Skip if already has a copy button
if (pre.querySelector('.copy-btn')) return;
// Create copy button
const copyBtn = document.createElement('button');
copyBtn.className = 'copy-btn';
copyBtn.textContent = 'Copy';
copyBtn.onclick = () => {
navigator.clipboard.writeText(codeBlock.textContent).then(() => {
copyBtn.textContent = '✓ Copied!';
setTimeout(() => {
copyBtn.textContent = 'Copy';
}, 2000);
});
};
// Add button to pre element
pre.style.position = 'relative';
pre.insertBefore(copyBtn, codeBlock);
});
}
renderMarkdown(text) {
if (!text) return '';
// Store code blocks temporarily to protect them from processing
const codeBlocks = [];
let processed = text.replace(/```(\w+)?\n([\s\S]*?)```/g, (match, lang, code) => {
const placeholder = `___CODE_BLOCK_${codeBlocks.length}___`;
codeBlocks.push(`<pre><code class="language-${lang || ''}">${this.escapeHtml(code)}</code></pre>`);
return placeholder;
});
// Store inline code temporarily
const inlineCodes = [];
processed = processed.replace(/`([^`]+)`/g, (match, code) => {
const placeholder = `___INLINE_CODE_${inlineCodes.length}___`;
inlineCodes.push(`<code>${this.escapeHtml(code)}</code>`);
return placeholder;
});
// Now process the rest of the markdown
processed = processed
// Headers
.replace(/^### (.*$)/gim, '<h3>$1</h3>')
.replace(/^## (.*$)/gim, '<h2>$1</h2>')
.replace(/^# (.*$)/gim, '<h1>$1</h1>')
// Bold
.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
// Italic
.replace(/\*(.*?)\*/g, '<em>$1</em>')
// Links
.replace(/\[([^\]]+)\]\(([^)]+)\)/g, '<a href="$2" target="_blank">$1</a>')
// Line breaks
.replace(/\n\n/g, '</p><p>')
.replace(/\n/g, '<br>')
// Lists
.replace(/^\* (.*)$/gim, '<li>$1</li>')
.replace(/^- (.*)$/gim, '<li>$1</li>')
// Wrap in paragraphs
.replace(/^(?!<[h|p|pre|ul|ol|li])/gim, '<p>')
.replace(/(?<![>])$/gim, '</p>');
// Restore inline code
inlineCodes.forEach((code, i) => {
processed = processed.replace(`___INLINE_CODE_${i}___`, code);
});
// Restore code blocks
codeBlocks.forEach((block, i) => {
processed = processed.replace(`___CODE_BLOCK_${i}___`, block);
});
return processed;
}
escapeHtml(text) {
const div = document.createElement('div');
div.textContent = text;
return div.innerHTML;
}
formatNumber(num) { formatNumber(num) {
if (num >= 1000000) { if (num >= 1000000) {
return (num / 1000000).toFixed(1) + 'M'; return (num / 1000000).toFixed(1) + 'M';
@@ -263,27 +275,45 @@ class AppDetailPage {
setupEventListeners() { setupEventListeners() {
// Tab switching // Tab switching
const tabs = document.querySelectorAll('.tab-btn'); const tabs = document.querySelectorAll('.tab-btn');
tabs.forEach(tab => { tabs.forEach(tab => {
tab.addEventListener('click', () => { tab.addEventListener('click', () => {
// Update active tab button // Update active tab
tabs.forEach(t => t.classList.remove('active')); tabs.forEach(t => t.classList.remove('active'));
tab.classList.add('active'); tab.classList.add('active');
// Show corresponding content // Show corresponding content
const tabName = tab.dataset.tab; const tabName = tab.dataset.tab;
document.querySelectorAll('.tab-content').forEach(content => {
// Hide all tab contents
const allTabContents = document.querySelectorAll('.tab-content');
allTabContents.forEach(content => {
content.classList.remove('active'); content.classList.remove('active');
}); });
document.getElementById(`${tabName}-tab`).classList.add('active');
});
});
// Show the selected tab content // Copy integration code
const targetTab = document.getElementById(`${tabName}-tab`); document.getElementById('copy-integration').addEventListener('click', () => {
if (targetTab) { const code = document.getElementById('integration-code').textContent;
targetTab.classList.add('active'); navigator.clipboard.writeText(code).then(() => {
} const btn = document.getElementById('copy-integration');
const originalText = btn.innerHTML;
btn.innerHTML = '<span>✓</span> Copied!';
setTimeout(() => {
btn.innerHTML = originalText;
}, 2000);
});
});
// Copy code buttons
document.querySelectorAll('.copy-btn').forEach(btn => {
btn.addEventListener('click', (e) => {
const codeBlock = e.target.closest('.code-block');
const code = codeBlock.querySelector('code').textContent;
navigator.clipboard.writeText(code).then(() => {
btn.textContent = 'Copied!';
setTimeout(() => {
btn.textContent = 'Copy';
}, 2000);
});
}); });
}); });
} }

View File

@@ -471,17 +471,13 @@ async def delete_sponsor(sponsor_id: int):
app.include_router(router) app.include_router(router)
# Version info
VERSION = "1.1.0"
BUILD_DATE = "2025-10-26"
@app.get("/") @app.get("/")
async def root(): async def root():
"""API info""" """API info"""
return { return {
"name": "Crawl4AI Marketplace API", "name": "Crawl4AI Marketplace API",
"version": VERSION, "version": "1.0.0",
"build_date": BUILD_DATE,
"endpoints": [ "endpoints": [
"/marketplace/api/apps", "/marketplace/api/apps",
"/marketplace/api/articles", "/marketplace/api/articles",

View File

@@ -31,7 +31,7 @@ dependencies = [
"rank-bm25~=0.2", "rank-bm25~=0.2",
"snowballstemmer~=2.2", "snowballstemmer~=2.2",
"pydantic>=2.10", "pydantic>=2.10",
"pyOpenSSL>=25.3.0", "pyOpenSSL>=24.3.0",
"psutil>=6.1.1", "psutil>=6.1.1",
"PyYAML>=6.0", "PyYAML>=6.0",
"nltk>=3.9.1", "nltk>=3.9.1",

View File

@@ -19,7 +19,7 @@ rank-bm25~=0.2
colorama~=0.4 colorama~=0.4
snowballstemmer~=2.2 snowballstemmer~=2.2
pydantic>=2.10 pydantic>=2.10
pyOpenSSL>=25.3.0 pyOpenSSL>=24.3.0
psutil>=6.1.1 psutil>=6.1.1
PyYAML>=6.0 PyYAML>=6.0
nltk>=3.9.1 nltk>=3.9.1

View File

@@ -1,283 +0,0 @@
"""
Compact test suite for CDP concurrency fix.
This file consolidates all tests related to the CDP concurrency fix for
AsyncWebCrawler.arun_many() with managed browsers.
The bug was that all concurrent tasks were fighting over one shared tab,
causing failures. This has been fixed by modifying the get_page() method
in browser_manager.py to always create new pages instead of reusing pages[0].
"""
import asyncio
import shutil
import sys
import tempfile
from pathlib import Path
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
from crawl4ai.async_configs import BrowserConfig
# =============================================================================
# TEST 1: Basic arun_many functionality
# =============================================================================
async def test_basic_arun_many():
    """Verify that ``arun_many()`` crawls multiple URLs without raising.

    Uses cache bypass so every URL is actually fetched. Returns True on
    success; any exception propagates to the caller (``main()`` counts
    it as a failure).
    """
    print("=== TEST 1: Basic arun_many functionality ===")

    # Bypass the cache so the crawler really hits the network.
    config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    # Test URLs - using reliable test URLs
    test_urls = [
        "https://httpbin.org/html",  # Simple HTML page
        "https://httpbin.org/json",  # Simple JSON response
    ]

    async with AsyncWebCrawler() as crawler:
        print(f"Testing concurrent crawling of {len(test_urls)} URLs...")

        # Reaching this line without an exception is the pass criterion,
        # so the return value is deliberately ignored.
        await crawler.arun_many(urls=test_urls, config=config)

        # Plain string: the original used an f-string with no placeholders.
        print("✓ arun_many completed successfully")
        return True
# =============================================================================
# TEST 2: CDP Browser with Managed Configuration
# =============================================================================
async def test_arun_many_with_managed_cdp_browser():
    """Verify ``arun_many()`` against a managed CDP browser.

    Creates a throwaway user-data directory for the managed browser,
    crawls two URLs concurrently, and always removes the directory in
    the ``finally`` block. Returns True on success; failures are logged
    and re-raised so the caller can count them.
    """
    print("\n=== TEST 2: arun_many with managed CDP browser ===")

    # Temporary profile directory for the managed browser instance.
    user_data_dir = tempfile.mkdtemp(prefix="crawl4ai-cdp-test-")

    try:
        # Managed CDP mode: crawl4ai launches and owns the browser process.
        browser_config = BrowserConfig(
            use_managed_browser=True,
            browser_type="chromium",
            headless=True,
            user_data_dir=user_data_dir,
            verbose=True,
        )

        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,   # always fetch fresh content
            page_timeout=60000,            # ms; generous for slow CI machines
            wait_until="domcontentloaded",
        )

        # Test URLs - using reliable test URLs
        test_urls = [
            "https://httpbin.org/html",  # Simple HTML page
            "https://httpbin.org/json",  # Simple JSON response
        ]

        async with AsyncWebCrawler(config=browser_config) as crawler:
            print(f"Testing concurrent crawling of {len(test_urls)} URLs...")

            # Reaching this line without an exception is the pass criterion.
            await crawler.arun_many(urls=test_urls, config=crawler_config)

            print("✓ arun_many completed successfully with managed CDP browser")
            return True

    except Exception as e:
        print(f"❌ Test failed with error: {str(e)}")
        raise
    finally:
        # ignore_errors=True already swallows cleanup failures, so the
        # original's surrounding bare `except: pass` was redundant.
        shutil.rmtree(user_data_dir, ignore_errors=True)
# =============================================================================
# TEST 3: Concurrency Verification
# =============================================================================
async def test_concurrent_crawling():
    """Check that several URLs can be crawled concurrently via arun_many()."""
    print("\n=== TEST 3: Concurrent crawling verification ===")

    # Fresh fetches only — no cached results.
    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    # A small mix of fast public endpoints.
    targets = [
        "https://httpbin.org/html",  # Simple HTML page
        "https://httpbin.org/json",  # Simple JSON response
        "https://httpbin.org/uuid",  # Simple UUID response
        "https://example.com/",  # Standard example page
    ]

    async with AsyncWebCrawler() as crawler:
        print(f"Testing concurrent crawling of {len(targets)} URLs...")

        # Finishing without an exception means the fix holds.
        await crawler.arun_many(urls=targets, config=run_cfg)

        print("✓ arun_many completed successfully with concurrent crawling")
        return True
# =============================================================================
# TEST 4: Concurrency Fix Demonstration
# =============================================================================
async def test_concurrency_fix():
    """Demonstrate the page-contention fix with three concurrent URLs."""
    print("\n=== TEST 4: Concurrency fix demonstration ===")

    cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)  # never serve from cache

    urls = [
        "https://httpbin.org/html",  # Simple HTML page
        "https://httpbin.org/json",  # Simple JSON response
        "https://httpbin.org/uuid",  # Simple UUID response
    ]

    async with AsyncWebCrawler() as crawler:
        print(f"Testing concurrent crawling of {len(urls)} URLs...")

        # No exception here means the concurrency fix works.
        await crawler.arun_many(urls=urls, config=cfg)

        print("✓ arun_many completed successfully with concurrent crawling")
        return True
# =============================================================================
# TEST 5: Before/After Behavior Comparison
# =============================================================================
async def test_before_after_behavior():
    """Stress concurrent crawling with delayed endpoints (the pre-fix failure mode)."""
    print("\n=== TEST 5: Before/After behavior test ===")

    run_cfg = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)  # no cached hits

    # Mix slow and fast endpoints to maximise the chance of tab contention.
    stress_urls = [
        "https://httpbin.org/delay/1",  # Delayed response to increase chance of contention
        "https://httpbin.org/delay/2",  # Delayed response to increase chance of contention
        "https://httpbin.org/uuid",  # Fast response
        "https://httpbin.org/json",  # Fast response
    ]

    async with AsyncWebCrawler() as crawler:
        print(
            f"Testing concurrent crawling of {len(stress_urls)} URLs (including delayed responses)..."
        )
        print(
            "This test would have failed before the concurrency fix due to page contention."
        )

        # Completing without an exception is the pass criterion.
        await crawler.arun_many(urls=stress_urls, config=run_cfg)

        print("✓ arun_many completed successfully with concurrent crawling")
        print("✓ No page contention issues detected")
        return True
# =============================================================================
# TEST 6: Reference Pattern Test
# =============================================================================
async def test_reference_pattern():
    """Run the reference crawling pattern from the project docs end to end."""
    print("\n=== TEST 6: Reference pattern test ===")

    # Same settings as the reference example: fresh fetch, 60 s page budget.
    cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        page_timeout=60000,
        wait_until="domcontentloaded",
    )

    pages = [
        "https://httpbin.org/html",
        "https://httpbin.org/json",
        "https://httpbin.org/uuid",
    ]

    async with AsyncWebCrawler() as crawler:
        print(f"Testing concurrent crawling of {len(pages)} URLs...")

        # Finishing without an exception is the pass criterion.
        await crawler.arun_many(urls=pages, config=cfg)

        print("✓ arun_many completed successfully with concurrent crawling")
        print("✅ Reference pattern test completed successfully!")
# =============================================================================
# MAIN EXECUTION
# =============================================================================
async def main():
    """Run all tests."""
    print("Running compact CDP concurrency test suite...")
    print("=" * 60)

    # Each entry is an async callable; a raised exception counts as a failure.
    suite = [
        test_basic_arun_many,
        test_arun_many_with_managed_cdp_browser,
        test_concurrent_crawling,
        test_concurrency_fix,
        test_before_after_behavior,
        test_reference_pattern,
    ]

    passed, failed = 0, 0
    for case in suite:
        try:
            await case()
        except Exception as e:
            print(f"❌ Test failed: {str(e)}")
            failed += 1
        else:
            passed += 1

    print("\n" + "=" * 60)
    print(f"Test Results: {passed} passed, {failed} failed")

    if failed:
        print(f"{failed} test(s) failed!")
        return False
    print("🎉 All tests passed! The CDP concurrency fix is working correctly.")
    return True
if __name__ == "__main__":
    # Entry point: run the whole suite and translate the boolean result
    # into a process exit code (0 = all passed, 1 = any failure).
    success = asyncio.run(main())
    sys.exit(0 if success else 1)

View File

@@ -1,168 +0,0 @@
"""
Lightweight test to verify pyOpenSSL security fix (Issue #1545).
This test verifies the security requirements are met:
1. pyOpenSSL >= 25.3.0 is installed
2. cryptography >= 45.0.7 is installed (above vulnerable range)
3. SSL/TLS functionality works correctly
This test can run without full crawl4ai dependencies installed.
"""
import sys
from packaging import version
def test_package_versions():
    """Test that package versions meet security requirements."""
    print("=" * 70)
    print("TEST: Package Version Security Requirements (Issue #1545)")
    print("=" * 70)

    ok = True

    # --- pyOpenSSL: must be >= 25.3.0 ---
    try:
        import OpenSSL
    except ImportError as e:
        print(f"\n✗ FAIL: pyOpenSSL not installed - {e}")
        ok = False
    else:
        pyopenssl_version = OpenSSL.__version__
        print(f"\n✓ pyOpenSSL is installed: {pyopenssl_version}")
        if version.parse(pyopenssl_version) >= version.parse("25.3.0"):
            print(f" ✓ PASS: pyOpenSSL {pyopenssl_version} >= 25.3.0 (required)")
        else:
            print(f" ✗ FAIL: pyOpenSSL {pyopenssl_version} < 25.3.0 (required)")
            ok = False

    # --- cryptography: must be >= 45.0.7; vulnerable range is >=37.0.0 & <43.0.1 ---
    try:
        import cryptography
    except ImportError as e:
        print(f"\n✗ FAIL: cryptography not installed - {e}")
        ok = False
    else:
        crypto_version = cryptography.__version__
        print(f"\n✓ cryptography is installed: {crypto_version}")
        parsed = version.parse(crypto_version)
        if parsed >= version.parse("45.0.7"):
            print(f" ✓ PASS: cryptography {crypto_version} >= 45.0.7 (secure)")
            print(f" ✓ NOT in vulnerable range (37.0.0 to 43.0.0)")
        elif version.parse("37.0.0") <= parsed < version.parse("43.0.1"):
            print(f" ✗ FAIL: cryptography {crypto_version} is VULNERABLE")
            print(f" ✗ Version is in vulnerable range (>=37.0.0 & <43.0.1)")
            ok = False
        else:
            # NOTE(review): this branch warns without failing — versions in
            # [43.0.1, 45.0.7) still pass overall. Presumably intentional; confirm.
            print(f" ⚠ WARNING: cryptography {crypto_version} < 45.0.7")
            print(f" ⚠ May not meet security requirements")

    return ok
def test_ssl_basic_functionality():
    """Test that SSL/TLS basic functionality works."""
    print("\n" + "=" * 70)
    print("TEST: SSL/TLS Basic Functionality")
    print("=" * 70)
    try:
        import OpenSSL.SSL

        # Creating a context is enough to prove the native bindings load.
        OpenSSL.SSL.Context(OpenSSL.SSL.TLSv1_2_METHOD)
        print("\n✓ SSL Context created successfully")
        print(" ✓ PASS: SSL/TLS functionality is working")
        return True
    except Exception as e:
        print(f"\n✗ FAIL: SSL functionality test failed - {e}")
        return False
def test_pyopenssl_crypto_integration():
    """Test that pyOpenSSL and cryptography integration works."""
    print("\n" + "=" * 70)
    print("TEST: pyOpenSSL <-> cryptography Integration")
    print("=" * 70)
    try:
        from OpenSSL import crypto

        # Key generation exercises the pyOpenSSL -> cryptography code path.
        rsa_key = crypto.PKey()
        rsa_key.generate_key(crypto.TYPE_RSA, 2048)
        print("\n✓ Generated RSA key pair successfully")
        print(" ✓ PASS: pyOpenSSL and cryptography are properly integrated")
        return True
    except Exception as e:
        print(f"\n✗ FAIL: Integration test failed - {e}")
        import traceback
        traceback.print_exc()
        return False
def main():
    """Run all security tests."""
    print("\n")
    print("" + "=" * 68 + "")
    print("║ pyOpenSSL Security Fix Verification - Issue #1545 ║")
    print("" + "=" * 68 + "")
    print("\nVerifying that the pyOpenSSL update resolves the security vulnerability")
    print("in the cryptography package (CVE: versions >=37.0.0 & <43.0.1)\n")

    # (label, passed) pairs; each check runs immediately, in order.
    outcomes = [
        ("Package Versions", test_package_versions()),
        ("SSL Functionality", test_ssl_basic_functionality()),
        ("pyOpenSSL-crypto Integration", test_pyopenssl_crypto_integration()),
    ]

    print("\n" + "=" * 70)
    print("TEST SUMMARY")
    print("=" * 70)

    overall = True
    for label, passed in outcomes:
        print(f"{'✓ PASS' if passed else '✗ FAIL'}: {label}")
        overall = overall and passed
    print("=" * 70)

    if overall:
        print("\n✓✓✓ ALL TESTS PASSED ✓✓✓")
        print("✓ Security vulnerability is resolved")
        print("✓ pyOpenSSL >= 25.3.0 is working correctly")
        print("✓ cryptography >= 45.0.7 (not vulnerable)")
        print("\nThe dependency update is safe to merge.\n")
        return True
    print("\n✗✗✗ SOME TESTS FAILED ✗✗✗")
    print("✗ Security requirements not met")
    print("\nDo NOT merge until all tests pass.\n")
    return False
if __name__ == "__main__":
    try:
        # Exit code mirrors the overall result: 0 = all checks passed.
        success = main()
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        print("\n\nTest interrupted by user")
        sys.exit(1)
    except Exception as e:
        # Any unexpected error is reported with a traceback and a failing exit.
        print(f"\n✗ Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)

View File

@@ -1,184 +0,0 @@
"""
Test script to verify pyOpenSSL update doesn't break crawl4ai functionality.
This test verifies:
1. pyOpenSSL and cryptography versions are correct and secure
2. Basic crawling functionality still works
3. HTTPS/SSL connections work properly
4. Stealth mode integration works (uses playwright-stealth internally)
Issue: #1545 - Security vulnerability in cryptography package
Fix: Updated pyOpenSSL from >=24.3.0 to >=25.3.0
Expected: cryptography package should be >=45.0.7 (above vulnerable range)
"""
import asyncio
import sys
from packaging import version
def check_versions():
    """Verify pyOpenSSL and cryptography versions meet security requirements."""
    print("=" * 60)
    print("STEP 1: Checking Package Versions")
    print("=" * 60)

    # pyOpenSSL must be at least 25.3.0 (the security fix for issue #1545).
    try:
        import OpenSSL
    except ImportError as e:
        print(f"✗ Failed to import pyOpenSSL: {e}")
        return False
    pyopenssl_version = OpenSSL.__version__
    print(f"✓ pyOpenSSL version: {pyopenssl_version}")
    if version.parse(pyopenssl_version) >= version.parse("25.3.0"):
        print(f" ✓ Version check passed: {pyopenssl_version} >= 25.3.0")
    else:
        print(f" ✗ Version check FAILED: {pyopenssl_version} < 25.3.0")
        return False

    # cryptography must be >= 45.0.7, i.e. above the vulnerable <43.0.1 range.
    try:
        import cryptography
    except ImportError as e:
        print(f"✗ Failed to import cryptography: {e}")
        return False
    crypto_version = cryptography.__version__
    print(f"✓ cryptography version: {crypto_version}")
    if version.parse(crypto_version) >= version.parse("45.0.7"):
        print(f" ✓ Security check passed: {crypto_version} >= 45.0.7 (not vulnerable)")
    else:
        print(f" ✗ Security check FAILED: {crypto_version} < 45.0.7 (potentially vulnerable)")
        return False

    print("\n✓ All version checks passed!\n")
    return True
async def test_basic_crawl():
    """Test basic crawling functionality with HTTPS site."""
    print("=" * 60)
    print("STEP 2: Testing Basic HTTPS Crawling")
    print("=" * 60)
    try:
        from crawl4ai import AsyncWebCrawler

        async with AsyncWebCrawler(verbose=True) as crawler:
            # example.com is served over HTTPS, so a successful fetch
            # exercises the SSL/TLS stack end to end.
            print("Crawling example.com (HTTPS)...")
            result = await crawler.arun(
                url="https://www.example.com",
                bypass_cache=True
            )
            if not result.success:
                print(f"✗ Crawl failed: {result.error_message}")
                return False
            print(f"✓ Crawl successful!")
            print(f" - Status code: {result.status_code}")
            print(f" - Content length: {len(result.html)} bytes")
            print(f" - SSL/TLS connection: ✓ Working")
            return True
    except Exception as e:
        print(f"✗ Test failed with error: {e}")
        import traceback
        traceback.print_exc()
        return False
async def test_stealth_mode():
    """Test stealth mode functionality (depends on playwright-stealth)."""
    print("\n" + "=" * 60)
    print("STEP 3: Testing Stealth Mode Integration")
    print("=" * 60)
    try:
        from crawl4ai import AsyncWebCrawler, BrowserConfig

        # Headless browser configuration; stealth patches are applied
        # internally by the crawler.
        browser_config = BrowserConfig(
            headless=True,
            verbose=False
        )

        async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
            print("Crawling with stealth mode enabled...")
            result = await crawler.arun(
                url="https://www.example.com",
                bypass_cache=True
            )
            if not result.success:
                print(f"✗ Stealth crawl failed: {result.error_message}")
                return False
            print(f"✓ Stealth crawl successful!")
            print(f" - Stealth mode: ✓ Working")
            return True
    except Exception as e:
        print(f"✗ Stealth test failed with error: {e}")
        import traceback
        traceback.print_exc()
        return False
async def main():
    """Run all tests."""
    print("\n")
    print("" + "=" * 58 + "")
    print("║ pyOpenSSL Security Update Verification Test (Issue #1545) ║")
    print("" + "=" * 58 + "")
    print("\n")

    # Steps run in order; the first failure short-circuits the suite.
    if not check_versions():
        print("\n✗ FAILED: Version requirements not met")
        return False
    if not await test_basic_crawl():
        print("\n✗ FAILED: Basic crawling test failed")
        return False
    if not await test_stealth_mode():
        print("\n✗ FAILED: Stealth mode test failed")
        return False

    # All tests passed
    print("\n" + "=" * 60)
    print("FINAL RESULT")
    print("=" * 60)
    print("✓ All tests passed successfully!")
    print("✓ pyOpenSSL update is working correctly")
    print("✓ No breaking changes detected")
    print("✓ Security vulnerability resolved")
    print("=" * 60)
    print("\n")
    return True
if __name__ == "__main__":
    try:
        # Run the async suite; exit 0 only when every step passed.
        success = asyncio.run(main())
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        print("\n\nTest interrupted by user")
        sys.exit(1)
    except Exception as e:
        # Unexpected failures surface with a traceback and a non-zero exit.
        print(f"\n✗ Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)