Compare commits
1 Commits
fix/cdp
...
release/v0
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
ca100c6518 |
Submodule .yoyo/snapshot deleted from 5e783b71e7
@@ -1,23 +1,22 @@
|
|||||||
import asyncio
|
import asyncio
|
||||||
import hashlib
|
import time
|
||||||
|
from typing import List, Optional
|
||||||
import os
|
import os
|
||||||
import shlex
|
import sys
|
||||||
import shutil
|
import shutil
|
||||||
|
import tempfile
|
||||||
|
import psutil
|
||||||
import signal
|
import signal
|
||||||
import subprocess
|
import subprocess
|
||||||
import sys
|
import shlex
|
||||||
import tempfile
|
|
||||||
import time
|
|
||||||
import warnings
|
|
||||||
from typing import List, Optional
|
|
||||||
|
|
||||||
import psutil
|
|
||||||
from playwright.async_api import BrowserContext
|
from playwright.async_api import BrowserContext
|
||||||
|
import hashlib
|
||||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
|
||||||
from .config import DOWNLOAD_PAGE_TIMEOUT
|
|
||||||
from .js_snippet import load_js_script
|
from .js_snippet import load_js_script
|
||||||
|
from .config import DOWNLOAD_PAGE_TIMEOUT
|
||||||
|
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||||
from .utils import get_chromium_path
|
from .utils import get_chromium_path
|
||||||
|
import warnings
|
||||||
|
|
||||||
|
|
||||||
BROWSER_DISABLE_OPTIONS = [
|
BROWSER_DISABLE_OPTIONS = [
|
||||||
"--disable-background-networking",
|
"--disable-background-networking",
|
||||||
@@ -66,7 +65,7 @@ class ManagedBrowser:
|
|||||||
_cleanup(): Terminates the browser process and removes the temporary directory.
|
_cleanup(): Terminates the browser process and removes the temporary directory.
|
||||||
create_profile(): Static method to create a user profile by launching a browser for user interaction.
|
create_profile(): Static method to create a user profile by launching a browser for user interaction.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def build_browser_flags(config: BrowserConfig) -> List[str]:
|
def build_browser_flags(config: BrowserConfig) -> List[str]:
|
||||||
"""Common CLI flags for launching Chromium"""
|
"""Common CLI flags for launching Chromium"""
|
||||||
@@ -93,25 +92,21 @@ class ManagedBrowser:
|
|||||||
if config.light_mode:
|
if config.light_mode:
|
||||||
flags.extend(BROWSER_DISABLE_OPTIONS)
|
flags.extend(BROWSER_DISABLE_OPTIONS)
|
||||||
if config.text_mode:
|
if config.text_mode:
|
||||||
flags.extend(
|
flags.extend([
|
||||||
[
|
"--blink-settings=imagesEnabled=false",
|
||||||
"--blink-settings=imagesEnabled=false",
|
"--disable-remote-fonts",
|
||||||
"--disable-remote-fonts",
|
"--disable-images",
|
||||||
"--disable-images",
|
"--disable-javascript",
|
||||||
"--disable-javascript",
|
"--disable-software-rasterizer",
|
||||||
"--disable-software-rasterizer",
|
"--disable-dev-shm-usage",
|
||||||
"--disable-dev-shm-usage",
|
])
|
||||||
]
|
|
||||||
)
|
|
||||||
# proxy support
|
# proxy support
|
||||||
if config.proxy:
|
if config.proxy:
|
||||||
flags.append(f"--proxy-server={config.proxy}")
|
flags.append(f"--proxy-server={config.proxy}")
|
||||||
elif config.proxy_config:
|
elif config.proxy_config:
|
||||||
creds = ""
|
creds = ""
|
||||||
if config.proxy_config.username and config.proxy_config.password:
|
if config.proxy_config.username and config.proxy_config.password:
|
||||||
creds = (
|
creds = f"{config.proxy_config.username}:{config.proxy_config.password}@"
|
||||||
f"{config.proxy_config.username}:{config.proxy_config.password}@"
|
|
||||||
)
|
|
||||||
flags.append(f"--proxy-server={creds}{config.proxy_config.server}")
|
flags.append(f"--proxy-server={creds}{config.proxy_config.server}")
|
||||||
# dedupe
|
# dedupe
|
||||||
return list(dict.fromkeys(flags))
|
return list(dict.fromkeys(flags))
|
||||||
@@ -132,7 +127,7 @@ class ManagedBrowser:
|
|||||||
logger=None,
|
logger=None,
|
||||||
host: str = "localhost",
|
host: str = "localhost",
|
||||||
debugging_port: int = 9222,
|
debugging_port: int = 9222,
|
||||||
cdp_url: Optional[str] = None,
|
cdp_url: Optional[str] = None,
|
||||||
browser_config: Optional[BrowserConfig] = None,
|
browser_config: Optional[BrowserConfig] = None,
|
||||||
):
|
):
|
||||||
"""
|
"""
|
||||||
@@ -168,7 +163,7 @@ class ManagedBrowser:
|
|||||||
Starts the browser process or returns CDP endpoint URL.
|
Starts the browser process or returns CDP endpoint URL.
|
||||||
If cdp_url is provided, returns it directly.
|
If cdp_url is provided, returns it directly.
|
||||||
If user_data_dir is not provided for local browser, creates a temporary directory.
|
If user_data_dir is not provided for local browser, creates a temporary directory.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
str: CDP endpoint URL
|
str: CDP endpoint URL
|
||||||
"""
|
"""
|
||||||
@@ -184,9 +179,10 @@ class ManagedBrowser:
|
|||||||
# Get browser path and args based on OS and browser type
|
# Get browser path and args based on OS and browser type
|
||||||
# browser_path = self._get_browser_path()
|
# browser_path = self._get_browser_path()
|
||||||
args = await self._get_browser_args()
|
args = await self._get_browser_args()
|
||||||
|
|
||||||
if self.browser_config.extra_args:
|
if self.browser_config.extra_args:
|
||||||
args.extend(self.browser_config.extra_args)
|
args.extend(self.browser_config.extra_args)
|
||||||
|
|
||||||
|
|
||||||
# ── make sure no old Chromium instance is owning the same port/profile ──
|
# ── make sure no old Chromium instance is owning the same port/profile ──
|
||||||
try:
|
try:
|
||||||
@@ -204,9 +200,7 @@ class ManagedBrowser:
|
|||||||
else: # macOS / Linux
|
else: # macOS / Linux
|
||||||
# kill any process listening on the same debugging port
|
# kill any process listening on the same debugging port
|
||||||
pids = (
|
pids = (
|
||||||
subprocess.check_output(
|
subprocess.check_output(shlex.split(f"lsof -t -i:{self.debugging_port}"))
|
||||||
shlex.split(f"lsof -t -i:{self.debugging_port}")
|
|
||||||
)
|
|
||||||
.decode()
|
.decode()
|
||||||
.strip()
|
.strip()
|
||||||
.splitlines()
|
.splitlines()
|
||||||
@@ -225,7 +219,8 @@ class ManagedBrowser:
|
|||||||
os.remove(fp)
|
os.remove(fp)
|
||||||
except Exception as _e:
|
except Exception as _e:
|
||||||
# non-fatal — we'll try to start anyway, but log what happened
|
# non-fatal — we'll try to start anyway, but log what happened
|
||||||
self.logger.warning(f"pre-launch cleanup failed: {_e}", tag="BROWSER")
|
self.logger.warning(f"pre-launch cleanup failed: {_e}", tag="BROWSER")
|
||||||
|
|
||||||
|
|
||||||
# Start browser process
|
# Start browser process
|
||||||
try:
|
try:
|
||||||
@@ -233,26 +228,26 @@ class ManagedBrowser:
|
|||||||
# On Unix, we'll use preexec_fn=os.setpgrp to start the process in a new process group
|
# On Unix, we'll use preexec_fn=os.setpgrp to start the process in a new process group
|
||||||
if sys.platform == "win32":
|
if sys.platform == "win32":
|
||||||
self.browser_process = subprocess.Popen(
|
self.browser_process = subprocess.Popen(
|
||||||
args,
|
args,
|
||||||
stdout=subprocess.PIPE,
|
stdout=subprocess.PIPE,
|
||||||
stderr=subprocess.PIPE,
|
stderr=subprocess.PIPE,
|
||||||
creationflags=subprocess.DETACHED_PROCESS
|
creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP
|
||||||
| subprocess.CREATE_NEW_PROCESS_GROUP,
|
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
self.browser_process = subprocess.Popen(
|
self.browser_process = subprocess.Popen(
|
||||||
args,
|
args,
|
||||||
stdout=subprocess.PIPE,
|
stdout=subprocess.PIPE,
|
||||||
stderr=subprocess.PIPE,
|
stderr=subprocess.PIPE,
|
||||||
preexec_fn=os.setpgrp, # Start in a new process group
|
preexec_fn=os.setpgrp # Start in a new process group
|
||||||
)
|
)
|
||||||
|
|
||||||
# If verbose is True print args used to run the process
|
# If verbose is True print args used to run the process
|
||||||
if self.logger and self.browser_config.verbose:
|
if self.logger and self.browser_config.verbose:
|
||||||
self.logger.debug(
|
self.logger.debug(
|
||||||
f"Starting browser with args: {' '.join(args)}", tag="BROWSER"
|
f"Starting browser with args: {' '.join(args)}",
|
||||||
)
|
tag="BROWSER"
|
||||||
|
)
|
||||||
|
|
||||||
# We'll monitor for a short time to make sure it starts properly, but won't keep monitoring
|
# We'll monitor for a short time to make sure it starts properly, but won't keep monitoring
|
||||||
await asyncio.sleep(0.5) # Give browser time to start
|
await asyncio.sleep(0.5) # Give browser time to start
|
||||||
await self._initial_startup_check()
|
await self._initial_startup_check()
|
||||||
@@ -269,7 +264,7 @@ class ManagedBrowser:
|
|||||||
"""
|
"""
|
||||||
if not self.browser_process:
|
if not self.browser_process:
|
||||||
return
|
return
|
||||||
|
|
||||||
# Check that process started without immediate termination
|
# Check that process started without immediate termination
|
||||||
await asyncio.sleep(0.5)
|
await asyncio.sleep(0.5)
|
||||||
if self.browser_process.poll() is not None:
|
if self.browser_process.poll() is not None:
|
||||||
@@ -279,7 +274,7 @@ class ManagedBrowser:
|
|||||||
stdout, stderr = self.browser_process.communicate(timeout=0.5)
|
stdout, stderr = self.browser_process.communicate(timeout=0.5)
|
||||||
except subprocess.TimeoutExpired:
|
except subprocess.TimeoutExpired:
|
||||||
pass
|
pass
|
||||||
|
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
message="Browser process terminated during startup | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
|
message="Browser process terminated during startup | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
|
||||||
tag="ERROR",
|
tag="ERROR",
|
||||||
@@ -289,7 +284,7 @@ class ManagedBrowser:
|
|||||||
"stderr": stderr.decode() if stderr else "",
|
"stderr": stderr.decode() if stderr else "",
|
||||||
},
|
},
|
||||||
)
|
)
|
||||||
|
|
||||||
async def _monitor_browser_process(self):
|
async def _monitor_browser_process(self):
|
||||||
"""
|
"""
|
||||||
Monitor the browser process for unexpected termination.
|
Monitor the browser process for unexpected termination.
|
||||||
@@ -412,14 +407,7 @@ class ManagedBrowser:
|
|||||||
if sys.platform == "win32":
|
if sys.platform == "win32":
|
||||||
# On Windows we might need taskkill for detached processes
|
# On Windows we might need taskkill for detached processes
|
||||||
try:
|
try:
|
||||||
subprocess.run(
|
subprocess.run(["taskkill", "/F", "/PID", str(self.browser_process.pid)])
|
||||||
[
|
|
||||||
"taskkill",
|
|
||||||
"/F",
|
|
||||||
"/PID",
|
|
||||||
str(self.browser_process.pid),
|
|
||||||
]
|
|
||||||
)
|
|
||||||
except Exception:
|
except Exception:
|
||||||
self.browser_process.kill()
|
self.browser_process.kill()
|
||||||
else:
|
else:
|
||||||
@@ -429,7 +417,7 @@ class ManagedBrowser:
|
|||||||
except Exception as e:
|
except Exception as e:
|
||||||
self.logger.error(
|
self.logger.error(
|
||||||
message="Error terminating browser: {error}",
|
message="Error terminating browser: {error}",
|
||||||
tag="ERROR",
|
tag="ERROR",
|
||||||
params={"error": str(e)},
|
params={"error": str(e)},
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -442,77 +430,75 @@ class ManagedBrowser:
|
|||||||
tag="ERROR",
|
tag="ERROR",
|
||||||
params={"error": str(e)},
|
params={"error": str(e)},
|
||||||
)
|
)
|
||||||
|
|
||||||
# These methods have been moved to BrowserProfiler class
|
# These methods have been moved to BrowserProfiler class
|
||||||
@staticmethod
|
@staticmethod
|
||||||
async def create_profile(browser_config=None, profile_name=None, logger=None):
|
async def create_profile(browser_config=None, profile_name=None, logger=None):
|
||||||
"""
|
"""
|
||||||
This method has been moved to the BrowserProfiler class.
|
This method has been moved to the BrowserProfiler class.
|
||||||
|
|
||||||
Creates a browser profile by launching a browser for interactive user setup
|
Creates a browser profile by launching a browser for interactive user setup
|
||||||
and waits until the user closes it. The profile is stored in a directory that
|
and waits until the user closes it. The profile is stored in a directory that
|
||||||
can be used later with BrowserConfig.user_data_dir.
|
can be used later with BrowserConfig.user_data_dir.
|
||||||
|
|
||||||
Please use BrowserProfiler.create_profile() instead.
|
Please use BrowserProfiler.create_profile() instead.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
```python
|
```python
|
||||||
from crawl4ai.browser_profiler import BrowserProfiler
|
from crawl4ai.browser_profiler import BrowserProfiler
|
||||||
|
|
||||||
profiler = BrowserProfiler()
|
profiler = BrowserProfiler()
|
||||||
profile_path = await profiler.create_profile(profile_name="my-login-profile")
|
profile_path = await profiler.create_profile(profile_name="my-login-profile")
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
from .browser_profiler import BrowserProfiler
|
from .browser_profiler import BrowserProfiler
|
||||||
|
|
||||||
# Create a BrowserProfiler instance and delegate to it
|
# Create a BrowserProfiler instance and delegate to it
|
||||||
profiler = BrowserProfiler(logger=logger)
|
profiler = BrowserProfiler(logger=logger)
|
||||||
return await profiler.create_profile(
|
return await profiler.create_profile(profile_name=profile_name, browser_config=browser_config)
|
||||||
profile_name=profile_name, browser_config=browser_config
|
|
||||||
)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def list_profiles():
|
def list_profiles():
|
||||||
"""
|
"""
|
||||||
This method has been moved to the BrowserProfiler class.
|
This method has been moved to the BrowserProfiler class.
|
||||||
|
|
||||||
Lists all available browser profiles in the Crawl4AI profiles directory.
|
Lists all available browser profiles in the Crawl4AI profiles directory.
|
||||||
|
|
||||||
Please use BrowserProfiler.list_profiles() instead.
|
Please use BrowserProfiler.list_profiles() instead.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
```python
|
```python
|
||||||
from crawl4ai.browser_profiler import BrowserProfiler
|
from crawl4ai.browser_profiler import BrowserProfiler
|
||||||
|
|
||||||
profiler = BrowserProfiler()
|
profiler = BrowserProfiler()
|
||||||
profiles = profiler.list_profiles()
|
profiles = profiler.list_profiles()
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
from .browser_profiler import BrowserProfiler
|
from .browser_profiler import BrowserProfiler
|
||||||
|
|
||||||
# Create a BrowserProfiler instance and delegate to it
|
# Create a BrowserProfiler instance and delegate to it
|
||||||
profiler = BrowserProfiler()
|
profiler = BrowserProfiler()
|
||||||
return profiler.list_profiles()
|
return profiler.list_profiles()
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def delete_profile(profile_name_or_path):
|
def delete_profile(profile_name_or_path):
|
||||||
"""
|
"""
|
||||||
This method has been moved to the BrowserProfiler class.
|
This method has been moved to the BrowserProfiler class.
|
||||||
|
|
||||||
Delete a browser profile by name or path.
|
Delete a browser profile by name or path.
|
||||||
|
|
||||||
Please use BrowserProfiler.delete_profile() instead.
|
Please use BrowserProfiler.delete_profile() instead.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
```python
|
```python
|
||||||
from crawl4ai.browser_profiler import BrowserProfiler
|
from crawl4ai.browser_profiler import BrowserProfiler
|
||||||
|
|
||||||
profiler = BrowserProfiler()
|
profiler = BrowserProfiler()
|
||||||
success = profiler.delete_profile("my-profile")
|
success = profiler.delete_profile("my-profile")
|
||||||
```
|
```
|
||||||
"""
|
"""
|
||||||
from .browser_profiler import BrowserProfiler
|
from .browser_profiler import BrowserProfiler
|
||||||
|
|
||||||
# Create a BrowserProfiler instance and delegate to it
|
# Create a BrowserProfiler instance and delegate to it
|
||||||
profiler = BrowserProfiler()
|
profiler = BrowserProfiler()
|
||||||
return profiler.delete_profile(profile_name_or_path)
|
return profiler.delete_profile(profile_name_or_path)
|
||||||
@@ -565,10 +551,11 @@ async def clone_runtime_state(
|
|||||||
"accuracy": crawlerRunConfig.geolocation.accuracy,
|
"accuracy": crawlerRunConfig.geolocation.accuracy,
|
||||||
}
|
}
|
||||||
)
|
)
|
||||||
|
|
||||||
return dst
|
return dst
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
class BrowserManager:
|
class BrowserManager:
|
||||||
"""
|
"""
|
||||||
Manages the browser instance and context.
|
Manages the browser instance and context.
|
||||||
@@ -585,7 +572,7 @@ class BrowserManager:
|
|||||||
"""
|
"""
|
||||||
|
|
||||||
_playwright_instance = None
|
_playwright_instance = None
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
async def get_playwright(cls, use_undetected: bool = False):
|
async def get_playwright(cls, use_undetected: bool = False):
|
||||||
if use_undetected:
|
if use_undetected:
|
||||||
@@ -593,11 +580,9 @@ class BrowserManager:
|
|||||||
else:
|
else:
|
||||||
from playwright.async_api import async_playwright
|
from playwright.async_api import async_playwright
|
||||||
cls._playwright_instance = await async_playwright().start()
|
cls._playwright_instance = await async_playwright().start()
|
||||||
return cls._playwright_instance
|
return cls._playwright_instance
|
||||||
|
|
||||||
def __init__(
|
def __init__(self, browser_config: BrowserConfig, logger=None, use_undetected: bool = False):
|
||||||
self, browser_config: BrowserConfig, logger=None, use_undetected: bool = False
|
|
||||||
):
|
|
||||||
"""
|
"""
|
||||||
Initialize the BrowserManager with a browser configuration.
|
Initialize the BrowserManager with a browser configuration.
|
||||||
|
|
||||||
@@ -623,17 +608,16 @@ class BrowserManager:
|
|||||||
# Keep track of contexts by a "config signature," so each unique config reuses a single context
|
# Keep track of contexts by a "config signature," so each unique config reuses a single context
|
||||||
self.contexts_by_config = {}
|
self.contexts_by_config = {}
|
||||||
self._contexts_lock = asyncio.Lock()
|
self._contexts_lock = asyncio.Lock()
|
||||||
|
|
||||||
# Serialize context.new_page() across concurrent tasks to avoid races
|
# Serialize context.new_page() across concurrent tasks to avoid races
|
||||||
# when using a shared persistent context (context.pages may be empty
|
# when using a shared persistent context (context.pages may be empty
|
||||||
# for all racers). Prevents 'Target page/context closed' errors.
|
# for all racers). Prevents 'Target page/context closed' errors.
|
||||||
self._page_lock = asyncio.Lock()
|
self._page_lock = asyncio.Lock()
|
||||||
|
|
||||||
# Stealth adapter for stealth mode
|
# Stealth adapter for stealth mode
|
||||||
self._stealth_adapter = None
|
self._stealth_adapter = None
|
||||||
if self.config.enable_stealth and not self.use_undetected:
|
if self.config.enable_stealth and not self.use_undetected:
|
||||||
from .browser_adapter import StealthAdapter
|
from .browser_adapter import StealthAdapter
|
||||||
|
|
||||||
self._stealth_adapter = StealthAdapter()
|
self._stealth_adapter = StealthAdapter()
|
||||||
|
|
||||||
# Initialize ManagedBrowser if needed
|
# Initialize ManagedBrowser if needed
|
||||||
@@ -662,7 +646,7 @@ class BrowserManager:
|
|||||||
"""
|
"""
|
||||||
if self.playwright is not None:
|
if self.playwright is not None:
|
||||||
await self.close()
|
await self.close()
|
||||||
|
|
||||||
if self.use_undetected:
|
if self.use_undetected:
|
||||||
from patchright.async_api import async_playwright
|
from patchright.async_api import async_playwright
|
||||||
else:
|
else:
|
||||||
@@ -673,11 +657,7 @@ class BrowserManager:
|
|||||||
|
|
||||||
if self.config.cdp_url or self.config.use_managed_browser:
|
if self.config.cdp_url or self.config.use_managed_browser:
|
||||||
self.config.use_managed_browser = True
|
self.config.use_managed_browser = True
|
||||||
cdp_url = (
|
cdp_url = await self.managed_browser.start() if not self.config.cdp_url else self.config.cdp_url
|
||||||
await self.managed_browser.start()
|
|
||||||
if not self.config.cdp_url
|
|
||||||
else self.config.cdp_url
|
|
||||||
)
|
|
||||||
self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
|
self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
|
||||||
contexts = self.browser.contexts
|
contexts = self.browser.contexts
|
||||||
if contexts:
|
if contexts:
|
||||||
@@ -698,6 +678,7 @@ class BrowserManager:
|
|||||||
|
|
||||||
self.default_context = self.browser
|
self.default_context = self.browser
|
||||||
|
|
||||||
|
|
||||||
def _build_browser_args(self) -> dict:
|
def _build_browser_args(self) -> dict:
|
||||||
"""Build browser launch arguments from config."""
|
"""Build browser launch arguments from config."""
|
||||||
args = [
|
args = [
|
||||||
@@ -743,7 +724,7 @@ class BrowserManager:
|
|||||||
|
|
||||||
# Deduplicate args
|
# Deduplicate args
|
||||||
args = list(dict.fromkeys(args))
|
args = list(dict.fromkeys(args))
|
||||||
|
|
||||||
browser_args = {"headless": self.config.headless, "args": args}
|
browser_args = {"headless": self.config.headless, "args": args}
|
||||||
|
|
||||||
if self.config.chrome_channel:
|
if self.config.chrome_channel:
|
||||||
@@ -820,9 +801,9 @@ class BrowserManager:
|
|||||||
context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT)
|
context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT)
|
||||||
if self.config.downloads_path:
|
if self.config.downloads_path:
|
||||||
context._impl_obj._options["accept_downloads"] = True
|
context._impl_obj._options["accept_downloads"] = True
|
||||||
context._impl_obj._options["downloads_path"] = (
|
context._impl_obj._options[
|
||||||
self.config.downloads_path
|
"downloads_path"
|
||||||
)
|
] = self.config.downloads_path
|
||||||
|
|
||||||
# Handle user agent and browser hints
|
# Handle user agent and browser hints
|
||||||
if self.config.user_agent:
|
if self.config.user_agent:
|
||||||
@@ -853,7 +834,7 @@ class BrowserManager:
|
|||||||
or crawlerRunConfig.simulate_user
|
or crawlerRunConfig.simulate_user
|
||||||
or crawlerRunConfig.magic
|
or crawlerRunConfig.magic
|
||||||
):
|
):
|
||||||
await context.add_init_script(load_js_script("navigator_overrider"))
|
await context.add_init_script(load_js_script("navigator_overrider"))
|
||||||
|
|
||||||
async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None):
|
async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None):
|
||||||
"""
|
"""
|
||||||
@@ -864,7 +845,7 @@ class BrowserManager:
|
|||||||
Context: Browser context object with the specified configurations
|
Context: Browser context object with the specified configurations
|
||||||
"""
|
"""
|
||||||
# Base settings
|
# Base settings
|
||||||
user_agent = self.config.headers.get("User-Agent", self.config.user_agent)
|
user_agent = self.config.headers.get("User-Agent", self.config.user_agent)
|
||||||
viewport_settings = {
|
viewport_settings = {
|
||||||
"width": self.config.viewport_width,
|
"width": self.config.viewport_width,
|
||||||
"height": self.config.viewport_height,
|
"height": self.config.viewport_height,
|
||||||
@@ -937,7 +918,7 @@ class BrowserManager:
|
|||||||
"device_scale_factor": 1.0,
|
"device_scale_factor": 1.0,
|
||||||
"java_script_enabled": self.config.java_script_enabled,
|
"java_script_enabled": self.config.java_script_enabled,
|
||||||
}
|
}
|
||||||
|
|
||||||
if crawlerRunConfig:
|
if crawlerRunConfig:
|
||||||
# Check if there is value for crawlerRunConfig.proxy_config set add that to context
|
# Check if there is value for crawlerRunConfig.proxy_config set add that to context
|
||||||
if crawlerRunConfig.proxy_config:
|
if crawlerRunConfig.proxy_config:
|
||||||
@@ -945,12 +926,10 @@ class BrowserManager:
|
|||||||
"server": crawlerRunConfig.proxy_config.server,
|
"server": crawlerRunConfig.proxy_config.server,
|
||||||
}
|
}
|
||||||
if crawlerRunConfig.proxy_config.username:
|
if crawlerRunConfig.proxy_config.username:
|
||||||
proxy_settings.update(
|
proxy_settings.update({
|
||||||
{
|
"username": crawlerRunConfig.proxy_config.username,
|
||||||
"username": crawlerRunConfig.proxy_config.username,
|
"password": crawlerRunConfig.proxy_config.password,
|
||||||
"password": crawlerRunConfig.proxy_config.password,
|
})
|
||||||
}
|
|
||||||
)
|
|
||||||
context_settings["proxy"] = proxy_settings
|
context_settings["proxy"] = proxy_settings
|
||||||
|
|
||||||
if self.config.text_mode:
|
if self.config.text_mode:
|
||||||
@@ -1008,12 +987,12 @@ class BrowserManager:
|
|||||||
"cache_mode",
|
"cache_mode",
|
||||||
"content_filter",
|
"content_filter",
|
||||||
"semaphore_count",
|
"semaphore_count",
|
||||||
"url",
|
"url"
|
||||||
]
|
]
|
||||||
|
|
||||||
# Do NOT exclude locale, timezone_id, or geolocation as these DO affect browser context
|
# Do NOT exclude locale, timezone_id, or geolocation as these DO affect browser context
|
||||||
# and should cause a new context to be created if they change
|
# and should cause a new context to be created if they change
|
||||||
|
|
||||||
for key in ephemeral_keys:
|
for key in ephemeral_keys:
|
||||||
if key in config_dict:
|
if key in config_dict:
|
||||||
del config_dict[key]
|
del config_dict[key]
|
||||||
@@ -1034,7 +1013,7 @@ class BrowserManager:
|
|||||||
self.logger.warning(
|
self.logger.warning(
|
||||||
message="Failed to apply stealth to page: {error}",
|
message="Failed to apply stealth to page: {error}",
|
||||||
tag="STEALTH",
|
tag="STEALTH",
|
||||||
params={"error": str(e)},
|
params={"error": str(e)}
|
||||||
)
|
)
|
||||||
|
|
||||||
async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
|
async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
|
||||||
@@ -1060,10 +1039,8 @@ class BrowserManager:
|
|||||||
if self.config.use_managed_browser:
|
if self.config.use_managed_browser:
|
||||||
if self.config.storage_state:
|
if self.config.storage_state:
|
||||||
context = await self.create_browser_context(crawlerRunConfig)
|
context = await self.create_browser_context(crawlerRunConfig)
|
||||||
ctx = self.default_context # default context, one window only
|
ctx = self.default_context # default context, one window only
|
||||||
ctx = await clone_runtime_state(
|
ctx = await clone_runtime_state(context, ctx, crawlerRunConfig, self.config)
|
||||||
context, ctx, crawlerRunConfig, self.config
|
|
||||||
)
|
|
||||||
# Avoid concurrent new_page on shared persistent context
|
# Avoid concurrent new_page on shared persistent context
|
||||||
# See GH-1198: context.pages can be empty under races
|
# See GH-1198: context.pages can be empty under races
|
||||||
async with self._page_lock:
|
async with self._page_lock:
|
||||||
@@ -1075,21 +1052,14 @@ class BrowserManager:
|
|||||||
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
|
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
|
||||||
if not page:
|
if not page:
|
||||||
if pages:
|
if pages:
|
||||||
# FIX: Always create a new page for managed browsers to support concurrent crawling
|
page = pages[0]
|
||||||
# Previously: page = pages[0]
|
|
||||||
async with self._page_lock:
|
|
||||||
page = await context.new_page()
|
|
||||||
await self._apply_stealth_to_page(page)
|
|
||||||
else:
|
else:
|
||||||
# Double-check under lock to avoid TOCTOU and ensure only
|
# Double-check under lock to avoid TOCTOU and ensure only
|
||||||
# one task calls new_page when pages=[] concurrently
|
# one task calls new_page when pages=[] concurrently
|
||||||
async with self._page_lock:
|
async with self._page_lock:
|
||||||
pages = context.pages
|
pages = context.pages
|
||||||
if pages:
|
if pages:
|
||||||
# FIX: Always create a new page for managed browsers to support concurrent crawling
|
page = pages[0]
|
||||||
# Previously: page = pages[0]
|
|
||||||
page = await context.new_page()
|
|
||||||
await self._apply_stealth_to_page(page)
|
|
||||||
else:
|
else:
|
||||||
page = await context.new_page()
|
page = await context.new_page()
|
||||||
await self._apply_stealth_to_page(page)
|
await self._apply_stealth_to_page(page)
|
||||||
@@ -1145,7 +1115,7 @@ class BrowserManager:
|
|||||||
"""Close all browser resources and clean up."""
|
"""Close all browser resources and clean up."""
|
||||||
if self.config.cdp_url:
|
if self.config.cdp_url:
|
||||||
return
|
return
|
||||||
|
|
||||||
if self.config.sleep_on_close:
|
if self.config.sleep_on_close:
|
||||||
await asyncio.sleep(0.5)
|
await asyncio.sleep(0.5)
|
||||||
|
|
||||||
@@ -1161,7 +1131,7 @@ class BrowserManager:
|
|||||||
self.logger.error(
|
self.logger.error(
|
||||||
message="Error closing context: {error}",
|
message="Error closing context: {error}",
|
||||||
tag="ERROR",
|
tag="ERROR",
|
||||||
params={"error": str(e)},
|
params={"error": str(e)}
|
||||||
)
|
)
|
||||||
self.contexts_by_config.clear()
|
self.contexts_by_config.clear()
|
||||||
|
|
||||||
|
|||||||
@@ -785,54 +785,6 @@ curl http://localhost:11235/crawl/job/crawl_xyz
|
|||||||
|
|
||||||
The response includes `status` field: `"processing"`, `"completed"`, or `"failed"`.
|
The response includes `status` field: `"processing"`, `"completed"`, or `"failed"`.
|
||||||
|
|
||||||
#### LLM Extraction Jobs with Webhooks
|
|
||||||
|
|
||||||
The same webhook system works for LLM extraction jobs via `/llm/job`:
|
|
||||||
|
|
||||||
```bash
|
|
||||||
# Submit LLM extraction job with webhook
|
|
||||||
curl -X POST http://localhost:11235/llm/job \
|
|
||||||
-H "Content-Type: application/json" \
|
|
||||||
-d '{
|
|
||||||
"url": "https://example.com/article",
|
|
||||||
"q": "Extract the article title, author, and main points",
|
|
||||||
"provider": "openai/gpt-4o-mini",
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://myapp.com/webhooks/llm-complete",
|
|
||||||
"webhook_data_in_payload": true,
|
|
||||||
"webhook_headers": {
|
|
||||||
"X-Webhook-Secret": "your-secret-token"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}'
|
|
||||||
|
|
||||||
# Response: {"task_id": "llm_1234567890"}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Your webhook receives:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "llm_1234567890",
|
|
||||||
"task_type": "llm_extraction",
|
|
||||||
"status": "completed",
|
|
||||||
"timestamp": "2025-10-22T12:30:00.000000+00:00",
|
|
||||||
"urls": ["https://example.com/article"],
|
|
||||||
"data": {
|
|
||||||
"extracted_content": {
|
|
||||||
"title": "Understanding Web Scraping",
|
|
||||||
"author": "John Doe",
|
|
||||||
"main_points": ["Point 1", "Point 2", "Point 3"]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Key Differences for LLM Jobs:**
|
|
||||||
- Task type is `"llm_extraction"` instead of `"crawl"`
|
|
||||||
- Extracted data is in `data.extracted_content`
|
|
||||||
- Single URL only (not an array)
|
|
||||||
- Supports schema-based extraction with `schema` parameter
|
|
||||||
|
|
||||||
> 💡 **Pro tip**: See [WEBHOOK_EXAMPLES.md](./WEBHOOK_EXAMPLES.md) for detailed examples including TypeScript client code, Flask webhook handlers, and failure handling.
|
> 💡 **Pro tip**: See [WEBHOOK_EXAMPLES.md](./WEBHOOK_EXAMPLES.md) for detailed examples including TypeScript client code, Flask webhook handlers, and failure handling.
|
||||||
|
|
||||||
---
|
---
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
@@ -27,14 +27,6 @@
|
|||||||
- [Hook Response Information](#hook-response-information)
|
- [Hook Response Information](#hook-response-information)
|
||||||
- [Error Handling](#error-handling)
|
- [Error Handling](#error-handling)
|
||||||
- [Hooks Utility: Function-Based Approach (Python)](#hooks-utility-function-based-approach-python)
|
- [Hooks Utility: Function-Based Approach (Python)](#hooks-utility-function-based-approach-python)
|
||||||
- [Job Queue & Webhook API](#job-queue-webhook-api)
|
|
||||||
- [Why Use the Job Queue API?](#why-use-the-job-queue-api)
|
|
||||||
- [Available Endpoints](#available-endpoints)
|
|
||||||
- [Webhook Configuration](#webhook-configuration)
|
|
||||||
- [Usage Examples](#usage-examples)
|
|
||||||
- [Webhook Best Practices](#webhook-best-practices)
|
|
||||||
- [Use Cases](#use-cases)
|
|
||||||
- [Troubleshooting](#troubleshooting)
|
|
||||||
- [Dockerfile Parameters](#dockerfile-parameters)
|
- [Dockerfile Parameters](#dockerfile-parameters)
|
||||||
- [Using the API](#using-the-api)
|
- [Using the API](#using-the-api)
|
||||||
- [Playground Interface](#playground-interface)
|
- [Playground Interface](#playground-interface)
|
||||||
@@ -1118,464 +1110,6 @@ if __name__ == "__main__":
|
|||||||
|
|
||||||
---
|
---
|
||||||
|
|
||||||
## Job Queue & Webhook API
|
|
||||||
|
|
||||||
The Docker deployment includes a powerful asynchronous job queue system with webhook support for both crawling and LLM extraction tasks. Instead of waiting for long-running operations to complete, submit jobs and receive real-time notifications via webhooks when they finish.
|
|
||||||
|
|
||||||
### Why Use the Job Queue API?
|
|
||||||
|
|
||||||
**Traditional Synchronous API (`/crawl`):**
|
|
||||||
- Client waits for entire crawl to complete
|
|
||||||
- Timeout issues with long-running crawls
|
|
||||||
- Resource blocking during execution
|
|
||||||
- Constant polling required for status updates
|
|
||||||
|
|
||||||
**Asynchronous Job Queue API (`/crawl/job`, `/llm/job`):**
|
|
||||||
- ✅ Submit job and continue immediately
|
|
||||||
- ✅ No timeout concerns for long operations
|
|
||||||
- ✅ Real-time webhook notifications on completion
|
|
||||||
- ✅ Better resource utilization
|
|
||||||
- ✅ Perfect for batch processing
|
|
||||||
- ✅ Ideal for microservice architectures
|
|
||||||
|
|
||||||
### Available Endpoints
|
|
||||||
|
|
||||||
#### 1. Crawl Job Endpoint
|
|
||||||
|
|
||||||
```
|
|
||||||
POST /crawl/job
|
|
||||||
```
|
|
||||||
|
|
||||||
Submit an asynchronous crawl job with optional webhook notification.
|
|
||||||
|
|
||||||
**Request Body:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"cache_mode": "bypass",
|
|
||||||
"extraction_strategy": {
|
|
||||||
"type": "JsonCssExtractionStrategy",
|
|
||||||
"schema": {
|
|
||||||
"title": "h1",
|
|
||||||
"content": ".article-body"
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://your-app.com/webhook/crawl-complete",
|
|
||||||
"webhook_data_in_payload": true,
|
|
||||||
"webhook_headers": {
|
|
||||||
"X-Webhook-Secret": "your-secret-token",
|
|
||||||
"X-Custom-Header": "value"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Response:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "crawl_1698765432",
|
|
||||||
"message": "Crawl job submitted"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 2. LLM Extraction Job Endpoint
|
|
||||||
|
|
||||||
```
|
|
||||||
POST /llm/job
|
|
||||||
```
|
|
||||||
|
|
||||||
Submit an asynchronous LLM extraction job with optional webhook notification.
|
|
||||||
|
|
||||||
**Request Body:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"url": "https://example.com/article",
|
|
||||||
"q": "Extract the article title, author, publication date, and main points",
|
|
||||||
"provider": "openai/gpt-4o-mini",
|
|
||||||
"schema": "{\"title\": \"string\", \"author\": \"string\", \"date\": \"string\", \"points\": [\"string\"]}",
|
|
||||||
"cache": false,
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://your-app.com/webhook/llm-complete",
|
|
||||||
"webhook_data_in_payload": true,
|
|
||||||
"webhook_headers": {
|
|
||||||
"X-Webhook-Secret": "your-secret-token"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Response:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "llm_1698765432",
|
|
||||||
"message": "LLM job submitted"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
#### 3. Job Status Endpoint
|
|
||||||
|
|
||||||
```
|
|
||||||
GET /job/{task_id}
|
|
||||||
```
|
|
||||||
|
|
||||||
Check the status and retrieve results of a submitted job.
|
|
||||||
|
|
||||||
**Response (In Progress):**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "crawl_1698765432",
|
|
||||||
"status": "processing",
|
|
||||||
"message": "Job is being processed"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Response (Completed):**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "crawl_1698765432",
|
|
||||||
"status": "completed",
|
|
||||||
"result": {
|
|
||||||
"markdown": "# Page Title\n\nContent...",
|
|
||||||
"extracted_content": {...},
|
|
||||||
"links": {...}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
### Webhook Configuration
|
|
||||||
|
|
||||||
Webhooks provide real-time notifications when your jobs complete, eliminating the need for constant polling.
|
|
||||||
|
|
||||||
#### Webhook Config Parameters
|
|
||||||
|
|
||||||
| Parameter | Type | Required | Description |
|
|
||||||
|-----------|------|----------|-------------|
|
|
||||||
| `webhook_url` | string | Yes | Your HTTP(S) endpoint to receive notifications |
|
|
||||||
| `webhook_data_in_payload` | boolean | No | Include full result data in webhook payload (default: false) |
|
|
||||||
| `webhook_headers` | object | No | Custom headers for authentication/identification |
|
|
||||||
|
|
||||||
#### Webhook Payload Format
|
|
||||||
|
|
||||||
**Success Notification (Crawl Job):**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "crawl_1698765432",
|
|
||||||
"task_type": "crawl",
|
|
||||||
"status": "completed",
|
|
||||||
"timestamp": "2025-10-22T12:30:00.000000+00:00",
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"data": {
|
|
||||||
"markdown": "# Page content...",
|
|
||||||
"extracted_content": {...},
|
|
||||||
"links": {...}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Success Notification (LLM Job):**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "llm_1698765432",
|
|
||||||
"task_type": "llm_extraction",
|
|
||||||
"status": "completed",
|
|
||||||
"timestamp": "2025-10-22T12:30:00.000000+00:00",
|
|
||||||
"urls": ["https://example.com/article"],
|
|
||||||
"data": {
|
|
||||||
"extracted_content": {
|
|
||||||
"title": "Understanding Web Scraping",
|
|
||||||
"author": "John Doe",
|
|
||||||
"date": "2025-10-22",
|
|
||||||
"points": ["Point 1", "Point 2"]
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
**Failure Notification:**
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"task_id": "crawl_1698765432",
|
|
||||||
"task_type": "crawl",
|
|
||||||
"status": "failed",
|
|
||||||
"timestamp": "2025-10-22T12:30:00.000000+00:00",
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"error": "Connection timeout after 30 seconds"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Webhook Delivery & Retry
|
|
||||||
|
|
||||||
- **Delivery Method:** HTTP POST to your `webhook_url`
|
|
||||||
- **Content-Type:** `application/json`
|
|
||||||
- **Retry Policy:** Exponential backoff with 5 attempts
|
|
||||||
- Attempt 1: Immediate
|
|
||||||
- Attempt 2: 1 second delay
|
|
||||||
- Attempt 3: 2 seconds delay
|
|
||||||
- Attempt 4: 4 seconds delay
|
|
||||||
- Attempt 5: 8 seconds delay
|
|
||||||
- **Success Status Codes:** 200-299
|
|
||||||
- **Custom Headers:** Your `webhook_headers` are included in every request
|
|
||||||
|
|
||||||
### Usage Examples
|
|
||||||
|
|
||||||
#### Example 1: Python with Webhook Handler (Flask)
|
|
||||||
|
|
||||||
```python
|
|
||||||
from flask import Flask, request, jsonify
|
|
||||||
import requests
|
|
||||||
|
|
||||||
app = Flask(__name__)
|
|
||||||
|
|
||||||
# Webhook handler
|
|
||||||
@app.route('/webhook/crawl-complete', methods=['POST'])
|
|
||||||
def handle_crawl_webhook():
|
|
||||||
payload = request.json
|
|
||||||
|
|
||||||
if payload['status'] == 'completed':
|
|
||||||
print(f"✅ Job {payload['task_id']} completed!")
|
|
||||||
print(f"Task type: {payload['task_type']}")
|
|
||||||
|
|
||||||
# Access the crawl results
|
|
||||||
if 'data' in payload:
|
|
||||||
markdown = payload['data'].get('markdown', '')
|
|
||||||
extracted = payload['data'].get('extracted_content', {})
|
|
||||||
print(f"Extracted {len(markdown)} characters")
|
|
||||||
print(f"Structured data: {extracted}")
|
|
||||||
else:
|
|
||||||
print(f"❌ Job {payload['task_id']} failed: {payload.get('error')}")
|
|
||||||
|
|
||||||
return jsonify({"status": "received"}), 200
|
|
||||||
|
|
||||||
# Submit a crawl job with webhook
|
|
||||||
def submit_crawl_job():
|
|
||||||
response = requests.post(
|
|
||||||
"http://localhost:11235/crawl/job",
|
|
||||||
json={
|
|
||||||
"urls": ["https://example.com"],
|
|
||||||
"extraction_strategy": {
|
|
||||||
"type": "JsonCssExtractionStrategy",
|
|
||||||
"schema": {
|
|
||||||
"name": "Example Schema",
|
|
||||||
"baseSelector": "body",
|
|
||||||
"fields": [
|
|
||||||
{"name": "title", "selector": "h1", "type": "text"},
|
|
||||||
{"name": "description", "selector": "meta[name='description']", "type": "attribute", "attribute": "content"}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
},
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://your-app.com/webhook/crawl-complete",
|
|
||||||
"webhook_data_in_payload": True,
|
|
||||||
"webhook_headers": {
|
|
||||||
"X-Webhook-Secret": "your-secret-token"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
task_id = response.json()['task_id']
|
|
||||||
print(f"Job submitted: {task_id}")
|
|
||||||
return task_id
|
|
||||||
|
|
||||||
if __name__ == '__main__':
|
|
||||||
app.run(port=5000)
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Example 2: LLM Extraction with Webhooks
|
|
||||||
|
|
||||||
```python
|
|
||||||
import requests
|
|
||||||
|
|
||||||
def submit_llm_job_with_webhook():
|
|
||||||
response = requests.post(
|
|
||||||
"http://localhost:11235/llm/job",
|
|
||||||
json={
|
|
||||||
"url": "https://example.com/article",
|
|
||||||
"q": "Extract the article title, author, and main points",
|
|
||||||
"provider": "openai/gpt-4o-mini",
|
|
||||||
"webhook_config": {
|
|
||||||
"webhook_url": "https://your-app.com/webhook/llm-complete",
|
|
||||||
"webhook_data_in_payload": True,
|
|
||||||
"webhook_headers": {
|
|
||||||
"X-Webhook-Secret": "your-secret-token"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
task_id = response.json()['task_id']
|
|
||||||
print(f"LLM job submitted: {task_id}")
|
|
||||||
return task_id
|
|
||||||
|
|
||||||
# Webhook handler for LLM jobs
|
|
||||||
@app.route('/webhook/llm-complete', methods=['POST'])
|
|
||||||
def handle_llm_webhook():
|
|
||||||
payload = request.json
|
|
||||||
|
|
||||||
if payload['status'] == 'completed':
|
|
||||||
extracted = payload['data']['extracted_content']
|
|
||||||
print(f"✅ LLM extraction completed!")
|
|
||||||
print(f"Results: {extracted}")
|
|
||||||
else:
|
|
||||||
print(f"❌ LLM extraction failed: {payload.get('error')}")
|
|
||||||
|
|
||||||
return jsonify({"status": "received"}), 200
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Example 3: Without Webhooks (Polling)
|
|
||||||
|
|
||||||
If you don't use webhooks, you can poll for results:
|
|
||||||
|
|
||||||
```python
|
|
||||||
import requests
|
|
||||||
import time
|
|
||||||
|
|
||||||
# Submit job
|
|
||||||
response = requests.post(
|
|
||||||
"http://localhost:11235/crawl/job",
|
|
||||||
json={"urls": ["https://example.com"]}
|
|
||||||
)
|
|
||||||
task_id = response.json()['task_id']
|
|
||||||
|
|
||||||
# Poll for results
|
|
||||||
while True:
|
|
||||||
result = requests.get(f"http://localhost:11235/job/{task_id}")
|
|
||||||
data = result.json()
|
|
||||||
|
|
||||||
if data['status'] == 'completed':
|
|
||||||
print("Job completed!")
|
|
||||||
print(data['result'])
|
|
||||||
break
|
|
||||||
elif data['status'] == 'failed':
|
|
||||||
print(f"Job failed: {data.get('error')}")
|
|
||||||
break
|
|
||||||
|
|
||||||
print("Still processing...")
|
|
||||||
time.sleep(2)
|
|
||||||
```
|
|
||||||
|
|
||||||
#### Example 4: Global Webhook Configuration
|
|
||||||
|
|
||||||
Set a default webhook URL in your `config.yml` to avoid repeating it in every request:
|
|
||||||
|
|
||||||
```yaml
|
|
||||||
# config.yml
|
|
||||||
api:
|
|
||||||
crawler:
|
|
||||||
# ... other settings ...
|
|
||||||
webhook:
|
|
||||||
default_url: "https://your-app.com/webhook/default"
|
|
||||||
default_headers:
|
|
||||||
X-Webhook-Secret: "your-secret-token"
|
|
||||||
```
|
|
||||||
|
|
||||||
Then submit jobs without webhook config:
|
|
||||||
|
|
||||||
```python
|
|
||||||
# Uses the global webhook configuration
|
|
||||||
response = requests.post(
|
|
||||||
"http://localhost:11235/crawl/job",
|
|
||||||
json={"urls": ["https://example.com"]}
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Webhook Best Practices
|
|
||||||
|
|
||||||
1. **Authentication:** Always use custom headers for webhook authentication
|
|
||||||
```json
|
|
||||||
"webhook_headers": {
|
|
||||||
"X-Webhook-Secret": "your-secret-token"
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
2. **Idempotency:** Design your webhook handler to be idempotent (safe to receive duplicate notifications)
|
|
||||||
|
|
||||||
3. **Fast Response:** Return HTTP 200 quickly; process data asynchronously if needed
|
|
||||||
```python
|
|
||||||
@app.route('/webhook', methods=['POST'])
|
|
||||||
def webhook():
|
|
||||||
payload = request.json
|
|
||||||
# Queue for background processing
|
|
||||||
queue.enqueue(process_webhook, payload)
|
|
||||||
return jsonify({"status": "received"}), 200
|
|
||||||
```
|
|
||||||
|
|
||||||
4. **Error Handling:** Handle both success and failure notifications
|
|
||||||
```python
|
|
||||||
if payload['status'] == 'completed':
|
|
||||||
# Process success
|
|
||||||
elif payload['status'] == 'failed':
|
|
||||||
# Log error, retry, or alert
|
|
||||||
```
|
|
||||||
|
|
||||||
5. **Validation:** Verify webhook authenticity using custom headers
|
|
||||||
```python
|
|
||||||
secret = request.headers.get('X-Webhook-Secret')
|
|
||||||
if secret != os.environ['EXPECTED_SECRET']:
|
|
||||||
return jsonify({"error": "Unauthorized"}), 401
|
|
||||||
```
|
|
||||||
|
|
||||||
6. **Logging:** Log webhook deliveries for debugging
|
|
||||||
```python
|
|
||||||
logger.info(f"Webhook received: {payload['task_id']} - {payload['status']}")
|
|
||||||
```
|
|
||||||
|
|
||||||
### Use Cases
|
|
||||||
|
|
||||||
**1. Batch Processing**
|
|
||||||
Submit hundreds of URLs and get notified as each completes:
|
|
||||||
```python
|
|
||||||
urls = ["https://site1.com", "https://site2.com", ...]
|
|
||||||
for url in urls:
|
|
||||||
submit_crawl_job(url, webhook_url="https://app.com/webhook")
|
|
||||||
```
|
|
||||||
|
|
||||||
**2. Microservice Integration**
|
|
||||||
Integrate with event-driven architectures:
|
|
||||||
```python
|
|
||||||
# Service A submits job
|
|
||||||
task_id = submit_crawl_job(url)
|
|
||||||
|
|
||||||
# Service B receives webhook and triggers next step
|
|
||||||
@app.route('/webhook')
|
|
||||||
def webhook():
|
|
||||||
process_result(request.json)
|
|
||||||
trigger_next_service()
|
|
||||||
return "OK", 200
|
|
||||||
```
|
|
||||||
|
|
||||||
**3. Long-Running Extractions**
|
|
||||||
Handle complex LLM extractions without timeouts:
|
|
||||||
```python
|
|
||||||
submit_llm_job(
|
|
||||||
url="https://long-article.com",
|
|
||||||
q="Comprehensive summary with key points and analysis",
|
|
||||||
webhook_url="https://app.com/webhook/llm"
|
|
||||||
)
|
|
||||||
```
|
|
||||||
|
|
||||||
### Troubleshooting
|
|
||||||
|
|
||||||
**Webhook not receiving notifications?**
|
|
||||||
- Check your webhook URL is publicly accessible
|
|
||||||
- Verify firewall/security group settings
|
|
||||||
- Use webhook testing tools like webhook.site for debugging
|
|
||||||
- Check server logs for delivery attempts
|
|
||||||
- Ensure your handler returns 200-299 status code
|
|
||||||
|
|
||||||
**Job stuck in processing?**
|
|
||||||
- Check Redis connection: `docker logs <container_name> | grep redis`
|
|
||||||
- Verify worker processes: `docker exec <container_name> ps aux | grep worker`
|
|
||||||
- Check server logs: `docker logs <container_name>`
|
|
||||||
|
|
||||||
**Need to cancel a job?**
|
|
||||||
Jobs are processed asynchronously. If you need to cancel:
|
|
||||||
- Delete the task from Redis (requires Redis CLI access)
|
|
||||||
- Or implement a cancellation endpoint in your webhook handler
|
|
||||||
|
|
||||||
---
|
|
||||||
|
|
||||||
## Dockerfile Parameters
|
## Dockerfile Parameters
|
||||||
|
|
||||||
You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file.
|
You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file.
|
||||||
|
|||||||
@@ -529,19 +529,8 @@ class AdminDashboard {
|
|||||||
</label>
|
</label>
|
||||||
</div>
|
</div>
|
||||||
<div class="form-group full-width">
|
<div class="form-group full-width">
|
||||||
<label>Long Description (Markdown - Overview tab)</label>
|
<label>Integration Guide</label>
|
||||||
<textarea id="form-long-description" rows="10" placeholder="Enter detailed description with markdown formatting...">${app?.long_description || ''}</textarea>
|
<textarea id="form-integration" rows="10">${app?.integration_guide || ''}</textarea>
|
||||||
<small>Markdown support: **bold**, *italic*, [links](url), # headers, code blocks, lists</small>
|
|
||||||
</div>
|
|
||||||
<div class="form-group full-width">
|
|
||||||
<label>Integration Guide (Markdown - Integration tab)</label>
|
|
||||||
<textarea id="form-integration" rows="20" placeholder="Enter integration guide with installation, examples, and code snippets using markdown...">${app?.integration_guide || ''}</textarea>
|
|
||||||
<small>Single markdown field with installation, examples, and complete guide. Code blocks get auto copy buttons.</small>
|
|
||||||
</div>
|
|
||||||
<div class="form-group full-width">
|
|
||||||
<label>Documentation (Markdown - Documentation tab)</label>
|
|
||||||
<textarea id="form-documentation" rows="20" placeholder="Enter documentation with API reference, examples, and best practices using markdown...">${app?.documentation || ''}</textarea>
|
|
||||||
<small>Full documentation with API reference, examples, best practices, etc.</small>
|
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
`;
|
`;
|
||||||
@@ -723,9 +712,7 @@ class AdminDashboard {
|
|||||||
data.contact_email = document.getElementById('form-email').value;
|
data.contact_email = document.getElementById('form-email').value;
|
||||||
data.featured = document.getElementById('form-featured').checked ? 1 : 0;
|
data.featured = document.getElementById('form-featured').checked ? 1 : 0;
|
||||||
data.sponsored = document.getElementById('form-sponsored').checked ? 1 : 0;
|
data.sponsored = document.getElementById('form-sponsored').checked ? 1 : 0;
|
||||||
data.long_description = document.getElementById('form-long-description').value;
|
|
||||||
data.integration_guide = document.getElementById('form-integration').value;
|
data.integration_guide = document.getElementById('form-integration').value;
|
||||||
data.documentation = document.getElementById('form-documentation').value;
|
|
||||||
} else if (type === 'articles') {
|
} else if (type === 'articles') {
|
||||||
data.title = document.getElementById('form-title').value;
|
data.title = document.getElementById('form-title').value;
|
||||||
data.slug = this.generateSlug(data.title);
|
data.slug = this.generateSlug(data.title);
|
||||||
|
|||||||
@@ -278,12 +278,12 @@
|
|||||||
}
|
}
|
||||||
|
|
||||||
.tab-content {
|
.tab-content {
|
||||||
display: none !important;
|
display: none;
|
||||||
padding: 2rem;
|
padding: 2rem;
|
||||||
}
|
}
|
||||||
|
|
||||||
.tab-content.active {
|
.tab-content.active {
|
||||||
display: block !important;
|
display: block;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Overview Layout */
|
/* Overview Layout */
|
||||||
@@ -510,31 +510,6 @@
|
|||||||
line-height: 1.5;
|
line-height: 1.5;
|
||||||
}
|
}
|
||||||
|
|
||||||
/* Markdown rendered code blocks */
|
|
||||||
.integration-content pre,
|
|
||||||
.docs-content pre {
|
|
||||||
background: var(--bg-dark);
|
|
||||||
border: 1px solid var(--border-color);
|
|
||||||
margin: 1rem 0;
|
|
||||||
padding: 1rem;
|
|
||||||
padding-top: 2.5rem; /* Space for copy button */
|
|
||||||
overflow-x: auto;
|
|
||||||
position: relative;
|
|
||||||
max-height: none; /* Remove any height restrictions */
|
|
||||||
height: auto; /* Allow content to expand */
|
|
||||||
}
|
|
||||||
|
|
||||||
.integration-content pre code,
|
|
||||||
.docs-content pre code {
|
|
||||||
background: transparent;
|
|
||||||
padding: 0;
|
|
||||||
color: var(--text-secondary);
|
|
||||||
font-size: 0.875rem;
|
|
||||||
line-height: 1.5;
|
|
||||||
white-space: pre; /* Preserve whitespace and line breaks */
|
|
||||||
display: block;
|
|
||||||
}
|
|
||||||
|
|
||||||
/* Feature Grid */
|
/* Feature Grid */
|
||||||
.feature-grid {
|
.feature-grid {
|
||||||
display: grid;
|
display: grid;
|
||||||
|
|||||||
@@ -73,14 +73,27 @@
|
|||||||
<div class="tabs">
|
<div class="tabs">
|
||||||
<button class="tab-btn active" data-tab="overview">Overview</button>
|
<button class="tab-btn active" data-tab="overview">Overview</button>
|
||||||
<button class="tab-btn" data-tab="integration">Integration</button>
|
<button class="tab-btn" data-tab="integration">Integration</button>
|
||||||
<!-- <button class="tab-btn" data-tab="docs">Documentation</button>
|
<button class="tab-btn" data-tab="docs">Documentation</button>
|
||||||
<button class="tab-btn" data-tab="support">Support</button> -->
|
<button class="tab-btn" data-tab="support">Support</button>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<section id="overview-tab" class="tab-content active">
|
<section id="overview-tab" class="tab-content active">
|
||||||
<div class="overview-columns">
|
<div class="overview-columns">
|
||||||
<div class="overview-main">
|
<div class="overview-main">
|
||||||
|
<h2>Overview</h2>
|
||||||
<div id="app-overview">Overview content goes here.</div>
|
<div id="app-overview">Overview content goes here.</div>
|
||||||
|
|
||||||
|
<h3>Key Features</h3>
|
||||||
|
<ul id="app-features" class="features-list">
|
||||||
|
<li>Feature 1</li>
|
||||||
|
<li>Feature 2</li>
|
||||||
|
<li>Feature 3</li>
|
||||||
|
</ul>
|
||||||
|
|
||||||
|
<h3>Use Cases</h3>
|
||||||
|
<div id="app-use-cases" class="use-cases">
|
||||||
|
<p>Describe how this app can help your workflow.</p>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
<aside class="sidebar">
|
<aside class="sidebar">
|
||||||
@@ -129,16 +142,37 @@
|
|||||||
</section>
|
</section>
|
||||||
|
|
||||||
<section id="integration-tab" class="tab-content">
|
<section id="integration-tab" class="tab-content">
|
||||||
<div class="integration-content" id="app-integration">
|
<div class="integration-content">
|
||||||
|
<h2>Integration Guide</h2>
|
||||||
|
|
||||||
|
<h3>Installation</h3>
|
||||||
|
<div class="code-block">
|
||||||
|
<pre><code id="install-code"># Installation instructions will appear here</code></pre>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>Basic Usage</h3>
|
||||||
|
<div class="code-block">
|
||||||
|
<pre><code id="usage-code"># Usage example will appear here</code></pre>
|
||||||
|
</div>
|
||||||
|
|
||||||
|
<h3>Complete Integration Example</h3>
|
||||||
|
<div class="code-block">
|
||||||
|
<button class="copy-btn" id="copy-integration">Copy</button>
|
||||||
|
<pre><code id="integration-code"># Complete integration guide will appear here</code></pre>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</section>
|
</section>
|
||||||
|
|
||||||
<!-- <section id="docs-tab" class="tab-content">
|
<section id="docs-tab" class="tab-content">
|
||||||
<div class="docs-content" id="app-docs">
|
<div class="docs-content">
|
||||||
|
<h2>Documentation</h2>
|
||||||
|
<div id="app-docs" class="doc-sections">
|
||||||
|
<p>Documentation coming soon.</p>
|
||||||
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</section> -->
|
</section>
|
||||||
|
|
||||||
<!-- <section id="support-tab" class="tab-content">
|
<section id="support-tab" class="tab-content">
|
||||||
<div class="docs-content">
|
<div class="docs-content">
|
||||||
<h2>Support</h2>
|
<h2>Support</h2>
|
||||||
<div class="support-grid">
|
<div class="support-grid">
|
||||||
@@ -156,7 +190,7 @@
|
|||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</div>
|
</div>
|
||||||
</section> -->
|
</section>
|
||||||
</div>
|
</div>
|
||||||
|
|
||||||
</main>
|
</main>
|
||||||
|
|||||||
@@ -112,7 +112,7 @@ class AppDetailPage {
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Contact
|
// Contact
|
||||||
document.getElementById('app-contact') && (document.getElementById('app-contact').textContent = this.appData.contact_email || 'Not available');
|
document.getElementById('app-contact').textContent = this.appData.contact_email || 'Not available';
|
||||||
|
|
||||||
// Sidebar info
|
// Sidebar info
|
||||||
document.getElementById('sidebar-downloads').textContent = this.formatNumber(this.appData.downloads || 0);
|
document.getElementById('sidebar-downloads').textContent = this.formatNumber(this.appData.downloads || 0);
|
||||||
@@ -123,134 +123,146 @@ class AppDetailPage {
|
|||||||
document.getElementById('sidebar-pricing').textContent = this.appData.pricing || 'Free';
|
document.getElementById('sidebar-pricing').textContent = this.appData.pricing || 'Free';
|
||||||
document.getElementById('sidebar-contact').textContent = this.appData.contact_email || 'contact@example.com';
|
document.getElementById('sidebar-contact').textContent = this.appData.contact_email || 'contact@example.com';
|
||||||
|
|
||||||
// Render tab contents from database fields
|
// Integration guide
|
||||||
this.renderTabContents();
|
this.renderIntegrationGuide();
|
||||||
}
|
}
|
||||||
|
|
||||||
renderTabContents() {
|
renderIntegrationGuide() {
|
||||||
// Overview tab - use long_description from database
|
// Installation code
|
||||||
const overviewDiv = document.getElementById('app-overview');
|
const installCode = document.getElementById('install-code');
|
||||||
if (overviewDiv) {
|
if (installCode) {
|
||||||
if (this.appData.long_description) {
|
if (this.appData.type === 'Open Source' && this.appData.github_url) {
|
||||||
overviewDiv.innerHTML = this.renderMarkdown(this.appData.long_description);
|
installCode.textContent = `# Clone from GitHub
|
||||||
} else {
|
git clone ${this.appData.github_url}
|
||||||
overviewDiv.innerHTML = `<p>${this.appData.description || 'No overview available.'}</p>`;
|
|
||||||
|
# Install dependencies
|
||||||
|
pip install -r requirements.txt`;
|
||||||
|
} else if (this.appData.name.toLowerCase().includes('api')) {
|
||||||
|
installCode.textContent = `# Install via pip
|
||||||
|
pip install ${this.appData.slug}
|
||||||
|
|
||||||
|
# Or install from source
|
||||||
|
pip install git+${this.appData.github_url || 'https://github.com/example/repo'}`;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Integration tab - use integration_guide field from database
|
// Usage code - customize based on category
|
||||||
const integrationDiv = document.getElementById('app-integration');
|
const usageCode = document.getElementById('usage-code');
|
||||||
if (integrationDiv) {
|
if (usageCode) {
|
||||||
if (this.appData.integration_guide) {
|
if (this.appData.category === 'Browser Automation') {
|
||||||
integrationDiv.innerHTML = this.renderMarkdown(this.appData.integration_guide);
|
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
|
||||||
// Add copy buttons to all code blocks
|
from ${this.appData.slug.replace(/-/g, '_')} import ${this.appData.name.replace(/\s+/g, '')}
|
||||||
this.addCopyButtonsToCodeBlocks(integrationDiv);
|
|
||||||
} else {
|
async def main():
|
||||||
integrationDiv.innerHTML = '<p>Integration guide not yet available. Please check the official website for details.</p>';
|
# Initialize ${this.appData.name}
|
||||||
|
automation = ${this.appData.name.replace(/\s+/g, '')}()
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
url="https://example.com",
|
||||||
|
browser_config=automation.config,
|
||||||
|
wait_for="css:body"
|
||||||
|
)
|
||||||
|
print(result.markdown)`;
|
||||||
|
} else if (this.appData.category === 'Proxy Services') {
|
||||||
|
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
|
||||||
|
import ${this.appData.slug.replace(/-/g, '_')}
|
||||||
|
|
||||||
|
# Configure proxy
|
||||||
|
proxy_config = {
|
||||||
|
"server": "${this.appData.website_url || 'https://proxy.example.com'}",
|
||||||
|
"username": "your_username",
|
||||||
|
"password": "your_password"
|
||||||
|
}
|
||||||
|
|
||||||
|
async with AsyncWebCrawler(proxy=proxy_config) as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
url="https://example.com",
|
||||||
|
bypass_cache=True
|
||||||
|
)
|
||||||
|
print(result.status_code)`;
|
||||||
|
} else if (this.appData.category === 'LLM Integration') {
|
||||||
|
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
|
||||||
|
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||||
|
|
||||||
|
# Configure LLM extraction
|
||||||
|
strategy = LLMExtractionStrategy(
|
||||||
|
provider="${this.appData.name.toLowerCase().includes('gpt') ? 'openai' : 'anthropic'}",
|
||||||
|
api_key="your-api-key",
|
||||||
|
model="${this.appData.name.toLowerCase().includes('gpt') ? 'gpt-4' : 'claude-3'}",
|
||||||
|
instruction="Extract structured data"
|
||||||
|
)
|
||||||
|
|
||||||
|
async with AsyncWebCrawler() as crawler:
|
||||||
|
result = await crawler.arun(
|
||||||
|
url="https://example.com",
|
||||||
|
extraction_strategy=strategy
|
||||||
|
)
|
||||||
|
print(result.extracted_content)`;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Documentation tab - use documentation field from database
|
// Integration example
|
||||||
const docsDiv = document.getElementById('app-docs');
|
const integrationCode = document.getElementById('integration-code');
|
||||||
if (docsDiv) {
|
if (integrationCode) {
|
||||||
if (this.appData.documentation) {
|
integrationCode.textContent = this.appData.integration_guide ||
|
||||||
docsDiv.innerHTML = this.renderMarkdown(this.appData.documentation);
|
`# Complete ${this.appData.name} Integration Example
|
||||||
// Add copy buttons to all code blocks
|
|
||||||
this.addCopyButtonsToCodeBlocks(docsDiv);
|
from crawl4ai import AsyncWebCrawler
|
||||||
} else {
|
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||||
docsDiv.innerHTML = '<p>Documentation coming soon.</p>';
|
import json
|
||||||
}
|
|
||||||
|
async def crawl_with_${this.appData.slug.replace(/-/g, '_')}():
|
||||||
|
"""
|
||||||
|
Complete example showing how to use ${this.appData.name}
|
||||||
|
with Crawl4AI for production web scraping
|
||||||
|
"""
|
||||||
|
|
||||||
|
# Define extraction schema
|
||||||
|
schema = {
|
||||||
|
"name": "ProductList",
|
||||||
|
"baseSelector": "div.product",
|
||||||
|
"fields": [
|
||||||
|
{"name": "title", "selector": "h2", "type": "text"},
|
||||||
|
{"name": "price", "selector": ".price", "type": "text"},
|
||||||
|
{"name": "image", "selector": "img", "type": "attribute", "attribute": "src"},
|
||||||
|
{"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
|
||||||
|
# Initialize crawler with ${this.appData.name}
|
||||||
|
async with AsyncWebCrawler(
|
||||||
|
browser_type="chromium",
|
||||||
|
headless=True,
|
||||||
|
verbose=True
|
||||||
|
) as crawler:
|
||||||
|
|
||||||
|
# Crawl with extraction
|
||||||
|
result = await crawler.arun(
|
||||||
|
url="https://example.com/products",
|
||||||
|
extraction_strategy=JsonCssExtractionStrategy(schema),
|
||||||
|
cache_mode="bypass",
|
||||||
|
wait_for="css:.product",
|
||||||
|
screenshot=True
|
||||||
|
)
|
||||||
|
|
||||||
|
# Process results
|
||||||
|
if result.success:
|
||||||
|
products = json.loads(result.extracted_content)
|
||||||
|
print(f"Found {len(products)} products")
|
||||||
|
|
||||||
|
for product in products[:5]:
|
||||||
|
print(f"- {product['title']}: {product['price']}")
|
||||||
|
|
||||||
|
return products
|
||||||
|
|
||||||
|
# Run the crawler
|
||||||
|
if __name__ == "__main__":
|
||||||
|
import asyncio
|
||||||
|
asyncio.run(crawl_with_${this.appData.slug.replace(/-/g, '_')}())`;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
addCopyButtonsToCodeBlocks(container) {
|
|
||||||
// Find all code blocks and add copy buttons
|
|
||||||
const codeBlocks = container.querySelectorAll('pre code');
|
|
||||||
codeBlocks.forEach(codeBlock => {
|
|
||||||
const pre = codeBlock.parentElement;
|
|
||||||
|
|
||||||
// Skip if already has a copy button
|
|
||||||
if (pre.querySelector('.copy-btn')) return;
|
|
||||||
|
|
||||||
// Create copy button
|
|
||||||
const copyBtn = document.createElement('button');
|
|
||||||
copyBtn.className = 'copy-btn';
|
|
||||||
copyBtn.textContent = 'Copy';
|
|
||||||
copyBtn.onclick = () => {
|
|
||||||
navigator.clipboard.writeText(codeBlock.textContent).then(() => {
|
|
||||||
copyBtn.textContent = '✓ Copied!';
|
|
||||||
setTimeout(() => {
|
|
||||||
copyBtn.textContent = 'Copy';
|
|
||||||
}, 2000);
|
|
||||||
});
|
|
||||||
};
|
|
||||||
|
|
||||||
// Add button to pre element
|
|
||||||
pre.style.position = 'relative';
|
|
||||||
pre.insertBefore(copyBtn, codeBlock);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
|
|
||||||
renderMarkdown(text) {
|
|
||||||
if (!text) return '';
|
|
||||||
|
|
||||||
// Store code blocks temporarily to protect them from processing
|
|
||||||
const codeBlocks = [];
|
|
||||||
let processed = text.replace(/```(\w+)?\n([\s\S]*?)```/g, (match, lang, code) => {
|
|
||||||
const placeholder = `___CODE_BLOCK_${codeBlocks.length}___`;
|
|
||||||
codeBlocks.push(`<pre><code class="language-${lang || ''}">${this.escapeHtml(code)}</code></pre>`);
|
|
||||||
return placeholder;
|
|
||||||
});
|
|
||||||
|
|
||||||
// Store inline code temporarily
|
|
||||||
const inlineCodes = [];
|
|
||||||
processed = processed.replace(/`([^`]+)`/g, (match, code) => {
|
|
||||||
const placeholder = `___INLINE_CODE_${inlineCodes.length}___`;
|
|
||||||
inlineCodes.push(`<code>${this.escapeHtml(code)}</code>`);
|
|
||||||
return placeholder;
|
|
||||||
});
|
|
||||||
|
|
||||||
// Now process the rest of the markdown
|
|
||||||
processed = processed
|
|
||||||
// Headers
|
|
||||||
.replace(/^### (.*$)/gim, '<h3>$1</h3>')
|
|
||||||
.replace(/^## (.*$)/gim, '<h2>$1</h2>')
|
|
||||||
.replace(/^# (.*$)/gim, '<h1>$1</h1>')
|
|
||||||
// Bold
|
|
||||||
.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
|
|
||||||
// Italic
|
|
||||||
.replace(/\*(.*?)\*/g, '<em>$1</em>')
|
|
||||||
// Links
|
|
||||||
.replace(/\[([^\]]+)\]\(([^)]+)\)/g, '<a href="$2" target="_blank">$1</a>')
|
|
||||||
// Line breaks
|
|
||||||
.replace(/\n\n/g, '</p><p>')
|
|
||||||
.replace(/\n/g, '<br>')
|
|
||||||
// Lists
|
|
||||||
.replace(/^\* (.*)$/gim, '<li>$1</li>')
|
|
||||||
.replace(/^- (.*)$/gim, '<li>$1</li>')
|
|
||||||
// Wrap in paragraphs
|
|
||||||
.replace(/^(?!<[h|p|pre|ul|ol|li])/gim, '<p>')
|
|
||||||
.replace(/(?<![>])$/gim, '</p>');
|
|
||||||
|
|
||||||
// Restore inline code
|
|
||||||
inlineCodes.forEach((code, i) => {
|
|
||||||
processed = processed.replace(`___INLINE_CODE_${i}___`, code);
|
|
||||||
});
|
|
||||||
|
|
||||||
// Restore code blocks
|
|
||||||
codeBlocks.forEach((block, i) => {
|
|
||||||
processed = processed.replace(`___CODE_BLOCK_${i}___`, block);
|
|
||||||
});
|
|
||||||
|
|
||||||
return processed;
|
|
||||||
}
|
|
||||||
|
|
||||||
escapeHtml(text) {
|
|
||||||
const div = document.createElement('div');
|
|
||||||
div.textContent = text;
|
|
||||||
return div.innerHTML;
|
|
||||||
}
|
|
||||||
|
|
||||||
formatNumber(num) {
|
formatNumber(num) {
|
||||||
if (num >= 1000000) {
|
if (num >= 1000000) {
|
||||||
return (num / 1000000).toFixed(1) + 'M';
|
return (num / 1000000).toFixed(1) + 'M';
|
||||||
@@ -263,27 +275,45 @@ class AppDetailPage {
|
|||||||
setupEventListeners() {
|
setupEventListeners() {
|
||||||
// Tab switching
|
// Tab switching
|
||||||
const tabs = document.querySelectorAll('.tab-btn');
|
const tabs = document.querySelectorAll('.tab-btn');
|
||||||
|
|
||||||
tabs.forEach(tab => {
|
tabs.forEach(tab => {
|
||||||
tab.addEventListener('click', () => {
|
tab.addEventListener('click', () => {
|
||||||
// Update active tab button
|
// Update active tab
|
||||||
tabs.forEach(t => t.classList.remove('active'));
|
tabs.forEach(t => t.classList.remove('active'));
|
||||||
tab.classList.add('active');
|
tab.classList.add('active');
|
||||||
|
|
||||||
// Show corresponding content
|
// Show corresponding content
|
||||||
const tabName = tab.dataset.tab;
|
const tabName = tab.dataset.tab;
|
||||||
|
document.querySelectorAll('.tab-content').forEach(content => {
|
||||||
// Hide all tab contents
|
|
||||||
const allTabContents = document.querySelectorAll('.tab-content');
|
|
||||||
allTabContents.forEach(content => {
|
|
||||||
content.classList.remove('active');
|
content.classList.remove('active');
|
||||||
});
|
});
|
||||||
|
document.getElementById(`${tabName}-tab`).classList.add('active');
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
// Show the selected tab content
|
// Copy integration code
|
||||||
const targetTab = document.getElementById(`${tabName}-tab`);
|
document.getElementById('copy-integration').addEventListener('click', () => {
|
||||||
if (targetTab) {
|
const code = document.getElementById('integration-code').textContent;
|
||||||
targetTab.classList.add('active');
|
navigator.clipboard.writeText(code).then(() => {
|
||||||
}
|
const btn = document.getElementById('copy-integration');
|
||||||
|
const originalText = btn.innerHTML;
|
||||||
|
btn.innerHTML = '<span>✓</span> Copied!';
|
||||||
|
setTimeout(() => {
|
||||||
|
btn.innerHTML = originalText;
|
||||||
|
}, 2000);
|
||||||
|
});
|
||||||
|
});
|
||||||
|
|
||||||
|
// Copy code buttons
|
||||||
|
document.querySelectorAll('.copy-btn').forEach(btn => {
|
||||||
|
btn.addEventListener('click', (e) => {
|
||||||
|
const codeBlock = e.target.closest('.code-block');
|
||||||
|
const code = codeBlock.querySelector('code').textContent;
|
||||||
|
navigator.clipboard.writeText(code).then(() => {
|
||||||
|
btn.textContent = 'Copied!';
|
||||||
|
setTimeout(() => {
|
||||||
|
btn.textContent = 'Copy';
|
||||||
|
}, 2000);
|
||||||
|
});
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -471,17 +471,13 @@ async def delete_sponsor(sponsor_id: int):
|
|||||||
|
|
||||||
app.include_router(router)
|
app.include_router(router)
|
||||||
|
|
||||||
# Version info
|
|
||||||
VERSION = "1.1.0"
|
|
||||||
BUILD_DATE = "2025-10-26"
|
|
||||||
|
|
||||||
@app.get("/")
|
@app.get("/")
|
||||||
async def root():
|
async def root():
|
||||||
"""API info"""
|
"""API info"""
|
||||||
return {
|
return {
|
||||||
"name": "Crawl4AI Marketplace API",
|
"name": "Crawl4AI Marketplace API",
|
||||||
"version": VERSION,
|
"version": "1.0.0",
|
||||||
"build_date": BUILD_DATE,
|
|
||||||
"endpoints": [
|
"endpoints": [
|
||||||
"/marketplace/api/apps",
|
"/marketplace/api/apps",
|
||||||
"/marketplace/api/articles",
|
"/marketplace/api/articles",
|
||||||
|
|||||||
@@ -31,7 +31,7 @@ dependencies = [
|
|||||||
"rank-bm25~=0.2",
|
"rank-bm25~=0.2",
|
||||||
"snowballstemmer~=2.2",
|
"snowballstemmer~=2.2",
|
||||||
"pydantic>=2.10",
|
"pydantic>=2.10",
|
||||||
"pyOpenSSL>=25.3.0",
|
"pyOpenSSL>=24.3.0",
|
||||||
"psutil>=6.1.1",
|
"psutil>=6.1.1",
|
||||||
"PyYAML>=6.0",
|
"PyYAML>=6.0",
|
||||||
"nltk>=3.9.1",
|
"nltk>=3.9.1",
|
||||||
|
|||||||
@@ -19,7 +19,7 @@ rank-bm25~=0.2
|
|||||||
colorama~=0.4
|
colorama~=0.4
|
||||||
snowballstemmer~=2.2
|
snowballstemmer~=2.2
|
||||||
pydantic>=2.10
|
pydantic>=2.10
|
||||||
pyOpenSSL>=25.3.0
|
pyOpenSSL>=24.3.0
|
||||||
psutil>=6.1.1
|
psutil>=6.1.1
|
||||||
PyYAML>=6.0
|
PyYAML>=6.0
|
||||||
nltk>=3.9.1
|
nltk>=3.9.1
|
||||||
|
|||||||
@@ -1,283 +0,0 @@
|
|||||||
"""
|
|
||||||
Compact test suite for CDP concurrency fix.
|
|
||||||
|
|
||||||
This file consolidates all tests related to the CDP concurrency fix for
|
|
||||||
AsyncWebCrawler.arun_many() with managed browsers.
|
|
||||||
|
|
||||||
The bug was that all concurrent tasks were fighting over one shared tab,
|
|
||||||
causing failures. This has been fixed by modifying the get_page() method
|
|
||||||
in browser_manager.py to always create new pages instead of reusing pages[0].
|
|
||||||
"""
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import shutil
|
|
||||||
import sys
|
|
||||||
import tempfile
|
|
||||||
from pathlib import Path
|
|
||||||
|
|
||||||
# Add parent directory to path for imports
|
|
||||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
|
||||||
|
|
||||||
from crawl4ai import AsyncWebCrawler, CacheMode, CrawlerRunConfig
|
|
||||||
from crawl4ai.async_configs import BrowserConfig
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# TEST 1: Basic arun_many functionality
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
|
|
||||||
async def test_basic_arun_many():
|
|
||||||
"""Test that arun_many works correctly with basic configuration."""
|
|
||||||
print("=== TEST 1: Basic arun_many functionality ===")
|
|
||||||
|
|
||||||
# Configuration to bypass cache for testing
|
|
||||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
|
||||||
|
|
||||||
# Test URLs - using reliable test URLs
|
|
||||||
test_urls = [
|
|
||||||
"https://httpbin.org/html", # Simple HTML page
|
|
||||||
"https://httpbin.org/json", # Simple JSON response
|
|
||||||
]
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
print(f"Testing concurrent crawling of {len(test_urls)} URLs...")
|
|
||||||
|
|
||||||
# This should work correctly
|
|
||||||
result = await crawler.arun_many(urls=test_urls, config=config)
|
|
||||||
|
|
||||||
# Simple verification - if we get here without exception, the basic functionality works
|
|
||||||
print(f"✓ arun_many completed successfully")
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# TEST 2: CDP Browser with Managed Configuration
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
|
|
||||||
async def test_arun_many_with_managed_cdp_browser():
|
|
||||||
"""Test that arun_many works correctly with managed CDP browsers."""
|
|
||||||
print("\n=== TEST 2: arun_many with managed CDP browser ===")
|
|
||||||
|
|
||||||
# Create a temporary user data directory for the CDP browser
|
|
||||||
user_data_dir = tempfile.mkdtemp(prefix="crawl4ai-cdp-test-")
|
|
||||||
|
|
||||||
try:
|
|
||||||
# Configure browser to use managed CDP mode
|
|
||||||
browser_config = BrowserConfig(
|
|
||||||
use_managed_browser=True,
|
|
||||||
browser_type="chromium",
|
|
||||||
headless=True,
|
|
||||||
user_data_dir=user_data_dir,
|
|
||||||
verbose=True,
|
|
||||||
)
|
|
||||||
|
|
||||||
# Configuration to bypass cache for testing
|
|
||||||
crawler_config = CrawlerRunConfig(
|
|
||||||
cache_mode=CacheMode.BYPASS,
|
|
||||||
page_timeout=60000,
|
|
||||||
wait_until="domcontentloaded",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Test URLs - using reliable test URLs
|
|
||||||
test_urls = [
|
|
||||||
"https://httpbin.org/html", # Simple HTML page
|
|
||||||
"https://httpbin.org/json", # Simple JSON response
|
|
||||||
]
|
|
||||||
|
|
||||||
# Create crawler with CDP browser configuration
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
||||||
print(f"Testing concurrent crawling of {len(test_urls)} URLs...")
|
|
||||||
|
|
||||||
# This should work correctly with our fix
|
|
||||||
result = await crawler.arun_many(urls=test_urls, config=crawler_config)
|
|
||||||
|
|
||||||
print(f"✓ arun_many completed successfully with managed CDP browser")
|
|
||||||
return True
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"❌ Test failed with error: {str(e)}")
|
|
||||||
raise
|
|
||||||
finally:
|
|
||||||
# Clean up temporary directory
|
|
||||||
try:
|
|
||||||
shutil.rmtree(user_data_dir, ignore_errors=True)
|
|
||||||
except:
|
|
||||||
pass
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# TEST 3: Concurrency Verification
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
|
|
||||||
async def test_concurrent_crawling():
|
|
||||||
"""Test concurrent crawling to verify the fix works."""
|
|
||||||
print("\n=== TEST 3: Concurrent crawling verification ===")
|
|
||||||
|
|
||||||
# Configuration to bypass cache for testing
|
|
||||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
|
||||||
|
|
||||||
# Test URLs - using reliable test URLs
|
|
||||||
test_urls = [
|
|
||||||
"https://httpbin.org/html", # Simple HTML page
|
|
||||||
"https://httpbin.org/json", # Simple JSON response
|
|
||||||
"https://httpbin.org/uuid", # Simple UUID response
|
|
||||||
"https://example.com/", # Standard example page
|
|
||||||
]
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
print(f"Testing concurrent crawling of {len(test_urls)} URLs...")
|
|
||||||
|
|
||||||
# This should work correctly with our fix
|
|
||||||
results = await crawler.arun_many(urls=test_urls, config=config)
|
|
||||||
|
|
||||||
# Simple verification - if we get here without exception, the fix works
|
|
||||||
print("✓ arun_many completed successfully with concurrent crawling")
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# TEST 4: Concurrency Fix Demonstration
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
|
|
||||||
async def test_concurrency_fix():
|
|
||||||
"""Demonstrate that the concurrency fix works."""
|
|
||||||
print("\n=== TEST 4: Concurrency fix demonstration ===")
|
|
||||||
|
|
||||||
# Configuration to bypass cache for testing
|
|
||||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
|
||||||
|
|
||||||
# Test URLs - using reliable test URLs
|
|
||||||
test_urls = [
|
|
||||||
"https://httpbin.org/html", # Simple HTML page
|
|
||||||
"https://httpbin.org/json", # Simple JSON response
|
|
||||||
"https://httpbin.org/uuid", # Simple UUID response
|
|
||||||
]
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
print(f"Testing concurrent crawling of {len(test_urls)} URLs...")
|
|
||||||
|
|
||||||
# This should work correctly with our fix
|
|
||||||
results = await crawler.arun_many(urls=test_urls, config=config)
|
|
||||||
|
|
||||||
# Simple verification - if we get here without exception, the fix works
|
|
||||||
print("✓ arun_many completed successfully with concurrent crawling")
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# TEST 5: Before/After Behavior Comparison
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
|
|
||||||
async def test_before_after_behavior():
|
|
||||||
"""Test that demonstrates concurrent crawling works correctly after the fix."""
|
|
||||||
print("\n=== TEST 5: Before/After behavior test ===")
|
|
||||||
|
|
||||||
# Configuration to bypass cache for testing
|
|
||||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
|
||||||
|
|
||||||
# Test URLs - using reliable test URLs that would stress the concurrency system
|
|
||||||
test_urls = [
|
|
||||||
"https://httpbin.org/delay/1", # Delayed response to increase chance of contention
|
|
||||||
"https://httpbin.org/delay/2", # Delayed response to increase chance of contention
|
|
||||||
"https://httpbin.org/uuid", # Fast response
|
|
||||||
"https://httpbin.org/json", # Fast response
|
|
||||||
]
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
print(
|
|
||||||
f"Testing concurrent crawling of {len(test_urls)} URLs (including delayed responses)..."
|
|
||||||
)
|
|
||||||
print(
|
|
||||||
"This test would have failed before the concurrency fix due to page contention."
|
|
||||||
)
|
|
||||||
|
|
||||||
# This should work correctly with our fix
|
|
||||||
results = await crawler.arun_many(urls=test_urls, config=config)
|
|
||||||
|
|
||||||
# Simple verification - if we get here without exception, the fix works
|
|
||||||
print("✓ arun_many completed successfully with concurrent crawling")
|
|
||||||
print("✓ No page contention issues detected")
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# TEST 6: Reference Pattern Test
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
|
|
||||||
async def test_reference_pattern():
|
|
||||||
"""Main test function following reference pattern."""
|
|
||||||
print("\n=== TEST 6: Reference pattern test ===")
|
|
||||||
|
|
||||||
# Configure crawler settings
|
|
||||||
crawler_cfg = CrawlerRunConfig(
|
|
||||||
cache_mode=CacheMode.BYPASS,
|
|
||||||
page_timeout=60000,
|
|
||||||
wait_until="domcontentloaded",
|
|
||||||
)
|
|
||||||
|
|
||||||
# Define URLs to crawl
|
|
||||||
URLS = [
|
|
||||||
"https://httpbin.org/html",
|
|
||||||
"https://httpbin.org/json",
|
|
||||||
"https://httpbin.org/uuid",
|
|
||||||
]
|
|
||||||
|
|
||||||
# Crawl all URLs using arun_many
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
print(f"Testing concurrent crawling of {len(URLS)} URLs...")
|
|
||||||
results = await crawler.arun_many(urls=URLS, config=crawler_cfg)
|
|
||||||
|
|
||||||
# Simple verification - if we get here without exception, the fix works
|
|
||||||
print("✓ arun_many completed successfully with concurrent crawling")
|
|
||||||
print("✅ Reference pattern test completed successfully!")
|
|
||||||
|
|
||||||
|
|
||||||
# =============================================================================
|
|
||||||
# MAIN EXECUTION
|
|
||||||
# =============================================================================
|
|
||||||
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
"""Run all tests."""
|
|
||||||
print("Running compact CDP concurrency test suite...")
|
|
||||||
print("=" * 60)
|
|
||||||
|
|
||||||
tests = [
|
|
||||||
test_basic_arun_many,
|
|
||||||
test_arun_many_with_managed_cdp_browser,
|
|
||||||
test_concurrent_crawling,
|
|
||||||
test_concurrency_fix,
|
|
||||||
test_before_after_behavior,
|
|
||||||
test_reference_pattern,
|
|
||||||
]
|
|
||||||
|
|
||||||
passed = 0
|
|
||||||
failed = 0
|
|
||||||
|
|
||||||
for test_func in tests:
|
|
||||||
try:
|
|
||||||
await test_func()
|
|
||||||
passed += 1
|
|
||||||
except Exception as e:
|
|
||||||
print(f"❌ Test failed: {str(e)}")
|
|
||||||
failed += 1
|
|
||||||
|
|
||||||
print("\n" + "=" * 60)
|
|
||||||
print(f"Test Results: {passed} passed, {failed} failed")
|
|
||||||
|
|
||||||
if failed == 0:
|
|
||||||
print("🎉 All tests passed! The CDP concurrency fix is working correctly.")
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
print(f"❌ {failed} test(s) failed!")
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
success = asyncio.run(main())
|
|
||||||
sys.exit(0 if success else 1)
|
|
||||||
@@ -1,168 +0,0 @@
|
|||||||
"""
|
|
||||||
Lightweight test to verify pyOpenSSL security fix (Issue #1545).
|
|
||||||
|
|
||||||
This test verifies the security requirements are met:
|
|
||||||
1. pyOpenSSL >= 25.3.0 is installed
|
|
||||||
2. cryptography >= 45.0.7 is installed (above vulnerable range)
|
|
||||||
3. SSL/TLS functionality works correctly
|
|
||||||
|
|
||||||
This test can run without full crawl4ai dependencies installed.
|
|
||||||
"""
|
|
||||||
|
|
||||||
import sys
|
|
||||||
from packaging import version
|
|
||||||
|
|
||||||
|
|
||||||
def test_package_versions():
|
|
||||||
"""Test that package versions meet security requirements."""
|
|
||||||
print("=" * 70)
|
|
||||||
print("TEST: Package Version Security Requirements (Issue #1545)")
|
|
||||||
print("=" * 70)
|
|
||||||
|
|
||||||
all_passed = True
|
|
||||||
|
|
||||||
# Test pyOpenSSL version
|
|
||||||
try:
|
|
||||||
import OpenSSL
|
|
||||||
pyopenssl_version = OpenSSL.__version__
|
|
||||||
print(f"\n✓ pyOpenSSL is installed: {pyopenssl_version}")
|
|
||||||
|
|
||||||
if version.parse(pyopenssl_version) >= version.parse("25.3.0"):
|
|
||||||
print(f" ✓ PASS: pyOpenSSL {pyopenssl_version} >= 25.3.0 (required)")
|
|
||||||
else:
|
|
||||||
print(f" ✗ FAIL: pyOpenSSL {pyopenssl_version} < 25.3.0 (required)")
|
|
||||||
all_passed = False
|
|
||||||
|
|
||||||
except ImportError as e:
|
|
||||||
print(f"\n✗ FAIL: pyOpenSSL not installed - {e}")
|
|
||||||
all_passed = False
|
|
||||||
|
|
||||||
# Test cryptography version
|
|
||||||
try:
|
|
||||||
import cryptography
|
|
||||||
crypto_version = cryptography.__version__
|
|
||||||
print(f"\n✓ cryptography is installed: {crypto_version}")
|
|
||||||
|
|
||||||
# The vulnerable range is >=37.0.0 & <43.0.1
|
|
||||||
# We need >= 45.0.7 to be safe
|
|
||||||
if version.parse(crypto_version) >= version.parse("45.0.7"):
|
|
||||||
print(f" ✓ PASS: cryptography {crypto_version} >= 45.0.7 (secure)")
|
|
||||||
print(f" ✓ NOT in vulnerable range (37.0.0 to 43.0.0)")
|
|
||||||
elif version.parse(crypto_version) >= version.parse("37.0.0") and version.parse(crypto_version) < version.parse("43.0.1"):
|
|
||||||
print(f" ✗ FAIL: cryptography {crypto_version} is VULNERABLE")
|
|
||||||
print(f" ✗ Version is in vulnerable range (>=37.0.0 & <43.0.1)")
|
|
||||||
all_passed = False
|
|
||||||
else:
|
|
||||||
print(f" ⚠ WARNING: cryptography {crypto_version} < 45.0.7")
|
|
||||||
print(f" ⚠ May not meet security requirements")
|
|
||||||
|
|
||||||
except ImportError as e:
|
|
||||||
print(f"\n✗ FAIL: cryptography not installed - {e}")
|
|
||||||
all_passed = False
|
|
||||||
|
|
||||||
return all_passed
|
|
||||||
|
|
||||||
|
|
||||||
def test_ssl_basic_functionality():
|
|
||||||
"""Test that SSL/TLS basic functionality works."""
|
|
||||||
print("\n" + "=" * 70)
|
|
||||||
print("TEST: SSL/TLS Basic Functionality")
|
|
||||||
print("=" * 70)
|
|
||||||
|
|
||||||
try:
|
|
||||||
import OpenSSL.SSL
|
|
||||||
|
|
||||||
# Create a basic SSL context to verify functionality
|
|
||||||
context = OpenSSL.SSL.Context(OpenSSL.SSL.TLSv1_2_METHOD)
|
|
||||||
print("\n✓ SSL Context created successfully")
|
|
||||||
print(" ✓ PASS: SSL/TLS functionality is working")
|
|
||||||
return True
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"\n✗ FAIL: SSL functionality test failed - {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def test_pyopenssl_crypto_integration():
|
|
||||||
"""Test that pyOpenSSL and cryptography integration works."""
|
|
||||||
print("\n" + "=" * 70)
|
|
||||||
print("TEST: pyOpenSSL <-> cryptography Integration")
|
|
||||||
print("=" * 70)
|
|
||||||
|
|
||||||
try:
|
|
||||||
from OpenSSL import crypto
|
|
||||||
|
|
||||||
# Generate a simple key pair to test integration
|
|
||||||
key = crypto.PKey()
|
|
||||||
key.generate_key(crypto.TYPE_RSA, 2048)
|
|
||||||
|
|
||||||
print("\n✓ Generated RSA key pair successfully")
|
|
||||||
print(" ✓ PASS: pyOpenSSL and cryptography are properly integrated")
|
|
||||||
return True
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"\n✗ FAIL: Integration test failed - {e}")
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
def main():
|
|
||||||
"""Run all security tests."""
|
|
||||||
print("\n")
|
|
||||||
print("╔" + "=" * 68 + "╗")
|
|
||||||
print("║ pyOpenSSL Security Fix Verification - Issue #1545 ║")
|
|
||||||
print("╚" + "=" * 68 + "╝")
|
|
||||||
print("\nVerifying that the pyOpenSSL update resolves the security vulnerability")
|
|
||||||
print("in the cryptography package (CVE: versions >=37.0.0 & <43.0.1)\n")
|
|
||||||
|
|
||||||
results = []
|
|
||||||
|
|
||||||
# Test 1: Package versions
|
|
||||||
results.append(("Package Versions", test_package_versions()))
|
|
||||||
|
|
||||||
# Test 2: SSL functionality
|
|
||||||
results.append(("SSL Functionality", test_ssl_basic_functionality()))
|
|
||||||
|
|
||||||
# Test 3: Integration
|
|
||||||
results.append(("pyOpenSSL-crypto Integration", test_pyopenssl_crypto_integration()))
|
|
||||||
|
|
||||||
# Summary
|
|
||||||
print("\n" + "=" * 70)
|
|
||||||
print("TEST SUMMARY")
|
|
||||||
print("=" * 70)
|
|
||||||
|
|
||||||
all_passed = True
|
|
||||||
for test_name, passed in results:
|
|
||||||
status = "✓ PASS" if passed else "✗ FAIL"
|
|
||||||
print(f"{status}: {test_name}")
|
|
||||||
all_passed = all_passed and passed
|
|
||||||
|
|
||||||
print("=" * 70)
|
|
||||||
|
|
||||||
if all_passed:
|
|
||||||
print("\n✓✓✓ ALL TESTS PASSED ✓✓✓")
|
|
||||||
print("✓ Security vulnerability is resolved")
|
|
||||||
print("✓ pyOpenSSL >= 25.3.0 is working correctly")
|
|
||||||
print("✓ cryptography >= 45.0.7 (not vulnerable)")
|
|
||||||
print("\nThe dependency update is safe to merge.\n")
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
print("\n✗✗✗ SOME TESTS FAILED ✗✗✗")
|
|
||||||
print("✗ Security requirements not met")
|
|
||||||
print("\nDo NOT merge until all tests pass.\n")
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
try:
|
|
||||||
success = main()
|
|
||||||
sys.exit(0 if success else 1)
|
|
||||||
except KeyboardInterrupt:
|
|
||||||
print("\n\nTest interrupted by user")
|
|
||||||
sys.exit(1)
|
|
||||||
except Exception as e:
|
|
||||||
print(f"\n✗ Unexpected error: {e}")
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
sys.exit(1)
|
|
||||||
@@ -1,184 +0,0 @@
|
|||||||
"""
|
|
||||||
Test script to verify pyOpenSSL update doesn't break crawl4ai functionality.
|
|
||||||
|
|
||||||
This test verifies:
|
|
||||||
1. pyOpenSSL and cryptography versions are correct and secure
|
|
||||||
2. Basic crawling functionality still works
|
|
||||||
3. HTTPS/SSL connections work properly
|
|
||||||
4. Stealth mode integration works (uses playwright-stealth internally)
|
|
||||||
|
|
||||||
Issue: #1545 - Security vulnerability in cryptography package
|
|
||||||
Fix: Updated pyOpenSSL from >=24.3.0 to >=25.3.0
|
|
||||||
Expected: cryptography package should be >=45.0.7 (above vulnerable range)
|
|
||||||
"""
|
|
||||||
|
|
||||||
import asyncio
|
|
||||||
import sys
|
|
||||||
from packaging import version
|
|
||||||
|
|
||||||
|
|
||||||
def check_versions():
|
|
||||||
"""Verify pyOpenSSL and cryptography versions meet security requirements."""
|
|
||||||
print("=" * 60)
|
|
||||||
print("STEP 1: Checking Package Versions")
|
|
||||||
print("=" * 60)
|
|
||||||
|
|
||||||
try:
|
|
||||||
import OpenSSL
|
|
||||||
pyopenssl_version = OpenSSL.__version__
|
|
||||||
print(f"✓ pyOpenSSL version: {pyopenssl_version}")
|
|
||||||
|
|
||||||
# Check pyOpenSSL >= 25.3.0
|
|
||||||
if version.parse(pyopenssl_version) >= version.parse("25.3.0"):
|
|
||||||
print(f" ✓ Version check passed: {pyopenssl_version} >= 25.3.0")
|
|
||||||
else:
|
|
||||||
print(f" ✗ Version check FAILED: {pyopenssl_version} < 25.3.0")
|
|
||||||
return False
|
|
||||||
|
|
||||||
except ImportError as e:
|
|
||||||
print(f"✗ Failed to import pyOpenSSL: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
try:
|
|
||||||
import cryptography
|
|
||||||
crypto_version = cryptography.__version__
|
|
||||||
print(f"✓ cryptography version: {crypto_version}")
|
|
||||||
|
|
||||||
# Check cryptography >= 45.0.7 (above vulnerable range)
|
|
||||||
if version.parse(crypto_version) >= version.parse("45.0.7"):
|
|
||||||
print(f" ✓ Security check passed: {crypto_version} >= 45.0.7 (not vulnerable)")
|
|
||||||
else:
|
|
||||||
print(f" ✗ Security check FAILED: {crypto_version} < 45.0.7 (potentially vulnerable)")
|
|
||||||
return False
|
|
||||||
|
|
||||||
except ImportError as e:
|
|
||||||
print(f"✗ Failed to import cryptography: {e}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
print("\n✓ All version checks passed!\n")
|
|
||||||
return True
|
|
||||||
|
|
||||||
|
|
||||||
async def test_basic_crawl():
|
|
||||||
"""Test basic crawling functionality with HTTPS site."""
|
|
||||||
print("=" * 60)
|
|
||||||
print("STEP 2: Testing Basic HTTPS Crawling")
|
|
||||||
print("=" * 60)
|
|
||||||
|
|
||||||
try:
|
|
||||||
from crawl4ai import AsyncWebCrawler
|
|
||||||
|
|
||||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
|
||||||
# Test with a simple HTTPS site (requires SSL/TLS)
|
|
||||||
print("Crawling example.com (HTTPS)...")
|
|
||||||
result = await crawler.arun(
|
|
||||||
url="https://www.example.com",
|
|
||||||
bypass_cache=True
|
|
||||||
)
|
|
||||||
|
|
||||||
if result.success:
|
|
||||||
print(f"✓ Crawl successful!")
|
|
||||||
print(f" - Status code: {result.status_code}")
|
|
||||||
print(f" - Content length: {len(result.html)} bytes")
|
|
||||||
print(f" - SSL/TLS connection: ✓ Working")
|
|
||||||
return True
|
|
||||||
else:
|
|
||||||
print(f"✗ Crawl failed: {result.error_message}")
|
|
||||||
return False
|
|
||||||
|
|
||||||
except Exception as e:
|
|
||||||
print(f"✗ Test failed with error: {e}")
|
|
||||||
import traceback
|
|
||||||
traceback.print_exc()
|
|
||||||
return False
|
|
||||||
|
|
||||||
|
|
||||||
async def test_stealth_mode():
    """Crawl with an explicit BrowserConfig to exercise the stealth-mode path.

    Depends on the playwright-stealth integration being importable.

    Returns:
        bool: True when the crawl succeeds, False on failure or exception.
    """
    print("\n" + "=" * 60)
    print("STEP 3: Testing Stealth Mode Integration")
    print("=" * 60)

    try:
        from crawl4ai import AsyncWebCrawler, BrowserConfig

        # NOTE(review): only headless/verbose are set here — no stealth flag is
        # visibly enabled; presumably BrowserConfig defaults cover it. Confirm.
        browser_config = BrowserConfig(
            headless=True,
            verbose=False,
        )

        async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
            print("Crawling with stealth mode enabled...")
            result = await crawler.arun(
                url="https://www.example.com",
                bypass_cache=True,
            )

            # Early exit on failure keeps the success path unindented.
            if not result.success:
                print(f"✗ Stealth crawl failed: {result.error_message}")
                return False

            print("✓ Stealth crawl successful!")
            print(" - Stealth mode: ✓ Working")
            return True

    except Exception as e:
        print(f"✗ Stealth test failed with error: {e}")
        import traceback
        traceback.print_exc()
        return False
|
|
||||||
|
|
||||||
async def main():
    """Run the verification suite in order; return True only if every step passes.

    Steps: (1) dependency version floor, (2) basic HTTPS crawl,
    (3) stealth-mode crawl. Each failing step prints a diagnostic and
    short-circuits the run.
    """
    banner = "=" * 58
    print("\n")
    print("╔" + banner + "╗")
    print("║ pyOpenSSL Security Update Verification Test (Issue #1545) ║")
    print("╚" + banner + "╝")
    print("\n")

    # Step 1: the installed crypto stack must meet the security floor.
    if not check_versions():
        print("\n✗ FAILED: Version requirements not met")
        return False

    # Step 2: a plain HTTPS crawl must succeed.
    if not await test_basic_crawl():
        print("\n✗ FAILED: Basic crawling test failed")
        return False

    # Step 3: the stealth-mode crawl must succeed.
    if not await test_stealth_mode():
        print("\n✗ FAILED: Stealth mode test failed")
        return False

    # Every step passed — print the summary banner.
    rule = "=" * 60
    print("\n" + rule)
    print("FINAL RESULT")
    print(rule)
    print("✓ All tests passed successfully!")
    print("✓ pyOpenSSL update is working correctly")
    print("✓ No breaking changes detected")
    print("✓ Security vulnerability resolved")
    print(rule)
    print("\n")

    return True
|
|
||||||
|
|
||||||
if __name__ == "__main__":
    # Map the suite's boolean result onto a conventional process exit code;
    # exit outside the try so SystemExit is never mistaken for a failure.
    try:
        exit_code = 0 if asyncio.run(main()) else 1
    except KeyboardInterrupt:
        print("\n\nTest interrupted by user")
        exit_code = 1
    except Exception as e:
        print(f"\n✗ Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        exit_code = 1
    sys.exit(exit_code)
Reference in New Issue
Block a user