Compare commits
1 Commit: bugfix/aru ... release/v0

| Author | SHA1 | Date |
|---|---|---|
|  | ca100c6518 |  |
@@ -1047,28 +1047,14 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
raise e
finally:
# Clean up page after crawl completes
# For managed CDP browsers, close pages that are not part of a session to prevent memory leaks
# If no session_id is given we should close the page
all_contexts = page.context.browser.contexts
total_pages = sum(len(context.pages) for context in all_contexts)

should_close_page = False

total_pages = sum(len(context.pages) for context in all_contexts)
if config.session_id:
# Session pages are kept alive for reuse
pass
elif self.browser_config.use_managed_browser:
# For managed browsers (CDP), close non-session pages to prevent tab accumulation
# This is especially important for arun_many() with multiple concurrent crawls
should_close_page = True
elif total_pages <= 1 and self.browser_config.headless:
# Keep the last page in headless mode to avoid closing the browser
elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless):
pass
else:
# For non-managed browsers, close the page
should_close_page = True

if should_close_page:
# Detach listeners before closing to prevent potential errors during close
if config.capture_network_requests:
page.remove_listener("request", handle_request_capture)
@@ -1397,10 +1383,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
try:
await self.adapter.evaluate(page,
f"""
(async () => {{
(() => {{
try {{
const removeOverlays = {remove_overlays_js};
await removeOverlays();
{remove_overlays_js}
return {{ success: true }};
}} catch (error) {{
return {{
@@ -1035,20 +1035,34 @@ class BrowserManager:
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
return page, context

# If using a managed browser, reuse the default context and create new pages
# If using a managed browser, just grab the shared default_context
if self.config.use_managed_browser:
context = self.default_context
if self.config.storage_state:
# Clone runtime state from storage to the shared context
ctx = self.default_context
context = await self.create_browser_context(crawlerRunConfig)
ctx = self.default_context  # default context, one window only
ctx = await clone_runtime_state(context, ctx, crawlerRunConfig, self.config)

# Always create a new page for concurrent safety
# The page-level isolation prevents race conditions while sharing the same context
async with self._page_lock:
page = await context.new_page()

await self._apply_stealth_to_page(page)
# Avoid concurrent new_page on shared persistent context
# See GH-1198: context.pages can be empty under races
async with self._page_lock:
page = await ctx.new_page()
await self._apply_stealth_to_page(page)
else:
context = self.default_context
pages = context.pages
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
if not page:
if pages:
page = pages[0]
else:
# Double-check under lock to avoid TOCTOU and ensure only
# one task calls new_page when pages=[] concurrently
async with self._page_lock:
pages = context.pages
if pages:
page = pages[0]
else:
page = await context.new_page()
await self._apply_stealth_to_page(page)
else:
# Otherwise, check if we have an existing context for this config
config_signature = self._make_config_signature(crawlerRunConfig)
@@ -785,54 +785,6 @@ curl http://localhost:11235/crawl/job/crawl_xyz

The response includes a `status` field: `"processing"`, `"completed"`, or `"failed"`.

#### LLM Extraction Jobs with Webhooks

The same webhook system works for LLM extraction jobs via `/llm/job`:

```bash
# Submit LLM extraction job with webhook
curl -X POST http://localhost:11235/llm/job \
  -H "Content-Type: application/json" \
  -d '{
    "url": "https://example.com/article",
    "q": "Extract the article title, author, and main points",
    "provider": "openai/gpt-4o-mini",
    "webhook_config": {
      "webhook_url": "https://myapp.com/webhooks/llm-complete",
      "webhook_data_in_payload": true,
      "webhook_headers": {
        "X-Webhook-Secret": "your-secret-token"
      }
    }
  }'

# Response: {"task_id": "llm_1234567890"}
```

**Your webhook receives:**
```json
{
  "task_id": "llm_1234567890",
  "task_type": "llm_extraction",
  "status": "completed",
  "timestamp": "2025-10-22T12:30:00.000000+00:00",
  "urls": ["https://example.com/article"],
  "data": {
    "extracted_content": {
      "title": "Understanding Web Scraping",
      "author": "John Doe",
      "main_points": ["Point 1", "Point 2", "Point 3"]
    }
  }
}
```

**Key Differences for LLM Jobs:**
- Task type is `"llm_extraction"` instead of `"crawl"`
- Extracted data is in `data.extracted_content`
- Single URL only (not an array)
- Supports schema-based extraction with the `schema` parameter
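A minimal sketch of a schema-based submission (the field names here are illustrative; as in the request-body reference later in this document, the `schema` value travels as a JSON string):

```python
import json
import requests

# Ask the LLM extraction job for a specific output shape
schema = {"title": "string", "author": "string", "main_points": ["string"]}

response = requests.post(
    "http://localhost:11235/llm/job",
    json={
        "url": "https://example.com/article",
        "q": "Extract the article title, author, and main points",
        "provider": "openai/gpt-4o-mini",
        "schema": json.dumps(schema),  # schema is passed as a JSON string
        "webhook_config": {"webhook_url": "https://myapp.com/webhooks/llm-complete"},
    },
)
print(response.json())  # e.g. {"task_id": "llm_1234567890"}
```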

> 💡 **Pro tip**: See [WEBHOOK_EXAMPLES.md](./WEBHOOK_EXAMPLES.md) for detailed examples including TypeScript client code, Flask webhook handlers, and failure handling.

---
@@ -6,16 +6,15 @@ x-base-config: &base-config
    - "11235:11235"  # Gunicorn port
  env_file:
    - .llm.env  # API keys (create from .llm.env.example)
  # Uncomment to set default environment variables (will overwrite .llm.env)
  # environment:
  #   - OPENAI_API_KEY=${OPENAI_API_KEY:-}
  #   - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
  #   - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
  #   - GROQ_API_KEY=${GROQ_API_KEY:-}
  #   - TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
  #   - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
  #   - GEMINI_API_KEY=${GEMINI_API_KEY:-}
  #   - LLM_PROVIDER=${LLM_PROVIDER:-}  # Optional: Override default provider (e.g., "anthropic/claude-3-opus")
  environment:
    - OPENAI_API_KEY=${OPENAI_API_KEY:-}
    - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
    - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
    - GROQ_API_KEY=${GROQ_API_KEY:-}
    - TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
    - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
    - GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
    - LLM_PROVIDER=${LLM_PROVIDER:-}  # Optional: Override default provider (e.g., "anthropic/claude-3-opus")
  volumes:
    - /dev/shm:/dev/shm  # Chromium performance
  deploy:
@@ -18,7 +18,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Script

2. **Install Dependencies**
   ```bash
   pip install -r requirements.txt
   pip install flask
   ```

3. **Launch the Server**

@@ -28,7 +28,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Script

4. **Open in Browser**
   ```
   http://localhost:8000
   http://localhost:8080
   ```

**🌐 Try Online**: [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)

@@ -325,7 +325,7 @@ Powers the recording functionality:

### Configuration
```python
# server.py configuration
PORT = 8000
PORT = 8080
DEBUG = True
THREADED = True
```

@@ -343,9 +343,9 @@ THREADED = True

**Port Already in Use**
```bash
# Kill existing process
lsof -ti:8000 | xargs kill -9
lsof -ti:8080 | xargs kill -9
# Or use different port
python server.py --port 8001
python server.py --port 8081
```

**Blockly Not Loading**

@@ -216,7 +216,7 @@ def get_examples():
'name': 'Handle Cookie Banner',
'description': 'Accept cookies and close newsletter popup',
'script': '''# Handle cookie banner and newsletter
GO http://127.0.0.1:8000/playground/
GO http://127.0.0.1:8080/playground/
WAIT `body` 2
IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
IF (EXISTS `.newsletter-popup`) THEN CLICK `.close`'''
File diff suppressed because it is too large

@@ -1,594 +0,0 @@
# CDP Browser Crawling

> **New in v0.7.6**: Efficient concurrent crawling with managed CDP (Chrome DevTools Protocol) browsers. Connect to a running browser instance and perform multiple crawls without spawning new windows.

## 1. Overview

When working with CDP browsers, you can connect to an existing browser instance instead of launching a new one for each crawl. This is particularly useful for:

- **Development**: Keep your browser open with DevTools for debugging
- **Persistent Sessions**: Maintain authentication across multiple crawls
- **Resource Efficiency**: Reuse a single browser instance for multiple operations
- **Concurrent Crawling**: Run multiple crawls simultaneously with proper isolation

**Key Benefits:**

- ✅ Single browser window with multiple tabs (no window clutter)
- ✅ Shared state (cookies, localStorage) across crawls
- ✅ Concurrent safety with automatic page isolation
- ✅ Automatic cleanup to prevent memory leaks
- ✅ Works seamlessly with `arun_many()` for parallel crawling

---

## 2. Quick Start

### 2.1 Starting a CDP Browser

Use the Crawl4AI CLI to start a managed CDP browser:

```bash
# Start CDP browser on default port (9222)
crwl cdp

# Start on custom port
crwl cdp -d 9223

# Start in headless mode
crwl cdp --headless
```

The browser will stay running until you press 'q' or close the terminal.
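Before pointing Crawl4AI at it, you can sanity-check that the endpoint is reachable. A small sketch (it queries `/json/version`, the standard CDP metadata endpoint; adjust the port if you started the browser elsewhere):

```python
import requests

# Quick health check against the CDP browser started with `crwl cdp`
try:
    info = requests.get("http://localhost:9222/json/version", timeout=2).json()
    print(f"Connected to: {info.get('Browser')}")
except requests.RequestException as exc:
    print(f"CDP browser not reachable on port 9222: {exc}")
```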

### 2.2 Basic CDP Connection

```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

async def main():
    # Configure CDP connection
    browser_cfg = BrowserConfig(
        browser_type="chromium",
        cdp_url="http://localhost:9222",
        verbose=True
    )

    # Crawl a single URL
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        result = await crawler.arun(
            url="https://example.com",
            config=CrawlerRunConfig()
        )
        print(f"Success: {result.success}")
        print(f"Content length: {len(result.markdown)}")

if __name__ == "__main__":
    asyncio.run(main())
```

---

## 3. Concurrent Crawling with arun_many()

The real power of CDP crawling shines with `arun_many()`. The browser manager automatically handles:

- **Page Isolation**: Each crawl gets its own tab
- **Context Sharing**: All tabs share cookies and localStorage
- **Concurrent Safety**: Proper locking prevents race conditions
- **Auto Cleanup**: Tabs are closed after crawling (except sessions)

### 3.1 Basic Concurrent Crawling

```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode

async def crawl_multiple_urls():
    # URLs to crawl
    urls = [
        "https://example.com",
        "https://httpbin.org/html",
        "https://www.python.org",
    ]

    # Configure CDP browser
    browser_cfg = BrowserConfig(
        browser_type="chromium",
        cdp_url="http://localhost:9222",
        verbose=False
    )

    # Configure crawler (bypass cache for fresh data)
    crawler_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS
    )

    # Crawl all URLs concurrently
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        results = await crawler.arun_many(
            urls=urls,
            config=crawler_cfg
        )

        # Process results
        for result in results:
            print(f"\nURL: {result.url}")
            if result.success:
                print(f"✓ Success | Content length: {len(result.markdown)}")
            else:
                print(f"✗ Failed: {result.error_message}")

if __name__ == "__main__":
    asyncio.run(crawl_multiple_urls())
```

### 3.2 With Session Management

Use sessions to maintain authentication and state across individual crawls:

```python
async def crawl_with_sessions():
    browser_cfg = BrowserConfig(
        browser_type="chromium",
        cdp_url="http://localhost:9222"
    )

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # First crawl: Login page
        login_result = await crawler.arun(
            url="https://example.com/login",
            config=CrawlerRunConfig(
                session_id="my-session",  # Session persists
                js_code="document.querySelector('#login').click();"
            )
        )

        # Second crawl: Reuse authenticated session
        dashboard_result = await crawler.arun(
            url="https://example.com/dashboard",
            config=CrawlerRunConfig(
                session_id="my-session"  # Same session, cookies preserved
            )
        )
```

---

## 4. How It Works

### 4.1 Browser Context Reuse

When using CDP browsers, Crawl4AI:

1. **Connects** to the existing browser via CDP URL
2. **Reuses** the default browser context (single window)
3. **Creates** new pages (tabs) for each crawl
4. **Locks** page creation to prevent concurrent races
5. **Cleans up** pages after crawling (unless it's a session)

```python
# Internal behavior (simplified)
if self.config.use_managed_browser:
    context = self.default_context  # Shared context

    # Thread-safe page creation
    async with self._page_lock:
        page = await context.new_page()  # New tab per crawl

# After crawl completes
if not config.session_id:
    await page.close()  # Auto cleanup
```

### 4.2 Page Lifecycle

```mermaid
graph TD
    A[Start Crawl] --> B{Has session_id?}
    B -->|Yes| C[Reuse existing page]
    B -->|No| D[Create new page/tab]
    D --> E[Navigate & Extract]
    C --> E
    E --> F{Is session?}
    F -->|Yes| G[Keep page open]
    F -->|No| H[Close page]
    H --> I[End]
    G --> I
```

### 4.3 State Sharing

All pages in the same context share:

- 🍪 **Cookies**: Authentication tokens, preferences
- 💾 **localStorage**: Client-side data storage
- 🔐 **sessionStorage**: Per-tab session data
- 🌐 **Network cache**: Shared HTTP cache

This makes it perfect for crawling authenticated sites or maintaining state across multiple pages.
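As a rough sketch of what that sharing means in practice (illustrative URL; the second crawl opens a new tab in the same default context, so it can read back what the first one stored for that origin):

```python
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig

browser_cfg = BrowserConfig(browser_type="chromium", cdp_url="http://localhost:9222")

async def show_shared_state():
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # First crawl writes a marker into localStorage from its own tab
        await crawler.arun(
            url="https://example.com",
            config=CrawlerRunConfig(js_code="localStorage.setItem('seen', 'yes');"),
        )
        # Second crawl runs in a fresh tab of the same context and can read it back
        result = await crawler.arun(
            url="https://example.com",
            config=CrawlerRunConfig(js_code="console.log(localStorage.getItem('seen'));"),
        )
        print(result.success)
```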

---

## 5. Configuration Options

### 5.1 BrowserConfig for CDP

```python
browser_cfg = BrowserConfig(
    browser_type="chromium",          # Must be "chromium" for CDP
    cdp_url="http://localhost:9222",  # CDP endpoint URL
    verbose=True,                     # Log browser operations

    # Optional: Override headers for all requests
    headers={
        "Accept-Language": "en-US,en;q=0.9",
    },

    # Optional: Set user agent
    user_agent="Mozilla/5.0 ...",

    # Optional: Enable stealth mode (requires dedicated browser)
    # enable_stealth=False,  # Not compatible with CDP
)
```

### 5.2 CrawlerRunConfig Options

```python
crawler_cfg = CrawlerRunConfig(
    # Session management
    session_id="my-session",      # Persist page across calls

    # Caching
    cache_mode=CacheMode.BYPASS,  # Fresh data every time

    # Browser location (affects timezone, locale)
    locale="en-US",
    timezone_id="America/New_York",
    geolocation={
        "latitude": 40.7128,
        "longitude": -74.0060
    },

    # Proxy (per-crawl override)
    proxy_config={
        "server": "http://proxy.example.com:8080",
        "username": "user",
        "password": "pass"
    }
)
```

---

## 6. Advanced Patterns

### 6.1 Streaming Results

Process URLs as they complete instead of waiting for all:

```python
async def stream_crawl_results():
    browser_cfg = BrowserConfig(
        browser_type="chromium",
        cdp_url="http://localhost:9222"
    )

    urls = ["https://example.com" for _ in range(100)]

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        # Stream results as they complete
        async for result in crawler.arun_many(
            urls=urls,
            config=CrawlerRunConfig(stream=True)
        ):
            if result.success:
                print(f"✓ {result.url}: {len(result.markdown)} chars")
                # Process immediately instead of waiting for all
                await save_to_database(result)
```

### 6.2 Custom Concurrency Control

```python
from crawl4ai import CrawlerRunConfig

# Limit concurrent crawls to 3
crawler_cfg = CrawlerRunConfig(
    semaphore_count=3,  # Max 3 concurrent requests
    mean_delay=0.5,     # Average 0.5s delay between requests
    max_range=1.0,      # +/- 1s random delay
)

async with AsyncWebCrawler(config=browser_cfg) as crawler:
    results = await crawler.arun_many(urls, config=crawler_cfg)
```

### 6.3 Multi-Config Crawling

Different configurations for different URL groups:

```python
from crawl4ai import CrawlerRunConfig

# Fast crawl for static pages
fast_config = CrawlerRunConfig(
    wait_until="domcontentloaded",
    page_timeout=30000
)

# Slow crawl for dynamic pages
slow_config = CrawlerRunConfig(
    wait_until="networkidle",
    page_timeout=60000,
    js_code="window.scrollTo(0, document.body.scrollHeight);"
)

configs = [fast_config, slow_config, fast_config]
urls = ["https://static.com", "https://dynamic.com", "https://static2.com"]

async with AsyncWebCrawler(config=browser_cfg) as crawler:
    results = await crawler.arun_many(urls, configs=configs)
```

---

## 7. Best Practices

### 7.1 Resource Management

✅ **DO:**
```python
# Use context manager for automatic cleanup
async with AsyncWebCrawler(config=browser_cfg) as crawler:
    results = await crawler.arun_many(urls)
# Browser connection closed automatically
```

❌ **DON'T:**
```python
# Manual management risks resource leaks
crawler = AsyncWebCrawler(config=browser_cfg)
await crawler.start()
results = await crawler.arun_many(urls)
# Forgot to call crawler.close()!
```

### 7.2 Session Management

✅ **DO:**
```python
# Use sessions for related crawls
config = CrawlerRunConfig(session_id="user-flow")
await crawler.arun(login_url, config=config)
await crawler.arun(dashboard_url, config=config)
await crawler.kill_session("user-flow")  # Clean up when done
```

❌ **DON'T:**
```python
# Creating new session IDs unnecessarily
for i in range(100):
    config = CrawlerRunConfig(session_id=f"session-{i}")
    await crawler.arun(url, config=config)
# 100 unclosed sessions accumulate!
```

### 7.3 Error Handling

```python
async def robust_crawl(urls):
    browser_cfg = BrowserConfig(
        browser_type="chromium",
        cdp_url="http://localhost:9222"
    )

    try:
        async with AsyncWebCrawler(config=browser_cfg) as crawler:
            results = await crawler.arun_many(urls)

            # Separate successes and failures
            successes = [r for r in results if r.success]
            failures = [r for r in results if not r.success]

            print(f"✓ {len(successes)} succeeded")
            print(f"✗ {len(failures)} failed")

            # Retry failures with different config
            if failures:
                retry_urls = [r.url for r in failures]
                retry_config = CrawlerRunConfig(
                    page_timeout=120000,  # Longer timeout
                    wait_until="networkidle"
                )
                retry_results = await crawler.arun_many(
                    retry_urls,
                    config=retry_config
                )

            return successes + retry_results

    except Exception as e:
        print(f"Fatal error: {e}")
        return []
```

---

## 8. Troubleshooting

### 8.1 Connection Issues

**Problem**: `Cannot connect to CDP browser`

```bash
# Check CDP browser is running
$ lsof -i :9222
# Should show: Chromium PID USER FD TYPE ...

# Or start it if not running
$ crwl cdp
```

**Problem**: `ERR_ABORTED` errors in concurrent crawls

✅ **Fixed in v0.7.6**: This issue has been resolved. Pages are now properly isolated with locking.

### 8.2 Performance Issues

**Problem**: Too many open tabs

```python
# Ensure you're not using session_id for everything
config = CrawlerRunConfig()  # No session_id
await crawler.arun_many(urls, config=config)
# Pages auto-close after crawling
```

**Problem**: Memory leaks

```python
# Always use context manager
async with AsyncWebCrawler(config=browser_cfg) as crawler:
    # Crawling code here
    pass
# Automatic cleanup on exit
```

### 8.3 State Issues

**Problem**: Cookies not persisting

```python
# Use the same context (automatic with CDP)
browser_cfg = BrowserConfig(cdp_url="http://localhost:9222")
# All crawls share cookies automatically
```

**Problem**: Need isolated state

```python
# Use different CDP endpoints or non-CDP browsers
browser_cfg_1 = BrowserConfig(cdp_url="http://localhost:9222")
browser_cfg_2 = BrowserConfig(cdp_url="http://localhost:9223")
# Completely isolated browsers
```

---

## 9. Comparison: CDP vs Regular Browsers

| Feature | CDP Browser | Regular Browser |
|---------|-------------|-----------------|
| **Window Management** | ✅ Single window, multiple tabs | ❌ New window per context |
| **Startup Time** | ✅ Instant (already running) | ⏱️ ~2-3s per launch |
| **State Sharing** | ✅ Shared cookies/localStorage | ⚠️ Isolated by default |
| **Concurrent Safety** | ✅ Automatic locking | ✅ Separate processes |
| **Memory Usage** | ✅ Lower (shared browser) | ⚠️ Higher (multiple processes) |
| **Session Persistence** | ✅ Native support | ✅ Via session_id |
| **Stealth Mode** | ❌ Not compatible | ✅ Full support |
| **Best For** | Development, authenticated crawls | Production, isolated crawls |

---

## 10. Real-World Examples

### 10.1 E-commerce Product Scraping

```python
async def scrape_products():
    browser_cfg = BrowserConfig(
        browser_type="chromium",
        cdp_url="http://localhost:9222"
    )

    # Get product URLs from category page
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        category_result = await crawler.arun(
            url="https://shop.example.com/category",
            config=CrawlerRunConfig(
                css_selector=".product-link"
            )
        )

        # Extract product URLs
        product_urls = extract_urls(category_result.links)

        # Crawl all products concurrently
        product_results = await crawler.arun_many(
            urls=product_urls,
            config=CrawlerRunConfig(
                css_selector=".product-details",
                semaphore_count=5  # Polite crawling
            )
        )

        return [extract_product_data(r) for r in product_results]
```

### 10.2 News Article Monitoring

```python
import asyncio
from datetime import datetime

async def monitor_news_sites():
    browser_cfg = BrowserConfig(
        browser_type="chromium",
        cdp_url="http://localhost:9222"
    )

    news_sites = [
        "https://news.site1.com",
        "https://news.site2.com",
        "https://news.site3.com"
    ]

    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        while True:
            print(f"\n[{datetime.now()}] Checking for updates...")

            results = await crawler.arun_many(
                urls=news_sites,
                config=CrawlerRunConfig(
                    cache_mode=CacheMode.BYPASS,  # Always fresh
                    css_selector=".article-headline"
                )
            )

            for result in results:
                if result.success:
                    headlines = extract_headlines(result)
                    for headline in headlines:
                        if is_new(headline):
                            notify_user(headline)

            # Check every 5 minutes
            await asyncio.sleep(300)
```

---

## 11. Summary

CDP browser crawling offers:

- 🚀 **Performance**: Faster startup, lower resource usage
- 🔄 **State Management**: Shared cookies and authentication
- 🎯 **Concurrent Safety**: Automatic page isolation and cleanup
- 💻 **Developer Friendly**: Visual debugging with DevTools

**When to use CDP:**
- Development and debugging
- Authenticated crawling (login required)
- Sequential crawls needing state
- Resource-constrained environments

**When to use regular browsers:**
- Production deployments
- Maximum isolation required
- Stealth mode needed
- Distributed/cloud crawling

For most use cases, **CDP browsers provide the best balance** of performance, convenience, and safety.
@@ -82,42 +82,6 @@ If you installed Crawl4AI (which installs Playwright under the hood), you alread

---

### Creating a Profile Using the Crawl4AI CLI (Easiest)

If you prefer a guided, interactive setup, use the built-in CLI to create and manage persistent browser profiles.

1. Launch the profile manager:
   ```bash
   crwl profiles
   ```

2. Choose "Create new profile" and enter a profile name. A Chromium window opens so you can log in to sites and configure settings. When finished, return to the terminal and press `q` to save the profile.

3. Profiles are saved under `~/.crawl4ai/profiles/<profile_name>` (for example: `/home/<you>/.crawl4ai/profiles/test_profile_1`) along with a `storage_state.json` for cookies and session data.

4. Optionally, choose "List profiles" in the CLI to view available profiles and their paths.

5. Use the saved path with `BrowserConfig.user_data_dir`:
   ```python
   from crawl4ai import AsyncWebCrawler, BrowserConfig

   profile_path = "/home/<you>/.crawl4ai/profiles/test_profile_1"

   browser_config = BrowserConfig(
       headless=True,
       use_managed_browser=True,
       user_data_dir=profile_path,
       browser_type="chromium",
   )

   async with AsyncWebCrawler(config=browser_config) as crawler:
       result = await crawler.arun(url="https://example.com/private")
   ```

The CLI also supports listing and deleting profiles, and even testing a crawl directly from the menu.

---

## 3. Using Managed Browsers in Crawl4AI

Once you have a data directory with your session data, pass it to **`BrowserConfig`**:
@@ -18,7 +18,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Script

2. **Install Dependencies**
   ```bash
   pip install -r requirements.txt
   pip install flask
   ```

3. **Launch the Server**

@@ -28,7 +28,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Script

4. **Open in Browser**
   ```
   http://localhost:8000
   http://localhost:8080
   ```

**🌐 Try Online**: [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)

@@ -325,7 +325,7 @@ Powers the recording functionality:

### Configuration
```python
# server.py configuration
PORT = 8000
PORT = 8080
DEBUG = True
THREADED = True
```

@@ -343,9 +343,9 @@ THREADED = True

**Port Already in Use**
```bash
# Kill existing process
lsof -ti:8000 | xargs kill -9
lsof -ti:8080 | xargs kill -9
# Or use different port
python server.py --port 8001
python server.py --port 8081
```

**Blockly Not Loading**

@@ -216,7 +216,7 @@ def get_examples():
'name': 'Handle Cookie Banner',
'description': 'Accept cookies and close newsletter popup',
'script': '''# Handle cookie banner and newsletter
GO http://127.0.0.1:8000/playground/
GO http://127.0.0.1:8080/playground/
WAIT `body` 2
IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
IF (EXISTS `.newsletter-popup`) THEN CLICK `.close`'''

@@ -283,7 +283,7 @@ WAIT `.success-message` 5'''
return jsonify(examples)

if __name__ == '__main__':
port = int(os.environ.get('PORT', 8000))
port = int(os.environ.get('PORT', 8080))
print(f"""
╔══════════════════════════════════════════════════════════╗
║          C4A-Script Interactive Tutorial Server            ║

@@ -69,12 +69,12 @@ The tutorial includes a Flask-based web interface with:
cd docs/examples/c4a_script/tutorial/

# Install dependencies
pip install -r requirements.txt
pip install flask

# Launch the tutorial server
python server.py
python app.py

# Open http://localhost:8000 in your browser
# Open http://localhost:5000 in your browser
```

## Core Concepts

@@ -111,8 +111,8 @@ CLICK `.submit-btn`
# By attribute
CLICK `button[type="submit"]`

# By accessible attributes
CLICK `button[aria-label="Search"][title="Search"]`
# By text content
CLICK `button:contains("Sign In")`

# Complex selectors
CLICK `.form-container input[name="email"]`
@@ -27,14 +27,6 @@
- [Hook Response Information](#hook-response-information)
- [Error Handling](#error-handling)
- [Hooks Utility: Function-Based Approach (Python)](#hooks-utility-function-based-approach-python)
- [Job Queue & Webhook API](#job-queue-webhook-api)
  - [Why Use the Job Queue API?](#why-use-the-job-queue-api)
  - [Available Endpoints](#available-endpoints)
  - [Webhook Configuration](#webhook-configuration)
  - [Usage Examples](#usage-examples)
  - [Webhook Best Practices](#webhook-best-practices)
  - [Use Cases](#use-cases)
  - [Troubleshooting](#troubleshooting)
- [Dockerfile Parameters](#dockerfile-parameters)
- [Using the API](#using-the-api)
  - [Playground Interface](#playground-interface)
@@ -1118,464 +1110,6 @@ if __name__ == "__main__":

---

## Job Queue & Webhook API

The Docker deployment includes a powerful asynchronous job queue system with webhook support for both crawling and LLM extraction tasks. Instead of waiting for long-running operations to complete, submit jobs and receive real-time notifications via webhooks when they finish.

### Why Use the Job Queue API?

**Traditional Synchronous API (`/crawl`):**
- Client waits for entire crawl to complete
- Timeout issues with long-running crawls
- Resource blocking during execution
- Constant polling required for status updates

**Asynchronous Job Queue API (`/crawl/job`, `/llm/job`):**
- ✅ Submit job and continue immediately
- ✅ No timeout concerns for long operations
- ✅ Real-time webhook notifications on completion
- ✅ Better resource utilization
- ✅ Perfect for batch processing
- ✅ Ideal for microservice architectures

### Available Endpoints

#### 1. Crawl Job Endpoint

```
POST /crawl/job
```

Submit an asynchronous crawl job with optional webhook notification.

**Request Body:**
```json
{
  "urls": ["https://example.com"],
  "cache_mode": "bypass",
  "extraction_strategy": {
    "type": "JsonCssExtractionStrategy",
    "schema": {
      "title": "h1",
      "content": ".article-body"
    }
  },
  "webhook_config": {
    "webhook_url": "https://your-app.com/webhook/crawl-complete",
    "webhook_data_in_payload": true,
    "webhook_headers": {
      "X-Webhook-Secret": "your-secret-token",
      "X-Custom-Header": "value"
    }
  }
}
```

**Response:**
```json
{
  "task_id": "crawl_1698765432",
  "message": "Crawl job submitted"
}
```

#### 2. LLM Extraction Job Endpoint

```
POST /llm/job
```

Submit an asynchronous LLM extraction job with optional webhook notification.

**Request Body:**
```json
{
  "url": "https://example.com/article",
  "q": "Extract the article title, author, publication date, and main points",
  "provider": "openai/gpt-4o-mini",
  "schema": "{\"title\": \"string\", \"author\": \"string\", \"date\": \"string\", \"points\": [\"string\"]}",
  "cache": false,
  "webhook_config": {
    "webhook_url": "https://your-app.com/webhook/llm-complete",
    "webhook_data_in_payload": true,
    "webhook_headers": {
      "X-Webhook-Secret": "your-secret-token"
    }
  }
}
```

**Response:**
```json
{
  "task_id": "llm_1698765432",
  "message": "LLM job submitted"
}
```

#### 3. Job Status Endpoint

```
GET /job/{task_id}
```

Check the status and retrieve results of a submitted job.

**Response (In Progress):**
```json
{
  "task_id": "crawl_1698765432",
  "status": "processing",
  "message": "Job is being processed"
}
```

**Response (Completed):**
```json
{
  "task_id": "crawl_1698765432",
  "status": "completed",
  "result": {
    "markdown": "# Page Title\n\nContent...",
    "extracted_content": {...},
    "links": {...}
  }
}
```

### Webhook Configuration

Webhooks provide real-time notifications when your jobs complete, eliminating the need for constant polling.

#### Webhook Config Parameters

| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `webhook_url` | string | Yes | Your HTTP(S) endpoint to receive notifications |
| `webhook_data_in_payload` | boolean | No | Include full result data in webhook payload (default: false) |
| `webhook_headers` | object | No | Custom headers for authentication/identification |

#### Webhook Payload Format

**Success Notification (Crawl Job):**
```json
{
  "task_id": "crawl_1698765432",
  "task_type": "crawl",
  "status": "completed",
  "timestamp": "2025-10-22T12:30:00.000000+00:00",
  "urls": ["https://example.com"],
  "data": {
    "markdown": "# Page content...",
    "extracted_content": {...},
    "links": {...}
  }
}
```

**Success Notification (LLM Job):**
```json
{
  "task_id": "llm_1698765432",
  "task_type": "llm_extraction",
  "status": "completed",
  "timestamp": "2025-10-22T12:30:00.000000+00:00",
  "urls": ["https://example.com/article"],
  "data": {
    "extracted_content": {
      "title": "Understanding Web Scraping",
      "author": "John Doe",
      "date": "2025-10-22",
      "points": ["Point 1", "Point 2"]
    }
  }
}
```

**Failure Notification:**
```json
{
  "task_id": "crawl_1698765432",
  "task_type": "crawl",
  "status": "failed",
  "timestamp": "2025-10-22T12:30:00.000000+00:00",
  "urls": ["https://example.com"],
  "error": "Connection timeout after 30 seconds"
}
```

#### Webhook Delivery & Retry

- **Delivery Method:** HTTP POST to your `webhook_url`
- **Content-Type:** `application/json`
- **Retry Policy:** Exponential backoff with 5 attempts
  - Attempt 1: Immediate
  - Attempt 2: 1 second delay
  - Attempt 3: 2 seconds delay
  - Attempt 4: 4 seconds delay
  - Attempt 5: 8 seconds delay
- **Success Status Codes:** 200-299
- **Custom Headers:** Your `webhook_headers` are included in every request
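In practice your endpoint controls retries through its status code; a minimal sketch building on the Flask handler shown below (any non-2xx response makes the server retry on the schedule above):

```python
@app.route('/webhook/flaky', methods=['POST'])
def flaky_webhook():
    payload = request.json
    try:
        process_webhook(payload)  # your own handling logic
    except Exception:
        # A non-2xx response asks the job server to retry later
        return jsonify({"status": "error, please retry"}), 500
    return jsonify({"status": "received"}), 200
```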

### Usage Examples

#### Example 1: Python with Webhook Handler (Flask)

```python
from flask import Flask, request, jsonify
import requests

app = Flask(__name__)

# Webhook handler
@app.route('/webhook/crawl-complete', methods=['POST'])
def handle_crawl_webhook():
    payload = request.json

    if payload['status'] == 'completed':
        print(f"✅ Job {payload['task_id']} completed!")
        print(f"Task type: {payload['task_type']}")

        # Access the crawl results
        if 'data' in payload:
            markdown = payload['data'].get('markdown', '')
            extracted = payload['data'].get('extracted_content', {})
            print(f"Extracted {len(markdown)} characters")
            print(f"Structured data: {extracted}")
    else:
        print(f"❌ Job {payload['task_id']} failed: {payload.get('error')}")

    return jsonify({"status": "received"}), 200

# Submit a crawl job with webhook
def submit_crawl_job():
    response = requests.post(
        "http://localhost:11235/crawl/job",
        json={
            "urls": ["https://example.com"],
            "extraction_strategy": {
                "type": "JsonCssExtractionStrategy",
                "schema": {
                    "name": "Example Schema",
                    "baseSelector": "body",
                    "fields": [
                        {"name": "title", "selector": "h1", "type": "text"},
                        {"name": "description", "selector": "meta[name='description']", "type": "attribute", "attribute": "content"}
                    ]
                }
            },
            "webhook_config": {
                "webhook_url": "https://your-app.com/webhook/crawl-complete",
                "webhook_data_in_payload": True,
                "webhook_headers": {
                    "X-Webhook-Secret": "your-secret-token"
                }
            }
        }
    )

    task_id = response.json()['task_id']
    print(f"Job submitted: {task_id}")
    return task_id

if __name__ == '__main__':
    app.run(port=5000)
```

#### Example 2: LLM Extraction with Webhooks

```python
import requests

def submit_llm_job_with_webhook():
    response = requests.post(
        "http://localhost:11235/llm/job",
        json={
            "url": "https://example.com/article",
            "q": "Extract the article title, author, and main points",
            "provider": "openai/gpt-4o-mini",
            "webhook_config": {
                "webhook_url": "https://your-app.com/webhook/llm-complete",
                "webhook_data_in_payload": True,
                "webhook_headers": {
                    "X-Webhook-Secret": "your-secret-token"
                }
            }
        }
    )

    task_id = response.json()['task_id']
    print(f"LLM job submitted: {task_id}")
    return task_id

# Webhook handler for LLM jobs
@app.route('/webhook/llm-complete', methods=['POST'])
def handle_llm_webhook():
    payload = request.json

    if payload['status'] == 'completed':
        extracted = payload['data']['extracted_content']
        print(f"✅ LLM extraction completed!")
        print(f"Results: {extracted}")
    else:
        print(f"❌ LLM extraction failed: {payload.get('error')}")

    return jsonify({"status": "received"}), 200
```

#### Example 3: Without Webhooks (Polling)

If you don't use webhooks, you can poll for results:

```python
import requests
import time

# Submit job
response = requests.post(
    "http://localhost:11235/crawl/job",
    json={"urls": ["https://example.com"]}
)
task_id = response.json()['task_id']

# Poll for results
while True:
    result = requests.get(f"http://localhost:11235/job/{task_id}")
    data = result.json()

    if data['status'] == 'completed':
        print("Job completed!")
        print(data['result'])
        break
    elif data['status'] == 'failed':
        print(f"Job failed: {data.get('error')}")
        break

    print("Still processing...")
    time.sleep(2)
```

#### Example 4: Global Webhook Configuration

Set a default webhook URL in your `config.yml` to avoid repeating it in every request:

```yaml
# config.yml
api:
  crawler:
    # ... other settings ...
    webhook:
      default_url: "https://your-app.com/webhook/default"
      default_headers:
        X-Webhook-Secret: "your-secret-token"
```

Then submit jobs without webhook config:

```python
# Uses the global webhook configuration
response = requests.post(
    "http://localhost:11235/crawl/job",
    json={"urls": ["https://example.com"]}
)
```

### Webhook Best Practices

1. **Authentication:** Always use custom headers for webhook authentication
   ```json
   "webhook_headers": {
     "X-Webhook-Secret": "your-secret-token"
   }
   ```

2. **Idempotency:** Design your webhook handler to be idempotent (safe to receive duplicate notifications); see the sketch after this list

3. **Fast Response:** Return HTTP 200 quickly; process data asynchronously if needed
   ```python
   @app.route('/webhook', methods=['POST'])
   def webhook():
       payload = request.json
       # Queue for background processing
       queue.enqueue(process_webhook, payload)
       return jsonify({"status": "received"}), 200
   ```

4. **Error Handling:** Handle both success and failure notifications
   ```python
   if payload['status'] == 'completed':
       ...  # Process success
   elif payload['status'] == 'failed':
       ...  # Log error, retry, or alert
   ```

5. **Validation:** Verify webhook authenticity using custom headers
   ```python
   secret = request.headers.get('X-Webhook-Secret')
   if secret != os.environ['EXPECTED_SECRET']:
       return jsonify({"error": "Unauthorized"}), 401
   ```

6. **Logging:** Log webhook deliveries for debugging
   ```python
   logger.info(f"Webhook received: {payload['task_id']} - {payload['status']}")
   ```
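
The idempotency advice in point 2 can be as simple as remembering which task IDs you have already handled. A minimal in-memory sketch (a real deployment would keep this set in Redis or a database):

```python
processed_tasks = set()  # replace with persistent storage in production

@app.route('/webhook/idempotent', methods=['POST'])
def idempotent_webhook():
    payload = request.json
    task_id = payload['task_id']
    if task_id in processed_tasks:
        # Duplicate delivery (e.g. a retry after a slow response): acknowledge and skip
        return jsonify({"status": "duplicate ignored"}), 200
    processed_tasks.add(task_id)
    process_webhook(payload)  # your actual handling logic
    return jsonify({"status": "received"}), 200
```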

### Use Cases

**1. Batch Processing**
Submit hundreds of URLs and get notified as each completes:
```python
urls = ["https://site1.com", "https://site2.com", ...]
for url in urls:
    submit_crawl_job(url, webhook_url="https://app.com/webhook")
```

**2. Microservice Integration**
Integrate with event-driven architectures:
```python
# Service A submits job
task_id = submit_crawl_job(url)

# Service B receives webhook and triggers next step
@app.route('/webhook')
def webhook():
    process_result(request.json)
    trigger_next_service()
    return "OK", 200
```

**3. Long-Running Extractions**
Handle complex LLM extractions without timeouts:
```python
submit_llm_job(
    url="https://long-article.com",
    q="Comprehensive summary with key points and analysis",
    webhook_url="https://app.com/webhook/llm"
)
```

### Troubleshooting

**Webhook not receiving notifications?**
- Check your webhook URL is publicly accessible
- Verify firewall/security group settings
- Use webhook testing tools like webhook.site for debugging
- Check server logs for delivery attempts
- Ensure your handler returns a 200-299 status code

**Job stuck in processing?**
- Check Redis connection: `docker logs <container_name> | grep redis`
- Verify worker processes: `docker exec <container_name> ps aux | grep worker`
- Check server logs: `docker logs <container_name>`

**Need to cancel a job?**
Jobs are processed asynchronously. If you need to cancel:
- Delete the task from Redis (requires Redis CLI access)
- Or implement a cancellation endpoint in your webhook handler
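One app-side way to get cancel-like behaviour without touching Redis is to record the task IDs you no longer care about and drop their webhooks on arrival. A sketch (the job still runs to completion on the server; its result is simply ignored):

```python
cancelled_tasks = set()

@app.route('/jobs/<task_id>/cancel', methods=['POST'])
def cancel_job(task_id):
    # Mark the task as unwanted; its webhook will be discarded when it fires
    cancelled_tasks.add(task_id)
    return jsonify({"status": "cancellation recorded"}), 200

@app.route('/webhook/cancellable', methods=['POST'])
def cancellable_webhook():
    payload = request.json
    if payload['task_id'] in cancelled_tasks:
        return jsonify({"status": "ignored (cancelled)"}), 200
    process_webhook(payload)
    return jsonify({"status": "received"}), 200
```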

---

## Dockerfile Parameters

You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file.
@@ -57,7 +57,7 @@

Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for large language models, AI agents, and data pipelines. Fully open source, flexible, and built for real-time performance, **Crawl4AI** empowers developers with unmatched speed, precision, and deployment ease.

> Enjoy using Crawl4AI? Consider **[becoming a sponsor](https://github.com/sponsors/unclecode)** to support ongoing development and community growth!
> **Note**: If you're looking for the old documentation, you can access it [here](https://old.docs.crawl4ai.com).

## 🆕 AI Assistant Skill Now Available!
@@ -529,19 +529,8 @@ class AdminDashboard {
|
||||
</label>
|
||||
</div>
|
||||
<div class="form-group full-width">
|
||||
<label>Long Description (Markdown - Overview tab)</label>
|
||||
<textarea id="form-long-description" rows="10" placeholder="Enter detailed description with markdown formatting...">${app?.long_description || ''}</textarea>
|
||||
<small>Markdown support: **bold**, *italic*, [links](url), # headers, code blocks, lists</small>
|
||||
</div>
|
||||
<div class="form-group full-width">
|
||||
<label>Integration Guide (Markdown - Integration tab)</label>
|
||||
<textarea id="form-integration" rows="20" placeholder="Enter integration guide with installation, examples, and code snippets using markdown...">${app?.integration_guide || ''}</textarea>
|
||||
<small>Single markdown field with installation, examples, and complete guide. Code blocks get auto copy buttons.</small>
|
||||
</div>
|
||||
<div class="form-group full-width">
|
||||
<label>Documentation (Markdown - Documentation tab)</label>
|
||||
<textarea id="form-documentation" rows="20" placeholder="Enter documentation with API reference, examples, and best practices using markdown...">${app?.documentation || ''}</textarea>
|
||||
<small>Full documentation with API reference, examples, best practices, etc.</small>
|
||||
<label>Integration Guide</label>
|
||||
<textarea id="form-integration" rows="10">${app?.integration_guide || ''}</textarea>
|
||||
</div>
|
||||
</div>
|
||||
`;
|
||||
@@ -723,9 +712,7 @@ class AdminDashboard {
|
||||
data.contact_email = document.getElementById('form-email').value;
|
||||
data.featured = document.getElementById('form-featured').checked ? 1 : 0;
|
||||
data.sponsored = document.getElementById('form-sponsored').checked ? 1 : 0;
|
||||
data.long_description = document.getElementById('form-long-description').value;
|
||||
data.integration_guide = document.getElementById('form-integration').value;
|
||||
data.documentation = document.getElementById('form-documentation').value;
|
||||
} else if (type === 'articles') {
|
||||
data.title = document.getElementById('form-title').value;
|
||||
data.slug = this.generateSlug(data.title);
|
||||
|
||||
@@ -278,12 +278,12 @@
|
||||
}
|
||||
|
||||
.tab-content {
|
||||
display: none !important;
|
||||
display: none;
|
||||
padding: 2rem;
|
||||
}
|
||||
|
||||
.tab-content.active {
|
||||
display: block !important;
|
||||
display: block;
|
||||
}
|
||||
|
||||
/* Overview Layout */
|
||||
@@ -510,31 +510,6 @@
|
||||
line-height: 1.5;
|
||||
}
|
||||
|
||||
/* Markdown rendered code blocks */
|
||||
.integration-content pre,
|
||||
.docs-content pre {
|
||||
background: var(--bg-dark);
|
||||
border: 1px solid var(--border-color);
|
||||
margin: 1rem 0;
|
||||
padding: 1rem;
|
||||
padding-top: 2.5rem; /* Space for copy button */
|
||||
overflow-x: auto;
|
||||
position: relative;
|
||||
max-height: none; /* Remove any height restrictions */
|
||||
height: auto; /* Allow content to expand */
|
||||
}
|
||||
|
||||
.integration-content pre code,
|
||||
.docs-content pre code {
|
||||
background: transparent;
|
||||
padding: 0;
|
||||
color: var(--text-secondary);
|
||||
font-size: 0.875rem;
|
||||
line-height: 1.5;
|
||||
white-space: pre; /* Preserve whitespace and line breaks */
|
||||
display: block;
|
||||
}
|
||||
|
||||
/* Feature Grid */
|
||||
.feature-grid {
|
||||
display: grid;
|
||||
|
||||
@@ -73,14 +73,27 @@
|
||||
<div class="tabs">
|
||||
<button class="tab-btn active" data-tab="overview">Overview</button>
|
||||
<button class="tab-btn" data-tab="integration">Integration</button>
|
||||
<!-- <button class="tab-btn" data-tab="docs">Documentation</button>
|
||||
<button class="tab-btn" data-tab="support">Support</button> -->
|
||||
<button class="tab-btn" data-tab="docs">Documentation</button>
|
||||
<button class="tab-btn" data-tab="support">Support</button>
|
||||
</div>
|
||||
|
||||
<section id="overview-tab" class="tab-content active">
|
||||
<div class="overview-columns">
|
||||
<div class="overview-main">
|
||||
<h2>Overview</h2>
|
||||
<div id="app-overview">Overview content goes here.</div>
|
||||
|
||||
<h3>Key Features</h3>
|
||||
<ul id="app-features" class="features-list">
|
||||
<li>Feature 1</li>
|
||||
<li>Feature 2</li>
|
||||
<li>Feature 3</li>
|
||||
</ul>
|
||||
|
||||
<h3>Use Cases</h3>
|
||||
<div id="app-use-cases" class="use-cases">
|
||||
<p>Describe how this app can help your workflow.</p>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<aside class="sidebar">
|
||||
@@ -129,16 +142,37 @@
|
||||
</section>
|
||||
|
||||
<section id="integration-tab" class="tab-content">
|
||||
<div class="integration-content" id="app-integration">
|
||||
<div class="integration-content">
|
||||
<h2>Integration Guide</h2>
|
||||
|
||||
<h3>Installation</h3>
|
||||
<div class="code-block">
|
||||
<pre><code id="install-code"># Installation instructions will appear here</code></pre>
|
||||
</div>
|
||||
|
||||
<h3>Basic Usage</h3>
|
||||
<div class="code-block">
|
||||
<pre><code id="usage-code"># Usage example will appear here</code></pre>
|
||||
</div>
|
||||
|
||||
<h3>Complete Integration Example</h3>
|
||||
<div class="code-block">
|
||||
<button class="copy-btn" id="copy-integration">Copy</button>
|
||||
<pre><code id="integration-code"># Complete integration guide will appear here</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- <section id="docs-tab" class="tab-content">
|
||||
<div class="docs-content" id="app-docs">
|
||||
<section id="docs-tab" class="tab-content">
|
||||
<div class="docs-content">
|
||||
<h2>Documentation</h2>
|
||||
<div id="app-docs" class="doc-sections">
|
||||
<p>Documentation coming soon.</p>
|
||||
</div>
|
||||
</div>
|
||||
</section> -->
|
||||
</section>
|
||||
|
||||
<!-- <section id="support-tab" class="tab-content">
|
||||
<section id="support-tab" class="tab-content">
|
||||
<div class="docs-content">
|
||||
<h2>Support</h2>
|
||||
<div class="support-grid">
|
||||
@@ -156,7 +190,7 @@
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section> -->
|
||||
</section>
|
||||
</div>
|
||||
|
||||
</main>
|
||||
|
||||
@@ -112,7 +112,7 @@ class AppDetailPage {
    }

        // Contact
        document.getElementById('app-contact') && (document.getElementById('app-contact').textContent = this.appData.contact_email || 'Not available');
        document.getElementById('app-contact').textContent = this.appData.contact_email || 'Not available';

        // Sidebar info
        document.getElementById('sidebar-downloads').textContent = this.formatNumber(this.appData.downloads || 0);
@@ -123,134 +123,146 @@ class AppDetailPage {
        document.getElementById('sidebar-pricing').textContent = this.appData.pricing || 'Free';
        document.getElementById('sidebar-contact').textContent = this.appData.contact_email || 'contact@example.com';

        // Render tab contents from database fields
        this.renderTabContents();
        // Integration guide
        this.renderIntegrationGuide();
    }

    renderTabContents() {
        // Overview tab - use long_description from database
        const overviewDiv = document.getElementById('app-overview');
        if (overviewDiv) {
            if (this.appData.long_description) {
                overviewDiv.innerHTML = this.renderMarkdown(this.appData.long_description);
            } else {
                overviewDiv.innerHTML = `<p>${this.appData.description || 'No overview available.'}</p>`;
    renderIntegrationGuide() {
        // Installation code
        const installCode = document.getElementById('install-code');
        if (installCode) {
            if (this.appData.type === 'Open Source' && this.appData.github_url) {
                installCode.textContent = `# Clone from GitHub
git clone ${this.appData.github_url}

# Install dependencies
pip install -r requirements.txt`;
            } else if (this.appData.name.toLowerCase().includes('api')) {
                installCode.textContent = `# Install via pip
pip install ${this.appData.slug}

# Or install from source
pip install git+${this.appData.github_url || 'https://github.com/example/repo'}`;
            }
        }

        // Integration tab - use integration_guide field from database
        const integrationDiv = document.getElementById('app-integration');
        if (integrationDiv) {
            if (this.appData.integration_guide) {
                integrationDiv.innerHTML = this.renderMarkdown(this.appData.integration_guide);
                // Add copy buttons to all code blocks
                this.addCopyButtonsToCodeBlocks(integrationDiv);
            } else {
                integrationDiv.innerHTML = '<p>Integration guide not yet available. Please check the official website for details.</p>';
        // Usage code - customize based on category
        const usageCode = document.getElementById('usage-code');
        if (usageCode) {
            if (this.appData.category === 'Browser Automation') {
                usageCode.textContent = `from crawl4ai import AsyncWebCrawler
from ${this.appData.slug.replace(/-/g, '_')} import ${this.appData.name.replace(/\s+/g, '')}

async def main():
    # Initialize ${this.appData.name}
    automation = ${this.appData.name.replace(/\s+/g, '')}()

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://example.com",
            browser_config=automation.config,
            wait_for="css:body"
        )
        print(result.markdown)`;
            } else if (this.appData.category === 'Proxy Services') {
                usageCode.textContent = `from crawl4ai import AsyncWebCrawler
import ${this.appData.slug.replace(/-/g, '_')}

# Configure proxy
proxy_config = {
    "server": "${this.appData.website_url || 'https://proxy.example.com'}",
    "username": "your_username",
    "password": "your_password"
}

async with AsyncWebCrawler(proxy=proxy_config) as crawler:
    result = await crawler.arun(
        url="https://example.com",
        bypass_cache=True
    )
    print(result.status_code)`;
            } else if (this.appData.category === 'LLM Integration') {
                usageCode.textContent = `from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy

# Configure LLM extraction
strategy = LLMExtractionStrategy(
    provider="${this.appData.name.toLowerCase().includes('gpt') ? 'openai' : 'anthropic'}",
    api_key="your-api-key",
    model="${this.appData.name.toLowerCase().includes('gpt') ? 'gpt-4' : 'claude-3'}",
    instruction="Extract structured data"
)

async with AsyncWebCrawler() as crawler:
    result = await crawler.arun(
        url="https://example.com",
        extraction_strategy=strategy
    )
    print(result.extracted_content)`;
            }
        }

        // Documentation tab - use documentation field from database
        const docsDiv = document.getElementById('app-docs');
        if (docsDiv) {
            if (this.appData.documentation) {
                docsDiv.innerHTML = this.renderMarkdown(this.appData.documentation);
                // Add copy buttons to all code blocks
                this.addCopyButtonsToCodeBlocks(docsDiv);
            } else {
                docsDiv.innerHTML = '<p>Documentation coming soon.</p>';
            }
        // Integration example
        const integrationCode = document.getElementById('integration-code');
        if (integrationCode) {
            integrationCode.textContent = this.appData.integration_guide ||
                `# Complete ${this.appData.name} Integration Example

from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
import json

async def crawl_with_${this.appData.slug.replace(/-/g, '_')}():
    """
    Complete example showing how to use ${this.appData.name}
    with Crawl4AI for production web scraping
    """

    # Define extraction schema
    schema = {
        "name": "ProductList",
        "baseSelector": "div.product",
        "fields": [
            {"name": "title", "selector": "h2", "type": "text"},
            {"name": "price", "selector": ".price", "type": "text"},
            {"name": "image", "selector": "img", "type": "attribute", "attribute": "src"},
            {"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
        ]
    }

    # Initialize crawler with ${this.appData.name}
    async with AsyncWebCrawler(
        browser_type="chromium",
        headless=True,
        verbose=True
    ) as crawler:

        # Crawl with extraction
        result = await crawler.arun(
            url="https://example.com/products",
            extraction_strategy=JsonCssExtractionStrategy(schema),
            cache_mode="bypass",
            wait_for="css:.product",
            screenshot=True
        )

        # Process results
        if result.success:
            products = json.loads(result.extracted_content)
            print(f"Found {len(products)} products")

            for product in products[:5]:
                print(f"- {product['title']}: {product['price']}")

            return products

# Run the crawler
if __name__ == "__main__":
    import asyncio
    asyncio.run(crawl_with_${this.appData.slug.replace(/-/g, '_')}())`;
        }
    }

    addCopyButtonsToCodeBlocks(container) {
        // Find all code blocks and add copy buttons
        const codeBlocks = container.querySelectorAll('pre code');
        codeBlocks.forEach(codeBlock => {
            const pre = codeBlock.parentElement;

            // Skip if already has a copy button
            if (pre.querySelector('.copy-btn')) return;

            // Create copy button
            const copyBtn = document.createElement('button');
            copyBtn.className = 'copy-btn';
            copyBtn.textContent = 'Copy';
            copyBtn.onclick = () => {
                navigator.clipboard.writeText(codeBlock.textContent).then(() => {
                    copyBtn.textContent = '✓ Copied!';
                    setTimeout(() => {
                        copyBtn.textContent = 'Copy';
                    }, 2000);
                });
            };

            // Add button to pre element
            pre.style.position = 'relative';
            pre.insertBefore(copyBtn, codeBlock);
        });
    }

    renderMarkdown(text) {
        if (!text) return '';

        // Store code blocks temporarily to protect them from processing
        const codeBlocks = [];
        let processed = text.replace(/```(\w+)?\n([\s\S]*?)```/g, (match, lang, code) => {
            const placeholder = `___CODE_BLOCK_${codeBlocks.length}___`;
            codeBlocks.push(`<pre><code class="language-${lang || ''}">${this.escapeHtml(code)}</code></pre>`);
            return placeholder;
        });

        // Store inline code temporarily
        const inlineCodes = [];
        processed = processed.replace(/`([^`]+)`/g, (match, code) => {
            const placeholder = `___INLINE_CODE_${inlineCodes.length}___`;
            inlineCodes.push(`<code>${this.escapeHtml(code)}</code>`);
            return placeholder;
        });

        // Now process the rest of the markdown
        processed = processed
            // Headers
            .replace(/^### (.*$)/gim, '<h3>$1</h3>')
            .replace(/^## (.*$)/gim, '<h2>$1</h2>')
            .replace(/^# (.*$)/gim, '<h1>$1</h1>')
            // Bold
            .replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
            // Italic
            .replace(/\*(.*?)\*/g, '<em>$1</em>')
            // Links
            .replace(/\[([^\]]+)\]\(([^)]+)\)/g, '<a href="$2" target="_blank">$1</a>')
            // Line breaks
            .replace(/\n\n/g, '</p><p>')
            .replace(/\n/g, '<br>')
            // Lists
            .replace(/^\* (.*)$/gim, '<li>$1</li>')
            .replace(/^- (.*)$/gim, '<li>$1</li>')
            // Wrap in paragraphs
            .replace(/^(?!<[h|p|pre|ul|ol|li])/gim, '<p>')
            .replace(/(?<![>])$/gim, '</p>');

        // Restore inline code
        inlineCodes.forEach((code, i) => {
            processed = processed.replace(`___INLINE_CODE_${i}___`, code);
        });

        // Restore code blocks
        codeBlocks.forEach((block, i) => {
            processed = processed.replace(`___CODE_BLOCK_${i}___`, block);
        });

        return processed;
    }

    escapeHtml(text) {
        const div = document.createElement('div');
        div.textContent = text;
        return div.innerHTML;
    }

    formatNumber(num) {
        if (num >= 1000000) {
            return (num / 1000000).toFixed(1) + 'M';
@@ -263,27 +275,45 @@ class AppDetailPage {
    setupEventListeners() {
        // Tab switching
        const tabs = document.querySelectorAll('.tab-btn');

        tabs.forEach(tab => {
            tab.addEventListener('click', () => {
                // Update active tab button
                // Update active tab
                tabs.forEach(t => t.classList.remove('active'));
                tab.classList.add('active');

                // Show corresponding content
                const tabName = tab.dataset.tab;

                // Hide all tab contents
                const allTabContents = document.querySelectorAll('.tab-content');
                allTabContents.forEach(content => {
                document.querySelectorAll('.tab-content').forEach(content => {
                    content.classList.remove('active');
                });
                document.getElementById(`${tabName}-tab`).classList.add('active');
            });
        });

                // Show the selected tab content
                const targetTab = document.getElementById(`${tabName}-tab`);
                if (targetTab) {
                    targetTab.classList.add('active');
                }
        // Copy integration code
        document.getElementById('copy-integration').addEventListener('click', () => {
            const code = document.getElementById('integration-code').textContent;
            navigator.clipboard.writeText(code).then(() => {
                const btn = document.getElementById('copy-integration');
                const originalText = btn.innerHTML;
                btn.innerHTML = '<span>✓</span> Copied!';
                setTimeout(() => {
                    btn.innerHTML = originalText;
                }, 2000);
            });
        });

        // Copy code buttons
        document.querySelectorAll('.copy-btn').forEach(btn => {
            btn.addEventListener('click', (e) => {
                const codeBlock = e.target.closest('.code-block');
                const code = codeBlock.querySelector('code').textContent;
                navigator.clipboard.writeText(code).then(() => {
                    btn.textContent = 'Copied!';
                    setTimeout(() => {
                        btn.textContent = 'Copy';
                    }, 2000);
                });
            });
        });
    }

@@ -471,17 +471,13 @@ async def delete_sponsor(sponsor_id: int):

app.include_router(router)

# Version info
VERSION = "1.1.0"
BUILD_DATE = "2025-10-26"

@app.get("/")
async def root():
    """API info"""
    return {
        "name": "Crawl4AI Marketplace API",
        "version": VERSION,
        "build_date": BUILD_DATE,
        "version": "1.0.0",
        "endpoints": [
            "/marketplace/api/apps",
            "/marketplace/api/articles",

@@ -31,7 +31,7 @@ dependencies = [
    "rank-bm25~=0.2",
    "snowballstemmer~=2.2",
    "pydantic>=2.10",
    "pyOpenSSL>=25.3.0",
    "pyOpenSSL>=24.3.0",
    "psutil>=6.1.1",
    "PyYAML>=6.0",
    "nltk>=3.9.1",

@@ -19,7 +19,7 @@ rank-bm25~=0.2
colorama~=0.4
snowballstemmer~=2.2
pydantic>=2.10
pyOpenSSL>=25.3.0
pyOpenSSL>=24.3.0
psutil>=6.1.1
PyYAML>=6.0
nltk>=3.9.1

@@ -364,19 +364,5 @@ async def test_network_error_handling():
    async with AsyncPlaywrightCrawlerStrategy() as strategy:
        await strategy.crawl("https://invalid.example.com", config)

@pytest.mark.asyncio
async def test_remove_overlay_elements(crawler_strategy):
    config = CrawlerRunConfig(
        remove_overlay_elements=True,
        delay_before_return_html=5,
    )

    response = await crawler_strategy.crawl(
        "https://www2.hm.com/en_us/index.html",
        config
    )
    assert response.status_code == 200
    assert "Accept all cookies" not in response.html

if __name__ == "__main__":
    pytest.main([__file__, "-v"])
@@ -1,63 +0,0 @@
"""
Test for arun_many with managed CDP browser to ensure each crawl gets its own tab.
"""
import pytest
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode


@pytest.mark.asyncio
async def test_arun_many_with_cdp():
    """Test arun_many opens a new tab for each url with managed CDP browser."""
    # NOTE: Requires a running CDP browser at localhost:9222
    # Can be started with: crwl cdp -d 9222
    browser_cfg = BrowserConfig(
        browser_type="cdp",
        cdp_url="http://localhost:9222",
        verbose=False,
    )
    urls = [
        "https://example.com",
        "https://httpbin.org/html",
        "https://www.python.org",
    ]
    crawler_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
    )
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        results = await crawler.arun_many(urls=urls, config=crawler_cfg)
        # All results should be successful and distinct
        assert len(results) == 3
        for result in results:
            assert result.success, f"Crawl failed: {result.url} - {result.error_message}"
            assert result.markdown is not None


@pytest.mark.asyncio
async def test_arun_many_with_cdp_sequential():
    """Test arun_many sequentially to isolate issues."""
    browser_cfg = BrowserConfig(
        browser_type="cdp",
        cdp_url="http://localhost:9222",
        verbose=True,
    )
    urls = [
        "https://example.com",
        "https://httpbin.org/html",
        "https://www.python.org",
    ]
    crawler_cfg = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
    )
    async with AsyncWebCrawler(config=browser_cfg) as crawler:
        results = []
        for url in urls:
            result = await crawler.arun(url=url, config=crawler_cfg)
            results.append(result)
            assert result.success, f"Crawl failed: {result.url} - {result.error_message}"
            assert result.markdown is not None
        assert len(results) == 3


if __name__ == "__main__":
    asyncio.run(test_arun_many_with_cdp())
@@ -1,168 +0,0 @@
"""
Lightweight test to verify pyOpenSSL security fix (Issue #1545).

This test verifies the security requirements are met:
1. pyOpenSSL >= 25.3.0 is installed
2. cryptography >= 45.0.7 is installed (above vulnerable range)
3. SSL/TLS functionality works correctly

This test can run without full crawl4ai dependencies installed.
"""

import sys
from packaging import version


def test_package_versions():
    """Test that package versions meet security requirements."""
    print("=" * 70)
    print("TEST: Package Version Security Requirements (Issue #1545)")
    print("=" * 70)

    all_passed = True

    # Test pyOpenSSL version
    try:
        import OpenSSL
        pyopenssl_version = OpenSSL.__version__
        print(f"\n✓ pyOpenSSL is installed: {pyopenssl_version}")

        if version.parse(pyopenssl_version) >= version.parse("25.3.0"):
            print(f" ✓ PASS: pyOpenSSL {pyopenssl_version} >= 25.3.0 (required)")
        else:
            print(f" ✗ FAIL: pyOpenSSL {pyopenssl_version} < 25.3.0 (required)")
            all_passed = False

    except ImportError as e:
        print(f"\n✗ FAIL: pyOpenSSL not installed - {e}")
        all_passed = False

    # Test cryptography version
    try:
        import cryptography
        crypto_version = cryptography.__version__
        print(f"\n✓ cryptography is installed: {crypto_version}")

        # The vulnerable range is >=37.0.0 & <43.0.1
        # We need >= 45.0.7 to be safe
        if version.parse(crypto_version) >= version.parse("45.0.7"):
            print(f" ✓ PASS: cryptography {crypto_version} >= 45.0.7 (secure)")
            print(f" ✓ NOT in vulnerable range (37.0.0 to 43.0.0)")
        elif version.parse(crypto_version) >= version.parse("37.0.0") and version.parse(crypto_version) < version.parse("43.0.1"):
            print(f" ✗ FAIL: cryptography {crypto_version} is VULNERABLE")
            print(f" ✗ Version is in vulnerable range (>=37.0.0 & <43.0.1)")
            all_passed = False
        else:
            print(f" ⚠ WARNING: cryptography {crypto_version} < 45.0.7")
            print(f" ⚠ May not meet security requirements")

    except ImportError as e:
        print(f"\n✗ FAIL: cryptography not installed - {e}")
        all_passed = False

    return all_passed


def test_ssl_basic_functionality():
    """Test that SSL/TLS basic functionality works."""
    print("\n" + "=" * 70)
    print("TEST: SSL/TLS Basic Functionality")
    print("=" * 70)

    try:
        import OpenSSL.SSL

        # Create a basic SSL context to verify functionality
        context = OpenSSL.SSL.Context(OpenSSL.SSL.TLSv1_2_METHOD)
        print("\n✓ SSL Context created successfully")
        print(" ✓ PASS: SSL/TLS functionality is working")
        return True

    except Exception as e:
        print(f"\n✗ FAIL: SSL functionality test failed - {e}")
        return False


def test_pyopenssl_crypto_integration():
    """Test that pyOpenSSL and cryptography integration works."""
    print("\n" + "=" * 70)
    print("TEST: pyOpenSSL <-> cryptography Integration")
    print("=" * 70)

    try:
        from OpenSSL import crypto

        # Generate a simple key pair to test integration
        key = crypto.PKey()
        key.generate_key(crypto.TYPE_RSA, 2048)

        print("\n✓ Generated RSA key pair successfully")
        print(" ✓ PASS: pyOpenSSL and cryptography are properly integrated")
        return True

    except Exception as e:
        print(f"\n✗ FAIL: Integration test failed - {e}")
        import traceback
        traceback.print_exc()
        return False


def main():
    """Run all security tests."""
    print("\n")
    print("╔" + "=" * 68 + "╗")
    print("║ pyOpenSSL Security Fix Verification - Issue #1545 ║")
    print("╚" + "=" * 68 + "╝")
    print("\nVerifying that the pyOpenSSL update resolves the security vulnerability")
    print("in the cryptography package (CVE: versions >=37.0.0 & <43.0.1)\n")

    results = []

    # Test 1: Package versions
    results.append(("Package Versions", test_package_versions()))

    # Test 2: SSL functionality
    results.append(("SSL Functionality", test_ssl_basic_functionality()))

    # Test 3: Integration
    results.append(("pyOpenSSL-crypto Integration", test_pyopenssl_crypto_integration()))

    # Summary
    print("\n" + "=" * 70)
    print("TEST SUMMARY")
    print("=" * 70)

    all_passed = True
    for test_name, passed in results:
        status = "✓ PASS" if passed else "✗ FAIL"
        print(f"{status}: {test_name}")
        all_passed = all_passed and passed

    print("=" * 70)

    if all_passed:
        print("\n✓✓✓ ALL TESTS PASSED ✓✓✓")
        print("✓ Security vulnerability is resolved")
        print("✓ pyOpenSSL >= 25.3.0 is working correctly")
        print("✓ cryptography >= 45.0.7 (not vulnerable)")
        print("\nThe dependency update is safe to merge.\n")
        return True
    else:
        print("\n✗✗✗ SOME TESTS FAILED ✗✗✗")
        print("✗ Security requirements not met")
        print("\nDo NOT merge until all tests pass.\n")
        return False


if __name__ == "__main__":
    try:
        success = main()
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        print("\n\nTest interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n✗ Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)
@@ -1,184 +0,0 @@
"""
Test script to verify pyOpenSSL update doesn't break crawl4ai functionality.

This test verifies:
1. pyOpenSSL and cryptography versions are correct and secure
2. Basic crawling functionality still works
3. HTTPS/SSL connections work properly
4. Stealth mode integration works (uses playwright-stealth internally)

Issue: #1545 - Security vulnerability in cryptography package
Fix: Updated pyOpenSSL from >=24.3.0 to >=25.3.0
Expected: cryptography package should be >=45.0.7 (above vulnerable range)
"""

import asyncio
import sys
from packaging import version


def check_versions():
    """Verify pyOpenSSL and cryptography versions meet security requirements."""
    print("=" * 60)
    print("STEP 1: Checking Package Versions")
    print("=" * 60)

    try:
        import OpenSSL
        pyopenssl_version = OpenSSL.__version__
        print(f"✓ pyOpenSSL version: {pyopenssl_version}")

        # Check pyOpenSSL >= 25.3.0
        if version.parse(pyopenssl_version) >= version.parse("25.3.0"):
            print(f" ✓ Version check passed: {pyopenssl_version} >= 25.3.0")
        else:
            print(f" ✗ Version check FAILED: {pyopenssl_version} < 25.3.0")
            return False

    except ImportError as e:
        print(f"✗ Failed to import pyOpenSSL: {e}")
        return False

    try:
        import cryptography
        crypto_version = cryptography.__version__
        print(f"✓ cryptography version: {crypto_version}")

        # Check cryptography >= 45.0.7 (above vulnerable range)
        if version.parse(crypto_version) >= version.parse("45.0.7"):
            print(f" ✓ Security check passed: {crypto_version} >= 45.0.7 (not vulnerable)")
        else:
            print(f" ✗ Security check FAILED: {crypto_version} < 45.0.7 (potentially vulnerable)")
            return False

    except ImportError as e:
        print(f"✗ Failed to import cryptography: {e}")
        return False

    print("\n✓ All version checks passed!\n")
    return True


async def test_basic_crawl():
    """Test basic crawling functionality with HTTPS site."""
    print("=" * 60)
    print("STEP 2: Testing Basic HTTPS Crawling")
    print("=" * 60)

    try:
        from crawl4ai import AsyncWebCrawler

        async with AsyncWebCrawler(verbose=True) as crawler:
            # Test with a simple HTTPS site (requires SSL/TLS)
            print("Crawling example.com (HTTPS)...")
            result = await crawler.arun(
                url="https://www.example.com",
                bypass_cache=True
            )

            if result.success:
                print(f"✓ Crawl successful!")
                print(f" - Status code: {result.status_code}")
                print(f" - Content length: {len(result.html)} bytes")
                print(f" - SSL/TLS connection: ✓ Working")
                return True
            else:
                print(f"✗ Crawl failed: {result.error_message}")
                return False

    except Exception as e:
        print(f"✗ Test failed with error: {e}")
        import traceback
        traceback.print_exc()
        return False


async def test_stealth_mode():
    """Test stealth mode functionality (depends on playwright-stealth)."""
    print("\n" + "=" * 60)
    print("STEP 3: Testing Stealth Mode Integration")
    print("=" * 60)

    try:
        from crawl4ai import AsyncWebCrawler, BrowserConfig

        # Create browser config with stealth mode
        browser_config = BrowserConfig(
            headless=True,
            verbose=False
        )

        async with AsyncWebCrawler(config=browser_config, verbose=True) as crawler:
            print("Crawling with stealth mode enabled...")
            result = await crawler.arun(
                url="https://www.example.com",
                bypass_cache=True
            )

            if result.success:
                print(f"✓ Stealth crawl successful!")
                print(f" - Stealth mode: ✓ Working")
                return True
            else:
                print(f"✗ Stealth crawl failed: {result.error_message}")
                return False

    except Exception as e:
        print(f"✗ Stealth test failed with error: {e}")
        import traceback
        traceback.print_exc()
        return False


async def main():
    """Run all tests."""
    print("\n")
    print("╔" + "=" * 58 + "╗")
    print("║ pyOpenSSL Security Update Verification Test (Issue #1545) ║")
    print("╚" + "=" * 58 + "╝")
    print("\n")

    # Step 1: Check versions
    versions_ok = check_versions()
    if not versions_ok:
        print("\n✗ FAILED: Version requirements not met")
        return False

    # Step 2: Test basic crawling
    crawl_ok = await test_basic_crawl()
    if not crawl_ok:
        print("\n✗ FAILED: Basic crawling test failed")
        return False

    # Step 3: Test stealth mode
    stealth_ok = await test_stealth_mode()
    if not stealth_ok:
        print("\n✗ FAILED: Stealth mode test failed")
        return False

    # All tests passed
    print("\n" + "=" * 60)
    print("FINAL RESULT")
    print("=" * 60)
    print("✓ All tests passed successfully!")
    print("✓ pyOpenSSL update is working correctly")
    print("✓ No breaking changes detected")
    print("✓ Security vulnerability resolved")
    print("=" * 60)
    print("\n")

    return True


if __name__ == "__main__":
    try:
        success = asyncio.run(main())
        sys.exit(0 if success else 1)
    except KeyboardInterrupt:
        print("\n\nTest interrupted by user")
        sys.exit(1)
    except Exception as e:
        print(f"\n✗ Unexpected error: {e}")
        import traceback
        traceback.print_exc()
        sys.exit(1)