Compare commits

..

13 Commits

Author SHA1 Message Date
AHMET YILMAZ
c003cb6e4f fix #1563 (cdp): resolve page leaks and race conditions in concurrent crawling
Fix memory leaks and race conditions when using arun_many() with managed CDP browsers. Each crawl now gets proper page isolation with automatic cleanup while maintaining shared browser context.

Key fixes:
- Close non-session pages after crawling to prevent tab accumulation
- Add thread-safe page creation with locks to avoid concurrent access
- Improve page lifecycle management for managed vs non-managed browsers
- Keep session pages alive for authentication persistence
- Prevent TOCTOU (time-of-check-time-of-use) race conditions

This ensures stable parallel crawling without memory growth or browser instability.
2025-11-07 15:42:37 +08:00
Nasrin
2c918155aa Merge pull request #1529 from unclecode/fix/remove_overlay_elements
Fix remove_overlay_elements functionality by calling injected JS function.
2025-11-06 00:10:32 +08:00
Nasrin
854694ef33 Merge pull request #1537 from unclecode/fix/docker-compose-llm-env
fix(docker): Remove environment variable overrides in docker-compose.yml
2025-11-06 00:07:51 +08:00
Nasrin
6534ece026 Merge pull request #1532 from unclecode/fix/update-documentation
Standardize C4A-Script tutorial, add CLI identity-based crawling, and add sponsorship CTA
2025-11-05 23:37:05 +08:00
Nasrin
89e28d4eee Merge pull request #1558 from unclecode/claude/fix-update-pyopenssl-security-011CUPexU25DkNvoxfu5ZrnB
Claude/fix update pyopenssl security 011 cu pex u25 dk nvoxfu5 zrn b
2025-10-28 17:09:11 +08:00
ntohidi
c0f1865287 feat(api): update marketplace version and build date in root endpoint response 2025-10-26 11:35:39 +01:00
ntohidi
46ef1116c4 fix(app-detail): enhance tab functionality, hide documentation and support tabs in marketplace 2025-10-26 11:21:29 +01:00
Nasrin
4df83893ac Merge pull request #1560 from unclecode/fix/marketplace
Fix/marketplace
2025-10-23 22:17:06 +08:00
ntohidi
13e116610d fix(marketplace): improve app detail page content rendering and UX
Fixed multiple issues with app detail page content display and formatting
2025-10-23 16:12:30 +02:00
ntohidi
97c92c4f62 fix(marketplace): replace hardcoded app detail content with database-driven fields.
The app detail page was displaying hardcoded/templated content instead of
using actual data from the database. This prevented admins from controlling
the content shown in Overview, Integration, and Documentation tabs.
2025-10-21 15:39:04 +02:00
Soham Kukreti
46e1a67f61 fix(docker): Remove environment variable overrides in docker-compose.yml (#1411)
The docker-compose.yml had an `environment:` section with variable
substitutions (${VAR:-}) that was overriding values from .llm.env with
empty strings.

- Commented out the `environment:` section to prevent overwrites
- Added clear warning comment explaining the override behavior
- .llm.env values now load directly into container without interference
2025-10-06 14:41:22 +05:30
Soham Kukreti
7dfe528d43 fix(docs): standardize C4A-Script tutorial, add CLI identity-based crawling, and add sponsorship CTA
- Switch installs to pip install -r requirements.txt (tutorial and app docs)
- Update local run steps to python server.py and http://localhost:8000
- Set default PORT to 8000; update port-in-use commands and alt port 8001
- Replace unsupported :contains() example with accessible attribute selector
- Update example URLs in tutorial servers to 127.0.0.1:8000
- Add “Identity-based crawling” section with crwl profiles CLI workflow and code usage
- Replace legacy-docs note with sponsorship message in docs/md_v2/index.md
- Minor copy and consistency fixes across pages
2025-10-03 22:00:46 +05:30
Soham Kukreti
2dc6588573 fix: remove_overlay_elements functionality by calling injected JS function. ref: #1396
- Fix critical bug where overlay removal JS function was injected but never called
  - Change remove_overlay_elements() to properly execute the injected async function
  - Wrap JS execution in async to handle the async overlay removal logic
  - Add test_remove_overlay_elements() test case to verify functionality works
  - Ensure overlay elements (cookie banners, popups, modals) are actually removed

  The remove_overlay_elements feature now works as intended:
  - Before: Function definition injected but never executed (silent failure)
  - After: Function injected and called, successfully removing overlay elements
2025-09-29 20:40:08 +05:30
18 changed files with 945 additions and 258 deletions

View File

@@ -1047,14 +1047,28 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
raise e
finally:
# If no session_id is given we should close the page
# Clean up page after crawl completes
# For managed CDP browsers, close pages that are not part of a session to prevent memory leaks
all_contexts = page.context.browser.contexts
total_pages = sum(len(context.pages) for context in all_contexts)
should_close_page = False
if config.session_id:
# Session pages are kept alive for reuse
pass
elif total_pages <= 1 and (self.browser_config.use_managed_browser or self.browser_config.headless):
elif self.browser_config.use_managed_browser:
# For managed browsers (CDP), close non-session pages to prevent tab accumulation
# This is especially important for arun_many() with multiple concurrent crawls
should_close_page = True
elif total_pages <= 1 and self.browser_config.headless:
# Keep the last page in headless mode to avoid closing the browser
pass
else:
# For non-managed browsers, close the page
should_close_page = True
if should_close_page:
# Detach listeners before closing to prevent potential errors during close
if config.capture_network_requests:
page.remove_listener("request", handle_request_capture)
@@ -1383,9 +1397,10 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
try:
await self.adapter.evaluate(page,
f"""
(() => {{
(async () => {{
try {{
{remove_overlays_js}
const removeOverlays = {remove_overlays_js};
await removeOverlays();
return {{ success: true }};
}} catch (error) {{
return {{

View File

@@ -1035,33 +1035,19 @@ class BrowserManager:
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
return page, context
# If using a managed browser, just grab the shared default_context
# If using a managed browser, reuse the default context and create new pages
if self.config.use_managed_browser:
if self.config.storage_state:
context = await self.create_browser_context(crawlerRunConfig)
ctx = self.default_context # default context, one window only
ctx = await clone_runtime_state(context, ctx, crawlerRunConfig, self.config)
# Avoid concurrent new_page on shared persistent context
# See GH-1198: context.pages can be empty under races
async with self._page_lock:
page = await ctx.new_page()
await self._apply_stealth_to_page(page)
else:
context = self.default_context
pages = context.pages
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
if not page:
if pages:
page = pages[0]
else:
# Double-check under lock to avoid TOCTOU and ensure only
# one task calls new_page when pages=[] concurrently
if self.config.storage_state:
# Clone runtime state from storage to the shared context
ctx = self.default_context
ctx = await clone_runtime_state(context, ctx, crawlerRunConfig, self.config)
# Always create a new page for concurrent safety
# The page-level isolation prevents race conditions while sharing the same context
async with self._page_lock:
pages = context.pages
if pages:
page = pages[0]
else:
page = await context.new_page()
await self._apply_stealth_to_page(page)
else:
# Otherwise, check if we have an existing context for this config

View File

@@ -6,15 +6,16 @@ x-base-config: &base-config
- "11235:11235" # Gunicorn port
env_file:
- .llm.env # API keys (create from .llm.env.example)
environment:
- OPENAI_API_KEY=${OPENAI_API_KEY:-}
- DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
- ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
- GROQ_API_KEY=${GROQ_API_KEY:-}
- TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
- MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
- GEMINI_API_TOKEN=${GEMINI_API_TOKEN:-}
- LLM_PROVIDER=${LLM_PROVIDER:-} # Optional: Override default provider (e.g., "anthropic/claude-3-opus")
# Uncomment to set default environment variables (will overwrite .llm.env)
# environment:
# - OPENAI_API_KEY=${OPENAI_API_KEY:-}
# - DEEPSEEK_API_KEY=${DEEPSEEK_API_KEY:-}
# - ANTHROPIC_API_KEY=${ANTHROPIC_API_KEY:-}
# - GROQ_API_KEY=${GROQ_API_KEY:-}
# - TOGETHER_API_KEY=${TOGETHER_API_KEY:-}
# - MISTRAL_API_KEY=${MISTRAL_API_KEY:-}
# - GEMINI_API_KEY=${GEMINI_API_KEY:-}
# - LLM_PROVIDER=${LLM_PROVIDER:-} # Optional: Override default provider (e.g., "anthropic/claude-3-opus")
volumes:
- /dev/shm:/dev/shm # Chromium performance
deploy:

View File

@@ -18,7 +18,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Scrip
2. **Install Dependencies**
```bash
pip install flask
pip install -r requirements.txt
```
3. **Launch the Server**
@@ -28,7 +28,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Scrip
4. **Open in Browser**
```
http://localhost:8080
http://localhost:8000
```
**🌐 Try Online**: [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)
@@ -325,7 +325,7 @@ Powers the recording functionality:
### Configuration
```python
# server.py configuration
PORT = 8080
PORT = 8000
DEBUG = True
THREADED = True
```
@@ -343,9 +343,9 @@ THREADED = True
**Port Already in Use**
```bash
# Kill existing process
lsof -ti:8080 | xargs kill -9
lsof -ti:8000 | xargs kill -9
# Or use different port
python server.py --port 8081
python server.py --port 8001
```
**Blockly Not Loading**

View File

@@ -216,7 +216,7 @@ def get_examples():
'name': 'Handle Cookie Banner',
'description': 'Accept cookies and close newsletter popup',
'script': '''# Handle cookie banner and newsletter
GO http://127.0.0.1:8080/playground/
GO http://127.0.0.1:8000/playground/
WAIT `body` 2
IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
IF (EXISTS `.newsletter-popup`) THEN CLICK `.close`'''

View File

@@ -0,0 +1,594 @@
# CDP Browser Crawling
> **New in v0.7.6**: Efficient concurrent crawling with managed CDP (Chrome DevTools Protocol) browsers. Connect to a running browser instance and perform multiple crawls without spawning new windows.
## 1. Overview
When working with CDP browsers, you can connect to an existing browser instance instead of launching a new one for each crawl. This is particularly useful for:
- **Development**: Keep your browser open with DevTools for debugging
- **Persistent Sessions**: Maintain authentication across multiple crawls
- **Resource Efficiency**: Reuse a single browser instance for multiple operations
- **Concurrent Crawling**: Run multiple crawls simultaneously with proper isolation
**Key Benefits:**
- ✅ Single browser window with multiple tabs (no window clutter)
- ✅ Shared state (cookies, localStorage) across crawls
- ✅ Concurrent safety with automatic page isolation
- ✅ Automatic cleanup to prevent memory leaks
- ✅ Works seamlessly with `arun_many()` for parallel crawling
---
## 2. Quick Start
### 2.1 Starting a CDP Browser
Use the Crawl4AI CLI to start a managed CDP browser:
```bash
# Start CDP browser on default port (9222)
crwl cdp
# Start on custom port
crwl cdp -d 9223
# Start in headless mode
crwl cdp --headless
```
The browser will stay running until you press 'q' or close the terminal.
### 2.2 Basic CDP Connection
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
async def main():
# Configure CDP connection
browser_cfg = BrowserConfig(
browser_type="chromium",
cdp_url="http://localhost:9222",
verbose=True
)
# Crawl a single URL
async with AsyncWebCrawler(config=browser_cfg) as crawler:
result = await crawler.arun(
url="https://example.com",
config=CrawlerRunConfig()
)
print(f"Success: {result.success}")
print(f"Content length: {len(result.markdown)}")
if __name__ == "__main__":
asyncio.run(main())
```
---
## 3. Concurrent Crawling with arun_many()
The real power of CDP crawling shines with `arun_many()`. The browser manager automatically handles:
- **Page Isolation**: Each crawl gets its own tab
- **Context Sharing**: All tabs share cookies and localStorage
- **Concurrent Safety**: Proper locking prevents race conditions
- **Auto Cleanup**: Tabs are closed after crawling (except sessions)
### 3.1 Basic Concurrent Crawling
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
async def crawl_multiple_urls():
# URLs to crawl
urls = [
"https://example.com",
"https://httpbin.org/html",
"https://www.python.org",
]
# Configure CDP browser
browser_cfg = BrowserConfig(
browser_type="chromium",
cdp_url="http://localhost:9222",
verbose=False
)
# Configure crawler (bypass cache for fresh data)
crawler_cfg = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS
)
# Crawl all URLs concurrently
async with AsyncWebCrawler(config=browser_cfg) as crawler:
results = await crawler.arun_many(
urls=urls,
config=crawler_cfg
)
# Process results
for result in results:
print(f"\nURL: {result.url}")
if result.success:
print(f"✓ Success | Content length: {len(result.markdown)}")
else:
print(f"✗ Failed: {result.error_message}")
if __name__ == "__main__":
asyncio.run(crawl_multiple_urls())
```
### 3.2 With Session Management
Use sessions to maintain authentication and state across individual crawls:
```python
async def crawl_with_sessions():
browser_cfg = BrowserConfig(
browser_type="chromium",
cdp_url="http://localhost:9222"
)
async with AsyncWebCrawler(config=browser_cfg) as crawler:
# First crawl: Login page
login_result = await crawler.arun(
url="https://example.com/login",
config=CrawlerRunConfig(
session_id="my-session", # Session persists
js_code="document.querySelector('#login').click();"
)
)
# Second crawl: Reuse authenticated session
dashboard_result = await crawler.arun(
url="https://example.com/dashboard",
config=CrawlerRunConfig(
session_id="my-session" # Same session, cookies preserved
)
)
```
---
## 4. How It Works
### 4.1 Browser Context Reuse
When using CDP browsers, Crawl4AI:
1. **Connects** to the existing browser via CDP URL
2. **Reuses** the default browser context (single window)
3. **Creates** new pages (tabs) for each crawl
4. **Locks** page creation to prevent concurrent races
5. **Cleans up** pages after crawling (unless it's a session)
```python
# Internal behavior (simplified)
if self.config.use_managed_browser:
context = self.default_context # Shared context
# Thread-safe page creation
async with self._page_lock:
page = await context.new_page() # New tab per crawl
# After crawl completes
if not config.session_id:
await page.close() # Auto cleanup
```
### 4.2 Page Lifecycle
```mermaid
graph TD
A[Start Crawl] --> B{Has session_id?}
B -->|Yes| C[Reuse existing page]
B -->|No| D[Create new page/tab]
D --> E[Navigate & Extract]
C --> E
E --> F{Is session?}
F -->|Yes| G[Keep page open]
F -->|No| H[Close page]
H --> I[End]
G --> I
```
### 4.3 State Sharing
All pages in the same context share:
- 🍪 **Cookies**: Authentication tokens, preferences
- 💾 **localStorage**: Client-side data storage
- 🔐 **sessionStorage**: Per-tab session data
- 🌐 **Network cache**: Shared HTTP cache
This makes it perfect for crawling authenticated sites or maintaining state across multiple pages.
---
## 5. Configuration Options
### 5.1 BrowserConfig for CDP
```python
browser_cfg = BrowserConfig(
browser_type="chromium", # Must be "chromium" for CDP
cdp_url="http://localhost:9222", # CDP endpoint URL
verbose=True, # Log browser operations
# Optional: Override headers for all requests
headers={
"Accept-Language": "en-US,en;q=0.9",
},
# Optional: Set user agent
user_agent="Mozilla/5.0 ...",
# Optional: Enable stealth mode (requires dedicated browser)
# enable_stealth=False, # Not compatible with CDP
)
```
### 5.2 CrawlerRunConfig Options
```python
crawler_cfg = CrawlerRunConfig(
# Session management
session_id="my-session", # Persist page across calls
# Caching
cache_mode=CacheMode.BYPASS, # Fresh data every time
# Browser location (affects timezone, locale)
locale="en-US",
timezone_id="America/New_York",
geolocation={
"latitude": 40.7128,
"longitude": -74.0060
},
# Proxy (per-crawl override)
proxy_config={
"server": "http://proxy.example.com:8080",
"username": "user",
"password": "pass"
}
)
```
---
## 6. Advanced Patterns
### 6.1 Streaming Results
Process URLs as they complete instead of waiting for all:
```python
async def stream_crawl_results():
browser_cfg = BrowserConfig(
browser_type="chromium",
cdp_url="http://localhost:9222"
)
urls = ["https://example.com" for _ in range(100)]
async with AsyncWebCrawler(config=browser_cfg) as crawler:
# Stream results as they complete
async for result in crawler.arun_many(
urls=urls,
config=CrawlerRunConfig(stream=True)
):
if result.success:
print(f"{result.url}: {len(result.markdown)} chars")
# Process immediately instead of waiting for all
await save_to_database(result)
```
### 6.2 Custom Concurrency Control
```python
from crawl4ai import CrawlerRunConfig
# Limit concurrent crawls to 3
crawler_cfg = CrawlerRunConfig(
semaphore_count=3, # Max 3 concurrent requests
mean_delay=0.5, # Average 0.5s delay between requests
max_range=1.0, # +/- 1s random delay
)
async with AsyncWebCrawler(config=browser_cfg) as crawler:
results = await crawler.arun_many(urls, config=crawler_cfg)
```
### 6.3 Multi-Config Crawling
Different configurations for different URL groups:
```python
from crawl4ai import CrawlerRunConfig
# Fast crawl for static pages
fast_config = CrawlerRunConfig(
wait_until="domcontentloaded",
page_timeout=30000
)
# Slow crawl for dynamic pages
slow_config = CrawlerRunConfig(
wait_until="networkidle",
page_timeout=60000,
js_code="window.scrollTo(0, document.body.scrollHeight);"
)
configs = [fast_config, slow_config, fast_config]
urls = ["https://static.com", "https://dynamic.com", "https://static2.com"]
async with AsyncWebCrawler(config=browser_cfg) as crawler:
results = await crawler.arun_many(urls, configs=configs)
```
---
## 7. Best Practices
### 7.1 Resource Management
**DO:**
```python
# Use context manager for automatic cleanup
async with AsyncWebCrawler(config=browser_cfg) as crawler:
results = await crawler.arun_many(urls)
# Browser connection closed automatically
```
**DON'T:**
```python
# Manual management risks resource leaks
crawler = AsyncWebCrawler(config=browser_cfg)
await crawler.start()
results = await crawler.arun_many(urls)
# Forgot to call crawler.close()!
```
### 7.2 Session Management
**DO:**
```python
# Use sessions for related crawls
config = CrawlerRunConfig(session_id="user-flow")
await crawler.arun(login_url, config=config)
await crawler.arun(dashboard_url, config=config)
await crawler.kill_session("user-flow") # Clean up when done
```
**DON'T:**
```python
# Creating new session IDs unnecessarily
for i in range(100):
config = CrawlerRunConfig(session_id=f"session-{i}")
await crawler.arun(url, config=config)
# 100 unclosed sessions accumulate!
```
### 7.3 Error Handling
```python
async def robust_crawl(urls):
browser_cfg = BrowserConfig(
browser_type="chromium",
cdp_url="http://localhost:9222"
)
try:
async with AsyncWebCrawler(config=browser_cfg) as crawler:
results = await crawler.arun_many(urls)
# Separate successes and failures
successes = [r for r in results if r.success]
failures = [r for r in results if not r.success]
print(f"{len(successes)} succeeded")
print(f"{len(failures)} failed")
# Retry failures with different config
if failures:
retry_urls = [r.url for r in failures]
retry_config = CrawlerRunConfig(
page_timeout=120000, # Longer timeout
wait_until="networkidle"
)
retry_results = await crawler.arun_many(
retry_urls,
config=retry_config
)
return successes + retry_results
except Exception as e:
print(f"Fatal error: {e}")
return []
```
---
## 8. Troubleshooting
### 8.1 Connection Issues
**Problem**: `Cannot connect to CDP browser`
```python
# Check CDP browser is running
$ lsof -i :9222
# Should show: Chromium PID USER FD TYPE ...
# Or start it if not running
$ crwl cdp
```
**Problem**: `ERR_ABORTED` errors in concurrent crawls
**Fixed in v0.7.6**: This issue has been resolved. Pages are now properly isolated with locking.
### 8.2 Performance Issues
**Problem**: Too many open tabs
```python
# Ensure you're not using session_id for everything
config = CrawlerRunConfig() # No session_id
await crawler.arun_many(urls, config=config)
# Pages auto-close after crawling
```
**Problem**: Memory leaks
```python
# Always use context manager
async with AsyncWebCrawler(config=browser_cfg) as crawler:
# Crawling code here
pass
# Automatic cleanup on exit
```
### 8.3 State Issues
**Problem**: Cookies not persisting
```python
# Use the same context (automatic with CDP)
browser_cfg = BrowserConfig(cdp_url="http://localhost:9222")
# All crawls share cookies automatically
```
**Problem**: Need isolated state
```python
# Use different CDP endpoints or non-CDP browsers
browser_cfg_1 = BrowserConfig(cdp_url="http://localhost:9222")
browser_cfg_2 = BrowserConfig(cdp_url="http://localhost:9223")
# Completely isolated browsers
```
---
## 9. Comparison: CDP vs Regular Browsers
| Feature | CDP Browser | Regular Browser |
|---------|-------------|-----------------|
| **Window Management** | ✅ Single window, multiple tabs | ❌ New window per context |
| **Startup Time** | ✅ Instant (already running) | ⏱️ ~2-3s per launch |
| **State Sharing** | ✅ Shared cookies/localStorage | ⚠️ Isolated by default |
| **Concurrent Safety** | ✅ Automatic locking | ✅ Separate processes |
| **Memory Usage** | ✅ Lower (shared browser) | ⚠️ Higher (multiple processes) |
| **Session Persistence** | ✅ Native support | ✅ Via session_id |
| **Stealth Mode** | ❌ Not compatible | ✅ Full support |
| **Best For** | Development, authenticated crawls | Production, isolated crawls |
---
## 10. Real-World Examples
### 10.1 E-commerce Product Scraping
```python
async def scrape_products():
browser_cfg = BrowserConfig(
browser_type="chromium",
cdp_url="http://localhost:9222"
)
# Get product URLs from category page
async with AsyncWebCrawler(config=browser_cfg) as crawler:
category_result = await crawler.arun(
url="https://shop.example.com/category",
config=CrawlerRunConfig(
css_selector=".product-link"
)
)
# Extract product URLs
product_urls = extract_urls(category_result.links)
# Crawl all products concurrently
product_results = await crawler.arun_many(
urls=product_urls,
config=CrawlerRunConfig(
css_selector=".product-details",
semaphore_count=5 # Polite crawling
)
)
return [extract_product_data(r) for r in product_results]
```
### 10.2 News Article Monitoring
```python
import asyncio
from datetime import datetime
async def monitor_news_sites():
browser_cfg = BrowserConfig(
browser_type="chromium",
cdp_url="http://localhost:9222"
)
news_sites = [
"https://news.site1.com",
"https://news.site2.com",
"https://news.site3.com"
]
async with AsyncWebCrawler(config=browser_cfg) as crawler:
while True:
print(f"\n[{datetime.now()}] Checking for updates...")
results = await crawler.arun_many(
urls=news_sites,
config=CrawlerRunConfig(
cache_mode=CacheMode.BYPASS, # Always fresh
css_selector=".article-headline"
)
)
for result in results:
if result.success:
headlines = extract_headlines(result)
for headline in headlines:
if is_new(headline):
notify_user(headline)
# Check every 5 minutes
await asyncio.sleep(300)
```
---
## 11. Summary
CDP browser crawling offers:
- 🚀 **Performance**: Faster startup, lower resource usage
- 🔄 **State Management**: Shared cookies and authentication
- 🎯 **Concurrent Safety**: Automatic page isolation and cleanup
- 💻 **Developer Friendly**: Visual debugging with DevTools
**When to use CDP:**
- Development and debugging
- Authenticated crawling (login required)
- Sequential crawls needing state
- Resource-constrained environments
**When to use regular browsers:**
- Production deployments
- Maximum isolation required
- Stealth mode needed
- Distributed/cloud crawling
For most use cases, **CDP browsers provide the best balance** of performance, convenience, and safety.

View File

@@ -82,6 +82,42 @@ If you installed Crawl4AI (which installs Playwright under the hood), you alread
---
### Creating a Profile Using the Crawl4AI CLI (Easiest)
If you prefer a guided, interactive setup, use the built-in CLI to create and manage persistent browser profiles.
1.Launch the profile manager:
```bash
crwl profiles
```
2.Choose "Create new profile" and enter a profile name. A Chromium window opens so you can log in to sites and configure settings. When finished, return to the terminal and press `q` to save the profile.
3.Profiles are saved under `~/.crawl4ai/profiles/<profile_name>` (for example: `/home/<you>/.crawl4ai/profiles/test_profile_1`) along with a `storage_state.json` for cookies and session data.
4.Optionally, choose "List profiles" in the CLI to view available profiles and their paths.
5.Use the saved path with `BrowserConfig.user_data_dir`:
```python
from crawl4ai import AsyncWebCrawler, BrowserConfig
profile_path = "/home/<you>/.crawl4ai/profiles/test_profile_1"
browser_config = BrowserConfig(
headless=True,
use_managed_browser=True,
user_data_dir=profile_path,
browser_type="chromium",
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com/private")
```
The CLI also supports listing and deleting profiles, and even testing a crawl directly from the menu.
---
## 3. Using Managed Browsers in Crawl4AI
Once you have a data directory with your session data, pass it to **`BrowserConfig`**:

View File

@@ -18,7 +18,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Scrip
2. **Install Dependencies**
```bash
pip install flask
pip install -r requirements.txt
```
3. **Launch the Server**
@@ -28,7 +28,7 @@ A comprehensive web-based tutorial for learning and experimenting with C4A-Scrip
4. **Open in Browser**
```
http://localhost:8080
http://localhost:8000
```
**🌐 Try Online**: [Live Demo](https://docs.crawl4ai.com/c4a-script/demo)
@@ -325,7 +325,7 @@ Powers the recording functionality:
### Configuration
```python
# server.py configuration
PORT = 8080
PORT = 8000
DEBUG = True
THREADED = True
```
@@ -343,9 +343,9 @@ THREADED = True
**Port Already in Use**
```bash
# Kill existing process
lsof -ti:8080 | xargs kill -9
lsof -ti:8000 | xargs kill -9
# Or use different port
python server.py --port 8081
python server.py --port 8001
```
**Blockly Not Loading**

View File

@@ -216,7 +216,7 @@ def get_examples():
'name': 'Handle Cookie Banner',
'description': 'Accept cookies and close newsletter popup',
'script': '''# Handle cookie banner and newsletter
GO http://127.0.0.1:8080/playground/
GO http://127.0.0.1:8000/playground/
WAIT `body` 2
IF (EXISTS `.cookie-banner`) THEN CLICK `.accept`
IF (EXISTS `.newsletter-popup`) THEN CLICK `.close`'''
@@ -283,7 +283,7 @@ WAIT `.success-message` 5'''
return jsonify(examples)
if __name__ == '__main__':
port = int(os.environ.get('PORT', 8080))
port = int(os.environ.get('PORT', 8000))
print(f"""
╔══════════════════════════════════════════════════════════╗
║ C4A-Script Interactive Tutorial Server ║

View File

@@ -69,12 +69,12 @@ The tutorial includes a Flask-based web interface with:
cd docs/examples/c4a_script/tutorial/
# Install dependencies
pip install flask
pip install -r requirements.txt
# Launch the tutorial server
python app.py
python server.py
# Open http://localhost:5000 in your browser
# Open http://localhost:8000 in your browser
```
## Core Concepts
@@ -111,8 +111,8 @@ CLICK `.submit-btn`
# By attribute
CLICK `button[type="submit"]`
# By text content
CLICK `button:contains("Sign In")`
# By accessible attributes
CLICK `button[aria-label="Search"][title="Search"]`
# Complex selectors
CLICK `.form-container input[name="email"]`

View File

@@ -57,7 +57,7 @@
Crawl4AI is the #1 trending GitHub repository, actively maintained by a vibrant community. It delivers blazing-fast, AI-ready web crawling tailored for large language models, AI agents, and data pipelines. Fully open source, flexible, and built for real-time performance, **Crawl4AI** empowers developers with unmatched speed, precision, and deployment ease.
> **Note**: If you're looking for the old documentation, you can access it [here](https://old.docs.crawl4ai.com).
> Enjoy using Crawl4AI? Consider **[becoming a sponsor](https://github.com/sponsors/unclecode)** to support ongoing development and community growth!
## 🆕 AI Assistant Skill Now Available!

View File

@@ -529,8 +529,19 @@ class AdminDashboard {
</label>
</div>
<div class="form-group full-width">
<label>Integration Guide</label>
<textarea id="form-integration" rows="10">${app?.integration_guide || ''}</textarea>
<label>Long Description (Markdown - Overview tab)</label>
<textarea id="form-long-description" rows="10" placeholder="Enter detailed description with markdown formatting...">${app?.long_description || ''}</textarea>
<small>Markdown support: **bold**, *italic*, [links](url), # headers, code blocks, lists</small>
</div>
<div class="form-group full-width">
<label>Integration Guide (Markdown - Integration tab)</label>
<textarea id="form-integration" rows="20" placeholder="Enter integration guide with installation, examples, and code snippets using markdown...">${app?.integration_guide || ''}</textarea>
<small>Single markdown field with installation, examples, and complete guide. Code blocks get auto copy buttons.</small>
</div>
<div class="form-group full-width">
<label>Documentation (Markdown - Documentation tab)</label>
<textarea id="form-documentation" rows="20" placeholder="Enter documentation with API reference, examples, and best practices using markdown...">${app?.documentation || ''}</textarea>
<small>Full documentation with API reference, examples, best practices, etc.</small>
</div>
</div>
`;
@@ -712,7 +723,9 @@ class AdminDashboard {
data.contact_email = document.getElementById('form-email').value;
data.featured = document.getElementById('form-featured').checked ? 1 : 0;
data.sponsored = document.getElementById('form-sponsored').checked ? 1 : 0;
data.long_description = document.getElementById('form-long-description').value;
data.integration_guide = document.getElementById('form-integration').value;
data.documentation = document.getElementById('form-documentation').value;
} else if (type === 'articles') {
data.title = document.getElementById('form-title').value;
data.slug = this.generateSlug(data.title);

View File

@@ -278,12 +278,12 @@
}
.tab-content {
display: none;
display: none !important;
padding: 2rem;
}
.tab-content.active {
display: block;
display: block !important;
}
/* Overview Layout */
@@ -510,6 +510,31 @@
line-height: 1.5;
}
/* Markdown rendered code blocks */
.integration-content pre,
.docs-content pre {
background: var(--bg-dark);
border: 1px solid var(--border-color);
margin: 1rem 0;
padding: 1rem;
padding-top: 2.5rem; /* Space for copy button */
overflow-x: auto;
position: relative;
max-height: none; /* Remove any height restrictions */
height: auto; /* Allow content to expand */
}
.integration-content pre code,
.docs-content pre code {
background: transparent;
padding: 0;
color: var(--text-secondary);
font-size: 0.875rem;
line-height: 1.5;
white-space: pre; /* Preserve whitespace and line breaks */
display: block;
}
/* Feature Grid */
.feature-grid {
display: grid;

View File

@@ -73,27 +73,14 @@
<div class="tabs">
<button class="tab-btn active" data-tab="overview">Overview</button>
<button class="tab-btn" data-tab="integration">Integration</button>
<button class="tab-btn" data-tab="docs">Documentation</button>
<button class="tab-btn" data-tab="support">Support</button>
<!-- <button class="tab-btn" data-tab="docs">Documentation</button>
<button class="tab-btn" data-tab="support">Support</button> -->
</div>
<section id="overview-tab" class="tab-content active">
<div class="overview-columns">
<div class="overview-main">
<h2>Overview</h2>
<div id="app-overview">Overview content goes here.</div>
<h3>Key Features</h3>
<ul id="app-features" class="features-list">
<li>Feature 1</li>
<li>Feature 2</li>
<li>Feature 3</li>
</ul>
<h3>Use Cases</h3>
<div id="app-use-cases" class="use-cases">
<p>Describe how this app can help your workflow.</p>
</div>
</div>
<aside class="sidebar">
@@ -142,37 +129,16 @@
</section>
<section id="integration-tab" class="tab-content">
<div class="integration-content">
<h2>Integration Guide</h2>
<h3>Installation</h3>
<div class="code-block">
<pre><code id="install-code"># Installation instructions will appear here</code></pre>
</div>
<h3>Basic Usage</h3>
<div class="code-block">
<pre><code id="usage-code"># Usage example will appear here</code></pre>
</div>
<h3>Complete Integration Example</h3>
<div class="code-block">
<button class="copy-btn" id="copy-integration">Copy</button>
<pre><code id="integration-code"># Complete integration guide will appear here</code></pre>
</div>
<div class="integration-content" id="app-integration">
</div>
</section>
<section id="docs-tab" class="tab-content">
<div class="docs-content">
<h2>Documentation</h2>
<div id="app-docs" class="doc-sections">
<p>Documentation coming soon.</p>
<!-- <section id="docs-tab" class="tab-content">
<div class="docs-content" id="app-docs">
</div>
</div>
</section>
</section> -->
<section id="support-tab" class="tab-content">
<!-- <section id="support-tab" class="tab-content">
<div class="docs-content">
<h2>Support</h2>
<div class="support-grid">
@@ -190,7 +156,7 @@
</div>
</div>
</div>
</section>
</section> -->
</div>
</main>

View File

@@ -112,7 +112,7 @@ class AppDetailPage {
}
// Contact
document.getElementById('app-contact').textContent = this.appData.contact_email || 'Not available';
document.getElementById('app-contact') && (document.getElementById('app-contact').textContent = this.appData.contact_email || 'Not available');
// Sidebar info
document.getElementById('sidebar-downloads').textContent = this.formatNumber(this.appData.downloads || 0);
@@ -123,144 +123,132 @@ class AppDetailPage {
document.getElementById('sidebar-pricing').textContent = this.appData.pricing || 'Free';
document.getElementById('sidebar-contact').textContent = this.appData.contact_email || 'contact@example.com';
// Integration guide
this.renderIntegrationGuide();
// Render tab contents from database fields
this.renderTabContents();
}
renderIntegrationGuide() {
// Installation code
const installCode = document.getElementById('install-code');
if (installCode) {
if (this.appData.type === 'Open Source' && this.appData.github_url) {
installCode.textContent = `# Clone from GitHub
git clone ${this.appData.github_url}
# Install dependencies
pip install -r requirements.txt`;
} else if (this.appData.name.toLowerCase().includes('api')) {
installCode.textContent = `# Install via pip
pip install ${this.appData.slug}
# Or install from source
pip install git+${this.appData.github_url || 'https://github.com/example/repo'}`;
renderTabContents() {
// Overview tab - use long_description from database
const overviewDiv = document.getElementById('app-overview');
if (overviewDiv) {
if (this.appData.long_description) {
overviewDiv.innerHTML = this.renderMarkdown(this.appData.long_description);
} else {
overviewDiv.innerHTML = `<p>${this.appData.description || 'No overview available.'}</p>`;
}
}
// Usage code - customize based on category
const usageCode = document.getElementById('usage-code');
if (usageCode) {
if (this.appData.category === 'Browser Automation') {
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
from ${this.appData.slug.replace(/-/g, '_')} import ${this.appData.name.replace(/\s+/g, '')}
async def main():
# Initialize ${this.appData.name}
automation = ${this.appData.name.replace(/\s+/g, '')}()
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com",
browser_config=automation.config,
wait_for="css:body"
)
print(result.markdown)`;
} else if (this.appData.category === 'Proxy Services') {
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
import ${this.appData.slug.replace(/-/g, '_')}
# Configure proxy
proxy_config = {
"server": "${this.appData.website_url || 'https://proxy.example.com'}",
"username": "your_username",
"password": "your_password"
}
async with AsyncWebCrawler(proxy=proxy_config) as crawler:
result = await crawler.arun(
url="https://example.com",
bypass_cache=True
)
print(result.status_code)`;
} else if (this.appData.category === 'LLM Integration') {
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy
# Configure LLM extraction
strategy = LLMExtractionStrategy(
provider="${this.appData.name.toLowerCase().includes('gpt') ? 'openai' : 'anthropic'}",
api_key="your-api-key",
model="${this.appData.name.toLowerCase().includes('gpt') ? 'gpt-4' : 'claude-3'}",
instruction="Extract structured data"
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com",
extraction_strategy=strategy
)
print(result.extracted_content)`;
// Integration tab - use integration_guide field from database
const integrationDiv = document.getElementById('app-integration');
if (integrationDiv) {
if (this.appData.integration_guide) {
integrationDiv.innerHTML = this.renderMarkdown(this.appData.integration_guide);
// Add copy buttons to all code blocks
this.addCopyButtonsToCodeBlocks(integrationDiv);
} else {
integrationDiv.innerHTML = '<p>Integration guide not yet available. Please check the official website for details.</p>';
}
}
// Integration example
const integrationCode = document.getElementById('integration-code');
if (integrationCode) {
integrationCode.textContent = this.appData.integration_guide ||
`# Complete ${this.appData.name} Integration Example
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
import json
async def crawl_with_${this.appData.slug.replace(/-/g, '_')}():
"""
Complete example showing how to use ${this.appData.name}
with Crawl4AI for production web scraping
"""
# Define extraction schema
schema = {
"name": "ProductList",
"baseSelector": "div.product",
"fields": [
{"name": "title", "selector": "h2", "type": "text"},
{"name": "price", "selector": ".price", "type": "text"},
{"name": "image", "selector": "img", "type": "attribute", "attribute": "src"},
{"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
]
// Documentation tab - use documentation field from database
const docsDiv = document.getElementById('app-docs');
if (docsDiv) {
if (this.appData.documentation) {
docsDiv.innerHTML = this.renderMarkdown(this.appData.documentation);
// Add copy buttons to all code blocks
this.addCopyButtonsToCodeBlocks(docsDiv);
} else {
docsDiv.innerHTML = '<p>Documentation coming soon.</p>';
}
}
}
# Initialize crawler with ${this.appData.name}
async with AsyncWebCrawler(
browser_type="chromium",
headless=True,
verbose=True
) as crawler:
addCopyButtonsToCodeBlocks(container) {
// Find all code blocks and add copy buttons
const codeBlocks = container.querySelectorAll('pre code');
codeBlocks.forEach(codeBlock => {
const pre = codeBlock.parentElement;
# Crawl with extraction
result = await crawler.arun(
url="https://example.com/products",
extraction_strategy=JsonCssExtractionStrategy(schema),
cache_mode="bypass",
wait_for="css:.product",
screenshot=True
)
// Skip if already has a copy button
if (pre.querySelector('.copy-btn')) return;
# Process results
if result.success:
products = json.loads(result.extracted_content)
print(f"Found {len(products)} products")
// Create copy button
const copyBtn = document.createElement('button');
copyBtn.className = 'copy-btn';
copyBtn.textContent = 'Copy';
copyBtn.onclick = () => {
navigator.clipboard.writeText(codeBlock.textContent).then(() => {
copyBtn.textContent = '✓ Copied!';
setTimeout(() => {
copyBtn.textContent = 'Copy';
}, 2000);
});
};
for product in products[:5]:
print(f"- {product['title']}: {product['price']}")
return products
# Run the crawler
if __name__ == "__main__":
import asyncio
asyncio.run(crawl_with_${this.appData.slug.replace(/-/g, '_')}())`;
// Add button to pre element
pre.style.position = 'relative';
pre.insertBefore(copyBtn, codeBlock);
});
}
renderMarkdown(text) {
if (!text) return '';
// Store code blocks temporarily to protect them from processing
const codeBlocks = [];
let processed = text.replace(/```(\w+)?\n([\s\S]*?)```/g, (match, lang, code) => {
const placeholder = `___CODE_BLOCK_${codeBlocks.length}___`;
codeBlocks.push(`<pre><code class="language-${lang || ''}">${this.escapeHtml(code)}</code></pre>`);
return placeholder;
});
// Store inline code temporarily
const inlineCodes = [];
processed = processed.replace(/`([^`]+)`/g, (match, code) => {
const placeholder = `___INLINE_CODE_${inlineCodes.length}___`;
inlineCodes.push(`<code>${this.escapeHtml(code)}</code>`);
return placeholder;
});
// Now process the rest of the markdown
processed = processed
// Headers
.replace(/^### (.*$)/gim, '<h3>$1</h3>')
.replace(/^## (.*$)/gim, '<h2>$1</h2>')
.replace(/^# (.*$)/gim, '<h1>$1</h1>')
// Bold
.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
// Italic
.replace(/\*(.*?)\*/g, '<em>$1</em>')
// Links
.replace(/\[([^\]]+)\]\(([^)]+)\)/g, '<a href="$2" target="_blank">$1</a>')
// Line breaks
.replace(/\n\n/g, '</p><p>')
.replace(/\n/g, '<br>')
// Lists
.replace(/^\* (.*)$/gim, '<li>$1</li>')
.replace(/^- (.*)$/gim, '<li>$1</li>')
// Wrap in paragraphs
.replace(/^(?!<[h|p|pre|ul|ol|li])/gim, '<p>')
.replace(/(?<![>])$/gim, '</p>');
// Restore inline code
inlineCodes.forEach((code, i) => {
processed = processed.replace(`___INLINE_CODE_${i}___`, code);
});
// Restore code blocks
codeBlocks.forEach((block, i) => {
processed = processed.replace(`___CODE_BLOCK_${i}___`, block);
});
return processed;
}
escapeHtml(text) {
const div = document.createElement('div');
div.textContent = text;
return div.innerHTML;
}
formatNumber(num) {
@@ -275,45 +263,27 @@ if __name__ == "__main__":
setupEventListeners() {
// Tab switching
const tabs = document.querySelectorAll('.tab-btn');
tabs.forEach(tab => {
tab.addEventListener('click', () => {
// Update active tab
// Update active tab button
tabs.forEach(t => t.classList.remove('active'));
tab.classList.add('active');
// Show corresponding content
const tabName = tab.dataset.tab;
document.querySelectorAll('.tab-content').forEach(content => {
// Hide all tab contents
const allTabContents = document.querySelectorAll('.tab-content');
allTabContents.forEach(content => {
content.classList.remove('active');
});
document.getElementById(`${tabName}-tab`).classList.add('active');
});
});
// Copy integration code
document.getElementById('copy-integration').addEventListener('click', () => {
const code = document.getElementById('integration-code').textContent;
navigator.clipboard.writeText(code).then(() => {
const btn = document.getElementById('copy-integration');
const originalText = btn.innerHTML;
btn.innerHTML = '<span>✓</span> Copied!';
setTimeout(() => {
btn.innerHTML = originalText;
}, 2000);
});
});
// Copy code buttons
document.querySelectorAll('.copy-btn').forEach(btn => {
btn.addEventListener('click', (e) => {
const codeBlock = e.target.closest('.code-block');
const code = codeBlock.querySelector('code').textContent;
navigator.clipboard.writeText(code).then(() => {
btn.textContent = 'Copied!';
setTimeout(() => {
btn.textContent = 'Copy';
}, 2000);
});
// Show the selected tab content
const targetTab = document.getElementById(`${tabName}-tab`);
if (targetTab) {
targetTab.classList.add('active');
}
});
});
}

View File

@@ -471,13 +471,17 @@ async def delete_sponsor(sponsor_id: int):
app.include_router(router)
# Version info
VERSION = "1.1.0"
BUILD_DATE = "2025-10-26"
@app.get("/")
async def root():
"""API info"""
return {
"name": "Crawl4AI Marketplace API",
"version": "1.0.0",
"version": VERSION,
"build_date": BUILD_DATE,
"endpoints": [
"/marketplace/api/apps",
"/marketplace/api/articles",

View File

@@ -364,5 +364,19 @@ async def test_network_error_handling():
async with AsyncPlaywrightCrawlerStrategy() as strategy:
await strategy.crawl("https://invalid.example.com", config)
@pytest.mark.asyncio
async def test_remove_overlay_elements(crawler_strategy):
config = CrawlerRunConfig(
remove_overlay_elements=True,
delay_before_return_html=5,
)
response = await crawler_strategy.crawl(
"https://www2.hm.com/en_us/index.html",
config
)
assert response.status_code == 200
assert "Accept all cookies" not in response.html
if __name__ == "__main__":
pytest.main([__file__, "-v"])

View File

@@ -0,0 +1,63 @@
"""
Test for arun_many with managed CDP browser to ensure each crawl gets its own tab.
"""
import pytest
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
@pytest.mark.asyncio
async def test_arun_many_with_cdp():
"""Test arun_many opens a new tab for each url with managed CDP browser."""
# NOTE: Requires a running CDP browser at localhost:9222
# Can be started with: crwl cdp -d 9222
browser_cfg = BrowserConfig(
browser_type="cdp",
cdp_url="http://localhost:9222",
verbose=False,
)
urls = [
"https://example.com",
"https://httpbin.org/html",
"https://www.python.org",
]
crawler_cfg = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
)
async with AsyncWebCrawler(config=browser_cfg) as crawler:
results = await crawler.arun_many(urls=urls, config=crawler_cfg)
# All results should be successful and distinct
assert len(results) == 3
for result in results:
assert result.success, f"Crawl failed: {result.url} - {result.error_message}"
assert result.markdown is not None
@pytest.mark.asyncio
async def test_arun_many_with_cdp_sequential():
"""Test arun_many sequentially to isolate issues."""
browser_cfg = BrowserConfig(
browser_type="cdp",
cdp_url="http://localhost:9222",
verbose=True,
)
urls = [
"https://example.com",
"https://httpbin.org/html",
"https://www.python.org",
]
crawler_cfg = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
)
async with AsyncWebCrawler(config=browser_cfg) as crawler:
results = []
for url in urls:
result = await crawler.arun(url=url, config=crawler_cfg)
results.append(result)
assert result.success, f"Crawl failed: {result.url} - {result.error_message}"
assert result.markdown is not None
assert len(results) == 3
if __name__ == "__main__":
asyncio.run(test_arun_many_with_cdp())