Compare commits

..

1 Commits

Author SHA1 Message Date
ntohidi
ca100c6518 Release v0.7.6: The 0.7.6 Update
- Updated version to 0.7.6
- Added comprehensive demo and release notes
- Updated all documentation
- Update the veriosn in Dockerfile to 0.7.6
2025-10-22 13:46:54 +02:00
9 changed files with 622 additions and 1240 deletions

View File

@@ -785,54 +785,6 @@ curl http://localhost:11235/crawl/job/crawl_xyz
The response includes `status` field: `"processing"`, `"completed"`, or `"failed"`.
#### LLM Extraction Jobs with Webhooks
The same webhook system works for LLM extraction jobs via `/llm/job`:
```bash
# Submit LLM extraction job with webhook
curl -X POST http://localhost:11235/llm/job \
-H "Content-Type: application/json" \
-d '{
"url": "https://example.com/article",
"q": "Extract the article title, author, and main points",
"provider": "openai/gpt-4o-mini",
"webhook_config": {
"webhook_url": "https://myapp.com/webhooks/llm-complete",
"webhook_data_in_payload": true,
"webhook_headers": {
"X-Webhook-Secret": "your-secret-token"
}
}
}'
# Response: {"task_id": "llm_1234567890"}
```
**Your webhook receives:**
```json
{
"task_id": "llm_1234567890",
"task_type": "llm_extraction",
"status": "completed",
"timestamp": "2025-10-22T12:30:00.000000+00:00",
"urls": ["https://example.com/article"],
"data": {
"extracted_content": {
"title": "Understanding Web Scraping",
"author": "John Doe",
"main_points": ["Point 1", "Point 2", "Point 3"]
}
}
}
```
**Key Differences for LLM Jobs:**
- Task type is `"llm_extraction"` instead of `"crawl"`
- Extracted data is in `data.extracted_content`
- Single URL only (not an array)
- Supports schema-based extraction with `schema` parameter
> 💡 **Pro tip**: See [WEBHOOK_EXAMPLES.md](./WEBHOOK_EXAMPLES.md) for detailed examples including TypeScript client code, Flask webhook handlers, and failure handling.
---

View File

@@ -2,8 +2,8 @@
import asyncio, json, hashlib, time, psutil
from contextlib import suppress
from typing import Dict
from crawl4ai import AsyncWebCrawler, BrowserConfig, BrowserAdapter
from typing import Dict ,Optional
from crawl4ai import AsyncWebCrawler, BrowserConfig
from typing import Dict
from utils import load_config
CONFIG = load_config()
@@ -15,22 +15,11 @@ LOCK = asyncio.Lock()
MEM_LIMIT = CONFIG.get("crawler", {}).get("memory_threshold_percent", 95.0) # % RAM refuse new browsers above this
IDLE_TTL = CONFIG.get("crawler", {}).get("pool", {}).get("idle_ttl_sec", 1800) # close if unused for 30min
def _sig(cfg: BrowserConfig, adapter: Optional[BrowserAdapter] = None) -> str:
try:
config_payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",", ":"))
except (TypeError, ValueError):
# Fallback to string representation if JSON serialization fails
config_payload = str(cfg.to_dict())
adapter_name = adapter.__class__.__name__ if adapter else "PlaywrightAdapter"
payload = f"{config_payload}:{adapter_name}"
def _sig(cfg: BrowserConfig) -> str:
payload = json.dumps(cfg.to_dict(), sort_keys=True, separators=(",",":"))
return hashlib.sha1(payload.encode()).hexdigest()
async def get_crawler(
cfg: BrowserConfig, adapter: Optional[BrowserAdapter] = None
) -> AsyncWebCrawler:
sig = None
async def get_crawler(cfg: BrowserConfig) -> AsyncWebCrawler:
try:
sig = _sig(cfg)
async with LOCK:
@@ -48,13 +37,12 @@ async def get_crawler(
except Exception as e:
raise RuntimeError(f"Failed to start browser: {e}")
finally:
if sig:
if sig in POOL:
LAST_USED[sig] = time.time()
else:
# If we failed to start the browser, we should remove it from the pool
POOL.pop(sig, None)
LAST_USED.pop(sig, None)
if sig in POOL:
LAST_USED[sig] = time.time()
else:
# If we failed to start the browser, we should remove it from the pool
POOL.pop(sig, None)
LAST_USED.pop(sig, None)
# If we failed to start the browser, we should remove it from the pool
async def close_all():
async with LOCK:

File diff suppressed because it is too large Load Diff

View File

@@ -27,14 +27,6 @@
- [Hook Response Information](#hook-response-information)
- [Error Handling](#error-handling)
- [Hooks Utility: Function-Based Approach (Python)](#hooks-utility-function-based-approach-python)
- [Job Queue & Webhook API](#job-queue-webhook-api)
- [Why Use the Job Queue API?](#why-use-the-job-queue-api)
- [Available Endpoints](#available-endpoints)
- [Webhook Configuration](#webhook-configuration)
- [Usage Examples](#usage-examples)
- [Webhook Best Practices](#webhook-best-practices)
- [Use Cases](#use-cases)
- [Troubleshooting](#troubleshooting)
- [Dockerfile Parameters](#dockerfile-parameters)
- [Using the API](#using-the-api)
- [Playground Interface](#playground-interface)
@@ -1118,464 +1110,6 @@ if __name__ == "__main__":
---
## Job Queue & Webhook API
The Docker deployment includes a powerful asynchronous job queue system with webhook support for both crawling and LLM extraction tasks. Instead of waiting for long-running operations to complete, submit jobs and receive real-time notifications via webhooks when they finish.
### Why Use the Job Queue API?
**Traditional Synchronous API (`/crawl`):**
- Client waits for entire crawl to complete
- Timeout issues with long-running crawls
- Resource blocking during execution
- Constant polling required for status updates
**Asynchronous Job Queue API (`/crawl/job`, `/llm/job`):**
- ✅ Submit job and continue immediately
- ✅ No timeout concerns for long operations
- ✅ Real-time webhook notifications on completion
- ✅ Better resource utilization
- ✅ Perfect for batch processing
- ✅ Ideal for microservice architectures
### Available Endpoints
#### 1. Crawl Job Endpoint
```
POST /crawl/job
```
Submit an asynchronous crawl job with optional webhook notification.
**Request Body:**
```json
{
"urls": ["https://example.com"],
"cache_mode": "bypass",
"extraction_strategy": {
"type": "JsonCssExtractionStrategy",
"schema": {
"title": "h1",
"content": ".article-body"
}
},
"webhook_config": {
"webhook_url": "https://your-app.com/webhook/crawl-complete",
"webhook_data_in_payload": true,
"webhook_headers": {
"X-Webhook-Secret": "your-secret-token",
"X-Custom-Header": "value"
}
}
}
```
**Response:**
```json
{
"task_id": "crawl_1698765432",
"message": "Crawl job submitted"
}
```
#### 2. LLM Extraction Job Endpoint
```
POST /llm/job
```
Submit an asynchronous LLM extraction job with optional webhook notification.
**Request Body:**
```json
{
"url": "https://example.com/article",
"q": "Extract the article title, author, publication date, and main points",
"provider": "openai/gpt-4o-mini",
"schema": "{\"title\": \"string\", \"author\": \"string\", \"date\": \"string\", \"points\": [\"string\"]}",
"cache": false,
"webhook_config": {
"webhook_url": "https://your-app.com/webhook/llm-complete",
"webhook_data_in_payload": true,
"webhook_headers": {
"X-Webhook-Secret": "your-secret-token"
}
}
}
```
**Response:**
```json
{
"task_id": "llm_1698765432",
"message": "LLM job submitted"
}
```
#### 3. Job Status Endpoint
```
GET /job/{task_id}
```
Check the status and retrieve results of a submitted job.
**Response (In Progress):**
```json
{
"task_id": "crawl_1698765432",
"status": "processing",
"message": "Job is being processed"
}
```
**Response (Completed):**
```json
{
"task_id": "crawl_1698765432",
"status": "completed",
"result": {
"markdown": "# Page Title\n\nContent...",
"extracted_content": {...},
"links": {...}
}
}
```
### Webhook Configuration
Webhooks provide real-time notifications when your jobs complete, eliminating the need for constant polling.
#### Webhook Config Parameters
| Parameter | Type | Required | Description |
|-----------|------|----------|-------------|
| `webhook_url` | string | Yes | Your HTTP(S) endpoint to receive notifications |
| `webhook_data_in_payload` | boolean | No | Include full result data in webhook payload (default: false) |
| `webhook_headers` | object | No | Custom headers for authentication/identification |
#### Webhook Payload Format
**Success Notification (Crawl Job):**
```json
{
"task_id": "crawl_1698765432",
"task_type": "crawl",
"status": "completed",
"timestamp": "2025-10-22T12:30:00.000000+00:00",
"urls": ["https://example.com"],
"data": {
"markdown": "# Page content...",
"extracted_content": {...},
"links": {...}
}
}
```
**Success Notification (LLM Job):**
```json
{
"task_id": "llm_1698765432",
"task_type": "llm_extraction",
"status": "completed",
"timestamp": "2025-10-22T12:30:00.000000+00:00",
"urls": ["https://example.com/article"],
"data": {
"extracted_content": {
"title": "Understanding Web Scraping",
"author": "John Doe",
"date": "2025-10-22",
"points": ["Point 1", "Point 2"]
}
}
}
```
**Failure Notification:**
```json
{
"task_id": "crawl_1698765432",
"task_type": "crawl",
"status": "failed",
"timestamp": "2025-10-22T12:30:00.000000+00:00",
"urls": ["https://example.com"],
"error": "Connection timeout after 30 seconds"
}
```
#### Webhook Delivery & Retry
- **Delivery Method:** HTTP POST to your `webhook_url`
- **Content-Type:** `application/json`
- **Retry Policy:** Exponential backoff with 5 attempts
- Attempt 1: Immediate
- Attempt 2: 1 second delay
- Attempt 3: 2 seconds delay
- Attempt 4: 4 seconds delay
- Attempt 5: 8 seconds delay
- **Success Status Codes:** 200-299
- **Custom Headers:** Your `webhook_headers` are included in every request
### Usage Examples
#### Example 1: Python with Webhook Handler (Flask)
```python
from flask import Flask, request, jsonify
import requests
app = Flask(__name__)
# Webhook handler
@app.route('/webhook/crawl-complete', methods=['POST'])
def handle_crawl_webhook():
payload = request.json
if payload['status'] == 'completed':
print(f"✅ Job {payload['task_id']} completed!")
print(f"Task type: {payload['task_type']}")
# Access the crawl results
if 'data' in payload:
markdown = payload['data'].get('markdown', '')
extracted = payload['data'].get('extracted_content', {})
print(f"Extracted {len(markdown)} characters")
print(f"Structured data: {extracted}")
else:
print(f"❌ Job {payload['task_id']} failed: {payload.get('error')}")
return jsonify({"status": "received"}), 200
# Submit a crawl job with webhook
def submit_crawl_job():
response = requests.post(
"http://localhost:11235/crawl/job",
json={
"urls": ["https://example.com"],
"extraction_strategy": {
"type": "JsonCssExtractionStrategy",
"schema": {
"name": "Example Schema",
"baseSelector": "body",
"fields": [
{"name": "title", "selector": "h1", "type": "text"},
{"name": "description", "selector": "meta[name='description']", "type": "attribute", "attribute": "content"}
]
}
},
"webhook_config": {
"webhook_url": "https://your-app.com/webhook/crawl-complete",
"webhook_data_in_payload": True,
"webhook_headers": {
"X-Webhook-Secret": "your-secret-token"
}
}
}
)
task_id = response.json()['task_id']
print(f"Job submitted: {task_id}")
return task_id
if __name__ == '__main__':
app.run(port=5000)
```
#### Example 2: LLM Extraction with Webhooks
```python
import requests
def submit_llm_job_with_webhook():
response = requests.post(
"http://localhost:11235/llm/job",
json={
"url": "https://example.com/article",
"q": "Extract the article title, author, and main points",
"provider": "openai/gpt-4o-mini",
"webhook_config": {
"webhook_url": "https://your-app.com/webhook/llm-complete",
"webhook_data_in_payload": True,
"webhook_headers": {
"X-Webhook-Secret": "your-secret-token"
}
}
}
)
task_id = response.json()['task_id']
print(f"LLM job submitted: {task_id}")
return task_id
# Webhook handler for LLM jobs
@app.route('/webhook/llm-complete', methods=['POST'])
def handle_llm_webhook():
payload = request.json
if payload['status'] == 'completed':
extracted = payload['data']['extracted_content']
print(f"✅ LLM extraction completed!")
print(f"Results: {extracted}")
else:
print(f"❌ LLM extraction failed: {payload.get('error')}")
return jsonify({"status": "received"}), 200
```
#### Example 3: Without Webhooks (Polling)
If you don't use webhooks, you can poll for results:
```python
import requests
import time
# Submit job
response = requests.post(
"http://localhost:11235/crawl/job",
json={"urls": ["https://example.com"]}
)
task_id = response.json()['task_id']
# Poll for results
while True:
result = requests.get(f"http://localhost:11235/job/{task_id}")
data = result.json()
if data['status'] == 'completed':
print("Job completed!")
print(data['result'])
break
elif data['status'] == 'failed':
print(f"Job failed: {data.get('error')}")
break
print("Still processing...")
time.sleep(2)
```
#### Example 4: Global Webhook Configuration
Set a default webhook URL in your `config.yml` to avoid repeating it in every request:
```yaml
# config.yml
api:
crawler:
# ... other settings ...
webhook:
default_url: "https://your-app.com/webhook/default"
default_headers:
X-Webhook-Secret: "your-secret-token"
```
Then submit jobs without webhook config:
```python
# Uses the global webhook configuration
response = requests.post(
"http://localhost:11235/crawl/job",
json={"urls": ["https://example.com"]}
)
```
### Webhook Best Practices
1. **Authentication:** Always use custom headers for webhook authentication
```json
"webhook_headers": {
"X-Webhook-Secret": "your-secret-token"
}
```
2. **Idempotency:** Design your webhook handler to be idempotent (safe to receive duplicate notifications)
3. **Fast Response:** Return HTTP 200 quickly; process data asynchronously if needed
```python
@app.route('/webhook', methods=['POST'])
def webhook():
payload = request.json
# Queue for background processing
queue.enqueue(process_webhook, payload)
return jsonify({"status": "received"}), 200
```
4. **Error Handling:** Handle both success and failure notifications
```python
if payload['status'] == 'completed':
# Process success
elif payload['status'] == 'failed':
# Log error, retry, or alert
```
5. **Validation:** Verify webhook authenticity using custom headers
```python
secret = request.headers.get('X-Webhook-Secret')
if secret != os.environ['EXPECTED_SECRET']:
return jsonify({"error": "Unauthorized"}), 401
```
6. **Logging:** Log webhook deliveries for debugging
```python
logger.info(f"Webhook received: {payload['task_id']} - {payload['status']}")
```
### Use Cases
**1. Batch Processing**
Submit hundreds of URLs and get notified as each completes:
```python
urls = ["https://site1.com", "https://site2.com", ...]
for url in urls:
submit_crawl_job(url, webhook_url="https://app.com/webhook")
```
**2. Microservice Integration**
Integrate with event-driven architectures:
```python
# Service A submits job
task_id = submit_crawl_job(url)
# Service B receives webhook and triggers next step
@app.route('/webhook')
def webhook():
process_result(request.json)
trigger_next_service()
return "OK", 200
```
**3. Long-Running Extractions**
Handle complex LLM extractions without timeouts:
```python
submit_llm_job(
url="https://long-article.com",
q="Comprehensive summary with key points and analysis",
webhook_url="https://app.com/webhook/llm"
)
```
### Troubleshooting
**Webhook not receiving notifications?**
- Check your webhook URL is publicly accessible
- Verify firewall/security group settings
- Use webhook testing tools like webhook.site for debugging
- Check server logs for delivery attempts
- Ensure your handler returns 200-299 status code
**Job stuck in processing?**
- Check Redis connection: `docker logs <container_name> | grep redis`
- Verify worker processes: `docker exec <container_name> ps aux | grep worker`
- Check server logs: `docker logs <container_name>`
**Need to cancel a job?**
Jobs are processed asynchronously. If you need to cancel:
- Delete the task from Redis (requires Redis CLI access)
- Or implement a cancellation endpoint in your webhook handler
---
## Dockerfile Parameters
You can customize the image build process using build arguments (`--build-arg`). These are typically used via `docker buildx build` or within the `docker-compose.yml` file.

View File

@@ -529,19 +529,8 @@ class AdminDashboard {
</label>
</div>
<div class="form-group full-width">
<label>Long Description (Markdown - Overview tab)</label>
<textarea id="form-long-description" rows="10" placeholder="Enter detailed description with markdown formatting...">${app?.long_description || ''}</textarea>
<small>Markdown support: **bold**, *italic*, [links](url), # headers, code blocks, lists</small>
</div>
<div class="form-group full-width">
<label>Integration Guide (Markdown - Integration tab)</label>
<textarea id="form-integration" rows="20" placeholder="Enter integration guide with installation, examples, and code snippets using markdown...">${app?.integration_guide || ''}</textarea>
<small>Single markdown field with installation, examples, and complete guide. Code blocks get auto copy buttons.</small>
</div>
<div class="form-group full-width">
<label>Documentation (Markdown - Documentation tab)</label>
<textarea id="form-documentation" rows="20" placeholder="Enter documentation with API reference, examples, and best practices using markdown...">${app?.documentation || ''}</textarea>
<small>Full documentation with API reference, examples, best practices, etc.</small>
<label>Integration Guide</label>
<textarea id="form-integration" rows="10">${app?.integration_guide || ''}</textarea>
</div>
</div>
`;
@@ -723,9 +712,7 @@ class AdminDashboard {
data.contact_email = document.getElementById('form-email').value;
data.featured = document.getElementById('form-featured').checked ? 1 : 0;
data.sponsored = document.getElementById('form-sponsored').checked ? 1 : 0;
data.long_description = document.getElementById('form-long-description').value;
data.integration_guide = document.getElementById('form-integration').value;
data.documentation = document.getElementById('form-documentation').value;
} else if (type === 'articles') {
data.title = document.getElementById('form-title').value;
data.slug = this.generateSlug(data.title);

View File

@@ -278,12 +278,12 @@
}
.tab-content {
display: none !important;
display: none;
padding: 2rem;
}
.tab-content.active {
display: block !important;
display: block;
}
/* Overview Layout */
@@ -510,31 +510,6 @@
line-height: 1.5;
}
/* Markdown rendered code blocks */
.integration-content pre,
.docs-content pre {
background: var(--bg-dark);
border: 1px solid var(--border-color);
margin: 1rem 0;
padding: 1rem;
padding-top: 2.5rem; /* Space for copy button */
overflow-x: auto;
position: relative;
max-height: none; /* Remove any height restrictions */
height: auto; /* Allow content to expand */
}
.integration-content pre code,
.docs-content pre code {
background: transparent;
padding: 0;
color: var(--text-secondary);
font-size: 0.875rem;
line-height: 1.5;
white-space: pre; /* Preserve whitespace and line breaks */
display: block;
}
/* Feature Grid */
.feature-grid {
display: grid;

View File

@@ -73,14 +73,27 @@
<div class="tabs">
<button class="tab-btn active" data-tab="overview">Overview</button>
<button class="tab-btn" data-tab="integration">Integration</button>
<!-- <button class="tab-btn" data-tab="docs">Documentation</button>
<button class="tab-btn" data-tab="support">Support</button> -->
<button class="tab-btn" data-tab="docs">Documentation</button>
<button class="tab-btn" data-tab="support">Support</button>
</div>
<section id="overview-tab" class="tab-content active">
<div class="overview-columns">
<div class="overview-main">
<h2>Overview</h2>
<div id="app-overview">Overview content goes here.</div>
<h3>Key Features</h3>
<ul id="app-features" class="features-list">
<li>Feature 1</li>
<li>Feature 2</li>
<li>Feature 3</li>
</ul>
<h3>Use Cases</h3>
<div id="app-use-cases" class="use-cases">
<p>Describe how this app can help your workflow.</p>
</div>
</div>
<aside class="sidebar">
@@ -129,16 +142,37 @@
</section>
<section id="integration-tab" class="tab-content">
<div class="integration-content" id="app-integration">
<div class="integration-content">
<h2>Integration Guide</h2>
<h3>Installation</h3>
<div class="code-block">
<pre><code id="install-code"># Installation instructions will appear here</code></pre>
</div>
<h3>Basic Usage</h3>
<div class="code-block">
<pre><code id="usage-code"># Usage example will appear here</code></pre>
</div>
<h3>Complete Integration Example</h3>
<div class="code-block">
<button class="copy-btn" id="copy-integration">Copy</button>
<pre><code id="integration-code"># Complete integration guide will appear here</code></pre>
</div>
</div>
</section>
<!-- <section id="docs-tab" class="tab-content">
<div class="docs-content" id="app-docs">
<section id="docs-tab" class="tab-content">
<div class="docs-content">
<h2>Documentation</h2>
<div id="app-docs" class="doc-sections">
<p>Documentation coming soon.</p>
</div>
</div>
</section> -->
</section>
<!-- <section id="support-tab" class="tab-content">
<section id="support-tab" class="tab-content">
<div class="docs-content">
<h2>Support</h2>
<div class="support-grid">
@@ -156,7 +190,7 @@
</div>
</div>
</div>
</section> -->
</section>
</div>
</main>

View File

@@ -112,7 +112,7 @@ class AppDetailPage {
}
// Contact
document.getElementById('app-contact') && (document.getElementById('app-contact').textContent = this.appData.contact_email || 'Not available');
document.getElementById('app-contact').textContent = this.appData.contact_email || 'Not available';
// Sidebar info
document.getElementById('sidebar-downloads').textContent = this.formatNumber(this.appData.downloads || 0);
@@ -123,134 +123,146 @@ class AppDetailPage {
document.getElementById('sidebar-pricing').textContent = this.appData.pricing || 'Free';
document.getElementById('sidebar-contact').textContent = this.appData.contact_email || 'contact@example.com';
// Render tab contents from database fields
this.renderTabContents();
// Integration guide
this.renderIntegrationGuide();
}
renderTabContents() {
// Overview tab - use long_description from database
const overviewDiv = document.getElementById('app-overview');
if (overviewDiv) {
if (this.appData.long_description) {
overviewDiv.innerHTML = this.renderMarkdown(this.appData.long_description);
} else {
overviewDiv.innerHTML = `<p>${this.appData.description || 'No overview available.'}</p>`;
renderIntegrationGuide() {
// Installation code
const installCode = document.getElementById('install-code');
if (installCode) {
if (this.appData.type === 'Open Source' && this.appData.github_url) {
installCode.textContent = `# Clone from GitHub
git clone ${this.appData.github_url}
# Install dependencies
pip install -r requirements.txt`;
} else if (this.appData.name.toLowerCase().includes('api')) {
installCode.textContent = `# Install via pip
pip install ${this.appData.slug}
# Or install from source
pip install git+${this.appData.github_url || 'https://github.com/example/repo'}`;
}
}
// Integration tab - use integration_guide field from database
const integrationDiv = document.getElementById('app-integration');
if (integrationDiv) {
if (this.appData.integration_guide) {
integrationDiv.innerHTML = this.renderMarkdown(this.appData.integration_guide);
// Add copy buttons to all code blocks
this.addCopyButtonsToCodeBlocks(integrationDiv);
} else {
integrationDiv.innerHTML = '<p>Integration guide not yet available. Please check the official website for details.</p>';
// Usage code - customize based on category
const usageCode = document.getElementById('usage-code');
if (usageCode) {
if (this.appData.category === 'Browser Automation') {
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
from ${this.appData.slug.replace(/-/g, '_')} import ${this.appData.name.replace(/\s+/g, '')}
async def main():
# Initialize ${this.appData.name}
automation = ${this.appData.name.replace(/\s+/g, '')}()
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com",
browser_config=automation.config,
wait_for="css:body"
)
print(result.markdown)`;
} else if (this.appData.category === 'Proxy Services') {
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
import ${this.appData.slug.replace(/-/g, '_')}
# Configure proxy
proxy_config = {
"server": "${this.appData.website_url || 'https://proxy.example.com'}",
"username": "your_username",
"password": "your_password"
}
async with AsyncWebCrawler(proxy=proxy_config) as crawler:
result = await crawler.arun(
url="https://example.com",
bypass_cache=True
)
print(result.status_code)`;
} else if (this.appData.category === 'LLM Integration') {
usageCode.textContent = `from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import LLMExtractionStrategy
# Configure LLM extraction
strategy = LLMExtractionStrategy(
provider="${this.appData.name.toLowerCase().includes('gpt') ? 'openai' : 'anthropic'}",
api_key="your-api-key",
model="${this.appData.name.toLowerCase().includes('gpt') ? 'gpt-4' : 'claude-3'}",
instruction="Extract structured data"
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com",
extraction_strategy=strategy
)
print(result.extracted_content)`;
}
}
// Documentation tab - use documentation field from database
const docsDiv = document.getElementById('app-docs');
if (docsDiv) {
if (this.appData.documentation) {
docsDiv.innerHTML = this.renderMarkdown(this.appData.documentation);
// Add copy buttons to all code blocks
this.addCopyButtonsToCodeBlocks(docsDiv);
} else {
docsDiv.innerHTML = '<p>Documentation coming soon.</p>';
}
// Integration example
const integrationCode = document.getElementById('integration-code');
if (integrationCode) {
integrationCode.textContent = this.appData.integration_guide ||
`# Complete ${this.appData.name} Integration Example
from crawl4ai import AsyncWebCrawler
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
import json
async def crawl_with_${this.appData.slug.replace(/-/g, '_')}():
"""
Complete example showing how to use ${this.appData.name}
with Crawl4AI for production web scraping
"""
# Define extraction schema
schema = {
"name": "ProductList",
"baseSelector": "div.product",
"fields": [
{"name": "title", "selector": "h2", "type": "text"},
{"name": "price", "selector": ".price", "type": "text"},
{"name": "image", "selector": "img", "type": "attribute", "attribute": "src"},
{"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
]
}
# Initialize crawler with ${this.appData.name}
async with AsyncWebCrawler(
browser_type="chromium",
headless=True,
verbose=True
) as crawler:
# Crawl with extraction
result = await crawler.arun(
url="https://example.com/products",
extraction_strategy=JsonCssExtractionStrategy(schema),
cache_mode="bypass",
wait_for="css:.product",
screenshot=True
)
# Process results
if result.success:
products = json.loads(result.extracted_content)
print(f"Found {len(products)} products")
for product in products[:5]:
print(f"- {product['title']}: {product['price']}")
return products
# Run the crawler
if __name__ == "__main__":
import asyncio
asyncio.run(crawl_with_${this.appData.slug.replace(/-/g, '_')}())`;
}
}
addCopyButtonsToCodeBlocks(container) {
// Find all code blocks and add copy buttons
const codeBlocks = container.querySelectorAll('pre code');
codeBlocks.forEach(codeBlock => {
const pre = codeBlock.parentElement;
// Skip if already has a copy button
if (pre.querySelector('.copy-btn')) return;
// Create copy button
const copyBtn = document.createElement('button');
copyBtn.className = 'copy-btn';
copyBtn.textContent = 'Copy';
copyBtn.onclick = () => {
navigator.clipboard.writeText(codeBlock.textContent).then(() => {
copyBtn.textContent = '✓ Copied!';
setTimeout(() => {
copyBtn.textContent = 'Copy';
}, 2000);
});
};
// Add button to pre element
pre.style.position = 'relative';
pre.insertBefore(copyBtn, codeBlock);
});
}
renderMarkdown(text) {
if (!text) return '';
// Store code blocks temporarily to protect them from processing
const codeBlocks = [];
let processed = text.replace(/```(\w+)?\n([\s\S]*?)```/g, (match, lang, code) => {
const placeholder = `___CODE_BLOCK_${codeBlocks.length}___`;
codeBlocks.push(`<pre><code class="language-${lang || ''}">${this.escapeHtml(code)}</code></pre>`);
return placeholder;
});
// Store inline code temporarily
const inlineCodes = [];
processed = processed.replace(/`([^`]+)`/g, (match, code) => {
const placeholder = `___INLINE_CODE_${inlineCodes.length}___`;
inlineCodes.push(`<code>${this.escapeHtml(code)}</code>`);
return placeholder;
});
// Now process the rest of the markdown
processed = processed
// Headers
.replace(/^### (.*$)/gim, '<h3>$1</h3>')
.replace(/^## (.*$)/gim, '<h2>$1</h2>')
.replace(/^# (.*$)/gim, '<h1>$1</h1>')
// Bold
.replace(/\*\*(.*?)\*\*/g, '<strong>$1</strong>')
// Italic
.replace(/\*(.*?)\*/g, '<em>$1</em>')
// Links
.replace(/\[([^\]]+)\]\(([^)]+)\)/g, '<a href="$2" target="_blank">$1</a>')
// Line breaks
.replace(/\n\n/g, '</p><p>')
.replace(/\n/g, '<br>')
// Lists
.replace(/^\* (.*)$/gim, '<li>$1</li>')
.replace(/^- (.*)$/gim, '<li>$1</li>')
// Wrap in paragraphs
.replace(/^(?!<[h|p|pre|ul|ol|li])/gim, '<p>')
.replace(/(?<![>])$/gim, '</p>');
// Restore inline code
inlineCodes.forEach((code, i) => {
processed = processed.replace(`___INLINE_CODE_${i}___`, code);
});
// Restore code blocks
codeBlocks.forEach((block, i) => {
processed = processed.replace(`___CODE_BLOCK_${i}___`, block);
});
return processed;
}
escapeHtml(text) {
const div = document.createElement('div');
div.textContent = text;
return div.innerHTML;
}
formatNumber(num) {
if (num >= 1000000) {
return (num / 1000000).toFixed(1) + 'M';
@@ -263,27 +275,45 @@ class AppDetailPage {
setupEventListeners() {
// Tab switching
const tabs = document.querySelectorAll('.tab-btn');
tabs.forEach(tab => {
tab.addEventListener('click', () => {
// Update active tab button
// Update active tab
tabs.forEach(t => t.classList.remove('active'));
tab.classList.add('active');
// Show corresponding content
const tabName = tab.dataset.tab;
// Hide all tab contents
const allTabContents = document.querySelectorAll('.tab-content');
allTabContents.forEach(content => {
document.querySelectorAll('.tab-content').forEach(content => {
content.classList.remove('active');
});
document.getElementById(`${tabName}-tab`).classList.add('active');
});
});
// Show the selected tab content
const targetTab = document.getElementById(`${tabName}-tab`);
if (targetTab) {
targetTab.classList.add('active');
}
// Copy integration code
document.getElementById('copy-integration').addEventListener('click', () => {
const code = document.getElementById('integration-code').textContent;
navigator.clipboard.writeText(code).then(() => {
const btn = document.getElementById('copy-integration');
const originalText = btn.innerHTML;
btn.innerHTML = '<span>✓</span> Copied!';
setTimeout(() => {
btn.innerHTML = originalText;
}, 2000);
});
});
// Copy code buttons
document.querySelectorAll('.copy-btn').forEach(btn => {
btn.addEventListener('click', (e) => {
const codeBlock = e.target.closest('.code-block');
const code = codeBlock.querySelector('code').textContent;
navigator.clipboard.writeText(code).then(() => {
btn.textContent = 'Copied!';
setTimeout(() => {
btn.textContent = 'Copy';
}, 2000);
});
});
});
}

View File

@@ -471,17 +471,13 @@ async def delete_sponsor(sponsor_id: int):
app.include_router(router)
# Version info
VERSION = "1.1.0"
BUILD_DATE = "2025-10-26"
@app.get("/")
async def root():
"""API info"""
return {
"name": "Crawl4AI Marketplace API",
"version": VERSION,
"build_date": BUILD_DATE,
"version": "1.0.0",
"endpoints": [
"/marketplace/api/apps",
"/marketplace/api/articles",