diff --git a/.claude/settings.local.json b/.claude/settings.local.json
index 616bc6dd..52e79a4f 100644
--- a/.claude/settings.local.json
+++ b/.claude/settings.local.json
@@ -20,7 +20,8 @@
"Bash(docker logs:*)",
"Bash(curl:*)",
"Bash(docker compose:*)",
- "Bash(./test-final-integration.sh:*)"
+ "Bash(./test-final-integration.sh:*)",
+ "Bash(mv:*)"
]
},
"enableAllProjectMcpServers": false
diff --git a/docs/md_v2/apps/crawl4ai-assistant/README.md b/docs/md_v2/apps/crawl4ai-assistant/README.md
index 9d3841f1..9d6f4a60 100644
--- a/docs/md_v2/apps/crawl4ai-assistant/README.md
+++ b/docs/md_v2/apps/crawl4ai-assistant/README.md
@@ -1,14 +1,15 @@
# Crawl4AI Chrome Extension
-Visual schema and script builder for Crawl4AI - Build extraction schemas by clicking on webpage elements!
+Visual extraction tools for Crawl4AI - Click to extract data and content from any webpage!
## ๐ Features
-- **Visual Schema Builder**: Click on elements to build extraction schemas
+- **Click2Crawl**: Click on elements to build data extraction schemas instantly
+- **Markdown Extraction**: Select elements and export as clean markdown
+- **Script Builder (Alpha)**: Record browser actions to create automation scripts
- **Smart Element Selection**: Container and field selection with visual feedback
-- **Code Generation**: Generates complete Python code with LLM integration
+- **Code Generation**: Generates complete Python code for Crawl4AI
- **Beautiful Dark UI**: Consistent with Crawl4AI's design language
-- **One-Click Download**: Get your generated code instantly
## ๐ฆ Installation
@@ -33,11 +34,11 @@ If you want proper icons:
## ๐ฏ How to Use
-### Building a Schema
+### Using Click2Crawl
1. **Navigate to any website** you want to extract data from
2. **Click the Crawl4AI extension icon** in your toolbar
-3. **Click "Schema Builder"** to start the capture mode
+3. **Click "Click2Crawl"** to start the capture mode
4. **Select a container element**:
- Hover over elements (they'll highlight in blue)
- Click on a repeating container (e.g., product card, article block)
@@ -45,9 +46,9 @@ If you want proper icons:
- Elements will now highlight in green
- Click on each piece of data you want to extract
- Name each field (e.g., "title", "price", "description")
-6. **Generate the code**:
- - Click "Generate Code" in the extension popup
- - A Python file will automatically download
+6. **Test and Export**:
+ - Click "Test Schema" to see extracted data instantly
+ - Export as Python code, JSON schema, or markdown
### Running the Generated Code
diff --git a/docs/md_v2/apps/crawl4ai-assistant/content/schemaBuilder.js b/docs/md_v2/apps/crawl4ai-assistant/content/click2crawl.js
similarity index 91%
rename from docs/md_v2/apps/crawl4ai-assistant/content/schemaBuilder.js
rename to docs/md_v2/apps/crawl4ai-assistant/content/click2crawl.js
index 7bbb7a80..0c3c37b9 100644
--- a/docs/md_v2/apps/crawl4ai-assistant/content/schemaBuilder.js
+++ b/docs/md_v2/apps/crawl4ai-assistant/content/click2crawl.js
@@ -1,15 +1,16 @@
-// Enhanced SchemaBuilder class for Crawl4AI Chrome Extension
+// Click2Crawl class for Crawl4AI Chrome Extension
+// Click elements to build extraction schemas
// Singleton instance to prevent multiple toolbars
-let schemaBuilderInstance = null;
+let click2CrawlInstance = null;
-class SchemaBuilder {
+class Click2Crawl {
constructor() {
// Prevent multiple instances
- if (schemaBuilderInstance) {
- schemaBuilderInstance.stop();
+ if (click2CrawlInstance) {
+ click2CrawlInstance.stop();
}
- schemaBuilderInstance = this;
+ click2CrawlInstance = this;
this.container = null;
this.fields = [];
@@ -57,9 +58,15 @@ class SchemaBuilder {
this.inspectingFields = false;
this.parentLevels = 1;
+ // Clean up markdown preview modal
+ if (this.markdownPreviewModal) {
+ this.markdownPreviewModal.destroy();
+ this.markdownPreviewModal = null;
+ }
+
// Clear singleton reference
- if (schemaBuilderInstance === this) {
- schemaBuilderInstance = null;
+ if (click2CrawlInstance === this) {
+ click2CrawlInstance = null;
}
}
@@ -97,8 +104,8 @@ class SchemaBuilder {
-
-
๐ง Schema Builder
+
Click2Crawl
+
@@ -151,6 +158,9 @@ class SchemaBuilder {
+
@@ -202,6 +212,7 @@ class SchemaBuilder {
addClickHandler('c4ai-test', () => this.testSchema());
addClickHandler('c4ai-export-schema', () => this.exportSchema());
addClickHandler('c4ai-export-data', () => this.exportData());
+ addClickHandler('c4ai-export-markdown', () => this.exportMarkdown());
addClickHandler('c4ai-deploy-cloud', () => this.deployToCloud());
addClickHandler('c4ai-close', () => this.stop());
@@ -273,10 +284,15 @@ class SchemaBuilder {
handleClick(e) {
const element = e.target;
- // Check if clicking on our UI elements
+ // Check if clicking on our UI elements (including markdown preview modal)
if (this.isOurElement(element)) {
return; // Let toolbar clicks work normally
}
+
+ // Additional check for markdown preview modal classes
+ if (element.closest('.c4ai-c2c-preview') || element.closest('.c4ai-preview-options')) {
+ return; // Don't interfere with markdown preview modal
+ }
// Use current element
const targetElement = this.currentElement || element;
@@ -303,7 +319,9 @@ class SchemaBuilder {
isOurElement(element) {
return window.C4AI_Utils.isOurElement(element) ||
- (this.selectedBox && element === this.selectedBox);
+ (this.selectedBox && element === this.selectedBox) ||
+ (this.markdownPreviewModal && this.markdownPreviewModal.modal &&
+ (element === this.markdownPreviewModal.modal || this.markdownPreviewModal.modal.contains(element)));
}
showSelectedBox(element) {
@@ -499,6 +517,9 @@ class SchemaBuilder {
}
showFieldDialog(element) {
+ // Remove any existing field dialogs first
+ document.querySelectorAll('.c4ai-field-dialog').forEach(d => d.remove());
+
const dialog = document.createElement('div');
dialog.className = 'c4ai-field-dialog';
@@ -922,6 +943,7 @@ class SchemaBuilder {
document.getElementById('c4ai-test').disabled = false;
document.getElementById('c4ai-export-schema').disabled = false;
document.getElementById('c4ai-export-data').disabled = false;
+ document.getElementById('c4ai-export-markdown').disabled = false;
document.getElementById('c4ai-deploy-cloud').disabled = false;
} else {
schemaSection.style.display = 'none';
@@ -976,6 +998,9 @@ class SchemaBuilder {
const field = this.fields[index];
if (!field) return;
+ // Remove any existing field dialogs first
+ document.querySelectorAll('.c4ai-field-dialog').forEach(d => d.remove());
+
// Re-show the field dialog with existing values
const dialog = document.createElement('div');
dialog.className = 'c4ai-field-dialog';
@@ -1476,6 +1501,137 @@ class SchemaBuilder {
await this.testSchema();
}
+ async exportMarkdown() {
+ // Initialize markdown converter if not already done
+ if (!this.markdownConverter) {
+ this.markdownConverter = new MarkdownConverter();
+ }
+ if (!this.contentAnalyzer) {
+ this.contentAnalyzer = new ContentAnalyzer();
+ }
+
+ // Initialize markdown preview modal if not already done
+ if (!this.markdownPreviewModal) {
+ this.markdownPreviewModal = new MarkdownPreviewModal();
+ }
+
+ // Get all matching containers
+ const containers = document.querySelectorAll(this.container.selector);
+ if (containers.length === 0) {
+ this.showNotification('No matching containers found', 'error');
+ return;
+ }
+
+ // Show modal with callback to generate markdown
+ this.markdownPreviewModal.show(async (options) => {
+ return await this.generateMarkdownFromSchema(options);
+ });
+ }
+
+
+
+
+
+ async generateMarkdownFromSchema(options) {
+ // Get all matching containers
+ const containers = document.querySelectorAll(this.container.selector);
+ const markdownParts = [];
+
+ for (let i = 0; i < containers.length; i++) {
+ const container = containers[i];
+
+ // Add XPath header if enabled
+ if (options.includeXPath) {
+ const xpath = this.getXPath(container);
+ markdownParts.push(`### Container ${i + 1} - XPath: \`${xpath}\`\n`);
+ }
+
+ // Extract data based on schema fields
+ const extractedData = {};
+ this.fields.forEach(field => {
+ try {
+ const element = container.querySelector(field.selector);
+ if (element) {
+ if (field.type === 'text') {
+ extractedData[field.name] = element.textContent.trim();
+ } else if (field.type === 'attribute' && field.attribute) {
+ extractedData[field.name] = element.getAttribute(field.attribute);
+ }
+ }
+ } catch (e) {
+ // Skip invalid selectors
+ }
+ });
+
+ // Convert container to markdown based on options
+ const analysis = await this.contentAnalyzer.analyze([container]);
+ const containerMarkdown = await this.markdownConverter.convert([container], {
+ ...options,
+ analysis,
+ extractedData // Pass extracted data for context
+ });
+
+ // Trim the markdown before adding
+ const trimmedMarkdown = containerMarkdown.trim();
+ markdownParts.push(trimmedMarkdown);
+
+ // Add separator if enabled and not last element
+ if (options.addSeparators && i < containers.length - 1) {
+ markdownParts.push('\n---\n');
+ }
+ }
+
+ return markdownParts.join('\n');
+ }
+
+ getXPath(element) {
+ if (element.id) {
+ return `//*[@id="${element.id}"]`;
+ }
+
+ const parts = [];
+ let current = element;
+
+ while (current && current.nodeType === Node.ELEMENT_NODE) {
+ let index = 0;
+ let sibling = current.previousSibling;
+
+ while (sibling) {
+ if (sibling.nodeType === Node.ELEMENT_NODE && sibling.nodeName === current.nodeName) {
+ index++;
+ }
+ sibling = sibling.previousSibling;
+ }
+
+ const tagName = current.nodeName.toLowerCase();
+ const part = index > 0 ? `${tagName}[${index + 1}]` : tagName;
+ parts.unshift(part);
+
+ current = current.parentNode;
+ }
+
+ return '/' + parts.join('/');
+ }
+
+
+
+ showNotification(message, type = 'success') {
+ const notification = document.createElement('div');
+ notification.className = `c4ai-notification c4ai-notification-${type}`;
+ notification.textContent = message;
+
+ document.body.appendChild(notification);
+
+ // Animate in
+ setTimeout(() => notification.classList.add('show'), 10);
+
+ // Remove after 3 seconds
+ setTimeout(() => {
+ notification.classList.remove('show');
+ setTimeout(() => notification.remove(), 300);
+ }, 3000);
+ }
+
deployToCloud() {
// Create cloud deployment modal
const modal = document.createElement('div');
@@ -1808,5 +1964,5 @@ if __name__ == "__main__":
// Export for use in content script
if (typeof window !== 'undefined') {
- window.SchemaBuilder = SchemaBuilder;
+ window.Click2Crawl = Click2Crawl;
}
\ No newline at end of file
diff --git a/docs/md_v2/apps/crawl4ai-assistant/content/content.js b/docs/md_v2/apps/crawl4ai-assistant/content/content.js
index f20efe3b..a4f63a99 100644
--- a/docs/md_v2/apps/crawl4ai-assistant/content/content.js
+++ b/docs/md_v2/apps/crawl4ai-assistant/content/content.js
@@ -1,5 +1,5 @@
// Main content script for Crawl4AI Assistant
-// Coordinates between SchemaBuilder and ScriptBuilder
+// Coordinates between Click2Crawl, ScriptBuilder, and MarkdownExtraction
let activeBuilder = null;
@@ -13,8 +13,8 @@ chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
}
if (request.mode === 'schema') {
- console.log('Starting Schema Builder');
- activeBuilder = new SchemaBuilder();
+ console.log('Starting Click2Crawl');
+ activeBuilder = new Click2Crawl();
activeBuilder.start();
} else if (request.mode === 'script') {
console.log('Starting Script Builder');
@@ -34,8 +34,8 @@ chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
activeBuilder.deactivate?.();
activeBuilder = null;
}
- console.log('Starting Schema Builder');
- activeBuilder = new SchemaBuilder();
+ console.log('Starting Click2Crawl');
+ activeBuilder = new Click2Crawl();
activeBuilder.start();
sendResponse({ success: true });
} else if (request.action === 'startScriptCapture') {
@@ -52,8 +52,8 @@ chrome.runtime.onMessage.addListener((request, sender, sendResponse) => {
activeBuilder.deactivate?.();
activeBuilder = null;
}
- console.log('Starting Click2Crawl');
- activeBuilder = new Click2CrawlBuilder();
+ console.log('Starting Markdown Extraction');
+ activeBuilder = new MarkdownExtraction();
sendResponse({ success: true });
} else if (request.action === 'generateCode') {
if (activeBuilder && activeBuilder.generateCode) {
diff --git a/docs/md_v2/apps/crawl4ai-assistant/content/click2CrawlBuilder.js b/docs/md_v2/apps/crawl4ai-assistant/content/markdownExtraction.js
similarity index 90%
rename from docs/md_v2/apps/crawl4ai-assistant/content/click2CrawlBuilder.js
rename to docs/md_v2/apps/crawl4ai-assistant/content/markdownExtraction.js
index 4fdcc40e..6e4cf393 100644
--- a/docs/md_v2/apps/crawl4ai-assistant/content/click2CrawlBuilder.js
+++ b/docs/md_v2/apps/crawl4ai-assistant/content/markdownExtraction.js
@@ -1,26 +1,14 @@
-class Click2CrawlBuilder {
+class MarkdownExtraction {
constructor() {
this.selectedElements = new Set();
this.highlightBoxes = new Map();
this.selectionMode = false;
this.toolbar = null;
- this.previewPanel = null;
+ this.markdownPreviewModal = null;
this.selectionCounter = 0;
this.markdownConverter = null;
this.contentAnalyzer = null;
- // Configuration options
- this.options = {
- includeImages: true,
- preserveTables: true,
- keepCodeFormatting: true,
- simplifyLayout: false,
- preserveLinks: true,
- addSeparators: true,
- includeXPath: false,
- textOnly: false
- };
-
this.init();
}
@@ -44,7 +32,7 @@ class Click2CrawlBuilder {
- Click2Crawl
+ Markdown Extraction
Transform any website into structured data with just a few clicks! The Crawl4AI Assistant Chrome Extension provides three powerful tools for web scraping and data extraction.
- ๐ NEW: Schema Builder now extracts data INSTANTLY without any LLM! Test your schema and see JSON results immediately in the browser!
+ ๐ NEW: Click2Crawl extracts data INSTANTLY without any LLM! Test your schema and see JSON results immediately in the browser!
๐ฏ
-
Schema Builder
-
Extract data instantly without LLMs - see results in real-time!
+
Click2Crawl
+
Visual data extraction - click elements to build schemas instantly!
๐ด
@@ -77,8 +77,8 @@
๐
-
Click2Crawl (New!)
-
Select multiple elements to extract clean markdown "as you see"
+
Markdown Extraction (New!)
+
Convert any webpage content to clean markdown with Visual Text Mode
-
-
๐
+
+
๐ฏ
-
Schema Builder
+
Click2Crawl
Visual data extraction
Available
@@ -154,11 +154,11 @@
Alpha
-
+
๐
-
Click2Crawl
-
Markdown extraction
+
Markdown Extraction
+
Content to markdown
New!
@@ -166,11 +166,11 @@
-
-
+
+
-
๐ Schema Builder
- No LLM needed - Extract data instantly!
+
๐ฏ Click2Crawl
+ Click elements to build extraction schemas - No LLM needed!
@@ -199,8 +199,8 @@
3
-
Test & Extract Data NOW!
-
๐ Click "Test Schema" to extract ALL matching data instantly - no coding required!
+
Test & Extract Data Instantly!
+
๐ Click "Test Schema" to see extracted JSON immediately - no LLM or coding required!
โก See extracted JSON immediately
@@ -210,11 +210,12 @@
๐ Zero LLM dependency
-
๐ Instant data extraction
-
๐ฏ Smart selector generation
-
๐ Ready-to-run Python code
-
โจ Preview matching elements
-
๐ฅ Download JSON results
+
๐ Instant JSON extraction
+
๐ฏ Visual element selection
+
๐ Export Python code
+
โจ Live preview
+
๐ฅ Download results
+
๐ Export to markdown
@@ -268,11 +269,11 @@
-
-
+
+
-
๐ Click2Crawl
- Select multiple elements to extract clean markdown
+
๐ Markdown Extraction
+ Convert webpage content to clean markdown "as you see"
#!/usr/bin/env python3"""
๐ NO LLM NEEDED! Direct extraction with CSS selectors
-Generated by Crawl4AI Chrome Extension
+Generated by Crawl4AI Chrome Extension - Click2Crawl
"""import asyncio
@@ -353,7 +354,7 @@ Generated by Crawl4AI Chrome Extension
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
-# The EXACT schema from your visual clicks - no guessing!
+# The EXACT schema from Click2Crawl - no guessing!
EXTRACTION_SCHEMA = {
"name": "Product Catalog",
"baseSelector": "div.product-card", # The container you selected
@@ -515,7 +516,7 @@ asyncio.run(automate_shopping())
-
+
@@ -692,20 +693,20 @@ Today, finding a 24-hour restaurant in Manhattan requires genuine effort. The pa
Direct
-
Get CrawlResult Without Code
+
Direct Data Download
-
Skip the code generation entirely! Get extracted data directly in the extension as a CrawlResult object, ready to download as JSON.
+
Skip the code generation entirely! Download extracted data directly from Click2Crawl as JSON or CSV files.
- ๐ One-click extraction โข No Python needed โข Export to JSON/CSV
+ ๐ One-click download โข No Python needed โข Multiple export formats
AI
-
Smart Schema Suggestions
+
Smart Field Detection
-
AI-powered field detection that automatically suggests the most likely data fields on any page, making schema building even faster.
+
AI-powered field detection for Click2Crawl that automatically suggests the most likely data fields on any page.