Files
crawl4ai/docs/md_v2/apps/crawl4ai-assistant/content/schemaBuilder_v1.js
UncleCode 0ac12da9f3 feat: Major Chrome Extension overhaul with Click2Crawl, instant Schema extraction, and modular architecture
 New Features:
- Click2Crawl: Visual element selection with markdown conversion
  - Ctrl/Cmd+Click to select multiple elements
  - Visual text mode for WYSIWYG extraction
  - Real-time markdown preview with syntax highlighting
  - Export to .md file or clipboard

- Schema Builder Enhancement: Instant data extraction without LLMs
  - Test schemas directly in browser
  - See JSON results immediately
  - Export data or Python code
  - Cloud deployment ready (coming soon)

- Modular Architecture:
  - Separated into schemaBuilder.js, scriptBuilder.js, click2CrawlBuilder.js
  - Added contentAnalyzer.js and markdownConverter.js modules
  - Shared utilities and CSS reset system
  - Integrated marked.js for markdown rendering

🎨 UI/UX Improvements:
- Added edgy cloud announcement banner with seamless shimmer animation
- Direct, technical copy: "You don't need Puppeteer. You need Crawl4AI Cloud."
- Enhanced feature cards with emojis
- Fixed CSS conflicts with targeted reset approach
- Improved badge hover effects (red on hover)
- Added wrap toggle for code preview

📚 Documentation Updates:
- Split extraction diagrams into LLM and no-LLM versions
- Updated llms-full.txt with latest content
- Added versioned LLM context (v0.1.1)

🔧 Technical Enhancements:
- Refactored 3464 lines of monolithic content.js into modules
- Added proper event handling and cleanup
- Improved z-index management
- Better scroll position tracking for badges
- Enhanced error handling throughout

This release transforms the Chrome Extension from a simple tool into a powerful
visual data extraction suite, making web scraping accessible to everyone.
2025-06-09 23:18:27 +08:00

608 lines
19 KiB
JavaScript

// SchemaBuilder class for Crawl4AI Chrome Extension
class SchemaBuilder {
constructor() {
this.mode = null;
this.container = null;
this.fields = [];
this.overlay = null;
this.toolbar = null;
this.highlightBox = null;
this.selectedElements = new Set();
this.isPaused = false;
this.codeModal = null;
this.handleMouseMove = this.handleMouseMove.bind(this);
this.handleClick = this.handleClick.bind(this);
this.handleKeyPress = this.handleKeyPress.bind(this);
}
start() {
this.mode = 'container';
this.createOverlay();
this.createToolbar();
this.attachEventListeners();
this.updateToolbar();
}
stop() {
this.detachEventListeners();
this.overlay?.remove();
this.toolbar?.remove();
this.highlightBox?.remove();
this.removeAllHighlights();
this.mode = null;
this.container = null;
this.fields = [];
this.selectedElements.clear();
}
createOverlay() {
// Create highlight box
this.highlightBox = document.createElement('div');
this.highlightBox.className = 'c4ai-highlight-box';
document.body.appendChild(this.highlightBox);
}
createToolbar() {
this.toolbar = document.createElement('div');
this.toolbar.className = 'c4ai-toolbar';
this.toolbar.innerHTML = `
<div class="c4ai-toolbar-titlebar">
<div class="c4ai-titlebar-dots">
<button class="c4ai-dot c4ai-dot-close" id="c4ai-close"></button>
<button class="c4ai-dot c4ai-dot-minimize"></button>
<button class="c4ai-dot c4ai-dot-maximize"></button>
</div>
<img src="${chrome.runtime.getURL('icons/icon-16.png')}" class="c4ai-titlebar-icon" alt="Crawl4AI">
<div class="c4ai-titlebar-title">Crawl4AI Schema Builder</div>
</div>
<div class="c4ai-toolbar-content">
<div class="c4ai-toolbar-status">
<div class="c4ai-status-item">
<span class="c4ai-status-label">Mode:</span>
<span class="c4ai-status-value" id="c4ai-mode">Select Container</span>
</div>
<div class="c4ai-status-item">
<span class="c4ai-status-label">Container:</span>
<span class="c4ai-status-value" id="c4ai-container">Not selected</span>
</div>
</div>
<div class="c4ai-fields-list" id="c4ai-fields-list" style="display: none;">
<div class="c4ai-fields-header">Selected Fields:</div>
<ul class="c4ai-fields-items" id="c4ai-fields-items"></ul>
</div>
<div class="c4ai-toolbar-hint" id="c4ai-hint">
Click on a container element (e.g., product card, article, etc.)
</div>
<div class="c4ai-toolbar-actions">
<button id="c4ai-pause" class="c4ai-action-btn c4ai-pause-btn">
<span class="c4ai-pause-icon">⏸</span> Pause
</button>
<button id="c4ai-generate" class="c4ai-action-btn c4ai-generate-btn">
<span class="c4ai-generate-icon">⚡</span> Generate Code
</button>
</div>
</div>
`;
document.body.appendChild(this.toolbar);
// Add event listeners for toolbar buttons
document.getElementById('c4ai-pause').addEventListener('click', () => this.togglePause());
document.getElementById('c4ai-generate').addEventListener('click', () => this.stopAndGenerate());
document.getElementById('c4ai-close').addEventListener('click', () => this.stop());
// Make toolbar draggable
window.C4AI_Utils.makeDraggable(this.toolbar);
}
attachEventListeners() {
document.addEventListener('mousemove', this.handleMouseMove, true);
document.addEventListener('click', this.handleClick, true);
document.addEventListener('keydown', this.handleKeyPress, true);
}
detachEventListeners() {
document.removeEventListener('mousemove', this.handleMouseMove, true);
document.removeEventListener('click', this.handleClick, true);
document.removeEventListener('keydown', this.handleKeyPress, true);
}
handleMouseMove(e) {
if (this.isPaused) return;
const element = document.elementFromPoint(e.clientX, e.clientY);
if (element && !this.isOurElement(element)) {
this.highlightElement(element);
}
}
handleClick(e) {
if (this.isPaused) return;
const element = e.target;
if (this.isOurElement(element)) {
return;
}
e.preventDefault();
e.stopPropagation();
if (this.mode === 'container') {
this.selectContainer(element);
} else if (this.mode === 'field') {
this.selectField(element);
}
}
handleKeyPress(e) {
if (e.key === 'Escape') {
this.stop();
}
}
isOurElement(element) {
return window.C4AI_Utils.isOurElement(element);
}
togglePause() {
this.isPaused = !this.isPaused;
const pauseBtn = document.getElementById('c4ai-pause');
if (this.isPaused) {
pauseBtn.innerHTML = '<span class="c4ai-play-icon">▶</span> Resume';
pauseBtn.classList.add('c4ai-paused');
this.highlightBox.style.display = 'none';
} else {
pauseBtn.innerHTML = '<span class="c4ai-pause-icon">⏸</span> Pause';
pauseBtn.classList.remove('c4ai-paused');
}
}
stopAndGenerate() {
if (!this.container || this.fields.length === 0) {
alert('Please select a container and at least one field before generating code.');
return;
}
const code = this.generateCode();
this.showCodeModal(code);
}
highlightElement(element) {
const rect = element.getBoundingClientRect();
this.highlightBox.style.cssText = `
left: ${rect.left + window.scrollX}px;
top: ${rect.top + window.scrollY}px;
width: ${rect.width}px;
height: ${rect.height}px;
display: block;
`;
if (this.mode === 'container') {
this.highlightBox.className = 'c4ai-highlight-box c4ai-container-mode';
} else {
this.highlightBox.className = 'c4ai-highlight-box c4ai-field-mode';
}
}
selectContainer(element) {
// Remove previous container highlight
if (this.container) {
this.container.element.classList.remove('c4ai-selected-container');
}
this.container = {
element: element,
html: element.outerHTML,
selector: this.generateSelector(element),
tagName: element.tagName.toLowerCase()
};
element.classList.add('c4ai-selected-container');
this.mode = 'field';
this.updateToolbar();
this.updateStats();
}
selectField(element) {
// Don't select the container itself
if (element === this.container.element) {
return;
}
// Check if already selected - if so, deselect it
if (this.selectedElements.has(element)) {
this.deselectField(element);
return;
}
// Must be inside the container
if (!this.container.element.contains(element)) {
return;
}
this.showFieldDialog(element);
}
deselectField(element) {
// Remove from fields array
this.fields = this.fields.filter(f => f.element !== element);
// Remove from selected elements set
this.selectedElements.delete(element);
// Remove visual selection
element.classList.remove('c4ai-selected-field');
// Update UI
this.updateToolbar();
this.updateStats();
}
showFieldDialog(element) {
const dialog = document.createElement('div');
dialog.className = 'c4ai-field-dialog';
const rect = element.getBoundingClientRect();
dialog.style.cssText = `
left: ${rect.left + window.scrollX}px;
top: ${rect.bottom + window.scrollY + 10}px;
`;
dialog.innerHTML = `
<div class="c4ai-field-dialog-content">
<h4>Name this field:</h4>
<input type="text" id="c4ai-field-name" placeholder="e.g., title, price, description" autofocus>
<div class="c4ai-field-preview">
<strong>Content:</strong> ${element.textContent.trim().substring(0, 50)}...
</div>
<div class="c4ai-field-actions">
<button id="c4ai-field-save">Save</button>
<button id="c4ai-field-cancel">Cancel</button>
</div>
</div>
`;
document.body.appendChild(dialog);
const input = dialog.querySelector('#c4ai-field-name');
const saveBtn = dialog.querySelector('#c4ai-field-save');
const cancelBtn = dialog.querySelector('#c4ai-field-cancel');
const save = () => {
const fieldName = input.value.trim();
if (fieldName) {
this.fields.push({
name: fieldName,
value: element.textContent.trim(),
element: element,
selector: this.generateSelector(element, this.container.element)
});
element.classList.add('c4ai-selected-field');
this.selectedElements.add(element);
this.updateToolbar();
this.updateStats();
}
dialog.remove();
};
const cancel = () => {
dialog.remove();
};
saveBtn.addEventListener('click', save);
cancelBtn.addEventListener('click', cancel);
input.addEventListener('keypress', (e) => {
if (e.key === 'Enter') save();
if (e.key === 'Escape') cancel();
});
input.focus();
}
generateSelector(element, context = document) {
// Try to generate a robust selector
if (element.id) {
return `#${CSS.escape(element.id)}`;
}
// Check for data attributes (most stable)
const dataAttrs = ['data-testid', 'data-id', 'data-test', 'data-cy'];
for (const attr of dataAttrs) {
const value = element.getAttribute(attr);
if (value) {
return `[${attr}="${value}"]`;
}
}
// Check for aria-label
if (element.getAttribute('aria-label')) {
return `[aria-label="${element.getAttribute('aria-label')}"]`;
}
// Try semantic HTML elements with text
const tagName = element.tagName.toLowerCase();
if (['button', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(tagName)) {
const text = element.textContent.trim();
if (text && text.length < 50) {
// Use tag name with partial text match
return `${tagName}`;
}
}
// Check for simple, non-utility classes
const classes = Array.from(element.classList)
.filter(c => !c.startsWith('c4ai-')) // Exclude our classes
.filter(c => !c.includes('[') && !c.includes('(') && !c.includes(':')) // Exclude utility classes
.filter(c => c.length < 30); // Exclude very long classes
if (classes.length > 0 && classes.length <= 3) {
const selector = classes.map(c => `.${CSS.escape(c)}`).join('');
try {
if (context.querySelectorAll(selector).length === 1) {
return selector;
}
} catch (e) {
// Invalid selector, continue
}
}
// Use nth-child with simple parent tag
const parent = element.parentElement;
if (parent && parent !== context) {
const siblings = Array.from(parent.children);
const index = siblings.indexOf(element) + 1;
// Just use parent tag name to avoid recursion
const parentTag = parent.tagName.toLowerCase();
return `${parentTag} > ${tagName}:nth-child(${index})`;
}
// Final fallback
return tagName;
}
updateToolbar() {
document.getElementById('c4ai-mode').textContent =
this.mode === 'container' ? 'Select Container' : 'Select Fields';
document.getElementById('c4ai-container').textContent =
this.container ? `${this.container.tagName}` : 'Not selected';
// Update fields list
const fieldsList = document.getElementById('c4ai-fields-list');
const fieldsItems = document.getElementById('c4ai-fields-items');
if (this.fields.length > 0) {
fieldsList.style.display = 'block';
fieldsItems.innerHTML = this.fields.map(field => `
<li class="c4ai-field-item">
<span class="c4ai-field-name">${field.name}</span>
<span class="c4ai-field-value">${field.value.substring(0, 30)}${field.value.length > 30 ? '...' : ''}</span>
</li>
`).join('');
} else {
fieldsList.style.display = 'none';
}
const hint = document.getElementById('c4ai-hint');
if (this.mode === 'container') {
hint.textContent = 'Click on a container element (e.g., product card, article, etc.)';
} else if (this.fields.length === 0) {
hint.textContent = 'Click on fields inside the container to extract (title, price, etc.)';
} else {
hint.innerHTML = `Continue selecting fields or click <strong>Stop & Generate</strong> to finish.`;
}
}
updateStats() {
chrome.runtime.sendMessage({
action: 'updateStats',
stats: {
container: !!this.container,
fields: this.fields.length
}
});
}
removeAllHighlights() {
document.querySelectorAll('.c4ai-selected-container').forEach(el => {
el.classList.remove('c4ai-selected-container');
});
document.querySelectorAll('.c4ai-selected-field').forEach(el => {
el.classList.remove('c4ai-selected-field');
});
}
generateCode() {
const fieldDescriptions = this.fields.map(f =>
`- ${f.name} (example: "${f.value.substring(0, 50)}...")`
).join('\n');
return `#!/usr/bin/env python3
"""
Generated by Crawl4AI Chrome Extension
URL: ${window.location.href}
Generated: ${new Date().toISOString()}
"""
import asyncio
import json
from pathlib import Path
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
# HTML snippet of the selected container element
HTML_SNIPPET = """
${this.container.html}
"""
# Extraction query based on your field selections
EXTRACTION_QUERY = """
Create a JSON CSS extraction schema to extract the following fields:
${fieldDescriptions}
The schema should handle multiple ${this.container.tagName} elements on the page.
Each item should be extracted as a separate object in the results array.
"""
async def generate_schema():
"""Generate extraction schema using LLM"""
print("🔧 Generating extraction schema...")
try:
# Generate the schema using Crawl4AI's built-in LLM integration
schema = JsonCssExtractionStrategy.generate_schema(
html=HTML_SNIPPET,
query=EXTRACTION_QUERY,
)
# Save the schema for reuse
schema_path = Path('generated_schema.json')
with open(schema_path, 'w') as f:
json.dump(schema, f, indent=2)
print("✅ Schema generated successfully!")
print(f"📄 Schema saved to: {schema_path}")
print("\\nGenerated schema:")
print(json.dumps(schema, indent=2))
return schema
except Exception as e:
print(f"❌ Error generating schema: {e}")
return None
async def test_extraction(url: str = "${window.location.href}"):
"""Test the generated schema on the actual webpage"""
print("\\n🧪 Testing extraction on live webpage...")
# Load the generated schema
try:
with open('generated_schema.json', 'r') as f:
schema = json.load(f)
except FileNotFoundError:
print("❌ Schema file not found. Run generate_schema() first.")
return
# Configure browser
browser_config = BrowserConfig(
headless=True,
verbose=False
)
# Configure extraction
crawler_config = CrawlerRunConfig(
extraction_strategy=JsonCssExtractionStrategy(schema=schema)
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url=url,
config=crawler_config
)
if result.success and result.extracted_content:
data = json.loads(result.extracted_content)
print(f"\\n✅ Successfully extracted {len(data)} items!")
# Save results
with open('extracted_data.json', 'w') as f:
json.dump(data, f, indent=2)
# Show sample results
print("\\n📊 Sample results (first 2 items):")
for i, item in enumerate(data[:2], 1):
print(f"\\nItem {i}:")
for key, value in item.items():
print(f" {key}: {value}")
else:
print("❌ Extraction failed:", result.error_message)
if __name__ == "__main__":
# Step 1: Generate the schema from HTML snippet
asyncio.run(generate_schema())
# Step 2: Test extraction on the live webpage
# Uncomment the line below to test extraction:
# asyncio.run(test_extraction())
print("\\n🎯 Next steps:")
print("1. Review the generated schema in 'generated_schema.json'")
print("2. Uncomment the test_extraction() line to test on the live site")
print("3. Use the schema in your Crawl4AI projects!")
`;
return code;
}
showCodeModal(code) {
// Create modal
this.codeModal = document.createElement('div');
this.codeModal.className = 'c4ai-code-modal';
this.codeModal.innerHTML = `
<div class="c4ai-code-modal-content">
<div class="c4ai-code-modal-header">
<h2>Generated Python Code</h2>
<button class="c4ai-close-modal" id="c4ai-close-modal">✕</button>
</div>
<div class="c4ai-code-modal-body">
<pre class="c4ai-code-block"><code class="language-python">${window.C4AI_Utils.escapeHtml(code)}</code></pre>
</div>
<div class="c4ai-code-modal-footer">
<button class="c4ai-action-btn c4ai-cloud-btn" id="c4ai-run-cloud" disabled>
<span>☁️</span> Run on C4AI Cloud (Coming Soon)
</button>
<button class="c4ai-action-btn c4ai-download-btn" id="c4ai-download-code">
<span>⬇</span> Download Code
</button>
<button class="c4ai-action-btn c4ai-copy-btn" id="c4ai-copy-code">
<span>📋</span> Copy to Clipboard
</button>
</div>
</div>
`;
document.body.appendChild(this.codeModal);
// Add event listeners
document.getElementById('c4ai-close-modal').addEventListener('click', () => {
this.codeModal.remove();
this.codeModal = null;
// Don't stop the capture session
});
document.getElementById('c4ai-download-code').addEventListener('click', () => {
chrome.runtime.sendMessage({
action: 'downloadCode',
code: code,
filename: `crawl4ai_schema_${Date.now()}.py`
}, (response) => {
if (response && response.success) {
const btn = document.getElementById('c4ai-download-code');
const originalHTML = btn.innerHTML;
btn.innerHTML = '<span>✓</span> Downloaded!';
setTimeout(() => {
btn.innerHTML = originalHTML;
}, 2000);
} else {
console.error('Download failed:', response?.error);
alert('Download failed. Please check your browser settings.');
}
});
});
document.getElementById('c4ai-copy-code').addEventListener('click', () => {
navigator.clipboard.writeText(code).then(() => {
const btn = document.getElementById('c4ai-copy-code');
btn.innerHTML = '<span>✓</span> Copied!';
setTimeout(() => {
btn.innerHTML = '<span>📋</span> Copy to Clipboard';
}, 2000);
});
});
// Apply syntax highlighting
window.C4AI_Utils.applySyntaxHighlighting(this.codeModal.querySelector('.language-python'));
}
}