Files
crawl4ai/docs/md_v2/apps/crawl4ai-assistant/content/schemaBuilder.js
UncleCode 0ac12da9f3 feat: Major Chrome Extension overhaul with Click2Crawl, instant Schema extraction, and modular architecture
 New Features:
- Click2Crawl: Visual element selection with markdown conversion
  - Ctrl/Cmd+Click to select multiple elements
  - Visual text mode for WYSIWYG extraction
  - Real-time markdown preview with syntax highlighting
  - Export to .md file or clipboard

- Schema Builder Enhancement: Instant data extraction without LLMs
  - Test schemas directly in browser
  - See JSON results immediately
  - Export data or Python code
  - Cloud deployment ready (coming soon)

- Modular Architecture:
  - Separated into schemaBuilder.js, scriptBuilder.js, click2CrawlBuilder.js
  - Added contentAnalyzer.js and markdownConverter.js modules
  - Shared utilities and CSS reset system
  - Integrated marked.js for markdown rendering

🎨 UI/UX Improvements:
- Added edgy cloud announcement banner with seamless shimmer animation
- Direct, technical copy: "You don't need Puppeteer. You need Crawl4AI Cloud."
- Enhanced feature cards with emojis
- Fixed CSS conflicts with targeted reset approach
- Improved badge hover effects (red on hover)
- Added wrap toggle for code preview

📚 Documentation Updates:
- Split extraction diagrams into LLM and no-LLM versions
- Updated llms-full.txt with latest content
- Added versioned LLM context (v0.1.1)

🔧 Technical Enhancements:
- Refactored 3464 lines of monolithic content.js into modules
- Added proper event handling and cleanup
- Improved z-index management
- Better scroll position tracking for badges
- Enhanced error handling throughout

This release transforms the Chrome Extension from a simple tool into a powerful
visual data extraction suite, making web scraping accessible to everyone.
2025-06-09 23:18:27 +08:00

1812 lines
61 KiB
JavaScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
// Enhanced SchemaBuilder class for Crawl4AI Chrome Extension
// Module-level reference to the currently active SchemaBuilder.
// The constructor stops any previous instance before storing itself here, and
// stop() resets the reference to null, so at most one toolbar exists at a time.
let schemaBuilderInstance = null;
class SchemaBuilder {
constructor() {
// Prevent multiple instances
if (schemaBuilderInstance) {
schemaBuilderInstance.stop();
}
schemaBuilderInstance = this;
this.container = null;
this.fields = [];
this.overlay = null;
this.toolbar = null;
this.highlightBox = null; // For hover preview
this.selectedBox = null; // For selected element
this.currentElement = null; // Currently hovered element
this.selectedElement = null; // Currently selected element (container)
this.selectedElements = new Set();
this.inspectingFields = false; // Field inspection mode
this.codeModal = null;
this.previewMode = false;
this.previewElements = [];
this.schema = null;
this.parentLevels = 1; // Default parent levels for base container
this.handleMouseMove = this.handleMouseMove.bind(this);
this.handleClick = this.handleClick.bind(this);
this.handleKeyPress = this.handleKeyPress.bind(this);
this.handleMouseLeave = this.handleMouseLeave.bind(this);
}
start() {
this.createOverlay();
this.createToolbar();
this.attachEventListeners();
this.updateToolbar();
}
stop() {
this.detachEventListeners();
this.overlay?.remove();
this.toolbar?.remove();
this.highlightBox?.remove();
this.selectedBox?.remove();
this.removeAllHighlights();
this.clearPreview();
this.container = null;
this.fields = [];
this.selectedElements.clear();
this.schema = null;
this.currentElement = null;
this.selectedElement = null;
this.inspectingFields = false;
this.parentLevels = 1;
// Clear singleton reference
if (schemaBuilderInstance === this) {
schemaBuilderInstance = null;
}
}
// Alias for content script compatibility.
// Delegates straight to stop(); it performs no work of its own.
deactivate() {
this.stop();
}
createOverlay() {
// Create highlight box for hover preview
this.highlightBox = document.createElement('div');
this.highlightBox.className = 'c4ai-highlight-box';
document.body.appendChild(this.highlightBox);
// Create selected box for permanent selection
this.selectedBox = document.createElement('div');
this.selectedBox.className = 'c4ai-selected-box';
this.selectedBox.style.display = 'none';
document.body.appendChild(this.selectedBox);
}
createToolbar() {
// Remove any existing toolbar first
const existingToolbar = document.querySelector('.c4ai-toolbar');
if (existingToolbar) {
existingToolbar.remove();
}
this.toolbar = document.createElement('div');
this.toolbar.className = 'c4ai-toolbar';
this.toolbar.innerHTML = `
<div class="c4ai-toolbar-titlebar">
<div class="c4ai-titlebar-dots">
<button class="c4ai-dot c4ai-dot-close" id="c4ai-close"></button>
<button class="c4ai-dot c4ai-dot-minimize"></button>
<button class="c4ai-dot c4ai-dot-maximize"></button>
</div>
<img src="${chrome.runtime.getURL('icons/icon-16.png')}" class="c4ai-titlebar-icon" alt="Crawl4AI">
<div class="c4ai-titlebar-title">🔧 Schema Builder</div>
</div>
<div class="c4ai-toolbar-content">
<div class="c4ai-toolbar-status">
<div class="c4ai-status-item">
<span class="c4ai-status-label">Mode:</span>
<span class="c4ai-status-value" id="c4ai-mode">Select Container</span>
</div>
<div class="c4ai-status-item" id="c4ai-container-item" style="display: none;">
<span class="c4ai-status-label">Container:</span>
<span class="c4ai-status-value" id="c4ai-container">Not selected</span>
<button class="c4ai-nav-btn c4ai-nav-btn-small" id="c4ai-nav-up" title="Select parent">↑</button>
<button class="c4ai-nav-btn c4ai-nav-btn-small" id="c4ai-nav-down" title="Select child">↓</button>
<button class="c4ai-nav-btn c4ai-nav-btn-small c4ai-nav-deselect" id="c4ai-nav-close" title="Deselect">×</button>
</div>
<div class="c4ai-status-item" id="c4ai-selector-display" style="display: none;">
<div class="c4ai-container-selector" id="c4ai-container-selector"></div>
</div>
<div class="c4ai-status-item" id="c4ai-parent-levels" style="display: none;">
<span class="c4ai-status-label">Parent Levels:</span>
<div class="c4ai-parent-controls">
<button class="c4ai-parent-btn" id="c4ai-parent-minus">-</button>
<span class="c4ai-parent-value" id="c4ai-parent-value">1</span>
<button class="c4ai-parent-btn" id="c4ai-parent-plus">+</button>
</div>
</div>
</div>
<div class="c4ai-schema-section" id="c4ai-schema-section" style="display: none;">
<div class="c4ai-section-header">
<span>SCHEMA FIELDS (<span id="c4ai-field-count">0</span>)</span>
</div>
<div class="c4ai-fields-list" id="c4ai-fields-list"></div>
</div>
<div class="c4ai-actions-section" id="c4ai-actions-section" style="display: none;">
<div class="c4ai-section-header">ACTIONS</div>
<div class="c4ai-toolbar-actions">
<button id="c4ai-preview" class="c4ai-action-btn c4ai-preview-btn" disabled>
<span>👁️</span> Preview Matches
</button>
<button id="c4ai-test" class="c4ai-action-btn c4ai-test-btn" disabled>
<span>🧪</span> Test Schema
</button>
<button id="c4ai-deploy-cloud" class="c4ai-action-btn c4ai-export-btn c4ai-cloud-btn" disabled>
<span>☁️</span> Deploy
</button>
<button id="c4ai-export-schema" class="c4ai-action-btn c4ai-export-btn" disabled>
<span>📄</span> Schema
</button>
<button id="c4ai-export-data" class="c4ai-action-btn c4ai-export-btn" disabled>
<span>📊</span> Data
</button>
</div>
</div>
<div class="c4ai-stats-section" id="c4ai-stats-section" style="display: none;">
<div class="c4ai-section-header">STATS</div>
<div class="c4ai-stats">
<div class="c4ai-stat-item">
<span>Matches Found:</span>
<span id="c4ai-matches-count">0 items</span>
</div>
<div class="c4ai-stat-item">
<span>Schema Valid:</span>
<span id="c4ai-schema-valid">Not tested</span>
</div>
</div>
</div>
<div class="c4ai-toolbar-hint" id="c4ai-hint">
Click on a container element (e.g., product card, article, etc.)
</div>
<div class="c4ai-toolbar-footer" id="c4ai-footer-section" style="display: none;">
<button id="c4ai-inspect-fields" class="c4ai-action-btn c4ai-primary-btn">
<span>🏷️</span> Fields
</button>
</div>
</div>
`;
document.body.appendChild(this.toolbar);
// Force toolbar to top of z-index stack
this.toolbar.style.zIndex = '2147483647'; // Maximum z-index
// Add event listeners for toolbar buttons with error handling
const addClickHandler = (id, handler) => {
const element = document.getElementById(id);
if (element) {
element.addEventListener('click', (e) => {
e.preventDefault();
e.stopPropagation();
handler();
});
}
};
// Add all event listeners
addClickHandler('c4ai-inspect-fields', () => this.toggleFieldInspection());
addClickHandler('c4ai-preview', () => this.togglePreview());
addClickHandler('c4ai-test', () => this.testSchema());
addClickHandler('c4ai-export-schema', () => this.exportSchema());
addClickHandler('c4ai-export-data', () => this.exportData());
addClickHandler('c4ai-deploy-cloud', () => this.deployToCloud());
addClickHandler('c4ai-close', () => this.stop());
// Navigation controls
addClickHandler('c4ai-nav-up', () => this.navigateUp());
addClickHandler('c4ai-nav-down', () => this.navigateDown());
addClickHandler('c4ai-nav-close', () => this.deselectContainer());
// Parent level controls
addClickHandler('c4ai-parent-minus', () => this.adjustParentLevels(-1));
addClickHandler('c4ai-parent-plus', () => this.adjustParentLevels(1));
// Make toolbar draggable
if (window.C4AI_Utils && window.C4AI_Utils.makeDraggable) {
window.C4AI_Utils.makeDraggable(this.toolbar);
}
}
attachEventListeners() {
document.addEventListener('mousemove', this.handleMouseMove, true);
document.addEventListener('click', this.handleClick, true);
document.addEventListener('keydown', this.handleKeyPress, true);
document.addEventListener('mouseleave', this.handleMouseLeave, true);
}
detachEventListeners() {
document.removeEventListener('mousemove', this.handleMouseMove, true);
document.removeEventListener('click', this.handleClick, true);
document.removeEventListener('keydown', this.handleKeyPress, true);
document.removeEventListener('mouseleave', this.handleMouseLeave, true);
}
handleMouseMove(e) {
const element = document.elementFromPoint(e.clientX, e.clientY);
// Don't highlight if hovering over our UI elements
if (this.isOurElement(element)) {
this.highlightBox.style.display = 'none';
return;
}
// Only show highlight if:
// 1. No container selected (selection mode)
// 2. Or inspecting fields inside container
if (!this.container || (this.inspectingFields && this.container)) {
if (element) {
// If inspecting fields, only highlight elements inside container
if (this.inspectingFields && !this.container.element.contains(element)) {
this.highlightBox.style.display = 'none';
return;
}
this.currentElement = element;
this.highlightElement(element);
}
} else {
// Container selected but not inspecting fields - no highlight
this.highlightBox.style.display = 'none';
}
}
handleMouseLeave(e) {
// Hide highlight when mouse leaves
if (e.target === document) {
this.highlightBox.style.display = 'none';
}
}
handleClick(e) {
const element = e.target;
// Check if clicking on our UI elements
if (this.isOurElement(element)) {
return; // Let toolbar clicks work normally
}
// Use current element
const targetElement = this.currentElement || element;
if (!this.container) {
// Container selection mode - prevent default
e.preventDefault();
e.stopPropagation();
this.selectContainer(targetElement);
} else if (this.inspectingFields && this.container.element.contains(targetElement)) {
// Field selection mode AND clicking inside container - prevent default
e.preventDefault();
e.stopPropagation();
this.selectField(targetElement);
}
// Otherwise, let the click work normally
}
handleKeyPress(e) {
if (e.key === 'Escape') {
this.stop();
}
}
isOurElement(element) {
return window.C4AI_Utils.isOurElement(element) ||
(this.selectedBox && element === this.selectedBox);
}
showSelectedBox(element) {
if (!element) return;
const rect = element.getBoundingClientRect();
this.selectedBox.style.cssText = `
position: absolute;
left: ${rect.left + window.scrollX}px;
top: ${rect.top + window.scrollY}px;
width: ${rect.width}px;
height: ${rect.height}px;
display: block;
`;
this.selectedBox.className = 'c4ai-selected-box c4ai-selected-container';
}
updateNavButtonStates() {
const upBtn = document.getElementById('c4ai-nav-up');
const downBtn = document.getElementById('c4ai-nav-down');
if (this.selectedElement) {
// Disable up button if no parent or parent is body
upBtn.disabled = !this.selectedElement.parentElement || this.selectedElement.parentElement === document.body;
// Disable down button if no children
downBtn.disabled = this.selectedElement.children.length === 0;
}
}
navigateUp() {
if (!this.selectedElement || !this.selectedElement.parentElement) return;
const parent = this.selectedElement.parentElement;
if (parent === document.body) return;
// Update selected element and container
this.selectedElement = parent;
this.container.element = parent;
this.container.tagName = parent.tagName.toLowerCase();
this.container.selector = this.generateContainerSelector(parent);
// Update visual selection
this.showSelectedBox(parent);
this.updateNavButtonStates();
this.updateToolbar();
this.updateStats();
}
navigateDown() {
if (!this.selectedElement || this.selectedElement.children.length === 0) return;
const firstChild = this.selectedElement.children[0];
// Update selected element and container
this.selectedElement = firstChild;
this.container.element = firstChild;
this.container.tagName = firstChild.tagName.toLowerCase();
this.container.selector = this.generateContainerSelector(firstChild);
// Update visual selection
this.showSelectedBox(firstChild);
this.updateNavButtonStates();
this.updateToolbar();
this.updateStats();
}
deselectContainer() {
if (this.container) {
// Remove visual selection
this.container.element.classList.remove('c4ai-selected-container');
this.selectedBox.style.display = 'none';
// Clear container and related state
this.container = null;
this.selectedElement = null;
this.inspectingFields = false;
// Clear all fields
this.fields.forEach(field => {
field.element.classList.remove('c4ai-selected-field');
field.element.removeAttribute('data-c4ai-field');
});
this.fields = [];
this.selectedElements.clear();
this.updateToolbar();
this.updateStats();
}
}
toggleFieldInspection() {
this.inspectingFields = !this.inspectingFields;
const fieldsBtn = document.getElementById('c4ai-inspect-fields');
if (this.inspectingFields) {
fieldsBtn.classList.add('c4ai-active');
fieldsBtn.innerHTML = '<span>✓</span> Fields';
} else {
fieldsBtn.classList.remove('c4ai-active');
fieldsBtn.innerHTML = '<span>🏷️</span> Fields';
this.highlightBox.style.display = 'none';
}
this.updateToolbar();
}
// Legacy method - kept for compatibility but now redirects to test schema.
// It only calls testSchema(); it does not stop the builder, and the result of
// testSchema() is not returned.
stopAndGenerate() {
this.testSchema();
}
highlightElement(element) {
const rect = element.getBoundingClientRect();
this.highlightBox.style.cssText = `
left: ${rect.left + window.scrollX}px;
top: ${rect.top + window.scrollY}px;
width: ${rect.width}px;
height: ${rect.height}px;
display: block;
`;
if (!this.container) {
// Container selection mode
this.highlightBox.className = 'c4ai-highlight-box c4ai-container-mode';
} else {
// Field selection mode
this.highlightBox.className = 'c4ai-highlight-box c4ai-field-mode';
}
}
selectContainer(element) {
// Remove previous container highlight
if (this.container) {
this.container.element.classList.remove('c4ai-selected-container');
}
this.container = {
element: element,
html: element.outerHTML,
selector: this.generateContainerSelector(element),
tagName: element.tagName.toLowerCase()
};
element.classList.add('c4ai-selected-container');
this.selectedElement = element;
this.showSelectedBox(element);
// Hide hover highlight after selection
this.highlightBox.style.display = 'none';
// Update navigation button states
this.updateNavButtonStates();
this.updateToolbar();
this.updateStats();
}
selectField(element) {
// Don't select the container itself
if (element === this.container.element) {
return;
}
// Check if already selected - if so, deselect it
if (this.selectedElements.has(element)) {
this.deselectField(element);
return;
}
// Must be inside the container
if (!this.container.element.contains(element)) {
return;
}
this.showFieldDialog(element);
}
deselectField(element) {
// Remove from fields array
this.fields = this.fields.filter(f => f.element !== element);
// Remove from selected elements set
this.selectedElements.delete(element);
// Remove visual selection
element.classList.remove('c4ai-selected-field');
// Update UI
this.updateToolbar();
this.updateStats();
}
showFieldDialog(element) {
const dialog = document.createElement('div');
dialog.className = 'c4ai-field-dialog';
const rect = element.getBoundingClientRect();
dialog.style.cssText = `
left: ${rect.left + window.scrollX}px;
top: ${rect.bottom + window.scrollY + 10}px;
`;
// Get available attributes
const attributes = this.getElementAttributes(element);
const attributeOptions = attributes.map(attr =>
`<option value="${attr.name}">${attr.name}: "${attr.value.substring(0, 30)}${attr.value.length > 30 ? '...' : ''}"</option>`
).join('');
dialog.innerHTML = `
<div class="c4ai-field-dialog-content">
<h4>Configure Field</h4>
<div class="c4ai-field-input">
<label>Field Name:</label>
<input type="text" id="c4ai-field-name" placeholder="e.g., title, price, description" autofocus>
</div>
<div class="c4ai-field-input">
<label>Field Type:</label>
<select id="c4ai-field-type">
<option value="text">Text Content</option>
<option value="attribute">Attribute</option>
<option value="link">Link (href)</option>
<option value="image">Image (src)</option>
<option value="list">List</option>
<option value="nested">Nested Object</option>
</select>
</div>
<div class="c4ai-field-input" id="c4ai-attribute-select" style="display: none;">
<label>Select Attribute:</label>
<select id="c4ai-field-attribute">
${attributeOptions}
</select>
</div>
<div class="c4ai-field-preview">
<strong>Preview Value:</strong>
<div id="c4ai-preview-value">${element.textContent.trim().substring(0, 100)}</div>
</div>
<div class="c4ai-field-selector">
<strong>Selector (auto-generated):</strong>
<div id="c4ai-selector-preview">${this.generateSmartSelector(element, this.container.element)}</div>
</div>
<div class="c4ai-field-actions">
<button id="c4ai-field-save" class="c4ai-primary">✓ Save</button>
<button id="c4ai-field-cancel">✗ Cancel</button>
</div>
</div>
`;
document.body.appendChild(dialog);
const nameInput = dialog.querySelector('#c4ai-field-name');
const typeSelect = dialog.querySelector('#c4ai-field-type');
const attributeSelect = dialog.querySelector('#c4ai-field-attribute');
const attributeContainer = dialog.querySelector('#c4ai-attribute-select');
const previewValue = dialog.querySelector('#c4ai-preview-value');
const saveBtn = dialog.querySelector('#c4ai-field-save');
const cancelBtn = dialog.querySelector('#c4ai-field-cancel');
// Update preview based on type selection
const updatePreview = () => {
const type = typeSelect.value;
let value = '';
switch(type) {
case 'text':
value = element.textContent.trim();
attributeContainer.style.display = 'none';
break;
case 'attribute':
attributeContainer.style.display = 'block';
value = element.getAttribute(attributeSelect.value) || '';
break;
case 'link':
value = element.getAttribute('href') || element.querySelector('a')?.getAttribute('href') || '';
attributeContainer.style.display = 'none';
break;
case 'image':
value = element.getAttribute('src') || element.querySelector('img')?.getAttribute('src') || '';
attributeContainer.style.display = 'none';
break;
case 'list':
const listItems = element.querySelectorAll('li, option');
value = `[${listItems.length} items]`;
attributeContainer.style.display = 'none';
break;
case 'nested':
value = '[Complex nested structure]';
attributeContainer.style.display = 'none';
break;
}
previewValue.textContent = value.substring(0, 100) + (value.length > 100 ? '...' : '');
};
typeSelect.addEventListener('change', updatePreview);
attributeSelect.addEventListener('change', updatePreview);
const save = () => {
const fieldName = nameInput.value.trim();
if (fieldName) {
const type = typeSelect.value;
const selector = this.generateSmartSelector(element, this.container.element);
const field = {
name: fieldName,
type: type,
selector: selector,
element: element,
value: previewValue.textContent
};
// Add attribute if needed
if (type === 'attribute') {
field.attribute = attributeSelect.value;
} else if (type === 'link') {
field.type = 'attribute';
field.attribute = 'href';
} else if (type === 'image') {
field.type = 'attribute';
field.attribute = 'src';
}
this.fields.push(field);
element.classList.add('c4ai-selected-field');
element.setAttribute('data-c4ai-field', fieldName);
this.selectedElements.add(element);
this.updateToolbar();
this.updateStats();
this.generateSchema();
}
dialog.remove(); // Always close dialog
};
const cancel = () => {
dialog.remove();
};
saveBtn.addEventListener('click', save);
cancelBtn.addEventListener('click', cancel);
nameInput.addEventListener('keypress', (e) => {
if (e.key === 'Enter') save();
if (e.key === 'Escape') cancel();
});
nameInput.focus();
}
adjustParentLevels(delta) {
if (!this.container) return;
const newLevel = this.parentLevels + delta;
if (newLevel < 0 || newLevel > 5) return;
this.parentLevels = newLevel;
document.getElementById('c4ai-parent-value').textContent = newLevel;
// Update container selector with new parent levels
this.updateContainerSelector();
}
updateContainerSelector() {
if (!this.container || !this.selectedElement) return;
this.container.selector = this.generateContainerSelector(this.selectedElement);
this.container.element = this.selectedElement;
// Update the schema
this.generateSchema();
// Update display
const containerDisplay = document.getElementById('c4ai-container');
// containerDisplay.textContent = `${this.container.tagName} (${this.parentLevels} levels)`;
containerDisplay.textContent = `${this.container.tagName}`;
// Update selector display
const containerSelector = document.getElementById('c4ai-container-selector');
if (containerSelector) {
containerSelector.textContent = this.container.selector;
}
}
generateContainerSelector(element) {
// For container, include parent levels
let current = element;
const parts = [];
// Start from the target element
for (let i = 0; i <= this.parentLevels; i++) {
if (!current || current === document.body) break;
const selector = this.generateSingleElementSelector(current);
parts.unshift(selector);
if (i < this.parentLevels) {
current = current.parentElement;
}
}
// If we have parent levels, show them clearly
if (this.parentLevels > 0 && parts.length > 1) {
// Make it clear which part is the container
const containerPart = parts[parts.length - 1];
const parentParts = parts.slice(0, -1);
return parentParts.join(' > ') + ' > ' + containerPart;
}
return parts.join(' > ');
}
generateSingleElementSelector(element) {
// Generate selector for a single element
if (element.id) {
return `#${CSS.escape(element.id)}`;
}
// Check for data attributes (most stable)
const dataAttrs = ['data-testid', 'data-id', 'data-test', 'data-cy'];
for (const attr of dataAttrs) {
const value = element.getAttribute(attr);
if (value) {
return `[${attr}="${value}"]`;
}
}
// Check for aria-label
if (element.getAttribute('aria-label')) {
return `[aria-label="${element.getAttribute('aria-label')}"]`;
}
const tagName = element.tagName.toLowerCase();
// Check for simple, non-utility classes
const classes = Array.from(element.classList)
.filter(c => !c.startsWith('c4ai-')) // Exclude our classes
.filter(c => !c.includes('[') && !c.includes('(') && !c.includes(':')) // Exclude utility classes
.filter(c => c.length < 30); // Exclude very long classes
if (classes.length > 0 && classes.length <= 2) {
return tagName + classes.map(c => `.${CSS.escape(c)}`).join('');
}
return tagName;
}
generateSelector(element, context = document) {
// Try to generate a robust selector
if (element.id) {
return `#${CSS.escape(element.id)}`;
}
// Check for data attributes (most stable)
const dataAttrs = ['data-testid', 'data-id', 'data-test', 'data-cy'];
for (const attr of dataAttrs) {
const value = element.getAttribute(attr);
if (value) {
return `[${attr}="${value}"]`;
}
}
// Check for aria-label
if (element.getAttribute('aria-label')) {
return `[aria-label="${element.getAttribute('aria-label')}"]`;
}
// Try semantic HTML elements with text
const tagName = element.tagName.toLowerCase();
if (['button', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6'].includes(tagName)) {
const text = element.textContent.trim();
if (text && text.length < 50) {
// Use tag name with partial text match
return `${tagName}`;
}
}
// Check for simple, non-utility classes
const classes = Array.from(element.classList)
.filter(c => !c.startsWith('c4ai-')) // Exclude our classes
.filter(c => !c.includes('[') && !c.includes('(') && !c.includes(':')) // Exclude utility classes
.filter(c => c.length < 30); // Exclude very long classes
if (classes.length > 0 && classes.length <= 3) {
const selector = classes.map(c => `.${CSS.escape(c)}`).join('');
try {
if (context.querySelectorAll(selector).length === 1) {
return selector;
}
} catch (e) {
// Invalid selector, continue
}
}
// Use nth-child with simple parent tag
const parent = element.parentElement;
if (parent && parent !== context) {
const siblings = Array.from(parent.children);
const index = siblings.indexOf(element) + 1;
// Just use parent tag name to avoid recursion
const parentTag = parent.tagName.toLowerCase();
return `${parentTag} > ${tagName}:nth-child(${index})`;
}
// Final fallback
return tagName;
}
updateToolbar() {
// Update mode display
if (!this.container) {
document.getElementById('c4ai-mode').textContent = 'Select Container';
} else if (this.inspectingFields) {
document.getElementById('c4ai-mode').textContent = 'Select Fields';
} else {
document.getElementById('c4ai-mode').textContent = 'Container Selected';
}
// Show/hide container info and controls
const containerItem = document.getElementById('c4ai-container-item');
const parentLevelControls = document.getElementById('c4ai-parent-levels');
const footerSection = document.getElementById('c4ai-footer-section');
const selectorDisplay = document.getElementById('c4ai-selector-display');
const containerSelector = document.getElementById('c4ai-container-selector');
if (this.container) {
containerItem.style.display = 'flex';
parentLevelControls.style.display = 'flex';
footerSection.style.display = 'flex';
selectorDisplay.style.display = 'block';
// Update container display
document.getElementById('c4ai-container').textContent =
`${this.container.tagName} (${this.parentLevels} levels)`;
// Update selector display
containerSelector.textContent = this.container.selector;
} else {
containerItem.style.display = 'none';
parentLevelControls.style.display = 'none';
footerSection.style.display = 'none';
selectorDisplay.style.display = 'none';
}
// Show/hide sections based on state
const schemaSection = document.getElementById('c4ai-schema-section');
const actionsSection = document.getElementById('c4ai-actions-section');
const statsSection = document.getElementById('c4ai-stats-section');
if (this.fields.length > 0) {
schemaSection.style.display = 'block';
actionsSection.style.display = 'block';
statsSection.style.display = 'block';
// Update field count
document.getElementById('c4ai-field-count').textContent = this.fields.length;
// Update fields list with enhanced UI
const fieldsList = document.getElementById('c4ai-fields-list');
fieldsList.innerHTML = this.fields.map((field, index) => {
const icon = this.getFieldIcon(field.type);
return `
<div class="c4ai-field-item" data-index="${index}">
<div class="c4ai-field-header">
<span class="c4ai-field-icon">${icon}</span>
<span class="c4ai-field-name">${field.name}</span>
<div class="c4ai-field-actions">
<button class="c4ai-field-edit" data-index="${index}" title="Edit field">✏️</button>
<button class="c4ai-field-delete" data-index="${index}" title="Remove field">×</button>
</div>
</div>
<div class="c4ai-field-selector" contenteditable="true" data-index="${index}">${field.selector}</div>
</div>
`;
}).join('');
// Add event handlers
fieldsList.querySelectorAll('.c4ai-field-delete').forEach(btn => {
btn.addEventListener('click', (e) => {
const index = parseInt(e.target.dataset.index);
this.removeField(index);
});
});
fieldsList.querySelectorAll('.c4ai-field-edit').forEach(btn => {
btn.addEventListener('click', (e) => {
const index = parseInt(e.target.dataset.index);
this.editField(index);
});
});
fieldsList.querySelectorAll('.c4ai-field-selector').forEach(selector => {
selector.addEventListener('blur', (e) => {
const index = parseInt(e.target.dataset.index);
const newSelector = e.target.textContent.trim();
if (newSelector && this.fields[index]) {
this.fields[index].selector = newSelector;
this.generateSchema();
}
});
selector.addEventListener('keydown', (e) => {
if (e.key === 'Enter') {
e.preventDefault();
e.target.blur();
}
});
});
// Enable action buttons
document.getElementById('c4ai-preview').disabled = false;
document.getElementById('c4ai-test').disabled = false;
document.getElementById('c4ai-export-schema').disabled = false;
document.getElementById('c4ai-export-data').disabled = false;
document.getElementById('c4ai-deploy-cloud').disabled = false;
} else {
schemaSection.style.display = 'none';
actionsSection.style.display = 'none';
statsSection.style.display = 'none';
}
const hint = document.getElementById('c4ai-hint');
if (!this.container) {
hint.textContent = 'Click on a container element (e.g., product card, article, etc.)';
} else if (this.inspectingFields && this.fields.length === 0) {
hint.textContent = 'Click on fields inside the container to extract (title, price, etc.)';
} else if (this.inspectingFields) {
hint.innerHTML = `Continue selecting fields or click Fields button to stop.`;
} else if (this.fields.length === 0) {
hint.innerHTML = `Click <strong>Fields</strong> button to start selecting fields.`;
} else {
hint.innerHTML = `Use action buttons above or click <strong>Fields</strong> to add more.`;
}
}
getFieldIcon(type) {
const icons = {
'text': '📝',
'attribute': '📊',
'link': '🔗',
'image': '🖼️',
'list': '📚',
'nested': '📁'
};
return icons[type] || '📝';
}
removeField(index) {
const field = this.fields[index];
// Remove from arrays
this.fields.splice(index, 1);
// Remove visual selection
field.element.classList.remove('c4ai-selected-field');
field.element.removeAttribute('data-c4ai-field');
this.selectedElements.delete(field.element);
// Update UI
this.updateToolbar();
this.updateStats();
this.generateSchema();
}
editField(index) {
const field = this.fields[index];
if (!field) return;
// Re-show the field dialog with existing values
const dialog = document.createElement('div');
dialog.className = 'c4ai-field-dialog';
const rect = field.element.getBoundingClientRect();
dialog.style.cssText = `
left: ${rect.left + window.scrollX}px;
top: ${rect.bottom + window.scrollY + 10}px;
`;
// Get available attributes
const attributes = this.getElementAttributes(field.element);
const attributeOptions = attributes.map(attr =>
`<option value="${attr.name}" ${field.attribute === attr.name ? 'selected' : ''}>${attr.name}: "${attr.value.substring(0, 30)}${attr.value.length > 30 ? '...' : ''}"</option>`
).join('');
dialog.innerHTML = `
<div class="c4ai-field-dialog-content">
<h4>Edit Field</h4>
<div class="c4ai-field-input">
<label>Field Name:</label>
<input type="text" id="c4ai-field-name" value="${field.name}" placeholder="e.g., title, price, description" autofocus>
</div>
<div class="c4ai-field-input">
<label>Field Type:</label>
<select id="c4ai-field-type">
<option value="text" ${field.type === 'text' ? 'selected' : ''}>Text Content</option>
<option value="attribute" ${field.type === 'attribute' ? 'selected' : ''}>Attribute</option>
<option value="link" ${field.type === 'link' ? 'selected' : ''}>Link (href)</option>
<option value="image" ${field.type === 'image' ? 'selected' : ''}>Image (src)</option>
<option value="list" ${field.type === 'list' ? 'selected' : ''}>List</option>
<option value="nested" ${field.type === 'nested' ? 'selected' : ''}>Nested Object</option>
</select>
</div>
<div class="c4ai-field-input" id="c4ai-attribute-select" style="display: ${field.type === 'attribute' ? 'block' : 'none'};">
<label>Select Attribute:</label>
<select id="c4ai-field-attribute">
${attributeOptions}
</select>
</div>
<div class="c4ai-field-preview">
<strong>Preview Value:</strong>
<div id="c4ai-preview-value">${field.value}</div>
</div>
<div class="c4ai-field-selector">
<strong>Selector (auto-generated):</strong>
<div id="c4ai-selector-preview">${field.selector}</div>
</div>
<div class="c4ai-field-actions">
<button id="c4ai-field-save" class="c4ai-primary">✓ Update</button>
<button id="c4ai-field-cancel">✗ Cancel</button>
</div>
</div>
`;
document.body.appendChild(dialog);
const nameInput = dialog.querySelector('#c4ai-field-name');
const typeSelect = dialog.querySelector('#c4ai-field-type');
const attributeSelect = dialog.querySelector('#c4ai-field-attribute');
const attributeContainer = dialog.querySelector('#c4ai-attribute-select');
const previewValue = dialog.querySelector('#c4ai-preview-value');
const saveBtn = dialog.querySelector('#c4ai-field-save');
const cancelBtn = dialog.querySelector('#c4ai-field-cancel');
// Update preview based on type selection
const updatePreview = () => {
const type = typeSelect.value;
let value = '';
switch(type) {
case 'text':
value = field.element.textContent.trim();
attributeContainer.style.display = 'none';
break;
case 'attribute':
attributeContainer.style.display = 'block';
value = field.element.getAttribute(attributeSelect.value) || '';
break;
case 'link':
value = field.element.getAttribute('href') || field.element.querySelector('a')?.getAttribute('href') || '';
attributeContainer.style.display = 'none';
break;
case 'image':
value = field.element.getAttribute('src') || field.element.querySelector('img')?.getAttribute('src') || '';
attributeContainer.style.display = 'none';
break;
case 'list':
const listItems = field.element.querySelectorAll('li, option');
value = `[${listItems.length} items]`;
attributeContainer.style.display = 'none';
break;
case 'nested':
value = '[Complex nested structure]';
attributeContainer.style.display = 'none';
break;
}
previewValue.textContent = value.substring(0, 100) + (value.length > 100 ? '...' : '');
};
typeSelect.addEventListener('change', updatePreview);
attributeSelect.addEventListener('change', updatePreview);
const save = () => {
const fieldName = nameInput.value.trim();
if (fieldName) {
const type = typeSelect.value;
// Update field
field.name = fieldName;
field.type = type;
field.value = previewValue.textContent;
// Update attribute if needed
if (type === 'attribute') {
field.attribute = attributeSelect.value;
} else if (type === 'link') {
field.type = 'attribute';
field.attribute = 'href';
} else if (type === 'image') {
field.type = 'attribute';
field.attribute = 'src';
} else {
delete field.attribute;
}
// Update element attribute
field.element.setAttribute('data-c4ai-field', fieldName);
this.updateToolbar();
this.updateStats();
this.generateSchema();
}
dialog.remove();
};
const cancel = () => {
dialog.remove();
};
saveBtn.addEventListener('click', save);
cancelBtn.addEventListener('click', cancel);
nameInput.addEventListener('keypress', (e) => {
if (e.key === 'Enter') save();
if (e.key === 'Escape') cancel();
});
nameInput.focus();
nameInput.select();
}
updateStats() {
chrome.runtime.sendMessage({
action: 'updateStats',
stats: {
container: !!this.container,
fields: this.fields.length
}
});
}
removeAllHighlights() {
document.querySelectorAll('.c4ai-selected-container').forEach(el => {
el.classList.remove('c4ai-selected-container');
});
document.querySelectorAll('.c4ai-selected-field').forEach(el => {
el.classList.remove('c4ai-selected-field');
});
}
// New helper methods for enhanced functionality
getElementAttributes(element) {
const attributes = [];
for (const attr of element.attributes) {
attributes.push({
name: attr.name,
value: attr.value
});
}
return attributes;
}
generateSmartSelector(element, container) {
// Smart selector generation with 2-level parent context
const parts = [];
let current = element;
let depth = 0;
// Build path from element up to container (max 3 levels)
while (current && current !== container && depth < 3) {
let selector = current.tagName.toLowerCase();
// Add ID if available
if (current.id && !current.id.includes(':') && !current.id.includes('[')) {
selector = `#${CSS.escape(current.id)}`;
parts.unshift(selector);
break; // ID is unique enough
}
// Add classes (filter out dynamic/utility classes)
const classes = Array.from(current.classList)
.filter(c => !c.startsWith('c4ai-'))
.filter(c => !c.includes('[') && !c.includes('(') && !c.includes(':'))
.filter(c => c.length < 30)
.slice(0, 2); // Max 2 classes
if (classes.length > 0) {
selector += classes.map(c => `.${CSS.escape(c)}`).join('');
}
// Add data attributes for more specificity
const dataAttrs = ['data-testid', 'data-id', 'data-test'];
for (const attr of dataAttrs) {
if (current.hasAttribute(attr)) {
selector += `[${attr}="${CSS.escape(current.getAttribute(attr))}"]`;
break;
}
}
// Add nth-child if needed for disambiguation
if (current.parentElement && depth === 0) {
const siblings = Array.from(current.parentElement.children);
const sameTagSiblings = siblings.filter(s => s.tagName === current.tagName);
if (sameTagSiblings.length > 1) {
const index = sameTagSiblings.indexOf(current) + 1;
selector += `:nth-of-type(${index})`;
}
}
parts.unshift(selector);
current = current.parentElement;
depth++;
}
// Create relative selector from container
const fullSelector = parts.join(' > ');
// Test selector uniqueness within container
try {
const matches = container.querySelectorAll(fullSelector);
if (matches.length === 1 && matches[0] === element) {
return fullSelector;
}
} catch (e) {
// Invalid selector, continue with fallback
}
// Fallback to simple selector
return parts[parts.length - 1] || element.tagName.toLowerCase();
}
generateSchema() {
if (!this.container || this.fields.length === 0) {
return null;
}
// Build schema object
this.schema = {
name: `${window.location.hostname} Schema`,
baseSelector: this.container.selector,
fields: this.fields.map(field => {
const schemaField = {
name: field.name,
selector: field.selector,
type: field.type
};
if (field.attribute) {
schemaField.attribute = field.attribute;
}
return schemaField;
})
};
return this.schema;
}
togglePreview() {
this.previewMode = !this.previewMode;
const previewBtn = document.getElementById('c4ai-preview');
if (this.previewMode) {
previewBtn.innerHTML = '<span>🔄</span> Hide Preview';
this.showPreview();
} else {
previewBtn.innerHTML = '<span>👁️</span> Preview Matches';
this.clearPreview();
}
}
showPreview() {
if (!this.schema) {
this.generateSchema();
}
this.clearPreview();
// Find all matching containers
const containers = document.querySelectorAll(this.schema.baseSelector);
let successCount = 0;
containers.forEach((container, index) => {
// Highlight container
const containerBox = document.createElement('div');
containerBox.className = 'c4ai-preview-container';
const rect = container.getBoundingClientRect();
containerBox.style.cssText = `
position: absolute;
left: ${rect.left + window.scrollX}px;
top: ${rect.top + window.scrollY}px;
width: ${rect.width}px;
height: ${rect.height}px;
pointer-events: none;
z-index: 999997;
`;
document.body.appendChild(containerBox);
this.previewElements.push(containerBox);
// Check each field
let fieldsFound = 0;
this.schema.fields.forEach(field => {
try {
const fieldElement = container.querySelector(field.selector);
if (fieldElement) {
fieldsFound++;
// Highlight successful field
const fieldBox = document.createElement('div');
fieldBox.className = 'c4ai-preview-field-success';
const fieldRect = fieldElement.getBoundingClientRect();
fieldBox.style.cssText = `
position: absolute;
left: ${fieldRect.left + window.scrollX}px;
top: ${fieldRect.top + window.scrollY}px;
width: ${fieldRect.width}px;
height: ${fieldRect.height}px;
pointer-events: none;
z-index: 999998;
`;
document.body.appendChild(fieldBox);
this.previewElements.push(fieldBox);
}
} catch (e) {
// Invalid selector
}
});
// Add count badge
const badge = document.createElement('div');
badge.className = 'c4ai-preview-badge';
badge.textContent = `${index + 1}`;
badge.style.cssText = `
position: absolute;
left: ${rect.left + window.scrollX - 20}px;
top: ${rect.top + window.scrollY - 20}px;
z-index: 999999;
`;
document.body.appendChild(badge);
this.previewElements.push(badge);
if (fieldsFound === this.schema.fields.length) {
successCount++;
}
});
// Update stats
document.getElementById('c4ai-matches-count').textContent = `${containers.length} items`;
document.getElementById('c4ai-schema-valid').textContent =
successCount === containers.length ? '✓ Yes' : `⚠️ Partial (${successCount}/${containers.length})`;
}
clearPreview() {
this.previewElements.forEach(el => el.remove());
this.previewElements = [];
}
async testSchema() {
if (!this.schema) {
this.generateSchema();
}
// Extract data using schema
const results = [];
const containers = document.querySelectorAll(this.schema.baseSelector);
containers.forEach(container => {
const item = {};
this.schema.fields.forEach(field => {
try {
const element = container.querySelector(field.selector);
if (element) {
if (field.type === 'text') {
item[field.name] = element.textContent.trim();
} else if (field.type === 'attribute' && field.attribute) {
item[field.name] = element.getAttribute(field.attribute);
}
} else {
item[field.name] = null;
}
} catch (e) {
item[field.name] = null;
}
});
results.push(item);
});
// Show results modal
this.showResultsModal(results);
}
showResultsModal(data) {
const modal = document.createElement('div');
modal.className = 'c4ai-code-modal';
modal.innerHTML = `
<div class="c4ai-code-modal-content">
<div class="c4ai-code-modal-header">
<h2>Extracted Data (${data.length} items)</h2>
<button class="c4ai-close-modal" id="c4ai-close-results">✕</button>
</div>
<div class="c4ai-code-modal-body">
<pre class="c4ai-code-block"><code>${JSON.stringify(data, null, 2)}</code></pre>
</div>
<div class="c4ai-code-modal-footer">
<button class="c4ai-action-btn c4ai-download-btn" id="c4ai-download-data">
<span>⬇</span> Download JSON
</button>
<button class="c4ai-action-btn c4ai-download-btn" id="c4ai-download-python">
<span>🐍</span> Download Python Code
</button>
<button class="c4ai-action-btn c4ai-copy-btn" id="c4ai-copy-data">
<span>📋</span> Copy to Clipboard
</button>
</div>
</div>
`;
document.body.appendChild(modal);
// Event listeners
document.getElementById('c4ai-close-results').addEventListener('click', () => modal.remove());
document.getElementById('c4ai-download-data').addEventListener('click', () => {
const blob = new Blob([JSON.stringify(data, null, 2)], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `extracted_data_${Date.now()}.json`;
a.click();
URL.revokeObjectURL(url);
});
document.getElementById('c4ai-copy-data').addEventListener('click', () => {
navigator.clipboard.writeText(JSON.stringify(data, null, 2)).then(() => {
const btn = document.getElementById('c4ai-copy-data');
btn.innerHTML = '<span>✓</span> Copied!';
setTimeout(() => {
btn.innerHTML = '<span>📋</span> Copy to Clipboard';
}, 2000);
});
});
document.getElementById('c4ai-download-python').addEventListener('click', () => {
const pythonCode = this.generatePythonCode();
const blob = new Blob([pythonCode], { type: 'text/plain' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `crawl4ai_schema_${Date.now()}.py`;
a.click();
URL.revokeObjectURL(url);
});
}
exportSchema() {
if (!this.schema) {
this.generateSchema();
}
const blob = new Blob([JSON.stringify(this.schema, null, 2)], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `schema_${Date.now()}.json`;
a.click();
URL.revokeObjectURL(url);
}
async exportData() {
await this.testSchema();
}
deployToCloud() {
// Create cloud deployment modal
const modal = document.createElement('div');
modal.className = 'c4ai-code-modal';
modal.innerHTML = `
<div class="c4ai-cloud-modal-content">
<div class="c4ai-cloud-header">
<div class="c4ai-cloud-icon">🌩️</div>
<h2>Deploy to Crawl4AI Cloud</h2>
</div>
<div class="c4ai-cloud-body">
<div class="c4ai-cloud-features">
<h3>🚀 Coming Soon!</h3>
<p>Deploy your extraction schemas to the cloud with just one click:</p>
<ul>
<li>✨ <strong>Instant Deployment</strong> - Your schema live in seconds</li>
<li>🌐 <strong>API Access</strong> - RESTful endpoints for your extractions</li>
<li>⏰ <strong>Scheduled Runs</strong> - Automate data collection</li>
<li>📊 <strong>Analytics Dashboard</strong> - Monitor your extractions</li>
<li>🔄 <strong>Auto-scaling</strong> - Handle any volume seamlessly</li>
</ul>
</div>
<div class="c4ai-cloud-cta">
<p>Be the first to know when Crawl4AI Cloud launches!</p>
<button class="c4ai-action-btn c4ai-primary-btn c4ai-waitlist-btn" id="c4ai-join-waitlist">
<span>🎆</span> Join the Waiting List
</button>
</div>
</div>
<button class="c4ai-close-modal" id="c4ai-close-cloud-modal">✕</button>
</div>
`;
document.body.appendChild(modal);
// Add event listeners
document.getElementById('c4ai-close-cloud-modal').addEventListener('click', () => modal.remove());
document.getElementById('c4ai-join-waitlist').addEventListener('click', () => {
window.open('https://crawl4ai.com/join-waiting-list', '_blank');
modal.remove();
});
// Close on escape
const escHandler = (e) => {
if (e.key === 'Escape') {
modal.remove();
document.removeEventListener('keydown', escHandler);
}
};
document.addEventListener('keydown', escHandler);
}
generatePythonCode() {
if (!this.schema) {
this.generateSchema();
}
const schemaJson = JSON.stringify(this.schema, null, 2);
return `#!/usr/bin/env python3
"""
Generated by Crawl4AI Chrome Extension
URL: ${window.location.href}
Generated: ${new Date().toISOString()}
"""
import asyncio
import json
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
# The extraction schema generated from your selections
EXTRACTION_SCHEMA = ${schemaJson}
async def extract_data(url: str = "${window.location.href}"):
"""Extract data using the generated schema"""
# Configure browser (optional)
browser_config = BrowserConfig(
headless=True, # Set to False to see the browser
verbose=False
)
# Configure extraction strategy
extraction_strategy = JsonCssExtractionStrategy(schema=EXTRACTION_SCHEMA)
# Configure crawler
crawler_config = CrawlerRunConfig(
extraction_strategy=extraction_strategy,
# Add more options as needed:
# wait_for="css:.product", # Wait for specific elements
# js_code="window.scrollTo(0, document.body.scrollHeight);", # Execute JS
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url=url,
config=crawler_config
)
if result.success and result.extracted_content:
data = json.loads(result.extracted_content)
print(f"\\n✅ Successfully extracted {len(data)} items!")
# Save results
with open('extracted_data.json', 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
# Show sample results
print("\\n📊 Sample results (first 2 items):")
for i, item in enumerate(data[:2], 1):
print(f"\\nItem {i}:")
for key, value in item.items():
print(f" {key}: {value}")
return data
else:
print("❌ Extraction failed:", result.error_message)
return None
if __name__ == "__main__":
# Run the extraction
data = asyncio.run(extract_data())
print("\\n🎯 Next steps:")
print("1. Install Crawl4AI: pip install crawl4ai")
print("2. Modify the URL or add multiple URLs")
print("3. Customize crawler options as needed")
print("4. Check 'extracted_data.json' for full results")
`;
}
// Legacy code generation - kept for reference but no longer used
/*
generateCode() {
const fieldDescriptions = this.fields.map(f =>
`- ${f.name} (example: "${f.value.substring(0, 50)}...")`
).join('\n');
return `#!/usr/bin/env python3
"""
Generated by Crawl4AI Chrome Extension
URL: ${window.location.href}
Generated: ${new Date().toISOString()}
"""
import asyncio
import json
from pathlib import Path
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
# HTML snippet of the selected container element
HTML_SNIPPET = """
${this.container.html}
"""
# Extraction query based on your field selections
EXTRACTION_QUERY = """
Create a JSON CSS extraction schema to extract the following fields:
${fieldDescriptions}
The schema should handle multiple ${this.container.tagName} elements on the page.
Each item should be extracted as a separate object in the results array.
"""
async def generate_schema():
"""Generate extraction schema using LLM"""
print("🔧 Generating extraction schema...")
try:
# Generate the schema using Crawl4AI's built-in LLM integration
schema = JsonCssExtractionStrategy.generate_schema(
html=HTML_SNIPPET,
query=EXTRACTION_QUERY,
)
# Save the schema for reuse
schema_path = Path('generated_schema.json')
with open(schema_path, 'w') as f:
json.dump(schema, f, indent=2)
print("✅ Schema generated successfully!")
print(f"📄 Schema saved to: {schema_path}")
print("\\nGenerated schema:")
print(json.dumps(schema, indent=2))
return schema
except Exception as e:
print(f"❌ Error generating schema: {e}")
return None
async def test_extraction(url: str = "${window.location.href}"):
"""Test the generated schema on the actual webpage"""
print("\\n🧪 Testing extraction on live webpage...")
# Load the generated schema
try:
with open('generated_schema.json', 'r') as f:
schema = json.load(f)
except FileNotFoundError:
print("❌ Schema file not found. Run generate_schema() first.")
return
# Configure browser
browser_config = BrowserConfig(
headless=True,
verbose=False
)
# Configure extraction
crawler_config = CrawlerRunConfig(
extraction_strategy=JsonCssExtractionStrategy(schema=schema)
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url=url,
config=crawler_config
)
if result.success and result.extracted_content:
data = json.loads(result.extracted_content)
print(f"\\n✅ Successfully extracted {len(data)} items!")
# Save results
with open('extracted_data.json', 'w') as f:
json.dump(data, f, indent=2)
# Show sample results
print("\\n📊 Sample results (first 2 items):")
for i, item in enumerate(data[:2], 1):
print(f"\\nItem {i}:")
for key, value in item.items():
print(f" {key}: {value}")
else:
print("❌ Extraction failed:", result.error_message)
if __name__ == "__main__":
# Step 1: Generate the schema from HTML snippet
asyncio.run(generate_schema())
# Step 2: Test extraction on the live webpage
# Uncomment the line below to test extraction:
# asyncio.run(test_extraction())
print("\\n🎯 Next steps:")
print("1. Review the generated schema in 'generated_schema.json'")
print("2. Uncomment the test_extraction() line to test on the live site")
print("3. Use the schema in your Crawl4AI projects!")
`;
return code;
}
*/
/* Legacy modal - no longer used
showCodeModal(code) {
// Create modal
this.codeModal = document.createElement('div');
this.codeModal.className = 'c4ai-code-modal';
this.codeModal.innerHTML = `
<div class="c4ai-code-modal-content">
<div class="c4ai-code-modal-header">
<h2>Generated Python Code</h2>
<button class="c4ai-close-modal" id="c4ai-close-modal">✕</button>
</div>
<div class="c4ai-code-modal-body">
<pre class="c4ai-code-block"><code class="language-python">${window.C4AI_Utils.escapeHtml(code)}</code></pre>
</div>
<div class="c4ai-code-modal-footer">
<button class="c4ai-action-btn c4ai-cloud-btn" id="c4ai-run-cloud" disabled>
<span>☁️</span> Run on C4AI Cloud (Coming Soon)
</button>
<button class="c4ai-action-btn c4ai-download-btn" id="c4ai-download-code">
<span>⬇</span> Download Code
</button>
<button class="c4ai-action-btn c4ai-copy-btn" id="c4ai-copy-code">
<span>📋</span> Copy to Clipboard
</button>
</div>
</div>
`;
document.body.appendChild(this.codeModal);
// Add event listeners
document.getElementById('c4ai-close-modal').addEventListener('click', () => {
this.codeModal.remove();
this.codeModal = null;
// Don't stop the capture session
});
document.getElementById('c4ai-download-code').addEventListener('click', () => {
chrome.runtime.sendMessage({
action: 'downloadCode',
code: code,
filename: `crawl4ai_schema_${Date.now()}.py`
}, (response) => {
if (response && response.success) {
const btn = document.getElementById('c4ai-download-code');
const originalHTML = btn.innerHTML;
btn.innerHTML = '<span>✓</span> Downloaded!';
setTimeout(() => {
btn.innerHTML = originalHTML;
}, 2000);
} else {
console.error('Download failed:', response?.error);
alert('Download failed. Please check your browser settings.');
}
});
});
document.getElementById('c4ai-copy-code').addEventListener('click', () => {
navigator.clipboard.writeText(code).then(() => {
const btn = document.getElementById('c4ai-copy-code');
btn.innerHTML = '<span>✓</span> Copied!';
setTimeout(() => {
btn.innerHTML = '<span>📋</span> Copy to Clipboard';
}, 2000);
});
});
// Apply syntax highlighting
window.C4AI_Utils.applySyntaxHighlighting(this.codeModal.querySelector('.language-python'));
}
*/
}
// Expose the class as a global so the content script can reach it; skipped
// when no window object exists.
if (typeof window !== 'undefined') {
  Object.assign(window, { SchemaBuilder });
}