`;
document.body.appendChild(modal);
// Event listeners
document.getElementById('c4ai-close-results').addEventListener('click', () => modal.remove());
document.getElementById('c4ai-download-data').addEventListener('click', () => {
const blob = new Blob([JSON.stringify(data, null, 2)], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `extracted_data_${Date.now()}.json`;
a.click();
URL.revokeObjectURL(url);
});
document.getElementById('c4ai-copy-data').addEventListener('click', () => {
navigator.clipboard.writeText(JSON.stringify(data, null, 2)).then(() => {
const btn = document.getElementById('c4ai-copy-data');
btn.innerHTML = 'โ Copied!';
setTimeout(() => {
btn.innerHTML = '๐ Copy to Clipboard';
}, 2000);
});
});
document.getElementById('c4ai-download-python').addEventListener('click', () => {
const pythonCode = this.generatePythonCode();
const blob = new Blob([pythonCode], { type: 'text/plain' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `crawl4ai_schema_${Date.now()}.py`;
a.click();
URL.revokeObjectURL(url);
});
}
exportSchema() {
if (!this.schema) {
this.generateSchema();
}
const blob = new Blob([JSON.stringify(this.schema, null, 2)], { type: 'application/json' });
const url = URL.createObjectURL(blob);
const a = document.createElement('a');
a.href = url;
a.download = `schema_${Date.now()}.json`;
a.click();
URL.revokeObjectURL(url);
}
async exportData() {
// Export is implemented as a delegate to testSchema(): judging by the result
// modal wired earlier in this file, testSchema() runs the extraction and
// presents the data with download/copy actions, so both entry points share
// one implementation. NOTE(review): confirm testSchema() has no other side
// effects callers of exportData() would not expect.
await this.testSchema();
}
async exportMarkdown() {
// Initialize markdown converter if not already done
if (!this.markdownConverter) {
this.markdownConverter = new MarkdownConverter();
}
if (!this.contentAnalyzer) {
this.contentAnalyzer = new ContentAnalyzer();
}
// Initialize markdown preview modal if not already done
if (!this.markdownPreviewModal) {
this.markdownPreviewModal = new MarkdownPreviewModal();
}
// Get all matching containers
const containers = document.querySelectorAll(this.container.selector);
if (containers.length === 0) {
this.showNotification('No matching containers found', 'error');
return;
}
// Show modal with callback to generate markdown
this.markdownPreviewModal.show(async (options) => {
return await this.generateMarkdownFromSchema(options);
});
}
async generateMarkdownFromSchema(options) {
// Get all matching containers
const containers = document.querySelectorAll(this.container.selector);
const markdownParts = [];
for (let i = 0; i < containers.length; i++) {
const container = containers[i];
// Add XPath header if enabled
if (options.includeXPath) {
const xpath = this.getXPath(container);
markdownParts.push(`### Container ${i + 1} - XPath: \`${xpath}\`\n`);
}
// Extract data based on schema fields
const extractedData = {};
this.fields.forEach(field => {
try {
const element = container.querySelector(field.selector);
if (element) {
if (field.type === 'text') {
extractedData[field.name] = element.textContent.trim();
} else if (field.type === 'attribute' && field.attribute) {
extractedData[field.name] = element.getAttribute(field.attribute);
}
}
} catch (e) {
// Skip invalid selectors
}
});
// Convert container to markdown based on options
const analysis = await this.contentAnalyzer.analyze([container]);
const containerMarkdown = await this.markdownConverter.convert([container], {
...options,
analysis,
extractedData // Pass extracted data for context
});
// Trim the markdown before adding
const trimmedMarkdown = containerMarkdown.trim();
markdownParts.push(trimmedMarkdown);
// Add separator if enabled and not last element
if (options.addSeparators && i < containers.length - 1) {
markdownParts.push('\n---\n');
}
}
return markdownParts.join('\n');
}
getXPath(element) {
if (element.id) {
return `//*[@id="${element.id}"]`;
}
const parts = [];
let current = element;
while (current && current.nodeType === Node.ELEMENT_NODE) {
let index = 0;
let sibling = current.previousSibling;
while (sibling) {
if (sibling.nodeType === Node.ELEMENT_NODE && sibling.nodeName === current.nodeName) {
index++;
}
sibling = sibling.previousSibling;
}
const tagName = current.nodeName.toLowerCase();
const part = index > 0 ? `${tagName}[${index + 1}]` : tagName;
parts.unshift(part);
current = current.parentNode;
}
return '/' + parts.join('/');
}
showNotification(message, type = 'success') {
const notification = document.createElement('div');
notification.className = `c4ai-notification c4ai-notification-${type}`;
notification.textContent = message;
document.body.appendChild(notification);
// Animate in
setTimeout(() => notification.classList.add('show'), 10);
// Remove after 3 seconds
setTimeout(() => {
notification.classList.remove('show');
setTimeout(() => notification.remove(), 300);
}, 3000);
}
deployToCloud() {
// Create cloud deployment modal
const modal = document.createElement('div');
modal.className = 'c4ai-code-modal';
modal.innerHTML = `
๐ฉ๏ธ
Deploy to Crawl4AI Cloud
๐ Coming Soon!
Deploy your extraction schemas to the cloud with just one click:
โจ Instant Deployment - Your schema live in seconds
๐ API Access - RESTful endpoints for your extractions
โฐ Scheduled Runs - Automate data collection
๐ Analytics Dashboard - Monitor your extractions
๐ Auto-scaling - Handle any volume seamlessly
Be the first to know when Crawl4AI Cloud launches!
`;
document.body.appendChild(modal);
// Add event listeners
document.getElementById('c4ai-close-cloud-modal').addEventListener('click', () => modal.remove());
document.getElementById('c4ai-join-waitlist').addEventListener('click', () => {
window.open('https://crawl4ai.com/join-waiting-list', '_blank');
modal.remove();
});
// Close on escape
const escHandler = (e) => {
if (e.key === 'Escape') {
modal.remove();
document.removeEventListener('keydown', escHandler);
}
};
document.addEventListener('keydown', escHandler);
}
generatePythonCode() {
// Build a runnable Python script that reproduces the current extraction
// using Crawl4AI's JsonCssExtractionStrategy with the generated schema
// embedded as a literal. Returned as a string for the download handler.
// Lazily generate the schema if the user hasn't done so yet.
if (!this.schema) {
this.generateSchema();
}
// Embed the schema as pretty-printed JSON inside the generated script.
const schemaJson = JSON.stringify(this.schema, null, 2);
// NOTE: everything inside the template literal below is emitted verbatim
// into the .py file — do not add JS comments inside it.
return `#!/usr/bin/env python3
"""
Generated by Crawl4AI Chrome Extension
URL: ${window.location.href}
Generated: ${new Date().toISOString()}
"""
import asyncio
import json
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai import JsonCssExtractionStrategy
# The extraction schema generated from your selections
EXTRACTION_SCHEMA = ${schemaJson}
async def extract_data(url: str = "${window.location.href}"):
"""Extract data using the generated schema"""
# Configure browser (optional)
browser_config = BrowserConfig(
headless=True, # Set to False to see the browser
verbose=False
)
# Configure extraction strategy
extraction_strategy = JsonCssExtractionStrategy(schema=EXTRACTION_SCHEMA)
# Configure crawler
crawler_config = CrawlerRunConfig(
extraction_strategy=extraction_strategy,
# Add more options as needed:
# wait_for="css:.product", # Wait for specific elements
# js_code="window.scrollTo(0, document.body.scrollHeight);", # Execute JS
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url=url,
config=crawler_config
)
if result.success and result.extracted_content:
data = json.loads(result.extracted_content)
print(f"\\nโ Successfully extracted {len(data)} items!")
# Save results
with open('extracted_data.json', 'w', encoding='utf-8') as f:
json.dump(data, f, indent=2, ensure_ascii=False)
# Show sample results
print("\\n๐ Sample results (first 2 items):")
for i, item in enumerate(data[:2], 1):
print(f"\\nItem {i}:")
for key, value in item.items():
print(f" {key}: {value}")
return data
else:
print("โ Extraction failed:", result.error_message)
return None
if __name__ == "__main__":
# Run the extraction
data = asyncio.run(extract_data())
print("\\n๐ฏ Next steps:")
print("1. Install Crawl4AI: pip install crawl4ai")
print("2. Modify the URL or add multiple URLs")
print("3. Customize crawler options as needed")
print("4. Check 'extracted_data.json' for full results")
`;
}
// Legacy code generation - kept for reference but no longer used
/*
generateCode() {
const fieldDescriptions = this.fields.map(f =>
`- ${f.name} (example: "${f.value.substring(0, 50)}...")`
).join('\n');
return `#!/usr/bin/env python3
"""
Generated by Crawl4AI Chrome Extension
URL: ${window.location.href}
Generated: ${new Date().toISOString()}
"""
import asyncio
import json
from pathlib import Path
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai import JsonCssExtractionStrategy
# HTML snippet of the selected container element
HTML_SNIPPET = """
${this.container.html}
"""
# Extraction query based on your field selections
EXTRACTION_QUERY = """
Create a JSON CSS extraction schema to extract the following fields:
${fieldDescriptions}
The schema should handle multiple ${this.container.tagName} elements on the page.
Each item should be extracted as a separate object in the results array.
"""
async def generate_schema():
"""Generate extraction schema using LLM"""
print("๐ง Generating extraction schema...")
try:
# Generate the schema using Crawl4AI's built-in LLM integration
schema = JsonCssExtractionStrategy.generate_schema(
html=HTML_SNIPPET,
query=EXTRACTION_QUERY,
)
# Save the schema for reuse
schema_path = Path('generated_schema.json')
with open(schema_path, 'w') as f:
json.dump(schema, f, indent=2)
print("โ Schema generated successfully!")
print(f"๐ Schema saved to: {schema_path}")
print("\\nGenerated schema:")
print(json.dumps(schema, indent=2))
return schema
except Exception as e:
print(f"โ Error generating schema: {e}")
return None
async def test_extraction(url: str = "${window.location.href}"):
"""Test the generated schema on the actual webpage"""
print("\\n๐งช Testing extraction on live webpage...")
# Load the generated schema
try:
with open('generated_schema.json', 'r') as f:
schema = json.load(f)
except FileNotFoundError:
print("โ Schema file not found. Run generate_schema() first.")
return
# Configure browser
browser_config = BrowserConfig(
headless=True,
verbose=False
)
# Configure extraction
crawler_config = CrawlerRunConfig(
extraction_strategy=JsonCssExtractionStrategy(schema=schema)
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url=url,
config=crawler_config
)
if result.success and result.extracted_content:
data = json.loads(result.extracted_content)
print(f"\\nโ Successfully extracted {len(data)} items!")
# Save results
with open('extracted_data.json', 'w') as f:
json.dump(data, f, indent=2)
# Show sample results
print("\\n๐ Sample results (first 2 items):")
for i, item in enumerate(data[:2], 1):
print(f"\\nItem {i}:")
for key, value in item.items():
print(f" {key}: {value}")
else:
print("โ Extraction failed:", result.error_message)
if __name__ == "__main__":
# Step 1: Generate the schema from HTML snippet
asyncio.run(generate_schema())
# Step 2: Test extraction on the live webpage
# Uncomment the line below to test extraction:
# asyncio.run(test_extraction())
print("\\n๐ฏ Next steps:")
print("1. Review the generated schema in 'generated_schema.json'")
print("2. Uncomment the test_extraction() line to test on the live site")
print("3. Use the schema in your Crawl4AI projects!")
`;
return code;
}
*/
/* Legacy modal - no longer used
showCodeModal(code) {
// Create modal
this.codeModal = document.createElement('div');
this.codeModal.className = 'c4ai-code-modal';
this.codeModal.innerHTML = `