Files
crawl4ai/docs/md_v2/apps/crawl4ai-assistant/content/markdownConverter.js
UncleCode 0ac12da9f3 feat: Major Chrome Extension overhaul with Click2Crawl, instant Schema extraction, and modular architecture
 New Features:
- Click2Crawl: Visual element selection with markdown conversion
  - Ctrl/Cmd+Click to select multiple elements
  - Visual text mode for WYSIWYG extraction
  - Real-time markdown preview with syntax highlighting
  - Export to .md file or clipboard

- Schema Builder Enhancement: Instant data extraction without LLMs
  - Test schemas directly in browser
  - See JSON results immediately
  - Export data or Python code
  - Cloud deployment ready (coming soon)

- Modular Architecture:
  - Separated into schemaBuilder.js, scriptBuilder.js, click2CrawlBuilder.js
  - Added contentAnalyzer.js and markdownConverter.js modules
  - Shared utilities and CSS reset system
  - Integrated marked.js for markdown rendering

🎨 UI/UX Improvements:
- Added edgy cloud announcement banner with seamless shimmer animation
- Direct, technical copy: "You don't need Puppeteer. You need Crawl4AI Cloud."
- Enhanced feature cards with emojis
- Fixed CSS conflicts with targeted reset approach
- Improved badge hover effects (red on hover)
- Added wrap toggle for code preview

📚 Documentation Updates:
- Split extraction diagrams into LLM and no-LLM versions
- Updated llms-full.txt with latest content
- Added versioned LLM context (v0.1.1)

🔧 Technical Enhancements:
- Refactored 3464 lines of monolithic content.js into modules
- Added proper event handling and cleanup
- Improved z-index management
- Better scroll position tracking for badges
- Enhanced error handling throughout

This release transforms the Chrome Extension from a simple tool into a powerful
visual data extraction suite, making web scraping accessible to everyone.
2025-06-09 23:18:27 +08:00

718 lines
21 KiB
JavaScript

class MarkdownConverter {
constructor() {
// Conversion handlers for different element types
this.converters = {
'H1': async (el, ctx) => await this.convertHeading(el, 1, ctx),
'H2': async (el, ctx) => await this.convertHeading(el, 2, ctx),
'H3': async (el, ctx) => await this.convertHeading(el, 3, ctx),
'H4': async (el, ctx) => await this.convertHeading(el, 4, ctx),
'H5': async (el, ctx) => await this.convertHeading(el, 5, ctx),
'H6': async (el, ctx) => await this.convertHeading(el, 6, ctx),
'P': async (el, ctx) => await this.convertParagraph(el, ctx),
'A': async (el, ctx) => await this.convertLink(el, ctx),
'IMG': async (el, ctx) => await this.convertImage(el, ctx),
'UL': async (el, ctx) => await this.convertList(el, 'ul', ctx),
'OL': async (el, ctx) => await this.convertList(el, 'ol', ctx),
'LI': async (el, ctx) => await this.convertListItem(el, ctx),
'TABLE': async (el, ctx) => await this.convertTable(el, ctx),
'BLOCKQUOTE': async (el, ctx) => await this.convertBlockquote(el, ctx),
'PRE': async (el, ctx) => await this.convertPreformatted(el, ctx),
'CODE': async (el, ctx) => await this.convertCode(el, ctx),
'HR': async (el, ctx) => '\n---\n',
'BR': async (el, ctx) => ' \n',
'STRONG': async (el, ctx) => `**${await this.getTextContent(el, ctx)}**`,
'B': async (el, ctx) => `**${await this.getTextContent(el, ctx)}**`,
'EM': async (el, ctx) => `*${await this.getTextContent(el, ctx)}*`,
'I': async (el, ctx) => `*${await this.getTextContent(el, ctx)}*`,
'DEL': async (el, ctx) => `~~${await this.getTextContent(el, ctx)}~~`,
'S': async (el, ctx) => `~~${await this.getTextContent(el, ctx)}~~`,
'DIV': async (el, ctx) => await this.convertDiv(el, ctx),
'SPAN': async (el, ctx) => await this.convertSpan(el, ctx),
'ARTICLE': async (el, ctx) => await this.convertArticle(el, ctx),
'SECTION': async (el, ctx) => await this.convertSection(el, ctx),
'FIGURE': async (el, ctx) => await this.convertFigure(el, ctx),
'FIGCAPTION': async (el, ctx) => await this.convertFigCaption(el, ctx),
'VIDEO': async (el, ctx) => await this.convertVideo(el, ctx),
'IFRAME': async (el, ctx) => await this.convertIframe(el, ctx),
'DL': async (el, ctx) => await this.convertDefinitionList(el, ctx),
'DT': async (el, ctx) => await this.convertDefinitionTerm(el, ctx),
'DD': async (el, ctx) => await this.convertDefinitionDescription(el, ctx),
'TR': async (el, ctx) => await this.convertTableRow(el, ctx)
};
// Maintain context during conversion
this.conversionContext = {
listDepth: 0,
inTable: false,
inCode: false,
preserveWhitespace: false,
references: [],
imageCount: 0,
linkCount: 0
};
}
async convert(elements, options = {}) {
// Reset context
this.resetContext();
// Apply options
this.options = {
includeImages: true,
preserveTables: true,
keepCodeFormatting: true,
simplifyLayout: false,
preserveLinks: true,
...options
};
// Convert elements
const markdownParts = [];
for (const element of elements) {
const markdown = await this.convertElement(element, this.conversionContext);
if (markdown.trim()) {
markdownParts.push(markdown);
}
}
// Join parts with appropriate spacing
let result = markdownParts.join('\n\n');
// Add references if using reference-style links
if (this.conversionContext.references.length > 0) {
result += '\n\n' + this.generateReferences();
}
// Post-process to clean up
result = this.postProcess(result);
return result;
}
resetContext() {
this.conversionContext = {
listDepth: 0,
inTable: false,
inCode: false,
preserveWhitespace: false,
references: [],
imageCount: 0,
linkCount: 0
};
}
async convertElement(element, context) {
// Skip hidden elements
if (this.isHidden(element)) {
return '';
}
// Skip script and style elements
if (['SCRIPT', 'STYLE', 'NOSCRIPT'].includes(element.tagName)) {
return '';
}
// Get converter for this element type
const converter = this.converters[element.tagName];
if (converter) {
return await converter(element, context);
} else {
// For unknown elements, process children
return await this.processChildren(element, context);
}
}
async processChildren(element, context) {
const parts = [];
for (const child of element.childNodes) {
if (child.nodeType === Node.TEXT_NODE) {
const text = this.processTextNode(child, context);
if (text) {
parts.push(text);
}
} else if (child.nodeType === Node.ELEMENT_NODE) {
const markdown = await this.convertElement(child, context);
if (markdown) {
parts.push(markdown);
}
}
}
return parts.join('');
}
processTextNode(node, context) {
let text = node.textContent;
// Preserve whitespace in code blocks
if (!context.preserveWhitespace && !context.inCode) {
// Normalize whitespace
text = text.replace(/\s+/g, ' ');
// Trim if at block boundaries
if (this.isBlockBoundary(node.previousSibling)) {
text = text.trimStart();
}
if (this.isBlockBoundary(node.nextSibling)) {
text = text.trimEnd();
}
}
// Escape markdown characters
if (!context.inCode) {
text = this.escapeMarkdown(text);
}
return text;
}
isBlockBoundary(node) {
if (!node || node.nodeType !== Node.ELEMENT_NODE) {
return true;
}
const blockElements = [
'DIV', 'P', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6',
'UL', 'OL', 'LI', 'BLOCKQUOTE', 'PRE', 'TABLE',
'HR', 'ARTICLE', 'SECTION', 'HEADER', 'FOOTER',
'NAV', 'ASIDE', 'MAIN'
];
return blockElements.includes(node.tagName);
}
escapeMarkdown(text) {
// In text-only mode, don't escape characters
if (this.options.textOnly) {
return text;
}
// Escape special markdown characters
return text
.replace(/\\/g, '\\\\')
.replace(/\*/g, '\\*')
.replace(/_/g, '\\_')
.replace(/\[/g, '\\[')
.replace(/\]/g, '\\]')
.replace(/\(/g, '\\(')
.replace(/\)/g, '\\)')
.replace(/\#/g, '\\#')
.replace(/\+/g, '\\+')
.replace(/\-/g, '\\-')
.replace(/\./g, '\\.')
.replace(/\!/g, '\\!')
.replace(/\|/g, '\\|');
}
async convertHeading(element, level, context) {
const text = await this.getTextContent(element, context);
return '#'.repeat(level) + ' ' + text + '\n';
}
async convertParagraph(element, context) {
const content = await this.processChildren(element, context);
return content.trim() ? content + '\n' : '';
}
async convertLink(element, context) {
if (!this.options.preserveLinks || this.options.textOnly) {
return await this.getTextContent(element, context);
}
const text = await this.getTextContent(element, context);
const href = element.getAttribute('href');
const title = element.getAttribute('title');
if (!href) {
return text;
}
// Convert relative URLs to absolute
const absoluteUrl = this.makeAbsoluteUrl(href);
// Use reference-style links for cleaner markdown
if (text && absoluteUrl) {
if (title) {
return `[${text}](${absoluteUrl} "${title}")`;
} else {
return `[${text}](${absoluteUrl})`;
}
}
return text;
}
async convertImage(element, context) {
if (!this.options.includeImages || this.options.textOnly) {
// In text-only mode, return alt text if available
if (this.options.textOnly) {
const alt = element.getAttribute('alt');
return alt ? `[Image: ${alt}]` : '';
}
return '';
}
const src = element.getAttribute('src');
const alt = element.getAttribute('alt') || '';
const title = element.getAttribute('title');
if (!src) {
return '';
}
// Convert relative URLs to absolute
const absoluteUrl = this.makeAbsoluteUrl(src);
if (title) {
return `![${alt}](${absoluteUrl} "${title}")`;
} else {
return `![${alt}](${absoluteUrl})`;
}
}
async convertList(element, type, context) {
const oldDepth = context.listDepth;
context.listDepth++;
const items = [];
for (const child of element.children) {
if (child.tagName === 'LI') {
const markdown = await this.convertListItem(child, { ...context, listType: type });
if (markdown) {
items.push(markdown);
}
}
}
context.listDepth = oldDepth;
return items.join('\n') + (context.listDepth === 0 ? '\n' : '');
}
async convertListItem(element, context) {
const indent = ' '.repeat(Math.max(0, context.listDepth - 1));
const bullet = context.listType === 'ol' ? '1.' : '-';
const content = (await this.processChildren(element, context)).trim();
return `${indent}${bullet} ${content}`;
}
async convertTable(element, context) {
if (!this.options.preserveTables || this.options.textOnly) {
// Fallback to simple text representation
return await this.convertTableToText(element, context);
}
const rows = [];
const headerRows = [];
let maxCols = 0;
// Process table rows
for (const child of element.children) {
if (child.tagName === 'THEAD') {
for (const row of child.children) {
if (row.tagName === 'TR') {
const cells = await this.processTableRow(row, context);
headerRows.push(cells);
maxCols = Math.max(maxCols, cells.length);
}
}
} else if (child.tagName === 'TBODY') {
for (const row of child.children) {
if (row.tagName === 'TR') {
const cells = await this.processTableRow(row, context);
rows.push(cells);
maxCols = Math.max(maxCols, cells.length);
}
}
} else if (child.tagName === 'TR') {
const cells = await this.processTableRow(child, context);
rows.push(cells);
maxCols = Math.max(maxCols, cells.length);
}
}
// Build markdown table
const markdownRows = [];
// Add headers
if (headerRows.length > 0) {
for (const headerRow of headerRows) {
const paddedRow = this.padTableRow(headerRow, maxCols);
markdownRows.push('| ' + paddedRow.join(' | ') + ' |');
}
// Add separator
const separator = Array(maxCols).fill('---');
markdownRows.push('| ' + separator.join(' | ') + ' |');
}
// Add body rows
for (const row of rows) {
const paddedRow = this.padTableRow(row, maxCols);
markdownRows.push('| ' + paddedRow.join(' | ') + ' |');
}
return markdownRows.join('\n') + '\n';
}
async processTableRow(row, context) {
const cells = [];
for (const cell of row.children) {
if (cell.tagName === 'TD' || cell.tagName === 'TH') {
const content = (await this.getTextContent(cell, context)).trim();
cells.push(content);
}
}
return cells;
}
async convertTableRow(element, context) {
// Convert a single table row to markdown
if (this.options.textOnly) {
const cells = await this.processTableRow(element, context);
return cells.join(' ');
}
// For non-text-only mode, create a simple table representation
const cells = await this.processTableRow(element, context);
return '| ' + cells.join(' | ') + ' |';
}
padTableRow(row, targetLength) {
const padded = [...row];
while (padded.length < targetLength) {
padded.push('');
}
return padded;
}
async convertTableToText(element, context) {
// Convert table to clean text representation
const lines = [];
const rows = element.querySelectorAll('tr');
for (const row of rows) {
const cells = row.querySelectorAll('td, th');
const cellTexts = [];
for (const cell of cells) {
const text = (await this.getTextContent(cell, context)).trim();
if (text) {
cellTexts.push(text);
}
}
if (cellTexts.length > 0) {
// Join cells with space, handling common patterns
lines.push(cellTexts.join(' '));
}
}
return lines.join('\n');
}
async convertBlockquote(element, context) {
const lines = (await this.processChildren(element, context)).trim().split('\n');
return lines.map(line => '> ' + line).join('\n') + '\n';
}
async convertPreformatted(element, context) {
const oldInCode = context.inCode;
const oldPreserveWhitespace = context.preserveWhitespace;
context.inCode = true;
context.preserveWhitespace = true;
let content = '';
let language = '';
// Check if this is a code block with language
const codeElement = element.querySelector('code');
if (codeElement) {
// Try to detect language from class
const className = codeElement.className;
const langMatch = className.match(/language-(\w+)/);
if (langMatch) {
language = langMatch[1];
}
content = codeElement.textContent;
} else {
content = element.textContent;
}
context.inCode = oldInCode;
context.preserveWhitespace = oldPreserveWhitespace;
// Use fenced code blocks
return '```' + language + '\n' + content + '\n```\n';
}
async convertCode(element, context) {
if (element.parentElement && element.parentElement.tagName === 'PRE') {
// Already handled by convertPreformatted
return element.textContent;
}
const content = element.textContent;
return '`' + content + '`';
}
async convertDiv(element, context) {
// Check for special div types
if (element.className.includes('code-block') ||
element.className.includes('highlight')) {
return await this.convertPreformatted(element, context);
}
const content = await this.processChildren(element, context);
return content.trim() ? content + '\n' : '';
}
async convertSpan(element, context) {
// Check for special span types
if (element.className.includes('code') ||
element.className.includes('inline-code')) {
return this.convertCode(element, context);
}
return await this.processChildren(element, context);
}
async convertArticle(element, context) {
const content = await this.processChildren(element, context);
return content.trim() ? content + '\n' : '';
}
async convertSection(element, context) {
const content = await this.processChildren(element, context);
return content.trim() ? content + '\n' : '';
}
async convertFigure(element, context) {
const content = await this.processChildren(element, context);
return content.trim() ? content + '\n' : '';
}
async convertFigCaption(element, context) {
const caption = await this.getTextContent(element, context);
return caption ? '\n*' + caption + '*\n' : '';
}
async convertVideo(element, context) {
const title = element.getAttribute('title') || 'Video';
if (this.options.textOnly) {
return `[Video: ${title}]`;
}
const src = element.getAttribute('src');
const poster = element.getAttribute('poster');
if (!src) {
return '';
}
// Convert to markdown with poster image if available
if (poster) {
const absolutePoster = this.makeAbsoluteUrl(poster);
const absoluteSrc = this.makeAbsoluteUrl(src);
return `[![${title}](${absolutePoster})](${absoluteSrc})`;
} else {
const absoluteSrc = this.makeAbsoluteUrl(src);
return `[${title}](${absoluteSrc})`;
}
}
async convertIframe(element, context) {
const title = element.getAttribute('title') || 'Embedded content';
if (this.options.textOnly) {
const src = element.getAttribute('src') || '';
if (src.includes('youtube.com') || src.includes('youtu.be')) {
return `[Video: ${title}]`;
} else if (src.includes('vimeo.com')) {
return `[Video: ${title}]`;
} else {
return `[Embedded: ${title}]`;
}
}
const src = element.getAttribute('src');
if (!src) {
return '';
}
// Check for common embeds
if (src.includes('youtube.com') || src.includes('youtu.be')) {
return `[▶️ ${title}](${src})`;
} else if (src.includes('vimeo.com')) {
return `[▶️ ${title}](${src})`;
} else {
return `[${title}](${src})`;
}
}
async convertDefinitionList(element, context) {
return await this.processChildren(element, context) + '\n';
}
async convertDefinitionTerm(element, context) {
const term = await this.getTextContent(element, context);
return '**' + term + '**\n';
}
async convertDefinitionDescription(element, context) {
const description = await this.processChildren(element, context);
return ': ' + description + '\n';
}
async getTextContent(element, context) {
// Special handling for elements that might contain other markdown
if (context.inCode) {
return element.textContent;
}
return await this.processChildren(element, context);
}
makeAbsoluteUrl(url) {
if (!url) return '';
try {
// Check if already absolute
if (url.startsWith('http://') || url.startsWith('https://')) {
return url;
}
// Handle protocol-relative URLs
if (url.startsWith('//')) {
return window.location.protocol + url;
}
// Convert relative to absolute
const base = window.location.origin;
const path = window.location.pathname;
if (url.startsWith('/')) {
return base + url;
} else {
// Relative to current path
const pathDir = path.substring(0, path.lastIndexOf('/') + 1);
return base + pathDir + url;
}
} catch (e) {
return url;
}
}
isHidden(element) {
const style = window.getComputedStyle(element);
return style.display === 'none' ||
style.visibility === 'hidden' ||
style.opacity === '0';
}
generateReferences() {
return this.conversionContext.references
.map((ref, index) => `[${index + 1}]: ${ref.url}`)
.join('\n');
}
postProcess(markdown) {
// Apply text-only specific processing
if (this.options.textOnly) {
markdown = this.postProcessTextOnly(markdown);
}
// Clean up excessive newlines
markdown = markdown.replace(/\n{3,}/g, '\n\n');
// Clean up spaces before punctuation
markdown = markdown.replace(/ +([.,;:!?])/g, '$1');
// Ensure proper spacing around headers
markdown = markdown.replace(/\n(#{1,6} )/g, '\n\n$1');
markdown = markdown.replace(/(#{1,6} .+)\n(?![\n#])/g, '$1\n\n');
// Clean up list spacing
markdown = markdown.replace(/\n\n(-|\d+\.) /g, '\n$1 ');
// Trim final result
return markdown.trim();
}
postProcessTextOnly(markdown) {
// Smart pattern recognition for common formats
const lines = markdown.split('\n');
const processedLines = [];
let inMetadata = false;
let currentItem = null;
for (let i = 0; i < lines.length; i++) {
const line = lines[i].trim();
if (!line) {
processedLines.push('');
continue;
}
// Detect numbered list items (common in HN, Reddit, etc.)
const numberPattern = /^(\d+)\.\s*(.+)$/;
const numberMatch = line.match(numberPattern);
if (numberMatch) {
// Start of a new numbered item
inMetadata = false;
currentItem = numberMatch[1];
const content = numberMatch[2];
// Check if content has domain in parentheses
const domainPattern = /^(.+?)\s*\(([^)]+)\)\s*(.*)$/;
const domainMatch = content.match(domainPattern);
if (domainMatch) {
const [, title, domain, rest] = domainMatch;
processedLines.push(`${currentItem}. **${title.trim()}** (${domain})`);
if (rest.trim()) {
processedLines.push(` ${rest.trim()}`);
inMetadata = true;
}
} else {
processedLines.push(`${currentItem}. **${content}**`);
}
} else if (line.match(/\b(points?|by|ago|hide|comments?)\b/i) && currentItem) {
// This looks like metadata for the current item
const cleanedLine = line
.replace(/\s+/g, ' ')
.replace(/\s*\|\s*/g, ' | ')
.trim();
processedLines.push(` ${cleanedLine}`);
inMetadata = true;
} else if (inMetadata && line.length < 100) {
// Continue metadata if we're in metadata mode and line is short
processedLines.push(` ${line}`);
} else {
// Regular content
inMetadata = false;
processedLines.push(line);
}
}
// Clean up the output
let result = processedLines.join('\n');
// Remove excessive blank lines
result = result.replace(/\n{3,}/g, '\n\n');
// Ensure proper spacing after numbered items
result = result.replace(/^(\d+\..+)$\n^(?!\s)/gm, '$1\n\n');
return result;
}
}