✨ New Features: - Click2Crawl: Visual element selection with markdown conversion - Ctrl/Cmd+Click to select multiple elements - Visual text mode for WYSIWYG extraction - Real-time markdown preview with syntax highlighting - Export to .md file or clipboard - Schema Builder Enhancement: Instant data extraction without LLMs - Test schemas directly in browser - See JSON results immediately - Export data or Python code - Cloud deployment ready (coming soon) - Modular Architecture: - Separated into schemaBuilder.js, scriptBuilder.js, click2CrawlBuilder.js - Added contentAnalyzer.js and markdownConverter.js modules - Shared utilities and CSS reset system - Integrated marked.js for markdown rendering 🎨 UI/UX Improvements: - Added edgy cloud announcement banner with seamless shimmer animation - Direct, technical copy: "You don't need Puppeteer. You need Crawl4AI Cloud." - Enhanced feature cards with emojis - Fixed CSS conflicts with targeted reset approach - Improved badge hover effects (red on hover) - Added wrap toggle for code preview 📚 Documentation Updates: - Split extraction diagrams into LLM and no-LLM versions - Updated llms-full.txt with latest content - Added versioned LLM context (v0.1.1) 🔧 Technical Enhancements: - Refactored 3464 lines of monolithic content.js into modules - Added proper event handling and cleanup - Improved z-index management - Better scroll position tracking for badges - Enhanced error handling throughout This release transforms the Chrome Extension from a simple tool into a powerful visual data extraction suite, making web scraping accessible to everyone.
718 lines
21 KiB
JavaScript
718 lines
21 KiB
JavaScript
/**
 * Converts DOM elements into Markdown text.
 *
 * Each supported tag name is mapped to a handler in `this.converters`;
 * elements with no handler are converted by recursing into their children.
 * The converter reads `window`, `Node` and (when present) `document`, so it
 * is meant to run in a page / content-script context.
 */
class MarkdownConverter {
    constructor() {
        const heading = (level) => (el, ctx) => this.convertHeading(el, level, ctx);
        const wrap = (marker) => (el, ctx) => this.wrapInline(el, marker, ctx);
        const container = (el, ctx) => this.convertContainer(el, ctx);

        // Conversion handlers keyed by upper-case tag name. Every handler
        // returns a promise resolving to the Markdown for that element.
        this.converters = {
            'H1': heading(1),
            'H2': heading(2),
            'H3': heading(3),
            'H4': heading(4),
            'H5': heading(5),
            'H6': heading(6),
            'P': (el, ctx) => this.convertParagraph(el, ctx),
            'A': (el, ctx) => this.convertLink(el, ctx),
            'IMG': (el, ctx) => this.convertImage(el, ctx),
            'UL': (el, ctx) => this.convertList(el, 'ul', ctx),
            'OL': (el, ctx) => this.convertList(el, 'ol', ctx),
            'LI': (el, ctx) => this.convertListItem(el, ctx),
            'TABLE': (el, ctx) => this.convertTable(el, ctx),
            'BLOCKQUOTE': (el, ctx) => this.convertBlockquote(el, ctx),
            'PRE': (el, ctx) => this.convertPreformatted(el, ctx),
            'CODE': (el, ctx) => this.convertCode(el, ctx),
            'HR': async () => '\n---\n',
            // Two trailing spaces are what makes this a Markdown hard break.
            'BR': async () => '  \n',
            'STRONG': wrap('**'),
            'B': wrap('**'),
            'EM': wrap('*'),
            'I': wrap('*'),
            'DEL': wrap('~~'),
            'S': wrap('~~'),
            'DIV': (el, ctx) => this.convertDiv(el, ctx),
            'SPAN': (el, ctx) => this.convertSpan(el, ctx),
            'ARTICLE': (el, ctx) => this.convertArticle(el, ctx),
            'SECTION': (el, ctx) => this.convertSection(el, ctx),
            'FIGURE': (el, ctx) => this.convertFigure(el, ctx),
            'FIGCAPTION': (el, ctx) => this.convertFigCaption(el, ctx),
            'VIDEO': (el, ctx) => this.convertVideo(el, ctx),
            'IFRAME': (el, ctx) => this.convertIframe(el, ctx),
            'DL': (el, ctx) => this.convertDefinitionList(el, ctx),
            'DT': (el, ctx) => this.convertDefinitionTerm(el, ctx),
            'DD': (el, ctx) => this.convertDefinitionDescription(el, ctx),
            'TR': (el, ctx) => this.convertTableRow(el, ctx)
        };

        // Kept only so the three container converters share one implementation.
        this.containerConverter = container;

        // Tags whose output starts/ends a line; text touching them is trimmed.
        this.blockTags = new Set([
            'DIV', 'P', 'H1', 'H2', 'H3', 'H4', 'H5', 'H6',
            'UL', 'OL', 'LI', 'BLOCKQUOTE', 'PRE', 'TABLE',
            'HR', 'ARTICLE', 'SECTION', 'HEADER', 'FOOTER',
            'NAV', 'ASIDE', 'MAIN',
            'BR', 'DL', 'DT', 'DD', 'FIGURE', 'FIGCAPTION'
        ]);

        // Options are initialised here (not only in convert()) so that the
        // individual convert*/escape/postProcess methods can be called directly.
        this.options = this.defaultOptions();
        this.resetContext();
    }

    /**
     * @returns {object} A fresh copy of the default conversion options.
     */
    defaultOptions() {
        return {
            includeImages: true,
            preserveTables: true,
            keepCodeFormatting: true,
            simplifyLayout: false,
            preserveLinks: true
        };
    }

    /**
     * Convert a collection of elements to one Markdown document.
     *
     * @param {Iterable<Element>|Element|null} elements - Elements to convert.
     *   A single non-iterable element is accepted; null/undefined yields ''.
     * @param {object} [options] - Overrides merged over defaultOptions()
     *   (e.g. `textOnly`, `includeImages`, `preserveTables`, `preserveLinks`).
     * @returns {Promise<string>} The post-processed Markdown.
     */
    async convert(elements, options = {}) {
        this.resetContext();
        this.options = { ...this.defaultOptions(), ...options };

        let elementList;
        if (elements == null) {
            elementList = [];
        } else if (typeof elements[Symbol.iterator] === 'function') {
            elementList = elements;
        } else {
            elementList = [elements];
        }

        const markdownParts = [];
        for (const element of elementList) {
            const markdown = await this.convertElement(element, this.conversionContext);
            if (markdown.trim()) {
                markdownParts.push(markdown);
            }
        }

        // Top-level selections are separated by a blank line.
        let result = markdownParts.join('\n\n');

        // Only non-empty when something pushed into context.references.
        if (this.conversionContext.references.length > 0) {
            result += '\n\n' + this.generateReferences();
        }

        return this.postProcess(result);
    }

    /**
     * Reset the per-conversion state object.
     */
    resetContext() {
        this.conversionContext = {
            listDepth: 0,
            inTable: false,
            inCode: false,
            preserveWhitespace: false,
            references: [],
            imageCount: 0,
            linkCount: 0
        };
    }

    /**
     * Convert one element, dispatching on its tag name.
     *
     * @param {Element} element
     * @param {object} context - Conversion state (see resetContext()).
     * @returns {Promise<string>} Markdown, or '' for hidden/script/style elements.
     */
    async convertElement(element, context) {
        if (this.isHidden(element)) {
            return '';
        }

        if (['SCRIPT', 'STYLE', 'NOSCRIPT'].includes(element.tagName)) {
            return '';
        }

        const converter = this.converters[element.tagName];
        if (converter) {
            return await converter(element, context);
        }

        // Unknown elements are transparent: only their children contribute.
        return await this.processChildren(element, context);
    }

    /**
     * Convert all text and element children of `element` and concatenate them.
     *
     * @param {Element} element
     * @param {object} context
     * @returns {Promise<string>}
     */
    async processChildren(element, context) {
        const parts = [];

        for (const child of element.childNodes) {
            if (child.nodeType === Node.TEXT_NODE) {
                const text = this.processTextNode(child, context);
                if (text) {
                    parts.push(text);
                }
            } else if (child.nodeType === Node.ELEMENT_NODE) {
                const markdown = await this.convertElement(child, context);
                if (markdown) {
                    parts.push(markdown);
                }
            }
        }

        return parts.join('');
    }

    /**
     * Turn a text node into Markdown-safe text.
     *
     * Outside code/whitespace-preserving contexts, whitespace runs collapse
     * to one space and the text is trimmed where it touches a block boundary.
     *
     * @param {Text} node
     * @param {object} context
     * @returns {string}
     */
    processTextNode(node, context) {
        let text = node.textContent;

        if (!context.preserveWhitespace && !context.inCode) {
            text = text.replace(/\s+/g, ' ');

            if (this.isBlockBoundary(node.previousSibling)) {
                text = text.trimStart();
            }
            if (this.isBlockBoundary(node.nextSibling)) {
                text = text.trimEnd();
            }
        }

        if (!context.inCode) {
            text = this.escapeMarkdown(text);
        }

        return text;
    }

    /**
     * Whether `node` marks a block boundary for whitespace trimming.
     *
     * A missing sibling (start/end of the parent) is a boundary. Non-element
     * siblings such as comments or adjacent text nodes are not, so the spaces
     * around them are kept and neighbouring words are not merged.
     *
     * @param {Node|null} node
     * @returns {boolean}
     */
    isBlockBoundary(node) {
        if (!node) {
            return true;
        }
        if (node.nodeType !== Node.ELEMENT_NODE) {
            return false;
        }
        return this.blockTags.has(node.tagName);
    }

    /**
     * Backslash-escape characters that have a meaning in Markdown.
     * Returns the text unchanged when the `textOnly` option is set.
     *
     * @param {string} text
     * @returns {string}
     */
    escapeMarkdown(text) {
        if (this.options.textOnly) {
            return text;
        }

        // Single pass: every listed character (including the backslash itself
        // and the backtick) gets one backslash in front of it.
        return text.replace(/[\\*_\[\]()#+\-.!|`]/g, '\\$&');
    }

    /**
     * Wrap an element's content in an inline marker such as `**` or `*`.
     * Empty content yields no markers; outer whitespace stays outside them.
     *
     * @param {Element} element
     * @param {string} marker
     * @param {object} context
     * @returns {Promise<string>}
     */
    async wrapInline(element, marker, context) {
        const text = await this.getTextContent(element, context);
        const core = text.trim();
        if (!core) {
            return text;
        }

        const leading = text.slice(0, text.length - text.trimStart().length);
        const trailing = text.slice(text.trimEnd().length);
        return leading + marker + core + marker + trailing;
    }

    /**
     * @param {Element} element
     * @param {number} level - 1 to 6.
     * @param {object} context
     * @returns {Promise<string>} ATX heading line, or '' when the heading is empty.
     */
    async convertHeading(element, level, context) {
        const raw = await this.getTextContent(element, context);
        // An ATX heading is a single line, so inner line breaks become spaces.
        const text = raw.trim().replace(/\s*\n\s*/g, ' ');
        if (!text) {
            return '';
        }
        return '#'.repeat(level) + ' ' + text + '\n';
    }

    /**
     * Shared implementation for block containers: children plus a trailing
     * newline, or '' when the content is blank.
     */
    async convertContainer(element, context) {
        const content = await this.processChildren(element, context);
        return content.trim() ? content + '\n' : '';
    }

    async convertParagraph(element, context) {
        return await this.convertContainer(element, context);
    }

    /**
     * Convert an anchor to an inline Markdown link.
     * Falls back to the link text when links are disabled, in text-only
     * mode, when there is no usable href, or for `javascript:` hrefs.
     */
    async convertLink(element, context) {
        const text = await this.getTextContent(element, context);

        if (!this.options.preserveLinks || this.options.textOnly) {
            return text;
        }

        const href = element.getAttribute('href');
        if (!href || /^\s*javascript:/i.test(href)) {
            return text;
        }

        const absoluteUrl = this.makeAbsoluteUrl(href);
        if (!text || !absoluteUrl) {
            return text;
        }

        const title = element.getAttribute('title');
        if (title) {
            return `[${text}](${absoluteUrl} "${this.escapeTitle(title)}")`;
        }
        return `[${text}](${absoluteUrl})`;
    }

    /**
     * Escape double quotes so a title cannot terminate its own quoting.
     */
    escapeTitle(title) {
        return title.replace(/"/g, '\\"');
    }

    /**
     * Convert an image to `![alt](url "title")`.
     * In text-only mode returns `[Image: alt]` (or '' without alt text);
     * returns '' when images are disabled or there is no src.
     */
    async convertImage(element, context) {
        if (this.options.textOnly) {
            const altText = element.getAttribute('alt');
            return altText ? `[Image: ${altText}]` : '';
        }
        if (!this.options.includeImages) {
            return '';
        }

        const src = element.getAttribute('src');
        if (!src) {
            return '';
        }

        // Brackets in the alt text would otherwise end the image label early.
        const alt = (element.getAttribute('alt') || '').replace(/[\[\]]/g, '\\$&');
        const title = element.getAttribute('title');
        const absoluteUrl = this.makeAbsoluteUrl(src);

        if (title) {
            return `![${alt}](${absoluteUrl} "${this.escapeTitle(title)}")`;
        }
        return `![${alt}](${absoluteUrl})`;
    }

    /**
     * Convert a UL/OL. Only direct LI children are emitted.
     *
     * A top-level list ends with a newline; a nested list is surrounded by
     * newlines so it starts on its own line below the parent item's text.
     *
     * @param {Element} element
     * @param {'ul'|'ol'} type
     * @param {object} context
     * @returns {Promise<string>}
     */
    async convertList(element, type, context) {
        const oldDepth = context.listDepth;
        context.listDepth = oldDepth + 1;

        const items = [];
        try {
            for (const child of element.children) {
                if (child.tagName !== 'LI') {
                    continue;
                }
                const markdown = await this.convertListItem(child, { ...context, listType: type });
                if (markdown) {
                    items.push(markdown);
                }
            }
        } finally {
            // Restore the depth even if a child conversion throws.
            context.listDepth = oldDepth;
        }

        if (items.length === 0) {
            return '';
        }

        const body = items.join('\n');
        return oldDepth === 0 ? body + '\n' : '\n' + body + '\n';
    }

    /**
     * Convert one list item using `context.listDepth` for indentation and
     * `context.listType` for the marker (`1.` for 'ol', `-` otherwise).
     */
    async convertListItem(element, context) {
        // Four spaces per level nests correctly under both "- " and "1. ".
        const indent = '    '.repeat(Math.max(0, context.listDepth - 1));
        const bullet = context.listType === 'ol' ? '1.' : '-';
        const content = (await this.processChildren(element, context)).trim();

        return `${indent}${bullet} ${content}`;
    }

    /**
     * Convert a table to a pipe table.
     *
     * Rows come from THEAD, TBODY, TFOOT and direct TR children. The
     * delimiter row is always placed after the first row, because a pipe
     * table is only recognised when a header row is followed by one.
     * Falls back to convertTableToText() when tables are disabled or in
     * text-only mode.
     */
    async convertTable(element, context) {
        if (!this.options.preserveTables || this.options.textOnly) {
            return await this.convertTableToText(element, context);
        }

        const headerRows = [];
        const bodyRows = [];
        const footerRows = [];

        const collect = async (section, target) => {
            for (const row of section.children) {
                if (row.tagName === 'TR') {
                    target.push(await this.processTableRow(row, context));
                }
            }
        };

        for (const child of element.children) {
            if (child.tagName === 'THEAD') {
                await collect(child, headerRows);
            } else if (child.tagName === 'TBODY') {
                await collect(child, bodyRows);
            } else if (child.tagName === 'TFOOT') {
                await collect(child, footerRows);
            } else if (child.tagName === 'TR') {
                bodyRows.push(await this.processTableRow(child, context));
            }
        }

        const allRows = [...headerRows, ...bodyRows, ...footerRows];
        const maxCols = allRows.reduce((max, row) => Math.max(max, row.length), 0);
        if (maxCols === 0) {
            return '';
        }

        const formatRow = (cells) => '| ' + cells.join(' | ') + ' |';
        const lines = allRows.map((row) => formatRow(this.padTableRow(row, maxCols)));
        lines.splice(1, 0, formatRow(Array(maxCols).fill('---')));

        return lines.join('\n') + '\n';
    }

    /**
     * @param {Element} row - A TR element.
     * @param {object} context
     * @returns {Promise<string[]>} Trimmed, single-line text of each TD/TH child.
     */
    async processTableRow(row, context) {
        const cells = [];

        for (const cell of row.children) {
            if (cell.tagName === 'TD' || cell.tagName === 'TH') {
                const content = (await this.getTextContent(cell, context)).trim();
                // A line break inside a cell would end the table row.
                cells.push(content.replace(/\s*\n\s*/g, ' '));
            }
        }

        return cells;
    }

    /**
     * Convert a standalone TR: space-joined cells in text-only mode,
     * otherwise a single pipe-delimited row.
     */
    async convertTableRow(element, context) {
        const cells = await this.processTableRow(element, context);
        if (this.options.textOnly) {
            return cells.join(' ');
        }
        return '| ' + cells.join(' | ') + ' |';
    }

    /**
     * @param {string[]} row
     * @param {number} targetLength
     * @returns {string[]} Copy of `row` padded with '' up to `targetLength`.
     */
    padTableRow(row, targetLength) {
        const padded = [...row];
        while (padded.length < targetLength) {
            padded.push('');
        }
        return padded;
    }

    /**
     * Plain-text table fallback: one line per row, non-empty cells joined
     * with a space.
     */
    async convertTableToText(element, context) {
        const lines = [];

        for (const row of element.querySelectorAll('tr')) {
            const cellTexts = [];

            for (const cell of row.querySelectorAll('td, th')) {
                const text = (await this.getTextContent(cell, context)).trim();
                if (text) {
                    cellTexts.push(text);
                }
            }

            if (cellTexts.length > 0) {
                lines.push(cellTexts.join(' '));
            }
        }

        return lines.join('\n');
    }

    async convertBlockquote(element, context) {
        const lines = (await this.processChildren(element, context)).trim().split('\n');
        return lines.map((line) => '> ' + line).join('\n') + '\n';
    }

    /**
     * Length of the longest run of backticks in `text` (0 when there is none).
     */
    longestBacktickRun(text) {
        const runs = text.match(/`+/g) || [];
        return runs.reduce((max, run) => Math.max(max, run.length), 0);
    }

    /**
     * Convert a PRE (or code-like DIV) to a fenced code block.
     *
     * The raw textContent is used, so no escaping is applied. The language
     * is taken from a `language-*` / `lang-*` class on an inner CODE element.
     * The fence is made longer than any backtick run inside the content.
     */
    async convertPreformatted(element, context) {
        const codeElement = element.querySelector('code');
        let language = '';
        let content;

        if (codeElement) {
            const className = typeof codeElement.className === 'string' ? codeElement.className : '';
            const langMatch = className.match(/\blang(?:uage)?-([\w+#-]+)/);
            if (langMatch) {
                language = langMatch[1];
            }
            content = codeElement.textContent;
        } else {
            content = element.textContent;
        }

        // Drop one trailing newline so the block does not end with an empty line.
        content = content.replace(/\n$/, '');

        const fence = '`'.repeat(Math.max(3, this.longestBacktickRun(content) + 1));
        return fence + language + '\n' + content + '\n' + fence + '\n';
    }

    /**
     * Convert a CODE element. Inside a PRE the raw text is returned (the
     * PRE handler builds the fence); otherwise an inline code span is built
     * whose delimiter is longer than any backtick run in the content.
     */
    async convertCode(element, context) {
        const content = element.textContent;

        if (element.parentElement && element.parentElement.tagName === 'PRE') {
            return content;
        }

        const longest = this.longestBacktickRun(content);
        if (longest === 0) {
            return '`' + content + '`';
        }

        // Padding spaces keep content that starts/ends with a backtick intact.
        const delimiter = '`'.repeat(longest + 1);
        return delimiter + ' ' + content + ' ' + delimiter;
    }

    /**
     * DIVs whose class contains 'code-block' or 'highlight' are treated as
     * code blocks; any other DIV is a plain container.
     */
    async convertDiv(element, context) {
        const className = typeof element.className === 'string' ? element.className : '';
        if (className.includes('code-block') || className.includes('highlight')) {
            return await this.convertPreformatted(element, context);
        }
        return await this.convertContainer(element, context);
    }

    /**
     * SPANs whose class contains 'code' (which also covers 'inline-code')
     * become inline code; any other SPAN is transparent.
     */
    async convertSpan(element, context) {
        const className = typeof element.className === 'string' ? element.className : '';
        if (className.includes('code')) {
            return await this.convertCode(element, context);
        }
        return await this.processChildren(element, context);
    }

    async convertArticle(element, context) {
        return await this.convertContainer(element, context);
    }

    async convertSection(element, context) {
        return await this.convertContainer(element, context);
    }

    async convertFigure(element, context) {
        return await this.convertContainer(element, context);
    }

    async convertFigCaption(element, context) {
        const caption = (await this.getTextContent(element, context)).trim();
        return caption ? '\n*' + caption + '*\n' : '';
    }

    /**
     * Convert a VIDEO to a link, using the poster as a linked image when
     * one is set. Text-only mode yields `[Video: title]`. Returns '' when
     * the element has no src attribute.
     */
    async convertVideo(element, context) {
        const title = element.getAttribute('title') || 'Video';

        if (this.options.textOnly) {
            return `[Video: ${title}]`;
        }

        const src = element.getAttribute('src');
        if (!src) {
            return '';
        }

        const absoluteSrc = this.makeAbsoluteUrl(src);
        const poster = element.getAttribute('poster');
        if (poster) {
            return `[![${title}](${this.makeAbsoluteUrl(poster)})](${absoluteSrc})`;
        }
        return `[${title}](${absoluteSrc})`;
    }

    /**
     * Convert an IFRAME to a link. YouTube and Vimeo sources are labelled
     * as video; everything else as generic embedded content.
     */
    async convertIframe(element, context) {
        const title = element.getAttribute('title') || 'Embedded content';
        const src = element.getAttribute('src') || '';
        const isVideo = src.includes('youtube.com') ||
            src.includes('youtu.be') ||
            src.includes('vimeo.com');

        if (this.options.textOnly) {
            return isVideo ? `[Video: ${title}]` : `[Embedded: ${title}]`;
        }

        if (!src) {
            return '';
        }

        const absoluteSrc = this.makeAbsoluteUrl(src);
        return isVideo ? `[▶️ ${title}](${absoluteSrc})` : `[${title}](${absoluteSrc})`;
    }

    async convertDefinitionList(element, context) {
        return await this.processChildren(element, context) + '\n';
    }

    async convertDefinitionTerm(element, context) {
        const term = await this.getTextContent(element, context);
        return '**' + term + '**\n';
    }

    async convertDefinitionDescription(element, context) {
        const description = await this.processChildren(element, context);
        return ': ' + description + '\n';
    }

    /**
     * Content of an element as Markdown: raw textContent inside code
     * contexts, otherwise the converted children.
     */
    async getTextContent(element, context) {
        if (context.inCode) {
            return element.textContent;
        }
        return await this.processChildren(element, context);
    }

    /**
     * Resolve a URL against the current document.
     *
     * Absolute http(s) URLs are returned as given. Everything else is
     * resolved with the URL constructor, which handles root-relative,
     * path-relative, protocol-relative, query-only and fragment-only
     * references and leaves other schemes (mailto:, data:, tel:) intact.
     *
     * @param {string} url
     * @returns {string} The resolved URL, '' for empty input, or the trimmed
     *   input when it cannot be resolved.
     */
    makeAbsoluteUrl(url) {
        if (!url) {
            return '';
        }

        const trimmed = String(url).trim();
        if (/^https?:\/\//i.test(trimmed)) {
            return trimmed;
        }

        try {
            const base = (typeof document !== 'undefined' && document.baseURI) ||
                window.location.href;
            return new URL(trimmed, base).href;
        } catch (e) {
            // Best effort: an unresolvable reference is emitted unchanged
            // rather than aborting the whole conversion.
            return trimmed;
        }
    }

    /**
     * Whether the element's computed style hides it (display: none,
     * visibility: hidden, or opacity 0).
     */
    isHidden(element) {
        const style = window.getComputedStyle(element);
        return style.display === 'none' ||
            style.visibility === 'hidden' ||
            style.opacity === '0';
    }

    /**
     * @returns {string} One `[n]: url` line per entry in context.references.
     */
    generateReferences() {
        return this.conversionContext.references
            .map((ref, index) => `[${index + 1}]: ${ref.url}`)
            .join('\n');
    }

    /**
     * Final cleanup of the generated Markdown.
     *
     * Fenced code blocks that start at the beginning of a line are passed
     * through untouched; the cleanup rules only apply to the text around them.
     *
     * @param {string} markdown
     * @returns {string}
     */
    postProcess(markdown) {
        let text = markdown;
        if (this.options.textOnly) {
            text = this.postProcessTextOnly(text);
        }

        // Opening fence, optional info string, body, closing fence of equal length.
        const fencePattern = /^(`{3,})[^\n`]*\n[\s\S]*?^\1[ \t]*$/gm;
        let output = '';
        let cursor = 0;
        let match = fencePattern.exec(text);
        while (match !== null) {
            output += this.cleanProse(text.slice(cursor, match.index)) + match[0];
            cursor = match.index + match[0].length;
            match = fencePattern.exec(text);
        }
        output += this.cleanProse(text.slice(cursor));

        return output.trim();
    }

    /**
     * Spacing cleanup for Markdown outside fenced code blocks.
     */
    cleanProse(segment) {
        let text = segment;

        // Remove spaces before punctuation, but keep the space in front of
        // an image ("![") so it is not glued to the preceding word.
        text = text.replace(/ +([.,;:?]|!(?!\[))/g, '$1');

        // Blank line before and after headings.
        text = text.replace(/\n(#{1,6} )/g, '\n\n$1');
        text = text.replace(/^(#{1,6} .+)\n(?![\n#])/gm, '$1\n\n');

        // Collapse after the heading rules, which may themselves add newlines.
        text = text.replace(/\n{3,}/g, '\n\n');

        // No blank line in front of top-level list items.
        text = text.replace(/\n\n(-|\d+\.) /g, '\n$1 ');

        return text;
    }

    /**
     * Text-only cleanup aimed at numbered listings (e.g. link aggregators).
     *
     * Lines of the form "N. title (domain) rest" become a bold numbered
     * title; following lines that look like metadata (points, by, ago,
     * hide, comments) are indented under the current item.
     *
     * @param {string} markdown
     * @returns {string}
     */
    postProcessTextOnly(markdown) {
        const numberPattern = /^(\d+)\.\s*(.+)$/;
        const domainPattern = /^(.+?)\s*\(([^)]+)\)\s*(.*)$/;
        const metadataPattern = /\b(points?|by|ago|hide|comments?)\b/i;

        const processedLines = [];
        let inMetadata = false;
        let currentItem = null;

        for (const rawLine of markdown.split('\n')) {
            const line = rawLine.trim();
            if (!line) {
                processedLines.push('');
                continue;
            }

            const numberMatch = line.match(numberPattern);

            if (numberMatch) {
                // Start of a new numbered item.
                inMetadata = false;
                currentItem = numberMatch[1];
                const content = numberMatch[2];
                const domainMatch = content.match(domainPattern);

                if (domainMatch) {
                    const [, title, domain, rest] = domainMatch;
                    processedLines.push(`${currentItem}. **${title.trim()}** (${domain})`);
                    if (rest.trim()) {
                        processedLines.push(` ${rest.trim()}`);
                        inMetadata = true;
                    }
                } else {
                    processedLines.push(`${currentItem}. **${content}**`);
                }
            } else if (currentItem && metadataPattern.test(line)) {
                // Looks like metadata for the current item.
                const cleanedLine = line
                    .replace(/\s+/g, ' ')
                    .replace(/\s*\|\s*/g, ' | ')
                    .trim();
                processedLines.push(` ${cleanedLine}`);
                inMetadata = true;
            } else if (inMetadata && line.length < 100) {
                // Short line directly after metadata: treat as more metadata.
                processedLines.push(` ${line}`);
            } else {
                inMetadata = false;
                processedLines.push(line);
            }
        }

        let result = processedLines.join('\n');
        result = result.replace(/\n{3,}/g, '\n\n');

        // Blank line after a numbered item unless an indented line follows.
        result = result.replace(/^(\d+\..+)$\n^(?!\s)/gm, '$1\n\n');

        return result;
    }
}