feat: add Script Builder to Chrome Extension and reorganize LLM context files

This commit introduces significant enhancements to the Crawl4AI ecosystem:

  Chrome Extension - Script Builder (Alpha):
  - Add recording functionality to capture user interactions (clicks, typing, scrolling)
  - Implement smart event grouping for cleaner script generation
  - Support export to both JavaScript and C4A script formats
  - Add timeline view for visualizing and editing recorded actions
  - Include wait commands (time-based and element-based)
  - Add saved flows functionality for reusing automation scripts
  - Update UI with consistent dark terminal theme (Dank Mono font, green/pink accents)
  - Release new extension versions: v1.1.0, v1.2.0, v1.2.1

  LLM Context Builder Improvements:
  - Reorganize context files from llmtxt/ to llm.txt/ with better structure
  - Separate diagram templates from text content (diagrams/ and txt/ subdirectories)
  - Add comprehensive context files for all major Crawl4AI components
  - Improve file naming convention for better discoverability

  Documentation Updates:
  - Update apps index page to match main documentation theme
  - Standardize color scheme: "Available" tags use primary color (#50ffff)
  - Change "Coming Soon" tags to dark gray for better visual hierarchy
  - Add interactive two-column layout for extension landing page
  - Include code examples for both Schema Builder and Script Builder features

  Technical Improvements:
  - Enhance event capture mechanism with better element selection
  - Add support for contenteditable elements and complex form interactions
  - Implement proper scroll event handling for both window and element scrolling
  - Add meta key support for keyboard shortcuts
  - Improve selector generation for more reliable element targeting

  The Script Builder is released as Alpha, acknowledging potential bugs while providing
  early access to this powerful automation recording feature.
This commit is contained in:
UncleCode
2025-06-08 22:02:12 +08:00
parent 926592649e
commit 40640badad
72 changed files with 28600 additions and 100986 deletions

View File

@@ -8,7 +8,15 @@
"Bash(mkdir:*)",
"Bash(cp:*)",
"Bash(rm:*)",
"Bash(true)"
"Bash(true)",
"Bash(./package-extension.sh:*)",
"Bash(find:*)",
"Bash(chmod:*)",
"Bash(rg:*)",
"Bash(/Users/unclecode/.npm-global/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -A 5 -B 5 \"Script Builder\" docs/md_v2/apps/crawl4ai-assistant/)",
"Bash(/Users/unclecode/.npm-global/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -A 30 \"generateCode\\(events, format\\)\" docs/md_v2/apps/crawl4ai-assistant/content/content.js)",
"Bash(/Users/unclecode/.npm-global/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg \"<style>\" docs/md_v2/apps/crawl4ai-assistant/index.html -A 5)",
"Bash(git checkout:*)"
]
},
"enableAllProjectMcpServers": false

View File

@@ -384,16 +384,20 @@ code {
.coming-features {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(350px, 1fr));
gap: 1.5rem;
margin-top: 2rem;
}
.coming-feature {
background: var(--bg-tertiary);
padding: 2rem;
padding: 1.5rem;
border-radius: 12px;
border: 1px solid var(--border-color);
transition: all 0.3s ease;
height: 100%;
display: flex;
flex-direction: column;
}
.coming-feature:hover {
@@ -429,16 +433,18 @@ code {
color: var(--text-secondary);
margin-bottom: 1rem;
line-height: 1.6;
flex-grow: 1;
}
.feature-preview {
background: var(--bg-secondary);
padding: 1rem;
padding: 0.75rem 1rem;
border-radius: 6px;
font-family: var(--font-code);
font-size: 0.875rem;
font-size: 0.8125rem;
color: var(--text-accent);
border: 1px solid var(--border-color);
margin-top: auto;
}
.stay-tuned {
@@ -537,3 +543,487 @@ code {
font-size: 1.5rem;
}
}
/* Interactive Tools Section */
.interactive-tools {
margin: 3rem 0;
}
.interactive-tools h2 {
font-size: 2rem;
margin-bottom: 2rem;
color: var(--text-primary);
}
.tools-container {
display: grid;
grid-template-columns: 300px 1fr;
gap: 2rem;
min-height: 400px;
}
/* Tool Selector Panel */
.tools-panel {
display: flex;
flex-direction: column;
gap: 1rem;
}
.tool-selector {
background: var(--bg-tertiary);
border: 1px solid var(--border-color);
border-radius: 12px;
padding: 1.5rem;
cursor: pointer;
transition: all 0.3s ease;
display: flex;
align-items: center;
gap: 1rem;
position: relative;
}
.tool-selector:hover {
border-color: var(--primary-green);
transform: translateX(4px);
}
.tool-selector.active {
background: var(--bg-secondary);
border-color: var(--primary-green);
box-shadow: 0 0 20px rgba(15, 187, 170, 0.3);
}
.tool-icon {
font-size: 2.5rem;
flex-shrink: 0;
}
.tool-info h3 {
margin: 0;
font-size: 1.125rem;
color: var(--text-primary);
}
.tool-info p {
margin: 0.25rem 0 0;
font-size: 0.875rem;
color: var(--text-secondary);
}
.tool-status {
position: absolute;
top: 1rem;
right: 1rem;
font-size: 0.75rem;
padding: 0.25rem 0.75rem;
border-radius: 20px;
background: var(--primary-green);
color: var(--bg-dark);
font-weight: 600;
}
.tool-status.alpha {
background: var(--primary-pink);
}
/* Tool Details Panel */
.tool-details {
background: var(--bg-secondary);
border: 1px solid var(--border-color);
border-radius: 12px;
padding: 2rem;
position: relative;
overflow: hidden;
}
.tool-content {
display: none;
animation: fadeIn 0.4s ease;
}
.tool-content.active {
display: block;
}
@keyframes fadeIn {
from {
opacity: 0;
transform: translateY(10px);
}
to {
opacity: 1;
transform: translateY(0);
}
}
.tool-header {
margin-bottom: 2rem;
}
.tool-header h3 {
font-size: 1.75rem;
margin: 0;
color: var(--text-primary);
}
.tool-tagline {
color: var(--text-secondary);
font-size: 1rem;
margin-top: 0.5rem;
display: block;
}
/* Tool Steps */
.tool-steps {
display: flex;
flex-direction: column;
gap: 1.5rem;
margin-bottom: 2rem;
}
.step-item {
display: flex;
gap: 1.5rem;
align-items: flex-start;
}
.step-number {
background: var(--primary-green);
color: var(--bg-dark);
width: 40px;
height: 40px;
border-radius: 50%;
display: flex;
align-items: center;
justify-content: center;
font-weight: bold;
flex-shrink: 0;
}
.step-content h4 {
margin: 0 0 0.5rem;
color: var(--text-primary);
}
.step-content p {
margin: 0 0 0.5rem;
color: var(--text-secondary);
font-size: 0.875rem;
}
.step-visual {
display: flex;
align-items: center;
gap: 0.5rem;
font-size: 0.875rem;
color: var(--text-secondary);
}
.highlight-green {
color: var(--primary-green);
font-size: 1.25rem;
}
.highlight-pink {
color: var(--primary-pink);
font-size: 1.25rem;
}
.highlight-accent {
color: var(--primary-green);
font-size: 1.25rem;
}
.recording-dot {
color: #ff3c74;
font-size: 1.25rem;
animation: pulse 1.5s ease-in-out infinite;
}
@keyframes pulse {
0%, 100% { opacity: 1; }
50% { opacity: 0.5; }
}
.action-icon {
font-size: 1.25rem;
margin: 0 0.25rem;
}
/* Tool Features */
.tool-features {
display: flex;
flex-wrap: wrap;
gap: 0.75rem;
margin-top: 2rem;
}
.feature-tag {
background: var(--bg-tertiary);
border: 1px solid var(--border-color);
padding: 0.5rem 1rem;
border-radius: 20px;
font-size: 0.875rem;
color: var(--text-secondary);
}
.feature-tag.alpha-tag {
border-color: var(--primary-pink);
color: var(--primary-pink);
}
/* Code Showcase Section */
.code-showcase {
margin: 3rem 0;
}
.code-showcase h2 {
font-size: 2rem;
margin-bottom: 2rem;
color: var(--text-primary);
}
/* Code Tabs */
.code-tabs {
display: flex;
gap: 1rem;
margin-bottom: 2rem;
}
.code-tab {
background: var(--bg-tertiary);
border: 1px solid var(--border-color);
padding: 0.75rem 1.5rem;
border-radius: 8px;
font-size: 1rem;
color: var(--text-secondary);
cursor: pointer;
transition: all 0.3s ease;
font-family: var(--font-primary);
}
.code-tab:hover {
border-color: var(--primary-green);
color: var(--text-primary);
}
.code-tab.active {
background: var(--primary-green);
color: var(--bg-dark);
border-color: var(--primary-green);
font-weight: 600;
}
/* Code Examples */
.code-examples {
position: relative;
min-height: 500px;
}
.code-example {
position: absolute;
width: 100%;
opacity: 0;
visibility: hidden;
transition: opacity 0.4s ease, visibility 0.4s ease;
}
.code-example.active {
opacity: 1;
visibility: visible;
position: relative;
}
/* Copy Button */
.copy-button {
position: absolute;
right: 1rem;
top: 50%;
transform: translateY(-50%);
background: var(--bg-tertiary);
border: 1px solid var(--border-color);
color: var(--text-secondary);
padding: 0.25rem 0.75rem;
border-radius: 4px;
font-size: 0.75rem;
cursor: pointer;
transition: all 0.2s ease;
font-family: var(--font-primary);
}
.copy-button:hover {
background: var(--primary-green);
color: var(--bg-dark);
border-color: var(--primary-green);
}
.copy-button.copied {
background: var(--primary-green);
color: var(--bg-dark);
}
/* Responsive Updates */
@media (max-width: 768px) {
.tools-container {
grid-template-columns: 1fr;
}
.tools-panel {
flex-direction: row;
overflow-x: auto;
padding-bottom: 0.5rem;
}
.tool-selector {
min-width: 250px;
}
.code-tabs {
flex-wrap: wrap;
}
}
/* Script Builder Section */
.script-builder-section {
margin: 4rem 0;
}
.script-builder-section h2 {
font-size: 2rem;
margin-bottom: 2rem;
color: var(--text-primary);
}
.script-builder-section h2 span {
color: var(--primary-pink);
font-size: 0.875rem;
font-weight: normal;
margin-left: 0.5rem;
}
.script-features-grid {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
gap: 1.5rem;
margin-top: 2rem;
}
.script-feature {
background: var(--bg-tertiary);
padding: 1.5rem;
border-radius: 8px;
border: 1px solid var(--border-color);
text-align: center;
transition: all 0.2s ease;
}
.script-feature:hover {
border-color: var(--primary-green);
transform: translateY(-2px);
box-shadow: 0 4px 16px rgba(15, 187, 170, 0.2);
}
.script-feature .feature-icon {
font-size: 2.5rem;
margin-bottom: 1rem;
display: block;
}
.script-feature h4 {
font-size: 1.125rem;
margin-bottom: 0.5rem;
color: var(--text-primary);
}
.script-feature p {
font-size: 0.875rem;
color: var(--text-secondary);
}
.script-workflow {
margin-top: 2rem;
}
.workflow-step {
display: flex;
gap: 1.5rem;
margin-bottom: 2rem;
align-items: flex-start;
}
.workflow-step .step-number {
background: var(--primary-pink);
color: var(--bg-dark);
width: 40px;
height: 40px;
border-radius: 50%;
display: flex;
align-items: center;
justify-content: center;
font-weight: bold;
flex-shrink: 0;
}
.workflow-step .step-content h4 {
margin-bottom: 0.5rem;
color: var(--text-primary);
}
.workflow-step .step-content p {
color: var(--text-secondary);
}
.action-types {
display: grid;
grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
gap: 1rem;
margin-top: 1.5rem;
}
.action-type {
background: var(--bg-tertiary);
padding: 1rem;
border-radius: 6px;
font-size: 0.875rem;
color: var(--text-secondary);
border: 1px solid var(--border-color);
font-family: var(--font-code);
}
.action-type code {
color: var(--primary-green);
font-weight: 600;
margin-right: 0.5rem;
}
.alpha-note {
background: rgba(243, 128, 245, 0.1);
border: 1px solid var(--primary-pink);
border-radius: 8px;
padding: 1.5rem;
margin-top: 2rem;
color: var(--text-primary);
}
.alpha-note strong {
color: var(--primary-pink);
}
@media (max-width: 768px) {
.script-features-grid {
grid-template-columns: 1fr;
}
.workflow-step {
flex-direction: column;
gap: 1rem;
}
.action-types {
grid-template-columns: 1fr;
}
.coming-soon-section h2 {
font-size: 1.5rem;
}
}

View File

@@ -2,7 +2,7 @@
// Handle messages from content script
chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
if (message.action === 'downloadCode') {
if (message.action === 'downloadCode' || message.action === 'downloadScript') {
try {
// Create a data URL for the Python code
const dataUrl = 'data:text/plain;charset=utf-8,' + encodeURIComponent(message.code);

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -43,23 +43,23 @@
<span class="terminal-title">About Crawl4AI Assistant</span>
</div>
<div class="terminal-content">
<p>Transform any website into structured data with just a few clicks! The Crawl4AI Assistant Chrome Extension lets you visually select elements on any webpage and automatically generates Python code for web scraping.</p>
<p>Transform any website into structured data with just a few clicks! The Crawl4AI Assistant Chrome Extension provides two powerful tools for web scraping and automation.</p>
<div class="features-grid">
<div class="feature-card">
<span class="feature-icon">🎯</span>
<h3>Visual Selection</h3>
<p>Click on any element to select it - no CSS selectors needed</p>
<h3>Schema Builder</h3>
<p>Click to select elements and build extraction schemas visually</p>
</div>
<div class="feature-card">
<span class="feature-icon">📊</span>
<h3>Schema Builder</h3>
<p>Build extraction schemas by clicking on container and field elements</p>
<span class="feature-icon">🔴</span>
<h3>Script Builder <span style="color: #f380f5; font-size: 0.75rem;">(Alpha)</span></h3>
<p>Record browser actions to create automation scripts</p>
</div>
<div class="feature-card">
<span class="feature-icon">🐍</span>
<h3>Python Code</h3>
<p>Get production-ready Crawl4AI code with LLM extraction</p>
<p>Get production-ready Crawl4AI code instantly</p>
</div>
<div class="feature-card">
<span class="feature-icon">🎨</span>
@@ -85,9 +85,9 @@
<div class="step-content">
<h4>Download the Extension</h4>
<p>Get the latest release from GitHub or use the button below</p>
<a href="crawl4ai-assistant-v1.0.1.zip" class="download-button" download>
<a href="crawl4ai-assistant-v1.2.1.zip" class="download-button" download>
<span class="button-icon"></span>
Download Extension (v1.0.1)
Download Extension (v1.2.1)
</a>
</div>
</div>
@@ -110,64 +110,152 @@
</div>
</section>
<!-- Usage Guide -->
<section class="usage-section">
<h2>How to Use</h2>
<div class="terminal-window">
<div class="terminal-header">
<span class="terminal-title">Step-by-Step Guide</span>
<!-- Interactive Tools Section -->
<section class="interactive-tools">
<h2>Explore Our Tools</h2>
<div class="tools-container">
<!-- Left Panel - Tool Selector -->
<div class="tools-panel">
<div class="tool-selector active" data-tool="schema-builder">
<div class="tool-icon">📊</div>
<div class="tool-info">
<h3>Schema Builder</h3>
<p>Visual data extraction</p>
</div>
<div class="terminal-content">
<div class="usage-flow">
<div class="usage-step">
<div class="usage-header">
<span class="usage-icon">1</span>
<h4>Start Schema Builder</h4>
</div>
<p>Click the extension icon and select "Schema Builder" to begin</p>
<div class="tool-status">Available</div>
</div>
<div class="usage-step">
<div class="usage-header">
<span class="usage-icon">2</span>
<div class="tool-selector" data-tool="script-builder">
<div class="tool-icon">🔴</div>
<div class="tool-info">
<h3>Script Builder</h3>
<p>Browser automation</p>
</div>
<div class="tool-status alpha">Alpha</div>
</div>
</div>
<!-- Right Panel - Tool Details -->
<div class="tool-details">
<!-- Schema Builder Details -->
<div class="tool-content active" id="schema-builder">
<div class="tool-header">
<h3>📊 Schema Builder</h3>
<span class="tool-tagline">Click to extract data visually</span>
</div>
<div class="tool-steps">
<div class="step-item">
<div class="step-number">1</div>
<div class="step-content">
<h4>Select Container</h4>
<p>Click on any repeating element like product cards or articles</p>
<div class="step-visual">
<span class="highlight-green"></span> Elements highlighted in green
</div>
<p>Click on a container element (e.g., product card, article, listing)</p>
<div class="code-snippet">
<span class="comment"># Container will be highlighted in green</span>
</div>
</div>
<div class="usage-step">
<div class="usage-header">
<span class="usage-icon">3</span>
<h4>Select Fields</h4>
<div class="step-item">
<div class="step-number">2</div>
<div class="step-content">
<h4>Mark Fields</h4>
<p>Click on data fields inside the container</p>
<div class="step-visual">
<span class="highlight-pink"></span> Fields highlighted in pink
</div>
<p>Click on individual fields inside the container and name them</p>
<div class="code-snippet">
<span class="comment"># Fields will be highlighted in pink</span>
<span class="comment"># Examples: title, price, description, image</span>
</div>
</div>
<div class="usage-step">
<div class="usage-header">
<span class="usage-icon">4</span>
<h4>Generate Code</h4>
<div class="step-item">
<div class="step-number">3</div>
<div class="step-content">
<h4>Generate & Extract</h4>
<p>Get your CSS selectors and Python code instantly</p>
<div class="step-visual">
<span class="highlight-accent"></span> Ready to use code
</div>
<p>Click "Stop & Generate" to create your Python extraction code</p>
</div>
</div>
</div>
<div class="tool-features">
<div class="feature-tag">No CSS knowledge needed</div>
<div class="feature-tag">Smart selector generation</div>
<div class="feature-tag">LLM-ready schemas</div>
</div>
</div>
<!-- Script Builder Details -->
<div class="tool-content" id="script-builder">
<div class="tool-header">
<h3>🔴 Script Builder</h3>
<span class="tool-tagline">Record actions, generate automation</span>
</div>
<div class="tool-steps">
<div class="step-item">
<div class="step-number">1</div>
<div class="step-content">
<h4>Hit Record</h4>
<p>Start capturing your browser interactions</p>
<div class="step-visual">
<span class="recording-dot"></span> Recording indicator
</div>
</div>
</div>
<div class="step-item">
<div class="step-number">2</div>
<div class="step-content">
<h4>Interact Naturally</h4>
<p>Click, type, scroll - everything is captured</p>
<div class="step-visual">
<span class="action-icon">🖱️</span> <span class="action-icon">⌨️</span> <span class="action-icon">📜</span>
</div>
</div>
</div>
<div class="step-item">
<div class="step-number">3</div>
<div class="step-content">
<h4>Export Script</h4>
<p>Get JavaScript for Crawl4AI's js_code parameter</p>
<div class="step-visual">
<span class="highlight-accent">📝</span> Automation ready
</div>
</div>
</div>
</div>
<div class="tool-features">
<div class="feature-tag">Smart action grouping</div>
<div class="feature-tag">Wait detection</div>
<div class="feature-tag">Keyboard shortcuts</div>
<div class="feature-tag alpha-tag">Alpha version</div>
</div>
</div>
</div>
</div>
</section>
<!-- Generated Code Example -->
<section class="code-section">
<h2>Generated Code Example</h2>
<!-- Interactive Code Examples -->
<section class="code-showcase">
<h2>See the Generated Code</h2>
<div class="code-tabs">
<button class="code-tab active" data-example="schema">📊 Schema Builder</button>
<button class="code-tab" data-example="script">🔴 Script Builder</button>
</div>
<div class="code-examples">
<!-- Schema Builder Code -->
<div class="code-example active" id="code-schema">
<div class="terminal-window">
<div class="terminal-header">
<span class="terminal-title">example_extraction.py</span>
<span class="terminal-title">schema_extraction.py</span>
<button class="copy-button" data-code="schema">Copy</button>
</div>
<div class="terminal-content">
<pre><code><span class="keyword">import</span> asyncio
@@ -191,26 +279,17 @@
<span class="string">"selector"</span>: <span class="string">"span.price"</span>,
<span class="string">"type"</span>: <span class="string">"text"</span>
},
{
<span class="string">"name"</span>: <span class="string">"description"</span>,
<span class="string">"selector"</span>: <span class="string">"p.description"</span>,
<span class="string">"type"</span>: <span class="string">"text"</span>
},
{
<span class="string">"name"</span>: <span class="string">"image"</span>,
<span class="string">"selector"</span>: <span class="string">"img.product-image"</span>,
<span class="string">"selector"</span>: <span class="string">"img.product-img"</span>,
<span class="string">"type"</span>: <span class="string">"attribute"</span>,
<span class="string">"attribute"</span>: <span class="string">"src"</span>
}
]
}
<span class="comment"># Create extraction strategy</span>
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=<span class="keyword">True</span>)
<span class="comment"># Configure the crawler</span>
config = CrawlerRunConfig(
extraction_strategy=extraction_strategy
extraction_strategy=JsonCssExtractionStrategy(schema)
)
<span class="keyword">async</span> <span class="keyword">with</span> AsyncWebCrawler() <span class="keyword">as</span> crawler:
@@ -218,24 +297,76 @@
url=<span class="string">"https://example.com/products"</span>,
config=config
)
<span class="keyword">return</span> json.loads(result.extracted_content)
<span class="comment"># Parse the extracted data</span>
products = json.loads(result.extracted_content)
<span class="keyword">print</span>(<span class="string">f"Extracted {len(products)} products"</span>)
asyncio.run(extract_products())</code></pre>
</div>
</div>
</div>
<span class="comment"># Display first product</span>
<span class="keyword">if</span> products:
<span class="keyword">print</span>(json.dumps(products[0], indent=2))
<!-- Script Builder Code -->
<div class="code-example" id="code-script">
<div class="terminal-window">
<div class="terminal-header">
<span class="terminal-title">automation_script.py</span>
<button class="copy-button" data-code="script">Copy</button>
</div>
<div class="terminal-content">
<pre><code><span class="keyword">import</span> asyncio
<span class="keyword">from</span> crawl4ai <span class="keyword">import</span> AsyncWebCrawler, CrawlerRunConfig
<span class="keyword">return</span> products
<span class="comment"># JavaScript generated from your recorded actions</span>
js_script = <span class="string">"""
// Search for products
document.querySelector('button.search-toggle').click();
await new Promise(r => setTimeout(r, 500));
<span class="comment"># Run the extraction</span>
<span class="keyword">if</span> __name__ == <span class="string">"__main__"</span>:
asyncio.run(extract_products())</code></pre>
// Type search query
const searchInput = document.querySelector('input#search');
searchInput.value = 'wireless headphones';
searchInput.dispatchEvent(new Event('input', {bubbles: true}));
// Submit search
searchInput.dispatchEvent(new KeyboardEvent('keydown', {
key: 'Enter', keyCode: 13, bubbles: true
}));
// Wait for results
await new Promise(r => setTimeout(r, 2000));
// Click first product
document.querySelector('.product-item:first-child').click();
// Wait for product page
await new Promise(r => setTimeout(r, 1000));
// Add to cart
document.querySelector('button.add-to-cart').click();
"""</span>
<span class="keyword">async</span> <span class="keyword">def</span> <span class="function">automate_shopping</span>():
config = CrawlerRunConfig(
js_code=js_script,
wait_for=<span class="string">"css:.cart-confirmation"</span>,
screenshot=<span class="keyword">True</span>
)
<span class="keyword">async</span> <span class="keyword">with</span> AsyncWebCrawler() <span class="keyword">as</span> crawler:
result = <span class="keyword">await</span> crawler.arun(
url=<span class="string">"https://shop.example.com"</span>,
config=config
)
<span class="keyword">print</span>(<span class="string">f"✓ Automation complete: {result.url}"</span>)
<span class="keyword">return</span> result
asyncio.run(automate_shopping())</code></pre>
</div>
</div>
</div>
</div>
</section>
<!-- Coming Soon Section -->
<section class="coming-soon-section">
<h2>Coming Soon: Even More Power</h2>
@@ -279,17 +410,6 @@
<code>🤖 Auto-detect fields • Smart naming • Pattern recognition</code>
</div>
</div>
<div class="coming-feature">
<div class="feature-header">
<span class="feature-badge">Script</span>
<h3>C4A Script Builder</h3>
</div>
<p>Visual automation script builder for complex interactions - fill forms, click buttons, handle pagination, all without writing code.</p>
<div class="feature-preview">
<code>🎯 Visual automation • Record & replay • Export as C4A script</code>
</div>
</div>
</div>
<div class="stay-tuned">
@@ -324,5 +444,61 @@
</footer>
</div>
</div>
<script>
// Tool Selector Interaction
document.querySelectorAll('.tool-selector').forEach(selector => {
selector.addEventListener('click', function() {
// Remove active class from all selectors
document.querySelectorAll('.tool-selector').forEach(s => s.classList.remove('active'));
document.querySelectorAll('.tool-content').forEach(c => c.classList.remove('active'));
// Add active class to clicked selector
this.classList.add('active');
// Show corresponding content
const toolId = this.getAttribute('data-tool');
document.getElementById(toolId).classList.add('active');
});
});
// Code Tab Interaction
document.querySelectorAll('.code-tab').forEach(tab => {
tab.addEventListener('click', function() {
// Remove active class from all tabs
document.querySelectorAll('.code-tab').forEach(t => t.classList.remove('active'));
document.querySelectorAll('.code-example').forEach(e => e.classList.remove('active'));
// Add active class to clicked tab
this.classList.add('active');
// Show corresponding code
const exampleId = this.getAttribute('data-example');
document.getElementById('code-' + exampleId).classList.add('active');
});
});
// Copy Button Functionality
document.querySelectorAll('.copy-button').forEach(button => {
button.addEventListener('click', async function() {
const codeType = this.getAttribute('data-code');
const codeElement = document.getElementById('code-' + codeType).querySelector('pre code');
const codeText = codeElement.textContent;
try {
await navigator.clipboard.writeText(codeText);
this.textContent = 'Copied!';
this.classList.add('copied');
setTimeout(() => {
this.textContent = 'Copy';
this.classList.remove('copied');
}, 2000);
} catch (err) {
console.error('Failed to copy code:', err);
}
});
});
</script>
</body>
</html>

View File

@@ -1,8 +1,8 @@
{
"manifest_version": 3,
"name": "Crawl4AI Assistant",
"version": "1.0.1",
"description": "Visual schema and script builder for Crawl4AI - Build extraction schemas by clicking on elements",
"version": "1.2.1",
"description": "Visual schema and script builder for Crawl4AI - Build extraction schemas and automation scripts by clicking and recording actions",
"permissions": [
"activeTab",
"storage",

View File

@@ -30,11 +30,11 @@
</div>
</button>
<button id="script-mode" class="mode-button script" disabled>
<button id="script-mode" class="mode-button script">
<div class="icon">🎯</div>
<div class="mode-info">
<h3>Script Builder</h3>
<p>Coming soon - Build automation scripts</p>
<h3>Script Builder <span style="color: #ff3c74; font-size: 10px;">(Alpha)</span></h3>
<p>Record actions to build automation scripts</p>
</div>
</button>
</div>

View File

@@ -18,6 +18,10 @@ document.addEventListener('DOMContentLoaded', () => {
startSchemaCapture();
});
document.getElementById('script-mode').addEventListener('click', () => {
startScriptCapture();
});
// Session actions
document.getElementById('generate-code').addEventListener('click', () => {
generateCode();
@@ -62,6 +66,19 @@ function startSchemaCapture() {
});
}
function startScriptCapture() {
  // Ask the content script in the active tab to begin recording actions.
  chrome.tabs.query({ active: true, currentWindow: true }, (tabs) => {
    // Guard: the query can return an empty list (e.g. no focused window).
    if (!tabs.length) return;
    chrome.tabs.sendMessage(tabs[0].id, {
      action: 'startScriptCapture'
    }, (response) => {
      // Reading lastError prevents "Unchecked runtime.lastError" console
      // noise when the tab has no content script (e.g. chrome:// pages).
      if (chrome.runtime.lastError) {
        console.error('Failed to start script capture:', chrome.runtime.lastError.message);
        return;
      }
      if (response && response.success) {
        // Close the popup to let user interact with the page
        window.close();
      }
    });
  });
}
function showActiveSession(stats) {
document.querySelector('.mode-selector').style.display = 'none';
document.getElementById('active-session').classList.remove('hidden');

View File

@@ -11,8 +11,8 @@ Welcome to the Crawl4AI Apps Hub - your gateway to interactive tools and demos t
}
.app-card {
background: var(--md-code-bg-color);
border: 1px solid var(--md-default-fg-color--lightest);
background: #3f3f44;
border: 1px solid #3f3f44;
border-radius: 8px;
padding: 1.5rem;
transition: all 0.3s ease;
@@ -23,7 +23,7 @@ Welcome to the Crawl4AI Apps Hub - your gateway to interactive tools and demos t
.app-card:hover {
transform: translateY(-4px);
box-shadow: 0 8px 16px rgba(0, 0, 0, 0.3);
border-color: var(--md-primary-fg-color);
border-color: #50ffff;
}
.app-card h3 {
@@ -31,36 +31,38 @@ Welcome to the Crawl4AI Apps Hub - your gateway to interactive tools and demos t
display: flex;
align-items: center;
gap: 0.5rem;
color: #e8e9ed;
}
.app-status {
display: inline-block;
padding: 0.2rem 0.6rem;
border-radius: 4px;
padding: 0.25rem 0.75rem;
border-radius: 20px;
font-size: 0.7rem;
font-weight: bold;
font-weight: 600;
text-transform: uppercase;
margin-bottom: 1rem;
}
.status-available {
background: #22c55e;
color: #000;
background: #50ffff;
color: #070708;
}
.status-beta {
background: #f59e0b;
color: #000;
color: #070708;
}
.status-coming-soon {
background: var(--md-default-fg-color--lightest);
color: var(--md-default-bg-color);
background: #2a2a2a;
color: #888;
}
.app-description {
margin: 1rem 0;
line-height: 1.6;
color: #a3abba;
}
.app-features {
@@ -73,13 +75,15 @@ Welcome to the Crawl4AI Apps Hub - your gateway to interactive tools and demos t
padding-left: 1.5rem;
position: relative;
margin-bottom: 0.5rem;
color: #d5cec0;
font-size: 0.9rem;
}
.app-features li:before {
content: "";
content: "";
position: absolute;
left: 0;
color: var(--md-primary-fg-color);
color: #50ffff;
font-weight: bold;
}
@@ -89,35 +93,49 @@ Welcome to the Crawl4AI Apps Hub - your gateway to interactive tools and demos t
.app-btn {
display: inline-block;
padding: 0.8rem 1.5rem;
background: var(--md-primary-fg-color);
color: var(--md-primary-bg-color);
padding: 0.75rem 1.5rem;
background: #50ffff;
color: #070708;
text-decoration: none;
border-radius: 6px;
font-weight: bold;
font-weight: 600;
transition: all 0.2s ease;
font-family: dm, Monaco, monospace;
}
.app-btn:hover {
background: var(--md-primary-fg-color--dark);
background: #09b5a5;
transform: scale(1.05);
color: #070708;
}
.app-btn.disabled {
background: var(--md-default-fg-color--lightest);
background: #2a2a2a;
color: #666;
cursor: not-allowed;
transform: none;
}
.app-btn.disabled:hover {
background: #2a2a2a;
transform: none;
}
.intro-section {
background: var(--md-code-bg-color);
background: #3f3f44;
border-radius: 8px;
padding: 2rem;
margin-bottom: 3rem;
border: 1px solid #3f3f44;
}
.intro-section h2 {
margin-top: 0;
color: #50ffff;
}
.intro-section p {
color: #d5cec0;
}
</style>

View File

@@ -71,33 +71,6 @@
</section>
<section class="builder">
<div class="special-contexts">
<h2>Quick Presets</h2>
<div class="preset-options">
<label class="preset-option">
<input type="radio" name="preset" value="vibe" id="preset-vibe">
<div class="preset-card">
<h3>🎯 Vibe Coding</h3>
<p>Curated context for general AI prompting - perfect for exploring capabilities</p>
</div>
</label>
<label class="preset-option">
<input type="radio" name="preset" value="all" id="preset-all">
<div class="preset-card">
<h3>📚 Complete Library</h3>
<p>Comprehensive context including all components and perspectives</p>
</div>
</label>
<label class="preset-option">
<input type="radio" name="preset" value="custom" id="preset-custom" checked>
<div class="preset-card">
<h3>🔧 Custom Selection</h3>
<p>Choose specific components and context types</p>
</div>
</label>
</div>
</div>
<div class="component-selector" id="component-selector">
<h2>Select Components & Context Types</h2>
<div class="select-all-controls">
@@ -111,9 +84,9 @@
<tr>
<th width="50"></th>
<th>Component</th>
<th class="clickable-header" data-type="memory">Memory</th>
<th class="clickable-header" data-type="reasoning">Reasoning</th>
<th class="clickable-header" data-type="examples">Examples</th>
<th class="clickable-header" data-type="memory">Memory<br><span class="header-subtitle">Full Content</span></th>
<th class="clickable-header" data-type="reasoning">Reasoning<br><span class="header-subtitle">Diagrams</span></th>
<th class="clickable-header" data-type="examples">Examples<br><span class="header-subtitle">Code</span></th>
</tr>
</thead>
<tbody id="components-tbody">
@@ -124,6 +97,10 @@
</div>
<div class="action-area">
<div class="token-summary" id="token-summary">
<span class="token-label">Estimated Tokens:</span>
<span class="token-count" id="total-tokens">0</span>
</div>
<button class="download-btn" id="download-btn">
<span class="icon"></span> Generate & Download Context
</button>

View File

@@ -363,6 +363,15 @@ body {
font-weight: bold;
}
.header-subtitle {
font-size: 10px;
color: var(--tertiary-color);
text-transform: none;
font-weight: normal;
display: block;
margin-top: 2px;
}
.component-selection-table th.clickable-header {
cursor: pointer;
user-select: none;
@@ -374,6 +383,16 @@ body {
color: var(--background-color);
}
.component-selection-table th.clickable-header[data-type="examples"] {
cursor: default;
opacity: 0.5;
}
.component-selection-table th.clickable-header[data-type="examples"]:hover {
background-color: var(--hover-bg);
color: var(--primary-color);
}
.component-selection-table th:nth-child(3),
.component-selection-table th:nth-child(4),
.component-selection-table th:nth-child(5) {
@@ -400,12 +419,25 @@ body {
font-weight: bold;
}
/* Token display in table cells */
.token-info {
display: block;
font-size: 11px;
color: var(--tertiary-color);
margin-top: 2px;
}
.component-selection-table input[type="checkbox"] {
cursor: pointer;
width: 16px;
height: 16px;
}
.component-selection-table input[type="checkbox"]:disabled {
cursor: not-allowed;
opacity: 0.3;
}
/* Disabled row state */
.component-selection-table tr.disabled td:not(:first-child) {
opacity: 0.5;
@@ -418,6 +450,30 @@ body {
margin: 40px 0;
}
/* Token Summary */
.token-summary {
margin-bottom: 20px;
font-size: 16px;
}
.token-label {
color: var(--tertiary-color);
margin-right: 10px;
}
.token-count {
color: var(--primary-color);
font-weight: bold;
font-size: 20px;
}
.token-count::after {
content: " est.";
font-size: 12px;
color: var(--tertiary-color);
margin-left: 4px;
}
.download-btn {
background-color: var(--primary-dimmed);
color: var(--background-color);

View File

@@ -1,48 +1,61 @@
// Crawl4AI LLM Context Builder JavaScript
// Component definitions
// Component definitions - order matters
const components = [
{
id: 'all',
name: 'All Components',
description: 'All components with all context types',
special: true
id: 'installation',
name: 'Installation',
description: 'Setup and installation options'
},
{
id: 'core',
name: 'Core Functionality',
description: 'Basic crawling and scraping features'
id: 'simple_crawling',
name: 'Simple Crawling',
description: 'Basic web crawling operations'
},
{
id: 'config_objects',
name: 'Configuration Objects',
description: 'Browser and crawler configuration'
},
{
id: 'deep_crawling',
name: 'Deep Crawling',
description: 'Multi-page crawling strategies'
},
{
id: 'deployment',
name: 'Deployment',
description: 'Installation and Docker setup'
},
{
id: 'extraction',
name: 'Data Extraction',
description: 'Structured data extraction strategies'
},
{
id: 'markdown',
name: 'Markdown Generation',
description: 'Content-to-markdown conversion'
id: 'multi_urls_crawling',
name: 'Multi URLs Crawling',
description: 'Crawling multiple URLs efficiently'
},
{
id: 'vibe',
name: 'Vibe Coding',
description: 'General-purpose AI context',
special: false
id: 'deep_crawling',
name: 'Deep Crawling',
description: 'Multi-page crawling strategies'
},
{
id: 'docker',
name: 'Docker',
description: 'Docker deployment and configuration'
},
{
id: 'cli',
name: 'CLI',
description: 'Command-line interface usage'
},
{
id: 'http_based_crawler_strategy',
name: 'HTTP-based Crawler',
description: 'HTTP crawler strategy implementation'
},
{
id: 'url_seeder',
name: 'URL Seeder',
description: 'URL seeding and discovery'
},
{
id: 'deep_crawl_advanced_filters_scorers',
name: 'Advanced Filters & Scorers',
description: 'Deep crawl filtering and scoring'
}
];
@@ -51,45 +64,47 @@ const contextTypes = ['memory', 'reasoning', 'examples'];
// State management
const state = {
preset: 'custom',
selectedComponents: new Set(),
selectedContextTypes: new Map()
selectedContextTypes: new Map(),
tokenCounts: new Map() // Store token counts for each file
};
// Initialize the application
document.addEventListener('DOMContentLoaded', () => {
setupPresetHandlers();
renderComponents();
renderReferenceTable();
setupActionHandlers();
setupColumnHeaderHandlers();
// Initialize only core component as selected with all context types
state.selectedComponents.add('core');
state.selectedContextTypes.set('core', new Set(contextTypes));
// Initialize first component as selected with available context types
const firstComponent = components[0];
state.selectedComponents.add(firstComponent.id);
state.selectedContextTypes.set(firstComponent.id, new Set(['memory', 'reasoning']));
updateComponentUI();
});
// Setup preset radio button handlers
function setupPresetHandlers() {
const presetRadios = document.querySelectorAll('input[name="preset"]');
presetRadios.forEach(radio => {
radio.addEventListener('change', (e) => {
state.preset = e.target.value;
updatePresetSelection();
});
});
// Helper function to estimate the token count of a text blob (≈ words × 2.5).
// Returns 0 for null/undefined, empty, or whitespace-only input.
function estimateTokens(text) {
    if (!text) return 0;
    const trimmed = text.trim();
    // Fix: a whitespace-only string previously split into [''] (length 1)
    // and was reported as ~3 tokens; treat it as empty instead.
    if (trimmed === '') return 0;
    const words = trimmed.split(/\s+/).length;
    return Math.round(words * 2.5);
}
// Update UI based on preset selection
function updatePresetSelection() {
const componentSelector = document.getElementById('component-selector');
// Update total token count display
function updateTotalTokenCount() {
let totalTokens = 0;
if (state.preset === 'custom') {
componentSelector.style.display = 'block';
} else {
componentSelector.style.display = 'none';
state.selectedComponents.forEach(compId => {
const types = state.selectedContextTypes.get(compId);
if (types) {
types.forEach(type => {
const key = `${compId}-${type}`;
totalTokens += state.tokenCounts.get(key) || 0;
});
}
});
document.getElementById('total-tokens').textContent = totalTokens.toLocaleString();
}
// Render component selection table
@@ -97,10 +112,13 @@ function renderComponents() {
const tbody = document.getElementById('components-tbody');
tbody.innerHTML = '';
components.filter(c => !c.special).forEach(component => {
components.forEach(component => {
const row = createComponentRow(component);
tbody.appendChild(row);
});
// Fetch token counts for all files
fetchAllTokenCounts();
}
// Create a component table row
@@ -124,9 +142,17 @@ function createComponentRow(component) {
// Context type cells
contextTypes.forEach(type => {
const td = document.createElement('td');
const key = `${component.id}-${type}`;
const tokenCount = state.tokenCounts.get(key) || 0;
const isDisabled = type === 'examples' ? 'disabled' : '';
td.innerHTML = `
<input type="checkbox" id="check-${component.id}-${type}"
data-component="${component.id}" data-type="${type}">
data-component="${component.id}" data-type="${type}"
${isDisabled}>
<span class="token-info" id="tokens-${component.id}-${type}">
${tokenCount > 0 ? `${tokenCount.toLocaleString()} tokens` : ''}
</span>
`;
tr.appendChild(td);
});
@@ -140,9 +166,11 @@ function createComponentRow(component) {
// Add event listeners for context type checkboxes
contextTypes.forEach(type => {
const typeCheckbox = tr.querySelector(`#check-${component.id}-${type}`);
if (!typeCheckbox.disabled) {
typeCheckbox.addEventListener('change', (e) => {
handleContextTypeToggle(component.id, type, e.target.checked);
});
}
});
return tr;
@@ -152,12 +180,12 @@ function createComponentRow(component) {
function handleComponentToggle(componentId, checked) {
if (checked) {
state.selectedComponents.add(componentId);
// Select all context types when component is selected
// Select only available context types when component is selected
if (!state.selectedContextTypes.has(componentId)) {
state.selectedContextTypes.set(componentId, new Set(contextTypes));
state.selectedContextTypes.set(componentId, new Set(['memory', 'reasoning']));
} else {
// If component was already partially selected, select all
state.selectedContextTypes.set(componentId, new Set(contextTypes));
// If component was already partially selected, select all available
state.selectedContextTypes.set(componentId, new Set(['memory', 'reasoning']));
}
} else {
state.selectedComponents.delete(componentId);
@@ -195,8 +223,10 @@ function handleContextTypeToggle(componentId, type, checked) {
// Update UI to reflect current state
function updateComponentUI() {
components.filter(c => !c.special).forEach(component => {
components.forEach(component => {
const row = document.getElementById(`component-${component.id}`);
if (!row) return;
const mainCheckbox = row.querySelector(`#check-${component.id}`);
const hasSelection = state.selectedComponents.has(component.id);
const selectedTypes = state.selectedContextTypes.get(component.id) || new Set();
@@ -213,15 +243,93 @@ function updateComponentUI() {
typeCheckbox.checked = selectedTypes.has(type);
});
});
updateTotalTokenCount();
}
// Start a token-count fetch for every component/context-type pair and,
// once every request has completed, refresh the selection table and the
// reference table (both display per-file token counts).
async function fetchAllTokenCounts() {
    const pending = components.flatMap((component) =>
        contextTypes.map((type) => fetchTokenCount(component.id, type))
    );
    await Promise.all(pending);
    updateComponentUI();
    renderReferenceTable(); // Update reference table with token counts
}
// Fetch a single context file, estimate its token count, and record it in
// state.tokenCounts under "<componentId>-<type>". On success the matching
// table-cell span is updated; for the "examples" type a failed fetch is
// expected (those files are not published yet) and clears the span instead.
async function fetchTokenCount(componentId, type) {
    const key = `${componentId}-${type}`;
    // Write text into this file's table-cell span, if it has been rendered.
    const setSpanText = (text) => {
        const span = document.getElementById(`tokens-${componentId}-${type}`);
        if (span) {
            span.textContent = text;
        }
    };
    try {
        const url = getBaseUrl(type) + getFileName(componentId, type);
        const response = await fetch(url);
        if (response.ok) {
            const tokens = estimateTokens(await response.text());
            state.tokenCounts.set(key, tokens);
            setSpanText(`${tokens.toLocaleString()} tokens`);
        } else if (type === 'examples') {
            // Examples might not exist yet
            state.tokenCounts.set(key, 0);
            setSpanText('');
        }
    } catch (error) {
        console.warn(`Failed to fetch token count for ${componentId}-${type}`);
        if (type === 'examples') {
            setSpanText('');
        }
    }
}
// Build the context file name for a component. In the reorganized llm.txt/
// layout every file is simply "<componentId>.txt"; the context type only
// selects the directory (handled by getBaseUrl), so `type` is accepted for
// call-site symmetry but intentionally unused here.
function getFileName(componentId, type) {
// For new structure, all files are just [componentId].txt
return `${componentId}.txt`;
}
// Resolve the directory URL holding context files of the given type:
// memory → txt/, reasoning → diagrams/, examples → examples/ (not
// published yet, so those fetches 404). Any unknown type falls back to
// the memory (txt/) directory.
function getBaseUrl(type) {
    // Pages served under /apps/ need a relative climb back to the site root.
    const prefix = window.location.pathname.includes('/apps/') ? '../../' : '/';
    let dir = 'assets/llm.txt/txt/'; // memory and the default fallback
    if (type === 'reasoning') {
        dir = 'assets/llm.txt/diagrams/';
    } else if (type === 'examples') {
        dir = 'assets/llm.txt/examples/'; // Will return 404 for now
    }
    return prefix + dir;
}
// Setup action button handlers
function setupActionHandlers() {
// Select/Deselect all buttons
document.getElementById('select-all').addEventListener('click', () => {
components.filter(c => !c.special).forEach(comp => {
components.forEach(comp => {
state.selectedComponents.add(comp.id);
state.selectedContextTypes.set(comp.id, new Set(contextTypes));
state.selectedContextTypes.set(comp.id, new Set(['memory', 'reasoning']));
});
updateComponentUI();
});
@@ -249,9 +357,12 @@ function setupColumnHeaderHandlers() {
// Toggle all checkboxes in a column
function toggleColumnSelection(type) {
// Don't toggle examples column
if (type === 'examples') return;
// Check if all are currently selected
let allSelected = true;
components.filter(c => !c.special).forEach(comp => {
components.forEach(comp => {
const types = state.selectedContextTypes.get(comp.id);
if (!types || !types.has(type)) {
allSelected = false;
@@ -259,7 +370,7 @@ function toggleColumnSelection(type) {
});
// Toggle all
components.filter(c => !c.special).forEach(comp => {
components.forEach(comp => {
if (!state.selectedContextTypes.has(comp.id)) {
state.selectedContextTypes.set(comp.id, new Set());
}
@@ -314,46 +425,50 @@ async function handleDownload() {
function getSelectedFiles() {
const files = [];
if (state.preset === 'vibe') {
files.push('crawl4ai_vibe.llm.full.md');
} else if (state.preset === 'all') {
// Use the dedicated aggregated files for all components
files.push('crawl4ai_all_memory_content.llm.md');
files.push('crawl4ai_all_reasoning_content.llm.md');
files.push('crawl4ai_all_examples_content.llm.md');
} else {
// Custom selection
// Build list of selected files with their context info
state.selectedComponents.forEach(compId => {
const types = state.selectedContextTypes.get(compId);
if (types) {
types.forEach(type => {
files.push(`crawl4ai_${compId}_${type}_content.llm.md`);
files.push({
componentId: compId,
type: type,
fileName: getFileName(compId, type),
baseUrl: getBaseUrl(type)
});
});
}
});
}
return files;
}
// Fetch multiple files
async function fetchFiles(fileNames) {
// Use /assets/llmtxt/ path with .txt extension
const baseUrl = '/assets/llmtxt/';
const promises = fileNames.map(async (fileName) => {
// Convert .md to .txt for fetching
const txtFileName = fileName.replace('.md', '.txt');
async function fetchFiles(fileInfos) {
const promises = fileInfos.map(async (fileInfo) => {
try {
const response = await fetch(baseUrl + txtFileName);
const response = await fetch(fileInfo.baseUrl + fileInfo.fileName);
if (!response.ok) {
console.warn(`Failed to fetch ${txtFileName} from ${baseUrl + txtFileName}`);
return { fileName, content: `<!-- Failed to load ${fileName} -->` };
if (fileInfo.type === 'examples') {
return {
fileInfo,
content: `<!-- Examples for ${fileInfo.componentId} coming soon -->\n\nExamples are currently being developed for this component.`
};
}
console.warn(`Failed to fetch ${fileInfo.fileName} from ${fileInfo.baseUrl + fileInfo.fileName}`);
return { fileInfo, content: `<!-- Failed to load ${fileInfo.fileName} -->` };
}
const content = await response.text();
return { fileName, content };
return { fileInfo, content };
} catch (error) {
console.warn(`Error fetching ${txtFileName} from ${baseUrl + txtFileName}:`, error);
return { fileName, content: `<!-- Error loading ${fileName} -->` };
if (fileInfo.type === 'examples') {
return {
fileInfo,
content: `<!-- Examples for ${fileInfo.componentId} coming soon -->\n\nExamples are currently being developed for this component.`
};
}
console.warn(`Error fetching ${fileInfo.fileName}:`, error);
return { fileInfo, content: `<!-- Error loading ${fileInfo.fileName} -->` };
}
});
@@ -362,20 +477,31 @@ async function fetchFiles(fileNames) {
// Combine file contents with headers
function combineContents(fileContents) {
// Calculate total tokens
let totalTokens = 0;
fileContents.forEach(({ content }) => {
totalTokens += estimateTokens(content);
});
const header = `# Crawl4AI Custom LLM Context
Generated on: ${new Date().toISOString()}
Total files: ${fileContents.length}
Estimated tokens: ${totalTokens.toLocaleString()}
---
`;
const sections = fileContents.map(({ fileName, content }) => {
const componentName = extractComponentName(fileName);
const contextType = extractContextType(fileName);
const sections = fileContents.map(({ fileInfo, content }) => {
const component = components.find(c => c.id === fileInfo.componentId);
const componentName = component ? component.name : fileInfo.componentId;
const contextType = getContextTypeName(fileInfo.type);
const tokens = estimateTokens(content);
return `## ${componentName} - ${contextType}
Source: ${fileName}
Component ID: ${fileInfo.componentId}
Context Type: ${fileInfo.type}
Estimated tokens: ${tokens.toLocaleString()}
${content}
@@ -387,25 +513,14 @@ ${content}
return header + sections.join('\n');
}
// Extract component name from filename
function extractComponentName(fileName) {
// Pattern: crawl4ai_{component}_{type}_content.llm.md
const match = fileName.match(/crawl4ai_(.+?)_(memory|reasoning|examples|llm\.full)/);
if (match) {
const compId = match[1];
const component = components.find(c => c.id === compId);
return component ? component.name : compId.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase());
// Get display name for context type
function getContextTypeName(type) {
switch(type) {
case 'memory': return 'Full Content';
case 'reasoning': return 'Diagrams & Workflows';
case 'examples': return 'Code Examples';
default: return type;
}
return 'Unknown Component';
}
// Extract context type from filename
function extractContextType(fileName) {
if (fileName.includes('_memory_')) return 'Memory';
if (fileName.includes('_reasoning_')) return 'Reasoning';
if (fileName.includes('_examples_')) return 'Examples';
if (fileName.includes('.llm.full')) return 'Complete Context';
return 'Context';
}
// Download file to user's computer
@@ -426,33 +541,35 @@ function renderReferenceTable() {
const tbody = document.getElementById('reference-table-body');
tbody.innerHTML = '';
// Since vibe is no longer special, just show all components the same way
// Get base path for links
const basePrefix = window.location.pathname.includes('/apps/') ? '../../' : '/';
components.forEach(component => {
const row = document.createElement('tr');
const memoryTokens = state.tokenCounts.get(`${component.id}-memory`) || 0;
const reasoningTokens = state.tokenCounts.get(`${component.id}-reasoning`) || 0;
const examplesTokens = state.tokenCounts.get(`${component.id}-examples`) || 0;
row.innerHTML = `
<td><strong>${component.name}</strong></td>
<td><a href="/assets/llmtxt/crawl4ai_${component.id}_memory_content.llm.txt" class="file-link" target="_blank">Memory</a></td>
<td><a href="/assets/llmtxt/crawl4ai_${component.id}_reasoning_content.llm.txt" class="file-link" target="_blank">Reasoning</a></td>
<td><a href="/assets/llmtxt/crawl4ai_${component.id}_examples_content.llm.txt" class="file-link" target="_blank">Examples</a></td>
<td><a href="/assets/llmtxt/crawl4ai_${component.id}.llm.full.txt" class="file-link" target="_blank">Full</a></td>
<td>
<a href="${basePrefix}assets/llm.txt/txt/${component.id}.txt" class="file-link" target="_blank">Memory</a>
${memoryTokens > 0 ? `<span class="file-size">${memoryTokens.toLocaleString()} tokens</span>` : ''}
</td>
<td>
<a href="${basePrefix}assets/llm.txt/diagrams/${component.id}.txt" class="file-link" target="_blank">Reasoning</a>
${reasoningTokens > 0 ? `<span class="file-size">${reasoningTokens.toLocaleString()} tokens</span>` : ''}
</td>
<td>
${examplesTokens > 0
? `<a href="${basePrefix}assets/llm.txt/examples/${component.id}.txt" class="file-link" target="_blank">Examples</a>
<span class="file-size">${examplesTokens.toLocaleString()} tokens</span>`
: '-'
}
</td>
<td>-</td>
`;
tbody.appendChild(row);
});
}
// Report whether an examples context file exists for the given component.
// Currently a stub that always answers true, ignoring componentId.
// NOTE(review): other code in this file treats examples files as missing
// (fetches are expected to 404 "for now") — confirm this optimistic stub
// is still intended.
function hasExamplesFile(componentId) {
// All components have examples files
return true;
}
// Report whether an aggregated "full" context file exists for the given
// component. Currently a stub that always answers true, ignoring componentId.
// NOTE(review): the reference table renders '-' in the Full column, so no
// caller appears to rely on this — confirm the stub is still needed.
function hasFullFile(componentId) {
// All components have full files
return true;
}
// Upper-case the first character of a string, leaving the rest untouched.
// An empty string is returned unchanged.
function capitalizeFirst(str) {
    const head = str.slice(0, 1);
    const tail = str.slice(1);
    return head.toUpperCase() + tail;
}

View File

@@ -0,0 +1,425 @@
## CLI Workflows and Profile Management
Visual representations of command-line interface operations, browser profile management, and identity-based crawling workflows.
### CLI Command Flow Architecture
```mermaid
flowchart TD
A[crwl command] --> B{Command Type?}
B -->|URL Crawling| C[Parse URL & Options]
B -->|Profile Management| D[profiles subcommand]
B -->|CDP Browser| E[cdp subcommand]
B -->|Browser Control| F[browser subcommand]
B -->|Configuration| G[config subcommand]
C --> C1{Output Format?}
C1 -->|Default| C2[HTML/Markdown]
C1 -->|JSON| C3[Structured Data]
C1 -->|markdown| C4[Clean Markdown]
C1 -->|markdown-fit| C5[Filtered Content]
C --> C6{Authentication?}
C6 -->|Profile Specified| C7[Load Browser Profile]
C6 -->|No Profile| C8[Anonymous Session]
C7 --> C9[Launch with User Data]
C8 --> C10[Launch Clean Browser]
C9 --> C11[Execute Crawl]
C10 --> C11
C11 --> C12{Success?}
C12 -->|Yes| C13[Return Results]
C12 -->|No| C14[Error Handling]
D --> D1[Interactive Profile Menu]
D1 --> D2{Menu Choice?}
D2 -->|Create| D3[Open Browser for Setup]
D2 -->|List| D4[Show Existing Profiles]
D2 -->|Delete| D5[Remove Profile]
D2 -->|Use| D6[Crawl with Profile]
E --> E1[Launch CDP Browser]
E1 --> E2[Remote Debugging Active]
F --> F1{Browser Action?}
F1 -->|start| F2[Start Builtin Browser]
F1 -->|stop| F3[Stop Builtin Browser]
F1 -->|status| F4[Check Browser Status]
F1 -->|view| F5[Open Browser Window]
G --> G1{Config Action?}
G1 -->|list| G2[Show All Settings]
G1 -->|set| G3[Update Setting]
G1 -->|get| G4[Read Setting]
style A fill:#e1f5fe
style C13 fill:#c8e6c9
style C14 fill:#ffcdd2
style D3 fill:#fff3e0
style E2 fill:#f3e5f5
```
### Profile Management Workflow
```mermaid
sequenceDiagram
participant User
participant CLI
participant ProfileManager
participant Browser
participant FileSystem
User->>CLI: crwl profiles
CLI->>ProfileManager: Initialize profile manager
ProfileManager->>FileSystem: Scan for existing profiles
FileSystem-->>ProfileManager: Profile list
ProfileManager-->>CLI: Show interactive menu
CLI-->>User: Display options
Note over User: User selects "Create new profile"
User->>CLI: Create profile "linkedin-auth"
CLI->>ProfileManager: create_profile("linkedin-auth")
ProfileManager->>FileSystem: Create profile directory
ProfileManager->>Browser: Launch with new user data dir
Browser-->>User: Opens browser window
Note over User: User manually logs in to LinkedIn
User->>Browser: Navigate and authenticate
Browser->>FileSystem: Save cookies, session data
User->>CLI: Press 'q' to save profile
CLI->>ProfileManager: finalize_profile()
ProfileManager->>FileSystem: Lock profile settings
ProfileManager-->>CLI: Profile saved
CLI-->>User: Profile "linkedin-auth" created
Note over User: Later usage
User->>CLI: crwl https://linkedin.com/feed -p linkedin-auth
CLI->>ProfileManager: load_profile("linkedin-auth")
ProfileManager->>FileSystem: Read profile data
FileSystem-->>ProfileManager: User data directory
ProfileManager-->>CLI: Profile configuration
CLI->>Browser: Launch with existing profile
Browser-->>CLI: Authenticated session ready
CLI->>Browser: Navigate to target URL
Browser-->>CLI: Crawl results with auth context
CLI-->>User: Authenticated content
```
### Browser Management State Machine
```mermaid
stateDiagram-v2
[*] --> Stopped: Initial state
Stopped --> Starting: crwl browser start
Starting --> Running: Browser launched
Running --> Viewing: crwl browser view
Viewing --> Running: Close window
Running --> Stopping: crwl browser stop
Stopping --> Stopped: Cleanup complete
Running --> Restarting: crwl browser restart
Restarting --> Running: New browser instance
Stopped --> CDP_Mode: crwl cdp
CDP_Mode --> CDP_Running: Remote debugging active
CDP_Running --> CDP_Mode: Manual close
CDP_Mode --> Stopped: Exit CDP
Running --> StatusCheck: crwl browser status
StatusCheck --> Running: Return status
note right of Running : Port 9222 active\nBuiltin browser available
note right of CDP_Running : Remote debugging\nManual control enabled
note right of Viewing : Visual browser window\nDirect interaction
```
### Authentication Workflow for Protected Sites
```mermaid
flowchart TD
A[Protected Site Access Needed] --> B[Create Profile Strategy]
B --> C{Existing Profile?}
C -->|Yes| D[Test Profile Validity]
C -->|No| E[Create New Profile]
D --> D1{Profile Valid?}
D1 -->|Yes| F[Use Existing Profile]
D1 -->|No| E
E --> E1[crwl profiles]
E1 --> E2[Select Create New Profile]
E2 --> E3[Enter Profile Name]
E3 --> E4[Browser Opens for Auth]
E4 --> E5{Authentication Method?}
E5 -->|Login Form| E6[Fill Username/Password]
E5 -->|OAuth| E7[OAuth Flow]
E5 -->|2FA| E8[Handle 2FA]
E5 -->|Session Cookie| E9[Import Cookies]
E6 --> E10[Manual Login Process]
E7 --> E10
E8 --> E10
E9 --> E10
E10 --> E11[Verify Authentication]
E11 --> E12{Auth Successful?}
E12 -->|Yes| E13[Save Profile - Press q]
E12 -->|No| E10
E13 --> F
F --> G[Execute Authenticated Crawl]
G --> H[crwl URL -p profile-name]
H --> I[Load Profile Data]
I --> J[Launch Browser with Auth]
J --> K[Navigate to Protected Content]
K --> L[Extract Authenticated Data]
L --> M[Return Results]
style E4 fill:#fff3e0
style E10 fill:#e3f2fd
style F fill:#e8f5e8
style M fill:#c8e6c9
```
### CDP Browser Architecture
```mermaid
graph TB
subgraph "CLI Layer"
A[crwl cdp command] --> B[CDP Manager]
B --> C[Port Configuration]
B --> D[Profile Selection]
end
subgraph "Browser Process"
E[Chromium/Firefox] --> F[Remote Debugging]
F --> G[WebSocket Endpoint]
G --> H[ws://localhost:9222]
end
subgraph "Client Connections"
I[Manual Browser Control] --> H
J[DevTools Interface] --> H
K[External Automation] --> H
L[Crawl4AI Crawler] --> H
end
subgraph "Profile Data"
M[User Data Directory] --> E
N[Cookies & Sessions] --> M
O[Extensions] --> M
P[Browser State] --> M
end
A --> E
C --> H
D --> M
style H fill:#e3f2fd
style E fill:#f3e5f5
style M fill:#e8f5e8
```
### Configuration Management Hierarchy
```mermaid
graph TD
subgraph "Global Configuration"
A[~/.crawl4ai/config.yml] --> B[Default Settings]
B --> C[LLM Providers]
B --> D[Browser Defaults]
B --> E[Output Preferences]
end
subgraph "Profile Configuration"
F[Profile Directory] --> G[Browser State]
F --> H[Authentication Data]
F --> I[Site-Specific Settings]
end
subgraph "Command-Line Overrides"
J[-b browser_config] --> K[Runtime Browser Settings]
L[-c crawler_config] --> M[Runtime Crawler Settings]
N[-o output_format] --> O[Runtime Output Format]
end
subgraph "Configuration Files"
P[browser.yml] --> Q[Browser Config Template]
R[crawler.yml] --> S[Crawler Config Template]
T[extract.yml] --> U[Extraction Config]
end
subgraph "Resolution Order"
V[Command Line Args] --> W[Config Files]
W --> X[Profile Settings]
X --> Y[Global Defaults]
end
J --> V
L --> V
N --> V
P --> W
R --> W
T --> W
F --> X
A --> Y
style V fill:#ffcdd2
style W fill:#fff3e0
style X fill:#e3f2fd
style Y fill:#e8f5e8
```
### Identity-Based Crawling Decision Tree
```mermaid
flowchart TD
A[Target Website Assessment] --> B{Authentication Required?}
B -->|No| C[Standard Anonymous Crawl]
B -->|Yes| D{Authentication Type?}
D -->|Login Form| E[Create Login Profile]
D -->|OAuth/SSO| F[Create OAuth Profile]
D -->|API Key/Token| G[Use Headers/Config]
D -->|Session Cookies| H[Import Cookie Profile]
E --> E1[crwl profiles → Manual login]
F --> F1[crwl profiles → OAuth flow]
G --> G1[Configure headers in crawler config]
H --> H1[Import cookies to profile]
E1 --> I[Test Authentication]
F1 --> I
G1 --> I
H1 --> I
I --> J{Auth Test Success?}
J -->|Yes| K[Production Crawl Setup]
J -->|No| L[Debug Authentication]
L --> L1{Common Issues?}
L1 -->|Rate Limiting| L2[Add delays/user simulation]
L1 -->|Bot Detection| L3[Enable stealth mode]
L1 -->|Session Expired| L4[Refresh authentication]
L1 -->|CAPTCHA| L5[Manual intervention needed]
L2 --> M[Retry with Adjustments]
L3 --> M
L4 --> E1
L5 --> N[Semi-automated approach]
M --> I
N --> O[Manual auth + automated crawl]
K --> P[Automated Authenticated Crawling]
O --> P
C --> P
P --> Q[Monitor & Maintain Profiles]
style I fill:#fff3e0
style K fill:#e8f5e8
style P fill:#c8e6c9
style L fill:#ffcdd2
style N fill:#f3e5f5
```
### CLI Usage Patterns and Best Practices
```mermaid
timeline
title CLI Workflow Evolution
section Setup Phase
Installation : pip install crawl4ai
: crawl4ai-setup
Basic Test : crwl https://example.com
Config Setup : crwl config set defaults
section Profile Creation
Site Analysis : Identify auth requirements
Profile Creation : crwl profiles
Manual Login : Authenticate in browser
Profile Save : Press 'q' to save
section Development Phase
Test Crawls : crwl URL -p profile -v
Config Tuning : Adjust browser/crawler settings
Output Testing : Try different output formats
Error Handling : Debug authentication issues
section Production Phase
Automated Crawls : crwl URL -p profile -o json
Batch Processing : Multiple URLs with same profile
Monitoring : Check profile validity
Maintenance : Update profiles as needed
```
### Multi-Profile Management Strategy
```mermaid
graph LR
subgraph "Profile Categories"
A[Social Media Profiles]
B[Work/Enterprise Profiles]
C[E-commerce Profiles]
D[Research Profiles]
end
subgraph "Social Media"
A --> A1[linkedin-personal]
A --> A2[twitter-monitor]
A --> A3[facebook-research]
A --> A4[instagram-brand]
end
subgraph "Enterprise"
B --> B1[company-intranet]
B --> B2[github-enterprise]
B --> B3[confluence-docs]
B --> B4[jira-tickets]
end
subgraph "E-commerce"
C --> C1[amazon-seller]
C --> C2[shopify-admin]
C --> C3[ebay-monitor]
C --> C4[marketplace-competitor]
end
subgraph "Research"
D --> D1[academic-journals]
D --> D2[data-platforms]
D --> D3[survey-tools]
D --> D4[government-portals]
end
subgraph "Usage Patterns"
E[Daily Monitoring] --> A2
E --> B1
F[Weekly Reports] --> C3
F --> D2
G[On-Demand Research] --> D1
G --> D4
H[Competitive Analysis] --> C4
H --> A4
end
style A1 fill:#e3f2fd
style B1 fill:#f3e5f5
style C1 fill:#e8f5e8
style D1 fill:#fff3e0
```
**📖 Learn more:** [CLI Reference](https://docs.crawl4ai.com/core/cli/), [Identity-Based Crawling](https://docs.crawl4ai.com/advanced/identity-based-crawling/), [Profile Management](https://docs.crawl4ai.com/advanced/session-management/), [Authentication Strategies](https://docs.crawl4ai.com/advanced/hooks-auth/)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,401 @@
## Deep Crawling Filters & Scorers Architecture
Visual representations of advanced URL filtering, scoring strategies, and performance optimization workflows for intelligent deep crawling.
### Filter Chain Processing Pipeline
```mermaid
flowchart TD
A[URL Input] --> B{Domain Filter}
B -->|✓ Pass| C{Pattern Filter}
B -->|✗ Fail| X1[Reject: Invalid Domain]
C -->|✓ Pass| D{Content Type Filter}
C -->|✗ Fail| X2[Reject: Pattern Mismatch]
D -->|✓ Pass| E{SEO Filter}
D -->|✗ Fail| X3[Reject: Wrong Content Type]
E -->|✓ Pass| F{Content Relevance Filter}
E -->|✗ Fail| X4[Reject: Low SEO Score]
F -->|✓ Pass| G[URL Accepted]
F -->|✗ Fail| X5[Reject: Low Relevance]
G --> H[Add to Crawl Queue]
subgraph "Fast Filters"
B
C
D
end
subgraph "Slow Filters"
E
F
end
style A fill:#e3f2fd
style G fill:#c8e6c9
style H fill:#e8f5e8
style X1 fill:#ffcdd2
style X2 fill:#ffcdd2
style X3 fill:#ffcdd2
style X4 fill:#ffcdd2
style X5 fill:#ffcdd2
```
### URL Scoring System Architecture
```mermaid
graph TB
subgraph "Input URL"
A[https://python.org/tutorial/2024/ml-guide.html]
end
subgraph "Individual Scorers"
B[Keyword Relevance Scorer]
C[Path Depth Scorer]
D[Content Type Scorer]
E[Freshness Scorer]
F[Domain Authority Scorer]
end
subgraph "Scoring Process"
B --> B1[Keywords: python, tutorial, ml<br/>Score: 0.85]
C --> C1[Depth: 4 levels<br/>Optimal: 3<br/>Score: 0.75]
D --> D1[Content: HTML<br/>Score: 1.0]
E --> E1[Year: 2024<br/>Score: 1.0]
F --> F1[Domain: python.org<br/>Score: 1.0]
end
subgraph "Composite Scoring"
G[Weighted Combination]
B1 --> G
C1 --> G
D1 --> G
E1 --> G
F1 --> G
end
subgraph "Final Result"
H[Composite Score: 0.92]
I{Score > Threshold?}
J[Accept URL]
K[Reject URL]
end
A --> B
A --> C
A --> D
A --> E
A --> F
G --> H
H --> I
I -->|✓ 0.92 > 0.6| J
I -->|✗ Score too low| K
style A fill:#e3f2fd
style G fill:#fff3e0
style H fill:#e8f5e8
style J fill:#c8e6c9
style K fill:#ffcdd2
```
### Filter vs Scorer Decision Matrix
```mermaid
flowchart TD
A[URL Processing Decision] --> B{Binary Decision Needed?}
B -->|Yes - Include/Exclude| C[Use Filters]
B -->|No - Quality Rating| D[Use Scorers]
C --> C1{Filter Type Needed?}
C1 -->|Domain Control| C2[DomainFilter]
C1 -->|Pattern Matching| C3[URLPatternFilter]
C1 -->|Content Type| C4[ContentTypeFilter]
C1 -->|SEO Quality| C5[SEOFilter]
C1 -->|Content Relevance| C6[ContentRelevanceFilter]
D --> D1{Scoring Criteria?}
D1 -->|Keyword Relevance| D2[KeywordRelevanceScorer]
D1 -->|URL Structure| D3[PathDepthScorer]
D1 -->|Content Quality| D4[ContentTypeScorer]
D1 -->|Time Sensitivity| D5[FreshnessScorer]
D1 -->|Source Authority| D6[DomainAuthorityScorer]
C2 --> E[Chain Filters]
C3 --> E
C4 --> E
C5 --> E
C6 --> E
D2 --> F[Composite Scorer]
D3 --> F
D4 --> F
D5 --> F
D6 --> F
E --> G[Binary Output: Pass/Fail]
F --> H[Numeric Score: 0.0-1.0]
G --> I[Apply to URL Queue]
H --> J[Priority Ranking]
style C fill:#e8f5e8
style D fill:#fff3e0
style E fill:#f3e5f5
style F fill:#e3f2fd
style G fill:#c8e6c9
style H fill:#ffecb3
```
### Performance Optimization Strategy
```mermaid
sequenceDiagram
participant Queue as URL Queue
participant Fast as Fast Filters
participant Slow as Slow Filters
participant Score as Scorers
participant Output as Filtered URLs
Note over Queue, Output: Batch Processing (1000 URLs)
Queue->>Fast: Apply Domain Filter
Fast-->>Queue: 60% passed (600 URLs)
Queue->>Fast: Apply Pattern Filter
Fast-->>Queue: 70% passed (420 URLs)
Queue->>Fast: Apply Content Type Filter
Fast-->>Queue: 90% passed (378 URLs)
Note over Fast: Fast filters eliminate 62% of URLs
Queue->>Slow: Apply SEO Filter (378 URLs)
Slow-->>Queue: 80% passed (302 URLs)
Queue->>Slow: Apply Relevance Filter
Slow-->>Queue: 75% passed (227 URLs)
Note over Slow: Content analysis on remaining URLs
Queue->>Score: Calculate Composite Scores
Score-->>Queue: Scored and ranked
Queue->>Output: Top 100 URLs by score
Output-->>Queue: Processing complete
Note over Queue, Output: Total: 90% filtered out, 10% high-quality URLs retained
```
### Custom Filter Implementation Flow
```mermaid
stateDiagram-v2
[*] --> Planning
Planning --> IdentifyNeeds: Define filtering criteria
IdentifyNeeds --> ChooseType: Binary vs Scoring decision
ChooseType --> FilterImpl: Binary decision needed
ChooseType --> ScorerImpl: Quality rating needed
FilterImpl --> InheritURLFilter: Extend URLFilter base class
ScorerImpl --> InheritURLScorer: Extend URLScorer base class
InheritURLFilter --> ImplementApply: def apply(url) -> bool
InheritURLScorer --> ImplementScore: def _calculate_score(url) -> float
ImplementApply --> AddLogic: Add custom filtering logic
ImplementScore --> AddLogic
AddLogic --> TestFilter: Unit testing
TestFilter --> OptimizePerf: Performance optimization
OptimizePerf --> Integration: Integrate with FilterChain
Integration --> Production: Deploy to production
Production --> Monitor: Monitor performance
Monitor --> Tune: Tune parameters
Tune --> Production
note right of Planning : Consider performance impact
note right of AddLogic : Handle edge cases
note right of OptimizePerf : Cache frequently accessed data
```
### Filter Chain Optimization Patterns
```mermaid
graph TB
subgraph "Naive Approach - Poor Performance"
A1[All URLs] --> B1[Slow Filter 1]
B1 --> C1[Slow Filter 2]
C1 --> D1[Fast Filter 1]
D1 --> E1[Fast Filter 2]
E1 --> F1[Final Results]
B1 -.->|High CPU| G1[Performance Issues]
C1 -.->|Network Calls| G1
end
subgraph "Optimized Approach - High Performance"
A2[All URLs] --> B2[Fast Filter 1]
B2 --> C2[Fast Filter 2]
C2 --> D2[Batch Process]
D2 --> E2[Slow Filter 1]
E2 --> F2[Slow Filter 2]
F2 --> G2[Final Results]
D2 --> H2[Concurrent Processing]
H2 --> I2[Semaphore Control]
end
subgraph "Performance Metrics"
J[Processing Time]
K[Memory Usage]
L[CPU Utilization]
M[Network Requests]
end
G1 -.-> J
G1 -.-> K
G1 -.-> L
G1 -.-> M
G2 -.-> J
G2 -.-> K
G2 -.-> L
G2 -.-> M
style A1 fill:#ffcdd2
style G1 fill:#ffcdd2
style A2 fill:#c8e6c9
style G2 fill:#c8e6c9
style H2 fill:#e8f5e8
style I2 fill:#e8f5e8
```
### Composite Scoring Weight Distribution
```mermaid
pie title Composite Scorer Weight Distribution
"Keyword Relevance (30%)" : 30
"Domain Authority (25%)" : 25
"Content Type (20%)" : 20
"Freshness (15%)" : 15
"Path Depth (10%)" : 10
```
### Deep Crawl Integration Architecture
```mermaid
graph TD
subgraph "Deep Crawl Strategy"
A[Start URL] --> B[Extract Links]
B --> C[Apply Filter Chain]
C --> D[Calculate Scores]
D --> E[Priority Queue]
E --> F[Crawl Next URL]
F --> B
end
subgraph "Filter Chain Components"
C --> C1[Domain Filter]
C --> C2[Pattern Filter]
C --> C3[Content Filter]
C --> C4[SEO Filter]
C --> C5[Relevance Filter]
end
subgraph "Scoring Components"
D --> D1[Keyword Scorer]
D --> D2[Depth Scorer]
D --> D3[Freshness Scorer]
D --> D4[Authority Scorer]
D --> D5[Composite Score]
end
subgraph "Queue Management"
E --> E1{Score > Threshold?}
E1 -->|Yes| E2[High Priority Queue]
E1 -->|No| E3[Low Priority Queue]
E2 --> F
E3 --> G[Delayed Processing]
end
subgraph "Control Flow"
H{Max Depth Reached?}
I{Max Pages Reached?}
J[Stop Crawling]
end
F --> H
H -->|No| I
H -->|Yes| J
I -->|No| B
I -->|Yes| J
style A fill:#e3f2fd
style E2 fill:#c8e6c9
style E3 fill:#fff3e0
style J fill:#ffcdd2
```
### Filter Performance Comparison
```mermaid
xychart-beta
title "Filter Performance Comparison (1000 URLs)"
x-axis [Domain, Pattern, ContentType, SEO, Relevance]
y-axis "Processing Time (ms)" 0 --> 1000
bar [50, 80, 45, 300, 800]
```
### Scoring Algorithm Workflow
```mermaid
flowchart TD
A[Input URL] --> B[Parse URL Components]
B --> C[Extract Features]
C --> D[Domain Analysis]
C --> E[Path Analysis]
C --> F[Content Type Detection]
C --> G[Keyword Extraction]
C --> H[Freshness Detection]
D --> I[Domain Authority Score]
E --> J[Path Depth Score]
F --> K[Content Type Score]
G --> L[Keyword Relevance Score]
H --> M[Freshness Score]
I --> N[Apply Weights]
J --> N
K --> N
L --> N
M --> N
N --> O[Normalize Scores]
O --> P[Calculate Final Score]
P --> Q{Score >= Threshold?}
Q -->|Yes| R[Accept for Crawling]
Q -->|No| S[Reject URL]
R --> T[Add to Priority Queue]
S --> U[Log Rejection Reason]
style A fill:#e3f2fd
style P fill:#fff3e0
style R fill:#c8e6c9
style S fill:#ffcdd2
style T fill:#e8f5e8
```
**📖 Learn more:** [Deep Crawling Strategy](https://docs.crawl4ai.com/core/deep-crawling/), [Performance Optimization](https://docs.crawl4ai.com/advanced/performance-tuning/), [Custom Implementations](https://docs.crawl4ai.com/advanced/custom-filters/)

View File

@@ -0,0 +1,428 @@
## Deep Crawling Workflows and Architecture
Visual representations of multi-level website exploration, filtering strategies, and intelligent crawling patterns.
### Deep Crawl Strategy Overview
```mermaid
flowchart TD
A[Start Deep Crawl] --> B{Strategy Selection}
B -->|Explore All Levels| C[BFS Strategy]
B -->|Dive Deep Fast| D[DFS Strategy]
B -->|Smart Prioritization| E[Best-First Strategy]
C --> C1[Breadth-First Search]
C1 --> C2[Process all depth 0 links]
C2 --> C3[Process all depth 1 links]
C3 --> C4[Continue by depth level]
D --> D1[Depth-First Search]
D1 --> D2[Follow first link deeply]
D2 --> D3[Backtrack when max depth reached]
D3 --> D4[Continue with next branch]
E --> E1[Best-First Search]
E1 --> E2[Score all discovered URLs]
E2 --> E3[Process highest scoring URLs first]
E3 --> E4[Continuously re-prioritize queue]
C4 --> F[Apply Filters]
D4 --> F
E4 --> F
F --> G{Filter Chain Processing}
G -->|Domain Filter| G1[Check allowed/blocked domains]
G -->|URL Pattern Filter| G2[Match URL patterns]
G -->|Content Type Filter| G3[Verify content types]
G -->|SEO Filter| G4[Evaluate SEO quality]
G -->|Content Relevance| G5[Score content relevance]
G1 --> H{Passed All Filters?}
G2 --> H
G3 --> H
G4 --> H
G5 --> H
H -->|Yes| I[Add to Crawl Queue]
H -->|No| J[Discard URL]
I --> K{Processing Mode}
K -->|Streaming| L[Process Immediately]
K -->|Batch| M[Collect All Results]
L --> N[Stream Result to User]
M --> O[Return Complete Result Set]
J --> P{More URLs in Queue?}
N --> P
O --> P
P -->|Yes| Q{Within Limits?}
P -->|No| R[Deep Crawl Complete]
Q -->|Max Depth OK| S{Max Pages OK?}
Q -->|Max Depth Exceeded| T[Skip Deeper URLs]
S -->|Under Limit| U[Continue Crawling]
S -->|Limit Reached| R
T --> P
U --> F
style A fill:#e1f5fe
style R fill:#c8e6c9
style C fill:#fff3e0
style D fill:#f3e5f5
style E fill:#e8f5e8
```
### Deep Crawl Strategy Comparison
```mermaid
graph TB
subgraph "BFS - Breadth-First Search"
BFS1[Level 0: Start URL]
BFS2[Level 1: All direct links]
BFS3[Level 2: All second-level links]
BFS4[Level 3: All third-level links]
BFS1 --> BFS2
BFS2 --> BFS3
BFS3 --> BFS4
BFS_NOTE[Complete each depth before going deeper<br/>Good for site mapping<br/>Memory intensive for wide sites]
end
subgraph "DFS - Depth-First Search"
DFS1[Start URL]
DFS2[First Link → Deep]
DFS3[Follow until max depth]
DFS4[Backtrack and try next]
DFS1 --> DFS2
DFS2 --> DFS3
DFS3 --> DFS4
DFS4 --> DFS2
DFS_NOTE[Go deep on first path<br/>Memory efficient<br/>May miss important pages]
end
subgraph "Best-First - Priority Queue"
BF1[Start URL]
BF2[Score all discovered links]
BF3[Process highest scoring first]
BF4[Continuously re-prioritize]
BF1 --> BF2
BF2 --> BF3
BF3 --> BF4
BF4 --> BF2
BF_NOTE[Intelligent prioritization<br/>Finds relevant content fast<br/>Recommended for most use cases]
end
style BFS1 fill:#e3f2fd
style DFS1 fill:#f3e5f5
style BF1 fill:#e8f5e8
style BFS_NOTE fill:#fff3e0
style DFS_NOTE fill:#fff3e0
style BF_NOTE fill:#fff3e0
```
### Filter Chain Processing Sequence
```mermaid
sequenceDiagram
participant URL as Discovered URL
participant Chain as Filter Chain
participant Domain as Domain Filter
participant Pattern as URL Pattern Filter
participant Content as Content Type Filter
participant SEO as SEO Filter
participant Relevance as Content Relevance Filter
participant Queue as Crawl Queue
URL->>Chain: Process URL
Chain->>Domain: Check domain rules
alt Domain Allowed
Domain-->>Chain: ✓ Pass
Chain->>Pattern: Check URL patterns
alt Pattern Matches
Pattern-->>Chain: ✓ Pass
Chain->>Content: Check content type
alt Content Type Valid
Content-->>Chain: ✓ Pass
Chain->>SEO: Evaluate SEO quality
alt SEO Score Above Threshold
SEO-->>Chain: ✓ Pass
Chain->>Relevance: Score content relevance
alt Relevance Score High
Relevance-->>Chain: ✓ Pass
Chain->>Queue: Add to crawl queue
Queue-->>URL: Queued for crawling
else Relevance Score Low
Relevance-->>Chain: ✗ Reject
Chain-->>URL: Filtered out - Low relevance
end
else SEO Score Low
SEO-->>Chain: ✗ Reject
Chain-->>URL: Filtered out - Poor SEO
end
else Invalid Content Type
Content-->>Chain: ✗ Reject
Chain-->>URL: Filtered out - Wrong content type
end
else Pattern Mismatch
Pattern-->>Chain: ✗ Reject
Chain-->>URL: Filtered out - Pattern mismatch
end
else Domain Blocked
Domain-->>Chain: ✗ Reject
Chain-->>URL: Filtered out - Blocked domain
end
```
### URL Lifecycle State Machine
```mermaid
stateDiagram-v2
[*] --> Discovered: Found on page
Discovered --> FilterPending: Enter filter chain
FilterPending --> DomainCheck: Apply domain filter
DomainCheck --> PatternCheck: Domain allowed
DomainCheck --> Rejected: Domain blocked
PatternCheck --> ContentCheck: Pattern matches
PatternCheck --> Rejected: Pattern mismatch
ContentCheck --> SEOCheck: Content type valid
ContentCheck --> Rejected: Invalid content
SEOCheck --> RelevanceCheck: SEO score sufficient
SEOCheck --> Rejected: Poor SEO score
RelevanceCheck --> Scored: Relevance score calculated
RelevanceCheck --> Rejected: Low relevance
Scored --> Queued: Added to priority queue
Queued --> Crawling: Selected for processing
Crawling --> Success: Page crawled successfully
Crawling --> Failed: Crawl failed
Success --> LinkExtraction: Extract new links
LinkExtraction --> [*]: Process complete
Failed --> [*]: Record failure
Rejected --> [*]: Log rejection reason
note right of Scored : Score determines priority<br/>in Best-First strategy
note right of Failed : Errors logged with<br/>depth and reason
```
### Streaming vs Batch Processing Architecture
```mermaid
graph TB
subgraph "Input"
A[Start URL] --> B[Deep Crawl Strategy]
end
subgraph "Crawl Engine"
B --> C[URL Discovery]
C --> D[Filter Chain]
D --> E[Priority Queue]
E --> F[Page Processor]
end
subgraph "Streaming Mode stream=True"
F --> G1[Process Page]
G1 --> H1[Extract Content]
H1 --> I1[Yield Result Immediately]
I1 --> J1[async for result]
J1 --> K1[Real-time Processing]
G1 --> L1[Extract Links]
L1 --> M1[Add to Queue]
M1 --> F
end
subgraph "Batch Mode stream=False"
F --> G2[Process Page]
G2 --> H2[Extract Content]
H2 --> I2[Store Result]
I2 --> N2[Result Collection]
G2 --> L2[Extract Links]
L2 --> M2[Add to Queue]
M2 --> O2{More URLs?}
O2 -->|Yes| F
O2 -->|No| P2[Return All Results]
P2 --> Q2[Batch Processing]
end
style I1 fill:#e8f5e8
style K1 fill:#e8f5e8
style P2 fill:#e3f2fd
style Q2 fill:#e3f2fd
```
### Advanced Scoring and Prioritization System
```mermaid
flowchart LR
subgraph "URL Discovery"
A[Page Links] --> B[Extract URLs]
B --> C[Normalize URLs]
end
subgraph "Scoring System"
C --> D[Keyword Relevance Scorer]
D --> D1[URL Text Analysis]
D --> D2[Keyword Matching]
D --> D3[Calculate Base Score]
D3 --> E[Additional Scoring Factors]
E --> E1[URL Structure weight: 0.2]
E --> E2[Link Context weight: 0.3]
E --> E3[Page Depth Penalty weight: 0.1]
E --> E4[Domain Authority weight: 0.4]
D1 --> F[Combined Score]
D2 --> F
D3 --> F
E1 --> F
E2 --> F
E3 --> F
E4 --> F
end
subgraph "Prioritization"
F --> G{Score Threshold}
G -->|Above Threshold| H[Priority Queue]
G -->|Below Threshold| I[Discard URL]
H --> J[Best-First Selection]
J --> K[Highest Score First]
K --> L[Process Page]
L --> M[Update Scores]
M --> N[Re-prioritize Queue]
N --> J
end
style F fill:#fff3e0
style H fill:#e8f5e8
style L fill:#e3f2fd
```
### Deep Crawl Performance and Limits
```mermaid
graph TD
subgraph "Crawl Constraints"
A[Max Depth: 2] --> A1[Prevents infinite crawling]
B[Max Pages: 50] --> B1[Controls resource usage]
C[Score Threshold: 0.3] --> C1[Quality filtering]
D[Domain Limits] --> D1[Scope control]
end
subgraph "Performance Monitoring"
E[Pages Crawled] --> F[Depth Distribution]
E --> G[Success Rate]
E --> H[Average Score]
E --> I[Processing Time]
F --> J[Performance Report]
G --> J
H --> J
I --> J
end
subgraph "Resource Management"
K[Memory Usage] --> L{Memory Threshold}
L -->|Under Limit| M[Continue Crawling]
L -->|Over Limit| N[Reduce Concurrency]
O[CPU Usage] --> P{CPU Threshold}
P -->|Normal| M
P -->|High| Q[Add Delays]
R[Network Load] --> S{Rate Limits}
S -->|OK| M
S -->|Exceeded| T[Throttle Requests]
end
M --> U[Optimal Performance]
N --> V[Reduced Performance]
Q --> V
T --> V
style U fill:#c8e6c9
style V fill:#fff3e0
style J fill:#e3f2fd
```
### Error Handling and Recovery Flow
```mermaid
sequenceDiagram
participant Strategy as Deep Crawl Strategy
participant Queue as Priority Queue
participant Crawler as Page Crawler
participant Error as Error Handler
participant Result as Result Collector
Strategy->>Queue: Get next URL
Queue-->>Strategy: Return highest priority URL
Strategy->>Crawler: Crawl page
alt Successful Crawl
Crawler-->>Strategy: Return page content
Strategy->>Result: Store successful result
Strategy->>Strategy: Extract new links
Strategy->>Queue: Add new URLs to queue
else Network Error
Crawler-->>Error: Network timeout/failure
Error->>Error: Log error with details
Error->>Queue: Mark URL as failed
Error-->>Strategy: Skip to next URL
else Parse Error
Crawler-->>Error: HTML parsing failed
Error->>Error: Log parse error
Error->>Result: Store failed result
Error-->>Strategy: Continue with next URL
else Rate Limit Hit
Crawler-->>Error: Rate limit exceeded
Error->>Error: Apply backoff strategy
Error->>Queue: Re-queue URL with delay
Error-->>Strategy: Wait before retry
else Depth Limit
Strategy->>Strategy: Check depth constraint
Strategy-->>Queue: Skip URL - too deep
else Page Limit
Strategy->>Strategy: Check page count
Strategy-->>Result: Stop crawling - limit reached
end
Strategy->>Queue: Request next URL
Queue-->>Strategy: More URLs available?
alt Queue Empty
Queue-->>Result: Crawl complete
else Queue Has URLs
Queue-->>Strategy: Continue crawling
end
```
**📖 Learn more:** [Deep Crawling Strategies](https://docs.crawl4ai.com/core/deep-crawling/), [Content Filtering](https://docs.crawl4ai.com/core/content-selection/), [Advanced Crawling Patterns](https://docs.crawl4ai.com/advanced/advanced-features/)

View File

@@ -0,0 +1,603 @@
## Docker Deployment Architecture and Workflows
Visual representations of Crawl4AI Docker deployment, API architecture, configuration management, and service interactions.
### Docker Deployment Decision Flow
```mermaid
flowchart TD
A[Start Docker Deployment] --> B{Deployment Type?}
B -->|Quick Start| C[Pre-built Image]
B -->|Development| D[Docker Compose]
B -->|Custom Build| E[Manual Build]
B -->|Production| F[Production Setup]
C --> C1[docker pull unclecode/crawl4ai]
C1 --> C2{Need LLM Support?}
C2 -->|Yes| C3[Setup .llm.env]
C2 -->|No| C4[Basic run]
C3 --> C5[docker run with --env-file]
C4 --> C6[docker run basic]
D --> D1[git clone repository]
D1 --> D2[cp .llm.env.example .llm.env]
D2 --> D3{Build Type?}
D3 -->|Pre-built| D4[IMAGE=latest docker compose up]
D3 -->|Local Build| D5[docker compose up --build]
D3 -->|All Features| D6[INSTALL_TYPE=all docker compose up]
E --> E1[docker buildx build]
E1 --> E2{Architecture?}
E2 -->|Single| E3[--platform linux/amd64]
E2 -->|Multi| E4[--platform linux/amd64,linux/arm64]
E3 --> E5[Build complete]
E4 --> E5
F --> F1[Production configuration]
F1 --> F2[Custom config.yml]
F2 --> F3[Resource limits]
F3 --> F4[Health monitoring]
F4 --> F5[Production ready]
C5 --> G[Service running on :11235]
C6 --> G
D4 --> G
D5 --> G
D6 --> G
E5 --> H[docker run custom image]
H --> G
F5 --> I[Production deployment]
G --> J[Access playground at /playground]
G --> K[Health check at /health]
I --> L[Production monitoring]
style A fill:#e1f5fe
style G fill:#c8e6c9
style I fill:#c8e6c9
style J fill:#fff3e0
style K fill:#fff3e0
style L fill:#e8f5e8
```
### Docker Container Architecture
```mermaid
graph TB
subgraph "Host Environment"
A[Docker Engine] --> B[Crawl4AI Container]
C[.llm.env] --> B
D[Custom config.yml] --> B
E[Port 11235] --> B
F[Shared Memory 1GB+] --> B
end
subgraph "Container Services"
B --> G[FastAPI Server :8020]
B --> H[Gunicorn WSGI]
B --> I[Supervisord Process Manager]
B --> J[Redis Cache :6379]
G --> K[REST API Endpoints]
G --> L[WebSocket Connections]
G --> M[MCP Protocol]
H --> N[Worker Processes]
I --> O[Service Monitoring]
J --> P[Request Caching]
end
subgraph "Browser Management"
B --> Q[Playwright Framework]
Q --> R[Chromium Browser]
Q --> S[Firefox Browser]
Q --> T[WebKit Browser]
R --> U[Browser Pool]
S --> U
T --> U
U --> V[Page Sessions]
U --> W[Context Management]
end
subgraph "External Services"
X[OpenAI API] -.-> K
Y[Anthropic Claude] -.-> K
Z[Local Ollama] -.-> K
AA[Groq API] -.-> K
BB[Google Gemini] -.-> K
end
subgraph "Client Interactions"
CC[Python SDK] --> K
DD[REST API Calls] --> K
EE[MCP Clients] --> M
FF[Web Browser] --> G
GG[Monitoring Tools] --> K
end
style B fill:#e3f2fd
style G fill:#f3e5f5
style Q fill:#e8f5e8
style K fill:#fff3e0
```
### API Endpoints Architecture
```mermaid
graph LR
subgraph "Core Endpoints"
A["/crawl"] --> A1[Single URL crawl]
A2["/crawl/stream"] --> A3[Streaming multi-URL]
A4["/crawl/job"] --> A5[Async job submission]
A6["/crawl/job/{id}"] --> A7[Job status check]
end
subgraph "Specialized Endpoints"
B["/html"] --> B1[Preprocessed HTML]
B2["/screenshot"] --> B3[PNG capture]
B4["/pdf"] --> B5[PDF generation]
B6["/execute_js"] --> B7[JavaScript execution]
B8["/md"] --> B9[Markdown extraction]
end
subgraph "Utility Endpoints"
C["/health"] --> C1[Service status]
C2["/metrics"] --> C3[Prometheus metrics]
C4["/schema"] --> C5[API documentation]
C6["/playground"] --> C7[Interactive testing]
end
subgraph "LLM Integration"
D["/llm/{url}"] --> D1[Q&A over URL]
D2["/ask"] --> D3[Library context search]
D4["/config/dump"] --> D5[Config validation]
end
subgraph "MCP Protocol"
E["/mcp/sse"] --> E1[Server-Sent Events]
E2["/mcp/ws"] --> E3[WebSocket connection]
E4["/mcp/schema"] --> E5[MCP tool definitions]
end
style A fill:#e3f2fd
style B fill:#f3e5f5
style C fill:#e8f5e8
style D fill:#fff3e0
style E fill:#fce4ec
```
### Request Processing Flow
```mermaid
sequenceDiagram
participant Client
participant FastAPI
participant RequestValidator
participant BrowserPool
participant Playwright
participant ExtractionEngine
participant LLMProvider
Client->>FastAPI: POST /crawl with config
FastAPI->>RequestValidator: Validate JSON structure
alt Valid Request
RequestValidator-->>FastAPI: ✓ Validated
FastAPI->>BrowserPool: Request browser instance
BrowserPool->>Playwright: Launch browser/reuse session
Playwright-->>BrowserPool: Browser ready
BrowserPool-->>FastAPI: Browser allocated
FastAPI->>Playwright: Navigate to URL
Playwright->>Playwright: Execute JS, wait conditions
Playwright-->>FastAPI: Page content ready
FastAPI->>ExtractionEngine: Process content
alt LLM Extraction
ExtractionEngine->>LLMProvider: Send content + schema
LLMProvider-->>ExtractionEngine: Structured data
else CSS Extraction
ExtractionEngine->>ExtractionEngine: Apply CSS selectors
end
ExtractionEngine-->>FastAPI: Extraction complete
FastAPI->>BrowserPool: Release browser
FastAPI-->>Client: CrawlResult response
else Invalid Request
RequestValidator-->>FastAPI: ✗ Validation error
FastAPI-->>Client: 400 Bad Request
end
```
### Configuration Management Flow
```mermaid
stateDiagram-v2
[*] --> ConfigLoading
ConfigLoading --> DefaultConfig: Load default config.yml
ConfigLoading --> CustomConfig: Custom config mounted
ConfigLoading --> EnvOverrides: Environment variables
DefaultConfig --> ConfigMerging
CustomConfig --> ConfigMerging
EnvOverrides --> ConfigMerging
ConfigMerging --> ConfigValidation
ConfigValidation --> Valid: Schema validation passes
ConfigValidation --> Invalid: Validation errors
Invalid --> ConfigError: Log errors and exit
ConfigError --> [*]
Valid --> ServiceInitialization
ServiceInitialization --> FastAPISetup
ServiceInitialization --> BrowserPoolInit
ServiceInitialization --> CacheSetup
FastAPISetup --> Running
BrowserPoolInit --> Running
CacheSetup --> Running
Running --> ConfigReload: Config change detected
ConfigReload --> ConfigValidation
Running --> [*]: Service shutdown
note right of ConfigMerging : Priority: ENV > Custom > Default
note right of ServiceInitialization : All services must initialize successfully
```
### Multi-Architecture Build Process
```mermaid
flowchart TD
A[Developer Push] --> B[GitHub Repository]
B --> C[Docker Buildx]
C --> D{Build Strategy}
D -->|Multi-arch| E[Parallel Builds]
D -->|Single-arch| F[Platform-specific Build]
E --> G[AMD64 Build]
E --> H[ARM64 Build]
F --> I[Target Platform Build]
subgraph "AMD64 Build Process"
G --> G1[Ubuntu base image]
G1 --> G2[Python 3.11 install]
G2 --> G3[System dependencies]
G3 --> G4[Crawl4AI installation]
G4 --> G5[Playwright setup]
G5 --> G6[FastAPI configuration]
G6 --> G7[AMD64 image ready]
end
subgraph "ARM64 Build Process"
H --> H1[Ubuntu ARM64 base]
H1 --> H2[Python 3.11 install]
H2 --> H3[ARM-specific deps]
H3 --> H4[Crawl4AI installation]
H4 --> H5[Playwright setup]
H5 --> H6[FastAPI configuration]
H6 --> H7[ARM64 image ready]
end
subgraph "Single Architecture"
I --> I1[Base image selection]
I1 --> I2[Platform dependencies]
I2 --> I3[Application setup]
I3 --> I4[Platform image ready]
end
G7 --> J[Multi-arch Manifest]
H7 --> J
I4 --> K[Platform Image]
J --> L[Docker Hub Registry]
K --> L
L --> M[Pull Request Auto-selects Architecture]
style A fill:#e1f5fe
style J fill:#c8e6c9
style K fill:#c8e6c9
style L fill:#f3e5f5
style M fill:#e8f5e8
```
### MCP Integration Architecture
```mermaid
graph TB
subgraph "MCP Client Applications"
A[Claude Code] --> B[MCP Protocol]
C[Cursor IDE] --> B
D[Windsurf] --> B
E[Custom MCP Client] --> B
end
subgraph "Crawl4AI MCP Server"
B --> F[MCP Endpoint Router]
F --> G[SSE Transport /mcp/sse]
F --> H[WebSocket Transport /mcp/ws]
F --> I[Schema Endpoint /mcp/schema]
G --> J[MCP Tool Handler]
H --> J
J --> K[Tool: md]
J --> L[Tool: html]
J --> M[Tool: screenshot]
J --> N[Tool: pdf]
J --> O[Tool: execute_js]
J --> P[Tool: crawl]
J --> Q[Tool: ask]
end
subgraph "Crawl4AI Core Services"
K --> R[Markdown Generator]
L --> S[HTML Preprocessor]
M --> T[Screenshot Service]
N --> U[PDF Generator]
O --> V[JavaScript Executor]
P --> W[Batch Crawler]
Q --> X[Context Search]
R --> Y[Browser Pool]
S --> Y
T --> Y
U --> Y
V --> Y
W --> Y
X --> Z[Knowledge Base]
end
subgraph "External Resources"
Y --> AA[Playwright Browsers]
Z --> BB[Library Documentation]
Z --> CC[Code Examples]
AA --> DD[Web Pages]
end
style B fill:#e3f2fd
style J fill:#f3e5f5
style Y fill:#e8f5e8
style Z fill:#fff3e0
```
### API Request/Response Flow Patterns
```mermaid
sequenceDiagram
participant Client
participant LoadBalancer
participant FastAPI
participant ConfigValidator
participant BrowserManager
participant CrawlEngine
participant ResponseBuilder
Note over Client,ResponseBuilder: Basic Crawl Request
Client->>LoadBalancer: POST /crawl
LoadBalancer->>FastAPI: Route request
FastAPI->>ConfigValidator: Validate browser_config
ConfigValidator-->>FastAPI: ✓ Valid BrowserConfig
FastAPI->>ConfigValidator: Validate crawler_config
ConfigValidator-->>FastAPI: ✓ Valid CrawlerRunConfig
FastAPI->>BrowserManager: Allocate browser
BrowserManager-->>FastAPI: Browser instance
FastAPI->>CrawlEngine: Execute crawl
Note over CrawlEngine: Page processing
CrawlEngine->>CrawlEngine: Navigate & wait
CrawlEngine->>CrawlEngine: Extract content
CrawlEngine->>CrawlEngine: Apply strategies
CrawlEngine-->>FastAPI: CrawlResult
FastAPI->>ResponseBuilder: Format response
ResponseBuilder-->>FastAPI: JSON response
FastAPI->>BrowserManager: Release browser
FastAPI-->>LoadBalancer: Response ready
LoadBalancer-->>Client: 200 OK + CrawlResult
Note over Client,ResponseBuilder: Streaming Request
Client->>FastAPI: POST /crawl/stream
FastAPI-->>Client: 200 OK (stream start)
loop For each URL
FastAPI->>CrawlEngine: Process URL
CrawlEngine-->>FastAPI: Result ready
FastAPI-->>Client: NDJSON line
end
FastAPI-->>Client: Stream completed
```
### Configuration Validation Workflow
```mermaid
flowchart TD
A[Client Request] --> B[JSON Payload]
B --> C{Pre-validation}
C -->|✓ Valid JSON| D[Extract Configurations]
C -->|✗ Invalid JSON| E[Return 400 Bad Request]
D --> F[BrowserConfig Validation]
D --> G[CrawlerRunConfig Validation]
F --> H{BrowserConfig Valid?}
G --> I{CrawlerRunConfig Valid?}
H -->|✓ Valid| J[Browser Setup]
H -->|✗ Invalid| K[Log Browser Config Errors]
I -->|✓ Valid| L[Crawler Setup]
I -->|✗ Invalid| M[Log Crawler Config Errors]
K --> N[Collect All Errors]
M --> N
N --> O[Return 422 Validation Error]
J --> P{Both Configs Valid?}
L --> P
P -->|✓ Yes| Q[Proceed to Crawling]
P -->|✗ No| O
Q --> R[Execute Crawl Pipeline]
R --> S[Return CrawlResult]
E --> T[Client Error Response]
O --> T
S --> U[Client Success Response]
style A fill:#e1f5fe
style Q fill:#c8e6c9
style S fill:#c8e6c9
style U fill:#c8e6c9
style E fill:#ffcdd2
style O fill:#ffcdd2
style T fill:#ffcdd2
```
### Production Deployment Architecture
```mermaid
graph TB
subgraph "Load Balancer Layer"
A[NGINX/HAProxy] --> B[Health Check]
A --> C[Request Routing]
A --> D[SSL Termination]
end
subgraph "Application Layer"
C --> E[Crawl4AI Instance 1]
C --> F[Crawl4AI Instance 2]
C --> G[Crawl4AI Instance N]
E --> H[FastAPI Server]
F --> I[FastAPI Server]
G --> J[FastAPI Server]
H --> K[Browser Pool 1]
I --> L[Browser Pool 2]
J --> M[Browser Pool N]
end
subgraph "Shared Services"
N[Redis Cluster] --> E
N --> F
N --> G
O[Monitoring Stack] --> P[Prometheus]
O --> Q[Grafana]
O --> R[AlertManager]
P --> E
P --> F
P --> G
end
subgraph "External Dependencies"
S[OpenAI API] -.-> H
T[Anthropic API] -.-> I
U[Local LLM Cluster] -.-> J
end
subgraph "Persistent Storage"
V[Configuration Volume] --> E
V --> F
V --> G
W[Cache Volume] --> N
X[Logs Volume] --> O
end
style A fill:#e3f2fd
style E fill:#f3e5f5
style F fill:#f3e5f5
style G fill:#f3e5f5
style N fill:#e8f5e8
style O fill:#fff3e0
```
### Docker Resource Management
```mermaid
graph TD
subgraph "Resource Allocation"
A[Host Resources] --> B[CPU Cores]
A --> C[Memory GB]
A --> D[Disk Space]
A --> E[Network Bandwidth]
B --> F[Container Limits]
C --> F
D --> F
E --> F
end
subgraph "Container Configuration"
F --> G["--cpus=4"]
F --> H["--memory=8g"]
F --> I["--shm-size=2g"]
F --> J[Volume Mounts]
G --> K[Browser Processes]
H --> L[Browser Memory]
I --> M[Shared Memory for Browsers]
J --> N[Config & Cache Storage]
end
subgraph "Monitoring & Scaling"
O[Resource Monitor] --> P[CPU Usage %]
O --> Q[Memory Usage %]
O --> R[Request Queue Length]
P --> S{CPU > 80%?}
Q --> T{Memory > 90%?}
R --> U{Queue > 100?}
S -->|Yes| V[Scale Up]
T -->|Yes| V
U -->|Yes| V
V --> W[Add Container Instance]
W --> X[Update Load Balancer]
end
subgraph "Performance Optimization"
Y[Browser Pool Tuning] --> Z[Max Pages: 40]
Y --> AA[Idle TTL: 30min]
Y --> BB[Concurrency Limits]
Z --> CC[Memory Efficiency]
AA --> DD[Resource Cleanup]
BB --> EE[Throughput Control]
end
style A fill:#e1f5fe
style F fill:#f3e5f5
style O fill:#e8f5e8
style Y fill:#fff3e0
```
**📖 Learn more:** [Docker Deployment Guide](https://docs.crawl4ai.com/core/docker-deployment/), [API Reference](https://docs.crawl4ai.com/api/), [MCP Integration](https://docs.crawl4ai.com/core/docker-deployment/#mcp-model-context-protocol-support), [Production Configuration](https://docs.crawl4ai.com/core/docker-deployment/#production-deployment)

View File

@@ -0,0 +1,478 @@
## Extraction Strategy Workflows and Architecture
Visual representations of Crawl4AI's data extraction approaches, strategy selection, and processing workflows.
### Extraction Strategy Decision Tree
```mermaid
flowchart TD
A[Content to Extract] --> B{Content Type?}
B -->|Simple Patterns| C[Common Data Types]
B -->|Structured HTML| D[Predictable Structure]
B -->|Complex Content| E[Requires Reasoning]
B -->|Mixed Content| F[Multiple Data Types]
C --> C1{Pattern Type?}
C1 -->|Email, Phone, URLs| C2[Built-in Regex Patterns]
C1 -->|Custom Patterns| C3[Custom Regex Strategy]
C1 -->|LLM-Generated| C4[One-time Pattern Generation]
D --> D1{Selector Type?}
D1 -->|CSS Selectors| D2[JsonCssExtractionStrategy]
D1 -->|XPath Expressions| D3[JsonXPathExtractionStrategy]
D1 -->|Need Schema?| D4[Auto-generate Schema with LLM]
E --> E1{LLM Provider?}
E1 -->|OpenAI/Anthropic| E2[Cloud LLM Strategy]
E1 -->|Local Ollama| E3[Local LLM Strategy]
E1 -->|Cost-sensitive| E4[Hybrid: Generate Schema Once]
F --> F1[Multi-Strategy Approach]
F1 --> F2[1. Regex for Patterns]
F1 --> F3[2. CSS for Structure]
F1 --> F4[3. LLM for Complex Analysis]
C2 --> G[Fast Extraction ⚡]
C3 --> G
C4 --> H[Cached Pattern Reuse]
D2 --> I[Schema-based Extraction 🏗️]
D3 --> I
D4 --> J[Generated Schema Cache]
E2 --> K[Intelligent Parsing 🧠]
E3 --> K
E4 --> L[Hybrid Cost-Effective]
F2 --> M[Comprehensive Results 📊]
F3 --> M
F4 --> M
style G fill:#c8e6c9
style I fill:#e3f2fd
style K fill:#fff3e0
style M fill:#f3e5f5
style H fill:#e8f5e8
style J fill:#e8f5e8
style L fill:#ffecb3
```
### LLM Extraction Strategy Workflow
```mermaid
sequenceDiagram
participant User
participant Crawler
participant LLMStrategy
participant Chunker
participant LLMProvider
participant Parser
User->>Crawler: Configure LLMExtractionStrategy
User->>Crawler: arun(url, config)
Crawler->>Crawler: Navigate to URL
Crawler->>Crawler: Extract content (HTML/Markdown)
Crawler->>LLMStrategy: Process content
LLMStrategy->>LLMStrategy: Check content size
alt Content > chunk_threshold
LLMStrategy->>Chunker: Split into chunks with overlap
Chunker-->>LLMStrategy: Return chunks[]
loop For each chunk
LLMStrategy->>LLMProvider: Send chunk + schema + instruction
LLMProvider-->>LLMStrategy: Return structured JSON
end
LLMStrategy->>LLMStrategy: Merge chunk results
else Content <= chunk_threshold
LLMStrategy->>LLMProvider: Send full content + schema
LLMProvider-->>LLMStrategy: Return structured JSON
end
LLMStrategy->>Parser: Validate JSON schema
Parser-->>LLMStrategy: Validated data
LLMStrategy->>LLMStrategy: Track token usage
LLMStrategy-->>Crawler: Return extracted_content
Crawler-->>User: CrawlResult with JSON data
User->>LLMStrategy: show_usage()
LLMStrategy-->>User: Token count & estimated cost
```
### Schema-Based Extraction Architecture
```mermaid
graph TB
subgraph "Schema Definition"
A[JSON Schema] --> A1[baseSelector]
A --> A2["fields[]"]
A --> A3[nested structures]
A2 --> A4[CSS/XPath selectors]
A2 --> A5[Data types: text, html, attribute]
A2 --> A6[Default values]
A3 --> A7[nested objects]
A3 --> A8[nested_list arrays]
A3 --> A9[simple lists]
end
subgraph "Extraction Engine"
B[HTML Content] --> C[Selector Engine]
C --> C1[CSS Selector Parser]
C --> C2[XPath Evaluator]
C1 --> D[Element Matcher]
C2 --> D
D --> E[Type Converter]
E --> E1[Text Extraction]
E --> E2[HTML Preservation]
E --> E3[Attribute Extraction]
E --> E4[Nested Processing]
end
subgraph "Result Processing"
F[Raw Extracted Data] --> G[Structure Builder]
G --> G1[Object Construction]
G --> G2[Array Assembly]
G --> G3[Type Validation]
G1 --> H[JSON Output]
G2 --> H
G3 --> H
end
A --> C
E --> F
H --> I[extracted_content]
style A fill:#e3f2fd
style C fill:#f3e5f5
style G fill:#e8f5e8
style H fill:#c8e6c9
```
### Automatic Schema Generation Process
```mermaid
stateDiagram-v2
[*] --> CheckCache
CheckCache --> CacheHit: Schema exists
CheckCache --> SamplePage: Schema missing
CacheHit --> LoadSchema
LoadSchema --> FastExtraction
SamplePage --> ExtractHTML: Crawl sample URL
ExtractHTML --> LLMAnalysis: Send HTML to LLM
LLMAnalysis --> GenerateSchema: Create CSS/XPath selectors
GenerateSchema --> ValidateSchema: Test generated schema
ValidateSchema --> SchemaWorks: Valid selectors
ValidateSchema --> RefineSchema: Invalid selectors
RefineSchema --> LLMAnalysis: Iterate with feedback
SchemaWorks --> CacheSchema: Save for reuse
CacheSchema --> FastExtraction: Use cached schema
FastExtraction --> [*]: No more LLM calls needed
note right of CheckCache : One-time LLM cost
note right of FastExtraction : Unlimited fast reuse
note right of CacheSchema : JSON file storage
```
### Multi-Strategy Extraction Pipeline
```mermaid
flowchart LR
A[Web Page Content] --> B[Strategy Pipeline]
subgraph B["Extraction Pipeline"]
B1[Stage 1: Regex Patterns]
B2[Stage 2: Schema-based CSS]
B3[Stage 3: LLM Analysis]
B1 --> B1a[Email addresses]
B1 --> B1b[Phone numbers]
B1 --> B1c[URLs and links]
B1 --> B1d[Currency amounts]
B2 --> B2a[Structured products]
B2 --> B2b[Article metadata]
B2 --> B2c[User reviews]
B2 --> B2d[Navigation links]
B3 --> B3a[Sentiment analysis]
B3 --> B3b[Key topics]
B3 --> B3c[Entity recognition]
B3 --> B3d[Content summary]
end
B1a --> C[Result Merger]
B1b --> C
B1c --> C
B1d --> C
B2a --> C
B2b --> C
B2c --> C
B2d --> C
B3a --> C
B3b --> C
B3c --> C
B3d --> C
C --> D[Combined JSON Output]
D --> E[Final CrawlResult]
style B1 fill:#c8e6c9
style B2 fill:#e3f2fd
style B3 fill:#fff3e0
style C fill:#f3e5f5
```
### Performance Comparison Matrix
```mermaid
graph TD
subgraph "Strategy Performance"
A[Extraction Strategy Comparison]
subgraph "Speed ⚡"
S1[Regex: ~10ms]
S2[CSS Schema: ~50ms]
S3[XPath: ~100ms]
S4[LLM: ~2-10s]
end
subgraph "Accuracy 🎯"
A1[Regex: Pattern-dependent]
A2[CSS: High for structured]
A3[XPath: Very high]
A4[LLM: Excellent for complex]
end
subgraph "Cost 💰"
C1[Regex: Free]
C2[CSS: Free]
C3[XPath: Free]
C4[LLM: $0.001-0.01 per page]
end
subgraph "Complexity 🔧"
X1[Regex: Simple patterns only]
X2[CSS: Structured HTML]
X3[XPath: Complex selectors]
X4[LLM: Any content type]
end
end
style S1 fill:#c8e6c9
style S2 fill:#e8f5e8
style S3 fill:#fff3e0
style S4 fill:#ffcdd2
style A2 fill:#e8f5e8
style A3 fill:#c8e6c9
style A4 fill:#c8e6c9
style C1 fill:#c8e6c9
style C2 fill:#c8e6c9
style C3 fill:#c8e6c9
style C4 fill:#fff3e0
style X1 fill:#ffcdd2
style X2 fill:#e8f5e8
style X3 fill:#c8e6c9
style X4 fill:#c8e6c9
```
### Regex Pattern Strategy Flow
```mermaid
flowchart TD
A[Regex Extraction] --> B{Pattern Source?}
B -->|Built-in| C[Use Predefined Patterns]
B -->|Custom| D[Define Custom Regex]
B -->|LLM-Generated| E[Generate with AI]
C --> C1[Email Pattern]
C --> C2[Phone Pattern]
C --> C3[URL Pattern]
C --> C4[Currency Pattern]
C --> C5[Date Pattern]
D --> D1[Write Custom Regex]
D --> D2[Test Pattern]
D --> D3{Pattern Works?}
D3 -->|No| D1
D3 -->|Yes| D4[Use Pattern]
E --> E1[Provide Sample Content]
E --> E2[LLM Analyzes Content]
E --> E3[Generate Optimized Regex]
E --> E4[Cache Pattern for Reuse]
C1 --> F[Pattern Matching]
C2 --> F
C3 --> F
C4 --> F
C5 --> F
D4 --> F
E4 --> F
F --> G[Extract Matches]
G --> H[Group by Pattern Type]
H --> I[JSON Output with Labels]
style C fill:#e8f5e8
style D fill:#e3f2fd
style E fill:#fff3e0
style F fill:#f3e5f5
```
### Complex Schema Structure Visualization
```mermaid
graph TB
subgraph "E-commerce Schema Example"
A[Category baseSelector] --> B[Category Fields]
A --> C[Products nested_list]
B --> B1[category_name]
B --> B2[category_id attribute]
B --> B3[category_url attribute]
C --> C1[Product baseSelector]
C1 --> C2[name text]
C1 --> C3[price text]
C1 --> C4[Details nested object]
C1 --> C5[Features list]
C1 --> C6[Reviews nested_list]
C4 --> C4a[brand text]
C4 --> C4b[model text]
C4 --> C4c[specs html]
C5 --> C5a[feature text array]
C6 --> C6a[reviewer text]
C6 --> C6b[rating attribute]
C6 --> C6c[comment text]
C6 --> C6d[date attribute]
end
subgraph "JSON Output Structure"
D[categories array] --> D1[category object]
D1 --> D2[category_name]
D1 --> D3[category_id]
D1 --> D4[products array]
D4 --> D5[product object]
D5 --> D6[name, price]
D5 --> D7[details object]
D5 --> D8[features array]
D5 --> D9[reviews array]
D7 --> D7a[brand, model, specs]
D8 --> D8a[feature strings]
D9 --> D9a[review objects]
end
A -.-> D
B1 -.-> D2
C2 -.-> D6
C4 -.-> D7
C5 -.-> D8
C6 -.-> D9
style A fill:#e3f2fd
style C fill:#f3e5f5
style C4 fill:#e8f5e8
style D fill:#fff3e0
```
### Error Handling and Fallback Strategy
```mermaid
stateDiagram-v2
[*] --> PrimaryStrategy
PrimaryStrategy --> Success: Extraction successful
PrimaryStrategy --> ValidationFailed: Invalid data
PrimaryStrategy --> ExtractionFailed: No matches found
PrimaryStrategy --> TimeoutError: LLM timeout
ValidationFailed --> FallbackStrategy: Try alternative
ExtractionFailed --> FallbackStrategy: Try alternative
TimeoutError --> FallbackStrategy: Try alternative
FallbackStrategy --> FallbackSuccess: Fallback works
FallbackStrategy --> FallbackFailed: All strategies failed
FallbackSuccess --> Success: Return results
FallbackFailed --> ErrorReport: Log failure details
Success --> [*]: Complete
ErrorReport --> [*]: Return empty results
note right of PrimaryStrategy : Try fastest/most accurate first
note right of FallbackStrategy : Use simpler but reliable method
note left of ErrorReport : Provide debugging information
```
### Token Usage and Cost Optimization
```mermaid
flowchart TD
A[LLM Extraction Request] --> B{Content Size Check}
B -->|Small < 1200 tokens| C[Single LLM Call]
B -->|Large >= 1200 tokens| D[Chunking Strategy]
C --> C1[Send full content]
C1 --> C2[Parse JSON response]
C2 --> C3[Track token usage]
D --> D1[Split into chunks]
D1 --> D2[Add overlap between chunks]
D2 --> D3[Process chunks in parallel]
D3 --> D4[Chunk 1 → LLM]
D3 --> D5[Chunk 2 → LLM]
D3 --> D6[Chunk N → LLM]
D4 --> D7[Merge results]
D5 --> D7
D6 --> D7
D7 --> D8[Deduplicate data]
D8 --> D9[Aggregate token usage]
C3 --> E[Cost Calculation]
D9 --> E
E --> F[Usage Report]
F --> F1[Prompt tokens: X]
F --> F2[Completion tokens: Y]
F --> F3[Total cost: $Z]
style C fill:#c8e6c9
style D fill:#fff3e0
style E fill:#e3f2fd
style F fill:#f3e5f5
```
**📖 Learn more:** [LLM Strategies](https://docs.crawl4ai.com/extraction/llm-strategies/), [Schema-Based Extraction](https://docs.crawl4ai.com/extraction/no-llm-strategies/), [Pattern Matching](https://docs.crawl4ai.com/extraction/no-llm-strategies/#regexextractionstrategy), [Performance Optimization](https://docs.crawl4ai.com/advanced/multi-url-crawling/)

View File

@@ -0,0 +1,472 @@
## HTTP Crawler Strategy Workflows
Visual representations of HTTP-based crawling architecture, request flows, and performance characteristics compared to browser-based strategies.
### HTTP vs Browser Strategy Decision Tree
```mermaid
flowchart TD
A[Content Crawling Need] --> B{Content Type Analysis}
B -->|Static HTML| C{JavaScript Required?}
B -->|Dynamic SPA| D[Browser Strategy Required]
B -->|API Endpoints| E[HTTP Strategy Optimal]
B -->|Mixed Content| F{Primary Content Source?}
C -->|No JS Needed| G[HTTP Strategy Recommended]
C -->|JS Required| H[Browser Strategy Required]
C -->|Unknown| I{Performance Priority?}
I -->|Speed Critical| J[Try HTTP First]
I -->|Accuracy Critical| K[Use Browser Strategy]
F -->|Mostly Static| G
F -->|Mostly Dynamic| D
G --> L{Resource Constraints?}
L -->|Memory Limited| M[HTTP Strategy - Lightweight]
L -->|CPU Limited| N[HTTP Strategy - No Browser]
L -->|Network Limited| O[HTTP Strategy - Efficient]
L -->|No Constraints| P[Either Strategy Works]
J --> Q[Test HTTP Results]
Q --> R{Content Complete?}
R -->|Yes| S[Continue with HTTP]
R -->|No| T[Switch to Browser Strategy]
D --> U[Browser Strategy Features]
H --> U
K --> U
T --> U
U --> V[JavaScript Execution]
U --> W[Screenshots/PDFs]
U --> X[Complex Interactions]
U --> Y[Session Management]
M --> Z[HTTP Strategy Benefits]
N --> Z
O --> Z
S --> Z
Z --> AA[10x Faster Processing]
Z --> BB[Lower Memory Usage]
Z --> CC[Higher Concurrency]
Z --> DD[Simpler Deployment]
style G fill:#c8e6c9
style M fill:#c8e6c9
style N fill:#c8e6c9
style O fill:#c8e6c9
style S fill:#c8e6c9
style D fill:#e3f2fd
style H fill:#e3f2fd
style K fill:#e3f2fd
style T fill:#e3f2fd
style U fill:#e3f2fd
```
### HTTP Request Lifecycle Sequence
```mermaid
sequenceDiagram
participant Client
participant HTTPStrategy as HTTP Strategy
participant Session as HTTP Session
participant Server as Target Server
participant Processor as Content Processor
Client->>HTTPStrategy: crawl(url, config)
HTTPStrategy->>HTTPStrategy: validate_url()
alt file:// URL
HTTPStrategy->>HTTPStrategy: handle_file_url()
Note over HTTPStrategy: file:// URLs
else raw:// content
HTTPStrategy->>HTTPStrategy: handle_raw_content()
Note over HTTPStrategy: raw:// content
else http(s):// URL
HTTPStrategy->>Session: prepare_request()
Session->>Session: apply_config()
Session->>Session: set_headers()
Session->>Session: setup_auth()
Session->>Server: HTTP Request
Note over Session,Server: GET/POST/PUT with headers
alt Success Response
Server-->>Session: HTTP 200 + Content
Session-->>HTTPStrategy: response_data
else Redirect Response
Server-->>Session: HTTP 3xx + Location
Session->>Server: Follow redirect
Server-->>Session: HTTP 200 + Content
Session-->>HTTPStrategy: final_response
else Error Response
Server-->>Session: HTTP 4xx/5xx
Session-->>HTTPStrategy: error_response
end
end
HTTPStrategy->>Processor: process_content()
Processor->>Processor: clean_html()
Processor->>Processor: extract_metadata()
Processor->>Processor: generate_markdown()
Processor-->>HTTPStrategy: processed_result
HTTPStrategy-->>Client: CrawlResult
Note over Client,Processor: Fast, lightweight processing
Note over HTTPStrategy: No browser overhead
```
### HTTP Strategy Architecture
```mermaid
graph TB
subgraph "HTTP Crawler Strategy"
A[AsyncHTTPCrawlerStrategy] --> B[Session Manager]
A --> C[Request Builder]
A --> D[Response Handler]
A --> E[Error Manager]
B --> B1[Connection Pool]
B --> B2[DNS Cache]
B --> B3[SSL Context]
C --> C1[Headers Builder]
C --> C2[Auth Handler]
C --> C3[Payload Encoder]
D --> D1[Content Decoder]
D --> D2[Redirect Handler]
D --> D3[Status Validator]
E --> E1[Retry Logic]
E --> E2[Timeout Handler]
E --> E3[Exception Mapper]
end
subgraph "Content Processing"
F[Raw HTML] --> G[HTML Cleaner]
G --> H[Markdown Generator]
H --> I[Link Extractor]
I --> J[Media Extractor]
J --> K[Metadata Parser]
end
subgraph "External Resources"
L[Target Websites]
M[Local Files]
N[Raw Content]
end
subgraph "Output"
O[CrawlResult]
O --> O1[HTML Content]
O --> O2[Markdown Text]
O --> O3[Extracted Links]
O --> O4[Media References]
O --> O5[Status Information]
end
A --> F
L --> A
M --> A
N --> A
K --> O
style A fill:#e3f2fd
style B fill:#f3e5f5
style F fill:#e8f5e8
style O fill:#fff3e0
```
### Performance Comparison Flow
```mermaid
graph LR
subgraph "HTTP Strategy Performance"
A1[Request Start] --> A2[DNS Lookup: 50ms]
A2 --> A3[TCP Connect: 100ms]
A3 --> A4[HTTP Request: 200ms]
A4 --> A5[Content Download: 300ms]
A5 --> A6[Processing: 50ms]
A6 --> A7[Total: ~700ms]
end
subgraph "Browser Strategy Performance"
B1[Request Start] --> B2[Browser Launch: 2000ms]
B2 --> B3[Page Navigation: 1000ms]
B3 --> B4[JS Execution: 500ms]
B4 --> B5[Content Rendering: 300ms]
B5 --> B6[Processing: 100ms]
B6 --> B7[Total: ~3900ms]
end
subgraph "Resource Usage"
C1[HTTP Memory: ~50MB]
C2[Browser Memory: ~500MB]
C3[HTTP CPU: Low]
C4[Browser CPU: High]
C5[HTTP Concurrency: 100+]
C6[Browser Concurrency: 10-20]
end
A7 --> D[5.5x Faster]
B7 --> D
C1 --> E[10x Less Memory]
C2 --> E
C5 --> F[5x More Concurrent]
C6 --> F
style A7 fill:#c8e6c9
style B7 fill:#ffcdd2
style C1 fill:#c8e6c9
style C2 fill:#ffcdd2
style C5 fill:#c8e6c9
style C6 fill:#ffcdd2
```
### HTTP Request Types and Configuration
```mermaid
stateDiagram-v2
[*] --> HTTPConfigSetup
HTTPConfigSetup --> MethodSelection
MethodSelection --> GET: Simple data retrieval
MethodSelection --> POST: Form submission
MethodSelection --> PUT: Data upload
MethodSelection --> DELETE: Resource removal
GET --> HeaderSetup: Set Accept headers
POST --> PayloadSetup: JSON or form data
PUT --> PayloadSetup: File or data upload
DELETE --> AuthSetup: Authentication required
PayloadSetup --> JSONPayload: application/json
PayloadSetup --> FormPayload: form-data
PayloadSetup --> RawPayload: custom content
JSONPayload --> HeaderSetup
FormPayload --> HeaderSetup
RawPayload --> HeaderSetup
HeaderSetup --> AuthSetup
AuthSetup --> SSLSetup
SSLSetup --> RedirectSetup
RedirectSetup --> RequestExecution
RequestExecution --> [*]: Request complete
note right of GET : Default method for most crawling
note right of POST : API interactions, form submissions
note right of JSONPayload : Structured data transmission
note right of HeaderSetup : User-Agent, Accept, Custom headers
```
### Error Handling and Retry Workflow
```mermaid
flowchart TD
A[HTTP Request] --> B{Response Received?}
B -->|No| C[Connection Error]
B -->|Yes| D{Status Code Check}
C --> C1{Timeout Error?}
C1 -->|Yes| C2[ConnectionTimeoutError]
C1 -->|No| C3[Network Error]
D -->|2xx| E[Success Response]
D -->|3xx| F[Redirect Response]
D -->|4xx| G[Client Error]
D -->|5xx| H[Server Error]
F --> F1{Follow Redirects?}
F1 -->|Yes| F2[Follow Redirect]
F1 -->|No| F3[Return Redirect Response]
F2 --> A
G --> G1{Retry on 4xx?}
G1 -->|No| G2[HTTPStatusError]
G1 -->|Yes| I[Check Retry Count]
H --> H1{Retry on 5xx?}
H1 -->|Yes| I
H1 -->|No| H2[HTTPStatusError]
C2 --> I
C3 --> I
I --> J{Retries < Max?}
J -->|No| K[Final Error]
J -->|Yes| L[Calculate Backoff]
L --> M[Wait Backoff Time]
M --> N[Increment Retry Count]
N --> A
E --> O[Process Content]
F3 --> O
O --> P[Return CrawlResult]
G2 --> Q[Error CrawlResult]
H2 --> Q
K --> Q
style E fill:#c8e6c9
style P fill:#c8e6c9
style G2 fill:#ffcdd2
style H2 fill:#ffcdd2
style K fill:#ffcdd2
style Q fill:#ffcdd2
```
### Batch Processing Architecture
```mermaid
sequenceDiagram
participant Client
participant BatchManager as Batch Manager
participant HTTPPool as Connection Pool
participant Workers as HTTP Workers
participant Targets as Target Servers
Client->>BatchManager: batch_crawl(urls)
BatchManager->>BatchManager: create_semaphore(max_concurrent)
loop For each URL batch
BatchManager->>HTTPPool: acquire_connection()
HTTPPool->>Workers: assign_worker()
par Concurrent Processing
Workers->>Targets: HTTP Request 1
Workers->>Targets: HTTP Request 2
Workers->>Targets: HTTP Request N
end
par Response Handling
Targets-->>Workers: Response 1
Targets-->>Workers: Response 2
Targets-->>Workers: Response N
end
Workers->>HTTPPool: return_connection()
HTTPPool->>BatchManager: batch_results()
end
BatchManager->>BatchManager: aggregate_results()
BatchManager-->>Client: final_results()
Note over Workers,Targets: 20-100 concurrent connections
Note over BatchManager: Memory-efficient processing
Note over HTTPPool: Connection reuse optimization
```
### Content Type Processing Pipeline
```mermaid
graph TD
A[HTTP Response] --> B{Content-Type Detection}
B -->|text/html| C[HTML Processing]
B -->|application/json| D[JSON Processing]
B -->|text/plain| E[Text Processing]
B -->|application/xml| F[XML Processing]
B -->|Other| G[Binary Processing]
C --> C1[Parse HTML Structure]
C1 --> C2[Extract Text Content]
C2 --> C3[Generate Markdown]
C3 --> C4[Extract Links/Media]
D --> D1[Parse JSON Structure]
D1 --> D2[Extract Data Fields]
D2 --> D3[Format as Readable Text]
E --> E1[Clean Text Content]
E1 --> E2[Basic Formatting]
F --> F1[Parse XML Structure]
F1 --> F2[Extract Text Nodes]
F2 --> F3[Convert to Markdown]
G --> G1[Save Binary Content]
G1 --> G2[Generate Metadata]
C4 --> H[Content Analysis]
D3 --> H
E2 --> H
F3 --> H
G2 --> H
H --> I[Link Extraction]
H --> J[Media Detection]
H --> K[Metadata Parsing]
I --> L[CrawlResult Assembly]
J --> L
K --> L
L --> M[Final Output]
style C fill:#e8f5e8
style H fill:#fff3e0
style L fill:#e3f2fd
style M fill:#c8e6c9
```
### Integration with Processing Strategies
```mermaid
graph LR
subgraph "HTTP Strategy Core"
A[HTTP Request] --> B[Raw Content]
B --> C[Content Decoder]
end
subgraph "Processing Pipeline"
C --> D[HTML Cleaner]
D --> E[Markdown Generator]
E --> F{Content Filter?}
F -->|Yes| G[Pruning Filter]
F -->|Yes| H[BM25 Filter]
F -->|No| I[Raw Markdown]
G --> J[Fit Markdown]
H --> J
end
subgraph "Extraction Strategies"
I --> K[CSS Extraction]
J --> K
I --> L[XPath Extraction]
J --> L
I --> M[LLM Extraction]
J --> M
end
subgraph "Output Generation"
K --> N[Structured JSON]
L --> N
M --> N
I --> O[Clean Markdown]
J --> P[Filtered Content]
N --> Q[Final CrawlResult]
O --> Q
P --> Q
end
style A fill:#e3f2fd
style C fill:#f3e5f5
style E fill:#e8f5e8
style Q fill:#c8e6c9
```
**📖 Learn more:** [HTTP vs Browser Strategies](https://docs.crawl4ai.com/core/browser-crawler-config/), [Performance Optimization](https://docs.crawl4ai.com/advanced/multi-url-crawling/), [Error Handling](https://docs.crawl4ai.com/api/async-webcrawler/)

View File

@@ -0,0 +1,368 @@
## Installation Workflows and Architecture
Visual representations of Crawl4AI installation processes, deployment options, and system interactions.
### Installation Decision Flow
```mermaid
flowchart TD
A[Start Installation] --> B{Environment Type?}
B -->|Local Development| C[Basic Python Install]
B -->|Production| D[Docker Deployment]
B -->|Research/Testing| E[Google Colab]
B -->|CI/CD Pipeline| F[Automated Setup]
C --> C1[pip install crawl4ai]
C1 --> C2[crawl4ai-setup]
C2 --> C3{Need Advanced Features?}
C3 -->|No| C4[Basic Installation Complete]
C3 -->|Text Clustering| C5[pip install crawl4ai with torch]
C3 -->|Transformers| C6[pip install crawl4ai with transformer]
C3 -->|All Features| C7[pip install crawl4ai with all]
C5 --> C8[crawl4ai-download-models]
C6 --> C8
C7 --> C8
C8 --> C9[Advanced Installation Complete]
D --> D1{Deployment Method?}
D1 -->|Pre-built Image| D2[docker pull unclecode/crawl4ai]
D1 -->|Docker Compose| D3[Clone repo + docker compose]
D1 -->|Custom Build| D4[docker buildx build]
D2 --> D5[Configure .llm.env]
D3 --> D5
D4 --> D5
D5 --> D6[docker run with ports]
D6 --> D7[Docker Deployment Complete]
E --> E1[Colab pip install]
E1 --> E2[playwright install chromium]
E2 --> E3[Test basic crawl]
E3 --> E4[Colab Setup Complete]
F --> F1[Automated pip install]
F1 --> F2[Automated setup scripts]
F2 --> F3[CI/CD Integration Complete]
C4 --> G[Verify with crawl4ai-doctor]
C9 --> G
D7 --> H[Health check via API]
E4 --> I[Run test crawl]
F3 --> G
G --> J[Installation Verified]
H --> J
I --> J
style A fill:#e1f5fe
style J fill:#c8e6c9
style C4 fill:#fff3e0
style C9 fill:#fff3e0
style D7 fill:#f3e5f5
style E4 fill:#fce4ec
style F3 fill:#e8f5e8
```
### Basic Installation Sequence
```mermaid
sequenceDiagram
participant User
participant PyPI
participant System
participant Playwright
participant Crawler
User->>PyPI: pip install crawl4ai
PyPI-->>User: Package downloaded
User->>System: crawl4ai-setup
System->>Playwright: Install browser binaries
Playwright-->>System: Chromium, Firefox installed
System-->>User: Setup complete
User->>System: crawl4ai-doctor
System->>System: Check Python version
System->>System: Verify Playwright installation
System->>System: Test browser launch
System-->>User: Diagnostics report
User->>Crawler: Basic crawl test
Crawler->>Playwright: Launch browser
Playwright-->>Crawler: Browser ready
Crawler->>Crawler: Navigate to test URL
Crawler-->>User: Success confirmation
```
### Docker Deployment Architecture
```mermaid
graph TB
subgraph "Host System"
A[Docker Engine] --> B[Crawl4AI Container]
C[.llm.env File] --> B
D[Port 11235] --> B
end
subgraph "Container Environment"
B --> E[FastAPI Server]
B --> F[Playwright Browsers]
B --> G[Python Runtime]
E --> H["/crawl Endpoint"]
E --> I["/playground Interface"]
E --> J["/health Monitoring"]
E --> K["/metrics Prometheus"]
F --> L[Chromium Browser]
F --> M[Firefox Browser]
F --> N[WebKit Browser]
end
subgraph "External Services"
O[OpenAI API] --> B
P[Anthropic API] --> B
Q[Local LLM Ollama] --> B
end
subgraph "Client Applications"
R[Python SDK] --> H
S[REST API Calls] --> H
T[Web Browser] --> I
U[Monitoring Tools] --> J
V[Prometheus] --> K
end
style B fill:#e3f2fd
style E fill:#f3e5f5
style F fill:#e8f5e8
style G fill:#fff3e0
```
### Advanced Features Installation Flow
```mermaid
stateDiagram-v2
[*] --> BasicInstall
BasicInstall --> FeatureChoice: crawl4ai installed
FeatureChoice --> TorchInstall: Need text clustering
FeatureChoice --> TransformerInstall: Need HuggingFace models
FeatureChoice --> AllInstall: Need everything
FeatureChoice --> Complete: Basic features sufficient
TorchInstall --> TorchSetup: pip install crawl4ai with torch
TransformerInstall --> TransformerSetup: pip install crawl4ai with transformer
AllInstall --> AllSetup: pip install crawl4ai with all
TorchSetup --> ModelDownload: crawl4ai-setup
TransformerSetup --> ModelDownload: crawl4ai-setup
AllSetup --> ModelDownload: crawl4ai-setup
ModelDownload --> PreDownload: crawl4ai-download-models
PreDownload --> Complete: All models cached
Complete --> Verification: crawl4ai-doctor
Verification --> [*]: Installation verified
note right of TorchInstall : PyTorch for semantic operations
note right of TransformerInstall : HuggingFace for LLM features
note right of AllInstall : Complete feature set
```
### Platform-Specific Installation Matrix
```mermaid
graph LR
subgraph "Installation Methods"
A[Python Package] --> A1[pip install]
B[Docker Image] --> B1[docker pull]
C[Source Build] --> C1[git clone + build]
D[Cloud Platform] --> D1[Colab/Kaggle]
end
subgraph "Operating Systems"
E[Linux x86_64]
F[Linux ARM64]
G[macOS Intel]
H[macOS Apple Silicon]
I[Windows x86_64]
end
subgraph "Feature Sets"
J[Basic crawling]
K[Text clustering torch]
L[LLM transformers]
M[All features]
end
A1 --> E
A1 --> F
A1 --> G
A1 --> H
A1 --> I
B1 --> E
B1 --> F
B1 --> G
B1 --> H
C1 --> E
C1 --> F
C1 --> G
C1 --> H
C1 --> I
D1 --> E
D1 --> I
E --> J
E --> K
E --> L
E --> M
F --> J
F --> K
F --> L
F --> M
G --> J
G --> K
G --> L
G --> M
H --> J
H --> K
H --> L
H --> M
I --> J
I --> K
I --> L
I --> M
style A1 fill:#e3f2fd
style B1 fill:#f3e5f5
style C1 fill:#e8f5e8
style D1 fill:#fff3e0
```
### Docker Multi-Stage Build Process
```mermaid
sequenceDiagram
participant Dev as Developer
participant Git as GitHub Repo
participant Docker as Docker Engine
participant Registry as Docker Hub
participant User as End User
Dev->>Git: Push code changes
Docker->>Git: Clone repository
Docker->>Docker: Stage 1 - Base Python image
Docker->>Docker: Stage 2 - Install dependencies
Docker->>Docker: Stage 3 - Install Playwright
Docker->>Docker: Stage 4 - Copy application code
Docker->>Docker: Stage 5 - Setup FastAPI server
Note over Docker: Multi-architecture build
Docker->>Docker: Build for linux/amd64
Docker->>Docker: Build for linux/arm64
Docker->>Registry: Push multi-arch manifest
Registry-->>Docker: Build complete
User->>Registry: docker pull unclecode/crawl4ai
Registry-->>User: Download appropriate architecture
User->>Docker: docker run with configuration
Docker->>Docker: Start container
Docker->>Docker: Initialize FastAPI server
Docker->>Docker: Setup Playwright browsers
Docker-->>User: Service ready on port 11235
```
### Installation Verification Workflow
```mermaid
flowchart TD
A[Installation Complete] --> B[Run crawl4ai-doctor]
B --> C{Python Version Check}
C -->|✓ 3.10+| D{Playwright Check}
C -->|✗ < 3.10| C1[Upgrade Python]
C1 --> D
D -->|✓ Installed| E{Browser Binaries}
D -->|✗ Missing| D1[Run crawl4ai-setup]
D1 --> E
E -->|✓ Available| F{Test Browser Launch}
E -->|✗ Missing| E1[playwright install]
E1 --> F
F -->|✓ Success| G[Test Basic Crawl]
F -->|✗ Failed| F1[Check system dependencies]
F1 --> F
G --> H{Crawl Test Result}
H -->|✓ Success| I[Installation Verified ✓]
H -->|✗ Failed| H1[Check network/permissions]
H1 --> G
I --> J[Ready for Production Use]
style I fill:#c8e6c9
style J fill:#e8f5e8
style C1 fill:#ffcdd2
style D1 fill:#fff3e0
style E1 fill:#fff3e0
style F1 fill:#ffcdd2
style H1 fill:#ffcdd2
```
### Resource Requirements by Installation Type
```mermaid
graph TD
subgraph "Basic Installation"
A1[Memory: 512MB]
A2[Disk: 2GB]
A3[CPU: 1 core]
A4[Network: Required for setup]
end
subgraph "Advanced Features torch"
B1[Memory: 2GB+]
B2[Disk: 5GB+]
B3[CPU: 2+ cores]
B4[GPU: Optional CUDA]
end
subgraph "All Features"
C1[Memory: 4GB+]
C2[Disk: 10GB+]
C3[CPU: 4+ cores]
C4[GPU: Recommended]
end
subgraph "Docker Deployment"
D1[Memory: 1GB+]
D2[Disk: 3GB+]
D3[CPU: 2+ cores]
D4[Ports: 11235]
D5[Shared Memory: 1GB]
end
style A1 fill:#e8f5e8
style B1 fill:#fff3e0
style C1 fill:#ffecb3
style D1 fill:#e3f2fd
```
**📖 Learn more:** [Installation Guide](https://docs.crawl4ai.com/core/installation/), [Docker Deployment](https://docs.crawl4ai.com/core/docker-deployment/), [System Requirements](https://docs.crawl4ai.com/core/installation/#prerequisites)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,392 @@
## Multi-URL Crawling Workflows and Architecture
Visual representations of concurrent crawling patterns, resource management, and monitoring systems for handling multiple URLs efficiently.
### Multi-URL Processing Modes
```mermaid
flowchart TD
A[Multi-URL Crawling Request] --> B{Processing Mode?}
B -->|Batch Mode| C[Collect All URLs]
B -->|Streaming Mode| D[Process URLs Individually]
C --> C1[Queue All URLs]
C1 --> C2[Execute Concurrently]
C2 --> C3[Wait for All Completion]
C3 --> C4[Return Complete Results Array]
D --> D1[Queue URLs]
D1 --> D2[Start First Batch]
D2 --> D3[Yield Results as Available]
D3 --> D4{More URLs?}
D4 -->|Yes| D5[Start Next URLs]
D4 -->|No| D6[Stream Complete]
D5 --> D3
C4 --> E[Process Results]
D6 --> E
E --> F[Success/Failure Analysis]
F --> G[End]
style C fill:#e3f2fd
style D fill:#f3e5f5
style C4 fill:#c8e6c9
style D6 fill:#c8e6c9
```
### Memory-Adaptive Dispatcher Flow
```mermaid
stateDiagram-v2
[*] --> Initializing
Initializing --> MonitoringMemory: Start dispatcher
MonitoringMemory --> CheckingMemory: Every check_interval
CheckingMemory --> MemoryOK: Memory < threshold
CheckingMemory --> MemoryHigh: Memory >= threshold
MemoryOK --> DispatchingTasks: Start new crawls
MemoryHigh --> WaitingForMemory: Pause dispatching
DispatchingTasks --> TaskRunning: Launch crawler
TaskRunning --> TaskCompleted: Crawl finished
TaskRunning --> TaskFailed: Crawl error
TaskCompleted --> MonitoringMemory: Update stats
TaskFailed --> MonitoringMemory: Update stats
WaitingForMemory --> CheckingMemory: Wait timeout
WaitingForMemory --> MonitoringMemory: Memory freed
note right of MemoryHigh: Prevents OOM crashes
note right of DispatchingTasks: Respects max_session_permit
note right of WaitingForMemory: Configurable timeout
```
### Concurrent Crawling Architecture
```mermaid
graph TB
subgraph "URL Queue Management"
A[URL Input List] --> B[URL Queue]
B --> C[Priority Scheduler]
C --> D[Batch Assignment]
end
subgraph "Dispatcher Layer"
E[Memory Adaptive Dispatcher]
F[Semaphore Dispatcher]
G[Rate Limiter]
H[Resource Monitor]
E --> I[Memory Checker]
F --> J[Concurrency Controller]
G --> K[Delay Calculator]
H --> L[System Stats]
end
subgraph "Crawler Pool"
M[Crawler Instance 1]
N[Crawler Instance 2]
O[Crawler Instance 3]
P[Crawler Instance N]
M --> Q[Browser Session 1]
N --> R[Browser Session 2]
O --> S[Browser Session 3]
P --> T[Browser Session N]
end
subgraph "Result Processing"
U[Result Collector]
V[Success Handler]
W[Error Handler]
X[Retry Queue]
Y[Final Results]
end
D --> E
D --> F
E --> M
F --> N
G --> O
H --> P
Q --> U
R --> U
S --> U
T --> U
U --> V
U --> W
W --> X
X --> B
V --> Y
style E fill:#e3f2fd
style F fill:#f3e5f5
style G fill:#e8f5e8
style H fill:#fff3e0
```
### Rate Limiting and Backoff Strategy
```mermaid
sequenceDiagram
participant C as Crawler
participant RL as Rate Limiter
participant S as Server
participant D as Dispatcher
C->>RL: Request to crawl URL
RL->>RL: Calculate delay
RL->>RL: Apply base delay (1-3s)
RL->>C: Delay applied
C->>S: HTTP Request
alt Success Response
S-->>C: 200 OK + Content
C->>RL: Report success
RL->>RL: Reset failure count
C->>D: Return successful result
else Rate Limited
S-->>C: 429 Too Many Requests
C->>RL: Report rate limit
RL->>RL: Exponential backoff
RL->>RL: Increase delay (up to max_delay)
RL->>C: Apply longer delay
C->>S: Retry request after delay
else Server Error
S-->>C: 503 Service Unavailable
C->>RL: Report server error
RL->>RL: Moderate backoff
RL->>C: Retry with backoff
else Max Retries Exceeded
RL->>C: Stop retrying
C->>D: Return failed result
end
```
### Large-Scale Crawling Workflow
```mermaid
flowchart TD
A[Load URL List 10k+ URLs] --> B[Initialize Dispatcher]
B --> C{Select Dispatcher Type}
C -->|Memory Constrained| D[Memory Adaptive]
C -->|Fixed Resources| E[Semaphore Based]
D --> F[Set Memory Threshold 70%]
E --> G[Set Concurrency Limit]
F --> H[Configure Monitoring]
G --> H
H --> I[Start Crawling Process]
I --> J[Monitor System Resources]
J --> K{Memory Usage?}
K -->|< Threshold| L[Continue Dispatching]
K -->|>= Threshold| M[Pause New Tasks]
L --> N[Process Results Stream]
M --> O[Wait for Memory]
O --> K
N --> P{Result Type?}
P -->|Success| Q[Save to Database]
P -->|Failure| R[Log Error]
Q --> S[Update Progress Counter]
R --> S
S --> T{More URLs?}
T -->|Yes| U[Get Next Batch]
T -->|No| V[Generate Final Report]
U --> L
V --> W[Analysis Complete]
style A fill:#e1f5fe
style D fill:#e8f5e8
style E fill:#f3e5f5
style V fill:#c8e6c9
style W fill:#a5d6a7
```
### Real-Time Monitoring Dashboard Flow
```mermaid
graph LR
subgraph "Data Collection"
A[Crawler Tasks] --> B[Performance Metrics]
A --> C[Memory Usage]
A --> D[Success/Failure Rates]
A --> E[Response Times]
end
subgraph "Monitor Processing"
F[CrawlerMonitor] --> G[Aggregate Statistics]
F --> H[Display Formatter]
F --> I[Update Scheduler]
end
subgraph "Display Modes"
J[DETAILED Mode]
K[AGGREGATED Mode]
J --> L[Individual Task Status]
J --> M[Task-Level Metrics]
K --> N[Summary Statistics]
K --> O[Overall Progress]
end
subgraph "Output Interface"
P[Console Display]
Q[Progress Bars]
R[Status Tables]
S[Real-time Updates]
end
B --> F
C --> F
D --> F
E --> F
G --> J
G --> K
H --> J
H --> K
I --> J
I --> K
L --> P
M --> Q
N --> R
O --> S
style F fill:#e3f2fd
style J fill:#f3e5f5
style K fill:#e8f5e8
```
### Error Handling and Recovery Pattern
```mermaid
stateDiagram-v2
[*] --> ProcessingURL
ProcessingURL --> CrawlAttempt: Start crawl
CrawlAttempt --> Success: HTTP 200
CrawlAttempt --> NetworkError: Connection failed
CrawlAttempt --> RateLimit: HTTP 429
CrawlAttempt --> ServerError: HTTP 5xx
CrawlAttempt --> Timeout: Request timeout
Success --> [*]: Return result
NetworkError --> RetryCheck: Check retry count
RateLimit --> BackoffWait: Apply exponential backoff
ServerError --> RetryCheck: Check retry count
Timeout --> RetryCheck: Check retry count
BackoffWait --> RetryCheck: After delay
RetryCheck --> CrawlAttempt: retries < max_retries
RetryCheck --> Failed: retries >= max_retries
Failed --> ErrorLog: Log failure details
ErrorLog --> [*]: Return failed result
note right of BackoffWait: Exponential backoff for rate limits
note right of RetryCheck: Configurable max_retries
note right of ErrorLog: Detailed error tracking
```
### Resource Management Timeline
```mermaid
gantt
title Multi-URL Crawling Resource Management
dateFormat X
axisFormat %s
section Memory Usage
Initialize Dispatcher :0, 1
Memory Monitoring :1, 10
Peak Usage Period :3, 7
Memory Cleanup :7, 9
section Task Execution
URL Queue Setup :0, 2
Batch 1 Processing :2, 5
Batch 2 Processing :4, 7
Batch 3 Processing :6, 9
Final Results :9, 10
section Rate Limiting
Normal Delays :2, 4
Backoff Period :4, 6
Recovery Period :6, 8
section Monitoring
System Health Check :0, 10
Progress Updates :1, 9
Performance Metrics :2, 8
```
### Concurrent Processing Performance Matrix
```mermaid
graph TD
subgraph "Input Factors"
A[Number of URLs]
B[Concurrency Level]
C[Memory Threshold]
D[Rate Limiting]
end
subgraph "Processing Characteristics"
A --> E[Low 1-100 URLs]
A --> F[Medium 100-1k URLs]
A --> G[High 1k-10k URLs]
A --> H[Very High 10k+ URLs]
B --> I[Conservative 1-5]
B --> J[Moderate 5-15]
B --> K[Aggressive 15-30]
C --> L[Strict 60-70%]
C --> M[Balanced 70-80%]
C --> N[Relaxed 80-90%]
end
subgraph "Recommended Configurations"
E --> O[Simple Semaphore]
F --> P[Memory Adaptive Basic]
G --> Q[Memory Adaptive Advanced]
H --> R[Memory Adaptive + Monitoring]
I --> O
J --> P
K --> Q
K --> R
L --> Q
M --> P
N --> O
end
style O fill:#c8e6c9
style P fill:#fff3e0
style Q fill:#ffecb3
style R fill:#ffcdd2
```
**📖 Learn more:** [Multi-URL Crawling Guide](https://docs.crawl4ai.com/advanced/multi-url-crawling/), [Dispatcher Configuration](https://docs.crawl4ai.com/advanced/crawl-dispatcher/), [Performance Optimization](https://docs.crawl4ai.com/advanced/multi-url-crawling/#performance-optimization)

View File

@@ -0,0 +1,411 @@
## Simple Crawling Workflows and Data Flow
Visual representations of basic web crawling operations, configuration patterns, and result processing workflows.
### Basic Crawling Sequence
```mermaid
sequenceDiagram
participant User
participant Crawler as AsyncWebCrawler
participant Browser as Browser Instance
participant Page as Web Page
participant Processor as Content Processor
User->>Crawler: Create with BrowserConfig
Crawler->>Browser: Launch browser instance
Browser-->>Crawler: Browser ready
User->>Crawler: arun(url, CrawlerRunConfig)
Crawler->>Browser: Create new page/context
Browser->>Page: Navigate to URL
Page-->>Browser: Page loaded
Browser->>Processor: Extract raw HTML
Processor->>Processor: Clean HTML
Processor->>Processor: Generate markdown
Processor->>Processor: Extract media/links
Processor-->>Crawler: CrawlResult created
Crawler-->>User: Return CrawlResult
Note over User,Processor: All processing happens asynchronously
```
### Crawling Configuration Flow
```mermaid
flowchart TD
A[Start Crawling] --> B{Browser Config Set?}
B -->|No| B1[Use Default BrowserConfig]
B -->|Yes| B2[Custom BrowserConfig]
B1 --> C[Launch Browser]
B2 --> C
C --> D{Crawler Run Config Set?}
D -->|No| D1[Use Default CrawlerRunConfig]
D -->|Yes| D2[Custom CrawlerRunConfig]
D1 --> E[Navigate to URL]
D2 --> E
E --> F{Page Load Success?}
F -->|No| F1[Return Error Result]
F -->|Yes| G[Apply Content Filters]
G --> G1{excluded_tags set?}
G1 -->|Yes| G2[Remove specified tags]
G1 -->|No| G3[Keep all tags]
G2 --> G4{css_selector set?}
G3 --> G4
G4 -->|Yes| G5[Extract selected elements]
G4 -->|No| G6[Process full page]
G5 --> H[Generate Markdown]
G6 --> H
H --> H1{markdown_generator set?}
H1 -->|Yes| H2[Use custom generator]
H1 -->|No| H3[Use default generator]
H2 --> I[Extract Media and Links]
H3 --> I
I --> I1{process_iframes?}
I1 -->|Yes| I2[Include iframe content]
I1 -->|No| I3[Skip iframes]
I2 --> J[Create CrawlResult]
I3 --> J
J --> K[Return Result]
style A fill:#e1f5fe
style K fill:#c8e6c9
style F1 fill:#ffcdd2
```
### CrawlResult Data Structure
```mermaid
graph TB
subgraph "CrawlResult Object"
A[CrawlResult] --> B[Basic Info]
A --> C[Content Variants]
A --> D[Extracted Data]
A --> E[Media Assets]
A --> F[Optional Outputs]
B --> B1[url: Final URL]
B --> B2[success: Boolean]
B --> B3[status_code: HTTP Status]
B --> B4[error_message: Error Details]
C --> C1[html: Raw HTML]
C --> C2[cleaned_html: Sanitized HTML]
C --> C3[markdown: MarkdownGenerationResult]
C3 --> C3A[raw_markdown: Basic conversion]
C3 --> C3B[markdown_with_citations: With references]
C3 --> C3C[fit_markdown: Filtered content]
C3 --> C3D[references_markdown: Citation list]
D --> D1[links: Internal/External]
D --> D2[media: Images/Videos/Audio]
D --> D3[metadata: Page info]
D --> D4[extracted_content: JSON data]
D --> D5[tables: Structured table data]
E --> E1[screenshot: Base64 image]
E --> E2[pdf: PDF bytes]
E --> E3[mhtml: Archive file]
E --> E4[downloaded_files: File paths]
F --> F1[session_id: Browser session]
F --> F2[ssl_certificate: Security info]
F --> F3[response_headers: HTTP headers]
F --> F4[network_requests: Traffic log]
F --> F5[console_messages: Browser logs]
end
style A fill:#e3f2fd
style C3 fill:#f3e5f5
style D5 fill:#e8f5e8
```
### Content Processing Pipeline
```mermaid
flowchart LR
subgraph "Input Sources"
A1[Web URL]
A2[Raw HTML]
A3[Local File]
end
A1 --> B[Browser Navigation]
A2 --> C[Direct Processing]
A3 --> C
B --> D[Raw HTML Capture]
C --> D
D --> E{Content Filtering}
E --> E1[Remove Scripts/Styles]
E --> E2[Apply excluded_tags]
E --> E3[Apply css_selector]
E --> E4[Remove overlay elements]
E1 --> F[Cleaned HTML]
E2 --> F
E3 --> F
E4 --> F
F --> G{Markdown Generation}
G --> G1[HTML to Markdown]
G --> G2[Apply Content Filter]
G --> G3[Generate Citations]
G1 --> H[MarkdownGenerationResult]
G2 --> H
G3 --> H
F --> I{Media Extraction}
I --> I1[Find Images]
I --> I2[Find Videos/Audio]
I --> I3[Score Relevance]
I1 --> J[Media Dictionary]
I2 --> J
I3 --> J
F --> K{Link Extraction}
K --> K1[Internal Links]
K --> K2[External Links]
K --> K3[Apply Link Filters]
K1 --> L[Links Dictionary]
K2 --> L
K3 --> L
H --> M[Final CrawlResult]
J --> M
L --> M
style D fill:#e3f2fd
style F fill:#f3e5f5
style H fill:#e8f5e8
style M fill:#c8e6c9
```
### Table Extraction Workflow
```mermaid
stateDiagram-v2
[*] --> DetectTables
DetectTables --> ScoreTables: Find table elements
ScoreTables --> EvaluateThreshold: Calculate quality scores
EvaluateThreshold --> PassThreshold: score >= table_score_threshold
EvaluateThreshold --> RejectTable: score < threshold
PassThreshold --> ExtractHeaders: Parse table structure
ExtractHeaders --> ExtractRows: Get header cells
ExtractRows --> ExtractMetadata: Get data rows
ExtractMetadata --> CreateTableObject: Get caption/summary
CreateTableObject --> AddToResult: {headers, rows, caption, summary}
AddToResult --> [*]: Table extraction complete
RejectTable --> [*]: Table skipped
note right of ScoreTables : Factors: header presence, data density, structure quality
note right of EvaluateThreshold : Threshold 1-10, higher = stricter
```
### Error Handling Decision Tree
```mermaid
flowchart TD
A[Start Crawl] --> B[Navigate to URL]
B --> C{Navigation Success?}
C -->|Network Error| C1[Set error_message: Network failure]
C -->|Timeout| C2[Set error_message: Page timeout]
C -->|Invalid URL| C3[Set error_message: Invalid URL format]
C -->|Success| D[Process Page Content]
C1 --> E[success = False]
C2 --> E
C3 --> E
D --> F{Content Processing OK?}
F -->|Parser Error| F1[Set error_message: HTML parsing failed]
F -->|Memory Error| F2[Set error_message: Insufficient memory]
F -->|Success| G[Generate Outputs]
F1 --> E
F2 --> E
G --> H{Output Generation OK?}
H -->|Markdown Error| H1[Partial success with warnings]
H -->|Extraction Error| H2[Partial success with warnings]
H -->|Success| I[success = True]
H1 --> I
H2 --> I
E --> J[Return Failed CrawlResult]
I --> K[Return Successful CrawlResult]
J --> L[User Error Handling]
K --> M[User Result Processing]
L --> L1{Check error_message}
L1 -->|Network| L2[Retry with different config]
L1 -->|Timeout| L3[Increase page_timeout]
L1 -->|Parser| L4[Try different scraping_strategy]
style E fill:#ffcdd2
style I fill:#c8e6c9
style J fill:#ffcdd2
style K fill:#c8e6c9
```
### Configuration Impact Matrix
```mermaid
graph TB
subgraph "Configuration Categories"
A[Content Processing]
B[Page Interaction]
C[Output Generation]
D[Performance]
end
subgraph "Configuration Options"
A --> A1[word_count_threshold]
A --> A2[excluded_tags]
A --> A3[css_selector]
A --> A4[exclude_external_links]
B --> B1[process_iframes]
B --> B2[remove_overlay_elements]
B --> B3[scan_full_page]
B --> B4[wait_for]
C --> C1[screenshot]
C --> C2[pdf]
C --> C3[markdown_generator]
C --> C4[table_score_threshold]
D --> D1[cache_mode]
D --> D2[verbose]
D --> D3[page_timeout]
D --> D4[semaphore_count]
end
subgraph "Result Impact"
A1 --> R1[Filters short text blocks]
A2 --> R2[Removes specified HTML tags]
A3 --> R3[Focuses on selected content]
A4 --> R4[Cleans links dictionary]
B1 --> R5[Includes iframe content]
B2 --> R6[Removes popups/modals]
B3 --> R7[Loads dynamic content]
B4 --> R8[Waits for specific elements]
C1 --> R9[Adds screenshot field]
C2 --> R10[Adds pdf field]
C3 --> R11[Custom markdown processing]
C4 --> R12[Filters table quality]
D1 --> R13[Controls caching behavior]
D2 --> R14[Detailed logging output]
D3 --> R15[Prevents timeout errors]
D4 --> R16[Limits concurrent operations]
end
style A fill:#e3f2fd
style B fill:#f3e5f5
style C fill:#e8f5e8
style D fill:#fff3e0
```
### Raw HTML and Local File Processing
```mermaid
sequenceDiagram
participant User
participant Crawler
participant Processor
participant FileSystem
Note over User,FileSystem: Raw HTML Processing
User->>Crawler: arun("raw://html_content")
Crawler->>Processor: Parse raw HTML directly
Processor->>Processor: Apply same content filters
Processor-->>Crawler: Standard CrawlResult
Crawler-->>User: Result with markdown
Note over User,FileSystem: Local File Processing
User->>Crawler: arun("file:///path/to/file.html")
Crawler->>FileSystem: Read local file
FileSystem-->>Crawler: File content
Crawler->>Processor: Process file HTML
Processor->>Processor: Apply content processing
Processor-->>Crawler: Standard CrawlResult
Crawler-->>User: Result with markdown
Note over User,FileSystem: Both return identical CrawlResult structure
```
### Comprehensive Processing Example Flow
```mermaid
flowchart TD
A[Input: example.com] --> B[Create Configurations]
B --> B1[BrowserConfig verbose=True]
B --> B2[CrawlerRunConfig with filters]
B1 --> C[Launch AsyncWebCrawler]
B2 --> C
C --> D[Navigate and Process]
D --> E{Check Success}
E -->|Failed| E1[Print Error Message]
E -->|Success| F[Extract Content Summary]
F --> F1[Get Page Title]
F --> F2[Get Content Preview]
F --> F3[Process Media Items]
F --> F4[Process Links]
F3 --> F3A[Count Images]
F3 --> F3B[Show First 3 Images]
F4 --> F4A[Count Internal Links]
F4 --> F4B[Show First 3 Links]
F1 --> G[Display Results]
F2 --> G
F3A --> G
F3B --> G
F4A --> G
F4B --> G
E1 --> H[End with Error]
G --> I[End with Success]
style E1 fill:#ffcdd2
style G fill:#c8e6c9
style H fill:#ffcdd2
style I fill:#c8e6c9
```
**📖 Learn more:** [Simple Crawling Guide](https://docs.crawl4ai.com/core/simple-crawling/), [Configuration Options](https://docs.crawl4ai.com/core/browser-crawler-config/), [Result Processing](https://docs.crawl4ai.com/core/crawler-result/), [Table Extraction](https://docs.crawl4ai.com/extraction/no-llm-strategies/)

View File

@@ -0,0 +1,441 @@
## URL Seeding Workflows and Architecture
Visual representations of URL discovery strategies, filtering pipelines, and smart crawling workflows.
### URL Seeding vs Deep Crawling Strategy Comparison
```mermaid
graph TB
subgraph "Deep Crawling Approach"
A1[Start URL] --> A2[Load Page]
A2 --> A3[Extract Links]
A3 --> A4{More Links?}
A4 -->|Yes| A5[Queue Next Page]
A5 --> A2
A4 -->|No| A6[Complete]
A7[⏱️ Real-time Discovery]
A8[🐌 Sequential Processing]
A9[🔍 Limited by Page Structure]
A10[💾 High Memory Usage]
end
subgraph "URL Seeding Approach"
B1[Domain Input] --> B2[Query Sitemap]
B1 --> B3[Query Common Crawl]
B2 --> B4[Merge Results]
B3 --> B4
B4 --> B5[Apply Filters]
B5 --> B6[Score Relevance]
B6 --> B7[Rank Results]
B7 --> B8[Select Top URLs]
B9[⚡ Instant Discovery]
B10[🚀 Parallel Processing]
B11[🎯 Pattern-based Filtering]
B12[💡 Smart Relevance Scoring]
end
style A1 fill:#ffecb3
style B1 fill:#e8f5e8
style A6 fill:#ffcdd2
style B8 fill:#c8e6c9
```
### URL Discovery Data Flow
```mermaid
sequenceDiagram
participant User
participant Seeder as AsyncUrlSeeder
participant SM as Sitemap
participant CC as Common Crawl
participant Filter as URL Filter
participant Scorer as BM25 Scorer
User->>Seeder: urls("example.com", config)
par Parallel Data Sources
Seeder->>SM: Fetch sitemap.xml
SM-->>Seeder: 500 URLs
and
Seeder->>CC: Query Common Crawl
CC-->>Seeder: 2000 URLs
end
Seeder->>Seeder: Merge and deduplicate
Note over Seeder: 2200 unique URLs
Seeder->>Filter: Apply pattern filter
Filter-->>Seeder: 800 matching URLs
alt extract_head=True
loop For each URL
Seeder->>Seeder: Extract <head> metadata
end
Note over Seeder: Title, description, keywords
end
alt query provided
Seeder->>Scorer: Calculate relevance scores
Scorer-->>Seeder: Scored URLs
Seeder->>Seeder: Filter by score_threshold
Note over Seeder: 200 relevant URLs
end
Seeder->>Seeder: Sort by relevance
Seeder->>Seeder: Apply max_urls limit
Seeder-->>User: Top 100 URLs ready for crawling
```
### SeedingConfig Decision Tree
```mermaid
flowchart TD
A[SeedingConfig Setup] --> B{Data Source Strategy?}
B -->|Fast & Official| C[source="sitemap"]
B -->|Comprehensive| D[source="cc"]
B -->|Maximum Coverage| E[source="sitemap+cc"]
C --> F{Need Filtering?}
D --> F
E --> F
F -->|Yes| G[Set URL Pattern]
F -->|No| H[pattern="*"]
G --> I{Pattern Examples}
I --> I1[pattern="*/blog/*"]
I --> I2[pattern="*/docs/api/*"]
I --> I3[pattern="*.pdf"]
I --> I4[pattern="*/product/*"]
H --> J{Need Metadata?}
I1 --> J
I2 --> J
I3 --> J
I4 --> J
J -->|Yes| K[extract_head=True]
J -->|No| L[extract_head=False]
K --> M{Need Validation?}
L --> M
M -->|Yes| N[live_check=True]
M -->|No| O[live_check=False]
N --> P{Need Relevance Scoring?}
O --> P
P -->|Yes| Q[Set Query + BM25]
P -->|No| R[Skip Scoring]
Q --> S[query="search terms"]
S --> T[scoring_method="bm25"]
T --> U[score_threshold=0.3]
R --> V[Performance Tuning]
U --> V
V --> W[Set max_urls]
W --> X[Set concurrency]
X --> Y[Set hits_per_sec]
Y --> Z[Configuration Complete]
style A fill:#e3f2fd
style Z fill:#c8e6c9
style K fill:#fff3e0
style N fill:#fff3e0
style Q fill:#f3e5f5
```
### BM25 Relevance Scoring Pipeline
```mermaid
graph TB
subgraph "Text Corpus Preparation"
A1[URL Collection] --> A2[Extract Metadata]
A2 --> A3[Title + Description + Keywords]
A3 --> A4[Tokenize Text]
A4 --> A5[Remove Stop Words]
A5 --> A6[Create Document Corpus]
end
subgraph "BM25 Algorithm"
B1[Query Terms] --> B2[Term Frequency Calculation]
A6 --> B2
B2 --> B3[Inverse Document Frequency]
B3 --> B4[BM25 Score Calculation]
B4 --> B5[Score = Σ(IDF × TF × K1+1)/(TF + K1×(1-b+b×|d|/avgdl))]
end
subgraph "Scoring Results"
B5 --> C1[URL Relevance Scores]
C1 --> C2{Score ≥ Threshold?}
C2 -->|Yes| C3[Include in Results]
C2 -->|No| C4[Filter Out]
C3 --> C5[Sort by Score DESC]
C5 --> C6[Return Top URLs]
end
subgraph "Example Scores"
D1["python async tutorial" → 0.85]
D2["python documentation" → 0.72]
D3["javascript guide" → 0.23]
D4["contact us page" → 0.05]
end
style B5 fill:#e3f2fd
style C6 fill:#c8e6c9
style D1 fill:#c8e6c9
style D2 fill:#c8e6c9
style D3 fill:#ffecb3
style D4 fill:#ffcdd2
```
### Multi-Domain Discovery Architecture
```mermaid
graph TB
subgraph "Input Layer"
A1[Domain List]
A2[SeedingConfig]
A3[Query Terms]
end
subgraph "Discovery Engine"
B1[AsyncUrlSeeder]
B2[Parallel Workers]
B3[Rate Limiter]
B4[Memory Manager]
end
subgraph "Data Sources"
C1[Sitemap Fetcher]
C2[Common Crawl API]
C3[Live URL Checker]
C4[Metadata Extractor]
end
subgraph "Processing Pipeline"
D1[URL Deduplication]
D2[Pattern Filtering]
D3[Relevance Scoring]
D4[Quality Assessment]
end
subgraph "Output Layer"
E1[Scored URL Lists]
E2[Domain Statistics]
E3[Performance Metrics]
E4[Cache Storage]
end
A1 --> B1
A2 --> B1
A3 --> B1
B1 --> B2
B2 --> B3
B3 --> B4
B2 --> C1
B2 --> C2
B2 --> C3
B2 --> C4
C1 --> D1
C2 --> D1
C3 --> D2
C4 --> D3
D1 --> D2
D2 --> D3
D3 --> D4
D4 --> E1
B4 --> E2
B3 --> E3
D1 --> E4
style B1 fill:#e3f2fd
style D3 fill:#f3e5f5
style E1 fill:#c8e6c9
```
### Complete Discovery-to-Crawl Pipeline
```mermaid
stateDiagram-v2
[*] --> Discovery
Discovery --> SourceSelection: Configure data sources
SourceSelection --> Sitemap: source="sitemap"
SourceSelection --> CommonCrawl: source="cc"
SourceSelection --> Both: source="sitemap+cc"
Sitemap --> URLCollection
CommonCrawl --> URLCollection
Both --> URLCollection
URLCollection --> Filtering: Apply patterns
Filtering --> MetadataExtraction: extract_head=True
Filtering --> LiveValidation: extract_head=False
MetadataExtraction --> LiveValidation: live_check=True
MetadataExtraction --> RelevanceScoring: live_check=False
LiveValidation --> RelevanceScoring
RelevanceScoring --> ResultRanking: query provided
RelevanceScoring --> ResultLimiting: no query
ResultRanking --> ResultLimiting: apply score_threshold
ResultLimiting --> URLSelection: apply max_urls
URLSelection --> CrawlPreparation: URLs ready
CrawlPreparation --> CrawlExecution: AsyncWebCrawler
CrawlExecution --> StreamProcessing: stream=True
CrawlExecution --> BatchProcessing: stream=False
StreamProcessing --> [*]
BatchProcessing --> [*]
note right of Discovery : 🔍 Smart URL Discovery
note right of URLCollection : 📚 Merge & Deduplicate
note right of RelevanceScoring : 🎯 BM25 Algorithm
note right of CrawlExecution : 🕷️ High-Performance Crawling
```
### Performance Optimization Strategies
```mermaid
graph LR
subgraph "Input Optimization"
A1[Smart Source Selection] --> A2[Sitemap First]
A2 --> A3[Add CC if Needed]
A3 --> A4[Pattern Filtering Early]
end
subgraph "Processing Optimization"
B1[Parallel Workers] --> B2[Bounded Queues]
B2 --> B3[Rate Limiting]
B3 --> B4[Memory Management]
B4 --> B5[Lazy Evaluation]
end
subgraph "Output Optimization"
C1[Relevance Threshold] --> C2[Max URL Limits]
C2 --> C3[Caching Strategy]
C3 --> C4[Streaming Results]
end
subgraph "Performance Metrics"
D1[URLs/Second: 100-1000]
D2[Memory Usage: Bounded]
D3[Network Efficiency: 95%+]
D4[Cache Hit Rate: 80%+]
end
A4 --> B1
B5 --> C1
C4 --> D1
style A2 fill:#e8f5e8
style B2 fill:#e3f2fd
style C3 fill:#f3e5f5
style D3 fill:#c8e6c9
```
### URL Discovery vs Traditional Crawling Comparison
```mermaid
graph TB
subgraph "Traditional Approach"
T1[Start URL] --> T2[Crawl Page]
T2 --> T3[Extract Links]
T3 --> T4[Queue New URLs]
T4 --> T2
T5[❌ Time: Hours/Days]
T6[❌ Resource Heavy]
T7[❌ Depth Limited]
T8[❌ Discovery Bias]
end
subgraph "URL Seeding Approach"
S1[Domain Input] --> S2[Query All Sources]
S2 --> S3[Pattern Filter]
S3 --> S4[Relevance Score]
S4 --> S5[Select Best URLs]
S5 --> S6[Ready to Crawl]
S7[✅ Time: Seconds/Minutes]
S8[✅ Resource Efficient]
S9[✅ Complete Coverage]
S10[✅ Quality Focused]
end
subgraph "Use Case Decision Matrix"
U1[Small Sites < 1000 pages] --> U2[Use Deep Crawling]
U3[Large Sites > 10000 pages] --> U4[Use URL Seeding]
U5[Unknown Structure] --> U6[Start with Seeding]
U7[Real-time Discovery] --> U8[Use Deep Crawling]
U9[Quality over Quantity] --> U10[Use URL Seeding]
end
style S6 fill:#c8e6c9
style S7 fill:#c8e6c9
style S8 fill:#c8e6c9
style S9 fill:#c8e6c9
style S10 fill:#c8e6c9
style T5 fill:#ffcdd2
style T6 fill:#ffcdd2
style T7 fill:#ffcdd2
style T8 fill:#ffcdd2
```
### Data Source Characteristics and Selection
```mermaid
graph TB
subgraph "Sitemap Source"
SM1[📋 Official URL List]
SM2[⚡ Fast Response]
SM3[📅 Recently Updated]
SM4[🎯 High Quality URLs]
SM5[❌ May Miss Some Pages]
end
subgraph "Common Crawl Source"
CC1[🌐 Comprehensive Coverage]
CC2[📚 Historical Data]
CC3[🔍 Deep Discovery]
CC4[⏳ Slower Response]
CC5[🧹 May Include Noise]
end
subgraph "Combined Strategy"
CB1[🚀 Best of Both]
CB2[📊 Maximum Coverage]
CB3[✨ Automatic Deduplication]
CB4[⚖️ Balanced Performance]
end
subgraph "Selection Guidelines"
G1[Speed Critical → Sitemap Only]
G2[Coverage Critical → Common Crawl]
G3[Best Quality → Combined]
G4[Unknown Domain → Combined]
end
style SM2 fill:#c8e6c9
style SM4 fill:#c8e6c9
style CC1 fill:#e3f2fd
style CC3 fill:#e3f2fd
style CB1 fill:#f3e5f5
style CB3 fill:#f3e5f5
```
**📖 Learn more:** [URL Seeding Guide](https://docs.crawl4ai.com/core/url-seeding/), [Performance Optimization](https://docs.crawl4ai.com/advanced/optimization/), [Multi-URL Crawling](https://docs.crawl4ai.com/advanced/multi-url-crawling/)

View File

@@ -0,0 +1,295 @@
## CLI & Identity-Based Browsing
Command-line interface for web crawling with persistent browser profiles, authentication, and identity management.
### Basic CLI Usage
```bash
# Simple crawling
crwl https://example.com
# Get markdown output
crwl https://example.com -o markdown
# JSON output with cache bypass
crwl https://example.com -o json --bypass-cache
# Verbose mode with specific browser settings
crwl https://example.com -b "headless=false,viewport_width=1280" -v
```
### Profile Management Commands
```bash
# Launch interactive profile manager
crwl profiles
# Create, list, and manage browser profiles
# This opens a menu where you can:
# 1. List existing profiles
# 2. Create new profile (opens browser for setup)
# 3. Delete profiles
# 4. Use profile to crawl a website
# Use a specific profile for crawling
crwl https://example.com -p my-profile-name
# Example workflow for authenticated sites:
# 1. Create profile and log in
crwl profiles # Select "Create new profile"
# 2. Use profile for crawling authenticated content
crwl https://site-requiring-login.com/dashboard -p my-profile-name
```
### CDP Browser Management
```bash
# Launch browser with CDP debugging (default port 9222)
crwl cdp
# Use specific profile and custom port
crwl cdp -p my-profile -P 9223
# Launch headless browser with CDP
crwl cdp --headless
# Launch in incognito mode (ignores profile)
crwl cdp --incognito
# Use custom user data directory
crwl cdp --user-data-dir ~/my-browser-data --port 9224
```
### Builtin Browser Management
```bash
# Start persistent browser instance
crwl browser start
# Check browser status
crwl browser status
# Open visible window to see the browser
crwl browser view --url https://example.com
# Stop the browser
crwl browser stop
# Restart with different options
crwl browser restart --browser-type chromium --port 9223 --no-headless
# Use builtin browser in crawling
crwl https://example.com -b "browser_mode=builtin"
```
### Authentication Workflow Examples
```bash
# Complete workflow for LinkedIn scraping
# 1. Create authenticated profile
crwl profiles
# Select "Create new profile" → login to LinkedIn in browser → press 'q' to save
# 2. Use profile for crawling
crwl https://linkedin.com/in/someone -p linkedin-profile -o markdown
# 3. Extract structured data with authentication
crwl https://linkedin.com/search/results/people/ \
-p linkedin-profile \
-j "Extract people profiles with names, titles, and companies" \
-b "headless=false"
# GitHub authenticated crawling
crwl profiles # Create github-profile
crwl https://github.com/settings/profile -p github-profile
# Twitter/X authenticated access
crwl profiles # Create twitter-profile
crwl https://twitter.com/home -p twitter-profile -o markdown
```
### Advanced CLI Configuration
```bash
# Complex crawling with multiple configs
crwl https://example.com \
-B browser.yml \
-C crawler.yml \
-e extract_llm.yml \
-s llm_schema.json \
-p my-auth-profile \
-o json \
-v
# Quick LLM extraction with authentication
crwl https://private-site.com/dashboard \
-p auth-profile \
-j "Extract user dashboard data including metrics and notifications" \
-b "headless=true,viewport_width=1920"
# Content filtering with authentication
crwl https://members-only-site.com \
-p member-profile \
-f filter_bm25.yml \
-c "css_selector=.member-content,scan_full_page=true" \
-o markdown-fit
```
### Configuration Files for Identity Browsing
```yaml
# browser_auth.yml
headless: false
use_managed_browser: true
user_data_dir: "/path/to/profile"
viewport_width: 1280
viewport_height: 720
simulate_user: true
override_navigator: true
# crawler_auth.yml
magic: true
remove_overlay_elements: true
simulate_user: true
wait_for: "css:.authenticated-content"
page_timeout: 60000
delay_before_return_html: 2
scan_full_page: true
```
### Global Configuration Management
```bash
# List all configuration settings
crwl config list
# Set default LLM provider
crwl config set DEFAULT_LLM_PROVIDER "anthropic/claude-3-sonnet"
crwl config set DEFAULT_LLM_PROVIDER_TOKEN "your-api-token"
# Set browser defaults
crwl config set BROWSER_HEADLESS false # Always show browser
crwl config set USER_AGENT_MODE random # Random user agents
# Enable verbose mode globally
crwl config set VERBOSE true
```
### Q&A with Authenticated Content
```bash
# Ask questions about authenticated content
crwl https://private-dashboard.com -p dashboard-profile \
-q "What are the key metrics shown in my dashboard?"
# Multiple questions workflow
crwl https://company-intranet.com -p work-profile -o markdown # View content
crwl https://company-intranet.com -p work-profile \
-q "Summarize this week's announcements"
crwl https://company-intranet.com -p work-profile \
-q "What are the upcoming deadlines?"
```
### Profile Creation Programmatically
```python
# Create profiles via Python API
import asyncio
from crawl4ai import BrowserProfiler
async def create_auth_profile():
    """Create a browser profile interactively, list all saved profiles,
    then reuse the new profile for an authenticated crawl.

    Returns:
        The CrawlResult produced by crawling the LinkedIn feed with the
        freshly created profile.
    """
    from crawl4ai import AsyncWebCrawler, BrowserConfig

    bp = BrowserProfiler()

    # Interactive step: opens a browser window so the user can log in;
    # the session is persisted under the given profile name.
    profile_path = await bp.create_profile("linkedin-auth")
    print(f"Profile created at: {profile_path}")

    # Show every profile currently stored on disk.
    for entry in bp.list_profiles():
        print(f"Profile: {entry['name']} at {entry['path']}")

    # Reuse the saved session: a managed browser pointed at the profile's
    # user-data directory can access content behind the stored login.
    cfg = BrowserConfig(
        headless=True,
        use_managed_browser=True,
        user_data_dir=profile_path,
    )
    async with AsyncWebCrawler(config=cfg) as crawler:
        return await crawler.arun("https://linkedin.com/feed")
# asyncio.run(create_auth_profile())
```
### Identity Browsing Best Practices
```bash
# 1. Create specific profiles for different sites
crwl profiles # Create "linkedin-work"
crwl profiles # Create "github-personal"
crwl profiles # Create "company-intranet"
# 2. Use descriptive profile names
crwl https://site1.com -p site1-admin-account
crwl https://site2.com -p site2-user-account
# 3. Combine with appropriate browser settings
crwl https://secure-site.com \
-p secure-profile \
-b "headless=false,simulate_user=true,magic=true" \
-c "wait_for=.logged-in-indicator,page_timeout=30000"
# 4. Test profile before automated crawling
crwl cdp -p test-profile # Manually verify login status
crwl https://test-url.com -p test-profile -v # Verbose test crawl
```
### Troubleshooting Authentication Issues
```bash
# Debug authentication problems
crwl https://auth-site.com -p auth-profile \
-b "headless=false,verbose=true" \
-c "verbose=true,page_timeout=60000" \
-v
# Check profile status
crwl profiles # List profiles and check creation dates
# Recreate problematic profiles
crwl profiles # Delete old profile, create new one
# Test with visible browser
crwl https://problem-site.com -p profile-name \
-b "headless=false" \
-c "delay_before_return_html=5"
```
### Common Use Cases
```bash
# Social media monitoring (after authentication)
crwl https://twitter.com/home -p twitter-monitor \
-j "Extract latest tweets with sentiment and engagement metrics"
# E-commerce competitor analysis (with account access)
crwl https://competitor-site.com/products -p competitor-account \
-j "Extract product prices, availability, and descriptions"
# Company dashboard monitoring
crwl https://company-dashboard.com -p work-profile \
-c "css_selector=.dashboard-content" \
-q "What alerts or notifications need attention?"
# Research data collection (authenticated access)
crwl https://research-platform.com/data -p research-profile \
-e extract_research.yml \
-s research_schema.json \
-o json
```
**📖 Learn more:** [Identity-Based Crawling Documentation](https://docs.crawl4ai.com/advanced/identity-based-crawling/), [Browser Profile Management](https://docs.crawl4ai.com/advanced/session-management/), [CLI Examples](https://docs.crawl4ai.com/core/cli/)

File diff suppressed because it is too large (Load Diff)

View File

@@ -0,0 +1,446 @@
## Deep Crawling Filters & Scorers
Advanced URL filtering and scoring strategies for intelligent deep crawling with performance optimization.
### URL Filters - Content and Domain Control
```python
from crawl4ai.deep_crawling.filters import (
URLPatternFilter, DomainFilter, ContentTypeFilter,
FilterChain, ContentRelevanceFilter, SEOFilter
)
# Pattern-based filtering
pattern_filter = URLPatternFilter(
patterns=[
"*.html", # HTML pages only
"*/blog/*", # Blog posts
"*/articles/*", # Article pages
"*2024*", # Recent content
"^https://example.com/docs/.*" # Regex pattern
],
use_glob=True,
reverse=False # False = include matching, True = exclude matching
)
# Domain filtering with subdomains
domain_filter = DomainFilter(
allowed_domains=["example.com", "docs.example.com"],
blocked_domains=["ads.example.com", "tracker.com"]
)
# Content type filtering
content_filter = ContentTypeFilter(
allowed_types=["text/html", "application/pdf"],
check_extension=True
)
# Apply individual filters
url = "https://example.com/blog/2024/article.html"
print(f"Pattern filter: {pattern_filter.apply(url)}")
print(f"Domain filter: {domain_filter.apply(url)}")
print(f"Content filter: {content_filter.apply(url)}")
```
### Filter Chaining - Combine Multiple Filters
```python
# Create filter chain for comprehensive filtering
filter_chain = FilterChain([
DomainFilter(allowed_domains=["example.com"]),
URLPatternFilter(patterns=["*/blog/*", "*/docs/*"]),
ContentTypeFilter(allowed_types=["text/html"])
])
# Apply chain to URLs
urls = [
"https://example.com/blog/post1.html",
"https://spam.com/content.html",
"https://example.com/blog/image.jpg",
"https://example.com/docs/guide.html"
]
async def filter_urls(urls, filter_chain):
    filtered = []
    for url in urls:
        if await filter_chain.apply(url):
            filtered.append(url)
    return filtered
# Usage
filtered_urls = await filter_urls(urls, filter_chain)
print(f"Filtered URLs: {filtered_urls}")
# Check filter statistics
for filter_obj in filter_chain.filters:
stats = filter_obj.stats
print(f"{filter_obj.name}: {stats.passed_urls}/{stats.total_urls} passed")
```
### Advanced Content Filters
```python
# BM25-based content relevance filtering
relevance_filter = ContentRelevanceFilter(
query="python machine learning tutorial",
threshold=0.5, # Minimum relevance score
k1=1.2, # TF saturation parameter
b=0.75, # Length normalization
avgdl=1000 # Average document length
)
# SEO quality filtering
seo_filter = SEOFilter(
threshold=0.65, # Minimum SEO score
keywords=["python", "tutorial", "guide"],
weights={
"title_length": 0.15,
"title_kw": 0.18,
"meta_description": 0.12,
"canonical": 0.10,
"robot_ok": 0.20,
"schema_org": 0.10,
"url_quality": 0.15
}
)
# Apply advanced filters
url = "https://example.com/python-ml-tutorial"
relevance_score = await relevance_filter.apply(url)
seo_score = await seo_filter.apply(url)
print(f"Relevance: {relevance_score}, SEO: {seo_score}")
```
### URL Scorers - Quality and Relevance Scoring
```python
from crawl4ai.deep_crawling.scorers import (
KeywordRelevanceScorer, PathDepthScorer, ContentTypeScorer,
FreshnessScorer, DomainAuthorityScorer, CompositeScorer
)
# Keyword relevance scoring
keyword_scorer = KeywordRelevanceScorer(
keywords=["python", "tutorial", "guide", "machine", "learning"],
weight=1.0,
case_sensitive=False
)
# Path depth scoring (optimal depth = 3)
depth_scorer = PathDepthScorer(
optimal_depth=3, # /category/subcategory/article
weight=0.8
)
# Content type scoring
content_type_scorer = ContentTypeScorer(
type_weights={
"html": 1.0, # Highest priority
"pdf": 0.8, # Medium priority
"txt": 0.6, # Lower priority
"doc": 0.4 # Lowest priority
},
weight=0.9
)
# Freshness scoring
freshness_scorer = FreshnessScorer(
weight=0.7,
current_year=2024
)
# Domain authority scoring
domain_scorer = DomainAuthorityScorer(
domain_weights={
"python.org": 1.0,
"github.com": 0.9,
"stackoverflow.com": 0.85,
"medium.com": 0.7,
"personal-blog.com": 0.3
},
default_weight=0.5,
weight=1.0
)
# Score individual URLs
url = "https://python.org/tutorial/2024/machine-learning.html"
scores = {
"keyword": keyword_scorer.score(url),
"depth": depth_scorer.score(url),
"content": content_type_scorer.score(url),
"freshness": freshness_scorer.score(url),
"domain": domain_scorer.score(url)
}
print(f"Individual scores: {scores}")
```
### Composite Scoring - Combine Multiple Scorers
```python
# Create composite scorer combining all strategies
composite_scorer = CompositeScorer(
scorers=[
KeywordRelevanceScorer(["python", "tutorial"], weight=1.5),
PathDepthScorer(optimal_depth=3, weight=1.0),
ContentTypeScorer({"html": 1.0, "pdf": 0.8}, weight=1.2),
FreshnessScorer(weight=0.8, current_year=2024),
DomainAuthorityScorer({
"python.org": 1.0,
"github.com": 0.9
}, weight=1.3)
],
normalize=True # Normalize by number of scorers
)
# Score multiple URLs
urls_to_score = [
"https://python.org/tutorial/2024/basics.html",
"https://github.com/user/python-guide/blob/main/README.md",
"https://random-blog.com/old/2018/python-stuff.html",
"https://python.org/docs/deep/nested/advanced/guide.html"
]
scored_urls = []
for url in urls_to_score:
    score = composite_scorer.score(url)
    scored_urls.append((url, score))
# Sort by score (highest first)
scored_urls.sort(key=lambda x: x[1], reverse=True)
for url, score in scored_urls:
print(f"Score: {score:.3f} - {url}")
# Check scorer statistics
print(f"\nScoring statistics:")
print(f"URLs scored: {composite_scorer.stats._urls_scored}")
print(f"Average score: {composite_scorer.stats.get_average():.3f}")
```
### Advanced Filter Patterns
```python
# Complex pattern matching
advanced_patterns = URLPatternFilter(
patterns=[
r"^https://docs\.python\.org/\d+/", # Python docs with version
r".*/tutorial/.*\.html$", # Tutorial pages
r".*/guide/(?!deprecated).*", # Guides but not deprecated
"*/blog/{2020,2021,2022,2023,2024}/*", # Recent blog posts
"**/{api,reference}/**/*.html" # API/reference docs
],
use_glob=True
)
# Exclude patterns (reverse=True)
exclude_filter = URLPatternFilter(
patterns=[
"*/admin/*",
"*/login/*",
"*/private/*",
"**/.*", # Hidden files
"*.{jpg,png,gif,css,js}$" # Media and assets
],
reverse=True # Exclude matching patterns
)
# Content type with extension mapping
detailed_content_filter = ContentTypeFilter(
allowed_types=["text", "application"],
check_extension=True,
ext_map={
"html": "text/html",
"htm": "text/html",
"md": "text/markdown",
"pdf": "application/pdf",
"doc": "application/msword",
"docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
}
)
```
### Performance-Optimized Filtering
```python
# High-performance filter chain for large-scale crawling
class OptimizedFilterChain:
    def __init__(self):
        # Fast filters first (domain, patterns)
        self.fast_filters = [
            DomainFilter(
                allowed_domains=["example.com", "docs.example.com"],
                blocked_domains=["ads.example.com"]
            ),
            URLPatternFilter([
                "*.html", "*.pdf", "*/blog/*", "*/docs/*"
            ])
        ]
        # Slower filters last (content analysis)
        self.slow_filters = [
            ContentRelevanceFilter(
                query="important content",
                threshold=0.3
            )
        ]

    async def apply_optimized(self, url: str) -> bool:
        # Apply fast filters first
        for filter_obj in self.fast_filters:
            if not filter_obj.apply(url):
                return False
        # Only apply slow filters if fast filters pass
        for filter_obj in self.slow_filters:
            if not await filter_obj.apply(url):
                return False
        return True
# Batch filtering with concurrency
async def batch_filter_urls(urls, filter_chain, max_concurrent=50):
    import asyncio
    semaphore = asyncio.Semaphore(max_concurrent)
    async def filter_single(url):
        async with semaphore:
            return await filter_chain.apply(url), url
    tasks = [filter_single(url) for url in urls]
    results = await asyncio.gather(*tasks)
    return [url for passed, url in results if passed]
# Usage with 1000 URLs
large_url_list = [f"https://example.com/page{i}.html" for i in range(1000)]
optimized_chain = OptimizedFilterChain()
filtered = await batch_filter_urls(large_url_list, optimized_chain)
```
### Custom Filter Implementation
```python
from crawl4ai.deep_crawling.filters import URLFilter
import re
class CustomLanguageFilter(URLFilter):
    """Filter URLs by language indicators"""
    def __init__(self, allowed_languages=["en"], weight=1.0):
        super().__init__()
        self.allowed_languages = set(allowed_languages)
        self.lang_patterns = {
            "en": re.compile(r"/en/|/english/|lang=en"),
            "es": re.compile(r"/es/|/spanish/|lang=es"),
            "fr": re.compile(r"/fr/|/french/|lang=fr"),
            "de": re.compile(r"/de/|/german/|lang=de")
        }

    def apply(self, url: str) -> bool:
        # Default to English if no language indicators
        if not any(pattern.search(url) for pattern in self.lang_patterns.values()):
            result = "en" in self.allowed_languages
            self._update_stats(result)
            return result
        # Check for allowed languages
        for lang in self.allowed_languages:
            if lang in self.lang_patterns:
                if self.lang_patterns[lang].search(url):
                    self._update_stats(True)
                    return True
        self._update_stats(False)
        return False
# Custom scorer implementation
from crawl4ai.deep_crawling.scorers import URLScorer
class CustomComplexityScorer(URLScorer):
    """Score URLs by content complexity indicators"""
    def __init__(self, weight=1.0):
        super().__init__(weight)
        self.complexity_indicators = {
            "tutorial": 0.9,
            "guide": 0.8,
            "example": 0.7,
            "reference": 0.6,
            "api": 0.5
        }

    def _calculate_score(self, url: str) -> float:
        url_lower = url.lower()
        max_score = 0.0
        for indicator, score in self.complexity_indicators.items():
            if indicator in url_lower:
                max_score = max(max_score, score)
        return max_score
# Use custom filters and scorers
custom_filter = CustomLanguageFilter(allowed_languages=["en", "es"])
custom_scorer = CustomComplexityScorer(weight=1.2)
url = "https://example.com/en/tutorial/advanced-guide.html"
passes_filter = custom_filter.apply(url)
complexity_score = custom_scorer.score(url)
print(f"Passes language filter: {passes_filter}")
print(f"Complexity score: {complexity_score}")
```
### Integration with Deep Crawling
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import DeepCrawlStrategy
async def deep_crawl_with_filtering():
    # Create comprehensive filter chain
    filter_chain = FilterChain([
        DomainFilter(allowed_domains=["python.org"]),
        URLPatternFilter(["*/tutorial/*", "*/guide/*", "*/docs/*"]),
        ContentTypeFilter(["text/html"]),
        SEOFilter(threshold=0.6, keywords=["python", "programming"])
    ])
    # Create composite scorer
    scorer = CompositeScorer([
        KeywordRelevanceScorer(["python", "tutorial"], weight=1.5),
        FreshnessScorer(weight=0.8),
        PathDepthScorer(optimal_depth=3, weight=1.0)
    ], normalize=True)
    # Configure deep crawl strategy with filters and scorers
    deep_strategy = DeepCrawlStrategy(
        max_depth=3,
        max_pages=100,
        url_filter=filter_chain,
        url_scorer=scorer,
        score_threshold=0.6  # Only crawl URLs scoring above 0.6
    )
    config = CrawlerRunConfig(
        deep_crawl_strategy=deep_strategy,
        cache_mode=CacheMode.BYPASS
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://python.org",
            config=config
        )
        print(f"Deep crawl completed: {result.success}")
        if hasattr(result, 'deep_crawl_results'):
            print(f"Pages crawled: {len(result.deep_crawl_results)}")

# Run the deep crawl
await deep_crawl_with_filtering()
```
**📖 Learn more:** [Deep Crawling Strategy](https://docs.crawl4ai.com/core/deep-crawling/), [Custom Filter Development](https://docs.crawl4ai.com/advanced/custom-filters/), [Performance Optimization](https://docs.crawl4ai.com/advanced/performance-tuning/)

View File

@@ -0,0 +1,348 @@
## Deep Crawling
Multi-level website exploration with intelligent filtering, scoring, and prioritization strategies.
### Basic Deep Crawl Setup
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
# Basic breadth-first deep crawling
async def basic_deep_crawl():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,            # Initial page + 2 levels
            include_external=False  # Stay within same domain
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True
    )
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun("https://docs.crawl4ai.com", config=config)
        # Group results by depth
        pages_by_depth = {}
        for result in results:
            depth = result.metadata.get("depth", 0)
            if depth not in pages_by_depth:
                pages_by_depth[depth] = []
            pages_by_depth[depth].append(result.url)
        print(f"Crawled {len(results)} pages total")
        for depth, urls in sorted(pages_by_depth.items()):
            print(f"Depth {depth}: {len(urls)} pages")
```
### Deep Crawl Strategies
```python
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, DFSDeepCrawlStrategy, BestFirstCrawlingStrategy
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
# Breadth-First Search - explores all links at one depth before going deeper
bfs_strategy = BFSDeepCrawlStrategy(
max_depth=2,
include_external=False,
max_pages=50, # Limit total pages
score_threshold=0.3 # Minimum score for URLs
)
# Depth-First Search - explores as deep as possible before backtracking
dfs_strategy = DFSDeepCrawlStrategy(
max_depth=2,
include_external=False,
max_pages=30,
score_threshold=0.5
)
# Best-First - prioritizes highest scoring pages (recommended)
keyword_scorer = KeywordRelevanceScorer(
keywords=["crawl", "example", "async", "configuration"],
weight=0.7
)
best_first_strategy = BestFirstCrawlingStrategy(
max_depth=2,
include_external=False,
url_scorer=keyword_scorer,
max_pages=25 # No score_threshold needed - naturally prioritizes
)
# Usage
config = CrawlerRunConfig(
deep_crawl_strategy=best_first_strategy, # Choose your strategy
scraping_strategy=LXMLWebScrapingStrategy()
)
```
### Streaming vs Batch Processing
```python
# Batch mode - wait for all results
async def batch_deep_crawl():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1),
        stream=False  # Default - collect all results first
    )
    async with AsyncWebCrawler() as crawler:
        results = await crawler.arun("https://example.com", config=config)
        # Process all results at once
        for result in results:
            print(f"Batch processed: {result.url}")

# Streaming mode - process results as they arrive
async def streaming_deep_crawl():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1),
        stream=True  # Process results immediately
    )
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun("https://example.com", config=config):
            depth = result.metadata.get("depth", 0)
            print(f"Stream processed depth {depth}: {result.url}")
```
### Filtering with Filter Chains
```python
from crawl4ai.deep_crawling.filters import (
FilterChain,
URLPatternFilter,
DomainFilter,
ContentTypeFilter,
SEOFilter,
ContentRelevanceFilter
)
# Single URL pattern filter
url_filter = URLPatternFilter(patterns=["*core*", "*guide*"])
config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
max_depth=1,
filter_chain=FilterChain([url_filter])
)
)
# Multiple filters in chain
advanced_filter_chain = FilterChain([
# Domain filtering
DomainFilter(
allowed_domains=["docs.example.com"],
blocked_domains=["old.docs.example.com", "staging.example.com"]
),
# URL pattern matching
URLPatternFilter(patterns=["*tutorial*", "*guide*", "*blog*"]),
# Content type filtering
ContentTypeFilter(allowed_types=["text/html"]),
# SEO quality filter
SEOFilter(
threshold=0.5,
keywords=["tutorial", "guide", "documentation"]
),
# Content relevance filter
ContentRelevanceFilter(
query="Web crawling and data extraction with Python",
threshold=0.7
)
])
config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
max_depth=2,
filter_chain=advanced_filter_chain
)
)
```
### Intelligent Crawling with Scorers
```python
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
# Keyword relevance scoring
async def scored_deep_crawl():
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["browser", "crawler", "web", "automation"],
        weight=1.0
    )
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=2,
            include_external=False,
            url_scorer=keyword_scorer
        ),
        stream=True,  # Recommended with BestFirst
        verbose=True
    )
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun("https://docs.crawl4ai.com", config=config):
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"Depth: {depth} | Score: {score:.2f} | {result.url}")
```
### Limiting Crawl Size
```python
# Max pages limitation across strategies
async def limited_crawls():
    # BFS with page limit
    bfs_config = CrawlerRunConfig(
        deep_crawl_strategy=BFSDeepCrawlStrategy(
            max_depth=2,
            max_pages=5,  # Only crawl 5 pages total
            url_scorer=KeywordRelevanceScorer(keywords=["browser", "crawler"], weight=1.0)
        )
    )
    # DFS with score threshold
    dfs_config = CrawlerRunConfig(
        deep_crawl_strategy=DFSDeepCrawlStrategy(
            max_depth=2,
            score_threshold=0.7,  # Only URLs with scores above 0.7
            max_pages=10,
            url_scorer=KeywordRelevanceScorer(keywords=["web", "automation"], weight=1.0)
        )
    )
    # Best-First with both constraints
    bf_config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=2,
            max_pages=7,  # Automatically gets highest scored pages
            url_scorer=KeywordRelevanceScorer(keywords=["crawl", "example"], weight=1.0)
        ),
        stream=True
    )
    async with AsyncWebCrawler() as crawler:
        # Use any of the configs
        async for result in await crawler.arun("https://docs.crawl4ai.com", config=bf_config):
            score = result.metadata.get("score", 0)
            print(f"Score: {score:.2f} | {result.url}")
```
### Complete Advanced Deep Crawler
```python
async def comprehensive_deep_crawl():
    # Sophisticated filter chain
    filter_chain = FilterChain([
        DomainFilter(
            allowed_domains=["docs.crawl4ai.com"],
            blocked_domains=["old.docs.crawl4ai.com"]
        ),
        URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
        ContentTypeFilter(allowed_types=["text/html"]),
        SEOFilter(threshold=0.4, keywords=["crawl", "tutorial", "guide"])
    ])
    # Multi-keyword scorer
    keyword_scorer = KeywordRelevanceScorer(
        keywords=["crawl", "example", "async", "configuration", "browser"],
        weight=0.8
    )
    # Complete configuration
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=2,
            include_external=False,
            filter_chain=filter_chain,
            url_scorer=keyword_scorer,
            max_pages=20
        ),
        scraping_strategy=LXMLWebScrapingStrategy(),
        stream=True,
        verbose=True,
        cache_mode=CacheMode.BYPASS
    )
    # Execute and analyze
    results = []
    start_time = time.time()
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun("https://docs.crawl4ai.com", config=config):
            results.append(result)
            score = result.metadata.get("score", 0)
            depth = result.metadata.get("depth", 0)
            print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
    # Performance analysis
    duration = time.time() - start_time
    avg_score = sum(r.metadata.get('score', 0) for r in results) / len(results)
    print(f"✅ Crawled {len(results)} pages in {duration:.2f}s")
    print(f"✅ Average relevance score: {avg_score:.2f}")
    # Depth distribution
    depth_counts = {}
    for result in results:
        depth = result.metadata.get("depth", 0)
        depth_counts[depth] = depth_counts.get(depth, 0) + 1
    for depth, count in sorted(depth_counts.items()):
        print(f"📊 Depth {depth}: {count} pages")
```
### Error Handling and Robustness
```python
async def robust_deep_crawl():
    config = CrawlerRunConfig(
        deep_crawl_strategy=BestFirstCrawlingStrategy(
            max_depth=2,
            max_pages=15,
            url_scorer=KeywordRelevanceScorer(keywords=["guide", "tutorial"])
        ),
        stream=True,
        page_timeout=30000  # 30 second timeout per page
    )
    successful_pages = []
    failed_pages = []
    async with AsyncWebCrawler() as crawler:
        async for result in await crawler.arun("https://docs.crawl4ai.com", config=config):
            if result.success:
                successful_pages.append(result)
                depth = result.metadata.get("depth", 0)
                score = result.metadata.get("score", 0)
                print(f"✅ Depth {depth} | Score: {score:.2f} | {result.url}")
            else:
                failed_pages.append({
                    'url': result.url,
                    'error': result.error_message,
                    'depth': result.metadata.get("depth", 0)
                })
                print(f"❌ Failed: {result.url} - {result.error_message}")
    print(f"📊 Results: {len(successful_pages)} successful, {len(failed_pages)} failed")
    # Analyze failures by depth
    if failed_pages:
        failure_by_depth = {}
        for failure in failed_pages:
            depth = failure['depth']
            failure_by_depth[depth] = failure_by_depth.get(depth, 0) + 1
        print("❌ Failures by depth:")
        for depth, count in sorted(failure_by_depth.items()):
            print(f"  Depth {depth}: {count} failures")
```
**📖 Learn more:** [Deep Crawling Guide](https://docs.crawl4ai.com/core/deep-crawling/), [Filter Documentation](https://docs.crawl4ai.com/core/content-selection/), [Scoring Strategies](https://docs.crawl4ai.com/advanced/advanced-features/)

View File

@@ -0,0 +1,826 @@
## Docker Deployment
Complete Docker deployment guide with pre-built images, API endpoints, configuration, and MCP integration.
### Quick Start with Pre-built Images
```bash
# Pull latest image
docker pull unclecode/crawl4ai:latest
# Setup LLM API keys
cat > .llm.env << EOL
OPENAI_API_KEY=sk-your-key
ANTHROPIC_API_KEY=your-anthropic-key
GROQ_API_KEY=your-groq-key
GEMINI_API_TOKEN=your-gemini-token
EOL
# Run with LLM support
docker run -d \
-p 11235:11235 \
--name crawl4ai \
--env-file .llm.env \
--shm-size=1g \
unclecode/crawl4ai:latest
# Basic run (no LLM)
docker run -d \
-p 11235:11235 \
--name crawl4ai \
--shm-size=1g \
unclecode/crawl4ai:latest
# Check health
curl http://localhost:11235/health
```
### Docker Compose Deployment
```bash
# Clone and setup
git clone https://github.com/unclecode/crawl4ai.git
cd crawl4ai
cp deploy/docker/.llm.env.example .llm.env
# Edit .llm.env with your API keys
# Run pre-built image
IMAGE=unclecode/crawl4ai:latest docker compose up -d
# Build locally
docker compose up --build -d
# Build with all features
INSTALL_TYPE=all docker compose up --build -d
# Build with GPU support
ENABLE_GPU=true docker compose up --build -d
# Stop service
docker compose down
```
### Manual Build with Multi-Architecture
```bash
# Clone repository
git clone https://github.com/unclecode/crawl4ai.git
cd crawl4ai
# Build for current architecture
docker buildx build -t crawl4ai-local:latest --load .
# Build for multiple architectures
docker buildx build --platform linux/amd64,linux/arm64 \
-t crawl4ai-local:latest --load .
# Build with specific features
docker buildx build \
--build-arg INSTALL_TYPE=all \
--build-arg ENABLE_GPU=false \
-t crawl4ai-local:latest --load .
# Run custom build
docker run -d \
-p 11235:11235 \
--name crawl4ai-custom \
--env-file .llm.env \
--shm-size=1g \
crawl4ai-local:latest
```
### Build Arguments
```bash
# Available build options
docker buildx build \
--build-arg INSTALL_TYPE=all \ # default|all|torch|transformer
--build-arg ENABLE_GPU=true \ # true|false
--build-arg APP_HOME=/app \ # Install path
--build-arg USE_LOCAL=true \ # Use local source
--build-arg GITHUB_REPO=url \ # Git repo if USE_LOCAL=false
--build-arg GITHUB_BRANCH=main \ # Git branch
-t crawl4ai-custom:latest --load .
```
### Core API Endpoints
```python
# Main crawling endpoints
import requests
import json
# Basic crawl
payload = {
"urls": ["https://example.com"],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}}
}
response = requests.post("http://localhost:11235/crawl", json=payload)
# Streaming crawl
payload["crawler_config"]["params"]["stream"] = True
response = requests.post("http://localhost:11235/crawl/stream", json=payload)
# Health check
response = requests.get("http://localhost:11235/health")
# API schema
response = requests.get("http://localhost:11235/schema")
# Metrics (Prometheus format)
response = requests.get("http://localhost:11235/metrics")
```
### Specialized Endpoints
```python
# HTML extraction (preprocessed for schema)
response = requests.post("http://localhost:11235/html",
json={"url": "https://example.com"})
# Screenshot capture
response = requests.post("http://localhost:11235/screenshot", json={
"url": "https://example.com",
"screenshot_wait_for": 2,
"output_path": "/path/to/save/screenshot.png"
})
# PDF generation
response = requests.post("http://localhost:11235/pdf", json={
"url": "https://example.com",
"output_path": "/path/to/save/document.pdf"
})
# JavaScript execution
response = requests.post("http://localhost:11235/execute_js", json={
"url": "https://example.com",
"scripts": [
"return document.title",
"return Array.from(document.querySelectorAll('a')).map(a => a.href)"
]
})
# Markdown generation
response = requests.post("http://localhost:11235/md", json={
"url": "https://example.com",
"f": "fit", # raw|fit|bm25|llm
"q": "extract main content", # query for filtering
"c": "0" # cache: 0=bypass, 1=use
})
# LLM Q&A
response = requests.get("http://localhost:11235/llm/https://example.com?q=What is this page about?")
# Library context (for AI assistants)
response = requests.get("http://localhost:11235/ask", params={
"context_type": "all", # code|doc|all
"query": "how to use extraction strategies",
"score_ratio": 0.5,
"max_results": 20
})
```
### Python SDK Usage
```python
import asyncio
from crawl4ai.docker_client import Crawl4aiDockerClient
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode
async def main():
    async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
        # Non-streaming crawl
        results = await client.crawl(
            ["https://example.com"],
            browser_config=BrowserConfig(headless=True),
            crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
        )
        for result in results:
            print(f"URL: {result.url}, Success: {result.success}")
            print(f"Content length: {len(result.markdown)}")
        # Streaming crawl
        stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
        async for result in await client.crawl(
            ["https://example.com", "https://python.org"],
            browser_config=BrowserConfig(headless=True),
            crawler_config=stream_config
        ):
            print(f"Streamed: {result.url} - {result.success}")
        # Get API schema
        schema = await client.get_schema()
        print(f"Schema available: {bool(schema)}")

asyncio.run(main())
```
### Advanced API Configuration
```python
# Complex extraction with LLM
payload = {
"urls": ["https://example.com"],
"browser_config": {
"type": "BrowserConfig",
"params": {
"headless": True,
"viewport": {"type": "dict", "value": {"width": 1200, "height": 800}}
}
},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"extraction_strategy": {
"type": "LLMExtractionStrategy",
"params": {
"llm_config": {
"type": "LLMConfig",
"params": {
"provider": "openai/gpt-4o-mini",
"api_token": "env:OPENAI_API_KEY"
}
},
"schema": {
"type": "dict",
"value": {
"type": "object",
"properties": {
"title": {"type": "string"},
"content": {"type": "string"}
}
}
},
"instruction": "Extract title and main content"
}
},
"markdown_generator": {
"type": "DefaultMarkdownGenerator",
"params": {
"content_filter": {
"type": "PruningContentFilter",
"params": {"threshold": 0.6}
}
}
}
}
}
}
response = requests.post("http://localhost:11235/crawl", json=payload)
```
### CSS Extraction Strategy
```python
# CSS-based structured extraction
schema = {
"name": "ProductList",
"baseSelector": ".product",
"fields": [
{"name": "title", "selector": "h2", "type": "text"},
{"name": "price", "selector": ".price", "type": "text"},
{"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
]
}
payload = {
"urls": ["https://example-shop.com"],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {
"type": "CrawlerRunConfig",
"params": {
"extraction_strategy": {
"type": "JsonCssExtractionStrategy",
"params": {
"schema": {"type": "dict", "value": schema}
}
}
}
}
}
response = requests.post("http://localhost:11235/crawl", json=payload)
data = response.json()
extracted = json.loads(data["results"][0]["extracted_content"])
```
### MCP (Model Context Protocol) Integration
```bash
# Add Crawl4AI as MCP provider to Claude Code
claude mcp add --transport sse c4ai-sse http://localhost:11235/mcp/sse
# List MCP providers
claude mcp list
# Test MCP connection
python tests/mcp/test_mcp_socket.py
# Available MCP endpoints
# SSE: http://localhost:11235/mcp/sse
# WebSocket: ws://localhost:11235/mcp/ws
# Schema: http://localhost:11235/mcp/schema
```
Available MCP tools:
- `md` - Generate markdown from web content
- `html` - Extract preprocessed HTML
- `screenshot` - Capture webpage screenshots
- `pdf` - Generate PDF documents
- `execute_js` - Run JavaScript on web pages
- `crawl` - Perform multi-URL crawling
- `ask` - Query Crawl4AI library context
### Configuration Management
```yaml
# config.yml structure
app:
title: "Crawl4AI API"
version: "1.0.0"
host: "0.0.0.0"
port: 11235
timeout_keep_alive: 300
llm:
provider: "openai/gpt-4o-mini"
api_key_env: "OPENAI_API_KEY"
security:
enabled: false
jwt_enabled: false
trusted_hosts: ["*"]
crawler:
memory_threshold_percent: 95.0
rate_limiter:
base_delay: [1.0, 2.0]
timeouts:
stream_init: 30.0
batch_process: 300.0
pool:
max_pages: 40
idle_ttl_sec: 1800
rate_limiting:
enabled: true
default_limit: "1000/minute"
storage_uri: "memory://"
logging:
level: "INFO"
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
```
### Custom Configuration Deployment
```bash
# Method 1: Mount custom config
docker run -d -p 11235:11235 \
--name crawl4ai-custom \
--env-file .llm.env \
--shm-size=1g \
-v $(pwd)/my-config.yml:/app/config.yml \
unclecode/crawl4ai:latest
# Method 2: Build with custom config
# Edit deploy/docker/config.yml then build
docker buildx build -t crawl4ai-custom:latest --load .
```
### Monitoring and Health Checks
```bash
# Health endpoint
curl http://localhost:11235/health
# Prometheus metrics
curl http://localhost:11235/metrics
# Configuration validation
curl -X POST http://localhost:11235/config/dump \
-H "Content-Type: application/json" \
-d '{"code": "CrawlerRunConfig(cache_mode=\"BYPASS\", screenshot=True)"}'
```
### Playground Interface
Access the interactive playground at `http://localhost:11235/playground` for:
- Testing configurations with visual interface
- Generating JSON payloads for REST API
- Converting Python config to JSON format
- Testing crawl operations directly in browser
### Async Job Processing
```python
# Submit job for async processing
import requests
import time
# Submit crawl job
response = requests.post("http://localhost:11235/crawl/job", json=payload)
task_id = response.json()["task_id"]
# Poll for completion
while True:
result = requests.get(f"http://localhost:11235/crawl/job/{task_id}")
status = result.json()
if status["status"] in ["COMPLETED", "FAILED"]:
break
time.sleep(1.5)
print("Final result:", status)
```
### Production Deployment
```bash
# Production-ready deployment
docker run -d \
--name crawl4ai-prod \
--restart unless-stopped \
-p 11235:11235 \
--env-file .llm.env \
--shm-size=2g \
--memory=8g \
--cpus=4 \
-v /path/to/custom-config.yml:/app/config.yml \
unclecode/crawl4ai:latest
# With Docker Compose for production
version: '3.8'
services:
crawl4ai:
image: unclecode/crawl4ai:latest
ports:
- "11235:11235"
environment:
- OPENAI_API_KEY=${OPENAI_API_KEY}
volumes:
- ./config.yml:/app/config.yml
shm_size: 2g
deploy:
resources:
limits:
memory: 8G
cpus: '4'
restart: unless-stopped
```
### Configuration Validation and JSON Structure
```python
# Method 1: Create config objects and dump to see expected JSON structure
from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
import json
# Create browser config and see JSON structure
browser_config = BrowserConfig(
headless=True,
viewport_width=1280,
viewport_height=720,
proxy="http://user:pass@proxy:8080"
)
# Get JSON structure
browser_json = browser_config.dump()
print("BrowserConfig JSON structure:")
print(json.dumps(browser_json, indent=2))
# Create crawler config with extraction strategy
schema = {
"name": "Articles",
"baseSelector": ".article",
"fields": [
{"name": "title", "selector": "h2", "type": "text"},
{"name": "content", "selector": ".content", "type": "html"}
]
}
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
screenshot=True,
extraction_strategy=JsonCssExtractionStrategy(schema),
js_code=["window.scrollTo(0, document.body.scrollHeight);"],
wait_for="css:.loaded"
)
crawler_json = crawler_config.dump()
print("\nCrawlerRunConfig JSON structure:")
print(json.dumps(crawler_json, indent=2))
```
### Reverse Validation - JSON to Objects
```python
# Method 2: Load JSON back to config objects for validation
from crawl4ai.async_configs import from_serializable_dict
# Test JSON structure by converting back to objects
test_browser_json = {
"type": "BrowserConfig",
"params": {
"headless": True,
"viewport_width": 1280,
"proxy": "http://user:pass@proxy:8080"
}
}
try:
# Convert JSON back to object
restored_browser = from_serializable_dict(test_browser_json)
print(f"✅ Valid BrowserConfig: {type(restored_browser)}")
print(f"Headless: {restored_browser.headless}")
print(f"Proxy: {restored_browser.proxy}")
except Exception as e:
print(f"❌ Invalid BrowserConfig JSON: {e}")
# Test complex crawler config JSON
test_crawler_json = {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": "bypass",
"screenshot": True,
"extraction_strategy": {
"type": "JsonCssExtractionStrategy",
"params": {
"schema": {
"type": "dict",
"value": {
"name": "Products",
"baseSelector": ".product",
"fields": [
{"name": "title", "selector": "h3", "type": "text"}
]
}
}
}
}
}
}
try:
restored_crawler = from_serializable_dict(test_crawler_json)
print(f"✅ Valid CrawlerRunConfig: {type(restored_crawler)}")
print(f"Cache mode: {restored_crawler.cache_mode}")
print(f"Has extraction strategy: {restored_crawler.extraction_strategy is not None}")
except Exception as e:
print(f"❌ Invalid CrawlerRunConfig JSON: {e}")
```
### Using Server's /config/dump Endpoint for Validation
```python
import requests
# Method 3: Use server endpoint to validate configuration syntax
def validate_config_with_server(config_code: str, timeout: float = 30.0) -> dict:
    """Validate configuration syntax via the server's /config/dump endpoint.

    Args:
        config_code: Python snippet such as "CrawlerRunConfig(...)" to validate.
        timeout: Seconds to wait for the server response (new parameter with a
            default, so existing callers are unaffected).

    Returns:
        The parsed JSON structure on success, or None when the server rejects
        the configuration.
    """
    response = requests.post(
        "http://localhost:11235/config/dump",
        json={"code": config_code},
        timeout=timeout,  # avoid hanging indefinitely on an unresponsive server
    )
    if response.status_code == 200:
        print("✅ Valid configuration syntax")
        return response.json()
    print(f"❌ Invalid configuration: {response.status_code}")
    try:
        # Error bodies are normally JSON, but guard against plain-text/HTML
        # responses (e.g. from a proxy) that would make .json() raise.
        print(response.json())
    except ValueError:
        print(response.text)
    return None
# Test valid configuration
valid_config = """
CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
screenshot=True,
js_code=["window.scrollTo(0, document.body.scrollHeight);"],
wait_for="css:.content-loaded"
)
"""
result = validate_config_with_server(valid_config)
if result:
print("Generated JSON structure:")
print(json.dumps(result, indent=2))
# Test invalid configuration (should fail)
invalid_config = """
CrawlerRunConfig(
cache_mode="invalid_mode",
screenshot=True,
js_code=some_function() # This will fail
)
"""
validate_config_with_server(invalid_config)
```
### Configuration Builder Helper
```python
def build_and_validate_request(urls, browser_params=None, crawler_params=None):
    """Build a complete /crawl request payload and round-trip validate it.

    Serializes BrowserConfig/CrawlerRunConfig into their JSON wire format,
    then confirms both sections can be deserialized back into config objects.
    Returns the payload dict on success, or None if validation fails.
    """
    url_list = urls if isinstance(urls, list) else [urls]
    payload = {
        "urls": url_list,
        "browser_config": BrowserConfig(**(browser_params or {})).dump(),
        "crawler_config": CrawlerRunConfig(**(crawler_params or {})).dump(),
    }
    print("✅ Complete request payload:")
    print(json.dumps(payload, indent=2))
    # Round-trip check: if either section cannot be reconstructed locally,
    # the server would reject it as well.
    try:
        from_serializable_dict(payload["browser_config"])
        from_serializable_dict(payload["crawler_config"])
    except Exception as e:
        print(f"❌ Payload validation failed: {e}")
        return None
    print("✅ Payload validation successful")
    return payload
# Example usage
payload = build_and_validate_request(
urls=["https://example.com"],
browser_params={"headless": True, "viewport_width": 1280},
crawler_params={
"cache_mode": CacheMode.BYPASS,
"screenshot": True,
"word_count_threshold": 10
}
)
if payload:
# Send to server
response = requests.post("http://localhost:11235/crawl", json=payload)
print(f"Server response: {response.status_code}")
```
### Common JSON Structure Patterns
```python
# Pattern 1: Simple primitive values
simple_config = {
"type": "CrawlerRunConfig",
"params": {
"cache_mode": "bypass", # String enum value
"screenshot": True, # Boolean
"page_timeout": 60000 # Integer
}
}
# Pattern 2: Nested objects
nested_config = {
"type": "CrawlerRunConfig",
"params": {
"extraction_strategy": {
"type": "LLMExtractionStrategy",
"params": {
"llm_config": {
"type": "LLMConfig",
"params": {
"provider": "openai/gpt-4o-mini",
"api_token": "env:OPENAI_API_KEY"
}
},
"instruction": "Extract main content"
}
}
}
}
# Pattern 3: Dictionary values (must use type: dict wrapper)
dict_config = {
"type": "CrawlerRunConfig",
"params": {
"extraction_strategy": {
"type": "JsonCssExtractionStrategy",
"params": {
"schema": {
"type": "dict", # Required wrapper
"value": { # Actual dictionary content
"name": "Products",
"baseSelector": ".product",
"fields": [
{"name": "title", "selector": "h2", "type": "text"}
]
}
}
}
}
}
}
# Pattern 4: Lists and arrays
list_config = {
"type": "CrawlerRunConfig",
"params": {
"js_code": [ # Lists are handled directly
"window.scrollTo(0, document.body.scrollHeight);",
"document.querySelector('.load-more')?.click();"
],
"excluded_tags": ["script", "style", "nav"]
}
}
```
### Troubleshooting Common JSON Errors
```python
def diagnose_json_errors():
    """Common JSON structure errors and fixes"""
    # ❌ WRONG: object configs need a {"type": ..., "params": ...} wrapper
    bad_config = {"browser_config": {"headless": True}}
    # ✅ CORRECT: proper type wrapper around the params
    good_config = {
        "browser_config": {
            "type": "BrowserConfig",
            "params": {"headless": True},
        },
    }
    # ❌ WRONG: raw dictionaries must be wrapped as {"type": "dict", "value": ...}
    bad_dict = {"schema": {"name": "Products"}}
    # ✅ CORRECT: dictionary with the required wrapper
    good_dict = {
        "schema": {
            "type": "dict",
            "value": {"name": "Products"},
        },
    }
    # ❌ WRONG: enum strings are lowercase; "DISABLED" is rejected
    bad_enum = {"cache_mode": "DISABLED"}
    # ✅ CORRECT: valid lowercase enum string
    good_enum = {"cache_mode": "bypass"}  # or "enabled", "disabled", etc.
    print("Common error patterns documented above")
# Validate your JSON structure before sending
def pre_flight_check(payload):
    """Run checks before sending to server"""
    # Every request needs the target URLs plus both config sections.
    for required in ("urls", "browser_config", "crawler_config"):
        if required not in payload:
            print(f"❌ Missing required key: {required}")
            return False
    # Each config section must be serialized as {"type": ..., "params": ...}.
    for section in ("browser_config", "crawler_config"):
        entry = payload[section]
        wrapped = isinstance(entry, dict) and "type" in entry
        if not wrapped:
            print(f"❌ {section} missing type wrapper")
            return False
        if "params" not in entry:
            print(f"❌ {section} missing params")
            return False
    print("✅ Pre-flight check passed")
    return True
# Example usage
payload = {
"urls": ["https://example.com"],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}}
}
if pre_flight_check(payload):
# Safe to send to server
pass
```
**📖 Learn more:** [Complete Docker Guide](https://docs.crawl4ai.com/core/docker-deployment/), [API Reference](https://docs.crawl4ai.com/api/), [MCP Integration](https://docs.crawl4ai.com/core/docker-deployment/#mcp-model-context-protocol-support), [Configuration Options](https://docs.crawl4ai.com/core/docker-deployment/#server-configuration)

View File

@@ -0,0 +1,788 @@
## Extraction Strategies
Powerful data extraction from web pages using LLM-based intelligent parsing or fast schema/pattern-based approaches.
### LLM-Based Extraction - Intelligent Content Understanding
```python
import os
import asyncio
import json
from pydantic import BaseModel, Field
from typing import List
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode
from crawl4ai.extraction_strategy import LLMExtractionStrategy
# Define structured data model
class Product(BaseModel):
name: str = Field(description="Product name")
price: str = Field(description="Product price")
description: str = Field(description="Product description")
features: List[str] = Field(description="List of product features")
rating: float = Field(description="Product rating out of 5")
# Configure LLM provider
llm_config = LLMConfig(
provider="openai/gpt-4o-mini", # or "ollama/llama3.3", "anthropic/claude-3-5-sonnet"
api_token=os.getenv("OPENAI_API_KEY"), # or "env:OPENAI_API_KEY"
temperature=0.1,
max_tokens=2000
)
# Create LLM extraction strategy
llm_strategy = LLMExtractionStrategy(
llm_config=llm_config,
schema=Product.model_json_schema(),
extraction_type="schema", # or "block" for freeform text
instruction="""
Extract product information from the webpage content.
Focus on finding complete product details including:
- Product name and price
- Detailed description
- All listed features
- Customer rating if available
Return valid JSON array of products.
""",
chunk_token_threshold=1200, # Split content if too large
overlap_rate=0.1, # 10% overlap between chunks
apply_chunking=True, # Enable automatic chunking
input_format="markdown", # "html", "fit_markdown", or "markdown"
extra_args={"temperature": 0.0, "max_tokens": 800},
verbose=True
)
async def extract_with_llm():
browser_config = BrowserConfig(headless=True)
crawl_config = CrawlerRunConfig(
extraction_strategy=llm_strategy,
cache_mode=CacheMode.BYPASS,
word_count_threshold=10
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://example.com/products",
config=crawl_config
)
if result.success:
# Parse extracted JSON
products = json.loads(result.extracted_content)
print(f"Extracted {len(products)} products")
for product in products[:3]: # Show first 3
print(f"Product: {product['name']}")
print(f"Price: {product['price']}")
print(f"Rating: {product.get('rating', 'N/A')}")
# Show token usage and cost
llm_strategy.show_usage()
else:
print(f"Extraction failed: {result.error_message}")
asyncio.run(extract_with_llm())
```
### LLM Strategy Advanced Configuration
```python
# Multiple provider configurations
providers = {
"openai": LLMConfig(
provider="openai/gpt-4o",
api_token="env:OPENAI_API_KEY",
temperature=0.1
),
"anthropic": LLMConfig(
provider="anthropic/claude-3-5-sonnet-20240620",
api_token="env:ANTHROPIC_API_KEY",
max_tokens=4000
),
"ollama": LLMConfig(
provider="ollama/llama3.3",
api_token=None, # Not needed for Ollama
base_url="http://localhost:11434"
),
"groq": LLMConfig(
provider="groq/llama3-70b-8192",
api_token="env:GROQ_API_KEY"
)
}
# Advanced chunking for large content
large_content_strategy = LLMExtractionStrategy(
llm_config=providers["openai"],
schema=YourModel.model_json_schema(),
extraction_type="schema",
instruction="Extract detailed information...",
# Chunking parameters
chunk_token_threshold=2000, # Larger chunks for complex content
overlap_rate=0.15, # More overlap for context preservation
apply_chunking=True,
# Input format selection
input_format="fit_markdown", # Use filtered content if available
# LLM parameters
extra_args={
"temperature": 0.0, # Deterministic output
"top_p": 0.9,
"frequency_penalty": 0.1,
"presence_penalty": 0.1,
"max_tokens": 1500
},
verbose=True
)
# Knowledge graph extraction
class Entity(BaseModel):
name: str
type: str # "person", "organization", "location", etc.
description: str
class Relationship(BaseModel):
source: str
target: str
relationship: str
confidence: float
class KnowledgeGraph(BaseModel):
entities: List[Entity]
relationships: List[Relationship]
summary: str
knowledge_strategy = LLMExtractionStrategy(
llm_config=providers["anthropic"],
schema=KnowledgeGraph.model_json_schema(),
extraction_type="schema",
instruction="""
Create a knowledge graph from the content by:
1. Identifying key entities (people, organizations, locations, concepts)
2. Finding relationships between entities
3. Providing confidence scores for relationships
4. Summarizing the main topics
""",
input_format="html", # Use HTML for better structure preservation
apply_chunking=True,
chunk_token_threshold=1500
)
```
### JSON CSS Extraction - Fast Schema-Based Extraction
```python
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
# Basic CSS extraction schema
simple_schema = {
"name": "Product Listings",
"baseSelector": "div.product-card",
"fields": [
{
"name": "title",
"selector": "h2.product-title",
"type": "text"
},
{
"name": "price",
"selector": ".price",
"type": "text"
},
{
"name": "image_url",
"selector": "img.product-image",
"type": "attribute",
"attribute": "src"
},
{
"name": "product_url",
"selector": "a.product-link",
"type": "attribute",
"attribute": "href"
}
]
}
# Complex nested schema with multiple data types
complex_schema = {
"name": "E-commerce Product Catalog",
"baseSelector": "div.category",
"baseFields": [
{
"name": "category_id",
"type": "attribute",
"attribute": "data-category-id"
},
{
"name": "category_url",
"type": "attribute",
"attribute": "data-url"
}
],
"fields": [
{
"name": "category_name",
"selector": "h2.category-title",
"type": "text"
},
{
"name": "products",
"selector": "div.product",
"type": "nested_list", # Array of complex objects
"fields": [
{
"name": "name",
"selector": "h3.product-name",
"type": "text",
"default": "Unknown Product"
},
{
"name": "price",
"selector": "span.price",
"type": "text"
},
{
"name": "details",
"selector": "div.product-details",
"type": "nested", # Single complex object
"fields": [
{
"name": "brand",
"selector": "span.brand",
"type": "text"
},
{
"name": "model",
"selector": "span.model",
"type": "text"
},
{
"name": "specs",
"selector": "div.specifications",
"type": "html" # Preserve HTML structure
}
]
},
{
"name": "features",
"selector": "ul.features li",
"type": "list", # Simple array of strings
"fields": [
{"name": "feature", "type": "text"}
]
},
{
"name": "reviews",
"selector": "div.review",
"type": "nested_list",
"fields": [
{
"name": "reviewer",
"selector": "span.reviewer-name",
"type": "text"
},
{
"name": "rating",
"selector": "span.rating",
"type": "attribute",
"attribute": "data-rating"
},
{
"name": "comment",
"selector": "p.review-text",
"type": "text"
},
{
"name": "date",
"selector": "time.review-date",
"type": "attribute",
"attribute": "datetime"
}
]
}
]
}
]
}
async def extract_with_css_schema():
strategy = JsonCssExtractionStrategy(complex_schema, verbose=True)
config = CrawlerRunConfig(
extraction_strategy=strategy,
cache_mode=CacheMode.BYPASS,
# Enable dynamic content loading if needed
js_code="window.scrollTo(0, document.body.scrollHeight);",
wait_for="css:.product:nth-child(10)", # Wait for products to load
process_iframes=True
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com/catalog",
config=config
)
if result.success:
data = json.loads(result.extracted_content)
print(f"Extracted {len(data)} categories")
for category in data:
print(f"Category: {category['category_name']}")
print(f"Products: {len(category.get('products', []))}")
# Show first product details
if category.get('products'):
product = category['products'][0]
print(f" First product: {product.get('name')}")
print(f" Features: {len(product.get('features', []))}")
print(f" Reviews: {len(product.get('reviews', []))}")
asyncio.run(extract_with_css_schema())
```
### Automatic Schema Generation - One-Time LLM, Unlimited Use
```python
import json
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig, CacheMode
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
async def generate_and_use_schema():
    """Generate a CSS extraction schema once with an LLM, then reuse it for free.

    Workflow:
        1. Use an LLM once to generate the schema from sample HTML.
        2. Cache the schema on disk for reuse.
        3. Run fast, LLM-free extraction across many pages with the cached schema.
    """
    cache_dir = Path("./schema_cache")
    cache_dir.mkdir(exist_ok=True)
    schema_file = cache_dir / "ecommerce_schema.json"
    # Step 1: Generate or load cached schema
    if schema_file.exists():
        # read_text() closes the file immediately; json.load(schema_file.open())
        # would leak the handle until garbage collection.
        schema = json.loads(schema_file.read_text())
        print("Using cached schema")
    else:
        print("Generating schema using LLM...")
        # Configure LLM for schema generation
        llm_config = LLMConfig(
            provider="openai/gpt-4o",  # or "ollama/llama3.3" for local
            api_token="env:OPENAI_API_KEY"
        )
        # Get sample HTML from target site
        async with AsyncWebCrawler() as crawler:
            sample_result = await crawler.arun(
                url="https://example.com/products",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
            )
            sample_html = sample_result.cleaned_html[:5000]  # Use first 5k chars
        # Generate schema automatically (ONE-TIME LLM COST)
        schema = JsonCssExtractionStrategy.generate_schema(
            html=sample_html,
            schema_type="css",
            llm_config=llm_config,
            instruction="Extract product information including name, price, description, and features"
        )
        # Cache schema for future use (NO MORE LLM CALLS); write_text closes the file.
        schema_file.write_text(json.dumps(schema, indent=2))
        print("Schema generated and cached")
    # Step 2: Use schema for fast extraction (NO LLM CALLS)
    strategy = JsonCssExtractionStrategy(schema, verbose=True)
    config = CrawlerRunConfig(
        extraction_strategy=strategy,
        cache_mode=CacheMode.BYPASS
    )
    # Step 3: Extract from multiple pages using the same cached schema
    urls = [
        "https://example.com/products",
        "https://example.com/electronics",
        "https://example.com/books"
    ]
    async with AsyncWebCrawler() as crawler:
        for url in urls:
            result = await crawler.arun(url=url, config=config)
            if result.success:
                data = json.loads(result.extracted_content)
                print(f"{url}: Extracted {len(data)} items")
            else:
                print(f"{url}: Failed - {result.error_message}")
asyncio.run(generate_and_use_schema())
```
### XPath Extraction Strategy
```python
from crawl4ai import LLMConfig
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
# XPath-based schema (alternative to CSS)
xpath_schema = {
"name": "News Articles",
"baseSelector": "//article[@class='news-item']",
"baseFields": [
{
"name": "article_id",
"type": "attribute",
"attribute": "data-id"
}
],
"fields": [
{
"name": "headline",
"selector": ".//h2[@class='headline']",
"type": "text"
},
{
"name": "author",
"selector": ".//span[@class='author']/text()",
"type": "text"
},
{
"name": "publish_date",
"selector": ".//time/@datetime",
"type": "text"
},
{
"name": "content",
"selector": ".//div[@class='article-body']",
"type": "html"
},
{
"name": "tags",
"selector": ".//div[@class='tags']/span[@class='tag']",
"type": "list",
"fields": [
{"name": "tag", "type": "text"}
]
}
]
}
# Generate XPath schema automatically
async def generate_xpath_schema():
llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None)
sample_html = """
<article class="news-item" data-id="123">
<h2 class="headline">Breaking News</h2>
<span class="author">John Doe</span>
<time datetime="2024-01-01">Today</time>
<div class="article-body"><p>Content here...</p></div>
</article>
"""
schema = JsonXPathExtractionStrategy.generate_schema(
html=sample_html,
schema_type="xpath",
llm_config=llm_config
)
return schema
# Use XPath strategy
xpath_strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
```
### Regex Extraction Strategy - Pattern-Based Fast Extraction
```python
import asyncio
import json
from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import RegexExtractionStrategy
# Built-in patterns for common data types
async def extract_with_builtin_patterns():
# Use multiple built-in patterns
strategy = RegexExtractionStrategy(
pattern=(
RegexExtractionStrategy.Email |
RegexExtractionStrategy.PhoneUS |
RegexExtractionStrategy.Url |
RegexExtractionStrategy.Currency |
RegexExtractionStrategy.DateIso
)
)
config = CrawlerRunConfig(extraction_strategy=strategy)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com/contact",
config=config
)
if result.success:
matches = json.loads(result.extracted_content)
# Group by pattern type
by_type = {}
for match in matches:
label = match['label']
if label not in by_type:
by_type[label] = []
by_type[label].append(match['value'])
for pattern_type, values in by_type.items():
print(f"{pattern_type}: {len(values)} matches")
for value in values[:3]: # Show first 3
print(f" {value}")
# Custom regex patterns
custom_patterns = {
"product_code": r"SKU-\d{4,6}",
"discount": r"\d{1,2}%\s*off",
"model_number": r"Model:\s*([A-Z0-9-]+)"
}
async def extract_with_custom_patterns():
strategy = RegexExtractionStrategy(custom=custom_patterns)
config = CrawlerRunConfig(extraction_strategy=strategy)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com/products",
config=config
)
if result.success:
data = json.loads(result.extracted_content)
for item in data:
print(f"{item['label']}: {item['value']}")
# LLM-generated patterns (one-time cost)
async def generate_custom_patterns():
    """Build a RegexExtractionStrategy from LLM-generated patterns, cached on disk.

    The LLM is called only on the first run; subsequent runs load the cached
    pattern file and incur no LLM cost.

    Returns:
        RegexExtractionStrategy configured with the (possibly cached) patterns.
    """
    cache_file = Path("./patterns/price_patterns.json")
    if cache_file.exists():
        # read_text() closes the handle; json.load(cache_file.open()) leaks it.
        patterns = json.loads(cache_file.read_text())
    else:
        llm_config = LLMConfig(
            provider="openai/gpt-4o-mini",
            api_token="env:OPENAI_API_KEY"
        )
        # Get sample content
        async with AsyncWebCrawler() as crawler:
            result = await crawler.arun("https://example.com/pricing")
            sample_html = result.cleaned_html
        # Generate optimized patterns (one-time LLM cost)
        patterns = RegexExtractionStrategy.generate_pattern(
            label="pricing_info",
            html=sample_html,
            query="Extract all pricing information including discounts and special offers",
            llm_config=llm_config
        )
        # Cache for reuse; parents=True also creates missing ancestor directories.
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        cache_file.write_text(json.dumps(patterns, indent=2))
    # Use cached patterns (no more LLM calls)
    return RegexExtractionStrategy(custom=patterns)
asyncio.run(extract_with_builtin_patterns())
asyncio.run(extract_with_custom_patterns())
```
### Complete Extraction Workflow - Combining Strategies
```python
import asyncio
import json
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
from crawl4ai.extraction_strategy import (
JsonCssExtractionStrategy,
RegexExtractionStrategy,
LLMExtractionStrategy
)
async def multi_strategy_extraction():
    """
    Demonstrate using multiple extraction strategies in sequence:
    1. Fast regex for common patterns
    2. Schema-based for structured data
    3. LLM for complex reasoning

    Returns:
        dict with "contacts" (regex matches), "products" (CSS-extracted rows),
        and "analysis" (LLM output) — empty containers for any failed crawl.
    """
    browser_config = BrowserConfig(headless=True)
    # Strategy 1: Fast regex extraction
    regex_strategy = RegexExtractionStrategy(
        pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS
    )
    # Strategy 2: Schema-based structured extraction
    product_schema = {
        "name": "Products",
        "baseSelector": "div.product",
        "fields": [
            {"name": "name", "selector": "h3", "type": "text"},
            {"name": "price", "selector": ".price", "type": "text"},
            {"name": "rating", "selector": ".rating", "type": "attribute", "attribute": "data-rating"}
        ]
    }
    css_strategy = JsonCssExtractionStrategy(product_schema)
    # Strategy 3: LLM for complex analysis
    llm_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"),
        schema={
            "type": "object",
            "properties": {
                "sentiment": {"type": "string"},
                "key_topics": {"type": "array", "items": {"type": "string"}},
                "summary": {"type": "string"}
            }
        },
        extraction_type="schema",
        instruction="Analyze the content sentiment, extract key topics, and provide a summary"
    )
    url = "https://example.com/product-reviews"
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Extract contact info with regex
        regex_config = CrawlerRunConfig(extraction_strategy=regex_strategy)
        regex_result = await crawler.arun(url=url, config=regex_config)
        # Extract structured product data
        css_config = CrawlerRunConfig(extraction_strategy=css_strategy)
        css_result = await crawler.arun(url=url, config=css_config)
        # Extract insights with LLM. Renamed from `llm_config`: that name
        # conventionally denotes an LLMConfig, and using it for a
        # CrawlerRunConfig was misleading.
        llm_run_config = CrawlerRunConfig(extraction_strategy=llm_strategy)
        llm_result = await crawler.arun(url=url, config=llm_run_config)
        # Combine results
        results = {
            "contacts": json.loads(regex_result.extracted_content) if regex_result.success else [],
            "products": json.loads(css_result.extracted_content) if css_result.success else [],
            "analysis": json.loads(llm_result.extracted_content) if llm_result.success else {}
        }
        print(f"Found {len(results['contacts'])} contact entries")
        print(f"Found {len(results['products'])} products")
        print(f"Sentiment: {results['analysis'].get('sentiment', 'N/A')}")
        return results
# Performance comparison
async def compare_extraction_performance():
"""Compare speed and accuracy of different strategies"""
import time
url = "https://example.com/large-catalog"
strategies = {
"regex": RegexExtractionStrategy(pattern=RegexExtractionStrategy.Currency),
"css": JsonCssExtractionStrategy({
"name": "Prices",
"baseSelector": ".price",
"fields": [{"name": "amount", "selector": "span", "type": "text"}]
}),
"llm": LLMExtractionStrategy(
llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"),
instruction="Extract all prices from the content",
extraction_type="block"
)
}
async with AsyncWebCrawler() as crawler:
for name, strategy in strategies.items():
start_time = time.time()
config = CrawlerRunConfig(extraction_strategy=strategy)
result = await crawler.arun(url=url, config=config)
duration = time.time() - start_time
if result.success:
data = json.loads(result.extracted_content)
print(f"{name}: {len(data)} items in {duration:.2f}s")
else:
print(f"{name}: Failed in {duration:.2f}s")
asyncio.run(multi_strategy_extraction())
asyncio.run(compare_extraction_performance())
```
### Best Practices and Strategy Selection
```python
# Strategy selection guide
def choose_extraction_strategy(use_case):
"""
Guide for selecting the right extraction strategy
"""
strategies = {
# Fast pattern matching for common data types
"contact_info": RegexExtractionStrategy(
pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS
),
# Structured data from consistent HTML
"product_catalogs": JsonCssExtractionStrategy,
# Complex reasoning and semantic understanding
"content_analysis": LLMExtractionStrategy,
# Mixed approach for comprehensive extraction
"complete_site_analysis": "multi_strategy"
}
recommendations = {
"speed_priority": "Use RegexExtractionStrategy for simple patterns, JsonCssExtractionStrategy for structured data",
"accuracy_priority": "Use LLMExtractionStrategy for complex content, JsonCssExtractionStrategy for predictable structure",
"cost_priority": "Avoid LLM strategies, use schema generation once then JsonCssExtractionStrategy",
"scale_priority": "Cache schemas, use regex for simple patterns, avoid LLM for high-volume extraction"
}
return recommendations.get(use_case, "Combine strategies based on content complexity")
# Error handling and validation
async def robust_extraction():
strategies = [
RegexExtractionStrategy(pattern=RegexExtractionStrategy.Email),
JsonCssExtractionStrategy(simple_schema),
# LLM as fallback for complex cases
]
async with AsyncWebCrawler() as crawler:
for strategy in strategies:
try:
config = CrawlerRunConfig(extraction_strategy=strategy)
result = await crawler.arun(url="https://example.com", config=config)
if result.success and result.extracted_content:
data = json.loads(result.extracted_content)
if data: # Validate non-empty results
print(f"Success with {strategy.__class__.__name__}")
return data
except Exception as e:
print(f"Strategy {strategy.__class__.__name__} failed: {e}")
continue
print("All strategies failed")
return None
```
**📖 Learn more:** [LLM Strategies Deep Dive](https://docs.crawl4ai.com/extraction/llm-strategies/), [Schema-Based Extraction](https://docs.crawl4ai.com/extraction/no-llm-strategies/), [Regex Patterns](https://docs.crawl4ai.com/extraction/no-llm-strategies/#regexextractionstrategy), [Performance Optimization](https://docs.crawl4ai.com/advanced/multi-url-crawling/)

View File

@@ -0,0 +1,388 @@
## HTTP Crawler Strategy
Fast, lightweight HTTP-only crawling without browser overhead for cases where JavaScript execution isn't needed.
### Basic HTTP Crawler Setup
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, HTTPCrawlerConfig, CacheMode
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
from crawl4ai.async_logger import AsyncLogger
async def main():
# Initialize HTTP strategy
http_strategy = AsyncHTTPCrawlerStrategy(
browser_config=HTTPCrawlerConfig(
method="GET",
verify_ssl=True,
follow_redirects=True
),
logger=AsyncLogger(verbose=True)
)
# Use with AsyncWebCrawler
async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler:
result = await crawler.arun("https://example.com")
print(f"Status: {result.status_code}")
print(f"Content: {len(result.html)} chars")
if __name__ == "__main__":
asyncio.run(main())
```
### HTTP Request Types
```python
# GET request (default)
http_config = HTTPCrawlerConfig(
method="GET",
headers={"Accept": "application/json"}
)
# POST with JSON data
http_config = HTTPCrawlerConfig(
method="POST",
json={"key": "value", "data": [1, 2, 3]},
headers={"Content-Type": "application/json"}
)
# POST with form data
http_config = HTTPCrawlerConfig(
method="POST",
data={"username": "user", "password": "pass"},
headers={"Content-Type": "application/x-www-form-urlencoded"}
)
# Advanced configuration
http_config = HTTPCrawlerConfig(
method="GET",
headers={"User-Agent": "Custom Bot/1.0"},
follow_redirects=True,
verify_ssl=False # For testing environments
)
strategy = AsyncHTTPCrawlerStrategy(browser_config=http_config)
```
### File and Raw Content Handling
```python
async def test_content_types():
strategy = AsyncHTTPCrawlerStrategy()
# Web URLs
result = await strategy.crawl("https://httpbin.org/get")
print(f"Web content: {result.status_code}")
# Local files
result = await strategy.crawl("file:///path/to/local/file.html")
print(f"File content: {len(result.html)}")
# Raw HTML content
raw_html = "raw://<html><body><h1>Test</h1><p>Content</p></body></html>"
result = await strategy.crawl(raw_html)
print(f"Raw content: {result.html}")
# Raw content with complex HTML
complex_html = """raw://<!DOCTYPE html>
<html>
<head><title>Test Page</title></head>
<body>
<div class="content">
<h1>Main Title</h1>
<p>Paragraph content</p>
<ul><li>Item 1</li><li>Item 2</li></ul>
</div>
</body>
</html>"""
result = await strategy.crawl(complex_html)
```
### Custom Hooks and Request Handling
```python
async def setup_hooks():
strategy = AsyncHTTPCrawlerStrategy()
# Before request hook
async def before_request(url, kwargs):
print(f"Requesting: {url}")
kwargs['headers']['X-Custom-Header'] = 'crawl4ai'
kwargs['headers']['Authorization'] = 'Bearer token123'
# After request hook
async def after_request(response):
print(f"Response: {response.status_code}")
if hasattr(response, 'redirected_url'):
print(f"Redirected to: {response.redirected_url}")
# Error handling hook
async def on_error(error):
print(f"Request failed: {error}")
# Set hooks
strategy.set_hook('before_request', before_request)
strategy.set_hook('after_request', after_request)
strategy.set_hook('on_error', on_error)
# Use with hooks
result = await strategy.crawl("https://httpbin.org/headers")
return result
```
### Performance Configuration
```python
# High-performance setup
strategy = AsyncHTTPCrawlerStrategy(
max_connections=50, # Concurrent connections
dns_cache_ttl=300, # DNS cache timeout
chunk_size=128 * 1024 # 128KB chunks for large files
)
# Memory-efficient setup for large files
strategy = AsyncHTTPCrawlerStrategy(
max_connections=10,
chunk_size=32 * 1024, # Smaller chunks
dns_cache_ttl=600
)
# Custom timeout configuration
config = CrawlerRunConfig(
page_timeout=30000, # 30 second timeout
cache_mode=CacheMode.BYPASS
)
result = await strategy.crawl("https://slow-server.com", config=config)
```
### Error Handling and Retries
```python
from crawl4ai.async_crawler_strategy import (
ConnectionTimeoutError,
HTTPStatusError,
HTTPCrawlerError
)
async def robust_crawling():
strategy = AsyncHTTPCrawlerStrategy()
urls = [
"https://example.com",
"https://httpbin.org/status/404",
"https://nonexistent.domain.test"
]
for url in urls:
try:
result = await strategy.crawl(url)
print(f"✓ {url}: {result.status_code}")
except HTTPStatusError as e:
print(f"✗ {url}: HTTP {e.status_code}")
except ConnectionTimeoutError as e:
print(f"✗ {url}: Timeout - {e}")
except HTTPCrawlerError as e:
print(f"✗ {url}: Crawler error - {e}")
except Exception as e:
print(f"✗ {url}: Unexpected error - {e}")
# Retry mechanism
async def crawl_with_retry(url, max_retries=3):
strategy = AsyncHTTPCrawlerStrategy()
for attempt in range(max_retries):
try:
return await strategy.crawl(url)
except (ConnectionTimeoutError, HTTPCrawlerError) as e:
if attempt == max_retries - 1:
raise
print(f"Retry {attempt + 1}/{max_retries}: {e}")
await asyncio.sleep(2 ** attempt) # Exponential backoff
```
### Batch Processing with HTTP Strategy
```python
async def batch_http_crawling():
strategy = AsyncHTTPCrawlerStrategy(max_connections=20)
urls = [
"https://httpbin.org/get",
"https://httpbin.org/user-agent",
"https://httpbin.org/headers",
"https://example.com",
"https://httpbin.org/json"
]
# Sequential processing
results = []
async with strategy:
for url in urls:
try:
result = await strategy.crawl(url)
results.append((url, result.status_code, len(result.html)))
except Exception as e:
results.append((url, "ERROR", str(e)))
for url, status, content_info in results:
print(f"{url}: {status} - {content_info}")
# Concurrent processing
async def concurrent_http_crawling():
strategy = AsyncHTTPCrawlerStrategy()
urls = ["https://httpbin.org/delay/1"] * 5
async def crawl_single(url):
try:
result = await strategy.crawl(url)
return f"✓ {result.status_code}"
except Exception as e:
return f"✗ {e}"
async with strategy:
tasks = [crawl_single(url) for url in urls]
results = await asyncio.gather(*tasks, return_exceptions=True)
for i, result in enumerate(results):
print(f"URL {i+1}: {result}")
```
### Integration with Content Processing
```python
from crawl4ai import DefaultMarkdownGenerator, PruningContentFilter
async def http_with_processing():
# HTTP strategy with content processing
http_strategy = AsyncHTTPCrawlerStrategy(
browser_config=HTTPCrawlerConfig(verify_ssl=True)
)
# Configure markdown generation
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48,
threshold_type="fixed",
min_word_threshold=10
)
),
word_count_threshold=5,
excluded_tags=['script', 'style', 'nav'],
exclude_external_links=True
)
async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler:
result = await crawler.arun(
url="https://example.com",
config=crawler_config
)
print(f"Status: {result.status_code}")
print(f"Raw HTML: {len(result.html)} chars")
if result.markdown:
print(f"Markdown: {len(result.markdown.raw_markdown)} chars")
if result.markdown.fit_markdown:
print(f"Filtered: {len(result.markdown.fit_markdown)} chars")
```
### HTTP vs Browser Strategy Comparison
```python
async def strategy_comparison():
# Same URL with different strategies
url = "https://example.com"
# HTTP Strategy (fast, no JS)
http_strategy = AsyncHTTPCrawlerStrategy()
start_time = time.time()
http_result = await http_strategy.crawl(url)
http_time = time.time() - start_time
# Browser Strategy (full features)
from crawl4ai import BrowserConfig
browser_config = BrowserConfig(headless=True)
start_time = time.time()
async with AsyncWebCrawler(config=browser_config) as crawler:
browser_result = await crawler.arun(url)
browser_time = time.time() - start_time
print(f"HTTP Strategy:")
print(f" Time: {http_time:.2f}s")
print(f" Content: {len(http_result.html)} chars")
print(f" Features: Fast, lightweight, no JS")
print(f"Browser Strategy:")
print(f" Time: {browser_time:.2f}s")
print(f" Content: {len(browser_result.html)} chars")
print(f" Features: Full browser, JS, screenshots, etc.")
# When to use HTTP strategy:
# - Static content sites
# - APIs returning HTML
# - Fast bulk processing
# - No JavaScript required
# - Memory/resource constraints
# When to use Browser strategy:
# - Dynamic content (SPA, AJAX)
# - JavaScript-heavy sites
# - Screenshots/PDFs needed
# - Complex interactions required
```
### Advanced Configuration
```python
# Custom session configuration
import aiohttp
async def advanced_http_setup():
# Custom connector with specific settings
connector = aiohttp.TCPConnector(
limit=100, # Connection pool size
ttl_dns_cache=600, # DNS cache TTL
use_dns_cache=True, # Enable DNS caching
keepalive_timeout=30, # Keep-alive timeout
force_close=False # Reuse connections
)
strategy = AsyncHTTPCrawlerStrategy(
max_connections=50,
dns_cache_ttl=600,
chunk_size=64 * 1024
)
# Custom headers for all requests
http_config = HTTPCrawlerConfig(
headers={
"User-Agent": "Crawl4AI-HTTP/1.0",
"Accept": "text/html,application/xhtml+xml",
"Accept-Language": "en-US,en;q=0.9",
"Accept-Encoding": "gzip, deflate, br",
"DNT": "1"
},
verify_ssl=True,
follow_redirects=True
)
strategy.browser_config = http_config
# Use with custom timeout
config = CrawlerRunConfig(
page_timeout=45000, # 45 seconds
cache_mode=CacheMode.ENABLED
)
result = await strategy.crawl("https://example.com", config=config)
await strategy.close()
```
**📖 Learn more:** [AsyncWebCrawler API](https://docs.crawl4ai.com/api/async-webcrawler/), [Browser vs HTTP Strategy](https://docs.crawl4ai.com/core/browser-crawler-config/), [Performance Optimization](https://docs.crawl4ai.com/advanced/multi-url-crawling/)

View File

@@ -0,0 +1,231 @@
## Installation
Crawl4AI offers multiple installation options to suit different environments and use cases.
### Basic Installation
```bash
# Install core library
pip install crawl4ai
# Initial setup (installs Playwright browsers)
crawl4ai-setup
# Verify installation
crawl4ai-doctor
```
### Quick Verification
```python
import asyncio
from crawl4ai import AsyncWebCrawler
async def main():
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://example.com")
print(result.markdown[:300])
if __name__ == "__main__":
asyncio.run(main())
```
**📖 Learn more:** [Basic Usage Guide](https://docs.crawl4ai.com/core/quickstart.md)
### Advanced Features (Optional)
```bash
# PyTorch-based features (text clustering, semantic chunking)
pip install crawl4ai[torch]
crawl4ai-setup
# Transformers (Hugging Face models)
pip install crawl4ai[transformer]
crawl4ai-setup
# All features (large download)
pip install crawl4ai[all]
crawl4ai-setup
# Pre-download models (optional)
crawl4ai-download-models
```
**📖 Learn more:** [Advanced Features Documentation](https://docs.crawl4ai.com/extraction/llm-strategies.md)
### Docker Deployment
```bash
# Pull pre-built image (specify platform for consistency)
docker pull --platform linux/amd64 unclecode/crawl4ai:latest
# For ARM (M1/M2 Macs): docker pull --platform linux/arm64 unclecode/crawl4ai:latest
# Setup environment for LLM support
cat > .llm.env << EOL
OPENAI_API_KEY=sk-your-key
ANTHROPIC_API_KEY=your-anthropic-key
EOL
# Run with LLM support (specify platform)
docker run -d \
--platform linux/amd64 \
-p 11235:11235 \
--name crawl4ai \
--env-file .llm.env \
--shm-size=1g \
unclecode/crawl4ai:latest
# For ARM Macs, use: --platform linux/arm64
# Basic run (no LLM)
docker run -d \
--platform linux/amd64 \
-p 11235:11235 \
--name crawl4ai \
--shm-size=1g \
unclecode/crawl4ai:latest
```
**📖 Learn more:** [Complete Docker Guide](https://docs.crawl4ai.com/core/docker-deployment.md)
### Docker Compose
```bash
# Clone repository
git clone https://github.com/unclecode/crawl4ai.git
cd crawl4ai
# Copy environment template
cp deploy/docker/.llm.env.example .llm.env
# Edit .llm.env with your API keys
# Run pre-built image
IMAGE=unclecode/crawl4ai:latest docker compose up -d
# Build and run locally
docker compose up --build -d
# Build with all features
INSTALL_TYPE=all docker compose up --build -d
# Stop service
docker compose down
```
**📖 Learn more:** [Docker Compose Configuration](https://docs.crawl4ai.com/core/docker-deployment.md#option-2-using-docker-compose)
### Manual Docker Build
```bash
# Build multi-architecture image (specify platform)
docker buildx build --platform linux/amd64 -t crawl4ai-local:latest --load .
# For ARM: docker buildx build --platform linux/arm64 -t crawl4ai-local:latest --load .
# Build with specific features
docker buildx build \
--platform linux/amd64 \
--build-arg INSTALL_TYPE=all \
--build-arg ENABLE_GPU=false \
-t crawl4ai-local:latest --load .
# Run custom build (specify platform)
docker run -d \
--platform linux/amd64 \
-p 11235:11235 \
--name crawl4ai-custom \
--env-file .llm.env \
--shm-size=1g \
crawl4ai-local:latest
```
**📖 Learn more:** [Manual Build Guide](https://docs.crawl4ai.com/core/docker-deployment.md#option-3-manual-local-build--run)
### Google Colab
```python
# Install in Colab
!pip install crawl4ai
!crawl4ai-setup
# If setup fails, manually install Playwright browsers
!playwright install chromium
# Install with all features (may take 5-10 minutes)
!pip install crawl4ai[all]
!crawl4ai-setup
!crawl4ai-download-models
# If still having issues, force Playwright install
!playwright install chromium --force
# Quick test
import asyncio
from crawl4ai import AsyncWebCrawler
async def test_crawl():
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://example.com")
print("✅ Installation successful!")
print(f"Content length: {len(result.markdown)}")
# Run test in Colab
await test_crawl()
```
**📖 Learn more:** [Colab Examples Notebook](https://colab.research.google.com/github/unclecode/crawl4ai/blob/main/docs/examples/quickstart.ipynb)
### Docker API Usage
```python
# Using Docker SDK
import asyncio
from crawl4ai.docker_client import Crawl4aiDockerClient
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode
async def main():
async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
results = await client.crawl(
["https://example.com"],
browser_config=BrowserConfig(headless=True),
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
)
for result in results:
print(f"Success: {result.success}, Length: {len(result.markdown)}")
asyncio.run(main())
```
**📖 Learn more:** [Docker Client API](https://docs.crawl4ai.com/core/docker-deployment.md#python-sdk)
### Direct API Calls
```python
# REST API example
import requests
payload = {
"urls": ["https://example.com"],
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
"crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}}
}
response = requests.post("http://localhost:11235/crawl", json=payload)
print(response.json())
```
**📖 Learn more:** [REST API Reference](https://docs.crawl4ai.com/core/docker-deployment.md#rest-api-examples)
### Health Check
```bash
# Check Docker service
curl http://localhost:11235/health
# Access playground
open http://localhost:11235/playground
# View metrics
curl http://localhost:11235/metrics
```
**📖 Learn more:** [Monitoring & Metrics](https://docs.crawl4ai.com/core/docker-deployment.md#metrics--monitoring)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,339 @@
## Multi-URL Crawling
Crawl multiple URLs concurrently with intelligent resource management, rate limiting, and real-time monitoring.
### Basic Multi-URL Crawling
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
# Batch processing (default) - get all results at once
async def batch_crawl():
urls = [
"https://example.com/page1",
"https://example.com/page2",
"https://example.com/page3"
]
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
stream=False # Default: batch mode
)
async with AsyncWebCrawler() as crawler:
results = await crawler.arun_many(urls, config=config)
for result in results:
if result.success:
print(f"✅ {result.url}: {len(result.markdown)} chars")
else:
print(f"❌ {result.url}: {result.error_message}")
# Streaming processing - handle results as they complete
async def streaming_crawl():
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
stream=True # Enable streaming
)
async with AsyncWebCrawler() as crawler:
# Process results as they become available
async for result in await crawler.arun_many(urls, config=config):
if result.success:
print(f"🔥 Just completed: {result.url}")
await process_result_immediately(result)
else:
print(f"❌ Failed: {result.url}")
```
### Memory-Adaptive Dispatching
```python
from crawl4ai import AsyncWebCrawler, MemoryAdaptiveDispatcher, CrawlerMonitor, DisplayMode
# Automatically manages concurrency based on system memory
async def memory_adaptive_crawl():
dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=80.0, # Pause if memory exceeds 80%
check_interval=1.0, # Check memory every second
max_session_permit=15, # Max concurrent tasks
memory_wait_timeout=300.0 # Wait up to 5 minutes for memory
)
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
word_count_threshold=50
)
async with AsyncWebCrawler() as crawler:
results = await crawler.arun_many(
urls=large_url_list,
config=config,
dispatcher=dispatcher
)
# Each result includes dispatch information
for result in results:
if result.dispatch_result:
dr = result.dispatch_result
print(f"Memory used: {dr.memory_usage:.1f}MB")
print(f"Duration: {dr.end_time - dr.start_time}")
```
### Rate-Limited Crawling
```python
from crawl4ai import RateLimiter, SemaphoreDispatcher
# Control request pacing and handle server rate limits
async def rate_limited_crawl():
rate_limiter = RateLimiter(
base_delay=(1.0, 3.0), # Random delay 1-3 seconds
max_delay=60.0, # Cap backoff at 60 seconds
max_retries=3, # Retry failed requests 3 times
rate_limit_codes=[429, 503] # Handle these status codes
)
dispatcher = SemaphoreDispatcher(
max_session_permit=5, # Fixed concurrency limit
rate_limiter=rate_limiter
)
config = CrawlerRunConfig(
user_agent_mode="random", # Randomize user agents
simulate_user=True # Simulate human behavior
)
async with AsyncWebCrawler() as crawler:
async for result in await crawler.arun_many(
urls=urls,
config=config,
dispatcher=dispatcher
):
print(f"Processed: {result.url}")
```
### Real-Time Monitoring
```python
from crawl4ai import CrawlerMonitor, DisplayMode
# Monitor crawling progress in real-time
async def monitored_crawl():
monitor = CrawlerMonitor(
max_visible_rows=20, # Show 20 tasks in display
display_mode=DisplayMode.DETAILED # Show individual task details
)
dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=75.0,
max_session_permit=10,
monitor=monitor # Attach monitor to dispatcher
)
async with AsyncWebCrawler() as crawler:
results = await crawler.arun_many(
urls=urls,
dispatcher=dispatcher
)
```
### Advanced Dispatcher Configurations
```python
# Memory-adaptive with comprehensive monitoring
memory_dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=85.0, # Higher memory tolerance
check_interval=0.5, # Check memory more frequently
max_session_permit=20, # More concurrent tasks
memory_wait_timeout=600.0, # Wait longer for memory
rate_limiter=RateLimiter(
base_delay=(0.5, 1.5),
max_delay=30.0,
max_retries=5
),
monitor=CrawlerMonitor(
max_visible_rows=15,
display_mode=DisplayMode.AGGREGATED # Summary view
)
)
# Simple semaphore-based dispatcher
semaphore_dispatcher = SemaphoreDispatcher(
max_session_permit=8, # Fixed concurrency
rate_limiter=RateLimiter(
base_delay=(1.0, 2.0),
max_delay=20.0
)
)
# Usage with custom dispatcher
async with AsyncWebCrawler() as crawler:
results = await crawler.arun_many(
urls=urls,
config=config,
dispatcher=memory_dispatcher # or semaphore_dispatcher
)
```
### Handling Large-Scale Crawling
```python
async def large_scale_crawl():
# For thousands of URLs
urls = load_urls_from_file("large_url_list.txt") # 10,000+ URLs
dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=70.0, # Conservative memory usage
max_session_permit=25, # Higher concurrency
rate_limiter=RateLimiter(
base_delay=(0.1, 0.5), # Faster for large batches
max_retries=2 # Fewer retries for speed
),
monitor=CrawlerMonitor(display_mode=DisplayMode.AGGREGATED)
)
config = CrawlerRunConfig(
cache_mode=CacheMode.ENABLED, # Use caching for efficiency
stream=True, # Stream for memory efficiency
word_count_threshold=100, # Skip short content
exclude_external_links=True # Reduce processing overhead
)
successful_crawls = 0
failed_crawls = 0
async with AsyncWebCrawler() as crawler:
async for result in await crawler.arun_many(
urls=urls,
config=config,
dispatcher=dispatcher
):
if result.success:
successful_crawls += 1
await save_result_to_database(result)
else:
failed_crawls += 1
await log_failure(result.url, result.error_message)
# Progress reporting
if (successful_crawls + failed_crawls) % 100 == 0:
print(f"Progress: {successful_crawls + failed_crawls}/{len(urls)}")
print(f"Completed: {successful_crawls} successful, {failed_crawls} failed")
```
### Robots.txt Compliance
```python
async def compliant_crawl():
config = CrawlerRunConfig(
check_robots_txt=True, # Respect robots.txt
user_agent="MyBot/1.0", # Identify your bot
mean_delay=2.0, # Be polite with delays
max_range=1.0
)
dispatcher = SemaphoreDispatcher(
max_session_permit=3, # Conservative concurrency
rate_limiter=RateLimiter(
base_delay=(2.0, 5.0), # Slower, more respectful
max_retries=1
)
)
async with AsyncWebCrawler() as crawler:
async for result in await crawler.arun_many(
urls=urls,
config=config,
dispatcher=dispatcher
):
if result.success:
print(f"✅ Crawled: {result.url}")
elif "robots.txt" in result.error_message:
print(f"🚫 Blocked by robots.txt: {result.url}")
else:
print(f"❌ Error: {result.url}")
```
### Performance Analysis
```python
async def analyze_crawl_performance():
dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=80.0,
max_session_permit=12,
monitor=CrawlerMonitor(display_mode=DisplayMode.DETAILED)
)
start_time = time.time()
async with AsyncWebCrawler() as crawler:
results = await crawler.arun_many(
urls=urls,
dispatcher=dispatcher
)
end_time = time.time()
# Analyze results
successful = [r for r in results if r.success]
failed = [r for r in results if not r.success]
print(f"Total time: {end_time - start_time:.2f}s")
print(f"Success rate: {len(successful)}/{len(results)} ({len(successful)/len(results)*100:.1f}%)")
print(f"Avg time per URL: {(end_time - start_time)/len(results):.2f}s")
# Memory usage analysis
if successful and successful[0].dispatch_result:
memory_usage = [r.dispatch_result.memory_usage for r in successful if r.dispatch_result]
peak_memory = [r.dispatch_result.peak_memory for r in successful if r.dispatch_result]
print(f"Avg memory usage: {sum(memory_usage)/len(memory_usage):.1f}MB")
print(f"Peak memory usage: {max(peak_memory):.1f}MB")
```
### Error Handling and Recovery
```python
async def robust_multi_crawl():
failed_urls = []
config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
stream=True,
page_timeout=30000 # 30 second timeout
)
dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=85.0,
max_session_permit=10
)
async with AsyncWebCrawler() as crawler:
async for result in await crawler.arun_many(
urls=urls,
config=config,
dispatcher=dispatcher
):
if result.success:
await process_successful_result(result)
else:
failed_urls.append({
'url': result.url,
'error': result.error_message,
'status_code': result.status_code
})
# Retry logic for specific errors
if result.status_code in [503, 429]: # Server errors
await schedule_retry(result.url)
# Report failures
if failed_urls:
print(f"Failed to crawl {len(failed_urls)} URLs:")
for failure in failed_urls[:10]: # Show first 10
print(f" {failure['url']}: {failure['error']}")
```
**📖 Learn more:** [Advanced Multi-URL Crawling](https://docs.crawl4ai.com/advanced/multi-url-crawling/), [Crawl Dispatcher](https://docs.crawl4ai.com/advanced/crawl-dispatcher/), [arun_many() API Reference](https://docs.crawl4ai.com/api/arun_many/)

View File

@@ -0,0 +1,365 @@
## Simple Crawling
Perform basic web crawling operations with AsyncWebCrawler, including configuration options and response handling.
### Basic Setup
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
async def main():
browser_config = BrowserConfig() # Default browser settings
run_config = CrawlerRunConfig() # Default crawl settings
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://example.com",
config=run_config
)
print(result.markdown)
if __name__ == "__main__":
asyncio.run(main())
```
### Understanding CrawlResult
```python
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter
config = CrawlerRunConfig(
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(threshold=0.6),
options={"ignore_links": True}
)
)
result = await crawler.arun("https://example.com", config=config)
# Different content formats
print(result.html) # Raw HTML
print(result.cleaned_html) # Cleaned HTML
print(result.markdown.raw_markdown) # Raw markdown
print(result.markdown.fit_markdown) # Filtered markdown
# Status information
print(result.success) # True/False
print(result.status_code) # HTTP status (200, 404, etc.)
# Extracted content
print(result.media) # Images, videos, audio
print(result.links) # Internal/external links
```
### Basic Configuration Options
```python
run_config = CrawlerRunConfig(
word_count_threshold=10, # Min words per block
exclude_external_links=True, # Remove external links
remove_overlay_elements=True, # Remove popups/modals
process_iframes=True, # Process iframe content
excluded_tags=['form', 'header'] # Skip these tags
)
result = await crawler.arun("https://example.com", config=run_config)
```
### Error Handling
```python
result = await crawler.arun("https://example.com", config=run_config)
if not result.success:
print(f"Crawl failed: {result.error_message}")
print(f"Status code: {result.status_code}")
else:
print(f"Success! Content length: {len(result.markdown)}")
```
### Debugging with Verbose Logging
```python
browser_config = BrowserConfig(verbose=True)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun("https://example.com")
# Detailed logging output will be displayed
```
### Complete Example
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
async def comprehensive_crawl():
browser_config = BrowserConfig(verbose=True)
run_config = CrawlerRunConfig(
# Content filtering
word_count_threshold=10,
excluded_tags=['form', 'header', 'nav'],
exclude_external_links=True,
# Content processing
process_iframes=True,
remove_overlay_elements=True,
# Cache control
cache_mode=CacheMode.ENABLED
)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(
url="https://example.com",
config=run_config
)
if result.success:
# Display content summary
print(f"Title: {result.metadata.get('title', 'No title')}")
print(f"Content: {result.markdown[:500]}...")
# Process media
images = result.media.get("images", [])
print(f"Found {len(images)} images")
for img in images[:3]: # First 3 images
print(f" - {img.get('src', 'No src')}")
# Process links
internal_links = result.links.get("internal", [])
print(f"Found {len(internal_links)} internal links")
for link in internal_links[:3]: # First 3 links
print(f" - {link.get('href', 'No href')}")
else:
print(f"❌ Crawl failed: {result.error_message}")
print(f"Status: {result.status_code}")
if __name__ == "__main__":
asyncio.run(comprehensive_crawl())
```
### Working with Raw HTML and Local Files
```python
# Crawl raw HTML
raw_html = "<html><body><h1>Test</h1><p>Content</p></body></html>"
result = await crawler.arun(f"raw://{raw_html}")
# Crawl local file
result = await crawler.arun("file:///path/to/local/file.html")
# Both return standard CrawlResult objects
print(result.markdown)
```
## Table Extraction
Extract structured data from HTML tables with automatic detection and scoring.
### Basic Table Extraction
```python
import asyncio
import pandas as pd
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
async def extract_tables():
async with AsyncWebCrawler() as crawler:
config = CrawlerRunConfig(
table_score_threshold=7, # Higher = stricter detection
cache_mode=CacheMode.BYPASS
)
result = await crawler.arun("https://example.com/tables", config=config)
if result.success and result.tables:
# New tables field (v0.6+)
for i, table in enumerate(result.tables):
print(f"Table {i+1}:")
print(f"Headers: {table['headers']}")
print(f"Rows: {len(table['rows'])}")
print(f"Caption: {table.get('caption', 'No caption')}")
# Convert to DataFrame
df = pd.DataFrame(table['rows'], columns=table['headers'])
print(df.head())
asyncio.run(extract_tables())
```
### Advanced Table Processing
```python
from crawl4ai import LXMLWebScrapingStrategy
async def process_financial_tables():
config = CrawlerRunConfig(
table_score_threshold=8, # Strict detection for data tables
scraping_strategy=LXMLWebScrapingStrategy(),
keep_data_attributes=True,
scan_full_page=True
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://coinmarketcap.com", config=config)
if result.tables:
# Get the main data table (usually first/largest)
main_table = result.tables[0]
# Create DataFrame
df = pd.DataFrame(
main_table['rows'],
columns=main_table['headers']
)
# Clean and process data
df = clean_financial_data(df)
# Save for analysis
df.to_csv("market_data.csv", index=False)
return df
def clean_financial_data(df):
"""Clean currency symbols, percentages, and large numbers"""
for col in df.columns:
if 'price' in col.lower():
# Remove currency symbols
df[col] = df[col].str.replace(r'[^\d.]', '', regex=True)
df[col] = pd.to_numeric(df[col], errors='coerce')
elif '%' in str(df[col].iloc[0]):
# Convert percentages
df[col] = df[col].str.replace('%', '').astype(float) / 100
elif any(suffix in str(df[col].iloc[0]) for suffix in ['B', 'M', 'K']):
# Handle large numbers (Billions, Millions, etc.)
df[col] = df[col].apply(convert_large_numbers)
return df
def convert_large_numbers(value):
"""Convert 1.5B -> 1500000000"""
if pd.isna(value):
return float('nan')
value = str(value)
multiplier = 1
if 'B' in value:
multiplier = 1e9
elif 'M' in value:
multiplier = 1e6
elif 'K' in value:
multiplier = 1e3
number = float(re.sub(r'[^\d.]', '', value))
return number * multiplier
```
### Table Detection Configuration
```python
# Strict table detection (data-heavy pages)
strict_config = CrawlerRunConfig(
table_score_threshold=9, # Only high-quality tables
word_count_threshold=5, # Ignore sparse content
excluded_tags=['nav', 'footer'] # Skip navigation tables
)
# Lenient detection (mixed content pages)
lenient_config = CrawlerRunConfig(
table_score_threshold=5, # Include layout tables
process_iframes=True, # Check embedded tables
scan_full_page=True # Scroll to load dynamic tables
)
# Financial/data site optimization
financial_config = CrawlerRunConfig(
table_score_threshold=8,
scraping_strategy=LXMLWebScrapingStrategy(),
wait_for="css:table", # Wait for tables to load
scan_full_page=True,
scroll_delay=0.2
)
```
### Multi-Table Processing
```python
async def extract_all_tables():
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://example.com/data", config=config)
tables_data = {}
for i, table in enumerate(result.tables):
# Create meaningful names based on content
table_name = (
table.get('caption') or
f"table_{i+1}_{table['headers'][0]}"
).replace(' ', '_').lower()
df = pd.DataFrame(table['rows'], columns=table['headers'])
# Store with metadata
tables_data[table_name] = {
'dataframe': df,
'headers': table['headers'],
'row_count': len(table['rows']),
'caption': table.get('caption'),
'summary': table.get('summary')
}
return tables_data
# Usage
tables = await extract_all_tables()
for name, data in tables.items():
print(f"{name}: {data['row_count']} rows")
data['dataframe'].to_csv(f"{name}.csv")
```
### Backward Compatibility
```python
# Support both new and old table formats
def get_tables(result):
# New format (v0.6+)
if hasattr(result, 'tables') and result.tables:
return result.tables
# Fallback to media.tables (older versions)
return result.media.get('tables', [])
# Usage in existing code
result = await crawler.arun(url, config=config)
tables = get_tables(result)
for table in tables:
df = pd.DataFrame(table['rows'], columns=table['headers'])
# Process table data...
```
### Table Quality Scoring
```python
# Understanding table_score_threshold values:
# 10: Only perfect data tables (headers + data rows)
# 8-9: High-quality tables (recommended for financial/data sites)
# 6-7: Mixed content tables (news sites, wikis)
# 4-5: Layout tables included (broader detection)
# 1-3: All table-like structures (very permissive)
config = CrawlerRunConfig(
table_score_threshold=8, # Balanced detection
verbose=True # See scoring details in logs
)
```
**📖 Learn more:** [CrawlResult API Reference](https://docs.crawl4ai.com/api/crawl-result/), [Browser & Crawler Configuration](https://docs.crawl4ai.com/core/browser-crawler-config/), [Cache Modes](https://docs.crawl4ai.com/core/cache-modes/)

View File

@@ -0,0 +1,655 @@
## URL Seeding
Smart URL discovery for efficient large-scale crawling. Discover thousands of URLs instantly, filter by relevance, then crawl only what matters.
### Why URL Seeding vs Deep Crawling
```python
# Deep Crawling: Real-time discovery (page by page)
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
async def deep_crawl_example():
config = CrawlerRunConfig(
deep_crawl_strategy=BFSDeepCrawlStrategy(
max_depth=2,
include_external=False,
max_pages=50
)
)
async with AsyncWebCrawler() as crawler:
results = await crawler.arun("https://example.com", config=config)
print(f"Discovered {len(results)} pages dynamically")
# URL Seeding: Bulk discovery (thousands instantly)
from crawl4ai import AsyncUrlSeeder, SeedingConfig
async def url_seeding_example():
config = SeedingConfig(
source="sitemap+cc",
pattern="*/docs/*",
extract_head=True,
query="API documentation",
scoring_method="bm25",
max_urls=1000
)
async with AsyncUrlSeeder() as seeder:
urls = await seeder.urls("example.com", config)
print(f"Discovered {len(urls)} URLs instantly")
# Now crawl only the most relevant ones
```
### Basic URL Discovery
```python
import asyncio
from crawl4ai import AsyncUrlSeeder, SeedingConfig


async def basic_discovery():
    """Simple sitemap-based URL discovery; the context manager owns cleanup."""
    # Context manager handles cleanup automatically
    async with AsyncUrlSeeder() as seeder:
        # Simple discovery from sitemaps
        config = SeedingConfig(source="sitemap")
        urls = await seeder.urls("example.com", config)
        print(f"Found {len(urls)} URLs from sitemap")
        for url in urls[:5]:
            print(f" - {url['url']} (status: {url['status']})")


async def manual_cleanup():
    """Same discovery without a context manager; close() must run even on error."""
    seeder = AsyncUrlSeeder()
    try:
        config = SeedingConfig(source="cc")  # Common Crawl
        urls = await seeder.urls("example.com", config)
        print(f"Found {len(urls)} URLs from Common Crawl")
    finally:
        await seeder.close()  # guaranteed release of network resources


asyncio.run(basic_discovery())
```
### Data Sources and Patterns
```python
# Different data sources
configs = [
    SeedingConfig(source="sitemap"),     # Fastest, official URLs
    SeedingConfig(source="cc"),          # Most comprehensive
    SeedingConfig(source="sitemap+cc"),  # Maximum coverage
]

# URL pattern filtering
patterns = [
    SeedingConfig(pattern="*/blog/*"),      # Blog posts only
    SeedingConfig(pattern="*.html"),        # HTML files only
    SeedingConfig(pattern="*/product/*"),   # Product pages
    SeedingConfig(pattern="*/docs/api/*"),  # API documentation
    SeedingConfig(pattern="*"),             # Everything
]


async def pattern_filtering():
    """Combine a glob pattern with post-hoc keyword filtering on the URL text."""
    async with AsyncUrlSeeder() as seeder:
        # Find all blog posts from 2024
        config = SeedingConfig(
            source="sitemap",
            pattern="*/blog/2024/*.html",
            max_urls=100
        )
        blog_urls = await seeder.urls("example.com", config)
        # Further filter by keywords in URL
        python_posts = [
            url for url in blog_urls
            if "python" in url['url'].lower()
        ]
        print(f"Found {len(python_posts)} Python blog posts")
```
### SeedingConfig Parameters
```python
from crawl4ai import SeedingConfig

# Comprehensive configuration covering every SeedingConfig option group
config = SeedingConfig(
    # Data sources
    source="sitemap+cc",        # "sitemap", "cc", "sitemap+cc"
    pattern="*/docs/*",         # URL pattern filter
    # Metadata extraction
    extract_head=True,          # Get <head> metadata
    live_check=True,            # Verify URLs are accessible
    # Performance controls
    max_urls=1000,              # Limit results (-1 = unlimited)
    concurrency=20,             # Parallel workers
    hits_per_sec=10,            # Rate limiting
    # Relevance scoring
    query="API documentation guide",  # Search query
    scoring_method="bm25",      # Scoring algorithm
    score_threshold=0.3,        # Minimum relevance (0.0-1.0)
    # Cache and filtering
    force=False,                # Bypass cache
    filter_nonsense_urls=True,  # Remove utility URLs
    verbose=True                # Debug output
)

# Quick configurations for common use cases
blog_config = SeedingConfig(
    source="sitemap",
    pattern="*/blog/*",
    extract_head=True
)
api_docs_config = SeedingConfig(
    source="sitemap+cc",
    pattern="*/docs/*",
    query="API reference documentation",
    scoring_method="bm25",
    score_threshold=0.5
)
product_pages_config = SeedingConfig(
    source="cc",
    pattern="*/product/*",
    live_check=True,
    max_urls=500
)
```
### Metadata Extraction and Analysis
```python
async def metadata_extraction():
    """Inspect <head> metadata (title, meta tags, Open Graph, JSON-LD) of discovered URLs."""
    async with AsyncUrlSeeder() as seeder:
        config = SeedingConfig(
            source="sitemap",
            extract_head=True,  # Extract <head> metadata
            pattern="*/blog/*",
            max_urls=50
        )
        urls = await seeder.urls("example.com", config)
        # Analyze extracted metadata
        for url in urls[:5]:
            head_data = url['head_data']
            print(f"\nURL: {url['url']}")
            print(f"Title: {head_data.get('title', 'No title')}")
            # Standard meta tags
            meta = head_data.get('meta', {})
            print(f"Description: {meta.get('description', 'N/A')}")
            print(f"Keywords: {meta.get('keywords', 'N/A')}")
            print(f"Author: {meta.get('author', 'N/A')}")
            # Open Graph data
            print(f"OG Image: {meta.get('og:image', 'N/A')}")
            print(f"OG Type: {meta.get('og:type', 'N/A')}")
            # JSON-LD structured data
            jsonld = head_data.get('jsonld', [])
            if jsonld:
                print(f"Structured data: {len(jsonld)} items")
                for item in jsonld[:2]:
                    if isinstance(item, dict):
                        print(f" Type: {item.get('@type', 'Unknown')}")
                        print(f" Name: {item.get('name', 'N/A')}")


async def metadata_filtering():
    """Keep only URLs whose JSON-LD datePublished falls within the last 7 days."""
    async with AsyncUrlSeeder() as seeder:
        config = SeedingConfig(
            source="sitemap",
            extract_head=True,
            max_urls=100
        )
        urls = await seeder.urls("news.example.com", config)
        # Filter by publication date (from JSON-LD)
        from datetime import datetime, timedelta, timezone
        # Use an aware cutoff: fromisoformat(...+00:00) yields aware datetimes,
        # and comparing aware vs naive raises TypeError (previously every
        # article was silently dropped by the bare except).
        recent_cutoff = datetime.now(timezone.utc) - timedelta(days=7)
        recent_articles = []
        for url in urls:
            for jsonld in url['head_data'].get('jsonld', []):
                if isinstance(jsonld, dict) and 'datePublished' in jsonld:
                    try:
                        pub_date = datetime.fromisoformat(
                            jsonld['datePublished'].replace('Z', '+00:00')
                        )
                        if pub_date > recent_cutoff:
                            recent_articles.append(url)
                        break
                    except (ValueError, TypeError, AttributeError):
                        # Malformed, naive, or non-string datePublished: skip
                        # this item only — never a bare except, which would
                        # also swallow KeyboardInterrupt/SystemExit.
                        continue
        print(f"Found {len(recent_articles)} recent articles")
```
### BM25 Relevance Scoring
```python
async def relevance_scoring():
    """Rank discovered URLs by BM25 relevance against a free-text query."""
    async with AsyncUrlSeeder() as seeder:
        # Find pages about Python async programming
        config = SeedingConfig(
            source="sitemap",
            extract_head=True,    # Required for content-based scoring
            query="python async await concurrency",
            scoring_method="bm25",
            score_threshold=0.3,  # Only 30%+ relevant pages
            max_urls=20
        )
        urls = await seeder.urls("docs.python.org", config)
        # Results are automatically sorted by relevance
        print("Most relevant Python async content:")
        for url in urls[:5]:
            score = url['relevance_score']
            title = url['head_data'].get('title', 'No title')
            print(f"[{score:.2f}] {title}")
            print(f" {url['url']}")


async def url_based_scoring():
    """Score URLs from their structure alone (no head fetch — much faster)."""
    async with AsyncUrlSeeder() as seeder:
        config = SeedingConfig(
            source="sitemap",
            extract_head=False,  # Fast URL-only scoring
            query="machine learning tutorial",
            scoring_method="bm25",
            score_threshold=0.2
        )
        urls = await seeder.urls("example.com", config)
        # Scoring based on URL structure, domain, path segments
        for url in urls[:5]:
            print(f"[{url['relevance_score']:.2f}] {url['url']}")


async def complex_queries():
    """Run several multi-concept queries and merge results, first-hit-wins dedupe."""
    queries = [
        "data science pandas numpy visualization",
        "web scraping automation selenium",
        "machine learning tensorflow pytorch",
        "api documentation rest graphql"
    ]
    async with AsyncUrlSeeder() as seeder:
        all_results = []
        for query in queries:
            config = SeedingConfig(
                source="sitemap",
                extract_head=True,
                query=query,
                scoring_method="bm25",
                score_threshold=0.4,
                max_urls=10
            )
            urls = await seeder.urls("learning-site.com", config)
            all_results.extend(urls)
        # Remove duplicates while preserving order
        seen = set()
        unique_results = []
        for url in all_results:
            if url['url'] not in seen:
                seen.add(url['url'])
                unique_results.append(url)
        print(f"Found {len(unique_results)} unique pages across all topics")
```
### Live URL Validation
```python
async def url_validation():
    """Verify discovered URLs with parallel HEAD requests and report the hit rate."""
    async with AsyncUrlSeeder() as seeder:
        config = SeedingConfig(
            source="sitemap",
            live_check=True,  # Verify URLs are accessible
            concurrency=15,   # Parallel HEAD requests
            hits_per_sec=8,   # Rate limiting
            max_urls=100
        )
        urls = await seeder.urls("example.com", config)
        # Analyze results
        valid_urls = [u for u in urls if u['status'] == 'valid']
        invalid_urls = [u for u in urls if u['status'] == 'not_valid']
        print(f"✅ Valid URLs: {len(valid_urls)}")
        print(f"❌ Invalid URLs: {len(invalid_urls)}")
        # Guard against ZeroDivisionError when discovery returns nothing
        if urls:
            print(f"📊 Success rate: {len(valid_urls)/len(urls)*100:.1f}%")
        # Show some invalid URLs for debugging
        if invalid_urls:
            print("\nSample invalid URLs:")
            for url in invalid_urls[:3]:
                print(f" - {url['url']}")


async def comprehensive_validation():
    """Combine liveness checking, metadata extraction, and relevance scoring."""
    async with AsyncUrlSeeder() as seeder:
        config = SeedingConfig(
            source="sitemap",
            live_check=True,         # Verify accessibility
            extract_head=True,       # Get metadata
            query="tutorial guide",  # Relevance scoring
            scoring_method="bm25",
            score_threshold=0.2,
            concurrency=10,
            max_urls=50
        )
        urls = await seeder.urls("docs.example.com", config)
        # Filter for valid, relevant tutorials
        good_tutorials = [
            url for url in urls
            if url['status'] == 'valid' and
            url['relevance_score'] > 0.3 and
            'tutorial' in url['head_data'].get('title', '').lower()
        ]
        print(f"Found {len(good_tutorials)} high-quality tutorials")
```
### Multi-Domain Discovery
```python
async def multi_domain_research():
    """Discover and rank tutorials across several domains in parallel."""
    async with AsyncUrlSeeder() as seeder:
        # Research Python tutorials across multiple sites
        domains = [
            "docs.python.org",
            "realpython.com",
            "python-course.eu",
            "tutorialspoint.com"
        ]
        config = SeedingConfig(
            source="sitemap",
            extract_head=True,
            query="python beginner tutorial basics",
            scoring_method="bm25",
            score_threshold=0.3,
            max_urls=15  # Per domain
        )
        # Discover across all domains in parallel
        results = await seeder.many_urls(domains, config)
        # Collect and rank all tutorials
        all_tutorials = []
        for domain, urls in results.items():
            for url in urls:
                url['domain'] = domain  # tag each hit with its source domain
                all_tutorials.append(url)
        # Sort by relevance across all domains
        all_tutorials.sort(key=lambda x: x['relevance_score'], reverse=True)
        print(f"Top 10 Python tutorials across {len(domains)} sites:")
        for i, tutorial in enumerate(all_tutorials[:10], 1):
            score = tutorial['relevance_score']
            title = tutorial['head_data'].get('title', 'No title')[:60]
            domain = tutorial['domain']
            print(f"{i:2d}. [{score:.2f}] {title}")
            print(f" {domain}")


async def competitor_analysis():
    """Compare competitors' blog content mix by counting Open Graph types."""
    competitors = ["competitor1.com", "competitor2.com", "competitor3.com"]
    async with AsyncUrlSeeder() as seeder:
        config = SeedingConfig(
            source="sitemap",
            extract_head=True,
            pattern="*/blog/*",
            max_urls=50
        )
        results = await seeder.many_urls(competitors, config)
        # Analyze content strategies
        for domain, urls in results.items():
            content_types = {}
            for url in urls:
                # Extract content type from metadata
                meta = url['head_data'].get('meta', {})
                og_type = meta.get('og:type', 'unknown')
                content_types[og_type] = content_types.get(og_type, 0) + 1
            print(f"\n{domain} content distribution:")
            for ctype, count in sorted(content_types.items(),
                                       key=lambda x: x[1], reverse=True):
                print(f" {ctype}: {count}")
```
### Complete Pipeline: Discovery → Filter → Crawl
```python
async def smart_research_pipeline():
    """Complete pipeline: discover URLs, filter by relevance, crawl top results"""
    async with AsyncUrlSeeder() as seeder:
        # Step 1: Discover relevant URLs
        print("🔍 Discovering URLs...")
        config = SeedingConfig(
            source="sitemap+cc",
            extract_head=True,
            query="machine learning deep learning tutorial",
            scoring_method="bm25",
            score_threshold=0.4,
            max_urls=100
        )
        urls = await seeder.urls("example.com", config)
        print(f" Found {len(urls)} relevant URLs")
        # Step 2: Select top articles
        top_articles = sorted(urls,
                              key=lambda x: x['relevance_score'],
                              reverse=True)[:10]
        print(f" Selected top {len(top_articles)} for crawling")
        # Step 3: Show what we're about to crawl
        print("\n📋 Articles to crawl:")
        for i, article in enumerate(top_articles, 1):
            score = article['relevance_score']
            title = article['head_data'].get('title', 'No title')[:60]
            print(f" {i}. [{score:.2f}] {title}")
        # Step 4: Crawl selected articles
        from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
        print(f"\n🕷 Crawling {len(top_articles)} articles...")
        async with AsyncWebCrawler() as crawler:
            # Renamed from `config` to avoid shadowing the seeding config above
            crawl_config = CrawlerRunConfig(
                only_text=True,
                word_count_threshold=200,
                stream=True  # Process results as they come
            )
            # Extract URLs and crawl
            article_urls = [article['url'] for article in top_articles]
            crawled_count = 0
            async for result in await crawler.arun_many(article_urls, config=crawl_config):
                if result.success:
                    crawled_count += 1
                    word_count = len(result.markdown.raw_markdown.split())
                    print(f" ✅ [{crawled_count}/{len(article_urls)}] "
                          f"{word_count} words from {result.url[:50]}...")
                else:
                    print(f" ❌ Failed: {result.url[:50]}...")
        print(f"\n✨ Successfully crawled {crawled_count} articles!")


asyncio.run(smart_research_pipeline())
```
### Advanced Features and Performance
```python
# Cache management
async def cache_management():
    """First run populates the cache (force=True); later runs reuse it (force=False)."""
    async with AsyncUrlSeeder() as seeder:
        # First run - populate cache
        config = SeedingConfig(
            source="sitemap",
            extract_head=True,
            force=True  # Bypass cache, fetch fresh
        )
        urls = await seeder.urls("example.com", config)
        # Subsequent runs - use cache (much faster)
        config = SeedingConfig(
            source="sitemap",
            extract_head=True,
            force=False  # Use cache
        )
        urls = await seeder.urls("example.com", config)


# Performance optimization
async def performance_tuning():
    """High-throughput discovery with simple wall-clock instrumentation."""
    async with AsyncUrlSeeder() as seeder:
        # High-performance configuration
        config = SeedingConfig(
            source="cc",
            concurrency=50,      # Many parallel workers
            hits_per_sec=20,     # High rate limit
            max_urls=10000,      # Large dataset
            extract_head=False,  # Skip metadata for speed
            filter_nonsense_urls=True  # Auto-filter utility URLs
        )
        import time
        start = time.time()
        urls = await seeder.urls("large-site.com", config)
        elapsed = time.time() - start
        print(f"Processed {len(urls)} URLs in {elapsed:.2f}s")
        print(f"Speed: {len(urls)/elapsed:.0f} URLs/second")


# Memory-safe processing for large domains
async def large_domain_processing():
    """Discovery settings tuned for domains with millions of URLs."""
    async with AsyncUrlSeeder() as seeder:
        # Safe for domains with 1M+ URLs
        config = SeedingConfig(
            source="cc+sitemap",
            concurrency=50,   # Bounded queue adapts to this
            max_urls=100000,  # Process in batches
            filter_nonsense_urls=True
        )
        # The seeder automatically manages memory by:
        # - Using bounded queues (prevents RAM spikes)
        # - Applying backpressure when queue is full
        # - Processing URLs as they're discovered
        urls = await seeder.urls("huge-site.com", config)


# Configuration cloning and reuse
config_base = SeedingConfig(
    source="sitemap",
    extract_head=True,
    concurrency=20
)
# Create variations
blog_config = config_base.clone(pattern="*/blog/*")
docs_config = config_base.clone(
    pattern="*/docs/*",
    query="API documentation",
    scoring_method="bm25"
)
fast_config = config_base.clone(
    extract_head=False,
    concurrency=100,
    hits_per_sec=50
)
```
### Troubleshooting and Best Practices
```python
# Common issues and solutions
async def troubleshooting_guide():
    """Walk through the most common URL-seeding problems and their fixes."""
    async with AsyncUrlSeeder() as seeder:
        # Issue: No URLs found
        try:
            config = SeedingConfig(source="sitemap", pattern="*/nonexistent/*")
            urls = await seeder.urls("example.com", config)
            if not urls:
                # Solution: Try broader pattern or different source
                config = SeedingConfig(source="cc+sitemap", pattern="*")
                urls = await seeder.urls("example.com", config)
        except Exception as e:
            print(f"Discovery failed: {e}")
        # Issue: Slow performance
        config = SeedingConfig(
            source="sitemap",   # Faster than CC
            concurrency=10,     # Reduce if hitting rate limits
            hits_per_sec=5,     # Add rate limiting
            extract_head=False  # Skip if metadata not needed
        )
        # Issue: Low relevance scores
        config = SeedingConfig(
            query="specific detailed query terms",
            score_threshold=0.1,  # Lower threshold
            scoring_method="bm25"
        )
        # Issue: Memory issues with large sites
        config = SeedingConfig(
            max_urls=10000,   # Limit results
            concurrency=20,   # Reduce concurrency
            source="sitemap"  # Use sitemap only
        )


# Performance benchmarks
print("""
Typical performance on standard connection:
- Sitemap discovery: 100-1,000 URLs/second
- Common Crawl discovery: 50-500 URLs/second
- HEAD checking: 10-50 URLs/second
- Head extraction: 5-20 URLs/second
- BM25 scoring: 10,000+ URLs/second
""")

# Best practices
best_practices = """
✅ Use context manager: async with AsyncUrlSeeder() as seeder
✅ Start with sitemaps (faster), add CC if needed
✅ Use extract_head=True only when you need metadata
✅ Set reasonable max_urls to limit processing
✅ Add rate limiting for respectful crawling
✅ Cache results with force=False for repeated operations
✅ Filter nonsense URLs (enabled by default)
✅ Use specific patterns to reduce irrelevant results
"""
```
**📖 Learn more:** [Complete URL Seeding Guide](https://docs.crawl4ai.com/core/url-seeding/), [SeedingConfig Reference](https://docs.crawl4ai.com/api/parameters/), [Multi-URL Crawling](https://docs.crawl4ai.com/advanced/multi-url-crawling/)

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,556 +0,0 @@
# Detailed Outline for crawl4ai - config_objects Component
**Target Document Type:** memory
**Target Output Filename Suggestion:** `llm_memory_config_objects.md`
**Library Version Context:** 0.6.3
**Outline Generation Date:** 2024-05-24
---
## 1. Introduction to Configuration Objects in Crawl4ai
* **1.1. Purpose of Configuration Objects**
* Explanation: Configuration objects in `crawl4ai` serve to centralize and manage settings for various components and behaviors of the library. This includes browser setup, individual crawl run parameters, LLM provider interactions, proxy settings, and more.
* Benefit: This approach enhances code readability by grouping related settings, improves maintainability by providing a clear structure for configurations, and offers ease of customization for users to tailor the library's behavior to their specific needs.
* **1.2. General Principles and Usage**
* **1.2.1. Immutability/Cloning:**
* Concept: Most configuration objects are designed with a `clone()` method, allowing users to create modified copies without altering the original configuration instance. This promotes safer state management, especially when reusing base configurations for multiple tasks.
* Method: `clone(**kwargs)` on most configuration objects.
* **1.2.2. Serialization and Deserialization:**
* Concept: `crawl4ai` configuration objects can be serialized to dictionary format (e.g., for saving to JSON) and deserialized back into their respective class instances.
* Methods:
* `dump() -> dict`: Serializes the object to a dictionary suitable for JSON, often using the internal `to_serializable_dict` helper.
* `load(data: dict) -> ConfigClass` (Static Method): Deserializes an object from a dictionary, often using the internal `from_serializable_dict` helper.
* `to_dict() -> dict`: Converts the object to a standard Python dictionary.
* `from_dict(data: dict) -> ConfigClass` (Static Method): Creates an instance from a standard Python dictionary.
* Helper Functions:
* `crawl4ai.async_configs.to_serializable_dict(obj: Any, ignore_default_value: bool = False) -> Dict`: Recursively converts objects into a serializable dictionary format, handling complex types like enums and nested objects.
* `crawl4ai.async_configs.from_serializable_dict(data: Any) -> Any`: Reconstructs Python objects from the serializable dictionary format.
* **1.3. Scope of this Document**
* Statement: This document provides a factual API reference for the primary configuration objects within the `crawl4ai` library, detailing their purpose, initialization parameters, attributes, and key methods.
## 2. Core Configuration Objects
### 2.1. `BrowserConfig`
Located in `crawl4ai.async_configs`.
* **2.1.1. Purpose:**
* Description: The `BrowserConfig` class is used to configure the settings for a browser instance and its associated contexts when using browser-based crawler strategies like `AsyncPlaywrightCrawlerStrategy`. It centralizes all parameters that affect the creation and behavior of the browser.
* **2.1.2. Initialization (`__init__`)**
* Signature:
```python
# Signature reference only; the full implementation lives in crawl4ai.async_configs.
class BrowserConfig:
    def __init__(
        self,
        browser_type: str = "chromium",
        headless: bool = True,
        browser_mode: str = "dedicated",
        use_managed_browser: bool = False,
        cdp_url: Optional[str] = None,
        use_persistent_context: bool = False,
        user_data_dir: Optional[str] = None,
        chrome_channel: Optional[str] = "chromium",  # Note: 'channel' is preferred
        channel: Optional[str] = "chromium",
        proxy: Optional[str] = None,
        proxy_config: Optional[Union[ProxyConfig, dict]] = None,
        viewport_width: int = 1080,
        viewport_height: int = 600,
        viewport: Optional[dict] = None,
        accept_downloads: bool = False,
        downloads_path: Optional[str] = None,
        storage_state: Optional[Union[str, dict]] = None,
        ignore_https_errors: bool = True,
        java_script_enabled: bool = True,
        sleep_on_close: bool = False,
        verbose: bool = True,
        cookies: Optional[List[dict]] = None,
        headers: Optional[dict] = None,
        user_agent: Optional[str] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36",
        user_agent_mode: Optional[str] = "",
        user_agent_generator_config: Optional[dict] = None,  # Default is {} in __init__
        text_mode: bool = False,
        light_mode: bool = False,
        extra_args: Optional[List[str]] = None,
        debugging_port: int = 9222,
        host: str = "localhost"
    ): ...
```
* Parameters:
* `browser_type (str, default: "chromium")`: Specifies the browser engine to use. Supported values: `"chromium"`, `"firefox"`, `"webkit"`.
* `headless (bool, default: True)`: If `True`, runs the browser without a visible GUI. Set to `False` for debugging or visual interaction.
* `browser_mode (str, default: "dedicated")`: Defines how the browser is initialized. Options: `"builtin"` (uses built-in CDP), `"dedicated"` (new instance each time), `"cdp"` (connects to an existing CDP endpoint specified by `cdp_url`), `"docker"` (runs browser in a Docker container).
* `use_managed_browser (bool, default: False)`: If `True`, launches the browser using a managed approach (e.g., via CDP or Docker), allowing for more advanced control. Automatically set to `True` if `browser_mode` is `"builtin"`, `"docker"`, or if `cdp_url` is provided, or if `use_persistent_context` is `True`.
* `cdp_url (Optional[str], default: None)`: The URL for the Chrome DevTools Protocol (CDP) endpoint. If not provided and `use_managed_browser` is active, it might be set by an internal browser manager.
* `use_persistent_context (bool, default: False)`: If `True`, uses a persistent browser context (profile), saving cookies, localStorage, etc., across sessions. Requires `user_data_dir`. Sets `use_managed_browser=True`.
* `user_data_dir (Optional[str], default: None)`: Path to a directory for storing user data for persistent sessions. If `None` and `use_persistent_context` is `True`, a temporary directory might be used.
* `chrome_channel (Optional[str], default: "chromium")`: Specifies the Chrome channel (e.g., "chrome", "msedge", "chromium-beta"). Only applicable if `browser_type` is "chromium".
* `channel (Optional[str], default: "chromium")`: Preferred alias for `chrome_channel`. Set to `""` for Firefox or WebKit.
* `proxy (Optional[str], default: None)`: A string representing the proxy server URL (e.g., "http://username:password@proxy.example.com:8080").
* `proxy_config (Optional[Union[ProxyConfig, dict]], default: None)`: A `ProxyConfig` object or a dictionary specifying detailed proxy settings. Overrides the `proxy` string if both are provided.
* `viewport_width (int, default: 1080)`: Default width of the browser viewport in pixels.
* `viewport_height (int, default: 600)`: Default height of the browser viewport in pixels.
* `viewport (Optional[dict], default: None)`: A dictionary specifying viewport dimensions, e.g., `{"width": 1920, "height": 1080}`. If set, overrides `viewport_width` and `viewport_height`.
* `accept_downloads (bool, default: False)`: If `True`, allows files to be downloaded by the browser.
* `downloads_path (Optional[str], default: None)`: Directory path where downloaded files will be stored. Required if `accept_downloads` is `True`.
* `storage_state (Optional[Union[str, dict]], default: None)`: Path to a JSON file or a dictionary containing the browser's storage state (cookies, localStorage, etc.) to load.
* `ignore_https_errors (bool, default: True)`: If `True`, HTTPS certificate errors will be ignored.
* `java_script_enabled (bool, default: True)`: If `True`, JavaScript execution is enabled on web pages.
* `sleep_on_close (bool, default: False)`: If `True`, introduces a small delay before the browser is closed.
* `verbose (bool, default: True)`: If `True`, enables verbose logging for browser operations.
* `cookies (Optional[List[dict]], default: None)`: A list of cookie dictionaries to be set in the browser context. Each dictionary should conform to Playwright's cookie format.
* `headers (Optional[dict], default: None)`: A dictionary of additional HTTP headers to be sent with every request made by the browser.
* `user_agent (Optional[str], default: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36")`: The User-Agent string the browser will use.
* `user_agent_mode (Optional[str], default: "")`: Mode for generating the User-Agent string. If set (e.g., to "random"), `user_agent_generator_config` can be used.
* `user_agent_generator_config (Optional[dict], default: {})`: Configuration dictionary for the User-Agent generator if `user_agent_mode` is active.
* `text_mode (bool, default: False)`: If `True`, attempts to disable images and other rich content to potentially speed up loading for text-focused crawls.
* `light_mode (bool, default: False)`: If `True`, disables certain background browser features for potential performance gains.
* `extra_args (Optional[List[str]], default: None)`: A list of additional command-line arguments to pass to the browser executable upon launch.
* `debugging_port (int, default: 9222)`: The port to use for the browser's remote debugging protocol (CDP).
* `host (str, default: "localhost")`: The host on which the browser's remote debugging protocol will listen.
* **2.1.3. Key Public Attributes/Properties:**
* All parameters listed in `__init__` are available as public attributes with the same names and types.
* `browser_hint (str)`: [Read-only] - A string representing client hints (Sec-CH-UA) generated based on the `user_agent` string. This is automatically set during initialization.
* **2.1.4. Key Public Methods:**
* `from_kwargs(cls, kwargs: dict) -> BrowserConfig` (Static Method):
* Purpose: Creates a `BrowserConfig` instance from a dictionary of keyword arguments.
* `to_dict(self) -> dict`:
* Purpose: Converts the `BrowserConfig` instance into a dictionary representation.
* `clone(self, **kwargs) -> BrowserConfig`:
* Purpose: Creates a deep copy of the current `BrowserConfig` instance. Keyword arguments can be provided to override specific attributes in the new instance.
* `dump(self) -> dict`:
* Purpose: Serializes the `BrowserConfig` object into a dictionary format that is suitable for JSON storage or transmission, utilizing the `to_serializable_dict` helper.
* `load(cls, data: dict) -> BrowserConfig` (Static Method):
* Purpose: Deserializes a `BrowserConfig` object from a dictionary (typically one created by `dump()`), utilizing the `from_serializable_dict` helper.
### 2.2. `CrawlerRunConfig`
Located in `crawl4ai.async_configs`.
* **2.2.1. Purpose:**
* Description: The `CrawlerRunConfig` class encapsulates all settings that control the behavior of a single crawl operation performed by `AsyncWebCrawler.arun()` or multiple operations within `AsyncWebCrawler.arun_many()`. This includes parameters for content processing, page interaction, caching, and media handling.
* **2.2.2. Initialization (`__init__`)**
* Signature:
```python
# Signature reference only; the full implementation lives in crawl4ai.async_configs.
class CrawlerRunConfig:
    def __init__(
        self,
        url: Optional[str] = None,
        word_count_threshold: int = MIN_WORD_THRESHOLD,
        extraction_strategy: Optional[ExtractionStrategy] = None,
        chunking_strategy: Optional[ChunkingStrategy] = RegexChunking(),
        markdown_generator: Optional[MarkdownGenerationStrategy] = DefaultMarkdownGenerator(),
        only_text: bool = False,
        css_selector: Optional[str] = None,
        target_elements: Optional[List[str]] = None,  # Default is [] in __init__
        excluded_tags: Optional[List[str]] = None,  # Default is [] in __init__
        excluded_selector: Optional[str] = "",  # Default is "" in __init__
        keep_data_attributes: bool = False,
        keep_attrs: Optional[List[str]] = None,  # Default is [] in __init__
        remove_forms: bool = False,
        prettify: bool = False,
        parser_type: str = "lxml",
        scraping_strategy: Optional[ContentScrapingStrategy] = None,  # Instantiated with WebScrapingStrategy() if None
        proxy_config: Optional[Union[ProxyConfig, dict]] = None,
        proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
        locale: Optional[str] = None,
        timezone_id: Optional[str] = None,
        geolocation: Optional[GeolocationConfig] = None,
        fetch_ssl_certificate: bool = False,
        cache_mode: CacheMode = CacheMode.BYPASS,
        session_id: Optional[str] = None,
        shared_data: Optional[dict] = None,
        wait_until: str = "domcontentloaded",
        page_timeout: int = PAGE_TIMEOUT,
        wait_for: Optional[str] = None,
        wait_for_timeout: Optional[int] = None,
        wait_for_images: bool = False,
        delay_before_return_html: float = 0.1,
        mean_delay: float = 0.1,
        max_range: float = 0.3,
        semaphore_count: int = 5,
        js_code: Optional[Union[str, List[str]]] = None,
        js_only: bool = False,
        ignore_body_visibility: bool = True,
        scan_full_page: bool = False,
        scroll_delay: float = 0.2,
        process_iframes: bool = False,
        remove_overlay_elements: bool = False,
        simulate_user: bool = False,
        override_navigator: bool = False,
        magic: bool = False,
        adjust_viewport_to_content: bool = False,
        screenshot: bool = False,
        screenshot_wait_for: Optional[float] = None,
        screenshot_height_threshold: int = SCREENSHOT_HEIGHT_THRESHOLD,
        pdf: bool = False,
        capture_mhtml: bool = False,
        image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
        image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
        table_score_threshold: int = 7,
        exclude_external_images: bool = False,
        exclude_all_images: bool = False,
        exclude_social_media_domains: Optional[List[str]] = None,  # Uses SOCIAL_MEDIA_DOMAINS if None
        exclude_external_links: bool = False,
        exclude_social_media_links: bool = False,
        exclude_domains: Optional[List[str]] = None,  # Default is [] in __init__
        exclude_internal_links: bool = False,
        verbose: bool = True,
        log_console: bool = False,
        capture_network_requests: bool = False,
        capture_console_messages: bool = False,
        method: str = "GET",
        stream: bool = False,
        check_robots_txt: bool = False,
        user_agent: Optional[str] = None,
        user_agent_mode: Optional[str] = None,
        user_agent_generator_config: Optional[dict] = None,  # Default is {} in __init__
        deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
        experimental: Optional[Dict[str, Any]] = None  # Default is {} in __init__
    ): ...
```
* Parameters:
* `url (Optional[str], default: None)`: The target URL for this specific crawl run.
* `word_count_threshold (int, default: MIN_WORD_THRESHOLD)`: Minimum word count for a text block to be considered significant during content processing.
* `extraction_strategy (Optional[ExtractionStrategy], default: None)`: Strategy for extracting structured data from the page. If `None`, `NoExtractionStrategy` is used.
* `chunking_strategy (Optional[ChunkingStrategy], default: RegexChunking())`: Strategy to split content into chunks before extraction.
* `markdown_generator (Optional[MarkdownGenerationStrategy], default: DefaultMarkdownGenerator())`: Strategy for converting HTML to Markdown.
* `only_text (bool, default: False)`: If `True`, attempts to extract only textual content, potentially ignoring structural elements beneficial for rich Markdown.
* `css_selector (Optional[str], default: None)`: A CSS selector defining the primary region of the page to focus on for content extraction. The raw HTML is reduced to this region.
* `target_elements (Optional[List[str]], default: [])`: A list of CSS selectors. If provided, only the content within these elements will be considered for Markdown generation and structured data extraction. Unlike `css_selector`, this does not reduce the raw HTML but scopes the processing.
* `excluded_tags (Optional[List[str]], default: [])`: A list of HTML tag names (e.g., "nav", "footer") to be removed from the HTML before processing.
* `excluded_selector (Optional[str], default: "")`: A CSS selector specifying elements to be removed from the HTML before processing.
* `keep_data_attributes (bool, default: False)`: If `True`, `data-*` attributes on HTML elements are preserved during cleaning.
* `keep_attrs (Optional[List[str]], default: [])`: A list of specific HTML attribute names to preserve during HTML cleaning.
* `remove_forms (bool, default: False)`: If `True`, all `<form>` elements are removed from the HTML.
* `prettify (bool, default: False)`: If `True`, the cleaned HTML output is "prettified" for better readability.
* `parser_type (str, default: "lxml")`: The HTML parser to be used by the scraping strategy (e.g., "lxml", "html.parser").
* `scraping_strategy (Optional[ContentScrapingStrategy], default: WebScrapingStrategy())`: The strategy for scraping content from the HTML.
* `proxy_config (Optional[Union[ProxyConfig, dict]], default: None)`: Proxy configuration for this specific run. Overrides any proxy settings in `BrowserConfig`.
* `proxy_rotation_strategy (Optional[ProxyRotationStrategy], default: None)`: Strategy to use for rotating proxies if multiple are available.
* `locale (Optional[str], default: None)`: Locale to set for the browser context (e.g., "en-US", "fr-FR"). Affects `Accept-Language` header and JavaScript `navigator.language`.
* `timezone_id (Optional[str], default: None)`: Timezone ID to set for the browser context (e.g., "America/New_York", "Europe/Paris"). Affects JavaScript `Date` objects.
* `geolocation (Optional[GeolocationConfig], default: None)`: A `GeolocationConfig` object or dictionary to set the browser's mock geolocation.
* `fetch_ssl_certificate (bool, default: False)`: If `True`, the SSL certificate information for the main URL will be fetched and included in the `CrawlResult`.
* `cache_mode (CacheMode, default: CacheMode.BYPASS)`: Defines caching behavior for this run. See `CacheMode` enum for options.
* `session_id (Optional[str], default: None)`: An identifier for a browser session. If provided, `crawl4ai` will attempt to reuse an existing page/context associated with this ID, or create a new one and associate it.
* `shared_data (Optional[dict], default: None)`: A dictionary for passing custom data between hooks during the crawl lifecycle.
* `wait_until (str, default: "domcontentloaded")`: Playwright's page navigation wait condition (e.g., "load", "domcontentloaded", "networkidle", "commit").
* `page_timeout (int, default: PAGE_TIMEOUT)`: Maximum time in milliseconds for page navigation and other page operations.
* `wait_for (Optional[str], default: None)`: A CSS selector or a JavaScript expression (prefixed with "js:"). The crawler will wait until this condition is met before proceeding.
* `wait_for_timeout (Optional[int], default: None)`: Specific timeout in milliseconds for the `wait_for` condition. If `None`, `page_timeout` is used.
* `wait_for_images (bool, default: False)`: If `True`, attempts to wait for all images on the page to finish loading.
* `delay_before_return_html (float, default: 0.1)`: Delay in seconds to wait just before the final HTML content is retrieved from the page.
* `mean_delay (float, default: 0.1)`: Used with `arun_many`. The mean base delay in seconds between processing URLs.
* `max_range (float, default: 0.3)`: Used with `arun_many`. The maximum additional random delay (added to `mean_delay`) between processing URLs.
* `semaphore_count (int, default: 5)`: Used with `arun_many` and semaphore-based dispatchers. The maximum number of concurrent crawl operations.
* `js_code (Optional[Union[str, List[str]]], default: None)`: A string or list of strings containing JavaScript code to be executed on the page after it loads.
* `js_only (bool, default: False)`: If `True`, indicates that this `arun` call is primarily for JavaScript execution on an already loaded page (within a session) and a full page navigation might not be needed.
* `ignore_body_visibility (bool, default: True)`: If `True`, proceeds with content extraction even if the `<body>` element is not deemed visible by Playwright.
* `scan_full_page (bool, default: False)`: If `True`, the crawler will attempt to scroll through the entire page to trigger lazy-loaded content.
* `scroll_delay (float, default: 0.2)`: Delay in seconds between each scroll step when `scan_full_page` is `True`.
* `process_iframes (bool, default: False)`: If `True`, attempts to extract and inline content from `<iframe>` elements.
* `remove_overlay_elements (bool, default: False)`: If `True`, attempts to identify and remove common overlay elements (popups, cookie banners) before content extraction.
* `simulate_user (bool, default: False)`: If `True`, enables heuristics to simulate user interactions (like mouse movements) to potentially bypass some anti-bot measures.
* `override_navigator (bool, default: False)`: If `True`, overrides certain JavaScript `navigator` properties to appear more like a standard browser.
* `magic (bool, default: False)`: If `True`, enables a combination of techniques (like `remove_overlay_elements`, `simulate_user`) to try to handle dynamic/obfuscated sites.
* `adjust_viewport_to_content (bool, default: False)`: If `True`, attempts to adjust the browser viewport size to match the dimensions of the page content.
* `screenshot (bool, default: False)`: If `True`, a screenshot of the page will be taken and included in `CrawlResult.screenshot`.
* `screenshot_wait_for (Optional[float], default: None)`: Additional delay in seconds to wait before taking the screenshot.
* `screenshot_height_threshold (int, default: SCREENSHOT_HEIGHT_THRESHOLD)`: If the page height exceeds this threshold, an alternative full-page screenshot strategy may be used instead of a single capture.
* `pdf (bool, default: False)`: If `True`, a PDF version of the page will be generated and included in `CrawlResult.pdf`.
* `capture_mhtml (bool, default: False)`: If `True`, an MHTML archive of the page will be captured and included in `CrawlResult.mhtml`.
* `image_description_min_word_threshold (int, default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)`: Minimum word count for surrounding text to be considered as an image description.
* `image_score_threshold (int, default: IMAGE_SCORE_THRESHOLD)`: Heuristic score threshold for an image to be included in `CrawlResult.media`.
* `table_score_threshold (int, default: 7)`: Heuristic score threshold for an HTML table to be considered a data table and included in `CrawlResult.media`.
* `exclude_external_images (bool, default: False)`: If `True`, images hosted on different domains than the main page URL are excluded.
* `exclude_all_images (bool, default: False)`: If `True`, all images are excluded from `CrawlResult.media`.
* `exclude_social_media_domains (Optional[List[str]], default: SOCIAL_MEDIA_DOMAINS from config)`: List of social media domains whose links should be excluded.
* `exclude_external_links (bool, default: False)`: If `True`, all links pointing to external domains are excluded from `CrawlResult.links`.
* `exclude_social_media_links (bool, default: False)`: If `True`, links to domains in `exclude_social_media_domains` are excluded.
* `exclude_domains (Optional[List[str]], default: [])`: A list of specific domains whose links should be excluded.
* `exclude_internal_links (bool, default: False)`: If `True`, all links pointing to the same domain are excluded.
* `verbose (bool, default: True)`: Enables verbose logging for this specific crawl run. Overrides `BrowserConfig.verbose`.
* `log_console (bool, default: False)`: If `True`, browser console messages are captured (requires `capture_console_messages=True` to be effective).
* `capture_network_requests (bool, default: False)`: If `True`, captures details of network requests and responses made by the page.
* `capture_console_messages (bool, default: False)`: If `True`, captures messages logged to the browser's console.
* `method (str, default: "GET")`: HTTP method to use, primarily for `AsyncHTTPCrawlerStrategy`.
* `stream (bool, default: False)`: If `True` when using `arun_many`, results are yielded as an async generator instead of returned as a list at the end.
* `check_robots_txt (bool, default: False)`: If `True`, `robots.txt` rules for the domain will be checked and respected.
* `user_agent (Optional[str], default: None)`: User-Agent string for this specific run. Overrides `BrowserConfig.user_agent`.
* `user_agent_mode (Optional[str], default: None)`: User-Agent generation mode for this specific run.
* `user_agent_generator_config (Optional[dict], default: {})`: Configuration for User-Agent generator for this run.
* `deep_crawl_strategy (Optional[DeepCrawlStrategy], default: None)`: Strategy to use for deep crawling beyond the initial URL.
* `experimental (Optional[Dict[str, Any]], default: {})`: A dictionary for passing experimental or beta parameters.
* **2.2.3. Key Public Attributes/Properties:**
* All parameters listed in `__init__` are available as public attributes with the same names and types.
* **2.2.4. Deprecated Property Handling (`__getattr__`, `_UNWANTED_PROPS`)**
* Behavior: Attempting to access a deprecated property (e.g., `bypass_cache`, `disable_cache`, `no_cache_read`, `no_cache_write`) raises an `AttributeError`. The error message directs the user to use the `cache_mode` parameter with the appropriate `CacheMode` enum member instead.
* List of Deprecated Properties and their `CacheMode` Equivalents:
* `bypass_cache`: Use `cache_mode=CacheMode.BYPASS`.
* `disable_cache`: Use `cache_mode=CacheMode.DISABLE`.
* `no_cache_read`: Use `cache_mode=CacheMode.WRITE_ONLY`.
* `no_cache_write`: Use `cache_mode=CacheMode.READ_ONLY`.
* **2.2.5. Key Public Methods:**
* `from_kwargs(cls, kwargs: dict) -> CrawlerRunConfig` (Class Method):
* Purpose: Creates a `CrawlerRunConfig` instance from a dictionary of keyword arguments.
* `dump(self) -> dict`:
* Purpose: Serializes the `CrawlerRunConfig` object to a dictionary suitable for JSON storage, handling complex nested objects using `to_serializable_dict`.
* `load(cls, data: dict) -> CrawlerRunConfig` (Class Method):
* Purpose: Deserializes a `CrawlerRunConfig` object from a dictionary (typically one created by `dump()`), using `from_serializable_dict`.
* `to_dict(self) -> dict`:
* Purpose: Converts the `CrawlerRunConfig` instance into a dictionary representation. Complex objects like strategies are typically represented by their class name or a simplified form.
* `clone(self, **kwargs) -> CrawlerRunConfig`:
* Purpose: Creates a deep copy of the current `CrawlerRunConfig` instance. Keyword arguments can be provided to override specific attributes in the new instance.
### 2.3. `LLMConfig`
Located in `crawl4ai.async_configs`.
* **2.3.1. Purpose:**
* Description: The `LLMConfig` class provides configuration for interacting with Large Language Model (LLM) providers. It includes settings for the provider name, API token, base URL, and various model-specific parameters like temperature and max tokens.
* **2.3.2. Initialization (`__init__`)**
* Signature:
```python
class LLMConfig:
def __init__(
self,
provider: str = DEFAULT_PROVIDER, # e.g., "openai/gpt-4o-mini"
api_token: Optional[str] = None,
base_url: Optional[str] = None,
temperature: Optional[float] = None,
max_tokens: Optional[int] = None,
top_p: Optional[float] = None,
frequency_penalty: Optional[float] = None,
presence_penalty: Optional[float] = None,
stop: Optional[List[str]] = None,
n: Optional[int] = None,
): ...
```
* Parameters:
* `provider (str, default: DEFAULT_PROVIDER)`: The identifier for the LLM provider and model (e.g., "openai/gpt-4o-mini", "ollama/llama3.3", "gemini/gemini-1.5-pro").
* `api_token (Optional[str], default: None)`: The API token for authenticating with the LLM provider. If `None`, it attempts to load from environment variables based on the provider (e.g., `OPENAI_API_KEY` for OpenAI, `GEMINI_API_KEY` for Gemini). Can also be set as "env:YOUR_ENV_VAR_NAME".
* `base_url (Optional[str], default: None)`: A custom base URL for the LLM API endpoint, useful for self-hosted models or proxies.
* `temperature (Optional[float], default: None)`: Controls the randomness of the LLM's output. Higher values (e.g., 0.8) make output more random, lower values (e.g., 0.2) make it more deterministic.
* `max_tokens (Optional[int], default: None)`: The maximum number of tokens the LLM should generate in its response.
* `top_p (Optional[float], default: None)`: Nucleus sampling parameter. The model considers only tokens with cumulative probability mass up to `top_p`.
* `frequency_penalty (Optional[float], default: None)`: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
* `presence_penalty (Optional[float], default: None)`: Penalizes new tokens based on whether they have appeared in the text so far, increasing the model's likelihood to talk about new topics.
* `stop (Optional[List[str]], default: None)`: A list of sequences where the API will stop generating further tokens.
* `n (Optional[int], default: None)`: The number of completions to generate for each prompt.
* **2.3.3. Key Public Attributes/Properties:**
* All parameters listed in `__init__` are available as public attributes with the same names and types.
* **2.3.4. Key Public Methods:**
* `from_kwargs(cls, kwargs: dict) -> LLMConfig` (Class Method):
* Purpose: Creates an `LLMConfig` instance from a dictionary of keyword arguments.
* `to_dict(self) -> dict`:
* Purpose: Converts the `LLMConfig` instance into a dictionary representation.
* `clone(self, **kwargs) -> LLMConfig`:
* Purpose: Creates a deep copy of the current `LLMConfig` instance. Keyword arguments can be provided to override specific attributes in the new instance.
### 2.4. `GeolocationConfig`
Located in `crawl4ai.async_configs`.
* **2.4.1. Purpose:**
* Description: The `GeolocationConfig` class stores settings for mocking the browser's geolocation, including latitude, longitude, and accuracy.
* **2.4.2. Initialization (`__init__`)**
* Signature:
```python
class GeolocationConfig:
def __init__(
self,
latitude: float,
longitude: float,
accuracy: Optional[float] = 0.0
): ...
```
* Parameters:
* `latitude (float)`: The latitude coordinate (e.g., 37.7749 for San Francisco).
* `longitude (float)`: The longitude coordinate (e.g., -122.4194 for San Francisco).
* `accuracy (Optional[float], default: 0.0)`: The accuracy of the geolocation in meters.
* **2.4.3. Key Public Attributes/Properties:**
* `latitude (float)`: Stores the latitude.
* `longitude (float)`: Stores the longitude.
* `accuracy (Optional[float])`: Stores the accuracy.
* **2.4.4. Key Public Methods:**
* `from_dict(cls, geo_dict: dict) -> GeolocationConfig` (Class Method):
* Purpose: Creates a `GeolocationConfig` instance from a dictionary.
* `to_dict(self) -> dict`:
* Purpose: Converts the `GeolocationConfig` instance to a dictionary: `{"latitude": ..., "longitude": ..., "accuracy": ...}`.
* `clone(self, **kwargs) -> GeolocationConfig`:
* Purpose: Creates a copy of the `GeolocationConfig` instance, allowing for overriding specific attributes with `kwargs`.
### 2.5. `ProxyConfig`
Located in `crawl4ai.async_configs` (and `crawl4ai.proxy_strategy`).
* **2.5.1. Purpose:**
* Description: The `ProxyConfig` class encapsulates the configuration for a single proxy server, including its address, authentication credentials (if any), and optionally its public IP address.
* **2.5.2. Initialization (`__init__`)**
* Signature:
```python
class ProxyConfig:
def __init__(
self,
server: str,
username: Optional[str] = None,
password: Optional[str] = None,
ip: Optional[str] = None,
): ...
```
* Parameters:
* `server (str)`: The proxy server URL, including protocol and port (e.g., "http://127.0.0.1:8080", "socks5://proxy.example.com:1080").
* `username (Optional[str], default: None)`: The username for proxy authentication, if required.
* `password (Optional[str], default: None)`: The password for proxy authentication, if required.
* `ip (Optional[str], default: None)`: The public IP address of the proxy server. If not provided, it will be automatically extracted from the `server` string if possible.
* **2.5.3. Key Public Attributes/Properties:**
* `server (str)`: The proxy server URL.
* `username (Optional[str])`: The username for proxy authentication.
* `password (Optional[str])`: The password for proxy authentication.
* `ip (Optional[str])`: The public IP address of the proxy. This is either user-provided or automatically extracted from the `server` string during initialization via the internal `_extract_ip_from_server` method.
* **2.5.4. Key Public Methods:**
* `_extract_ip_from_server(self) -> Optional[str]` (Internal method):
* Purpose: Extracts the IP address component from the `self.server` URL string.
* `from_string(cls, proxy_str: str) -> ProxyConfig` (Class Method):
* Purpose: Creates a `ProxyConfig` instance from a string.
* Formats:
* `'ip:port:username:password'`
* `'ip:port'` (no authentication)
* `from_dict(cls, proxy_dict: dict) -> ProxyConfig` (Class Method):
* Purpose: Creates a `ProxyConfig` instance from a dictionary with keys "server", "username", "password", and "ip".
* `from_env(cls, env_var: str = "PROXIES") -> List[ProxyConfig]` (Class Method):
* Purpose: Loads a list of `ProxyConfig` objects from a comma-separated environment variable. Each proxy string in the variable should conform to the format accepted by `from_string`.
* `to_dict(self) -> dict`:
* Purpose: Converts the `ProxyConfig` instance to a dictionary: `{"server": ..., "username": ..., "password": ..., "ip": ...}`.
* `clone(self, **kwargs) -> ProxyConfig`:
* Purpose: Creates a copy of the `ProxyConfig` instance, allowing for overriding specific attributes with `kwargs`.
### 2.6. `HTTPCrawlerConfig`
Located in `crawl4ai.async_configs`.
* **2.6.1. Purpose:**
* Description: The `HTTPCrawlerConfig` class holds configuration settings specific to direct HTTP-based crawling strategies (e.g., `AsyncHTTPCrawlerStrategy`), which do not use a full browser environment.
* **2.6.2. Initialization (`__init__`)**
* Signature:
```python
class HTTPCrawlerConfig:
def __init__(
self,
method: str = "GET",
headers: Optional[Dict[str, str]] = None,
data: Optional[Dict[str, Any]] = None,
json: Optional[Dict[str, Any]] = None,
follow_redirects: bool = True,
verify_ssl: bool = True,
): ...
```
* Parameters:
* `method (str, default: "GET")`: The HTTP method to use for the request (e.g., "GET", "POST", "PUT").
* `headers (Optional[Dict[str, str]], default: None)`: A dictionary of custom HTTP headers to send with the request.
* `data (Optional[Dict[str, Any]], default: None)`: Data to be sent in the body of the request, typically for "POST" or "PUT" requests (e.g., form data).
* `json (Optional[Dict[str, Any]], default: None)`: JSON data to be sent in the body of the request. If provided, the `Content-Type` header is typically set to `application/json`.
* `follow_redirects (bool, default: True)`: If `True`, the crawler will automatically follow HTTP redirects.
* `verify_ssl (bool, default: True)`: If `True`, SSL certificates will be verified. Set to `False` to ignore SSL errors (use with caution).
* **2.6.3. Key Public Attributes/Properties:**
* All parameters listed in `__init__` are available as public attributes with the same names and types.
* **2.6.4. Key Public Methods:**
* `from_kwargs(cls, kwargs: dict) -> HTTPCrawlerConfig` (Class Method):
* Purpose: Creates an `HTTPCrawlerConfig` instance from a dictionary of keyword arguments.
* `to_dict(self) -> dict`:
* Purpose: Converts the `HTTPCrawlerConfig` instance into a dictionary representation.
* `clone(self, **kwargs) -> HTTPCrawlerConfig`:
* Purpose: Creates a deep copy of the current `HTTPCrawlerConfig` instance. Keyword arguments can be provided to override specific attributes in the new instance.
* `dump(self) -> dict`:
* Purpose: Serializes the `HTTPCrawlerConfig` object to a dictionary.
* `load(cls, data: dict) -> HTTPCrawlerConfig` (Class Method):
* Purpose: Deserializes an `HTTPCrawlerConfig` object from a dictionary.
## 3. Enumerations and Helper Constants
### 3.1. `CacheMode` (Enum)
Located in `crawl4ai.cache_context`.
* **3.1.1. Purpose:**
* Description: The `CacheMode` enumeration defines the different caching behaviors that can be applied to a crawl operation. It is used in `CrawlerRunConfig` to control how results are read from and written to the cache.
* **3.1.2. Enum Members:**
* `ENABLE (str)`: Value: "ENABLE". Description: Enables normal caching behavior. The crawler will attempt to read from the cache first, and if a result is not found or is stale, it will perform the crawl and write the new result to the cache.
* `DISABLE (str)`: Value: "DISABLE". Description: Disables all caching. The crawler will not read from or write to the cache. Every request will be a fresh crawl.
* `READ_ONLY (str)`: Value: "READ_ONLY". Description: The crawler will only attempt to read from the cache. If a result is found, it will be used. If not, the crawl will not proceed further for that URL, and no new data will be written to the cache.
* `WRITE_ONLY (str)`: Value: "WRITE_ONLY". Description: The crawler will not attempt to read from the cache. It will always perform a fresh crawl and then write the result to the cache.
* `BYPASS (str)`: Value: "BYPASS". Description: The crawler will skip reading from the cache for this specific operation and will perform a fresh crawl. The result of this crawl *will* be written to the cache. This is the default `cache_mode` for `CrawlerRunConfig`.
* **3.1.3. Usage:**
* Example:
```python
from crawl4ai import CrawlerRunConfig, CacheMode
config = CrawlerRunConfig(cache_mode=CacheMode.ENABLE) # Use cache fully
config_bypass = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) # Force fresh crawl, then cache
```
## 4. Serialization Helper Functions
Located in `crawl4ai.async_configs`.
### 4.1. `to_serializable_dict(obj: Any, ignore_default_value: bool = False) -> Dict`
* **4.1.1. Purpose:**
* Description: This utility function recursively converts various Python objects, including `crawl4ai` configuration objects, into a dictionary format that is suitable for JSON serialization. It uses a `{ "type": "ClassName", "params": { ... } }` structure for custom class instances to enable proper deserialization later.
* **4.1.2. Parameters:**
* `obj (Any)`: The Python object to be serialized.
* `ignore_default_value (bool, default: False)`: If `True`, when serializing class instances, parameters whose current values match their `__init__` default values might be excluded from the "params" dictionary. (Note: The exact behavior depends on the availability of default values in the class signature and handling of empty/None values).
* **4.1.3. Returns:**
* `Dict`: A dictionary representation of the input object, structured for easy serialization (e.g., to JSON) and later deserialization by `from_serializable_dict`.
* **4.1.4. Key Behaviors:**
* **Basic Types:** `str`, `int`, `float`, `bool`, `None` are returned as is.
* **Enums:** Serialized as `{"type": "EnumClassName", "params": enum_member.value}`.
* **Datetime Objects:** Serialized to their ISO 8601 string representation.
* **Lists, Tuples, Sets, Frozensets:** Serialized by recursively calling `to_serializable_dict` on each of their elements, returning a list.
* **Plain Dictionaries:** Serialized as `{"type": "dict", "value": {key: serialized_value, ...}}`.
* **Class Instances (e.g., Config Objects):**
* The object's class name is stored in the "type" field.
* Parameters from the `__init__` signature and attributes from `__slots__` (if defined) are collected.
* Their current values are recursively serialized and stored in the "params" dictionary.
* The structure is `{"type": "ClassName", "params": {"param_name": serialized_param_value, ...}}`.
### 4.2. `from_serializable_dict(data: Any) -> Any`
* **4.2.1. Purpose:**
* Description: This utility function reconstructs Python objects, including `crawl4ai` configuration objects, from the serializable dictionary format previously created by `to_serializable_dict`.
* **4.2.2. Parameters:**
* `data (Any)`: The dictionary (or basic data type) to be deserialized. This is typically the output of `to_serializable_dict` after being, for example, loaded from a JSON string.
* **4.2.3. Returns:**
* `Any`: The reconstructed Python object (e.g., an instance of `BrowserConfig`, `LLMConfig`, a list, a plain dictionary, etc.).
* **4.2.4. Key Behaviors:**
* **Basic Types:** `str`, `int`, `float`, `bool`, `None` are returned as is.
* **Typed Dictionaries (from `to_serializable_dict`):**
* If `data` is a dictionary and contains a "type" key:
* If `data["type"] == "dict"`, it reconstructs a plain Python dictionary from `data["value"]` by recursively deserializing its items.
* Otherwise, it attempts to locate the class specified by `data["type"]` within the `crawl4ai` module.
* If the class is an `Enum`, it instantiates the enum member using `data["params"]` (the enum value).
* If it's a regular class, it recursively deserializes the items in `data["params"]` and uses them as keyword arguments (`**kwargs`) to instantiate the class.
* **Lists:** If `data` is a list, it reconstructs a list by recursively calling `from_serializable_dict` on each of its elements.
* **Legacy Dictionaries:** If `data` is a dictionary but does not conform to the "type" key structure (for backward compatibility), it attempts to deserialize its values.
## 5. Cross-References and Relationships
* **5.1. `BrowserConfig` Usage:**
* Typically instantiated once and passed to the `AsyncWebCrawler` constructor via its `config` parameter.
* `browser_config = BrowserConfig(headless=False)`
* `crawler = AsyncWebCrawler(config=browser_config)`
* It defines the global browser settings that will be used for all subsequent crawl operations unless overridden by `CrawlerRunConfig` on a per-run basis.
* **5.2. `CrawlerRunConfig` Usage:**
* Passed to the `arun()` or `arun_many()` methods of `AsyncWebCrawler`.
* `run_config = CrawlerRunConfig(screenshot=True, cache_mode=CacheMode.BYPASS)`
* `result = await crawler.arun(url="https://example.com", config=run_config)`
* Allows for fine-grained control over individual crawl requests, overriding global settings from `BrowserConfig` or `AsyncWebCrawler`'s defaults where applicable (e.g., `user_agent`, `proxy_config`, `cache_mode`).
* **5.3. `LLMConfig` Usage:**
* Instantiated and passed to LLM-based extraction strategies (e.g., `LLMExtractionStrategy`) or content filters (`LLMContentFilter`) during their initialization.
* `llm_conf = LLMConfig(provider="openai/gpt-4o-mini", api_token="sk-...")`
* `extraction_strategy = LLMExtractionStrategy(llm_config=llm_conf, schema=my_schema)`
* **5.4. `GeolocationConfig` and `ProxyConfig` Usage:**
* `GeolocationConfig` is typically instantiated and assigned to the `geolocation` parameter of `CrawlerRunConfig`.
* `geo_conf = GeolocationConfig(latitude=34.0522, longitude=-118.2437)`
* `run_config = CrawlerRunConfig(geolocation=geo_conf)`
* `ProxyConfig` can be assigned to the `proxy_config` parameter of `BrowserConfig` (for a global proxy applied to all contexts) or `CrawlerRunConfig` (for a proxy specific to a single crawl run).
* `proxy_conf = ProxyConfig(server="http://myproxy:8080")`
* `browser_config = BrowserConfig(proxy_config=proxy_conf)` (global)
* `run_config = CrawlerRunConfig(proxy_config=proxy_conf)` (per-run)
* **5.5. `HTTPCrawlerConfig` Usage:**
* Used when the `crawler_strategy` for `AsyncWebCrawler` is set to `AsyncHTTPCrawlerStrategy` (for non-browser-based HTTP requests).
* `http_conf = HTTPCrawlerConfig(method="POST", json={"key": "value"})`
* `http_strategy = AsyncHTTPCrawlerStrategy(http_crawler_config=http_conf)`
* `crawler = AsyncWebCrawler(crawler_strategy=http_strategy)`
* Alternatively, parameters like `method`, `data`, `json` can be passed directly to `arun()` when using `AsyncHTTPCrawlerStrategy` if they are part of the `CrawlerRunConfig`.

File diff suppressed because it is too large Load Diff

View File

@@ -1,356 +0,0 @@
```markdown
# Examples Outline for crawl4ai - core Component
**Target Document Type:** Examples Collection
**Target Output Filename Suggestion:** `llm_examples_core.md`
**Library Version Context:** 0.6.3
**Outline Generation Date:** 2024-05-24 10:00:00
---
This document provides a collection of runnable code examples for the `core` component of the `crawl4ai` library. Each example is designed to showcase a specific feature or configuration.
## 1. Basic `AsyncWebCrawler` Usage
### 1.1. Example: Simplest crawl of a single URL with default `BrowserConfig` and `CrawlerRunConfig`.
```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def simplest_crawl():
    """Minimal crawl of a single URL using the default BrowserConfig and CrawlerRunConfig."""
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        if not result.success:
            print(f"Crawl failed: {result.error_message}")
        else:
            print("Crawl successful!")
            print(f"Markdown (first 300 chars):\n{result.markdown.raw_markdown[:300]}...")

if __name__ == "__main__":
    asyncio.run(simplest_crawl())
```
---
### 1.2. Example: Using `AsyncWebCrawler` as an asynchronous context manager (`async with`).
This is the recommended way to manage the crawler's lifecycle.
```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def context_manager_crawl():
    """Recommended lifecycle handling: `async with` opens and closes the crawler automatically."""
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com")
        if not result.success:
            print(f"Crawl failed: {result.error_message}")
        else:
            print("Crawl successful using context manager!")
            print(f"Page title from metadata: {result.metadata.get('title')}")

if __name__ == "__main__":
    asyncio.run(context_manager_crawl())
```
---
### 1.3. Example: Explicitly starting and closing the `AsyncWebCrawler` using `start()` and `close()`.
Useful for scenarios where the crawler's lifecycle needs more manual control.
```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def explicit_lifecycle_crawl():
    """Manual lifecycle control: call start() and close() instead of using a context manager."""
    crawler = AsyncWebCrawler()
    await crawler.start()  # Explicitly boot the crawler and its browser.
    try:
        result = await crawler.arun(url="https://example.com")
        if not result.success:
            print(f"Crawl failed: {result.error_message}")
        else:
            print("Crawl successful with explicit start/close!")
            print(f"Cleaned HTML (first 300 chars):\n{result.cleaned_html[:300]}...")
    finally:
        await crawler.close()  # Always release browser resources, even on error.

if __name__ == "__main__":
    asyncio.run(explicit_lifecycle_crawl())
```
---
### 1.4. Example: Handling a failed crawl (e.g., non-existent URL, network error) and checking `CrawlResult.success` and `CrawlResult.error_message`.
```python
import asyncio
from crawl4ai import AsyncWebCrawler

async def failed_crawl_handling():
    """Show how to inspect CrawlResult.success and error details when a crawl fails."""
    async with AsyncWebCrawler() as crawler:
        # Using a deliberately non-existent URL
        result = await crawler.arun(url="https://thissitedoesnotexist.crawl4ai")
        if result.success:
            print("Crawl unexpectedly succeeded!")
        else:
            print(f"Crawl failed as expected for URL: {result.url}")
            print(f"Status Code: {result.status_code}")
            print(f"Error Message: {result.error_message}")

if __name__ == "__main__":
    asyncio.run(failed_crawl_handling())
```
---
### 1.5. Example: Processing raw HTML content directly using `crawler.aprocess_html()`.
This is useful if you already have HTML content and want to use Crawl4ai's processing capabilities.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig

async def process_raw_html_directly():
    """Feed pre-fetched HTML straight into Crawl4ai's processing pipeline via aprocess_html()."""
    sample_html = """
    <html>
    <head><title>My Test Page</title></head>
    <body>
    <h1>Welcome!</h1>
    <p>This is a paragraph with a <a href="https://example.com">link</a>.</p>
    <script>console.log("This should be removed");</script>
    </body>
    </html>
    """
    # No page navigation happens here, so a BrowserConfig is unnecessary.
    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig()  # Customize processing options here if needed.
        result = await crawler.aprocess_html(
            url="raw://my_virtual_page",  # Conceptual URL for the raw content.
            html=sample_html,
            config=config,
        )
        if not result.success:
            print(f"HTML processing failed: {result.error_message}")
        else:
            print("Raw HTML processed successfully!")
            print(f"Markdown:\n{result.markdown.raw_markdown}")
            print(f"Cleaned HTML:\n{result.cleaned_html}")

if __name__ == "__main__":
    asyncio.run(process_raw_html_directly())
```
---
### 1.6. Example: Crawling a local HTML file using the `file:///` prefix.
The script below creates a dummy HTML file named `local_test_for_crawl.html` next to itself (matching the commented example content) and removes it when finished — no manual setup is required.
```python
# local_test.html
# <!DOCTYPE html>
# <html>
# <head>
# <title>Local Test File</title>
# </head>
# <body>
# <h1>Hello from a local file!</h1>
# <p>This content is loaded from the local filesystem.</p>
# </body>
# </html>
```
```python
import asyncio
import os
from pathlib import Path
from crawl4ai import AsyncWebCrawler
async def crawl_local_file():
# Create a dummy local HTML file for the example
script_dir = Path(__file__).parent
local_file_path = script_dir / "local_test_for_crawl.html"
with open(local_file_path, "w", encoding="utf-8") as f:
f.write("<!DOCTYPE html><html><head><title>Local Test</title></head><body><h1>Local Content</h1></body></html>")
file_url = f"file:///{local_file_path.resolve()}"
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url=file_url)
if result.success:
print(f"Successfully crawled local file: {file_url}")
print(f"Markdown (first 100 chars): {result.markdown.raw_markdown[:100]}...")
else:
print(f"Failed to crawl local file: {result.error_message}")
# Clean up the dummy file
if os.path.exists(local_file_path):
os.remove(local_file_path)
if __name__ == "__main__":
asyncio.run(crawl_local_file())
```
---
### 1.7. Example: Accessing basic fields from `CrawlResult` (e.g., `url`, `html`, `markdown.raw_markdown`, `status_code`, `response_headers`).
```python
import asyncio
from crawl4ai import AsyncWebCrawler
async def access_crawl_result_fields():
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://example.com")
if result.success:
print(f"URL Crawled: {result.url}")
print(f"Status Code: {result.status_code}")
print("\n--- Response Headers (sample) ---")
if result.response_headers:
for key, value in list(result.response_headers.items())[:3]: # Print first 3 headers
print(f"{key}: {value}")
print(f"\n--- Raw HTML (first 100 chars) ---\n{result.html[:100]}...")
print(f"\n--- Cleaned HTML (first 100 chars) ---\n{result.cleaned_html[:100]}...")
if result.markdown:
print(f"\n--- Raw Markdown (first 100 chars) ---\n{result.markdown.raw_markdown[:100]}...")
print(f"\n--- Metadata (sample) ---")
if result.metadata:
for key, value in list(result.metadata.items())[:3]: # Print first 3 metadata items
print(f"{key}: {value}")
else:
print(f"Crawl failed: {result.error_message}")
if __name__ == "__main__":
asyncio.run(access_crawl_result_fields())
```
---
## 2. Configuring the Browser (`BrowserConfig`)
### 2.1. Example: Initializing `AsyncWebCrawler` with a custom `BrowserConfig` object.
This example sets the browser to run in non-headless mode and uses Firefox.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig
async def custom_browser_config_init():
# Configure browser to be Firefox and visible
browser_config = BrowserConfig(
browser_type="firefox",
headless=False # Set to True to run without UI
)
# Pass the custom config to the crawler
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com")
if result.success:
print(f"Crawl successful with custom BrowserConfig (Firefox, visible)!")
print(f"Page title: {result.metadata.get('title')}")
else:
print(f"Crawl failed: {result.error_message}")
if __name__ == "__main__":
# This example might open a visible browser window.
# Ensure Firefox is installed if you run this.
# asyncio.run(custom_browser_config_init())
print("Skipping custom_browser_config_init example in automated run to avoid GUI interaction.")
```
---
### 2.2. Browser Type and Headless Mode
#### 2.2.1. Example: Using Chromium browser (default).
This shows the default behavior if no `browser_type` is specified.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig
async def chromium_default_crawl():
# Chromium is the default, but we can explicitly set it
browser_config = BrowserConfig(browser_type="chromium", headless=True)
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com")
if result.success:
print("Crawl successful with Chromium (default)!")
else:
print(f"Crawl failed: {result.error_message}")
if __name__ == "__main__":
asyncio.run(chromium_default_crawl())
```
---
#### 2.2.2. Example: Using Firefox browser (`browser_type="firefox"`).
Ensure Firefox is installed on your system for this example to run.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig
async def firefox_crawl():
browser_config = BrowserConfig(browser_type="firefox", headless=True)
try:
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com")
if result.success:
print("Crawl successful with Firefox!")
else:
print(f"Crawl failed with Firefox: {result.error_message}")
except Exception as e:
print(f"Error running Firefox example: {e}. Ensure Firefox is installed and Playwright browsers are set up (`crawl4ai-setup`).")
if __name__ == "__main__":
# asyncio.run(firefox_crawl())
print("Skipping Firefox example in automated run. Uncomment to run if Firefox is installed.")
```
---
#### 2.2.3. Example: Using WebKit browser (`browser_type="webkit"`).
Ensure WebKit (Safari's engine) is installed via Playwright.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig
async def webkit_crawl():
browser_config = BrowserConfig(browser_type="webkit", headless=True)
try:
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https://example.com")
if result.success:
print("Crawl successful with WebKit!")
else:
print(f"Crawl failed with WebKit: {result.error_message}")
except Exception as e:
print(f"Error running WebKit example: {e}. Ensure WebKit is installed and Playwright browsers are set up (`crawl4ai-setup`).")
if __name__ == "__main__":
# asyncio.run(webkit_crawl())
print("Skipping WebKit example in automated run. Uncomment to run if WebKit is installed.")
```
---
#### 2.2.4. Example: Running the browser in non-headless mode (`headless=False`) for visual debugging.
This will open a visible browser window.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig
async def non_headless_crawl():
browser_config = BrowserConfig(headless=False) # Browser window will be visible
async with AsyncWebCrawler(config=browser_config) as crawler:
result = await crawler.arun(url="https
```

View File

@@ -1,890 +0,0 @@
Okay, I have read the objective and instructions. I will now generate the detailed Markdown outline for a "Foundational Memory" document for the `core` component of `crawl4ai`, using the provided `code_analysis_output.md` (the concatenated code snippets from the previous prompt) as the primary source of truth for API details.
```markdown
# Detailed Outline for crawl4ai - core Component
**Target Document Type:** memory
**Target Output Filename Suggestion:** `llm_memory_core.md`
**Library Version Context:** 0.6.3
**Outline Generation Date:** 2025-05-24
---
## 1. Introduction to Core Components
* 1.1. Purpose: Provides the foundational classes, configurations, and data models for web crawling and scraping operations within the `crawl4ai` library.
* 1.2. Key Functionalities:
* Orchestration of asynchronous web crawling (`AsyncWebCrawler`).
* Configuration of browser behavior and specific crawl runs (`BrowserConfig`, `CrawlerRunConfig`).
* Standardized data structures for crawl results and associated data (`CrawlResult`, `Media`, `Links`, etc.).
* Strategies for fetching web content (`AsyncPlaywrightCrawlerStrategy`, `AsyncHTTPCrawlerStrategy`).
* Management of browser instances and sessions (`BrowserManager`, `ManagedBrowser`).
* Asynchronous logging (`AsyncLogger`).
* 1.3. Relationship with other `crawl4ai` components:
* The `core` component serves as the foundation upon which specialized strategies (e.g., PDF processing, Markdown generation, content extraction, chunking, content filtering) are built and integrated.
## 2. Main Class: `AsyncWebCrawler`
* 2.1. Purpose: The primary class for orchestrating asynchronous web crawling operations. It manages browser instances (via a `BrowserManager`), applies crawling strategies, and processes HTML content to produce structured results.
* 2.2. Initialization (`__init__`)
* 2.2.1. Signature:
```python
def __init__(
self,
crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
config: Optional[BrowserConfig] = None,
base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
thread_safe: bool = False,
logger: Optional[AsyncLoggerBase] = None,
**kwargs,
):
```
* 2.2.2. Parameters:
* `crawler_strategy (Optional[AsyncCrawlerStrategy])`: The strategy to use for fetching web content. If `None`, defaults to `AsyncPlaywrightCrawlerStrategy` initialized with `config` and `logger`.
* `config (Optional[BrowserConfig])`: Configuration object for browser settings. If `None`, a default `BrowserConfig()` is created.
* `base_directory (str)`: The base directory for storing crawl4ai related files, such as cache and logs. Defaults to `os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())`.
* `thread_safe (bool)`: If `True`, uses an `asyncio.Lock` for thread-safe operations, particularly relevant for `arun_many`. Default: `False`.
* `logger (Optional[AsyncLoggerBase])`: An instance of a logger. If `None`, a default `AsyncLogger` is initialized using `base_directory` and `config.verbose`.
* `**kwargs`: Additional keyword arguments, primarily for backward compatibility, passed to the `AsyncPlaywrightCrawlerStrategy` if `crawler_strategy` is not provided.
* 2.3. Key Public Attributes/Properties:
* `browser_config (BrowserConfig)`: Read-only. The browser configuration object used by the crawler.
* `crawler_strategy (AsyncCrawlerStrategy)`: Read-only. The active crawling strategy instance.
* `logger (AsyncLoggerBase)`: Read-only. The logger instance used by the crawler.
* `ready (bool)`: Read-only. `True` if the crawler has been started and is ready to perform crawl operations, `False` otherwise.
* 2.4. Lifecycle Methods:
* 2.4.1. `async start() -> AsyncWebCrawler`:
* Purpose: Asynchronously initializes the crawler strategy (e.g., launches the browser). This must be called before `arun` or `arun_many` if the crawler is not used as an asynchronous context manager.
* Returns: The `AsyncWebCrawler` instance (`self`).
* 2.4.2. `async close() -> None`:
* Purpose: Asynchronously closes the crawler strategy and cleans up resources (e.g., closes the browser). This should be called if `start()` was used explicitly.
* 2.4.3. `async __aenter__() -> AsyncWebCrawler`:
* Purpose: Entry point for asynchronous context management. Calls `self.start()`.
* Returns: The `AsyncWebCrawler` instance (`self`).
* 2.4.4. `async __aexit__(exc_type, exc_val, exc_tb) -> None`:
* Purpose: Exit point for asynchronous context management. Calls `self.close()`.
* 2.5. Primary Crawl Methods:
* 2.5.1. `async arun(url: str, config: Optional[CrawlerRunConfig] = None, **kwargs) -> RunManyReturn`:
* Purpose: Performs a single crawl operation for the given URL or raw HTML content.
* Parameters:
* `url (str)`: The URL to crawl (e.g., "http://example.com", "file:///path/to/file.html") or raw HTML content prefixed with "raw:" (e.g., "raw:<html>...</html>").
* `config (Optional[CrawlerRunConfig])`: Configuration for this specific crawl run. If `None`, a default `CrawlerRunConfig()` is used.
* `**kwargs`: Additional parameters passed to the underlying `aprocess_html` method, can be used to override settings in `config`.
* Returns: `RunManyReturn` (which resolves to `CrawlResultContainer` containing a single `CrawlResult`).
* 2.5.2. `async arun_many(urls: List[str], config: Optional[CrawlerRunConfig] = None, dispatcher: Optional[BaseDispatcher] = None, **kwargs) -> RunManyReturn`:
* Purpose: Crawls multiple URLs concurrently using a specified or default dispatcher strategy.
* Parameters:
* `urls (List[str])`: A list of URLs to crawl.
* `config (Optional[CrawlerRunConfig])`: Configuration applied to all crawl runs in this batch.
* `dispatcher (Optional[BaseDispatcher])`: The dispatcher strategy to manage concurrent crawls. Defaults to `MemoryAdaptiveDispatcher`.
* `**kwargs`: Additional parameters passed to the underlying `arun` method for each URL.
* Returns: `RunManyReturn`. If `config.stream` is `True`, returns an `AsyncGenerator[CrawlResult, None]`. Otherwise, returns a `CrawlResultContainer` (list-like) of `CrawlResult` objects.
* 2.6. Internal Processing Method (User-Facing Effects):
* 2.6.1. `async aprocess_html(url: str, html: str, extracted_content: Optional[str], config: CrawlerRunConfig, screenshot_data: Optional[str], pdf_data: Optional[bytes], verbose: bool, **kwargs) -> CrawlResult`:
* Purpose: Processes the fetched HTML content. This method is called internally by `arun` after content is fetched (either from a live crawl or cache). It applies scraping strategies, content filtering, and Markdown generation based on the `config`.
* Parameters:
* `url (str)`: The URL of the content being processed.
* `html (str)`: The raw HTML content.
* `extracted_content (Optional[str])`: Pre-extracted content from a previous step or cache.
* `config (CrawlerRunConfig)`: Configuration for this processing run.
* `screenshot_data (Optional[str])`: Base64 encoded screenshot data, if available.
* `pdf_data (Optional[bytes])`: PDF data, if available.
* `verbose (bool)`: Verbosity setting for logging during processing.
* `**kwargs`: Additional parameters, including `is_raw_html` and `redirected_url`.
* Returns: A `CrawlResult` object containing the processed data.
## 3. Core Configuration Objects
* 3.1. Class `BrowserConfig` (from `crawl4ai.async_configs`)
* 3.1.1. Purpose: Configures the browser instance launched by Playwright, including its type, mode, display settings, proxy, user agent, and persistent storage options.
* 3.1.2. Initialization (`__init__`)
* Signature:
```python
def __init__(
self,
browser_type: str = "chromium",
headless: bool = True,
browser_mode: str = "dedicated",
use_managed_browser: bool = False,
cdp_url: Optional[str] = None,
use_persistent_context: bool = False,
user_data_dir: Optional[str] = None,
channel: Optional[str] = "chromium", # Note: 'channel' from code, outline had 'chrome_channel'
proxy: Optional[str] = None, # Note: 'proxy' from code, outline had 'proxy_config' for this level
proxy_config: Optional[Union[ProxyConfig, dict, None]] = None,
viewport_width: int = 1080,
viewport_height: int = 600,
viewport: Optional[dict] = None,
accept_downloads: bool = False,
downloads_path: Optional[str] = None,
storage_state: Optional[Union[str, dict, None]] = None,
ignore_https_errors: bool = True,
java_script_enabled: bool = True,
sleep_on_close: bool = False,
verbose: bool = True,
cookies: Optional[list] = None,
headers: Optional[dict] = None,
user_agent: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36",
user_agent_mode: str = "",
user_agent_generator_config: Optional[dict] = None, # Note: 'user_agent_generator_config' from code
text_mode: bool = False,
light_mode: bool = False,
extra_args: Optional[list] = None,
debugging_port: int = 9222,
host: str = "localhost",
):
```
* Key Parameters:
* `browser_type (str)`: Type of browser to launch ("chromium", "firefox", "webkit"). Default: "chromium".
* `headless (bool)`: Whether to run the browser in headless mode. Default: `True`.
* `browser_mode (str)`: How the browser should be initialized ("builtin", "dedicated", "cdp", "docker"). Default: "dedicated".
* `use_managed_browser (bool)`: Whether to launch the browser using a managed approach (e.g., via CDP). Default: `False`.
* `cdp_url (Optional[str])`: URL for Chrome DevTools Protocol endpoint. Default: `None`.
* `use_persistent_context (bool)`: Use a persistent browser context (profile). Default: `False`.
* `user_data_dir (Optional[str])`: Path to user data directory for persistent sessions. Default: `None`.
* `channel (Optional[str])`: Browser channel (e.g., "chromium", "chrome", "msedge"). Default: "chromium".
* `proxy (Optional[str])`: Simple proxy server URL string.
* `proxy_config (Optional[Union[ProxyConfig, dict, None]])`: Detailed proxy configuration object or dictionary. Takes precedence over `proxy`.
* `viewport_width (int)`: Default viewport width. Default: `1080`.
* `viewport_height (int)`: Default viewport height. Default: `600`.
* `viewport (Optional[dict])`: Dictionary to set viewport dimensions, overrides `viewport_width` and `viewport_height` if set (e.g., `{"width": 1920, "height": 1080}`). Default: `None`.
* `accept_downloads (bool)`: Whether to allow file downloads. Default: `False`.
* `downloads_path (Optional[str])`: Directory to store downloaded files. Default: `None`.
* `storage_state (Optional[Union[str, dict, None]])`: Path to a file or a dictionary containing browser storage state (cookies, localStorage). Default: `None`.
* `ignore_https_errors (bool)`: Ignore HTTPS certificate errors. Default: `True`.
* `java_script_enabled (bool)`: Enable JavaScript execution. Default: `True`.
* `user_agent (str)`: Custom User-Agent string. Default: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36".
* `user_agent_mode (str)`: Mode for generating User-Agent (e.g., "random"). Default: `""` (uses provided `user_agent`).
* `user_agent_generator_config (Optional[dict])`: Configuration for User-Agent generation if `user_agent_mode` is active. Default: `None` (treated as an empty configuration).
* `text_mode (bool)`: If `True`, disables images and rich content for faster loading. Default: `False`.
* `light_mode (bool)`: Disables certain background features for performance. Default: `False`.
* `extra_args (Optional[list])`: Additional command-line arguments for the browser. Default: `None` (resolves to `[]`).
* `debugging_port (int)`: Port for browser debugging protocol. Default: `9222`.
* `host (str)`: Host for browser debugging protocol. Default: "localhost".
* 3.1.3. Key Public Methods:
* `clone(**kwargs) -> BrowserConfig`: Creates a new `BrowserConfig` instance as a copy of the current one, with specified keyword arguments overriding existing values.
* `to_dict() -> dict`: Returns a dictionary representation of the configuration object's attributes.
* `dump() -> dict`: Serializes the configuration object to a JSON-serializable dictionary, including nested objects.
* `static load(data: dict) -> BrowserConfig`: Deserializes a `BrowserConfig` instance from a dictionary (previously created by `dump`).
* `static from_kwargs(kwargs: dict) -> BrowserConfig`: Creates a `BrowserConfig` instance directly from a dictionary of keyword arguments.
* 3.2. Class `CrawlerRunConfig` (from `crawl4ai.async_configs`)
* 3.2.1. Purpose: Specifies settings for an individual crawl operation initiated by `arun()` or `arun_many()`. These settings can override or augment the global `BrowserConfig`.
* 3.2.2. Initialization (`__init__`)
* Signature:
```python
def __init__(
self,
# Content Processing Parameters
word_count_threshold: int = MIN_WORD_THRESHOLD,
extraction_strategy: Optional[ExtractionStrategy] = None,
chunking_strategy: ChunkingStrategy = RegexChunking(),
markdown_generator: MarkdownGenerationStrategy = DefaultMarkdownGenerator(),
only_text: bool = False,
css_selector: Optional[str] = None,
target_elements: Optional[List[str]] = None,
excluded_tags: Optional[list] = None,
excluded_selector: Optional[str] = None,
keep_data_attributes: bool = False,
keep_attrs: Optional[list] = None,
remove_forms: bool = False,
prettify: bool = False,
parser_type: str = "lxml",
scraping_strategy: ContentScrapingStrategy = None, # Will default to WebScrapingStrategy
proxy_config: Optional[Union[ProxyConfig, dict, None]] = None,
proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
# Browser Location and Identity Parameters
locale: Optional[str] = None,
timezone_id: Optional[str] = None,
geolocation: Optional[GeolocationConfig] = None,
# SSL Parameters
fetch_ssl_certificate: bool = False,
# Caching Parameters
cache_mode: CacheMode = CacheMode.BYPASS,
session_id: Optional[str] = None,
bypass_cache: bool = False, # Legacy
disable_cache: bool = False, # Legacy
no_cache_read: bool = False, # Legacy
no_cache_write: bool = False, # Legacy
shared_data: Optional[dict] = None,
# Page Navigation and Timing Parameters
wait_until: str = "domcontentloaded",
page_timeout: int = PAGE_TIMEOUT,
wait_for: Optional[str] = None,
wait_for_timeout: Optional[int] = None,
wait_for_images: bool = False,
delay_before_return_html: float = 0.1,
mean_delay: float = 0.1,
max_range: float = 0.3,
semaphore_count: int = 5,
# Page Interaction Parameters
js_code: Optional[Union[str, List[str]]] = None,
js_only: bool = False,
ignore_body_visibility: bool = True,
scan_full_page: bool = False,
scroll_delay: float = 0.2,
process_iframes: bool = False,
remove_overlay_elements: bool = False,
simulate_user: bool = False,
override_navigator: bool = False,
magic: bool = False,
adjust_viewport_to_content: bool = False,
# Media Handling Parameters
screenshot: bool = False,
screenshot_wait_for: Optional[float] = None,
screenshot_height_threshold: int = SCREENSHOT_HEIGHT_THRESHOLD,
pdf: bool = False,
capture_mhtml: bool = False,
image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
table_score_threshold: int = 7,
exclude_external_images: bool = False,
exclude_all_images: bool = False,
# Link and Domain Handling Parameters
exclude_social_media_domains: Optional[list] = None, # Note: 'exclude_social_media_domains' from code
exclude_external_links: bool = False,
exclude_social_media_links: bool = False,
exclude_domains: Optional[list] = None,
exclude_internal_links: bool = False,
# Debugging and Logging Parameters
verbose: bool = True,
log_console: bool = False,
# Network and Console Capturing Parameters
capture_network_requests: bool = False,
capture_console_messages: bool = False,
# Connection Parameters (for HTTPCrawlerStrategy)
method: str = "GET",
stream: bool = False,
url: Optional[str] = None,
# Robots.txt Handling
check_robots_txt: bool = False,
# User Agent Parameters
user_agent: Optional[str] = None,
user_agent_mode: Optional[str] = None,
user_agent_generator_config: Optional[dict] = None, # Note: 'user_agent_generator_config' from code
# Deep Crawl Parameters
deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
# Experimental Parameters
experimental: Optional[Dict[str, Any]] = None,
):
```
* Key Parameters:
* `word_count_threshold (int)`: Minimum word count for a content block to be considered. Default: `MIN_WORD_THRESHOLD` (200).
* `extraction_strategy (Optional[ExtractionStrategy])`: Strategy for structured data extraction (e.g., `LLMExtractionStrategy`, `JsonCssExtractionStrategy`). Default: `None` (falls back to `NoExtractionStrategy`).
* `chunking_strategy (ChunkingStrategy)`: Strategy for splitting content into chunks before extraction. Default: `RegexChunking()`.
* `markdown_generator (MarkdownGenerationStrategy)`: Strategy for converting HTML to Markdown. Default: `DefaultMarkdownGenerator()`.
* `cache_mode (CacheMode)`: Caching behavior for this run. Default: `CacheMode.BYPASS`.
* `session_id (Optional[str])`: ID for session persistence (reusing browser tabs/contexts). Default: `None`.
* `js_code (Optional[Union[str, List[str]]])`: JavaScript code snippets to execute on the page. Default: `None`.
* `wait_for (Optional[str])`: CSS selector or JS condition (prefixed with "js:") to wait for before proceeding. Default: `None`.
* `page_timeout (int)`: Timeout for page operations (e.g., navigation) in milliseconds. Default: `PAGE_TIMEOUT` (60000ms).
* `screenshot (bool)`: If `True`, capture a screenshot of the page. Default: `False`.
* `pdf (bool)`: If `True`, generate a PDF of the page. Default: `False`.
* `capture_mhtml (bool)`: If `True`, capture an MHTML snapshot of the page. Default: `False`.
* `exclude_external_links (bool)`: If `True`, exclude external links from results. Default: `False`.
* `stream (bool)`: If `True` (used with `arun_many`), results are yielded as an `AsyncGenerator`. Default: `False`.
* `check_robots_txt (bool)`: If `True`, crawler will check and respect `robots.txt` rules. Default: `False`.
* `user_agent (Optional[str])`: Override the browser's User-Agent for this specific run.
* 3.2.3. Key Public Methods:
* `clone(**kwargs) -> CrawlerRunConfig`: Creates a new `CrawlerRunConfig` instance as a copy of the current one, with specified keyword arguments overriding existing values.
* `to_dict() -> dict`: Returns a dictionary representation of the configuration object's attributes.
* `dump() -> dict`: Serializes the configuration object to a JSON-serializable dictionary, including nested objects.
* `static load(data: dict) -> CrawlerRunConfig`: Deserializes a `CrawlerRunConfig` instance from a dictionary (previously created by `dump`).
* `static from_kwargs(kwargs: dict) -> CrawlerRunConfig`: Creates a `CrawlerRunConfig` instance directly from a dictionary of keyword arguments.
* 3.3. Supporting Configuration Objects (from `crawl4ai.async_configs`)
* 3.3.1. Class `GeolocationConfig`
* Purpose: Defines geolocation (latitude, longitude, accuracy) to be emulated by the browser.
* Initialization (`__init__`):
```python
def __init__(
self,
latitude: float,
longitude: float,
accuracy: Optional[float] = 0.0
):
```
* Parameters:
* `latitude (float)`: Latitude coordinate (e.g., 37.7749).
* `longitude (float)`: Longitude coordinate (e.g., -122.4194).
* `accuracy (Optional[float])`: Accuracy in meters. Default: `0.0`.
* Methods:
* `static from_dict(geo_dict: Dict) -> GeolocationConfig`: Creates an instance from a dictionary.
* `to_dict() -> Dict`: Converts the instance to a dictionary.
* `clone(**kwargs) -> GeolocationConfig`: Creates a copy with updated values.
* 3.3.2. Class `ProxyConfig`
* Purpose: Defines the settings for a single proxy server, including server address, authentication credentials, and optional IP.
* Initialization (`__init__`):
```python
def __init__(
self,
server: str,
username: Optional[str] = None,
password: Optional[str] = None,
ip: Optional[str] = None,
):
```
* Parameters:
* `server (str)`: Proxy server URL (e.g., "http://127.0.0.1:8080", "socks5://user:pass@host:port").
* `username (Optional[str])`: Username for proxy authentication.
* `password (Optional[str])`: Password for proxy authentication.
* `ip (Optional[str])`: Optional IP address associated with the proxy for verification.
* Methods:
* `static from_string(proxy_str: str) -> ProxyConfig`: Creates an instance from a string (e.g., "ip:port:username:password" or "ip:port").
* `static from_dict(proxy_dict: Dict) -> ProxyConfig`: Creates an instance from a dictionary.
* `static from_env(env_var: str = "PROXIES") -> List[ProxyConfig]`: Loads a list of proxies from a comma-separated environment variable.
* `to_dict() -> Dict`: Converts the instance to a dictionary.
* `clone(**kwargs) -> ProxyConfig`: Creates a copy with updated values.
* 3.3.3. Class `HTTPCrawlerConfig`
* Purpose: Configuration for the `AsyncHTTPCrawlerStrategy`, specifying HTTP method, headers, data/JSON payload, and redirect/SSL verification behavior.
* Initialization (`__init__`):
```python
def __init__(
self,
method: str = "GET",
headers: Optional[Dict[str, str]] = None,
data: Optional[Dict[str, Any]] = None,
json: Optional[Dict[str, Any]] = None,
follow_redirects: bool = True,
verify_ssl: bool = True,
):
```
* Parameters:
* `method (str)`: HTTP method (e.g., "GET", "POST"). Default: "GET".
* `headers (Optional[Dict[str, str]])`: Dictionary of HTTP request headers. Default: `None`.
* `data (Optional[Dict[str, Any]])`: Dictionary of form data to send in the request body. Default: `None`.
* `json (Optional[Dict[str, Any]])`: JSON data to send in the request body. Default: `None`.
* `follow_redirects (bool)`: Whether to automatically follow HTTP redirects. Default: `True`.
* `verify_ssl (bool)`: Whether to verify SSL certificates. Default: `True`.
* Methods:
* `static from_kwargs(kwargs: dict) -> HTTPCrawlerConfig`: Creates an instance from keyword arguments.
* `to_dict() -> dict`: Converts config to a dictionary.
* `clone(**kwargs) -> HTTPCrawlerConfig`: Creates a copy with updated values.
* `dump() -> dict`: Serializes the config to a dictionary.
* `static load(data: dict) -> HTTPCrawlerConfig`: Deserializes from a dictionary.
* 3.3.4. Class `LLMConfig`
* Purpose: Configures settings for interacting with Large Language Models, including provider choice, API credentials, and generation parameters.
* Initialization (`__init__`):
```python
def __init__(
self,
provider: str = DEFAULT_PROVIDER,
api_token: Optional[str] = None,
base_url: Optional[str] = None,
temperature: Optional[float] = None,
max_tokens: Optional[int] = None,
top_p: Optional[float] = None,
frequency_penalty: Optional[float] = None,
presence_penalty: Optional[float] = None,
stop: Optional[List[str]] = None,
n: Optional[int] = None,
):
```
* Key Parameters:
* `provider (str)`: Name of the LLM provider (e.g., "openai/gpt-4o", "ollama/llama3.3", "groq/llama3-8b-8192"). Default: `DEFAULT_PROVIDER` (from `crawl4ai.config`).
* `api_token (Optional[str])`: API token for the LLM provider. If prefixed with "env:", it reads from the specified environment variable (e.g., "env:OPENAI_API_KEY"). If not provided, it attempts to load from default environment variables based on the provider.
* `base_url (Optional[str])`: Custom base URL for the LLM API endpoint.
* `temperature (Optional[float])`: Sampling temperature for generation.
* `max_tokens (Optional[int])`: Maximum number of tokens to generate.
* `top_p (Optional[float])`: Nucleus sampling parameter.
* `frequency_penalty (Optional[float])`: Penalty for token frequency.
* `presence_penalty (Optional[float])`: Penalty for token presence.
* `stop (Optional[List[str]])`: List of stop sequences for generation.
* `n (Optional[int])`: Number of completions to generate.
* Methods:
* `static from_kwargs(kwargs: dict) -> LLMConfig`: Creates an instance from keyword arguments.
* `to_dict() -> dict`: Converts config to a dictionary.
* `clone(**kwargs) -> LLMConfig`: Creates a copy with updated values.
## 4. Core Data Models (Results & Payloads from `crawl4ai.models`)
* 4.1. Class `CrawlResult(BaseModel)`
* Purpose: A Pydantic model representing the comprehensive result of a single crawl and processing operation.
* Key Fields:
* `url (str)`: The final URL that was crawled (after any redirects).
* `html (str)`: The raw HTML content fetched from the URL.
* `success (bool)`: `True` if the crawl operation (fetching and initial processing) was successful, `False` otherwise.
* `cleaned_html (Optional[str])`: HTML content after sanitization and removal of unwanted tags/attributes as per configuration. Default: `None`.
* `_markdown (Optional[MarkdownGenerationResult])`: (Private Attribute) Holds the `MarkdownGenerationResult` object if Markdown generation was performed. Use the `markdown` property to access. Default: `None`.
* `markdown (Optional[Union[str, MarkdownGenerationResult]])`: (Property) Provides access to Markdown content. Behaves as a string (raw markdown) by default but allows access to `MarkdownGenerationResult` attributes (e.g., `result.markdown.fit_markdown`).
* `extracted_content (Optional[str])`: JSON string representation of structured data extracted by an `ExtractionStrategy`. Default: `None`.
* `media (Media)`: An object containing lists of `MediaItem` for images, videos, audio, and extracted tables. Default: `Media()`.
* `links (Links)`: An object containing lists of `Link` for internal and external hyperlinks found on the page. Default: `Links()`.
* `downloaded_files (Optional[List[str]])`: A list of file paths if any files were downloaded during the crawl. Default: `None`.
* `js_execution_result (Optional[Dict[str, Any]])`: The result of any JavaScript code executed on the page. Default: `None`.
* `screenshot (Optional[str])`: Base64 encoded string of the page screenshot, if `screenshot=True` was set. Default: `None`.
* `pdf (Optional[bytes])`: Raw bytes of the PDF generated from the page, if `pdf=True` was set. Default: `None`.
* `mhtml (Optional[str])`: MHTML snapshot of the page, if `capture_mhtml=True` was set. Default: `None`.
* `metadata (Optional[dict])`: Dictionary of metadata extracted from the page (e.g., title, description, OpenGraph tags, Twitter card data). Default: `None`.
* `error_message (Optional[str])`: A message describing the error if `success` is `False`. Default: `None`.
* `session_id (Optional[str])`: The session ID used for this crawl, if applicable. Default: `None`.
* `response_headers (Optional[dict])`: HTTP response headers from the server. Default: `None`.
* `status_code (Optional[int])`: HTTP status code of the response. Default: `None`.
* `ssl_certificate (Optional[SSLCertificate])`: Information about the SSL certificate if `fetch_ssl_certificate=True`. Default: `None`.
* `dispatch_result (Optional[DispatchResult])`: Metadata about the task execution from the dispatcher (e.g., timings, memory usage). Default: `None`.
* `redirected_url (Optional[str])`: The original URL if the request was redirected. Default: `None`.
* `network_requests (Optional[List[Dict[str, Any]]])`: List of captured network requests if `capture_network_requests=True`. Default: `None`.
* `console_messages (Optional[List[Dict[str, Any]]])`: List of captured browser console messages if `capture_console_messages=True`. Default: `None`.
* Methods:
* `model_dump(*args, **kwargs)`: Serializes the `CrawlResult` model to a dictionary, ensuring the `_markdown` private attribute is correctly handled and included as "markdown" in the output if present.
* 4.2. Class `MarkdownGenerationResult(BaseModel)`
* Purpose: A Pydantic model that holds various forms of Markdown generated from HTML content.
* Fields:
* `raw_markdown (str)`: The basic, direct conversion of HTML to Markdown.
* `markdown_with_citations (str)`: Markdown content with inline citations (e.g., [^1^]) and a references section.
* `references_markdown (str)`: The Markdown content for the "References" section, listing all cited links.
* `fit_markdown (Optional[str])`: Markdown generated specifically from content deemed "relevant" by a content filter (like `PruningContentFilter` or `LLMContentFilter`), if such a filter was applied. Default: `None`.
* `fit_html (Optional[str])`: The filtered HTML content that was used to generate `fit_markdown`. Default: `None`.
* Methods:
* `__str__(self) -> str`: Returns `self.raw_markdown` when the object is cast to a string.
* 4.3. Class `ScrapingResult(BaseModel)`
* Purpose: A Pydantic model representing a standardized output from content scraping strategies.
* Fields:
* `cleaned_html (str)`: The primary sanitized and processed HTML content.
* `success (bool)`: Indicates if the scraping operation was successful.
* `media (Media)`: A `Media` object containing extracted images, videos, audio, and tables.
* `links (Links)`: A `Links` object containing extracted internal and external links.
* `metadata (Dict[str, Any])`: A dictionary of metadata extracted from the page (e.g., title, description).
* 4.4. Class `MediaItem(BaseModel)`
* Purpose: A Pydantic model representing a generic media item like an image, video, or audio file.
* Fields:
* `src (Optional[str])`: The source URL of the media item. Default: `""`.
* `data (Optional[str])`: Base64 encoded data for inline media. Default: `""`.
* `alt (Optional[str])`: Alternative text for the media item (e.g., image alt text). Default: `""`.
* `desc (Optional[str])`: A description or surrounding text related to the media item. Default: `""`.
* `score (Optional[int])`: A relevance or importance score, if calculated by a strategy. Default: `0`.
* `type (str)`: The type of media (e.g., "image", "video", "audio"). Default: "image".
* `group_id (Optional[int])`: An identifier to group related media variants (e.g., different resolutions of the same image from a srcset). Default: `0`.
* `format (Optional[str])`: The detected file format (e.g., "jpeg", "png", "mp4"). Default: `None`.
* `width (Optional[int])`: The width of the media item in pixels, if available. Default: `None`.
* 4.5. Class `Link(BaseModel)`
* Purpose: A Pydantic model representing an extracted hyperlink.
* Fields:
* `href (Optional[str])`: The URL (href attribute) of the link. Default: `""`.
* `text (Optional[str])`: The anchor text of the link. Default: `""`.
* `title (Optional[str])`: The title attribute of the link, if present. Default: `""`.
* `base_domain (Optional[str])`: The base domain extracted from the `href`. Default: `""`.
* 4.6. Class `Media(BaseModel)`
* Purpose: A Pydantic model that acts as a container for lists of different types of media items found on a page.
* Fields:
* `images (List[MediaItem])`: A list of `MediaItem` objects representing images. Default: `[]`.
* `videos (List[MediaItem])`: A list of `MediaItem` objects representing videos. Default: `[]`.
* `audios (List[MediaItem])`: A list of `MediaItem` objects representing audio files. Default: `[]`.
* `tables (List[Dict])`: A list of dictionaries, where each dictionary represents an extracted HTML table with keys like "headers", "rows", "caption", "summary". Default: `[]`.
* 4.7. Class `Links(BaseModel)`
* Purpose: A Pydantic model that acts as a container for lists of internal and external links.
* Fields:
* `internal (List[Link])`: A list of `Link` objects considered internal to the crawled site. Default: `[]`.
* `external (List[Link])`: A list of `Link` objects pointing to external sites. Default: `[]`.
* 4.8. Class `AsyncCrawlResponse(BaseModel)`
* Purpose: A Pydantic model representing the raw response from a crawler strategy's `crawl` method. This data is then processed further to create a `CrawlResult`.
* Fields:
* `html (str)`: The raw HTML content of the page.
* `response_headers (Dict[str, str])`: A dictionary of HTTP response headers.
* `js_execution_result (Optional[Dict[str, Any]])`: The result from any JavaScript code executed on the page. Default: `None`.
* `status_code (int)`: The HTTP status code of the response.
* `screenshot (Optional[str])`: Base64 encoded screenshot data, if captured. Default: `None`.
* `pdf_data (Optional[bytes])`: Raw PDF data, if captured. Default: `None`.
* `mhtml_data (Optional[str])`: MHTML snapshot data, if captured. Default: `None`.
* `downloaded_files (Optional[List[str]])`: A list of local file paths for any files downloaded during the crawl. Default: `None`.
* `ssl_certificate (Optional[SSLCertificate])`: SSL certificate information for the site. Default: `None`.
* `redirected_url (Optional[str])`: The original URL requested if the final URL is a result of redirection. Default: `None`.
* `network_requests (Optional[List[Dict[str, Any]]])`: Captured network requests if enabled. Default: `None`.
* `console_messages (Optional[List[Dict[str, Any]]])`: Captured console messages if enabled. Default: `None`.
* 4.9. Class `TokenUsage(BaseModel)`
* Purpose: A Pydantic model to track token usage statistics for interactions with Large Language Models.
* Fields:
* `completion_tokens (int)`: Number of tokens used for the LLM's completion/response. Default: `0`.
* `prompt_tokens (int)`: Number of tokens used for the input prompt to the LLM. Default: `0`.
* `total_tokens (int)`: Total number of tokens used (prompt + completion). Default: `0`.
* `completion_tokens_details (Optional[dict])`: Provider-specific detailed breakdown of completion tokens. Default: `None`.
* `prompt_tokens_details (Optional[dict])`: Provider-specific detailed breakdown of prompt tokens. Default: `None`.
* 4.10. Class `SSLCertificate(dict)` (from `crawl4ai.ssl_certificate`)
* Purpose: Represents an SSL certificate's information, behaving like a dictionary for direct JSON serialization and easy access to its fields.
* Key Fields (accessed as dictionary keys):
* `subject (dict)`: Dictionary of subject fields (e.g., `{"CN": "example.com", "O": "Example Inc."}`).
* `issuer (dict)`: Dictionary of issuer fields.
* `version (int)`: Certificate version number.
* `serial_number (str)`: Certificate serial number (hexadecimal string).
* `not_before (str)`: Validity start date and time (ASN.1/UTC format string, e.g., "YYYYMMDDHHMMSSZ").
* `not_after (str)`: Validity end date and time (ASN.1/UTC format string).
* `fingerprint (str)`: SHA-256 fingerprint of the certificate (lowercase hex string).
* `signature_algorithm (str)`: The algorithm used to sign the certificate (e.g., "sha256WithRSAEncryption").
* `raw_cert (str)`: Base64 encoded string of the raw DER-encoded certificate.
* `extensions (List[dict])`: A list of dictionaries, each representing a certificate extension with "name" and "value" keys.
* Static Methods:
* `from_url(url: str, timeout: int = 10) -> Optional[SSLCertificate]`: Fetches the SSL certificate from the given URL and returns an `SSLCertificate` instance, or `None` on failure.
* Instance Methods:
* `to_json(filepath: Optional[str] = None) -> Optional[str]`: Exports the certificate information as a JSON string. If `filepath` is provided, writes to the file and returns `None`.
* `to_pem(filepath: Optional[str] = None) -> Optional[str]`: Exports the certificate in PEM format as a string. If `filepath` is provided, writes to the file and returns `None`.
* `to_der(filepath: Optional[str] = None) -> Optional[bytes]`: Exports the raw certificate in DER format as bytes. If `filepath` is provided, writes to the file and returns `None`.
* Example:
```python
# Assuming 'cert' is an SSLCertificate instance
# print(cert["subject"]["CN"])
# cert.to_pem("my_cert.pem")
```
* 4.11. Class `DispatchResult(BaseModel)`
* Purpose: Contains metadata about a task's execution when processed by a dispatcher (e.g., in `arun_many`).
* Fields:
* `task_id (str)`: A unique identifier for the dispatched task.
* `memory_usage (float)`: Memory usage (in MB) recorded during the task's execution.
* `peak_memory (float)`: Peak memory usage (in MB) recorded during the task's execution.
* `start_time (Union[datetime, float])`: The start time of the task (can be a `datetime` object or a Unix timestamp float).
* `end_time (Union[datetime, float])`: The end time of the task.
* `error_message (str)`: Any error message if the task failed during dispatch or execution. Default: `""`.
* 4.12. `CrawlResultContainer(Generic[CrawlResultT])`
* Purpose: A generic container for `CrawlResult` objects, primarily used as the return type for `arun_many` when `stream=False`. It behaves like a list, allowing iteration, indexing, and length checking.
* Methods:
* `__iter__(self)`: Allows iteration over the contained `CrawlResult` objects.
* `__getitem__(self, index)`: Allows accessing `CrawlResult` objects by index.
* `__len__(self)`: Returns the number of `CrawlResult` objects contained.
* `__repr__(self)`: Provides a string representation of the container.
* Attribute:
* `_results (List[CrawlResultT])`: The internal list holding the `CrawlResult` objects.
* 4.13. `RunManyReturn` (Type Alias from `crawl4ai.models`)
* Purpose: A type alias defining the possible return types for the `arun_many` method of `AsyncWebCrawler`.
* Definition: `Union[CrawlResultContainer[CrawlResult], AsyncGenerator[CrawlResult, None]]`
* This means `arun_many` will return a `CrawlResultContainer` (a list-like object of all `CrawlResult` instances) if `CrawlerRunConfig.stream` is `False` (the default).
* It will return an `AsyncGenerator` yielding individual `CrawlResult` instances if `CrawlerRunConfig.stream` is `True`.
## 5. Core Crawler Strategies (from `crawl4ai.async_crawler_strategy`)
* 5.1. Abstract Base Class `AsyncCrawlerStrategy(ABC)`
* Purpose: Defines the common interface that all asynchronous crawler strategies must implement. This allows `AsyncWebCrawler` to use different fetching mechanisms (e.g., Playwright, HTTP requests) interchangeably.
* Initialization (`__init__`):
```python
def __init__(self, browser_config: BrowserConfig, logger: AsyncLoggerBase):
```
* Parameters:
* `browser_config (BrowserConfig)`: The browser configuration to be used by the strategy.
* `logger (AsyncLoggerBase)`: The logger instance for logging strategy-specific events.
* Key Abstract Methods (must be implemented by concrete subclasses):
* `async crawl(self, url: str, config: CrawlerRunConfig) -> AsyncCrawlResponse`:
* Purpose: Fetches the content from the given URL according to the `config`.
* Returns: An `AsyncCrawlResponse` object containing the raw fetched data.
* `async __aenter__(self)`:
* Purpose: Asynchronous context manager entry, typically for initializing resources (e.g., launching a browser).
* `async __aexit__(self, exc_type, exc_val, exc_tb)`:
* Purpose: Asynchronous context manager exit, for cleaning up resources.
* Key Concrete Methods (available to all strategies):
* `set_custom_headers(self, headers: dict) -> None`:
* Purpose: Sets custom HTTP headers to be used by the strategy for subsequent requests.
* `update_user_agent(self, user_agent: str) -> None`:
* Purpose: Updates the User-Agent string used by the strategy.
* `set_hook(self, hook_name: str, callback: Callable) -> None`:
* Purpose: Registers a callback function for a specific hook point in the crawling lifecycle.
* `async run_hook(self, hook_name: str, *args, **kwargs) -> Any`:
* Purpose: Executes a registered hook with the given arguments.
* `async get_default_context(self) -> BrowserContext`:
* Purpose: Retrieves the default browser context (Playwright specific, might raise `NotImplementedError` in non-Playwright strategies).
* `async create_new_page(self, context: BrowserContext) -> Page`:
* Purpose: Creates a new page within a given browser context (Playwright specific).
* `async get_page(self, url: str, config: CrawlerRunConfig, session_id: Optional[str]) -> Tuple[Page, BrowserContext]`:
* Purpose: Gets an existing page/context for a session or creates a new one (Playwright specific, managed by `BrowserManager`).
* `async close_page(self, page: Page, session_id: Optional[str]) -> None`:
* Purpose: Closes a page, potentially keeping the associated context/session alive (Playwright specific).
* `async kill_session(self, session_id: str) -> None`:
* Purpose: Kills (closes) a specific browser session, including its page and context (Playwright specific).
* 5.2. Class `AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy)`
* Purpose: The default crawler strategy, using Playwright to control a web browser for fetching and interacting with web pages. It supports complex JavaScript execution and provides hooks for various stages of the crawl.
* Initialization (`__init__`):
```python
def __init__(
self,
browser_config: Optional[BrowserConfig] = None,
logger: Optional[AsyncLoggerBase] = None,
browser_manager: Optional[BrowserManager] = None
):
```
* Parameters:
* `browser_config (Optional[BrowserConfig])`: Browser configuration. Defaults to a new `BrowserConfig()` if not provided.
* `logger (Optional[AsyncLoggerBase])`: Logger instance. Defaults to a new `AsyncLogger()`.
* `browser_manager (Optional[BrowserManager])`: An instance of `BrowserManager` to manage browser lifecycles and contexts. If `None`, a new `BrowserManager` is created internally.
* Key Overridden/Implemented Methods:
* `async crawl(self, url: str, config: CrawlerRunConfig) -> AsyncCrawlResponse`:
* Purpose: Implements the crawling logic using Playwright. It navigates to the URL, executes JavaScript if specified, waits for conditions, captures screenshots/PDFs if requested, and returns the page content and other metadata.
* `async aprocess_html(self, url: str, html: str, config: CrawlerRunConfig, **kwargs) -> CrawlResult`:
* Purpose: (Note: While `AsyncWebCrawler` calls this, the default implementation is in `AsyncPlaywrightCrawlerStrategy` for convenience, acting as a bridge to the scraping strategy.) Processes the fetched HTML to produce a `CrawlResult`. This involves using the `scraping_strategy` from the `config` (defaults to `WebScrapingStrategy`) to clean HTML, extract media/links, and then uses the `markdown_generator` to produce Markdown.
* Specific Public Methods:
* `async create_new_context(self, config: Optional[CrawlerRunConfig] = None) -> BrowserContext`:
* Purpose: Creates a new Playwright `BrowserContext` based on the global `BrowserConfig` and optional overrides from `CrawlerRunConfig`.
* `async setup_context_default(self, context: BrowserContext, config: Optional[CrawlerRunConfig] = None) -> None`:
* Purpose: Applies default settings to a `BrowserContext`, such as viewport size, user agent, custom headers, locale, timezone, and geolocation, based on `BrowserConfig` and `CrawlerRunConfig`.
* `async setup_context_hooks(self, context: BrowserContext, config: CrawlerRunConfig) -> None`:
* Purpose: Sets up event listeners on the context for capturing network requests and console messages if `config.capture_network_requests` or `config.capture_console_messages` is `True`.
* `async handle_storage_state(self, context: BrowserContext, config: CrawlerRunConfig) -> None`:
* Purpose: Loads cookies and localStorage from a `storage_state` file or dictionary (specified in `BrowserConfig` or `CrawlerRunConfig`) into the given `BrowserContext`.
* Hooks (Registered via `set_hook(hook_name, callback)` and executed by `run_hook`):
* `on_browser_created`: Called after the Playwright browser instance is launched or connected. Callback receives `(browser, **kwargs)`.
* `on_page_context_created`: Called after a new Playwright `BrowserContext` and `Page` are created. Callback receives `(page, context, **kwargs)`.
* `before_goto`: Called just before `page.goto(url)` is executed. Callback receives `(page, context, url, **kwargs)`.
* `after_goto`: Called after `page.goto(url)` completes successfully. Callback receives `(page, context, url, response, **kwargs)`.
* `on_user_agent_updated`: Called when the User-Agent string is updated for a context. Callback receives `(page, context, user_agent, **kwargs)`.
* `on_execution_started`: Called when `js_code` execution begins on a page. Callback receives `(page, context, **kwargs)`.
* `before_retrieve_html`: Called just before the final HTML content is retrieved from the page. Callback receives `(page, context, **kwargs)`.
* `before_return_html`: Called just before the `AsyncCrawlResponse` is returned by the `crawl()` method of the strategy. Callback receives `(page, context, html_content, **kwargs)`.
* 5.3. Class `AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy)`
* Purpose: A lightweight crawler strategy that uses direct HTTP requests (via `httpx`) instead of a full browser. Suitable for static sites or when JavaScript execution is not needed.
* Initialization (`__init__`):
```python
def __init__(self, http_config: Optional[HTTPCrawlerConfig] = None, logger: Optional[AsyncLoggerBase] = None):
```
* Parameters:
* `http_config (Optional[HTTPCrawlerConfig])`: Configuration for HTTP requests (method, headers, data, etc.). Defaults to a new `HTTPCrawlerConfig()`.
* `logger (Optional[AsyncLoggerBase])`: Logger instance. Defaults to a new `AsyncLogger()`.
* Key Overridden/Implemented Methods:
* `async crawl(self, url: str, http_config: Optional[HTTPCrawlerConfig] = None, **kwargs) -> AsyncCrawlResponse`:
* Purpose: Fetches content from the URL using an HTTP GET or POST request via `httpx`. Does not execute JavaScript. Returns an `AsyncCrawlResponse` with HTML, status code, and headers. Screenshot, PDF, and MHTML capabilities are not available with this strategy.
## 6. Browser Management (from `crawl4ai.browser_manager`)
* 6.1. Class `BrowserManager`
* Purpose: Manages the lifecycle of Playwright browser instances and their contexts. It handles launching/connecting to browsers, creating new contexts with specific configurations, managing sessions for page reuse, and cleaning up resources.
* Initialization (`__init__`):
```python
def __init__(self, browser_config: BrowserConfig, logger: Optional[AsyncLoggerBase] = None):
```
* Parameters:
* `browser_config (BrowserConfig)`: The global browser configuration settings.
* `logger (Optional[AsyncLoggerBase])`: Logger instance for browser management events.
* Key Methods:
* `async start() -> None`: Initializes the Playwright instance and launches or connects to the browser based on `browser_config` (e.g., launches a new browser instance or connects to an existing CDP endpoint via `ManagedBrowser`).
* `async create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> playwright.async_api.BrowserContext`: Creates a new browser context. If `crawlerRunConfig` is provided, its settings (e.g., locale, viewport, proxy) can override the global `BrowserConfig`.
* `async setup_context(self, context: playwright.async_api.BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None, is_default: bool = False) -> None`: Applies various settings to a given browser context, including headers, cookies, viewport, geolocation, permissions, and storage state, based on `BrowserConfig` and `CrawlerRunConfig`.
* `async get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[playwright.async_api.Page, playwright.async_api.BrowserContext]`: Retrieves an existing page and context for a given `session_id` (if present in `crawlerRunConfig` and the session is active) or creates a new page and context. Manages context reuse based on a signature derived from `CrawlerRunConfig` to ensure contexts with different core settings (like proxy, locale) are isolated.
* `async kill_session(self, session_id: str) -> None`: Closes the page and browser context associated with the given `session_id`, effectively ending that session.
* `async close() -> None`: Closes all managed browser contexts and the main browser instance.
* 6.2. Class `ManagedBrowser`
* Purpose: Manages the lifecycle of a single, potentially persistent, browser process. It's used when `BrowserConfig.use_managed_browser` is `True` or `BrowserConfig.use_persistent_context` is `True`. It handles launching the browser with a specific user data directory and connecting via CDP.
* Initialization (`__init__`):
```python
def __init__(
self,
browser_type: str = "chromium",
user_data_dir: Optional[str] = None,
headless: bool = False,
logger=None,
host: str = "localhost",
debugging_port: int = 9222,
cdp_url: Optional[str] = None,
browser_config: Optional[BrowserConfig] = None
):
```
* Parameters:
* `browser_type (str)`: "chromium", "firefox", or "webkit". Default: "chromium".
* `user_data_dir (Optional[str])`: Path to the user data directory for the browser profile. If `None`, a temporary directory might be created.
* `headless (bool)`: Whether to launch the browser in headless mode. Default: `False` (typically for managed/persistent scenarios).
* `logger`: Logger instance.
* `host (str)`: Host for the debugging port. Default: "localhost".
* `debugging_port (int)`: Port for the Chrome DevTools Protocol. Default: `9222`.
* `cdp_url (Optional[str])`: If provided, attempts to connect to an existing browser at this CDP URL instead of launching a new one.
* `browser_config (Optional[BrowserConfig])`: The `BrowserConfig` object providing overall browser settings.
* Key Methods:
* `async start() -> str`: Starts the browser process (if not connecting to an existing `cdp_url`). If a new browser is launched, it uses the specified `user_data_dir` and `debugging_port`.
* Returns: The CDP endpoint URL (e.g., "http://localhost:9222").
* `async cleanup() -> None`: Terminates the browser process (if launched by this instance) and removes any temporary user data directory created by it.
* Static Methods:
* `async create_profile(cls, browser_config: Optional[BrowserConfig] = None, profile_name: Optional[str] = None, logger=None) -> str`:
* Purpose: Launches a browser instance with a new or existing user profile, allowing interactive setup (e.g., manual login, cookie acceptance). The browser remains open until the user closes it.
* Parameters:
* `browser_config (Optional[BrowserConfig])`: Optional browser configuration to use.
* `profile_name (Optional[str])`: Name for the profile. If `None`, a default name is used.
* `logger`: Logger instance.
* Returns: The path to the created/used user data directory, which can then be passed to `BrowserConfig.user_data_dir`.
* `list_profiles(cls) -> List[str]`:
* Purpose: Lists the names of all browser profiles stored in the default Crawl4AI profiles directory (`~/.crawl4ai/profiles`).
* Returns: A list of profile name strings.
* `delete_profile(cls, profile_name_or_path: str) -> bool`:
* Purpose: Deletes a browser profile either by its name (if in the default directory) or by its full path.
* Returns: `True` if deletion was successful, `False` otherwise.
* 6.3. Function `clone_runtime_state(src: BrowserContext, dst: BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None, browserConfig: Optional[BrowserConfig] = None) -> None`
* Purpose: Asynchronously copies runtime state (cookies, localStorage, session storage) from a source `BrowserContext` to a destination `BrowserContext`. Can also apply headers and geolocation from `CrawlerRunConfig` or `BrowserConfig` to the destination context.
* Parameters:
* `src (BrowserContext)`: The source browser context.
* `dst (BrowserContext)`: The destination browser context.
* `crawlerRunConfig (Optional[CrawlerRunConfig])`: Optional run configuration to apply to `dst`.
* `browserConfig (Optional[BrowserConfig])`: Optional browser configuration to apply to `dst`.
## 7. Proxy Rotation Strategies (from `crawl4ai.proxy_strategy`)
* 7.1. Abstract Base Class `ProxyRotationStrategy(ABC)`
* Purpose: Defines the interface for strategies that provide a sequence of proxy configurations, enabling proxy rotation.
* Abstract Methods:
* `async get_next_proxy(self) -> Optional[ProxyConfig]`:
* Purpose: Asynchronously retrieves the next `ProxyConfig` from the strategy.
* Returns: A `ProxyConfig` object or `None` if no more proxies are available or an error occurs.
* `add_proxies(self, proxies: List[ProxyConfig]) -> None`:
* Purpose: Adds a list of `ProxyConfig` objects to the strategy's pool of proxies.
* 7.2. Class `RoundRobinProxyStrategy(ProxyRotationStrategy)`
* Purpose: A simple proxy rotation strategy that cycles through a list of provided proxies in a round-robin fashion.
* Initialization (`__init__`):
```python
def __init__(self, proxies: Optional[List[ProxyConfig]] = None):
```
* Parameters:
* `proxies (Optional[List[ProxyConfig]])`: An initial list of `ProxyConfig` objects. If `None`, the list is empty and proxies must be added via `add_proxies`.
* Methods:
* `add_proxies(self, proxies: List[ProxyConfig]) -> None`: Adds new `ProxyConfig` objects to the internal list of proxies and reinitializes the cycle.
* `async get_next_proxy(self) -> Optional[ProxyConfig]`: Returns the next `ProxyConfig` from the list, cycling back to the beginning when the end is reached. Returns `None` if the list is empty.
## 8. Logging (from `crawl4ai.async_logger`)
* 8.1. Abstract Base Class `AsyncLoggerBase(ABC)`
* Purpose: Defines the basic interface for an asynchronous logger. Concrete implementations should provide methods for logging messages at different levels.
* 8.2. Class `AsyncLogger(AsyncLoggerBase)`
* Purpose: The default asynchronous logger for `crawl4ai`. It provides structured logging to both the console and optionally to a file, with customizable icons, colors, and verbosity levels.
* Initialization (`__init__`):
```python
def __init__(
self,
log_file: Optional[str] = None,
verbose: bool = True,
tag_width: int = 15,
icons: Optional[Dict[str, str]] = None,
colors: Optional[Dict[LogLevel, LogColor]] = None,
log_level: LogLevel = LogLevel.INFO
):
```
* Parameters:
* `log_file (Optional[str])`: Path to a file where logs should be written. If `None`, logs only to console.
* `verbose (bool)`: If `True`, enables more detailed logging (DEBUG level). Default: `True`.
* `tag_width (int)`: Width for the tag part of the log message. Default: `15`.
* `icons (Optional[Dict[str, str]])`: Custom icons for different log tags.
* `colors (Optional[Dict[LogLevel, LogColor]])`: Custom colors for different log levels.
* `log_level (LogLevel)`: Minimum log level to output.
* Key Methods (for logging):
* `info(self, message: str, tag: Optional[str] = None, **params) -> None`: Logs an informational message.
* `warning(self, message: str, tag: Optional[str] = None, **params) -> None`: Logs a warning message.
* `error(self, message: str, tag: Optional[str] = None, **params) -> None`: Logs an error message.
* `debug(self, message: str, tag: Optional[str] = None, **params) -> None`: Logs a debug message (only if `verbose=True` or `log_level` is DEBUG).
* `url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", **params) -> None`: Logs the status of a URL fetch operation, including success/failure and timing.
* `error_status(self, url: str, error: str, tag: str = "ERROR", **params) -> None`: Logs an error encountered for a specific URL.
## 9. Core Utility Functions (from `crawl4ai.async_configs`)
* 9.1. `to_serializable_dict(obj: Any, ignore_default_value: bool = False) -> Dict`
* Purpose: Recursively converts a Python object (often a Pydantic model or a dataclass instance used for configuration) into a dictionary that is safe for JSON serialization. It handles nested objects, enums, and basic types.
* Parameters:
* `obj (Any)`: The object to be serialized.
* `ignore_default_value (bool)`: If `True`, fields whose current value is the same as their default value (if applicable, e.g., for Pydantic models) might be omitted from the resulting dictionary. Default: `False`.
* Returns: `Dict` - A JSON-serializable dictionary representation of the object.
* 9.2. `from_serializable_dict(data: Any) -> Any`
* Purpose: Recursively reconstructs Python objects from a dictionary representation (typically one created by `to_serializable_dict`). It attempts to instantiate classes based on a "type" key in the dictionary if present.
* Parameters:
* `data (Any)`: The dictionary (or basic type) to be deserialized.
* Returns: `Any` - The reconstructed Python object or the original data if no special deserialization rule applies.
* 9.3. `is_empty_value(value: Any) -> bool`
* Purpose: Checks if a given value is considered "empty" (e.g., `None`, an empty string, an empty list, an empty dictionary).
* Returns: `bool` - `True` if the value is empty, `False` otherwise.
## 10. Enumerations (Key Enums used in Core)
* 10.1. `CacheMode` (from `crawl4ai.cache_context`, defined in `crawl4ai.async_configs` as per provided code)
* Purpose: Defines the caching behavior for crawl operations.
* Members:
* `ENABLE`: (Value: "enable") Normal caching behavior; read from cache if available, write to cache after fetching.
* `DISABLE`: (Value: "disable") No caching at all; always fetch fresh content and do not write to cache.
* `READ_ONLY`: (Value: "read_only") Only read from the cache; do not write new or updated content to the cache.
* `WRITE_ONLY`: (Value: "write_only") Only write to the cache after fetching; do not read from the cache.
* `BYPASS`: (Value: "bypass") Skip the cache entirely for this specific operation; fetch fresh content and do not write to cache. This is often the default for individual `CrawlerRunConfig` instances.
* 10.2. `DisplayMode` (from `crawl4ai.models`, used by `CrawlerMonitor`)
* Purpose: Defines the display mode for the `CrawlerMonitor`.
* Members:
* `DETAILED`: Shows detailed information for each task.
* `AGGREGATED`: Shows summary statistics and overall progress.
* 10.3. `CrawlStatus` (from `crawl4ai.models`, used by `CrawlStats`)
* Purpose: Represents the status of a crawl task.
* Members:
* `QUEUED`: Task is waiting to be processed.
* `IN_PROGRESS`: Task is currently being processed.
* `COMPLETED`: Task finished successfully.
* `FAILED`: Task failed.
## 11. Versioning
* 11.1. Accessing Library Version:
* The current version of the `crawl4ai` library can be accessed programmatically via the `__version__` attribute of the top-level `crawl4ai` package.
* Example:
```python
from crawl4ai import __version__ as crawl4ai_version
print(f"Crawl4AI Version: {crawl4ai_version}")
# Expected output based on provided code: Crawl4AI Version: 0.6.3
```
## 12. Basic Usage Examples
* 12.1. Minimal Crawl:
```python
import asyncio
from crawl4ai import AsyncWebCrawler
async def main():
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="http://example.com")
if result.success:
print("Markdown (first 300 chars):")
print(result.markdown.raw_markdown[:300]) # Accessing raw_markdown
else:
print(f"Error: {result.error_message}")
if __name__ == "__main__":
asyncio.run(main())
```
* 12.2. Crawl with Basic Configuration:
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
async def main():
browser_cfg = BrowserConfig(headless=True, browser_type="firefox")
run_cfg = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
word_count_threshold=50
)
async with AsyncWebCrawler(config=browser_cfg) as crawler:
result = await crawler.arun(url="http://example.com", config=run_cfg)
if result.success:
print(f"Status Code: {result.status_code}")
print(f"Cleaned HTML length: {len(result.cleaned_html)}")
else:
print(f"Error: {result.error_message}")
if __name__ == "__main__":
asyncio.run(main())
```
* 12.3. Accessing Links and Images from Result:
```python
import asyncio
from crawl4ai import AsyncWebCrawler
async def main():
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="http://example.com")
if result.success:
print(f"Found {len(result.links.internal)} internal links.")
if result.links.internal:
print(f"First internal link: {result.links.internal[0].href}")
print(f"Found {len(result.media.images)} images.")
if result.media.images:
print(f"First image src: {result.media.images[0].src}")
else:
print(f"Error: {result.error_message}")
if __name__ == "__main__":
asyncio.run(main())
```
```

File diff suppressed because it is too large Load Diff

View File

@@ -1,813 +0,0 @@
Okay, I have read and processed the information. I will now generate the detailed "Foundational Memory" document for the `deep_crawling` component of `crawl4ai`, based on the outline from the previous step and the provided code snippets.
```markdown
# Detailed Foundational Memory for crawl4ai - deep_crawling Component
**Target Document Type:** memory
**Target Output Filename Suggestion:** `llm_memory_deep_crawling.md`
**Library Version Context:** 0.6.3 (from `crawl4ai/__version__.py`)
**Outline Generation Date:** 2024-05-24
---
## 1. Introduction to Deep Crawling
* 1.1. Purpose: The `deep_crawling` component provides functionalities for recursively crawling web pages starting from an initial URL. It includes strategies for different traversal orders (BFS, DFS, Best-First), mechanisms for filtering which URLs to visit, and methods for scoring URLs to prioritize crawling.
* 1.2. Core Concepts:
* 1.2.1. Definition of Deep Crawling in Crawl4ai context: The process of discovering and fetching multiple web pages by following links from an initial set of URLs, adhering to specified depth, page limits, and filtering/scoring rules.
* 1.2.2. Key Abstractions:
* `DeepCrawlStrategy`: Defines the algorithm for traversing linked web pages (e.g., BFS, DFS).
* `URLFilter`: Determines whether a discovered URL should be considered for crawling.
* `URLScorer`: Assigns a score to URLs to influence crawling priority, especially in strategies like Best-First.
## 2. `DeepCrawlStrategy` Interface and Implementations
* **2.1. `DeepCrawlStrategy` (Abstract Base Class)**
* Source: `crawl4ai/deep_crawling/base_strategy.py`
* 2.1.1. Purpose: Defines the abstract base class for all deep crawling strategies, outlining the core methods required for traversal logic, resource management, URL validation, and link discovery.
* 2.1.2. Key Abstract Methods:
* `async def _arun_batch(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig) -> List[CrawlResult]`:
* Description: Core logic for batch (non-streaming) deep crawling. Processes URLs level by level (or according to strategy) and returns all results once the crawl is complete or limits are met.
* `async def _arun_stream(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig) -> AsyncGenerator[CrawlResult, None]`:
* Description: Core logic for streaming deep crawling. Processes URLs and yields `CrawlResult` objects as they become available.
* `async def shutdown(self) -> None`:
* Description: Cleans up any resources used by the deep crawl strategy, such as signaling cancellation events.
* `async def can_process_url(self, url: str, depth: int) -> bool`:
* Description: Validates a given URL and current depth against configured filters and limits to decide if it should be processed.
* `async def link_discovery(self, result: CrawlResult, source_url: str, current_depth: int, visited: Set[str], next_level: List[tuple], depths: Dict[str, int]) -> None`:
* Description: Extracts links from a `CrawlResult`, validates them using `can_process_url`, optionally scores them, and appends valid URLs (and their parent references) to the `next_level` list. Updates the `depths` dictionary for newly discovered URLs.
* 2.1.3. Key Concrete Methods:
* `async def arun(self, start_url: str, crawler: AsyncWebCrawler, config: Optional[CrawlerRunConfig] = None) -> RunManyReturn`:
* Description: Main entry point for initiating a deep crawl. It checks if a `CrawlerRunConfig` is provided and then delegates to either `_arun_stream` or `_arun_batch` based on the `config.stream` flag.
* `def __call__(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig)`:
* Description: Makes the strategy instance callable, directly invoking the `arun` method.
* 2.1.4. Attributes:
* `_cancel_event (asyncio.Event)`: Event to signal cancellation of the crawl.
* `_pages_crawled (int)`: Counter for the number of pages successfully crawled.
* **2.2. `BFSDeepCrawlStrategy`**
* Source: `crawl4ai/deep_crawling/bfs_strategy.py`
* 2.2.1. Purpose: Implements a Breadth-First Search (BFS) deep crawling strategy, exploring all URLs at the current depth level before moving to the next.
* 2.2.2. Inheritance: `DeepCrawlStrategy`
* 2.2.3. Initialization (`__init__`)
* 2.2.3.1. Signature:
```python
def __init__(
self,
max_depth: int,
filter_chain: FilterChain = FilterChain(),
url_scorer: Optional[URLScorer] = None,
include_external: bool = False,
score_threshold: float = -float('inf'),
max_pages: int = float('inf'),
logger: Optional[logging.Logger] = None,
):
```
* 2.2.3.2. Parameters:
* `max_depth (int)`: Maximum depth to crawl relative to the `start_url`.
* `filter_chain (FilterChain, default: FilterChain())`: A `FilterChain` instance to apply to discovered URLs.
* `url_scorer (Optional[URLScorer], default: None)`: An optional `URLScorer` to score URLs. If provided, URLs below `score_threshold` are skipped, and for crawls exceeding `max_pages`, higher-scored URLs are prioritized.
* `include_external (bool, default: False)`: If `True`, allows crawling of URLs from external domains.
* `score_threshold (float, default: -float('inf'))`: Minimum score (if `url_scorer` is used) for a URL to be processed.
* `max_pages (int, default: float('inf'))`: Maximum total number of pages to crawl.
* `logger (Optional[logging.Logger], default: None)`: An optional logger instance. If `None`, a default logger is created.
* 2.2.4. Key Implemented Methods:
* `_arun_batch(...)`: Implements BFS traversal by processing URLs level by level. It collects all results from a level before discovering links for the next level. All results are returned as a list upon completion.
* `_arun_stream(...)`: Implements BFS traversal, yielding `CrawlResult` objects as soon as they are processed within a level. Link discovery for the next level happens after all URLs in the current level are processed and their results yielded.
* `can_process_url(...)`: Validates URL format, applies the `filter_chain`, and checks depth limits. For the start URL (depth 0), filtering is bypassed.
* `link_discovery(...)`: Extracts internal (and optionally external) links, normalizes them, checks against `visited` set and `can_process_url`. If a `url_scorer` is present and `max_pages` limit is a concern, it scores and sorts valid links, selecting the top ones within `remaining_capacity`.
* `shutdown(...)`: Sets an internal `_cancel_event` to signal graceful termination and records the end time in `stats`.
* 2.2.5. Key Attributes/Properties:
* `stats (TraversalStats)`: [Read-only] - Instance of `TraversalStats` tracking the progress and statistics of the crawl.
* `max_depth (int)`: Maximum crawl depth.
* `filter_chain (FilterChain)`: The filter chain used.
* `url_scorer (Optional[URLScorer])`: The URL scorer used.
* `include_external (bool)`: Flag for including external URLs.
* `score_threshold (float)`: URL score threshold.
* `max_pages (int)`: Maximum pages to crawl.
* **2.3. `DFSDeepCrawlStrategy`**
* Source: `crawl4ai/deep_crawling/dfs_strategy.py`
* 2.3.1. Purpose: Implements a Depth-First Search (DFS) deep crawling strategy, exploring as far as possible along each branch before backtracking.
* 2.3.2. Inheritance: `BFSDeepCrawlStrategy` (Note: Leverages much of the `BFSDeepCrawlStrategy`'s infrastructure but overrides traversal logic to use a stack.)
* 2.3.3. Initialization (`__init__`)
* 2.3.3.1. Signature: (Same as `BFSDeepCrawlStrategy`)
```python
def __init__(
self,
max_depth: int,
filter_chain: FilterChain = FilterChain(),
url_scorer: Optional[URLScorer] = None,
include_external: bool = False,
score_threshold: float = -float('inf'),
max_pages: int = float('inf'),
logger: Optional[logging.Logger] = None,
):
```
* 2.3.3.2. Parameters: Same as `BFSDeepCrawlStrategy`.
* 2.3.4. Key Overridden/Implemented Methods:
* `_arun_batch(...)`: Implements DFS traversal using a LIFO stack. Processes one URL at a time, discovers its links, and adds them to the stack (typically in reverse order of discovery to maintain a natural DFS path). Collects all results in a list.
* `_arun_stream(...)`: Implements DFS traversal using a LIFO stack, yielding `CrawlResult` for each processed URL as it becomes available. Discovered links are added to the stack for subsequent processing.
* **2.4. `BestFirstCrawlingStrategy`**
* Source: `crawl4ai/deep_crawling/bff_strategy.py`
* 2.4.1. Purpose: Implements a Best-First Search deep crawling strategy, prioritizing URLs based on scores assigned by a `URLScorer`. It uses a priority queue to manage URLs to visit.
* 2.4.2. Inheritance: `DeepCrawlStrategy`
* 2.4.3. Initialization (`__init__`)
* 2.4.3.1. Signature:
```python
def __init__(
self,
max_depth: int,
filter_chain: FilterChain = FilterChain(),
url_scorer: Optional[URLScorer] = None,
include_external: bool = False,
max_pages: int = float('inf'),
logger: Optional[logging.Logger] = None,
):
```
* 2.4.3.2. Parameters:
* `max_depth (int)`: Maximum depth to crawl.
* `filter_chain (FilterChain, default: FilterChain())`: Chain of filters to apply.
* `url_scorer (Optional[URLScorer], default: None)`: Scorer to rank URLs. Crucial for this strategy; if not provided, URLs might effectively be processed in FIFO order (score 0).
* `include_external (bool, default: False)`: Whether to include external links.
* `max_pages (int, default: float('inf'))`: Maximum number of pages to crawl.
* `logger (Optional[logging.Logger], default: None)`: Logger instance.
* 2.4.4. Key Implemented Methods:
* `_arun_batch(...)`: Aggregates results from `_arun_best_first` into a list.
* `_arun_stream(...)`: Yields results from `_arun_best_first` as they are generated.
* `_arun_best_first(...)`: Core logic for best-first traversal. Uses an `asyncio.PriorityQueue` where items are `(score, depth, url, parent_url)`. URLs are processed in batches (default size 10) from the priority queue. Discovered links are scored and added to the queue.
* 2.4.5. Key Attributes/Properties:
* `stats (TraversalStats)`: [Read-only] - Traversal statistics object.
* `BATCH_SIZE (int)`: [Class constant, default: 10] - Number of URLs to process concurrently from the priority queue.
## 3. URL Filtering Mechanisms
* **3.1. `URLFilter` (Abstract Base Class)**
* Source: `crawl4ai/deep_crawling/filters.py`
* 3.1.1. Purpose: Defines the abstract base class for all URL filters, providing a common interface for deciding whether a URL should be processed.
* 3.1.2. Key Abstract Methods:
* `apply(self, url: str) -> bool`:
* Description: Abstract method that must be implemented by subclasses. It takes a URL string and returns `True` if the URL passes the filter (should be processed), and `False` otherwise.
* 3.1.3. Key Attributes/Properties:
* `name (str)`: [Read-only] - The name of the filter, typically the class name.
* `stats (FilterStats)`: [Read-only] - An instance of `FilterStats` to track how many URLs were processed, passed, and rejected by this filter.
* `logger (logging.Logger)`: [Read-only] - A logger instance specific to this filter, initialized lazily.
* 3.1.4. Key Concrete Methods:
* `_update_stats(self, passed: bool) -> None`: Updates the `stats` object (total, passed, rejected counts).
* **3.2. `FilterChain`**
* Source: `crawl4ai/deep_crawling/filters.py`
* 3.2.1. Purpose: Manages a sequence of `URLFilter` instances. A URL must pass all filters in the chain to be considered valid.
* 3.2.2. Initialization (`__init__`)
* 3.2.2.1. Signature:
```python
def __init__(self, filters: List[URLFilter] = None):
```
* 3.2.2.2. Parameters:
* `filters (List[URLFilter], default: None)`: An optional list of `URLFilter` instances to initialize the chain with. If `None`, an empty chain is created.
* 3.2.3. Key Public Methods:
* `add_filter(self, filter_: URLFilter) -> FilterChain`:
* Description: Adds a new `URLFilter` instance to the end of the chain.
* Returns: `(FilterChain)` - The `FilterChain` instance itself, allowing for method chaining.
* `async def apply(self, url: str) -> bool`:
* Description: Applies each filter in the chain to the given URL. If any filter returns `False` (rejects the URL), this method immediately returns `False`. If all filters pass, it returns `True`. Handles both synchronous and asynchronous `apply` methods of individual filters.
* Returns: `(bool)` - `True` if the URL passes all filters, `False` otherwise.
* 3.2.4. Key Attributes/Properties:
* `filters (Tuple[URLFilter, ...])`: [Read-only] - An immutable tuple containing the `URLFilter` instances in the chain.
* `stats (FilterStats)`: [Read-only] - An instance of `FilterStats` tracking the aggregated statistics for the entire chain (total URLs processed, passed, and rejected by the chain as a whole).
* **3.3. `URLPatternFilter`**
* Source: `crawl4ai/deep_crawling/filters.py`
* 3.3.1. Purpose: Filters URLs based on whether they match a list of specified string patterns. Supports glob-style wildcards and regular expressions.
* 3.3.2. Inheritance: `URLFilter`
* 3.3.3. Initialization (`__init__`)
* 3.3.3.1. Signature:
```python
def __init__(
self,
patterns: Union[str, Pattern, List[Union[str, Pattern]]],
use_glob: bool = True, # Deprecated, glob is always used for strings if not regex
reverse: bool = False,
):
```
* 3.3.3.2. Parameters:
* `patterns (Union[str, Pattern, List[Union[str, Pattern]]])`: A single pattern string/compiled regex, or a list of such patterns. String patterns are treated as glob patterns by default unless they are identifiable as regex (e.g., start with `^`, end with `$`, contain `\d`).
* `use_glob (bool, default: True)`: [Deprecated] This parameter's functionality is now implicitly handled by pattern detection.
* `reverse (bool, default: False)`: If `True`, the filter rejects URLs that match any of the patterns. If `False` (default), it accepts URLs that match any pattern and rejects those that don't match any.
* 3.3.4. Key Implemented Methods:
* `apply(self, url: str) -> bool`:
* Description: Checks if the URL matches any of the configured patterns. Simple suffix/prefix/domain patterns are checked first for performance. For more complex patterns, it uses `fnmatch.translate` (for glob-like strings) or compiled regex objects. The outcome is affected by the `reverse` flag.
* 3.3.5. Internal Categorization:
* `PATTERN_TYPES`: A dictionary mapping pattern types (SUFFIX, PREFIX, DOMAIN, PATH, REGEX) to integer constants.
* `_simple_suffixes (Set[str])`: Stores simple suffix patterns (e.g., `.html`).
* `_simple_prefixes (Set[str])`: Stores simple prefix patterns (e.g., `/blog/`).
* `_domain_patterns (List[Pattern])`: Stores compiled regex for domain-specific patterns (e.g., `*.example.com`).
* `_path_patterns (List[Pattern])`: Stores compiled regex for more general path patterns.
* **3.4. `ContentTypeFilter`**
* Source: `crawl4ai/deep_crawling/filters.py`
* 3.4.1. Purpose: Filters URLs based on their expected content type, primarily by inferring it from the file extension in the URL.
* 3.4.2. Inheritance: `URLFilter`
* 3.4.3. Initialization (`__init__`)
* 3.4.3.1. Signature:
```python
def __init__(
self,
allowed_types: Union[str, List[str]],
check_extension: bool = True,
ext_map: Dict[str, str] = _MIME_MAP, # _MIME_MAP is internal
):
```
* 3.4.3.2. Parameters:
* `allowed_types (Union[str, List[str]])`: A single MIME type string (e.g., "text/html") or a list of allowed MIME types. Can also be partial types like "image/" to allow all image types.
* `check_extension (bool, default: True)`: If `True` (default), the filter attempts to determine the content type by looking at the URL's file extension. If `False`, all URLs pass this filter (unless `allowed_types` is empty).
* `ext_map (Dict[str, str], default: ContentTypeFilter._MIME_MAP)`: A dictionary mapping file extensions to their corresponding MIME types. A comprehensive default map is provided.
* 3.4.4. Key Implemented Methods:
* `apply(self, url: str) -> bool`:
* Description: Extracts the file extension from the URL. If `check_extension` is `True` and an extension is found, it checks if the inferred MIME type (or the extension itself if MIME type is unknown) is among the `allowed_types`. If no extension is found, it typically allows the URL (assuming it might be an HTML page or similar).
* 3.4.5. Static Methods:
* `_extract_extension(url: str) -> str`: [Cached] Extracts the file extension from a URL path, handling query parameters and fragments.
* 3.4.6. Class Variables:
* `_MIME_MAP (Dict[str, str])`: A class-level dictionary mapping common file extensions to MIME types.
* **3.5. `DomainFilter`**
* Source: `crawl4ai/deep_crawling/filters.py`
* 3.5.1. Purpose: Filters URLs based on a whitelist of allowed domains or a blacklist of blocked domains. Supports subdomain matching.
* 3.5.2. Inheritance: `URLFilter`
* 3.5.3. Initialization (`__init__`)
* 3.5.3.1. Signature:
```python
def __init__(
self,
allowed_domains: Union[str, List[str]] = None,
blocked_domains: Union[str, List[str]] = None,
):
```
* 3.5.3.2. Parameters:
* `allowed_domains (Union[str, List[str]], default: None)`: A single domain string or a list of domain strings. If provided, only URLs whose domain (or a subdomain thereof) is in this list will pass.
* `blocked_domains (Union[str, List[str]], default: None)`: A single domain string or a list of domain strings. URLs whose domain (or a subdomain thereof) is in this list will be rejected.
* 3.5.4. Key Implemented Methods:
* `apply(self, url: str) -> bool`:
* Description: Extracts the domain from the URL. First, checks if the domain is in `_blocked_domains` (rejects if true). Then, if `_allowed_domains` is specified, checks if the domain is in that list (accepts if true). If `_allowed_domains` is not specified and the URL was not blocked, it passes.
* 3.5.5. Static Methods:
* `_normalize_domains(domains: Union[str, List[str]]) -> Set[str]`: Converts input domains to a set of lowercase strings.
* `_is_subdomain(domain: str, parent_domain: str) -> bool`: Checks if `domain` is a subdomain of (or equal to) `parent_domain`.
* `_extract_domain(url: str) -> str`: [Cached] Extracts the domain name from a URL.
* **3.6. `ContentRelevanceFilter`**
* Source: `crawl4ai/deep_crawling/filters.py`
* 3.6.1. Purpose: Filters URLs by fetching their `<head>` section, extracting text content (title, meta tags), and scoring its relevance against a given query using the BM25 algorithm.
* 3.6.2. Inheritance: `URLFilter`
* 3.6.3. Initialization (`__init__`)
* 3.6.3.1. Signature:
```python
def __init__(
self,
query: str,
threshold: float,
k1: float = 1.2,
b: float = 0.75,
avgdl: int = 1000,
):
```
* 3.6.3.2. Parameters:
* `query (str)`: The query string to assess relevance against.
* `threshold (float)`: The minimum BM25 score required for the URL to be considered relevant and pass the filter.
* `k1 (float, default: 1.2)`: BM25 k1 parameter (term frequency saturation).
* `b (float, default: 0.75)`: BM25 b parameter (length normalization).
* `avgdl (int, default: 1000)`: Assumed average document length for BM25 calculations (typically based on the head content).
* 3.6.4. Key Implemented Methods:
* `async def apply(self, url: str) -> bool`:
* Description: Asynchronously fetches the HTML `<head>` content of the URL using `HeadPeeker.peek_html`. Extracts title and meta description/keywords. Calculates the BM25 score of this combined text against the `query`. Returns `True` if the score is >= `threshold`.
* 3.6.5. Helper Methods:
* `_build_document(self, fields: Dict) -> str`: Constructs a weighted document string from title and meta tags.
* `_tokenize(self, text: str) -> List[str]`: Simple whitespace tokenizer.
* `_bm25(self, document: str) -> float`: Calculates the BM25 score.
* **3.7. `SEOFilter`**
* Source: `crawl4ai/deep_crawling/filters.py`
* 3.7.1. Purpose: Filters URLs by performing a quantitative SEO quality assessment based on the content of their `<head>` section (e.g., title length, meta description presence, canonical tags, robots meta tags, schema.org markup).
* 3.7.2. Inheritance: `URLFilter`
* 3.7.3. Initialization (`__init__`)
* 3.7.3.1. Signature:
```python
def __init__(
self,
threshold: float = 0.65,
keywords: List[str] = None,
weights: Dict[str, float] = None,
):
```
* 3.7.3.2. Parameters:
* `threshold (float, default: 0.65)`: The minimum aggregated SEO score (typically 0.0 to 1.0 range, though individual factor weights can exceed 1) required for the URL to pass.
* `keywords (List[str], default: None)`: A list of keywords to check for presence in the title.
* `weights (Dict[str, float], default: None)`: A dictionary to override default weights for various SEO factors (e.g., `{"title_length": 0.2, "canonical": 0.15}`).
* 3.7.4. Key Implemented Methods:
* `async def apply(self, url: str) -> bool`:
* Description: Asynchronously fetches the HTML `<head>` content. Calculates scores for individual SEO factors (title length, keyword presence, meta description, canonical tag, robots meta tag, schema.org presence, URL quality). Aggregates these scores using the defined `weights`. Returns `True` if the total score is >= `threshold`.
* 3.7.5. Helper Methods (Scoring Factors):
* `_score_title_length(self, title: str) -> float`
* `_score_keyword_presence(self, text: str) -> float`
* `_score_meta_description(self, desc: str) -> float`
* `_score_canonical(self, canonical: str, original: str) -> float`
* `_score_schema_org(self, html: str) -> float`
* `_score_url_quality(self, parsed_url) -> float`
* 3.7.6. Class Variables:
* `DEFAULT_WEIGHTS (Dict[str, float])`: Default weights for each SEO factor.
* **3.8. `FilterStats` Data Class**
* Source: `crawl4ai/deep_crawling/filters.py`
* 3.8.1. Purpose: A data class to track statistics for URL filtering operations, including total URLs processed, passed, and rejected.
* 3.8.2. Fields:
* `_counters (array.array)`: An array of unsigned integers storing counts for `[total, passed, rejected]`.
* 3.8.3. Properties:
* `total_urls (int)`: Returns the total number of URLs processed.
* `passed_urls (int)`: Returns the number of URLs that passed the filter.
* `rejected_urls (int)`: Returns the number of URLs that were rejected.
## 4. URL Scoring Mechanisms
* **4.1. `URLScorer` (Abstract Base Class)**
* Source: `crawl4ai/deep_crawling/scorers.py`
* 4.1.1. Purpose: Defines the abstract base class for all URL scorers. Scorers assign a numerical value to URLs, which can be used to prioritize crawling.
* 4.1.2. Key Abstract Methods:
* `_calculate_score(self, url: str) -> float`:
* Description: Abstract method to be implemented by subclasses. It takes a URL string and returns a raw numerical score.
* 4.1.3. Key Concrete Methods:
* `score(self, url: str) -> float`:
* Description: Calculates the final score for a URL by calling `_calculate_score` and multiplying the result by the scorer's `weight`. It also updates the internal `ScoringStats`.
* Returns: `(float)` - The weighted score.
* 4.1.4. Key Attributes/Properties:
* `weight (ctypes.c_float)`: [Read-write] - The weight assigned to this scorer. The raw score calculated by `_calculate_score` will be multiplied by this weight. Default is 1.0. Stored as `ctypes.c_float` for memory efficiency.
* `stats (ScoringStats)`: [Read-only] - An instance of `ScoringStats` that tracks statistics for this scorer (number of URLs scored, total score, min/max scores).
* **4.2. `KeywordRelevanceScorer`**
* Source: `crawl4ai/deep_crawling/scorers.py`
* 4.2.1. Purpose: Scores URLs based on the presence and frequency of specified keywords within the URL string itself.
* 4.2.2. Inheritance: `URLScorer`
* 4.2.3. Initialization (`__init__`)
* 4.2.3.1. Signature:
```python
def __init__(self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False):
```
* 4.2.3.2. Parameters:
* `keywords (List[str])`: A list of keyword strings to search for in the URL.
* `weight (float, default: 1.0)`: The weight to apply to the calculated score.
* `case_sensitive (bool, default: False)`: If `True`, keyword matching is case-sensitive. Otherwise, both the URL and keywords are converted to lowercase for matching.
* 4.2.4. Key Implemented Methods:
* `_calculate_score(self, url: str) -> float`:
* Description: Counts how many of the provided `keywords` are present in the `url`. The score is the ratio of matched keywords to the total number of keywords (0.0 to 1.0).
* 4.2.5. Helper Methods:
* `_url_bytes(self, url: str) -> bytes`: [Cached] Converts URL to bytes, lowercasing if not case-sensitive.
* **4.3. `PathDepthScorer`**
* Source: `crawl4ai/deep_crawling/scorers.py`
* 4.3.1. Purpose: Scores URLs based on their path depth (number of segments in the URL path). It favors URLs closer to an `optimal_depth`.
* 4.3.2. Inheritance: `URLScorer`
* 4.3.3. Initialization (`__init__`)
* 4.3.3.1. Signature:
```python
def __init__(self, optimal_depth: int = 3, weight: float = 1.0):
```
* 4.3.3.2. Parameters:
* `optimal_depth (int, default: 3)`: The path depth considered ideal. URLs at this depth get the highest score.
* `weight (float, default: 1.0)`: The weight to apply to the calculated score.
* 4.3.4. Key Implemented Methods:
* `_calculate_score(self, url: str) -> float`:
* Description: Calculates the path depth of the URL. The score is `1.0 / (1.0 + abs(depth - optimal_depth))`, meaning URLs at `optimal_depth` score 1.0, and scores decrease as depth deviates. Uses a lookup table for common small differences for speed.
* 4.3.5. Static Methods:
* `_quick_depth(path: str) -> int`: [Cached] Efficiently calculates path depth without full URL parsing.
* **4.4. `ContentTypeScorer`**
* Source: `crawl4ai/deep_crawling/scorers.py`
* 4.4.1. Purpose: Scores URLs based on their inferred content type, typically derived from the file extension.
* 4.4.2. Inheritance: `URLScorer`
* 4.4.3. Initialization (`__init__`)
* 4.4.3.1. Signature:
```python
def __init__(self, type_weights: Dict[str, float], weight: float = 1.0):
```
* 4.4.3.2. Parameters:
* `type_weights (Dict[str, float])`: A dictionary mapping file extensions (e.g., "html", "pdf") or MIME type patterns (e.g., "text/html", "image/") to scores. Patterns ending with '$' are treated as exact extension matches.
* `weight (float, default: 1.0)`: The weight to apply to the calculated score.
* 4.4.4. Key Implemented Methods:
* `_calculate_score(self, url: str) -> float`:
* Description: Extracts the file extension from the URL. Looks up the score in `type_weights` first by exact extension match (if pattern ends with '$'), then by general extension. If no direct match, it might try matching broader MIME type categories if defined in `type_weights`. Returns 0.0 if no match found.
* 4.4.5. Static Methods:
* `_quick_extension(url: str) -> str`: [Cached] Efficiently extracts file extension.
* **4.5. `FreshnessScorer`**
* Source: `crawl4ai/deep_crawling/scorers.py`
* 4.5.1. Purpose: Scores URLs based on dates found within the URL string, giving higher scores to more recent dates.
* 4.5.2. Inheritance: `URLScorer`
* 4.5.3. Initialization (`__init__`)
* 4.5.3.1. Signature:
```python
def __init__(self, weight: float = 1.0, current_year: int = datetime.date.today().year): # Actual default is dynamic
```
* 4.5.3.2. Parameters:
* `weight (float, default: 1.0)`: The weight to apply to the calculated score.
* `current_year (int, default: datetime.date.today().year)`: The reference year to calculate freshness against.
* 4.5.4. Key Implemented Methods:
* `_calculate_score(self, url: str) -> float`:
* Description: Uses a regex to find year patterns (YYYY) in the URL. If multiple years are found, it uses the latest valid year. The score is higher for years closer to `current_year`, using a predefined lookup for small differences or a decay function for larger differences. If no year is found, a default score (0.5) is returned.
* 4.5.5. Helper Methods:
* `_extract_year(self, url: str) -> Optional[int]`: [Cached] Extracts the most recent valid year from the URL.
* **4.6. `DomainAuthorityScorer`**
* Source: `crawl4ai/deep_crawling/scorers.py`
* 4.6.1. Purpose: Scores URLs based on a predefined list of domain authority weights. This allows prioritizing or de-prioritizing URLs from specific domains.
* 4.6.2. Inheritance: `URLScorer`
* 4.6.3. Initialization (`__init__`)
* 4.6.3.1. Signature:
```python
def __init__(
self,
domain_weights: Dict[str, float],
default_weight: float = 0.5,
weight: float = 1.0,
):
```
* 4.6.3.2. Parameters:
* `domain_weights (Dict[str, float])`: A dictionary mapping domain names (e.g., "example.com") to their authority scores (typically between 0.0 and 1.0).
* `default_weight (float, default: 0.5)`: The score to assign to URLs whose domain is not found in `domain_weights`.
* `weight (float, default: 1.0)`: The overall weight to apply to the calculated score.
* 4.6.4. Key Implemented Methods:
* `_calculate_score(self, url: str) -> float`:
* Description: Extracts the domain from the URL. If the domain is in `_domain_weights`, its corresponding score is returned. Otherwise, `_default_weight` is returned. Prioritizes top domains for faster lookup.
* 4.6.5. Static Methods:
* `_extract_domain(url: str) -> str`: [Cached] Efficiently extracts the domain from a URL.
* **4.7. `CompositeScorer`**
* Source: `crawl4ai/deep_crawling/scorers.py`
* 4.7.1. Purpose: Combines the scores from multiple `URLScorer` instances. Each constituent scorer contributes its weighted score to the final composite score.
* 4.7.2. Inheritance: `URLScorer`
* 4.7.3. Initialization (`__init__`)
* 4.7.3.1. Signature:
```python
def __init__(self, scorers: List[URLScorer], normalize: bool = True):
```
* 4.7.3.2. Parameters:
* `scorers (List[URLScorer])`: A list of `URLScorer` instances to be combined.
* `normalize (bool, default: True)`: If `True`, the final composite score is normalized by dividing the sum of weighted scores by the number of scorers. This can help keep scores in a more consistent range.
* 4.7.4. Key Implemented Methods:
* `_calculate_score(self, url: str) -> float`:
* Description: Iterates through all scorers in its list, calls their `score(url)` method (which applies individual weights), and sums up these scores. If `normalize` is `True`, divides the total sum by the number of scorers.
* 4.7.5. Key Concrete Methods (overrides `URLScorer.score`):
* `score(self, url: str) -> float`:
* Description: Calculates the composite score and updates its own `ScoringStats`. Note: The individual scorers' stats are updated when their `score` methods are called internally.
* **4.8. `ScoringStats` Data Class**
* Source: `crawl4ai/deep_crawling/scorers.py`
* 4.8.1. Purpose: A data class to track statistics for URL scoring operations, including the number of URLs scored, total score, and min/max scores.
* 4.8.2. Fields:
* `_urls_scored (int)`: Count of URLs scored.
* `_total_score (float)`: Sum of all scores.
* `_min_score (Optional[float])`: Minimum score encountered.
* `_max_score (Optional[float])`: Maximum score encountered.
* 4.8.3. Key Methods:
* `update(self, score: float) -> None`: Updates the statistics with a new score.
* `get_average(self) -> float`: Calculates and returns the average score.
* `get_min(self) -> float`: Lazily initializes and returns the minimum score.
* `get_max(self) -> float`: Lazily initializes and returns the maximum score.
## 5. `DeepCrawlDecorator`
* Source: `crawl4ai/deep_crawling/base_strategy.py`
* 5.1. Purpose: A decorator class that transparently adds deep crawling functionality to the `AsyncWebCrawler.arun` method if a `deep_crawl_strategy` is specified in the `CrawlerRunConfig`.
* 5.2. Initialization (`__init__`)
* 5.2.1. Signature:
```python
def __init__(self, crawler: AsyncWebCrawler):
```
* 5.2.2. Parameters:
* `crawler (AsyncWebCrawler)`: The `AsyncWebCrawler` instance whose `arun` method is to be decorated.
* 5.3. `__call__` Method
* 5.3.1. Signature:
```python
@wraps(original_arun)
async def wrapped_arun(url: str, config: CrawlerRunConfig = None, **kwargs):
```
* 5.3.2. Functionality: This method wraps the original `arun` method of the `AsyncWebCrawler`.
* It checks if `config` is provided, has a `deep_crawl_strategy` set, and if `DeepCrawlDecorator.deep_crawl_active` context variable is `False` (to prevent recursion).
* If these conditions are met:
* It sets `DeepCrawlDecorator.deep_crawl_active` to `True`.
* It calls the `arun` method of the specified `config.deep_crawl_strategy`.
* It handles potential streaming results from the strategy by wrapping them in an async generator.
* Finally, it resets `DeepCrawlDecorator.deep_crawl_active` to `False`.
* If the conditions are not met, it calls the original `arun` method of the crawler.
* 5.4. Class Variable:
* `deep_crawl_active (ContextVar)`:
* Purpose: A `contextvars.ContextVar` used as a flag to indicate if a deep crawl is currently in progress for the current asynchronous context. This prevents the decorator from re-triggering deep crawling if the strategy itself calls the crawler's `arun` or `arun_many` methods.
* Default Value: `False`.
## 6. `TraversalStats` Data Model
* Source: `crawl4ai/models.py`
* 6.1. Purpose: A data class for storing and tracking statistics related to a deep crawl traversal.
* 6.2. Fields:
* `start_time (datetime)`: The timestamp (Python `datetime` object) when the traversal process began. Default: `datetime.now()`.
* `end_time (Optional[datetime])`: The timestamp when the traversal process completed. Default: `None`.
* `urls_processed (int)`: The total number of URLs that were successfully fetched and processed. Default: `0`.
* `urls_failed (int)`: The total number of URLs that resulted in an error during fetching or processing. Default: `0`.
* `urls_skipped (int)`: The total number of URLs that were skipped (e.g., due to filters, already visited, or depth limits). Default: `0`.
* `total_depth_reached (int)`: The maximum depth reached from the start URL during the crawl. Default: `0`.
* `current_depth (int)`: The current depth level being processed by the crawler (can fluctuate during the crawl, especially for BFS). Default: `0`.
## 7. Configuration for Deep Crawling (`CrawlerRunConfig`)
* Source: `crawl4ai/async_configs.py`
* 7.1. Purpose: `CrawlerRunConfig` is the primary configuration object passed to `AsyncWebCrawler.arun()` and `AsyncWebCrawler.arun_many()`. It contains various settings that control the behavior of a single crawl run, including those specific to deep crawling.
* 7.2. Relevant Fields:
* `deep_crawl_strategy (Optional[DeepCrawlStrategy])`:
* Type: `Optional[DeepCrawlStrategy]` (where `DeepCrawlStrategy` is the ABC from `crawl4ai.deep_crawling.base_strategy`)
* Default: `None`
* Description: Specifies the deep crawling strategy instance (e.g., `BFSDeepCrawlStrategy`, `DFSDeepCrawlStrategy`, `BestFirstCrawlingStrategy`) to be used for the crawl. If `None`, deep crawling is disabled, and only the initial URL(s) will be processed.
* *Note: Parameters like `max_depth`, `max_pages`, `filter_chain`, `url_scorer`, `score_threshold`, and `include_external` are not direct attributes of `CrawlerRunConfig` for deep crawling. Instead, they are passed to the constructor of the chosen `DeepCrawlStrategy` instance, which is then assigned to `CrawlerRunConfig.deep_crawl_strategy`.*
## 8. Utility Functions
* **8.1. `normalize_url_for_deep_crawl(url: str, source_url: str) -> str`**
* Source: `crawl4ai/deep_crawling/utils.py` (or `crawl4ai/utils.py` if it's a general utility)
* 8.1.1. Purpose: Normalizes a URL found during deep crawling. This typically involves resolving relative URLs against the `source_url` to create absolute URLs and removing URL fragments (`#fragment`).
* 8.1.2. Signature: `def normalize_url_for_deep_crawl(url: str, source_url: str) -> str:`
* 8.1.3. Parameters:
* `url (str)`: The URL string to be normalized.
* `source_url (str)`: The URL of the page where the `url` was discovered. This is used as the base for resolving relative paths.
* 8.1.4. Returns: `(str)` - The normalized, absolute URL without fragments.
* **8.2. `efficient_normalize_url_for_deep_crawl(url: str, source_url: str) -> str`**
* Source: `crawl4ai/deep_crawling/utils.py` (or `crawl4ai/utils.py`)
* 8.2.1. Purpose: Provides a potentially more performant version of URL normalization specifically for deep crawling scenarios, likely employing optimizations to avoid repeated or complex parsing operations. (Note: Based on the provided code, this appears to be the same as `normalize_url_for_deep_crawl` if only one is present, or it might contain specific internal optimizations not exposed differently at the API level but used by strategies).
* 8.2.2. Signature: `def efficient_normalize_url_for_deep_crawl(url: str, source_url: str) -> str:`
* 8.2.3. Parameters:
* `url (str)`: The URL string to be normalized.
* `source_url (str)`: The URL of the page where the `url` was discovered.
* 8.2.4. Returns: `(str)` - The normalized, absolute URL, typically without fragments.
## 9. PDF Processing Integration (`crawl4ai.processors.pdf`)
* 9.1. Overview of PDF processing in Crawl4ai: While not directly part of the `deep_crawling` package, PDF processing components can be used in conjunction if a deep crawl discovers PDF URLs and they need to be processed. The `PDFCrawlerStrategy` can fetch PDFs, and `PDFContentScrapingStrategy` can extract content from them.
* **9.2. `PDFCrawlerStrategy`**
* Source: `crawl4ai/processors/pdf/__init__.py`
* 9.2.1. Purpose: An `AsyncCrawlerStrategy` designed to "crawl" PDF files. In practice, this usually means downloading the PDF content. It returns a minimal `AsyncCrawlResponse` that signals to a `ContentScrapingStrategy` (like `PDFContentScrapingStrategy`) that the content is a PDF.
* 9.2.2. Inheritance: `AsyncCrawlerStrategy`
* 9.2.3. Initialization (`__init__`)
* 9.2.3.1. Signature: `def __init__(self, logger: AsyncLogger = None):`
* 9.2.3.2. Parameters:
* `logger (AsyncLogger, default: None)`: An optional logger instance.
* 9.2.4. Key Methods:
* `async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse`:
* Description: For a PDF URL, this method typically signifies that the URL points to a PDF. It constructs an `AsyncCrawlResponse` with a `Content-Type` header of `application/pdf` and a placeholder HTML. The actual PDF processing (downloading and content extraction) is usually handled by a subsequent scraping strategy.
* **9.3. `PDFContentScrapingStrategy`**
* Source: `crawl4ai/processors/pdf/__init__.py`
* 9.3.1. Purpose: A `ContentScrapingStrategy` specialized in extracting text, images (optional), and metadata from PDF files. It uses a `PDFProcessorStrategy` (like `NaivePDFProcessorStrategy`) internally.
* 9.3.2. Inheritance: `ContentScrapingStrategy`
* 9.3.3. Initialization (`__init__`)
* 9.3.3.1. Signature:
```python
def __init__(self,
save_images_locally: bool = False,
extract_images: bool = False,
image_save_dir: str = None,
batch_size: int = 4,
logger: AsyncLogger = None):
```
* 9.3.3.2. Parameters:
* `save_images_locally (bool, default: False)`: If `True`, extracted images will be saved to the local disk.
* `extract_images (bool, default: False)`: If `True`, attempts to extract images from the PDF.
* `image_save_dir (str, default: None)`: The directory where extracted images will be saved if `save_images_locally` is `True`.
* `batch_size (int, default: 4)`: The number of PDF pages to process in parallel batches (if the underlying processor supports it).
* `logger (AsyncLogger, default: None)`: An optional logger instance.
* 9.3.4. Key Methods:
* `scrape(self, url: str, html: str, **params) -> ScrapingResult`:
* Description: Takes the URL (which should point to a PDF or a local PDF path) and processes it. It downloads the PDF if it's a remote URL, then uses the internal `pdf_processor` to extract content. It formats the extracted text into basic HTML and collects image and link information.
* `async def ascrape(self, url: str, html: str, **kwargs) -> ScrapingResult`:
* Description: Asynchronous version of the `scrape` method, typically by running the synchronous `scrape` method in a separate thread.
* 9.3.5. Helper Methods:
* `_get_pdf_path(self, url: str) -> str`: Downloads a PDF from a URL to a temporary file if it's not a local path.
* **9.4. `NaivePDFProcessorStrategy`**
* Source: `crawl4ai/processors/pdf/processor.py`
* 9.4.1. Purpose: A concrete implementation of `PDFProcessorStrategy` that uses `PyPDF2` (or similar libraries if extended) to extract text, images, and metadata from PDF documents page by page or in batches.
* 9.4.2. Initialization (`__init__`)
* Signature: `def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True, save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4)`
* Parameters: [Details parameters for image extraction quality, saving, and batch processing size.]
* 9.4.3. Key Methods:
* `process(self, pdf_path: Path) -> PDFProcessResult`:
* Description: Processes a single PDF file sequentially, page by page. Extracts metadata, text, and optionally images from each page.
* `process_batch(self, pdf_path: Path) -> PDFProcessResult`:
* Description: Processes a PDF file by dividing its pages into batches and processing these batches in parallel using a thread pool, potentially speeding up extraction for large PDFs.
* 9.4.4. Helper Methods:
* `_process_page(self, page, image_dir: Optional[Path]) -> PDFPage`: Processes a single PDF page object.
* `_extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]`: Extracts images from a page.
* `_extract_links(self, page) -> List[str]`: Extracts hyperlinks from a page.
* `_extract_metadata(self, pdf_path: Path, reader=None) -> PDFMetadata`: Extracts metadata from the PDF.
* **9.5. PDF Data Models**
* Source: `crawl4ai/processors/pdf/processor.py`
* 9.5.1. `PDFMetadata`:
* Purpose: Stores metadata extracted from a PDF document.
* Fields:
* `title (Optional[str])`: The title of the PDF.
* `author (Optional[str])`: The author(s) of the PDF.
* `producer (Optional[str])`: The software used to produce the PDF.
* `created (Optional[datetime])`: The creation date of the PDF.
* `modified (Optional[datetime])`: The last modification date of the PDF.
* `pages (int)`: The total number of pages in the PDF. Default: `0`.
* `encrypted (bool)`: `True` if the PDF is encrypted, `False` otherwise. Default: `False`.
* `file_size (Optional[int])`: The size of the PDF file in bytes. Default: `None`.
* 9.5.2. `PDFPage`:
* Purpose: Stores content extracted from a single page of a PDF document.
* Fields:
* `page_number (int)`: The page number (1-indexed).
* `raw_text (str)`: The raw text extracted from the page. Default: `""`.
* `markdown (str)`: Markdown representation of the page content. Default: `""`.
* `html (str)`: Basic HTML representation of the page content. Default: `""`.
* `images (List[Dict])`: A list of dictionaries, each representing an extracted image with details like format, path/data, dimensions. Default: `[]`.
* `links (List[str])`: A list of hyperlink URLs found on the page. Default: `[]`.
* `layout (List[Dict])`: Information about the layout of text elements on the page (e.g., coordinates). Default: `[]`.
* 9.5.3. `PDFProcessResult`:
* Purpose: Encapsulates the results of processing a PDF document.
* Fields:
* `metadata (PDFMetadata)`: The metadata of the processed PDF.
* `pages (List[PDFPage])`: A list of `PDFPage` objects, one for each page processed.
* `processing_time (float)`: The time taken to process the PDF, in seconds. Default: `0.0`.
* `version (str)`: The version of the PDF processor. Default: `"1.1"`.
## 10. Version Information (`crawl4ai.__version__`)
* Source: `crawl4ai/__version__.py`
* 10.1. `__version__ (str)`: A string representing the current installed version of the `crawl4ai` library (e.g., "0.6.3").
## 11. Asynchronous Configuration (`crawl4ai.async_configs`)
* 11.1. Overview: The `crawl4ai.async_configs` module contains configuration classes used throughout the library, including those relevant for network requests like proxies (`ProxyConfig`) and general crawler/browser behavior.
* **11.2. `ProxyConfig`**
* Source: `crawl4ai/async_configs.py` (and `crawl4ai/proxy_strategy.py`)
* 11.2.1. Purpose: Represents the configuration for a single proxy server, including its address, port, and optional authentication credentials.
* 11.2.2. Initialization (`__init__`)
* 11.2.2.1. Signature:
```python
def __init__(
self,
server: str,
username: Optional[str] = None,
password: Optional[str] = None,
ip: Optional[str] = None,
):
```
* 11.2.2.2. Parameters:
* `server (str)`: The proxy server URL (e.g., "http://proxy.example.com:8080", "socks5://proxy.example.com:1080").
* `username (Optional[str], default: None)`: The username for proxy authentication, if required.
* `password (Optional[str], default: None)`: The password for proxy authentication, if required.
* `ip (Optional[str], default: None)`: Optionally, the specific IP address of the proxy server. If not provided, it's inferred from the `server` URL.
* 11.2.3. Key Static Methods:
* `from_string(proxy_str: str) -> ProxyConfig`:
* Description: Creates a `ProxyConfig` instance from a string representation. Expected format is "ip:port:username:password" or "ip:port".
* Returns: `(ProxyConfig)`
* `from_dict(proxy_dict: Dict) -> ProxyConfig`:
* Description: Creates a `ProxyConfig` instance from a dictionary.
* Returns: `(ProxyConfig)`
* `from_env(env_var: str = "PROXIES") -> List[ProxyConfig]`:
* Description: Loads a list of proxy configurations from a comma-separated string in an environment variable.
* Returns: `(List[ProxyConfig])`
* 11.2.4. Key Methods:
* `to_dict(self) -> Dict`: Converts the `ProxyConfig` instance to a dictionary.
* `clone(self, **kwargs) -> ProxyConfig`: Creates a copy of the instance, optionally updating attributes with `kwargs`.
* **11.3. `ProxyRotationStrategy` (ABC)**
* Source: `crawl4ai/proxy_strategy.py`
* 11.3.1. Purpose: Abstract base class defining the interface for proxy rotation strategies.
* 11.3.2. Key Abstract Methods:
* `async def get_next_proxy(self) -> Optional[ProxyConfig]`: Asynchronously gets the next `ProxyConfig` from the strategy.
* `def add_proxies(self, proxies: List[ProxyConfig])`: Adds a list of `ProxyConfig` objects to the strategy's pool.
* **11.4. `RoundRobinProxyStrategy`**
* Source: `crawl4ai/proxy_strategy.py`
* 11.4.1. Purpose: A simple proxy rotation strategy that cycles through a list of proxies in a round-robin fashion.
* 11.4.2. Inheritance: `ProxyRotationStrategy`
* 11.4.3. Initialization (`__init__`)
* 11.4.3.1. Signature: `def __init__(self, proxies: List[ProxyConfig] = None):`
* 11.4.3.2. Parameters:
* `proxies (List[ProxyConfig], default: None)`: An optional initial list of `ProxyConfig` objects.
* 11.4.4. Key Implemented Methods:
* `add_proxies(self, proxies: List[ProxyConfig])`: Adds new proxies to the internal list and reinitializes the cycle.
* `async def get_next_proxy(self) -> Optional[ProxyConfig]`: Returns the next proxy from the cycle. Returns `None` if no proxies are available.
## 12. HTML to Markdown Conversion (`crawl4ai.markdown_generation_strategy`)
* 12.1. `MarkdownGenerationStrategy` (ABC)
* Source: `crawl4ai/markdown_generation_strategy.py`
* 12.1.1. Purpose: Abstract base class defining the interface for strategies that convert HTML content to Markdown.
* 12.1.2. Key Abstract Methods:
* `generate_markdown(self, input_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult`:
* Description: Abstract method to convert the given `input_html` string into a `MarkdownGenerationResult` object.
* Parameters:
* `input_html (str)`: The HTML content to convert.
* `base_url (str, default: "")`: The base URL used for resolving relative links within the HTML.
* `html2text_options (Optional[Dict[str, Any]], default: None)`: Options to pass to the underlying HTML-to-text conversion library.
* `content_filter (Optional[RelevantContentFilter], default: None)`: An optional filter to apply to the HTML before Markdown conversion, potentially to extract only relevant parts.
* `citations (bool, default: True)`: If `True`, attempts to convert hyperlinks into Markdown citations with a reference list.
* `**kwargs`: Additional keyword arguments.
* Returns: `(MarkdownGenerationResult)`
* 12.2. `DefaultMarkdownGenerator`
* Source: `crawl4ai/markdown_generation_strategy.py`
* 12.2.1. Purpose: The default implementation of `MarkdownGenerationStrategy`. It uses the `CustomHTML2Text` class (an enhanced `html2text.HTML2Text`) for the primary conversion and can optionally apply a `RelevantContentFilter`.
* 12.2.2. Inheritance: `MarkdownGenerationStrategy`
* 12.2.3. Initialization (`__init__`)
* 12.2.3.1. Signature:
```python
def __init__(
self,
content_filter: Optional[RelevantContentFilter] = None,
options: Optional[Dict[str, Any]] = None,
content_source: str = "cleaned_html", # "raw_html", "fit_html"
):
```
* 12.2.3.2. Parameters:
* `content_filter (Optional[RelevantContentFilter], default: None)`: An instance of a content filter strategy (e.g., `BM25ContentFilter`, `PruningContentFilter`) to be applied to the `input_html` before Markdown conversion. If `None`, no pre-filtering is done.
* `options (Optional[Dict[str, Any]], default: None)`: A dictionary of options to configure the `CustomHTML2Text` converter (e.g., `{"body_width": 0, "ignore_links": False}`).
* `content_source (str, default: "cleaned_html")`: Specifies which HTML source to use for Markdown generation if multiple are available (e.g., from `CrawlResult`). Options: `"cleaned_html"` (default), `"raw_html"`, `"fit_html"`. This parameter is primarily used when the generator is part of a larger crawling pipeline.
* 12.2.4. Key Methods:
* `generate_markdown(self, input_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult`:
* Description: Converts HTML to Markdown. If a `content_filter` is provided (either at init or as an argument), it's applied first to get "fit_html". Then, `CustomHTML2Text` converts the chosen HTML (input_html or fit_html) to raw Markdown. If `citations` is True, links in the raw Markdown are converted to citation format.
* Returns: `(MarkdownGenerationResult)`
* `convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]`:
* Description: Parses Markdown text, identifies links, replaces them with citation markers (e.g., `[text]^(1)`), and generates a corresponding list of references.
* Returns: `(Tuple[str, str])` - A tuple containing the Markdown with citations and the Markdown string of references.
## 13. Content Filtering (`crawl4ai.content_filter_strategy`)
* 13.1. `RelevantContentFilter` (ABC)
* Source: `crawl4ai/content_filter_strategy.py`
* 13.1.1. Purpose: Abstract base class for strategies that filter HTML content to extract only the most relevant parts, typically before Markdown conversion or further processing.
* 13.1.2. Key Abstract Methods:
* `filter_content(self, html: str) -> List[str]`:
* Description: Abstract method that takes an HTML string and returns a list of strings, where each string is a chunk of HTML deemed relevant.
* 13.2. `BM25ContentFilter`
* Source: `crawl4ai/content_filter_strategy.py`
* 13.2.1. Purpose: Filters HTML content by extracting text chunks and scoring their relevance to a user query (or an inferred page query) using the BM25 algorithm.
* 13.2.2. Inheritance: `RelevantContentFilter`
* 13.2.3. Initialization (`__init__`)
* 13.2.3.1. Signature:
```python
def __init__(
self,
user_query: Optional[str] = None,
bm25_threshold: float = 1.0,
language: str = "english",
):
```
* 13.2.3.2. Parameters:
* `user_query (Optional[str], default: None)`: The query to compare content against. If `None`, the filter attempts to extract a query from the page's metadata.
* `bm25_threshold (float, default: 1.0)`: The minimum BM25 score for a text chunk to be considered relevant.
* `language (str, default: "english")`: The language used for stemming tokens.
* 13.2.4. Key Implemented Methods:
* `filter_content(self, html: str, min_word_threshold: int = None) -> List[str]`: Parses HTML, extracts text chunks (paragraphs, list items, etc.), scores them with BM25 against the query, and returns the HTML of chunks exceeding the threshold.
* 13.3. `PruningContentFilter`
* Source: `crawl4ai/content_filter_strategy.py`
* 13.3.1. Purpose: Filters HTML content by recursively pruning less relevant parts of the DOM tree based on a composite score (text density, link density, tag weights, etc.).
* 13.3.2. Inheritance: `RelevantContentFilter`
* 13.3.3. Initialization (`__init__`)
* 13.3.3.1. Signature:
```python
def __init__(
self,
user_query: Optional[str] = None,
min_word_threshold: Optional[int] = None,
threshold_type: str = "fixed", # or "dynamic"
threshold: float = 0.48,
):
```
* 13.3.3.2. Parameters:
* `user_query (Optional[str], default: None)`: [Not directly used by pruning logic but inherited].
* `min_word_threshold (Optional[int], default: None)`: Minimum word count for an element to be considered for scoring initially (default behavior might be more nuanced).
* `threshold_type (str, default: "fixed")`: Specifies how the `threshold` is applied. "fixed" uses the direct value. "dynamic" adjusts the threshold based on content characteristics.
* `threshold (float, default: 0.48)`: The score threshold for pruning. Elements below this score are removed.
* 13.3.4. Key Implemented Methods:
* `filter_content(self, html: str, min_word_threshold: int = None) -> List[str]`: Parses HTML, applies the pruning algorithm to the body, and returns the remaining significant HTML blocks as a list of strings.
* 13.4. `LLMContentFilter`
* Source: `crawl4ai/content_filter_strategy.py`
* 13.4.1. Purpose: Uses a Large Language Model (LLM) to determine the relevance of HTML content chunks based on a given instruction.
* 13.4.2. Inheritance: `RelevantContentFilter`
* 13.4.3. Initialization (`__init__`)
* 13.4.3.1. Signature:
```python
def __init__(
self,
llm_config: Optional[LLMConfig] = None,
instruction: Optional[str] = None,
chunk_token_threshold: int = CHUNK_TOKEN_THRESHOLD, # Default from config
overlap_rate: float = OVERLAP_RATE, # Default from config
word_token_rate: float = WORD_TOKEN_RATE, # Default from config
verbose: bool = False,
logger: Optional[AsyncLogger] = None,
ignore_cache: bool = True
):
```
* 13.4.3.2. Parameters:
* `llm_config (Optional[LLMConfig])`: Configuration for the LLM (provider, API key, model, etc.).
* `instruction (Optional[str])`: The instruction given to the LLM to guide content filtering (e.g., "Extract only the main article content, excluding headers, footers, and ads.").
* `chunk_token_threshold (int)`: Maximum number of tokens per chunk sent to the LLM.
* `overlap_rate (float)`: Percentage of overlap between consecutive chunks.
* `word_token_rate (float)`: Estimated ratio of words to tokens, used for chunking.
* `verbose (bool, default: False)`: Enables verbose logging for LLM operations.
* `logger (Optional[AsyncLogger], default: None)`: Custom logger instance.
* `ignore_cache (bool, default: True)`: If `True`, bypasses any LLM response caching for this operation.
* 13.4.4. Key Implemented Methods:
* `filter_content(self, html: str, ignore_cache: bool = True) -> List[str]`:
* Description: Chunks the input HTML. For each chunk, it sends a request to the configured LLM with the chunk and the `instruction`. The LLM is expected to return the relevant part of the chunk. These relevant parts are then collected and returned.
```

File diff suppressed because it is too large Load Diff

View File

@@ -1,537 +0,0 @@
```markdown
# Detailed Outline for crawl4ai - deployment Component
**Target Document Type:** memory
**Target Output Filename Suggestion:** `llm_memory_deployment.md`
**Library Version Context:** 0.6.0 (as per Dockerfile ARG `C4AI_VER` from provided `Dockerfile` content)
**Outline Generation Date:** 2025-05-24
---
## 1. Introduction to Deployment
* 1.1. Purpose: This document provides a factual reference for installing the `crawl4ai` library and deploying its server component using Docker. It covers basic and advanced library installation, various Docker deployment methods, server configuration, and an overview of the API for interaction.
* 1.2. Scope:
* Installation of the `crawl4ai` Python library.
* Setup and diagnostic commands for the library.
* Deployment of the `crawl4ai` server using Docker, including pre-built images, Docker Compose, and manual builds.
* Explanation of Dockerfile parameters and server configuration via `config.yml`.
* Details of API interaction, including the Playground UI, Python SDK, and direct REST API calls.
* Overview of additional server API endpoints and Model Context Protocol (MCP) support.
* High-level understanding of the server's internal logic relevant to users.
* The library's version numbering scheme.
## 2. Library Installation
* 2.1. **Basic Library Installation**
* 2.1.1. Standard Installation
* Command: `pip install crawl4ai`
* Purpose: Installs the core `crawl4ai` library and its essential dependencies for performing web crawling and scraping tasks. This provides the fundamental `AsyncWebCrawler` and related configuration objects.
* 2.1.2. Post-Installation Setup
* Command: `crawl4ai-setup`
* Purpose:
* Initializes the user's home directory structure for Crawl4ai (e.g., `~/.crawl4ai/cache`).
* Installs or updates necessary Playwright browsers (Chromium is installed by default) required for browser-based crawling. The `crawl4ai-setup` script internally calls `playwright install --with-deps chromium`.
* Performs OS-level checks for common missing libraries that Playwright might depend on, providing guidance if issues are found.
* Creates a default `global.yml` configuration file if one doesn't exist.
* 2.1.3. Diagnostic Check
* Command: `crawl4ai-doctor`
* Purpose:
* Verifies Python version compatibility.
* Confirms Playwright installation and browser integrity by attempting a simple crawl of `https://crawl4ai.com`.
* Inspects essential environment variables and potential library conflicts that might affect Crawl4ai's operation.
* Provides diagnostic messages indicating success or failure of these checks, with suggestions for resolving common issues.
* 2.1.4. Verification Process
* Purpose: To confirm that the basic installation and setup were successful and Crawl4ai can perform a simple crawl.
* Script Example (as inferred from `crawl4ai-doctor` logic and typical usage):
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
async def main():
browser_config = BrowserConfig(
headless=True,
browser_type="chromium",
ignore_https_errors=True,
light_mode=True,
viewport_width=1280,
viewport_height=720,
)
run_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
screenshot=True,
)
async with AsyncWebCrawler(config=browser_config) as crawler:
print("Testing crawling capabilities...")
result = await crawler.arun(url="https://crawl4ai.com", config=run_config)
if result and result.markdown:
print("✅ Crawling test passed!")
return True
else:
print("❌ Test failed: Failed to get content")
return False
if __name__ == "__main__":
asyncio.run(main())
```
* Expected Outcome: The script should print "✅ Crawling test passed!" and successfully output Markdown content from the crawled page.
* 2.2. **Advanced Library Installation (Optional Features)**
* 2.2.1. Installation of Optional Extras
* Purpose: To install additional dependencies required for specific advanced features of Crawl4ai, such as those involving machine learning models.
* Options (as defined in `pyproject.toml`):
* `pip install crawl4ai[pdf]`:
* Purpose: Installs `PyPDF2` for PDF processing capabilities.
* `pip install crawl4ai[torch]`:
* Purpose: Installs `torch`, `nltk`, and `scikit-learn`. Enables features relying on PyTorch models, such as some advanced text clustering or semantic analysis within extraction strategies.
* `pip install crawl4ai[transformer]`:
* Purpose: Installs `transformers` and `tokenizers`. Enables the use of Hugging Face Transformers models for tasks like summarization, question answering, or other advanced NLP features within Crawl4ai.
* `pip install crawl4ai[cosine]`:
* Purpose: Installs `torch`, `transformers`, and `nltk`. Specifically for features utilizing cosine similarity with embeddings (implies model usage).
* `pip install crawl4ai[sync]`:
* Purpose: Installs `selenium` for synchronous crawling capabilities (less common, as Crawl4ai primarily focuses on async).
* `pip install crawl4ai[all]`:
* Purpose: Installs all optional dependencies listed above (`PyPDF2`, `torch`, `nltk`, `scikit-learn`, `transformers`, `tokenizers`, `selenium`), providing the complete suite of Crawl4ai capabilities.
* 2.2.2. Model Pre-fetching
* Command: `crawl4ai-download-models` (maps to `crawl4ai.model_loader:main`)
* Purpose: Downloads and caches machine learning models (e.g., specific sentence transformers or classification models from Hugging Face) that are used by certain optional features, particularly those installed via `crawl4ai[transformer]` or `crawl4ai[cosine]`. This avoids runtime downloads and ensures models are available offline.
## 3. Docker Deployment (Server Mode)
* 3.1. **Prerequisites**
* 3.1.1. Docker: A working Docker installation. (Link: `https://docs.docker.com/get-docker/`)
* 3.1.2. Git: Required for cloning the `crawl4ai` repository if building locally or using Docker Compose from the repository. (Link: `https://git-scm.com/book/en/v2/Getting-Started-Installing-Git`)
* 3.1.3. RAM Requirements:
        *   Minimum: 2GB for the basic server without intensive LLM tasks. The `Dockerfile` HEALTHCHECK indicates a warning if less than 2GB RAM is available.
* Recommended for LLM support: 4GB+ (as specified in `docker-compose.yml` limits).
* Shared Memory (`/dev/shm`): Recommended size is 1GB (`--shm-size=1g`) for optimal Chromium browser performance, as specified in `docker-compose.yml` and run commands.
* 3.2. **Installation Options**
* 3.2.1. **Using Pre-built Images from Docker Hub**
* 3.2.1.1. Image Source: `unclecode/crawl4ai:<tag>`
* Explanation of `<tag>`:
* `latest`: Points to the most recent stable release of Crawl4ai.
* Specific version tags (e.g., `0.6.0`, `0.5.1`): Correspond to specific library releases.
* Pre-release tags (e.g., `0.6.0-rc1`, `0.7.0-devN`): Development or release candidate versions for testing.
* 3.2.1.2. Pulling the Image
* Command: `docker pull unclecode/crawl4ai:<tag>` (e.g., `docker pull unclecode/crawl4ai:latest`)
* 3.2.1.3. Environment Setup (`.llm.env`)
* File Name: `.llm.env` (to be created by the user in the directory where `docker run` or `docker-compose` commands are executed).
* Purpose: To securely provide API keys for various LLM providers used by Crawl4ai for features like LLM-based extraction or Q&A.
* Example Content (based on `docker-compose.yml`):
```env
OPENAI_API_KEY=your_openai_api_key
DEEPSEEK_API_KEY=your_deepseek_api_key
ANTHROPIC_API_KEY=your_anthropic_api_key
GROQ_API_KEY=your_groq_api_key
TOGETHER_API_KEY=your_together_api_key
MISTRAL_API_KEY=your_mistral_api_key
GEMINI_API_TOKEN=your_gemini_api_token
```
* Creation: Users should create this file and populate it with their API keys. An example (`.llm.env.example`) might be provided in the repository.
* 3.2.1.4. Running the Container
* Basic Run (without LLM support):
* Command: `docker run -d -p 11235:11235 --shm-size=1g --name crawl4ai-server unclecode/crawl4ai:<tag>`
* Port Mapping: `-p 11235:11235` maps port 11235 on the host to port 11235 in the container (default server port).
* Shared Memory: `--shm-size=1g` allocates 1GB of shared memory for the browser.
* Run with LLM Support (mounting `.llm.env`):
* Command: `docker run -d -p 11235:11235 --env-file .llm.env --shm-size=1g --name crawl4ai-server unclecode/crawl4ai:<tag>`
* 3.2.1.5. Stopping the Container
* Command: `docker stop crawl4ai-server`
* Command (to remove): `docker rm crawl4ai-server`
* 3.2.1.6. Docker Hub Versioning:
* Docker image tags on Docker Hub (e.g., `unclecode/crawl4ai:0.6.0`) directly correspond to `crawl4ai` library releases. The `latest` tag usually points to the most recent stable release. Pre-release tags include suffixes like `-devN`, `-aN`, `-bN`, or `-rcN`.
* 3.2.2. **Using Docker Compose (`docker-compose.yml`)**
* 3.2.2.1. Cloning the Repository
* Command: `git clone https://github.com/unclecode/crawl4ai.git`
* Command: `cd crawl4ai`
* 3.2.2.2. Environment Setup (`.llm.env`)
* File Name: `.llm.env` (should be created in the root of the cloned `crawl4ai` repository).
* Purpose: Same as above, to provide LLM API keys.
* 3.2.2.3. Running Pre-built Images
* Command: `docker-compose up -d`
* Behavior: Uses the image specified in `docker-compose.yml` (e.g., `${IMAGE:-unclecode/crawl4ai}:${TAG:-latest}`).
* Overriding image tag: `TAG=0.6.0 docker-compose up -d` or `IMAGE=mycustom/crawl4ai TAG=mytag docker-compose up -d`.
* 3.2.2.4. Building Locally with Docker Compose
* Command: `docker-compose up -d --build`
* Build Arguments (passed from environment variables to `docker-compose.yml` which then passes to `Dockerfile`):
* `INSTALL_TYPE`: (e.g., `default`, `torch`, `all`)
* Purpose: To include optional Python dependencies during the Docker image build process.
* Example: `INSTALL_TYPE=all docker-compose up -d --build`
* `ENABLE_GPU`: (e.g., `true`, `false`)
* Purpose: To include GPU support (e.g., CUDA toolkits) in the Docker image if the build hardware and target runtime support it.
* Example: `ENABLE_GPU=true docker-compose up -d --build`
* 3.2.2.5. Stopping Docker Compose Services
* Command: `docker-compose down`
* 3.2.3. **Manual Local Build & Run**
* 3.2.3.1. Cloning the Repository: (As above)
* 3.2.3.2. Environment Setup (`.llm.env`): (As above)
* 3.2.3.3. Building with `docker buildx`
* Command Example:
```bash
docker buildx build --platform linux/amd64,linux/arm64 \
--build-arg C4AI_VER=0.6.0 \
--build-arg INSTALL_TYPE=all \
--build-arg ENABLE_GPU=false \
--build-arg USE_LOCAL=true \
-t my-crawl4ai-image:custom .
```
* Purpose of `docker buildx`: A Docker CLI plugin that extends the `docker build` command with full support for BuildKit builder capabilities, including multi-architecture builds.
* Explanation of `--platform`: Specifies the target platform(s) for the build (e.g., `linux/amd64`, `linux/arm64`).
* Explanation of `--build-arg`: Passes build-time variables defined in the `Dockerfile` (see section 3.3).
* 3.2.3.4. Running the Custom-Built Container
* Basic Run: `docker run -d -p 11235:11235 --shm-size=1g --name my-crawl4ai-server my-crawl4ai-image:custom`
* Run with LLM Support: `docker run -d -p 11235:11235 --env-file .llm.env --shm-size=1g --name my-crawl4ai-server my-crawl4ai-image:custom`
* 3.2.3.5. Stopping the Container: (As above)
* 3.3. **Dockerfile Parameters (`ARG` values)**
* 3.3.1. `C4AI_VER`: (Default: `0.6.0`)
* Role: Specifies the version of the `crawl4ai` library. Used for labeling the image and potentially for version-specific logic.
* 3.3.2. `APP_HOME`: (Default: `/app`)
* Role: Defines the working directory inside the Docker container where the application code and related files are stored and executed.
* 3.3.3. `GITHUB_REPO`: (Default: `https://github.com/unclecode/crawl4ai.git`)
* Role: The URL of the GitHub repository to clone if `USE_LOCAL` is set to `false`.
* 3.3.4. `GITHUB_BRANCH`: (Default: `main`)
* Role: The specific branch of the GitHub repository to clone if `USE_LOCAL` is `false`.
* 3.3.5. `USE_LOCAL`: (Default: `true`)
* Role: A boolean flag. If `true`, the `Dockerfile` installs `crawl4ai` from the local source code copied into `/tmp/project/` during the build context. If `false`, it clones the repository specified by `GITHUB_REPO` and `GITHUB_BRANCH`.
* 3.3.6. `PYTHON_VERSION`: (Default: `3.12`)
* Role: Specifies the Python version for the base image (e.g., `python:3.12-slim-bookworm`).
* 3.3.7. `INSTALL_TYPE`: (Default: `default`)
* Role: Controls which optional dependencies of `crawl4ai` are installed. Possible values: `default` (core), `pdf`, `torch`, `transformer`, `cosine`, `sync`, `all`.
* 3.3.8. `ENABLE_GPU`: (Default: `false`)
* Role: A boolean flag. If `true` and `TARGETARCH` is `amd64`, the `Dockerfile` attempts to install the NVIDIA CUDA toolkit for GPU acceleration.
* 3.3.9. `TARGETARCH`:
* Role: An automatic build argument provided by Docker, indicating the target architecture of the build (e.g., `amd64`, `arm64`). Used for conditional logic in the `Dockerfile`, such as installing platform-specific optimized libraries or CUDA for `amd64`.
* 3.4. **Server Configuration (`config.yml`)**
* 3.4.1. Location: The server loads its configuration from `/app/config.yml` inside the container by default. This path is relative to `APP_HOME`.
* 3.4.2. Structure Overview (based on `deploy/docker/config.yml`):
* `app`: General application settings.
* `title (str)`: API title (e.g., "Crawl4AI API").
* `version (str)`: API version (e.g., "1.0.0").
* `host (str)`: Host address for the server to bind to (e.g., "0.0.0.0").
* `port (int)`: Port for the server to listen on (e.g., 11234, though Docker usually maps to 11235).
* `reload (bool)`: Enable/disable auto-reload for development (default: `false`).
* `workers (int)`: Number of worker processes (default: 1).
* `timeout_keep_alive (int)`: Keep-alive timeout in seconds (default: 300).
* `llm`: Default LLM configuration.
* `provider (str)`: Default LLM provider string (e.g., "openai/gpt-4o-mini").
* `api_key_env (str)`: Environment variable name to read the API key from (e.g., "OPENAI_API_KEY").
* `api_key (Optional[str])`: Directly pass API key (overrides `api_key_env`).
* `redis`: Redis connection details.
* `host (str)`: Redis host (e.g., "localhost").
* `port (int)`: Redis port (e.g., 6379).
* `db (int)`: Redis database number (e.g., 0).
* `password (str)`: Redis password (default: "").
* `ssl (bool)`: Enable SSL for Redis connection (default: `false`).
* `ssl_cert_reqs (Optional[str])`: SSL certificate requirements (e.g., "none", "optional", "required").
* `ssl_ca_certs (Optional[str])`: Path to CA certificate file.
* `ssl_certfile (Optional[str])`: Path to SSL certificate file.
* `ssl_keyfile (Optional[str])`: Path to SSL key file.
* `rate_limiting`: Configuration for API rate limits.
* `enabled (bool)`: Enable/disable rate limiting (default: `true`).
* `default_limit (str)`: Default rate limit (e.g., "1000/minute").
* `trusted_proxies (List[str])`: List of trusted proxy IP addresses.
* `storage_uri (str)`: Storage URI for rate limit counters (e.g., "memory://", "redis://localhost:6379").
* `security`: Security-related settings.
* `enabled (bool)`: Master switch for security features (default: `false`).
* `jwt_enabled (bool)`: Enable/disable JWT authentication (default: `false`).
* `https_redirect (bool)`: Enable/disable HTTPS redirection (default: `false`).
* `trusted_hosts (List[str])`: List of allowed host headers (e.g., `["*"]` or specific domains).
* `headers (Dict[str, str])`: Default security headers to add to responses (e.g., `X-Content-Type-Options`, `Content-Security-Policy`).
* `crawler`: Default crawler behavior.
* `base_config (Dict[str, Any])`: Base parameters for `CrawlerRunConfig`.
* `simulate_user (bool)`: (default: `true`).
* `memory_threshold_percent (float)`: Memory usage threshold for adaptive dispatcher (default: `95.0`).
* `rate_limiter (Dict[str, Any])`: Configuration for the internal rate limiter for crawling.
* `enabled (bool)`: (default: `true`).
                *   `base_delay (List[float])`: Two-element list giving the min/max delay range in seconds (e.g., `[1.0, 2.0]`).
* `timeouts (Dict[str, float])`: Timeouts for different crawler operations.
* `stream_init (float)`: Timeout for stream initialization (default: `30.0`).
* `batch_process (float)`: Timeout for batch processing (default: `300.0`).
* `pool (Dict[str, Any])`: Browser pool settings.
* `max_pages (int)`: Max concurrent browser pages (default: `40`).
* `idle_ttl_sec (int)`: Time-to-live for idle crawlers in seconds (default: `1800`).
* `browser (Dict[str, Any])`: Default `BrowserConfig` parameters.
* `kwargs (Dict[str, Any])`: Keyword arguments for `BrowserConfig`.
* `headless (bool)`: (default: `true`).
* `text_mode (bool)`: (default: `true`).
* `extra_args (List[str])`: List of additional browser launch arguments (e.g., `"--no-sandbox"`).
* `logging`: Logging configuration.
* `level (str)`: Logging level (e.g., "INFO", "DEBUG").
* `format (str)`: Log message format string.
* `observability`: Observability settings.
* `prometheus (Dict[str, Any])`: Prometheus metrics configuration.
* `enabled (bool)`: (default: `true`).
* `endpoint (str)`: Metrics endpoint path (e.g., "/metrics").
* `health_check (Dict[str, str])`: Health check endpoint configuration.
* `endpoint (str)`: Health check endpoint path (e.g., "/health").
* 3.4.3. JWT Authentication
* Enabling: Set `security.enabled: true` and `security.jwt_enabled: true` in `config.yml`.
* Secret Key: Configured via `security.jwt_secret_key`. This value can be overridden by the environment variable `JWT_SECRET_KEY`.
* Algorithm: Configured via `security.jwt_algorithm` (default: `HS256`).
* Token Expiry: Configured via `security.jwt_expire_minutes` (default: `30`).
* Usage:
* 1. Client obtains a token by sending a POST request to the `/token` endpoint with an email in the request body (e.g., `{"email": "user@example.com"}`). The email domain might be validated if configured.
* 2. Client includes the received token in the `Authorization` header of subsequent requests to protected API endpoints: `Authorization: Bearer <your_jwt_token>`.
* 3.4.4. Customizing `config.yml`
* 3.4.4.1. Modifying Before Build:
* Method: Edit the `deploy/docker/config.yml` file within the cloned `crawl4ai` repository before building the Docker image. This new configuration will be baked into the image.
* 3.4.4.2. Runtime Mount:
* Method: Mount a custom `config.yml` file from the host machine to `/app/config.yml` (or the path specified by `APP_HOME`) inside the running Docker container.
* Example Command: `docker run -d -p 11235:11235 -v /path/on/host/my-config.yml:/app/config.yml --name crawl4ai-server unclecode/crawl4ai:latest`
* 3.4.5. Key Configuration Recommendations
* Security:
* Enable JWT (`security.jwt_enabled: true`) if the server is exposed to untrusted networks.
* Use a strong, unique `jwt_secret_key`.
* Configure `security.trusted_hosts` to a specific list of allowed hostnames instead of `["*"]` for production.
* If using a reverse proxy for SSL termination, ensure `https_redirect` is appropriately configured or disabled if the proxy handles it.
* Resource Management:
* Adjust `crawler.pool.max_pages` based on server resources to prevent overwhelming the system.
* Tune `crawler.pool.idle_ttl_sec` to balance resource usage and responsiveness for pooled browser instances.
* Monitoring:
* Keep `observability.prometheus.enabled: true` for production monitoring via the `/metrics` endpoint.
* Ensure the `/health` endpoint is accessible to health checking systems.
* Performance:
* Review and customize `crawler.browser.extra_args` for headless browser optimization (e.g., disabling GPU, sandbox if appropriate for your environment).
* Set reasonable `crawler.timeouts` to prevent long-stalled crawls.
* 3.5. **API Usage (Interacting with the Dockerized Server)**
* 3.5.1. **Playground Interface**
* Access URL: `http://localhost:11235/playground` (assuming default port mapping).
* Purpose: An interactive web UI (Swagger UI/OpenAPI) allowing users to explore API endpoints, view schemas, construct requests, and test API calls directly from their browser.
* 3.5.2. **Python SDK (`Crawl4aiDockerClient`)**
* Class Name: `Crawl4aiDockerClient`
* Location: (Typically imported as `from crawl4ai.docker_client import Crawl4aiDockerClient`) - Actual import might vary based on final library structure; refer to `docs/examples/docker_example.py` or `docs/examples/docker_python_sdk.py`.
* Initialization:
* Signature: `Crawl4aiDockerClient(base_url: str = "http://localhost:11235", api_token: Optional[str] = None, timeout: int = 300)`
* Parameters:
* `base_url (str)`: The base URL of the Crawl4ai server. Default: `"http://localhost:11235"`.
* `api_token (Optional[str])`: JWT token for authentication if enabled on the server. Default: `None`.
* `timeout (int)`: Default timeout in seconds for HTTP requests to the server. Default: `300`.
* Authentication (JWT):
* Method: Pass the `api_token` during client initialization. The token can be obtained from the server's `/token` endpoint or other authentication mechanisms.
* `crawl()` Method:
* Signature (Conceptual, based on typical SDK patterns and server capabilities): `async def crawl(self, urls: Union[str, List[str]], browser_config: Optional[Dict] = None, crawler_config: Optional[Dict] = None, stream: bool = False) -> Union[List[Dict], AsyncGenerator[Dict, None]]`
*Note: SDK might take `BrowserConfig` and `CrawlerRunConfig` objects directly, which it then serializes.*
* Key Parameters:
* `urls (Union[str, List[str]])`: A single URL string or a list of URL strings to crawl.
* `browser_config (Optional[Dict])`: A dictionary representing the `BrowserConfig` object, or a `BrowserConfig` instance itself.
* `crawler_config (Optional[Dict])`: A dictionary representing the `CrawlerRunConfig` object, or a `CrawlerRunConfig` instance itself.
* `stream (bool)`: If `True`, the method returns an async generator yielding individual `CrawlResult` dictionaries as they are processed by the server. If `False` (default), it returns a list containing all `CrawlResult` dictionaries after all URLs are processed.
* Return Type: `List[Dict]` (for `stream=False`) or `AsyncGenerator[Dict, None]` (for `stream=True`), where each `Dict` represents a `CrawlResult`.
* Streaming Behavior:
* `stream=True`: Allows processing of results incrementally, suitable for long crawl jobs or real-time data feeds.
* `stream=False`: Collects all results before returning, simpler for smaller batches.
* `get_schema()` Method:
* Signature: `async def get_schema(self) -> dict`
* Return Type: `dict`.
* Purpose: Fetches the JSON schemas for `BrowserConfig` and `CrawlerRunConfig` from the server's `/schema` endpoint. This helps in constructing valid configuration payloads.
* 3.5.3. **JSON Request Schema for Configurations**
* Structure: `{"type": "ClassName", "params": {...}}`
* Purpose: This structure is used by the server (and expected by the Python SDK internally) to deserialize JSON payloads back into Pydantic configuration objects like `BrowserConfig`, `CrawlerRunConfig`, and their nested strategy objects (e.g., `LLMExtractionStrategy`, `PruningContentFilter`). The `type` field specifies the Python class name, and `params` holds the keyword arguments for its constructor.
* Example (`BrowserConfig`):
```json
{
"type": "BrowserConfig",
"params": {
"headless": true,
"browser_type": "chromium",
"viewport_width": 1920,
"viewport_height": 1080
}
}
```
* Example (`CrawlerRunConfig` with a nested `LLMExtractionStrategy`):
```json
{
"type": "CrawlerRunConfig",
"params": {
"cache_mode": {"type": "CacheMode", "params": "BYPASS"},
"screenshot": false,
"extraction_strategy": {
"type": "LLMExtractionStrategy",
"params": {
"llm_config": {
"type": "LLMConfig",
"params": {"provider": "openai/gpt-4o-mini"}
},
"instruction": "Extract the main title and summary."
}
}
}
}
```
* 3.5.4. **REST API Examples**
* `/crawl` Endpoint:
* URL: `http://localhost:11235/crawl`
* HTTP Method: `POST`
* Payload Structure (`CrawlRequest` model from `deploy/docker/schemas.py`):
```json
{
"urls": ["https://example.com"],
"browser_config": { // JSON representation of BrowserConfig
"type": "BrowserConfig",
"params": {"headless": true}
},
"crawler_config": { // JSON representation of CrawlerRunConfig
"type": "CrawlerRunConfig",
"params": {"screenshot": true}
}
}
```
* Response Structure: A JSON object, typically `{"success": true, "results": [CrawlResult, ...], "server_processing_time_s": float, ...}`.
* `/crawl/stream` Endpoint:
* URL: `http://localhost:11235/crawl/stream`
* HTTP Method: `POST`
* Payload Structure: Same as `/crawl` (`CrawlRequest` model).
* Response Structure: Newline Delimited JSON (NDJSON, `application/x-ndjson`). Each line is a JSON string representing a `CrawlResult` object.
* Headers: Includes `Content-Type: application/x-ndjson` and `X-Stream-Status: active` while streaming, and a final JSON object `{"status": "completed"}`.
* 3.6. **Additional API Endpoints (from `server.py`)**
* 3.6.1. `/html`
* Endpoint URL: `/html`
* HTTP Method: `POST`
* Purpose: Crawls the given URL, preprocesses its raw HTML content specifically for schema extraction purposes (e.g., by sanitizing and simplifying the structure), and returns the processed HTML.
* Request Body (`HTMLRequest` from `deploy/docker/schemas.py`):
* `url (str)`: The URL to fetch and process.
* Response Structure (JSON):
* `html (str)`: The preprocessed HTML string.
* `url (str)`: The original URL requested.
* `success (bool)`: Indicates if the operation was successful.
* 3.6.2. `/screenshot`
* Endpoint URL: `/screenshot`
* HTTP Method: `POST`
* Purpose: Captures a full-page PNG screenshot of the specified URL. Allows an optional delay before capture and an option to save the file server-side.
* Request Body (`ScreenshotRequest` from `deploy/docker/schemas.py`):
* `url (str)`: The URL to take a screenshot of.
* `screenshot_wait_for (Optional[float])`: Seconds to wait before taking the screenshot. Default: `2.0`.
* `output_path (Optional[str])`: If provided, the screenshot is saved to this path on the server, and the path is returned. Otherwise, the base64 encoded image is returned. Default: `None`.
* Response Structure (JSON):
* `success (bool)`: Indicates if the screenshot was successfully taken.
* `screenshot (Optional[str])`: Base64 encoded PNG image data, if `output_path` was not provided.
* `path (Optional[str])`: The absolute server-side path to the saved screenshot, if `output_path` was provided.
* 3.6.3. `/pdf`
* Endpoint URL: `/pdf`
* HTTP Method: `POST`
* Purpose: Generates a PDF document of the rendered content of the specified URL.
* Request Body (`PDFRequest` from `deploy/docker/schemas.py`):
* `url (str)`: The URL to convert to PDF.
* `output_path (Optional[str])`: If provided, the PDF is saved to this path on the server, and the path is returned. Otherwise, the base64 encoded PDF data is returned. Default: `None`.
* Response Structure (JSON):
* `success (bool)`: Indicates if the PDF generation was successful.
* `pdf (Optional[str])`: Base64 encoded PDF data, if `output_path` was not provided.
* `path (Optional[str])`: The absolute server-side path to the saved PDF, if `output_path` was provided.
* 3.6.4. `/execute_js`
* Endpoint URL: `/execute_js`
* HTTP Method: `POST`
* Purpose: Executes a list of JavaScript snippets on the specified URL in the browser context and returns the full `CrawlResult` object, including any modifications or data retrieved by the scripts.
* Request Body (`JSEndpointRequest` from `deploy/docker/schemas.py`):
* `url (str)`: The URL on which to execute the JavaScript.
* `scripts (List[str])`: A list of JavaScript code snippets to execute sequentially. Each script should be an expression that returns a value.
* Response Structure (JSON): A `CrawlResult` object (serialized to a dictionary) containing the state of the page after JS execution, including `js_execution_result`.
* 3.6.5. `/ask` (Endpoint defined as `/ask` in `server.py`)
* Endpoint URL: `/ask`
* HTTP Method: `GET`
* Purpose: Retrieves context about the Crawl4ai library itself, either code snippets or documentation sections, filtered by a query. This is designed for AI assistants or RAG systems needing information about Crawl4ai.
* Parameters (Query):
* `context_type (str, default="all", enum=["code", "doc", "all"])`: Specifies whether to return "code", "doc", or "all" (both).
* `query (Optional[str])`: A search query string used to filter relevant chunks using BM25 ranking. If `None`, returns all context of the specified type(s).
* `score_ratio (float, default=0.5, ge=0.0, le=1.0)`: The minimum score (as a fraction of the maximum possible score for the query) for a chunk to be included in the results.
* `max_results (int, default=20, ge=1)`: The maximum number of result chunks to return.
* Response Structure (JSON):
* If `query` is provided:
* `code_results (Optional[List[Dict[str, Union[str, float]]]])`: A list of dictionaries, where each dictionary contains `{"text": "code_chunk...", "score": bm25_score}`. Present if `context_type` is "code" or "all".
* `doc_results (Optional[List[Dict[str, Union[str, float]]]])`: A list of dictionaries, where each dictionary contains `{"text": "doc_chunk...", "score": bm25_score}`. Present if `context_type` is "doc" or "all".
* If `query` is not provided:
* `code_context (Optional[str])`: The full concatenated code context as a single string. Present if `context_type` is "code" or "all".
* `doc_context (Optional[str])`: The full concatenated documentation context as a single string. Present if `context_type` is "doc" or "all".
* 3.7. **MCP (Model Context Protocol) Support**
* 3.7.1. Explanation of MCP:
* Purpose: The Model Context Protocol (MCP) is a standardized way for AI models (like Anthropic's Claude with Code Interpreter capabilities) to discover and interact with external tools and data sources. Crawl4ai's MCP server exposes its functionalities as tools that an MCP-compatible AI can use.
* 3.7.2. Connection Endpoints (defined in `mcp_bridge.py` and attached to FastAPI app):
* `/mcp/sse`: Server-Sent Events (SSE) endpoint for MCP communication.
* `/mcp/ws`: WebSocket endpoint for MCP communication.
* `/mcp/messages`: Endpoint for clients to POST messages in the SSE transport.
* 3.7.3. Usage with Claude Code Example:
* Command: `claude mcp add -t sse c4ai-sse http://localhost:11235/mcp/sse`
* Purpose: This command (specific to the Claude Code CLI) registers the Crawl4ai MCP server as a tool provider named `c4ai-sse` using the SSE transport. The AI can then discover and invoke tools from this source.
* 3.7.4. List of Available MCP Tools (defined by `@mcp_tool` decorators in `server.py`):
* `md`: Fetches Markdown for a URL.
* Parameters (derived from `get_markdown` function signature): `url (str)`, `filter_type (FilterType)`, `query (Optional[str])`, `cache (Optional[str])`.
* `html`: Generates preprocessed HTML for a URL.
* Parameters (derived from `generate_html` function signature): `url (str)`.
* `screenshot`: Generates a screenshot of a URL.
* Parameters (derived from `generate_screenshot` function signature): `url (str)`, `screenshot_wait_for (Optional[float])`, `output_path (Optional[str])`.
* `pdf`: Generates a PDF of a URL.
* Parameters (derived from `generate_pdf` function signature): `url (str)`, `output_path (Optional[str])`.
* `execute_js`: Executes JavaScript on a URL.
* Parameters (derived from `execute_js` function signature): `url (str)`, `scripts (List[str])`.
* `crawl`: Performs a full crawl operation.
* Parameters (derived from `crawl` function signature): `urls (List[str])`, `browser_config (Optional[Dict])`, `crawler_config (Optional[Dict])`.
* `ask`: Retrieves library context.
* Parameters (derived from `get_context` function signature): `context_type (str)`, `query (Optional[str])`, `score_ratio (float)`, `max_results (int)`.
* 3.7.5. Testing MCP Connections:
* Method: Use an MCP client tool (e.g., `claude mcp call c4ai-sse.md url=https://example.com`) to invoke a tool and verify the response.
* 3.7.6. Accessing MCP Schemas:
* Endpoint URL: `/mcp/schema`
* Purpose: Returns a JSON response detailing all registered MCP tools, including their names, descriptions, and input schemas, enabling clients to understand how to use them.
* 3.8. **Metrics & Monitoring Endpoints**
* 3.8.1. `/health`
* Purpose: Provides a basic health check for the server, indicating if it's running and responsive.
* Response Structure (JSON from `server.py`): `{"status": "ok", "timestamp": float, "version": str}` (where version is `__version__` from `server.py`).
* Configuration: Path configurable via `observability.health_check.endpoint` in `config.yml`.
* 3.8.2. `/metrics`
* Purpose: Exposes application metrics in a format compatible with Prometheus for monitoring and alerting.
* Response Format: Prometheus text format.
* Configuration: Enabled via `observability.prometheus.enabled: true` and endpoint path via `observability.prometheus.endpoint` in `config.yml`.
* 3.9. **Underlying Server Logic (`server.py` - High-Level Understanding)**
* 3.9.1. FastAPI Application:
* Framework: The server is built using the FastAPI Python web framework for creating APIs.
* 3.9.2. `crawler_pool` (`CrawlerPool` from `deploy.docker.crawler_pool`):
* Role: Manages a pool of `AsyncWebCrawler` instances to reuse browser resources efficiently.
* `get_crawler(BrowserConfig)`: Fetches an existing idle crawler compatible with the `BrowserConfig` or creates a new one if none are available or compatible.
* `close_all()`: Iterates through all pooled crawlers and closes them.
* `janitor()`: An `asyncio.Task` that runs periodically to close and remove crawler instances that have been idle for longer than `crawler.pool.idle_ttl_sec` (configured in `config.yml`).
* 3.9.3. Global Page Semaphore (`GLOBAL_SEM`):
* Type: `asyncio.Semaphore`.
* Purpose: A global semaphore that limits the total number of concurrently open browser pages across all `AsyncWebCrawler` instances managed by the server. This acts as a hard cap to prevent excessive resource consumption.
* Configuration: The maximum number of concurrent pages is set by `crawler.pool.max_pages` in `config.yml` (the hard-coded fallback in `server.py` is `30`; the shipped `config.yml` sets it to `40`, which takes precedence when that file is present). The `AsyncWebCrawler.arun` method acquires this semaphore.
* 3.9.4. Job Router (`init_job_router` from `deploy.docker.job`):
* Role: Manages asynchronous, long-running tasks, particularly for the `/crawl` (non-streaming batch) endpoint.
* Mechanism: Uses Redis (configured in `config.yml`) as a backend for task queuing (storing task metadata like status, creation time, URL, result, error) and status tracking.
* User Interaction: When a job is submitted to an endpoint using this router (e.g., `/crawl/job`), a `task_id` is returned. The client then polls an endpoint like `/task/{task_id}` to get the status and eventual result or error.
* 3.9.5. Rate Limiting Middleware:
* Implementation: Uses the `slowapi` library, integrated with FastAPI.
* Purpose: To protect the server from abuse by limiting the number of requests an IP address can make within a specified time window.
* Configuration: Settings like `enabled`, `default_limit`, `storage_uri` (e.g., `memory://` or `redis://...`) are managed in the `rate_limiting` section of `config.yml`.
* 3.9.6. Security Middleware:
* Implementations: `HTTPSRedirectMiddleware` and `TrustedHostMiddleware` from FastAPI, plus custom logic for adding security headers.
* Purpose:
* `HTTPSRedirectMiddleware`: Redirects HTTP requests to HTTPS if `security.https_redirect` is true.
* `TrustedHostMiddleware`: Ensures requests are only served if their `Host` header matches an entry in `security.trusted_hosts`.
* Custom header logic: Adds HTTP security headers like `X-Content-Type-Options`, `X-Frame-Options`, `Content-Security-Policy`, `Strict-Transport-Security` to all responses if `security.enabled` is true. These are defined in `security.headers` in `config.yml`.
* 3.9.7. API Request Mapping:
* Request Models: Pydantic models defined in `deploy/docker/schemas.py` (e.g., `CrawlRequest`, `MarkdownRequest`, `HTMLRequest`, `ScreenshotRequest`, `PDFRequest`, `JSEndpointRequest`, `TokenRequest`, `RawCode`) define the expected JSON structure for incoming API request bodies.
* Endpoint Logic: Functions decorated with `@app.post(...)`, `@app.get(...)`, etc., in `server.py` handle incoming HTTP requests. These functions use FastAPI's dependency injection to parse and validate request bodies against the Pydantic models.
* `AsyncWebCrawler` Interaction:
* The parameters from the parsed request models (e.g., `CrawlRequest.urls`, `CrawlRequest.browser_config`, `CrawlRequest.crawler_config`) are used.
* `BrowserConfig` and `CrawlerRunConfig` objects are created by calling their respective `.load()` class methods with the dictionary payloads received in the request (e.g., `BrowserConfig.load(crawl_request.browser_config)`).
* These configuration objects are then passed to an `AsyncWebCrawler` instance obtained from the `crawler_pool`, typically to its `arun()` (for single URL or when JS execution context is critical) or `arun_many()` (for batch processing of multiple URLs) methods.
* Result Serialization: The `CrawlResult` objects (or lists/generators of them) returned by the `AsyncWebCrawler` are usually serialized to JSON using their `.model_dump()` method before being included in the HTTP response. For streaming endpoints, each `CrawlResult` is serialized and sent as a separate NDJSON line.
## 4. Version Numbering Scheme
* 4.1. **Standard Versioning (`MAJOR.MINOR.PATCH`)**
* `MAJOR`: Incremented when incompatible API changes are made.
* `MINOR`: Incremented when functionality is added in a backward-compatible manner.
* `PATCH`: Incremented for backward-compatible bug fixes.
* 4.2. **Pre-release Suffixes**
* `.devN`: (e.g., `0.6.0.dev1`) Development release. These are typically unstable and used for internal testing or early feedback on new, unrefined features.
* `aN`: (e.g., `0.6.0a1`) Alpha release. Indicates an early preview of a new version, potentially unstable, and APIs might still change.
* `bN`: (e.g., `0.6.0b1`) Beta release. Generally feature-complete for the targeted minor or major version but may still contain bugs. APIs are mostly stable at this point.
* `rcN`: (e.g., `0.6.0rc1`) Release Candidate. A version that is potentially the final release, undergoing final testing to catch critical bugs before official release.
```

File diff suppressed because it is too large Load Diff

View File

@@ -1,494 +0,0 @@
The following is the "Foundational Memory" document for the `extraction` component of `crawl4ai`, generated from the provided outline and from information processed from the codebase and existing documentation.
```markdown
# Detailed Outline for crawl4ai - extraction Component
**Target Document Type:** memory
**Target Output Filename Suggestion:** `llm_memory_extraction.md`
**Library Version Context:** 0.6.3
**Outline Generation Date:** 2024-05-24
---
## 1. Overview of Data Extraction in Crawl4ai
* 1.1. Purpose of the Extraction Component: The extraction component in Crawl4ai is responsible for parsing structured data from web content (HTML, text, Markdown) or PDF documents. It allows users to define how data should be identified and extracted, using various strategies ranging from rule-based (CSS, XPath, Regex) to LLM-powered approaches. Its goal is to transform raw crawled content into usable, structured information.
* 1.2. Core Concepts:
* 1.2.1. `ExtractionStrategy`: This is an abstract base class (interface) that defines the contract for all specific extraction methods. Each strategy implements how data is extracted from the provided content.
* 1.2.2. `ChunkingStrategy`: This is an abstract base class (interface) for strategies that preprocess content by splitting it into smaller, manageable chunks. This is particularly relevant for LLM-based extraction strategies that have token limits for their input.
* 1.2.3. Schemas: Schemas define the structure of the data to be extracted. For non-LLM strategies like `JsonCssExtractionStrategy` or `JsonXPathExtractionStrategy`, schemas are typically dictionary-based, specifying selectors and field types. For `LLMExtractionStrategy`, schemas can be Pydantic models or JSON schema dictionaries that guide the LLM in structuring its output.
* 1.2.4. `CrawlerRunConfig`: The `CrawlerRunConfig` object allows users to specify which `extraction_strategy` and `chunking_strategy` (if applicable) should be used for a particular crawl operation via its `arun()` method.
## 2. `ExtractionStrategy` Interface
* 2.1. Purpose: The `ExtractionStrategy` class, found in `crawl4ai.extraction_strategy`, serves as an abstract base class (ABC) defining the standard interface for all data extraction strategies within the Crawl4ai library. Implementations of this class provide specific methods for extracting structured data from content.
* 2.2. Key Abstract Methods:
* `extract(self, url: str, content: str, *q, **kwargs) -> List[Dict[str, Any]]`:
* Description: Abstract method intended to extract meaningful blocks or chunks from the given content. Subclasses must implement this.
* Parameters:
* `url (str)`: The URL of the webpage.
* `content (str)`: The HTML, Markdown, or text content of the webpage.
* `*q`: Variable positional arguments.
* `**kwargs`: Variable keyword arguments.
* Returns: `List[Dict[str, Any]]` - A list of extracted blocks or chunks, typically as dictionaries.
* `run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]`:
* Description: Abstract method to process sections of text, often in parallel by default implementations in subclasses. Subclasses must implement this.
* Parameters:
* `url (str)`: The URL of the webpage.
* `sections (List[str])`: List of sections (strings) to process.
* `*q`: Variable positional arguments.
* `**kwargs`: Variable keyword arguments.
* Returns: `List[Dict[str, Any]]` - A list of processed JSON blocks.
* 2.3. Input Format Property:
* `input_format (str)`: [Read-only] - An attribute indicating the expected input format for the content to be processed by the strategy (e.g., "markdown", "html", "fit_html", "text"). Default is "markdown".
## 3. Non-LLM Based Extraction Strategies
* ### 3.1. Class `NoExtractionStrategy`
* 3.1.1. Purpose: A baseline `ExtractionStrategy` that performs no actual data extraction. It returns the input content as is, typically useful for scenarios where only raw or cleaned HTML/Markdown is needed without further structuring.
* 3.1.2. Inheritance: `ExtractionStrategy`
* 3.1.3. Initialization (`__init__`):
* 3.1.3.1. Signature: `NoExtractionStrategy(**kwargs)`
* 3.1.3.2. Parameters:
* `**kwargs`: Passed to the base `ExtractionStrategy` initializer.
* 3.1.4. Key Public Methods:
* `extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]`:
* Description: Returns the provided `html` content wrapped in a list containing a single dictionary: `[{"index": 0, "content": html}]`.
* `run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]`:
* Description: Returns a list where each input section is wrapped in a dictionary: `[{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]`.
* ### 3.2. Class `JsonCssExtractionStrategy`
* 3.2.1. Purpose: Extracts structured data from HTML content using a JSON schema that defines CSS selectors to locate and extract data for specified fields. It uses BeautifulSoup4 for parsing and selection.
* 3.2.2. Inheritance: `JsonElementExtractionStrategy` (which inherits from `ExtractionStrategy`)
* 3.2.3. Initialization (`__init__`):
* 3.2.3.1. Signature: `JsonCssExtractionStrategy(schema: Dict[str, Any], **kwargs)`
* 3.2.3.2. Parameters:
* `schema (Dict[str, Any])`: The JSON schema defining extraction rules.
* `**kwargs`: Passed to the base class initializer. Includes `input_format` (default: "html").
* 3.2.4. Schema Definition for `JsonCssExtractionStrategy`:
* 3.2.4.1. `name (str)`: A descriptive name for the schema (e.g., "ProductDetails").
* 3.2.4.2. `baseSelector (str)`: The primary CSS selector that identifies each root element representing an item to be extracted (e.g., "div.product-item").
* 3.2.4.3. `fields (List[Dict[str, Any]])`: A list of dictionaries, each defining a field to be extracted from within each `baseSelector` element.
* Each field dictionary:
* `name (str)`: The key for this field in the output JSON object.
* `selector (str)`: The CSS selector for this field, relative to its parent element (either the `baseSelector` or a parent "nested" field).
* `type (str)`: Specifies how to extract the data. Common values:
* `"text"`: Extracts the text content of the selected element.
* `"attribute"`: Extracts the value of a specified HTML attribute.
* `"html"`: Extracts the raw inner HTML of the selected element.
* `"list"`: Extracts a list of items. The `fields` sub-key then defines the structure of each item in the list (if objects) or the `selector` directly targets list elements for primitive values.
* `"nested"`: Extracts a nested JSON object. The `fields` sub-key defines the structure of this nested object.
* `attribute (str, Optional)`: Required if `type` is "attribute". Specifies the name of the HTML attribute to extract (e.g., "href", "src").
* `fields (List[Dict[str, Any]], Optional)`: Required if `type` is "list" (for a list of objects) or "nested". Defines the structure of the nested object or list items.
* `transform (str, Optional)`: A string indicating a transformation to apply to the extracted value (e.g., "lowercase", "uppercase", "strip").
* `default (Any, Optional)`: A default value to use if the selector does not find an element or the attribute is missing.
* 3.2.5. Key Public Methods:
* `extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]`:
* Description: Parses the `html_content` and applies the defined schema to extract structured data using CSS selectors.
* 3.2.6. Features:
* 3.2.6.1. Nested Extraction: Supports extracting complex, nested JSON objects by defining "nested" type fields within the schema.
* 3.2.6.2. List Handling: Supports extracting lists of primitive values (e.g., list of strings from multiple `<li>` tags) or lists of structured objects (e.g., a list of product details, each with its own fields).
* ### 3.3. Class `JsonXPathExtractionStrategy`
* 3.3.1. Purpose: Extracts structured data from HTML/XML content using a JSON schema that defines XPath expressions to locate and extract data. It uses `lxml` for parsing and XPath evaluation.
* 3.3.2. Inheritance: `JsonElementExtractionStrategy` (which inherits from `ExtractionStrategy`)
* 3.3.3. Initialization (`__init__`):
* 3.3.3.1. Signature: `JsonXPathExtractionStrategy(schema: Dict[str, Any], **kwargs)`
* 3.3.3.2. Parameters:
* `schema (Dict[str, Any])`: The JSON schema defining extraction rules, where selectors are XPath expressions.
* `**kwargs`: Passed to the base class initializer. Includes `input_format` (default: "html").
* 3.3.4. Schema Definition: The schema structure is identical to `JsonCssExtractionStrategy` (see 3.2.4), but the `baseSelector` and field `selector` values must be valid XPath expressions.
* 3.3.5. Key Public Methods:
* `extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]`:
* Description: Parses the `html_content` using `lxml` and applies the defined schema to extract structured data using XPath expressions.
* ### 3.4. Class `JsonLxmlExtractionStrategy`
* 3.4.1. Purpose: Provides an alternative CSS selector-based extraction strategy leveraging the `lxml` library for parsing and selection, which can offer performance benefits over BeautifulSoup4 in some cases.
* 3.4.2. Inheritance: `JsonCssExtractionStrategy` (and thus `JsonElementExtractionStrategy`, `ExtractionStrategy`)
* 3.4.3. Initialization (`__init__`):
* 3.4.3.1. Signature: `JsonLxmlExtractionStrategy(schema: Dict[str, Any], **kwargs)`
* 3.4.3.2. Parameters:
* `schema (Dict[str, Any])`: The JSON schema defining extraction rules, using CSS selectors.
* `**kwargs`: Passed to the base class initializer. Includes `input_format` (default: "html").
* 3.4.4. Schema Definition: Identical to `JsonCssExtractionStrategy` (see 3.2.4).
* 3.4.5. Key Public Methods:
* `extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]`:
* Description: Parses the `html_content` using `lxml` and applies the defined schema to extract structured data using lxml's CSS selector capabilities (which often translates CSS to XPath internally).
* ### 3.5. Class `RegexExtractionStrategy`
* 3.5.1. Purpose: Extracts data from text content (HTML, Markdown, or plain text) using a collection of regular expression patterns. Each match is returned as a structured dictionary.
* 3.5.2. Inheritance: `ExtractionStrategy`
* 3.5.3. Initialization (`__init__`):
* 3.5.3.1. Signature: `RegexExtractionStrategy(patterns: Union[Dict[str, str], List[Tuple[str, str]], "RegexExtractionStrategy._B"] = _B.NOTHING, input_format: str = "fit_html", **kwargs)`
* 3.5.3.2. Parameters:
* `patterns (Union[Dict[str, str], List[Tuple[str, str]], "_B"], default: _B.NOTHING)`:
* Description: Defines the regex patterns to use.
* Can be a dictionary mapping labels to regex strings (e.g., `{"email": r"..."}`).
* Can be a list of (label, regex_string) tuples.
* Can be a bitwise OR combination of `RegexExtractionStrategy._B` enum members for using built-in patterns (e.g., `RegexExtractionStrategy.Email | RegexExtractionStrategy.Url`).
* `input_format (str, default: "fit_html")`: Specifies the input format for the content. Options: "html" (raw HTML), "markdown" (Markdown from HTML), "text" (plain text from HTML), "fit_html" (content filtered for relevance before regex application).
* `**kwargs`: Passed to the base `ExtractionStrategy`.
* 3.5.4. Built-in Patterns (`RegexExtractionStrategy._B` Enum - an `IntFlag`):
* `EMAIL (auto())`: Matches email addresses. Example pattern: `r"[\\w.+-]+@[\\w-]+\\.[\\w.-]+"`
* `PHONE_INTL (auto())`: Matches international phone numbers. Example pattern: `r"\\+?\\d[\\d .()-]{7,}\\d"`
* `PHONE_US (auto())`: Matches US phone numbers. Example pattern: `r"\\(?\\d{3}\\)?[-. ]?\\d{3}[-. ]?\\d{4}"`
* `URL (auto())`: Matches URLs. Example pattern: `r"https?://[^\\s\\'\"<>]+"`
* `IPV4 (auto())`: Matches IPv4 addresses. Example pattern: `r"(?:\\d{1,3}\\.){3}\\d{1,3}"`
* `IPV6 (auto())`: Matches IPv6 addresses. Example pattern: `r"[A-F0-9]{1,4}(?::[A-F0-9]{1,4}){7}"`
* `UUID (auto())`: Matches UUIDs. Example pattern: `r"[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}"`
* `CURRENCY (auto())`: Matches currency amounts. Example pattern: `r"(?:USD|EUR|RM|\\$|€|¥|£)\\s?\\d+(?:[.,]\\d{2})?"`
* `PERCENTAGE (auto())`: Matches percentages. Example pattern: `r"\\d+(?:\\.\\d+)?%"`
* `NUMBER (auto())`: Matches numbers (integers, decimals). Example pattern: `r"\\b\\d{1,3}(?:[,.]?\\d{3})*(?:\\.\\d+)?\\b"`
* `DATE_ISO (auto())`: Matches ISO 8601 dates (YYYY-MM-DD). Example pattern: `r"\\d{4}-\\d{2}-\\d{2}"`
* `DATE_US (auto())`: Matches US-style dates (MM/DD/YYYY or MM/DD/YY). Example pattern: `r"\\d{1,2}/\\d{1,2}/\\d{2,4}"`
* `TIME_24H (auto())`: Matches 24-hour time formats (HH:MM or HH:MM:SS). Example pattern: `r"\\b(?:[01]?\\d|2[0-3]):[0-5]\\d(?:[:.][0-5]\\d)?\\b"`
* `POSTAL_US (auto())`: Matches US postal codes (ZIP codes). Example pattern: `r"\\b\\d{5}(?:-\\d{4})?\\b"`
* `POSTAL_UK (auto())`: Matches UK postal codes. Example pattern: `r"\\b[A-Z]{1,2}\\d[A-Z\\d]? ?\\d[A-Z]{2}\\b"`
* `HTML_COLOR_HEX (auto())`: Matches HTML hex color codes. Example pattern: `r"#[0-9A-Fa-f]{6}\\b"`
* `TWITTER_HANDLE (auto())`: Matches Twitter handles. Example pattern: `r"@[\\w]{1,15}"`
* `HASHTAG (auto())`: Matches hashtags. Example pattern: `r"#[\\w-]+"`
* `MAC_ADDR (auto())`: Matches MAC addresses. Example pattern: `r"(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}"`
* `IBAN (auto())`: Matches IBANs. Example pattern: `r"[A-Z]{2}\\d{2}[A-Z0-9]{11,30}"`
* `CREDIT_CARD (auto())`: Matches common credit card numbers. Example pattern: `r"\\b(?:4\\d{12}(?:\\d{3})?|5[1-5]\\d{14}|3[47]\\d{13}|6(?:011|5\\d{2})\\d{12})\\b"`
* `ALL (_B(-1).value & ~_B.NOTHING.value)`: Includes all built-in patterns except `NOTHING`.
* `NOTHING (_B(0).value)`: Includes no built-in patterns.
* 3.5.5. Key Public Methods:
* `extract(self, url: str, content: str, **kwargs) -> List[Dict[str, Any]]`:
* Description: Applies all configured regex patterns (built-in and custom) to the input `content`.
* Returns: `List[Dict[str, Any]]` - A list of dictionaries, where each dictionary represents a match and contains:
* `"url" (str)`: The source URL.
* `"label" (str)`: The label of the matching regex pattern.
* `"value" (str)`: The actual matched string.
* `"span" (Tuple[int, int])`: The start and end indices of the match within the content.
* 3.5.6. Static Method: `generate_pattern`
* 3.5.6.1. Signature: `staticmethod generate_pattern(label: str, html: str, query: Optional[str] = None, examples: Optional[List[str]] = None, llm_config: Optional[LLMConfig] = None, **kwargs) -> Dict[str, str]`
* 3.5.6.2. Purpose: Uses an LLM to automatically generate a Python-compatible regular expression pattern for a given label, based on sample HTML content, an optional natural language query describing the target, and/or examples of desired matches.
* 3.5.6.3. Parameters:
* `label (str)`: A descriptive label for the pattern to be generated (e.g., "product_price", "article_date").
* `html (str)`: The HTML content from which the pattern should be inferred.
* `query (Optional[str], default: None)`: A natural language description of what kind of data the regex should capture (e.g., "Extract the publication date", "Find all ISBN numbers").
* `examples (Optional[List[str]], default: None)`: A list of example strings that the generated regex should successfully match from the provided HTML.
* `llm_config (Optional[LLMConfig], default: None)`: Configuration for the LLM to be used. If `None`, uses default `LLMConfig`.
* `**kwargs`: Additional arguments passed to the LLM completion request (e.g., `temperature`, `max_tokens`).
* 3.5.6.4. Returns: `Dict[str, str]` - A dictionary containing the generated pattern, in the format `{label: "regex_pattern_string"}`.
## 4. LLM-Based Extraction Strategies
* ### 4.1. Class `LLMExtractionStrategy`
* 4.1.1. Purpose: Employs Large Language Models (LLMs) to extract either structured data according to a schema or relevant blocks of text based on natural language instructions from various content formats (HTML, Markdown, text).
* 4.1.2. Inheritance: `ExtractionStrategy`
* 4.1.3. Initialization (`__init__`):
* 4.1.3.1. Signature: `LLMExtractionStrategy(llm_config: Optional[LLMConfig] = None, instruction: Optional[str] = None, schema: Optional[Union[Dict[str, Any], "BaseModel"]] = None, extraction_type: str = "block", chunk_token_threshold: int = CHUNK_TOKEN_THRESHOLD, overlap_rate: float = OVERLAP_RATE, word_token_rate: float = WORD_TOKEN_RATE, apply_chunking: bool = True, force_json_response: bool = False, **kwargs)`
* 4.1.3.2. Parameters:
* `llm_config (Optional[LLMConfig], default: None)`: Configuration for the LLM. If `None`, a default `LLMConfig` is created.
* `instruction (Optional[str], default: None)`: Natural language instructions to guide the LLM's extraction process (e.g., "Extract the main article content", "Summarize the key points").
* `schema (Optional[Union[Dict[str, Any], "BaseModel"]], default: None)`: A Pydantic model class or a dictionary representing a JSON schema. Used when `extraction_type` is "schema" to define the desired output structure.
* `extraction_type (str, default: "block")`: Determines the extraction mode.
* `"block"`: LLM identifies and extracts relevant blocks/chunks of text based on the `instruction`.
* `"schema"`: LLM attempts to populate the fields defined in `schema` from the content.
* `chunk_token_threshold (int, default: CHUNK_TOKEN_THRESHOLD)`: The target maximum number of tokens for each chunk of content sent to the LLM. `CHUNK_TOKEN_THRESHOLD` is defined in `crawl4ai.config` (default value: 10000).
* `overlap_rate (float, default: OVERLAP_RATE)`: The percentage of overlap between consecutive chunks to ensure context continuity. `OVERLAP_RATE` is defined in `crawl4ai.config` (default value: 0.1, i.e., 10%).
* `word_token_rate (float, default: WORD_TOKEN_RATE)`: An estimated ratio of words to tokens (e.g., 0.75 words per token). Used for approximating chunk boundaries. `WORD_TOKEN_RATE` is defined in `crawl4ai.config` (default value: 0.75).
* `apply_chunking (bool, default: True)`: If `True`, the input content is chunked before being sent to the LLM. If `False`, the entire content is sent (which might exceed token limits for large inputs).
* `force_json_response (bool, default: False)`: If `True` and `extraction_type` is "schema", instructs the LLM to strictly adhere to JSON output format.
* `**kwargs`: Passed to `ExtractionStrategy` and potentially to the underlying LLM API calls (e.g., `temperature`, `max_tokens` if not set in `llm_config`).
* 4.1.4. Key Public Methods:
* `extract(self, url: str, content: str, *q, **kwargs) -> List[Dict[str, Any]]`:
* Description: Processes the input `content`. If `apply_chunking` is `True`, it first chunks the content using the specified `chunking_strategy` (or a default one if `LLMExtractionStrategy` manages it internally). Then, for each chunk (or the whole content if not chunked), it constructs a prompt based on `instruction` and/or `schema` and sends it to the configured LLM.
* Returns: `List[Dict[str, Any]]` - A list of dictionaries.
* If `extraction_type` is "block", each dictionary typically contains `{"index": int, "content": str, "tags": List[str]}`.
* If `extraction_type` is "schema", each dictionary is an instance of the extracted structured data, ideally conforming to the provided `schema`. If the LLM returns multiple JSON objects in a list, they are parsed and returned.
* `run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]`:
* Description: Processes a list of content `sections` in parallel (using `ThreadPoolExecutor`). Each section is passed to the `extract` method logic.
* Returns: `List[Dict[str, Any]]` - Aggregated list of results from processing all sections.
* 4.1.5. `TokenUsage` Tracking:
* `total_usage (TokenUsage)`: [Read-only Public Attribute] - An instance of `TokenUsage` that accumulates the token counts (prompt, completion, total) from all LLM API calls made by this `LLMExtractionStrategy` instance.
* `usages (List[TokenUsage])`: [Read-only Public Attribute] - A list containing individual `TokenUsage` objects for each separate LLM API call made during the extraction process. This allows for detailed tracking of token consumption per call.
## 5. `ChunkingStrategy` Interface and Implementations
* ### 5.1. Interface `ChunkingStrategy`
* 5.1.1. Purpose: The `ChunkingStrategy` class, found in `crawl4ai.chunking_strategy`, is an abstract base class (ABC) that defines the interface for different content chunking algorithms. Chunking is used to break down large pieces of text or HTML into smaller, manageable segments, often before feeding them to an LLM or other processing steps.
* 5.1.2. Key Abstract Methods:
* `chunk(self, content: str) -> List[str]`:
* Description: Abstract method that must be implemented by subclasses to split the input `content` string into a list of string chunks.
* Parameters:
* `content (str)`: The content to be chunked.
* Returns: `List[str]` - A list of content chunks.
* ### 5.2. Class `RegexChunking`
* 5.2.1. Purpose: Implements `ChunkingStrategy` by splitting content based on a list of regular expression patterns. It can also attempt to merge smaller chunks to meet a target `chunk_size`.
* 5.2.2. Inheritance: `ChunkingStrategy`
* 5.2.3. Initialization (`__init__`):
* 5.2.3.1. Signature: `RegexChunking(patterns: Optional[List[str]] = None, chunk_size: Optional[int] = None, overlap: Optional[int] = None, word_token_ratio: Optional[float] = WORD_TOKEN_RATE, **kwargs)`
* 5.2.3.2. Parameters:
* `patterns (Optional[List[str]], default: None)`: A list of regex patterns used to split the text. If `None`, defaults to paragraph-based splitting (`["\\n\\n+"]`).
* `chunk_size (Optional[int], default: None)`: The target token size for each chunk. If specified, the strategy will try to merge smaller chunks created by regex splitting to approximate this size.
* `overlap (Optional[int], default: None)`: The target token overlap between consecutive chunks when `chunk_size` is active.
* `word_token_ratio (Optional[float], default: WORD_TOKEN_RATE)`: The estimated ratio of words to tokens, used if `chunk_size` or `overlap` are specified. `WORD_TOKEN_RATE` is defined in `crawl4ai.config` (default value: 0.75).
* `**kwargs`: Additional keyword arguments.
* 5.2.4. Key Public Methods:
* `chunk(self, content: str) -> List[str]`:
* Description: Splits the input `content` using the configured regex patterns. If `chunk_size` is set, it then merges these initial chunks to meet the target size with the specified overlap.
* ### 5.3. Class `IdentityChunking`
* 5.3.1. Purpose: A `ChunkingStrategy` that does not perform any actual chunking. It returns the input content as a single chunk in a list.
* 5.3.2. Inheritance: `ChunkingStrategy`
* 5.3.3. Initialization (`__init__`):
* 5.3.3.1. Signature: `IdentityChunking(**kwargs)`
* 5.3.3.2. Parameters:
* `**kwargs`: Additional keyword arguments.
* 5.3.4. Key Public Methods:
* `chunk(self, content: str) -> List[str]`:
* Description: Returns the input `content` as a single-element list: `[content]`.
## 6. Defining Schemas for Extraction
* 6.1. Purpose: Schemas provide a structured way to define what data needs to be extracted from content and how it should be organized. This allows for consistent and predictable output from the extraction process.
* 6.2. Schemas for CSS/XPath/LXML-based Extraction (`JsonCssExtractionStrategy`, etc.):
* 6.2.1. Format: These strategies use a dictionary-based JSON-like schema.
* 6.2.2. Key elements: As detailed in section 3.2.4 for `JsonCssExtractionStrategy`:
* `name (str)`: Name of the schema.
* `baseSelector (str)`: CSS selector (for CSS strategies) or XPath expression (for XPath strategy) identifying the repeating parent elements.
* `fields (List[Dict[str, Any]])`: A list defining each field to extract. Each field definition includes:
* `name (str)`: Output key for the field.
* `selector (str)`: CSS/XPath selector relative to the `baseSelector` or parent "nested" element.
* `type (str)`: "text", "attribute", "html", "list", "nested".
* `attribute (str, Optional)`: Name of HTML attribute (if type is "attribute").
* `fields (List[Dict], Optional)`: For "list" (of objects) or "nested" types.
* `transform (str, Optional)`: e.g., "lowercase".
* `default (Any, Optional)`: Default value if not found.
* 6.3. Schemas for LLM-based Extraction (`LLMExtractionStrategy`):
* 6.3.1. Format: `LLMExtractionStrategy` accepts schemas in two main formats when `extraction_type="schema"`:
* Pydantic models: The Pydantic model class itself.
* Dictionary: A Python dictionary representing a valid JSON schema.
* 6.3.2. Pydantic Models:
* Definition: Users can define a Pydantic `BaseModel` where each field represents a piece of data to be extracted. Field types are derived automatically from the model's annotations, and per-field descriptions can be supplied via `Field(..., description=...)` metadata.
* Conversion: `LLMExtractionStrategy` internally converts the Pydantic model to its JSON schema representation (`model_json_schema()`) to guide the LLM.
* 6.3.3. Dictionary-based JSON Schema:
* Structure: Users can provide a dictionary that conforms to the JSON Schema specification. This typically includes a `type: "object"` at the root and a `properties` dictionary defining each field, its type (e.g., "string", "number", "array", "object"), and optionally a `description`.
* Usage: This schema is passed to the LLM to instruct it on the desired output format.
## 7. Configuration with `CrawlerRunConfig`
* 7.1. Purpose: The `CrawlerRunConfig` class (from `crawl4ai.async_configs`) is used to configure the behavior of a specific `arun()` or `arun_many()` call on an `AsyncWebCrawler` instance. It allows specifying various runtime parameters, including the extraction and chunking strategies.
* 7.2. Key Attributes:
* `extraction_strategy (Optional[ExtractionStrategy], default: None)`:
* Purpose: Specifies the `ExtractionStrategy` instance to be used for processing the content obtained during the crawl. If `None`, no structured extraction beyond basic Markdown generation occurs (unless a default is applied by the crawler).
* Type: An instance of a class inheriting from `ExtractionStrategy`.
* `chunking_strategy (Optional[ChunkingStrategy], default: RegexChunking())`:
* Purpose: Specifies the `ChunkingStrategy` instance to be used for breaking down content into smaller pieces before it's passed to an `ExtractionStrategy` (particularly `LLMExtractionStrategy`).
* Type: An instance of a class inheriting from `ChunkingStrategy`.
* Default: An instance of `RegexChunking()` with its default parameters (paragraph-based splitting).
## 8. LLM-Specific Configuration and Models
* ### 8.1. Class `LLMConfig`
* 8.1.1. Purpose: The `LLMConfig` class (from `crawl4ai.async_configs`) centralizes configuration parameters for interacting with Large Language Models (LLMs) through various providers.
* 8.1.2. Initialization (`__init__`):
* 8.1.2.1. Signature:
```python
class LLMConfig:
def __init__(
self,
provider: str = DEFAULT_PROVIDER,
api_token: Optional[str] = None,
base_url: Optional[str] = None,
temperature: Optional[float] = None,
max_tokens: Optional[int] = None,
top_p: Optional[float] = None,
frequency_penalty: Optional[float] = None,
presence_penalty: Optional[float] = None,
stop: Optional[List[str]] = None,
n: Optional[int] = None,
): ...
```
* 8.1.2.2. Parameters:
* `provider (str, default: DEFAULT_PROVIDER)`: Specifies the LLM provider and model, e.g., "openai/gpt-4o-mini", "ollama/llama3.3". `DEFAULT_PROVIDER` is "openai/gpt-4o-mini".
* `api_token (Optional[str], default: None)`: API token for the LLM provider. If `None`, the system attempts to read it from environment variables (e.g., `OPENAI_API_KEY`, `GEMINI_API_KEY`, `GROQ_API_KEY` based on provider). Can also be prefixed with "env:" (e.g., "env:MY_CUSTOM_LLM_KEY").
* `base_url (Optional[str], default: None)`: Custom base URL for the LLM API endpoint, for self-hosted or alternative provider endpoints.
* `temperature (Optional[float], default: None)`: Controls randomness in LLM generation. Higher values (e.g., 0.8) make output more random, lower (e.g., 0.2) more deterministic.
* `max_tokens (Optional[int], default: None)`: Maximum number of tokens the LLM should generate in its response.
* `top_p (Optional[float], default: None)`: Nucleus sampling parameter. An alternative to temperature; controls the cumulative probability mass of tokens considered for generation.
* `frequency_penalty (Optional[float], default: None)`: Penalizes new tokens based on their existing frequency in the text so far, decreasing repetition.
* `presence_penalty (Optional[float], default: None)`: Penalizes new tokens based on whether they have appeared in the text so far, encouraging new topics.
* `stop (Optional[List[str]], default: None)`: A list of sequences where the API will stop generating further tokens.
* `n (Optional[int], default: None)`: Number of completions to generate for each prompt.
* 8.1.3. Helper Methods:
* `from_kwargs(kwargs: dict) -> LLMConfig`:
* Description: [Static method] Creates an `LLMConfig` instance from a dictionary of keyword arguments.
* `to_dict() -> dict`:
* Description: Converts the `LLMConfig` instance into a dictionary representation.
* `clone(**kwargs) -> LLMConfig`:
* Description: Creates a new `LLMConfig` instance as a copy of the current one, allowing specific attributes to be overridden with `kwargs`.
* ### 8.2. Dataclass `TokenUsage`
* 8.2.1. Purpose: The `TokenUsage` dataclass (from `crawl4ai.models`) is used to store information about the number of tokens consumed during an LLM API call.
* 8.2.2. Fields:
* `completion_tokens (int, default: 0)`: The number of tokens generated by the LLM in the completion.
* `prompt_tokens (int, default: 0)`: The number of tokens in the prompt sent to the LLM.
* `total_tokens (int, default: 0)`: The sum of `completion_tokens` and `prompt_tokens`.
* `completion_tokens_details (Optional[dict], default: None)`: Provider-specific detailed breakdown of completion tokens, if available.
* `prompt_tokens_details (Optional[dict], default: None)`: Provider-specific detailed breakdown of prompt tokens, if available.
## 9. PDF Processing and Extraction
* ### 9.1. Overview of PDF Processing
* 9.1.1. Purpose: Crawl4ai provides specialized strategies to handle PDF documents, enabling the fetching of PDF content and subsequent extraction of text, images, and metadata. This allows PDFs to be treated as a primary content source similar to HTML web pages.
* 9.1.2. Key Components:
* `PDFCrawlerStrategy`: For fetching/identifying PDF content.
* `PDFContentScrapingStrategy`: For processing PDF content using an underlying PDF processor.
* `NaivePDFProcessorStrategy`: The default logic for parsing PDF files.
* ### 9.2. Class `PDFCrawlerStrategy`
* 9.2.1. Purpose: An implementation of `AsyncCrawlerStrategy` specifically for handling PDF documents. It doesn't perform typical browser interactions but focuses on fetching PDF content and setting the appropriate response headers to indicate a PDF document, which then allows `PDFContentScrapingStrategy` to process it.
* 9.2.2. Inheritance: `AsyncCrawlerStrategy` (from `crawl4ai.async_crawler_strategy`)
* 9.2.3. Initialization (`__init__`):
* 9.2.3.1. Signature: `PDFCrawlerStrategy(logger: Optional[AsyncLogger] = None)`
* 9.2.3.2. Parameters:
* `logger (Optional[AsyncLogger], default: None)`: An optional logger instance for logging messages.
* 9.2.4. Key Public Methods:
* `crawl(self, url: str, **kwargs) -> AsyncCrawlResponse`:
* Description: Fetches the content from the given `url`. If the content is identified as a PDF (either by URL extension or `Content-Type` header for remote URLs), it sets `response_headers={"Content-Type": "application/pdf"}` in the returned `AsyncCrawlResponse`. The `html` field of the response will contain a placeholder message as the actual PDF processing happens in the scraping strategy.
* `close(self) -> None`:
* Description: Placeholder for cleanup, typically does nothing in this strategy.
* `__aenter__(self) -> "PDFCrawlerStrategy"`:
* Description: Async context manager entry point.
* `__aexit__(self, exc_type, exc_val, exc_tb) -> None`:
* Description: Async context manager exit point, calls `close()`.
* ### 9.3. Class `PDFContentScrapingStrategy`
* 9.3.1. Purpose: An implementation of `ContentScrapingStrategy` designed to process PDF documents. It uses an underlying `PDFProcessorStrategy` (by default, `NaivePDFProcessorStrategy`) to extract text, images, and metadata from the PDF, then formats this information into a `ScrapingResult`.
* 9.3.2. Inheritance: `ContentScrapingStrategy` (from `crawl4ai.content_scraping_strategy`)
* 9.3.3. Initialization (`__init__`):
* 9.3.3.1. Signature: `PDFContentScrapingStrategy(save_images_locally: bool = False, extract_images: bool = False, image_save_dir: Optional[str] = None, batch_size: int = 4, logger: Optional[AsyncLogger] = None)`
* 9.3.3.2. Parameters:
* `save_images_locally (bool, default: False)`: If `True`, extracted images will be saved to the local filesystem.
* `extract_images (bool, default: False)`: If `True`, the strategy will attempt to extract images from the PDF.
* `image_save_dir (Optional[str], default: None)`: The directory where extracted images will be saved if `save_images_locally` is `True`. If `None`, a default or temporary directory might be used.
* `batch_size (int, default: 4)`: The number of PDF pages to process in parallel by the underlying `NaivePDFProcessorStrategy`.
* `logger (Optional[AsyncLogger], default: None)`: An optional logger instance.
* 9.3.4. Key Attributes:
* `pdf_processor (NaivePDFProcessorStrategy)`: An instance of `NaivePDFProcessorStrategy` configured with the provided image and batch settings, used to do the actual PDF parsing.
* 9.3.5. Key Public Methods:
* `scrape(self, url: str, html: str, **params) -> ScrapingResult`:
* Description: Takes a `url` (which can be a local file path or a remote HTTP/HTTPS URL pointing to a PDF) and processes it. The `html` parameter is typically a placeholder like "Scraper will handle the real work" as the content comes from the PDF file itself. It downloads remote PDFs to a temporary local file before processing.
* Returns: `ScrapingResult` containing the extracted PDF data, including `cleaned_html` (concatenated HTML of pages), `media` (extracted images), `links`, and `metadata`.
* `ascrape(self, url: str, html: str, **kwargs) -> ScrapingResult`:
* Description: Asynchronous version of `scrape`. Internally calls `scrape` using `asyncio.to_thread`.
* 9.3.6. Internal Methods (Conceptual):
* `_get_pdf_path(self, url: str) -> str`:
* Description: If `url` is an HTTP/HTTPS URL, downloads the PDF to a temporary file and returns its path. If `url` starts with "file://", it strips the prefix and returns the local path. Otherwise, assumes `url` is already a local path. Handles download timeouts and errors.
* ### 9.4. Class `NaivePDFProcessorStrategy`
* 9.4.1. Purpose: The default implementation of `PDFProcessorStrategy` in Crawl4ai. It uses the PyPDF2 library (and Pillow for image processing) to parse PDF files, extract text content page by page, attempt to extract embedded images, and gather document metadata.
* 9.4.2. Inheritance: `PDFProcessorStrategy` (from `crawl4ai.processors.pdf.processor`)
* 9.4.3. Dependencies: Requires `PyPDF2` and `Pillow`. These are installed with the `crawl4ai[pdf]` extra.
* 9.4.4. Initialization (`__init__`):
* 9.4.4.1. Signature: `NaivePDFProcessorStrategy(image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True, save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4)`
* 9.4.4.2. Parameters:
* `image_dpi (int, default: 144)`: DPI used when rendering PDF pages to images (if direct image extraction is not possible or disabled).
* `image_quality (int, default: 85)`: Quality setting (1-100) for images saved in lossy formats like JPEG.
* `extract_images (bool, default: True)`: If `True`, attempts to extract embedded images directly from the PDF's XObjects.
* `save_images_locally (bool, default: False)`: If `True`, extracted images are saved to disk. Otherwise, they are base64 encoded and returned in the `PDFPage.images` data.
* `image_save_dir (Optional[Path], default: None)`: If `save_images_locally` is True, this specifies the directory to save images. If `None`, a temporary directory (prefixed `pdf_images_`) is created and used.
* `batch_size (int, default: 4)`: The number of pages to process in parallel when using the `process_batch` method.
* 9.4.5. Key Public Methods:
* `process(self, pdf_path: Path) -> PDFProcessResult`:
* Description: Processes the PDF specified by `pdf_path` page by page sequentially.
* Returns: `PDFProcessResult` containing metadata and a list of `PDFPage` objects.
* `process_batch(self, pdf_path: Path) -> PDFProcessResult`:
* Description: Processes the PDF specified by `pdf_path` by handling pages in parallel batches using a `ThreadPoolExecutor` with `max_workers` set to `batch_size`.
* Returns: `PDFProcessResult` containing metadata and a list of `PDFPage` objects, assembled in the correct page order.
* 9.4.6. Internal Methods (Conceptual High-Level):
* `_process_page(self, page: PyPDF2PageObject, image_dir: Optional[Path]) -> PDFPage`: Extracts text, images (if `extract_images` is True), and links from a single PyPDF2 page object.
* `_extract_images(self, page: PyPDF2PageObject, image_dir: Optional[Path]) -> List[Dict]`: Iterates through XObjects on a page, identifies images, decodes them (handling FlateDecode, DCTDecode, CCITTFaxDecode, JPXDecode), and either saves them locally or base64 encodes them.
* `_extract_links(self, page: PyPDF2PageObject) -> List[str]`: Extracts URI actions from page annotations to get hyperlinks.
* `_extract_metadata(self, pdf_path: Path, reader: PyPDF2PdfReader) -> PDFMetadata`: Reads metadata from the PDF document information dictionary (e.g., /Title, /Author, /CreationDate).
* ### 9.5. Data Models for PDF Processing
* 9.5.1. Dataclass `PDFMetadata` (from `crawl4ai.processors.pdf.processor`)
* Fields:
* `title (Optional[str], default: None)`
* `author (Optional[str], default: None)`
* `producer (Optional[str], default: None)`
* `created (Optional[datetime], default: None)`
* `modified (Optional[datetime], default: None)`
* `pages (int, default: 0)`
* `encrypted (bool, default: False)`
* `file_size (Optional[int], default: None)`
* 9.5.2. Dataclass `PDFPage` (from `crawl4ai.processors.pdf.processor`)
* Fields:
* `page_number (int)`
* `raw_text (str, default: "")`
* `markdown (str, default: "")`: Markdown representation of the page's text content, processed by `clean_pdf_text`.
* `html (str, default: "")`: HTML representation of the page's text content, processed by `clean_pdf_text_to_html`.
* `images (List[Dict], default_factory: list)`: List of image dictionaries. Each dictionary contains:
* `format (str)`: e.g., "png", "jpeg", "tiff", "jp2", "bin".
* `width (int)`
* `height (int)`
* `color_space (str)`: e.g., "/DeviceRGB", "/DeviceGray".
* `bits_per_component (int)`
* `path (str, Optional)`: If `save_images_locally` was True, path to the saved image file.
* `data (str, Optional)`: If `save_images_locally` was False, base64 encoded image data.
* `page (int)`: The page number this image was extracted from.
* `links (List[str], default_factory: list)`: List of hyperlink URLs found on the page.
* `layout (List[Dict], default_factory: list)`: List of dictionaries representing text layout elements, primarily: `{"type": "text", "text": str, "x": float, "y": float}`.
* 9.5.3. Dataclass `PDFProcessResult` (from `crawl4ai.processors.pdf.processor`)
* Fields:
* `metadata (PDFMetadata)`
* `pages (List[PDFPage])`
* `processing_time (float, default: 0.0)`: Time in seconds taken to process the PDF.
* `version (str, default: "1.1")`: Version of the PDF processor strategy (e.g., "1.1" for current `NaivePDFProcessorStrategy`).
* ### 9.6. Using PDF Strategies with `AsyncWebCrawler`
* 9.6.1. Workflow:
1. Instantiate `AsyncWebCrawler`, setting its `crawler_strategy` parameter to a `PDFCrawlerStrategy` instance when you intend to crawl PDF URLs or local PDF files directly. When crawling mixed content in which PDFs are discovered via links on HTML pages, the default `AsyncPlaywrightCrawlerStrategy` can be used initially, with a PDF-specific scraping strategy applied once a PDF content type is detected.
2. In `CrawlerRunConfig`, set the `scraping_strategy` attribute to an instance of `PDFContentScrapingStrategy`. Configure this strategy with desired options like `extract_images`, `save_images_locally`, etc.
3. When `crawler.arun(url="path/to/document.pdf", config=run_config)` is called for a PDF URL or local file path:
* `PDFCrawlerStrategy` (if used) or the default crawler strategy fetches the file.
* `PDFContentScrapingStrategy.scrape()` is invoked. It uses its internal `NaivePDFProcessorStrategy` instance to parse the PDF.
* The extracted text, image data, and metadata are populated into the `CrawlResult` object (e.g., `result.markdown`, `result.media["images"]`, `result.metadata`).
* 9.6.2. Example Snippet:
```python
import asyncio
from pathlib import Path
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
async def main():
# Setup for PDF processing
pdf_crawler_strategy = PDFCrawlerStrategy() # Use if directly targeting PDF URLs
pdf_scraping_strategy = PDFContentScrapingStrategy(
extract_images=True,
save_images_locally=True,
image_save_dir="./pdf_images_output" # Ensure this directory exists
)
Path("./pdf_images_output").mkdir(parents=True, exist_ok=True)
# If crawling a website that links to PDFs, you might use the default crawler strategy
# and rely on content-type detection to switch to PDFContentScrapingStrategy if needed.
# For direct PDF URL:
async with AsyncWebCrawler(crawler_strategy=pdf_crawler_strategy) as crawler:
run_config = CrawlerRunConfig(scraping_strategy=pdf_scraping_strategy)
# Example PDF URL (replace with a real one for testing)
pdf_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
result = await crawler.arun(url=pdf_url, config=run_config)
if result.success:
print(f"Successfully processed PDF: {result.url}")
if result.markdown:
print(f"Markdown content (first 500 chars): {result.markdown.raw_markdown[:500]}")
if result.media and result.media.images:
print(f"Extracted {len(result.media.images)} images.")
for img in result.media.images:
print(f" - Image source/path: {img.src or img.path}, Page: {img.page}")
if result.metadata:
print(f"PDF Metadata: {result.metadata}")
else:
print(f"Failed to process PDF: {result.url}, Error: {result.error_message}")
# if __name__ == "__main__":
# asyncio.run(main())
```
```

File diff suppressed because it is too large Load Diff

View File

@@ -1,818 +0,0 @@
# Examples Outline for crawl4ai - markdown Component
**Target Document Type:** Examples Collection
**Target Output Filename Suggestion:** `llm_examples_markdown.md`
**Library Version Context:** 0.6.3
**Outline Generation Date:** 2025-05-24
---
This document provides practical, runnable code examples for the `markdown` component of the `crawl4ai` library, focusing on the `DefaultMarkdownGenerator` and its various configurations.
## 1. Basic Markdown Generation with `DefaultMarkdownGenerator`
### 1.1. Example: Generating Markdown with default `DefaultMarkdownGenerator` settings via `AsyncWebCrawler`.
This example demonstrates the most basic usage of `DefaultMarkdownGenerator` within an `AsyncWebCrawler` run.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
async def basic_markdown_generation_via_crawler():
# DefaultMarkdownGenerator will be used by default if markdown_generator is not specified,
# but we explicitly set it here for clarity.
md_generator = DefaultMarkdownGenerator()
config = CrawlerRunConfig(
markdown_generator=md_generator,
cache_mode=CacheMode.BYPASS # Use BYPASS for fresh content in examples
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://example.com", config=config)
if result.success and result.markdown:
print("--- Raw Markdown (First 300 chars) ---")
print(result.markdown.raw_markdown[:300])
print("\n--- Markdown with Citations (First 300 chars) ---")
print(result.markdown.markdown_with_citations[:300])
print("\n--- References Markdown ---")
print(result.markdown.references_markdown) # example.com has no outbound links usually
else:
print(f"Crawl failed: {result.error_message}")
if __name__ == "__main__":
asyncio.run(basic_markdown_generation_via_crawler())
```
---
### 1.2. Example: Direct instantiation and use of `DefaultMarkdownGenerator`.
You can use `DefaultMarkdownGenerator` directly if you already have HTML content.
```python
from crawl4ai import DefaultMarkdownGenerator
def direct_markdown_generation():
generator = DefaultMarkdownGenerator()
html_content = """
<html>
<head><title>Test Page</title></head>
<body>
<h1>Welcome to Example</h1>
<p>This is a paragraph with a <a href="https://example.org/another-page">link</a>.</p>
<p>Another paragraph follows.</p>
</body>
</html>
"""
# base_url is important for resolving relative links if any, and for citation context
result_md = generator.generate_markdown(input_html=html_content, base_url="https://example.com")
print("--- Raw Markdown (Direct Generation) ---")
print(result_md.raw_markdown)
print("\n--- Markdown with Citations (Direct Generation) ---")
print(result_md.markdown_with_citations)
print("\n--- References Markdown (Direct Generation) ---")
print(result_md.references_markdown)
if __name__ == "__main__":
direct_markdown_generation()
```
---
## 2. Citation Management in Markdown
### 2.1. Example: Default citation behavior (citations enabled).
By default, `DefaultMarkdownGenerator` generates citations for links.
```python
from crawl4ai import DefaultMarkdownGenerator
def default_citation_behavior():
generator = DefaultMarkdownGenerator()
html_content = """
<html><body>
<p>Check out <a href="https://crawl4ai.com" title="Crawl4ai Homepage">Crawl4ai</a> and
<a href="/docs">our documentation</a>.</p>
</body></html>
"""
result_md = generator.generate_markdown(input_html=html_content, base_url="https://example.com")
print("--- Raw Markdown ---")
print(result_md.raw_markdown)
print("\n--- Markdown with Citations ---")
print(result_md.markdown_with_citations)
print("\n--- References Markdown ---")
print(result_md.references_markdown)
if __name__ == "__main__":
default_citation_behavior()
```
---
### 2.2. Example: Disabling citations in `DefaultMarkdownGenerator`.
You can disable citation generation by setting `citations=False` in the `generate_markdown` method.
```python
from crawl4ai import DefaultMarkdownGenerator
def disabling_citations():
generator = DefaultMarkdownGenerator()
html_content = """
<html><body>
<p>A link to <a href="https://anothersite.com">another site</a> will not be cited.</p>
</body></html>
"""
# Disable citations for this specific call
result_md_no_citations = generator.generate_markdown(
input_html=html_content,
base_url="https://example.com",
citations=False
)
print("--- Raw Markdown (Citations Disabled) ---")
print(result_md_no_citations.raw_markdown)
print("\n--- Markdown with Citations (Citations Disabled) ---")
# This should be the same as raw_markdown when citations=False
print(result_md_no_citations.markdown_with_citations)
print("\n--- References Markdown (Citations Disabled) ---")
# This should be empty or minimal
print(result_md_no_citations.references_markdown)
# For comparison, with citations enabled (default)
result_md_with_citations = generator.generate_markdown(
input_html=html_content,
base_url="https://example.com",
citations=True # Default
)
print("\n--- For Comparison: Markdown with Citations (Enabled) ---")
print(result_md_with_citations.markdown_with_citations)
print("\n--- For Comparison: References Markdown (Enabled) ---")
print(result_md_with_citations.references_markdown)
if __name__ == "__main__":
disabling_citations()
```
---
### 2.3. Example: Impact of `base_url` on citation links for relative URLs.
The `base_url` parameter is crucial for correctly resolving relative URLs in your HTML content into absolute URLs in the references.
```python
from crawl4ai import DefaultMarkdownGenerator
def base_url_impact_on_citations():
generator = DefaultMarkdownGenerator()
html_content = """
<html><body>
<p>Links: <a href="/features">Features</a>, <a href="pricing.html">Pricing</a>,
and an absolute link to <a href="https://external.com/resource">External Resource</a>.</p>
</body></html>
"""
print("--- Case 1: With base_url='https://example.com/products/' ---")
result_md_case1 = generator.generate_markdown(
input_html=html_content,
base_url="https://example.com/products/"
)
print(result_md_case1.references_markdown)
print("\n--- Case 2: With base_url='https://another-domain.net/' ---")
result_md_case2 = generator.generate_markdown(
input_html=html_content,
base_url="https://another-domain.net/"
)
print(result_md_case2.references_markdown)
print("\n--- Case 3: Without base_url (relative links might be incomplete) ---")
result_md_case3 = generator.generate_markdown(input_html=html_content)
print(result_md_case3.references_markdown)
if __name__ == "__main__":
base_url_impact_on_citations()
```
---
### 2.4. Example: Handling HTML with no links (empty `references_markdown`).
If the input HTML contains no hyperlinks, the `references_markdown` will be empty.
```python
from crawl4ai import DefaultMarkdownGenerator
def no_links_in_html():
generator = DefaultMarkdownGenerator()
html_content = "<html><body><p>This is a paragraph with no links at all.</p><b>Just some bold text.</b></body></html>"
result_md = generator.generate_markdown(input_html=html_content, base_url="https://example.com")
print("--- Raw Markdown ---")
print(result_md.raw_markdown)
print("\n--- Markdown with Citations ---")
print(result_md.markdown_with_citations) # Should be same as raw_markdown
print("\n--- References Markdown ---")
print(f"'{result_md.references_markdown}'") # Should be empty or contain minimal boilerplate
if __name__ == "__main__":
no_links_in_html()
```
---
## 3. Controlling `html2text` Conversion Options
The `DefaultMarkdownGenerator` uses the `html2text` library internally. You can pass options to `html2text` either during generator initialization (`options` parameter) or during the `generate_markdown` call (`html2text_options` parameter).
### 3.1. Example: Initializing `DefaultMarkdownGenerator` with `options` to ignore links.
This will prevent links from appearing in the Markdown output altogether (different from `citations=False` which keeps link text but omits citation markers).
```python
from crawl4ai import DefaultMarkdownGenerator
def ignore_links_option():
# Initialize with html2text option to ignore links
generator = DefaultMarkdownGenerator(options={"ignore_links": True})
html_content = "<html><body><p>A link to <a href='https://example.com'>Example Site</a> and some text.</p></body></html>"
result_md = generator.generate_markdown(input_html=html_content)
print("--- Markdown (ignore_links=True) ---")
print(result_md.raw_markdown) # Link text might be present or absent based on html2text behavior
print("--- Markdown with Citations (ignore_links=True) ---")
print(result_md.markdown_with_citations) # No citations as links are ignored
print("--- References (ignore_links=True) ---")
print(f"'{result_md.references_markdown}'") # Should be empty
if __name__ == "__main__":
ignore_links_option()
```
---
### 3.2. Example: Initializing `DefaultMarkdownGenerator` with `options` to ignore images.
This will prevent image references (like `![alt text](src)`) from appearing in the Markdown.
```python
from crawl4ai import DefaultMarkdownGenerator
def ignore_images_option():
generator = DefaultMarkdownGenerator(options={"ignore_images": True})
html_content = "<html><body><p>An image: <img src='image.png' alt='My Test Image'></p></body></html>"
result_md = generator.generate_markdown(input_html=html_content)
print("--- Markdown (ignore_images=True) ---")
print(result_md.raw_markdown) # Image markdown should be absent
if __name__ == "__main__":
ignore_images_option()
```
---
### 3.3. Example: Initializing `DefaultMarkdownGenerator` with `options` for `body_width=0` (no line wrapping).
`body_width=0` tells `html2text` not to wrap lines.
```python
from crawl4ai import DefaultMarkdownGenerator
def no_line_wrapping_option():
generator = DefaultMarkdownGenerator(options={"body_width": 0})
long_text = "This is a very long line of text that would normally be wrapped by html2text. " * 5
html_content = f"<html><body><p>{long_text}</p></body></html>"
result_md = generator.generate_markdown(input_html=html_content)
print("--- Markdown (body_width=0) ---")
print(result_md.raw_markdown) # Observe the long line without soft wraps
if __name__ == "__main__":
no_line_wrapping_option()
```
---
### 3.4. Example: Initializing `DefaultMarkdownGenerator` to disable emphasis.
This will remove formatting for `<em>` and `<strong>` tags.
```python
from crawl4ai import DefaultMarkdownGenerator
def ignore_emphasis_option():
generator = DefaultMarkdownGenerator(options={"ignore_emphasis": True})
html_content = "<html><body><p>Normal, <em>emphasized</em>, and <strong>strongly emphasized</strong> text.</p></body></html>"
result_md = generator.generate_markdown(input_html=html_content)
print("--- Markdown (ignore_emphasis=True) ---")
print(result_md.raw_markdown) # Emphasis should be gone
if __name__ == "__main__":
ignore_emphasis_option()
```
---
### 3.5. Example: Overriding `html2text_options` at `generate_markdown` call time.
Options passed to `generate_markdown` via `html2text_options` take precedence.
```python
from crawl4ai import DefaultMarkdownGenerator
def override_html2text_options():
# Initial generator might have some defaults
generator = DefaultMarkdownGenerator(options={"ignore_links": False})
html_content = "<html><body><p>Link: <a href='https://example.com'>Example</a>.</p></body></html>"
# Override at call time to protect links
result_md = generator.generate_markdown(
input_html=html_content,
html2text_options={"protect_links": True} # Links will be <URL>
)
print("--- Markdown (protect_links=True via call-time override) ---")
print(result_md.raw_markdown)
if __name__ == "__main__":
override_html2text_options()
```
---
### 3.6. Example: Combining multiple `html2text` options.
Multiple options can be combined for fine-grained control over the Markdown output.
```python
from crawl4ai import DefaultMarkdownGenerator
def combined_html2text_options():
generator = DefaultMarkdownGenerator(options={
"ignore_links": True,
"ignore_images": True,
"body_width": 60 # Wrap at 60 characters
})
html_content = """
<html><body>
<p>This is a paragraph with a <a href='https://example.com'>link to ignore</a> and an
<img src='image.png' alt='image to ignore'>. It also has some long text to demonstrate wrapping.
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
</p>
</body></html>
"""
result_md = generator.generate_markdown(input_html=html_content)
print("--- Markdown (Combined Options: ignore_links, ignore_images, body_width=60) ---")
print(result_md.raw_markdown)
if __name__ == "__main__":
combined_html2text_options()
```
---
## 4. Selecting the HTML Content Source for Markdown Generation
The `DefaultMarkdownGenerator` can generate Markdown from different HTML sources within the `CrawlResult`.
### 4.1. Example: Markdown from `cleaned_html` (default `content_source`).
This is the default behavior. `cleaned_html` is the HTML after `WebScrapingStrategy` (e.g., `LXMLWebScrapingStrategy`) has processed it.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
async def markdown_from_cleaned_html():
# Default content_source is "cleaned_html"
md_generator = DefaultMarkdownGenerator()
config = CrawlerRunConfig(
markdown_generator=md_generator,
cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
# Using a more complex page to see the effect of cleaning
result = await crawler.arun(url="https://news.ycombinator.com", config=config)
if result.success and result.markdown:
print("--- Markdown from Cleaned HTML (Default - First 300 chars) ---")
print(result.markdown.raw_markdown[:300])
# For comparison, show a snippet of cleaned_html
print("\n--- Cleaned HTML (Source - First 300 chars) ---")
print(result.cleaned_html[:300])
else:
print(f"Crawl failed: {result.error_message}")
if __name__ == "__main__":
asyncio.run(markdown_from_cleaned_html())
```
---
### 4.2. Example: Markdown from `raw_html`.
This example uses the original, unprocessed HTML fetched from the URL as the source for Markdown generation.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
async def markdown_from_raw_html():
md_generator = DefaultMarkdownGenerator(content_source="raw_html")
config = CrawlerRunConfig(
markdown_generator=md_generator,
cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://example.com", config=config)
if result.success and result.markdown:
print("--- Markdown from Raw HTML (First 300 chars) ---")
print(result.markdown.raw_markdown[:300])
print("\n--- Raw Page HTML (Source - First 300 chars for comparison) ---")
print(result.html[:300]) # result.html contains the raw HTML
else:
print(f"Crawl failed: {result.error_message}")
if __name__ == "__main__":
asyncio.run(markdown_from_raw_html())
```
---
### 4.3. Example: Markdown from `fit_html` (requires a `ContentFilterStrategy`).
`fit_html` is the HTML content after a `ContentFilterStrategy` (like `PruningContentFilter`) has processed it.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter
async def markdown_from_fit_html():
# A content filter must run to produce fit_html
pruning_filter = PruningContentFilter()
md_generator = DefaultMarkdownGenerator(
content_filter=pruning_filter,
content_source="fit_html" # Explicitly use the output of the filter
)
config = CrawlerRunConfig(
markdown_generator=md_generator,
cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
# Using a news site which PruningContentFilter can work on
result = await crawler.arun(url="https://news.ycombinator.com", config=config)
if result.success and result.markdown:
print("--- Markdown from Fit HTML (Output of PruningFilter - First 300 chars) ---")
# When content_source="fit_html", result.markdown.raw_markdown IS from fit_html
print(result.markdown.raw_markdown[:300])
print("\n--- Fit HTML itself (Source - First 300 chars for comparison) ---")
print(result.markdown.fit_html[:300])
else:
print(f"Crawl failed: {result.error_message}")
if __name__ == "__main__":
asyncio.run(markdown_from_fit_html())
```
---
## 5. Integration with Content Filters
`DefaultMarkdownGenerator` can work in conjunction with `ContentFilterStrategy` instances. If a filter is provided, it will produce `fit_html` and `fit_markdown`.
### 5.1. Example: `DefaultMarkdownGenerator` with `PruningContentFilter`.
The `PruningContentFilter` attempts to remove boilerplate and keep main content.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter
async def md_with_pruning_filter():
pruning_filter = PruningContentFilter()
# By default, raw_markdown is from cleaned_html, fit_markdown is from fit_html
md_generator = DefaultMarkdownGenerator(content_filter=pruning_filter)
config = CrawlerRunConfig(
markdown_generator=md_generator,
cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://news.ycombinator.com", config=config)
if result.success and result.markdown:
print("--- Raw Markdown (from cleaned_html - First 200 chars) ---")
print(result.markdown.raw_markdown[:200])
print("\n--- Fit Markdown (from PruningFilter's fit_html - First 200 chars) ---")
print(result.markdown.fit_markdown[:200])
print("\n--- Fit HTML (Source for Fit Markdown - First 200 chars) ---")
print(result.markdown.fit_html[:200])
else:
print(f"Crawl failed: {result.error_message}")
if __name__ == "__main__":
asyncio.run(md_with_pruning_filter())
```
---
### 5.2. Example: `DefaultMarkdownGenerator` with `BM25ContentFilter`.
`BM25ContentFilter` filters content based on relevance to a user query.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
from crawl4ai.content_filter_strategy import BM25ContentFilter
async def md_with_bm25_filter():
bm25_filter = BM25ContentFilter(user_query="Python programming language features")
md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
config = CrawlerRunConfig(
markdown_generator=md_generator,
cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
# Using a relevant page for the query
result = await crawler.arun(url="https://docs.python.org/3/tutorial/classes.html", config=config)
if result.success and result.markdown:
print("--- Fit Markdown (from BM25Filter - First 300 chars) ---")
print(result.markdown.fit_markdown[:300])
else:
print(f"Crawl failed: {result.error_message}")
if __name__ == "__main__":
asyncio.run(md_with_bm25_filter())
```
---
### 5.3. Example: `DefaultMarkdownGenerator` with `LLMContentFilter`.
`LLMContentFilter` uses an LLM to intelligently filter or summarize content based on instructions. (Requires API Key)
```python
import asyncio
import os
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, LLMConfig, CacheMode
from crawl4ai.content_filter_strategy import LLMContentFilter
async def md_with_llm_filter():
openai_api_key = os.getenv("OPENAI_API_KEY")
if not openai_api_key:
print("OPENAI_API_KEY not found. Skipping LLMContentFilter example.")
return
llm_config = LLMConfig(api_token=openai_api_key, provider="openai/gpt-3.5-turbo")
llm_filter = LLMContentFilter(
llm_config=llm_config,
instruction="Summarize the main arguments presented in this Hacker News discussion thread."
)
md_generator = DefaultMarkdownGenerator(content_filter=llm_filter)
config = CrawlerRunConfig(
markdown_generator=md_generator,
cache_mode=CacheMode.BYPASS # Fresh run for LLM
)
async with AsyncWebCrawler() as crawler:
# Example Hacker News discussion
result = await crawler.arun(url="https://news.ycombinator.com/item?id=39000000", config=config) # A past popular item
if result.success and result.markdown:
print("--- Fit Markdown (from LLMContentFilter - First 500 chars) ---")
print(result.markdown.fit_markdown[:500])
llm_filter.show_usage() # Show token usage
else:
print(f"Crawl failed: {result.error_message}")
if __name__ == "__main__":
asyncio.run(md_with_llm_filter())
```
---
### 5.4. Example: Forcing Markdown generation from `fit_html` when a filter is active.
This example shows how to ensure the `raw_markdown` itself is generated from the `fit_html` (output of the filter) rather than `cleaned_html`.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter
async def md_forced_from_fit_html():
pruning_filter = PruningContentFilter()
# Explicitly set content_source to "fit_html"
md_generator = DefaultMarkdownGenerator(
content_filter=pruning_filter,
content_source="fit_html"
)
config = CrawlerRunConfig(
markdown_generator=md_generator,
cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://news.ycombinator.com", config=config)
if result.success and result.markdown:
print("--- Raw Markdown (forced from fit_html - First 300 chars) ---")
# This raw_markdown is now generated from the output of PruningFilter
print(result.markdown.raw_markdown[:300])
print("\n--- Fit HTML (Source for Raw Markdown - First 300 chars) ---")
print(result.markdown.fit_html[:300])
print("\n--- Fit Markdown (should be same as Raw Markdown here - First 300 chars) ---")
print(result.markdown.fit_markdown[:300])
else:
print(f"Crawl failed: {result.error_message}")
if __name__ == "__main__":
asyncio.run(md_forced_from_fit_html())
```
---
### 5.5. Example: Markdown generation when no filter is active.
If no `content_filter` is provided to `DefaultMarkdownGenerator`, `fit_markdown` and `fit_html` will be empty or None.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
async def md_no_filter():
md_generator = DefaultMarkdownGenerator() # No filter provided
config = CrawlerRunConfig(
markdown_generator=md_generator,
cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://example.com", config=config)
if result.success and result.markdown:
print("--- Raw Markdown (First 300 chars) ---")
print(result.markdown.raw_markdown[:300])
print("\n--- Fit Markdown (Expected: None or empty) ---")
print(result.markdown.fit_markdown)
print("\n--- Fit HTML (Expected: None or empty) ---")
print(result.markdown.fit_html)
else:
print(f"Crawl failed: {result.error_message}")
if __name__ == "__main__":
asyncio.run(md_no_filter())
```
---
## 6. Understanding `MarkdownGenerationResult` Output Fields
### 6.1. Example: Accessing all fields of `MarkdownGenerationResult`.
This example demonstrates how to access all the different Markdown and HTML outputs available in the `MarkdownGenerationResult` object.
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter # Using a filter to populate fit_html/fit_markdown
async def access_all_markdown_fields():
# Setup with a filter to ensure fit_html and fit_markdown are generated
content_filter = PruningContentFilter()
md_generator = DefaultMarkdownGenerator(
content_filter=content_filter,
content_source="cleaned_html" # raw_markdown will be from cleaned_html
)
config = CrawlerRunConfig(
markdown_generator=md_generator,
cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
# Using a content-rich page
result = await crawler.arun(url="https://en.wikipedia.org/wiki/Python_(programming_language)", config=config)
if result.success and result.markdown:
md_result = result.markdown
print("--- Accessing MarkdownGenerationResult Fields ---")
print(f"\n1. Raw Markdown (from '{md_generator.content_source}' - snippet):")
print(md_result.raw_markdown[:300] + "...")
print(f"\n2. Markdown with Citations (snippet):")
print(md_result.markdown_with_citations[:300] + "...")
print(f"\n3. References Markdown (snippet):")
print(md_result.references_markdown[:200] + "...")
print(f"\n4. Fit HTML (from ContentFilter - snippet):")
if md_result.fit_html:
print(md_result.fit_html[:300] + "...")
else:
print("None (No filter or filter produced no output)")
print(f"\n5. Fit Markdown (from fit_html - snippet):")
if md_result.fit_markdown:
print(md_result.fit_markdown[:300] + "...")
else:
print("None (No filter or filter produced no output)")
else:
print(f"Crawl failed: {result.error_message}")
if __name__ == "__main__":
asyncio.run(access_all_markdown_fields())
```
---
## 7. Advanced and Specific Scenarios
### 7.1. Example: Handling HTML with complex table structures.
`DefaultMarkdownGenerator` (via `html2text`) attempts to render HTML tables into Markdown tables.
```python
from crawl4ai import DefaultMarkdownGenerator
def markdown_for_tables():
generator = DefaultMarkdownGenerator()
html_content = """
<html><body>
<h3>Product Comparison</h3>
<table>
<thead>
<tr><th>Feature</th><th>Product A</th><th>Product B</th></tr>
</thead>
<tbody>
<tr><td>Price</td><td>$100</td><td>$120</td></tr>
<tr><td>Rating</td><td>4.5 stars</td><td>4.2 stars</td></tr>
<tr><td>Multi-row<br/>Feature</td><td colspan="2">Supported by Both</td></tr>
</tbody>
</table>
</body></html>
"""
result_md = generator.generate_markdown(input_html=html_content)
print("--- Markdown for Table ---")
print(result_md.raw_markdown)
if __name__ == "__main__":
markdown_for_tables()
```
---
### 7.2. Example: Handling HTML with code blocks.
Code blocks are generally preserved in Markdown format.
```python
from crawl4ai import DefaultMarkdownGenerator
def markdown_for_code_blocks():
generator = DefaultMarkdownGenerator()
html_content = """
<html><body>
<p>Here is some Python code:</p>
<pre><code class="language-python">
def greet(name):
print(f"Hello, {name}!")
greet("World")
</code></pre>
<p>And an inline <code>example_function()</code>.</p>
</body></html>
"""
result_md = generator.generate_markdown(input_html=html_content)
print("--- Markdown for Code Blocks ---")
print(result_md.raw_markdown)
if __name__ == "__main__":
markdown_for_code_blocks()
```
---
### 7.3. Example: Using a custom `MarkdownGenerationStrategy` (conceptual).
You can create your own Markdown generation logic by subclassing `MarkdownGenerationStrategy`.
```python
import asyncio
from crawl4ai import (
AsyncWebCrawler, CrawlerRunConfig, CacheMode,
MarkdownGenerationStrategy, MarkdownGenerationResult
)
# Define a minimal custom Markdown generator
class CustomMarkdownGenerator(MarkdownGenerationStrategy):
def __init__(self, prefix="CUSTOM MD: ", **kwargs):
super().__init__(**kwargs) # Pass along any other options
self.prefix = prefix
def generate_markdown(
self,
input_html: str,
base_url: str = "",
html2text_options: dict = None, # Can be used by html2text
citations: bool = True, # Standard param
**kwargs # For other potential strategy-specific params
) -> MarkdownGenerationResult:
# Simplified custom logic: just prefix and take a snippet
# A real custom generator would do more sophisticated parsing/conversion
custom_raw_md = self.prefix + input_html[:100].strip() + "..."
# For simplicity, we'll just return the custom raw markdown for all fields
return MarkdownGenerationResult(
raw_markdown=custom_raw_md,
markdown_with_citations=custom_raw_md, # No real citation logic here
references_markdown="",
fit_markdown=None, # Not implementing filtering here
fit_html=None
)
async def use_custom_markdown_generator():
custom_generator = CustomMarkdownGenerator(prefix="[MyGenerator Says]: ")
config = CrawlerRunConfig(
markdown_generator=custom_generator,
cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://example.com", config=config)
if result.success and result.markdown:
print("--- Output from CustomMarkdownGenerator ---")
print(result.markdown.raw_markdown)
# Since our custom generator doesn't really do citations or filtering:
print(f"Citations: '{result.markdown.markdown_with_citations}'")
print(f"References: '{result.markdown.references_markdown}'")
print(f"Fit Markdown: '{result.markdown.fit_markdown}'")
else:
print(f"Crawl failed: {result.error_message}")
if __name__ == "__main__":
asyncio.run(use_custom_markdown_generator())
```
---
**End of Examples Document**
```

---

The following is the detailed "Foundational Memory" document for the `markdown` component of `crawl4ai`, adhering strictly to the provided outline and focusing on factual API details.
```markdown
# Detailed Outline for crawl4ai - markdown Component
**Target Document Type:** memory
**Target Output Filename Suggestion:** `llm_memory_markdown.md`
**Library Version Context:** 0.6.3
**Outline Generation Date:** 2025-05-24
---
## 1. Introduction to Markdown Generation in Crawl4ai
* 1.1. Purpose: This section outlines the `markdown` component of the `crawl4ai` library. Its primary role is to convert HTML content, obtained during web crawling, into various Markdown formats. These formats are designed to be suitable for consumption by Large Language Models (LLMs), as well as for other applications requiring structured text from web pages.
* 1.2. Key Abstractions:
* `MarkdownGenerationStrategy`: An abstract base class that defines the interface for different markdown generation algorithms and approaches. This allows for customizable Markdown conversion processes.
* `DefaultMarkdownGenerator`: The standard, out-of-the-box implementation of `MarkdownGenerationStrategy`. It handles the conversion of HTML to Markdown, including features like link-to-citation conversion and integration with content filtering.
* `MarkdownGenerationResult`: A Pydantic data model that encapsulates the various outputs of the markdown generation process, such as raw markdown, markdown with citations, and markdown derived from filtered content.
* `CrawlerRunConfig.markdown_generator`: An attribute within the `CrawlerRunConfig` class that allows users to specify which instance of a `MarkdownGenerationStrategy` should be used for a particular crawl operation.
* 1.3. Relationship with Content Filtering: The markdown generation process can be integrated with `RelevantContentFilter` strategies. When a content filter is applied, it first refines the input HTML, and then this filtered HTML is used to produce a `fit_markdown` output, providing a more focused version of the content.
## 2. Core Interface: `MarkdownGenerationStrategy`
* 2.1. Purpose: The `MarkdownGenerationStrategy` class is an abstract base class (ABC) that defines the contract for all markdown generation strategies within `crawl4ai`. It ensures that any custom markdown generator will adhere to a common interface, making them pluggable into the crawling process.
* 2.2. Source File: `crawl4ai/markdown_generation_strategy.py`
* 2.3. Initialization (`__init__`)
* 2.3.1. Signature:
```python
class MarkdownGenerationStrategy(ABC):
def __init__(
self,
content_filter: Optional[RelevantContentFilter] = None,
options: Optional[Dict[str, Any]] = None,
verbose: bool = False,
content_source: str = "cleaned_html",
):
# ...
```
* 2.3.2. Parameters:
* `content_filter` (`Optional[RelevantContentFilter]`, default: `None`): An optional `RelevantContentFilter` instance. If provided, this filter will be used to process the HTML before generating the `fit_markdown` and `fit_html` outputs in the `MarkdownGenerationResult`.
* `options` (`Optional[Dict[str, Any]]`, default: `None`): A dictionary for strategy-specific custom options. This allows subclasses to receive additional configuration parameters. Defaults to an empty dictionary if `None`.
* `verbose` (`bool`, default: `False`): If `True`, enables verbose logging for the markdown generation process.
* `content_source` (`str`, default: `"cleaned_html"`): A string indicating the source of HTML to use for Markdown generation. Common values might include `"raw_html"` (original HTML from the page), `"cleaned_html"` (HTML after initial cleaning by the scraping strategy), or `"fit_html"` (HTML after being processed by `content_filter`). The actual available sources depend on the `ScrapingResult` provided to the markdown generator.
* 2.4. Abstract Methods:
* 2.4.1. `generate_markdown(self, input_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult`
* Purpose: This abstract method must be implemented by concrete subclasses. It is responsible for taking an HTML string and converting it into various Markdown representations, encapsulated within a `MarkdownGenerationResult` object.
* Parameters:
* `input_html` (`str`): The HTML string content to be converted to Markdown.
* `base_url` (`str`, default: `""`): The base URL of the crawled page. This is crucial for resolving relative URLs, especially when converting links to citations.
* `html2text_options` (`Optional[Dict[str, Any]]`, default: `None`): A dictionary of options to be passed to the underlying HTML-to-text conversion engine (e.g., `CustomHTML2Text`).
* `content_filter` (`Optional[RelevantContentFilter]`, default: `None`): An optional `RelevantContentFilter` instance. If provided, this filter is used to generate `fit_markdown` and `fit_html`. This parameter overrides any filter set during the strategy's initialization for this specific call.
* `citations` (`bool`, default: `True`): A boolean flag indicating whether to convert Markdown links into a citation format (e.g., `[text]^[1]^`) with a corresponding reference list.
* `**kwargs`: Additional keyword arguments to allow for future extensions or strategy-specific parameters.
* Returns: (`MarkdownGenerationResult`) An object containing the results of the Markdown generation, including `raw_markdown`, `markdown_with_citations`, `references_markdown`, and potentially `fit_markdown` and `fit_html`.
## 3. Default Implementation: `DefaultMarkdownGenerator`
* 3.1. Purpose: `DefaultMarkdownGenerator` is the standard concrete implementation of `MarkdownGenerationStrategy`. It provides a robust mechanism for converting HTML to Markdown, featuring link-to-citation conversion and the ability to integrate with `RelevantContentFilter` strategies for focused content output.
* 3.2. Source File: `crawl4ai/markdown_generation_strategy.py`
* 3.3. Inheritance: Inherits from `MarkdownGenerationStrategy`.
* 3.4. Initialization (`__init__`)
* 3.4.1. Signature:
```python
class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
def __init__(
self,
content_filter: Optional[RelevantContentFilter] = None,
options: Optional[Dict[str, Any]] = None,
# content_source parameter from parent is available
# verbose parameter from parent is available
):
super().__init__(content_filter, options, content_source=kwargs.get("content_source", "cleaned_html"), verbose=kwargs.get("verbose", False))
```
*(Note: The provided code snippet for `DefaultMarkdownGenerator.__init__` does not explicitly list `verbose` and `content_source`, but they are passed to `super().__init__` through `**kwargs` in the actual library code, so their effective signature matches the parent.)*
* 3.4.2. Parameters:
* `content_filter` (`Optional[RelevantContentFilter]`, default: `None`): As defined in `MarkdownGenerationStrategy`.
* `options` (`Optional[Dict[str, Any]]`, default: `None`): As defined in `MarkdownGenerationStrategy`.
* `verbose` (`bool`, default: `False`): (Passed via `kwargs` to parent) As defined in `MarkdownGenerationStrategy`.
* `content_source` (`str`, default: `"cleaned_html"`): (Passed via `kwargs` to parent) As defined in `MarkdownGenerationStrategy`.
* 3.5. Key Class Attributes:
* 3.5.1. `LINK_PATTERN (re.Pattern)`: A compiled regular expression pattern used to find Markdown links. The pattern is `r'!\[(.[^\]]*)\]\(([^)]*?)(?:\s*\"(.*)\")?\)'`.
* 3.6. Key Public Methods:
* 3.6.1. `generate_markdown(self, input_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult`
* Purpose: Implements the conversion of HTML to Markdown. It uses `CustomHTML2Text` for the base conversion, handles link-to-citation transformation, and integrates with an optional `RelevantContentFilter` to produce `fit_markdown`.
* Parameters:
* `input_html` (`str`): The HTML content to convert.
* `base_url` (`str`, default: `""`): Base URL for resolving relative links.
* `html2text_options` (`Optional[Dict[str, Any]]`, default: `None`): Options for the `CustomHTML2Text` converter. If not provided, it uses `self.options`.
* `content_filter` (`Optional[RelevantContentFilter]`, default: `None`): Overrides the instance's `content_filter` for this call.
* `citations` (`bool`, default: `True`): Whether to convert links to citations.
* `**kwargs`: Additional arguments (not currently used by this specific implementation beyond parent class).
* Core Logic:
1. Instantiates `CustomHTML2Text` using `base_url` and the resolved `html2text_options` (merged from method arg, `self.options`, and defaults).
2. Converts `input_html` to `raw_markdown` using the `CustomHTML2Text` instance.
3. If `citations` is `True`, calls `self.convert_links_to_citations(raw_markdown, base_url)` to get `markdown_with_citations` and `references_markdown`.
4. If `citations` is `False`, `markdown_with_citations` is set to `raw_markdown`, and `references_markdown` is an empty string.
5. Determines the active `content_filter` (parameter or instance's `self.content_filter`).
6. If an active `content_filter` exists:
* Calls `active_filter.filter_content(input_html)` to get a list of filtered HTML strings.
* Joins these strings with `\n` and wraps them in `<div>` tags to form `fit_html`.
* Uses a new `CustomHTML2Text` instance to convert `fit_html` into `fit_markdown`.
7. Otherwise, `fit_html` and `fit_markdown` are set to `None` (or empty strings based on implementation details).
8. Constructs and returns a `MarkdownGenerationResult` object with all generated Markdown variants.
* 3.6.2. `convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]`
* Purpose: Transforms standard Markdown links within the input `markdown` string into a citation format (e.g., `[Link Text]^[1]^`) and generates a corresponding numbered list of references.
* Parameters:
* `markdown` (`str`): The input Markdown string.
* `base_url` (`str`, default: `""`): The base URL used to resolve relative link URLs before they are added to the reference list.
* Returns: (`Tuple[str, str]`) A tuple where the first element is the Markdown string with links converted to citations, and the second element is a string containing the formatted list of references.
* Internal Logic:
* Uses the `LINK_PATTERN` regex to find all Markdown links.
* For each link, it resolves the URL using `fast_urljoin(base, url)` if `base_url` is provided and the link is relative.
* Assigns a unique citation number to each unique URL.
* Replaces the original link markup with the citation format (e.g., `[Text]^[Number]^`).
* Constructs a Markdown formatted reference list string.
* 3.7. Role of `CustomHTML2Text`:
* `CustomHTML2Text` is a customized version of an HTML-to-Markdown converter, likely based on the `html2text` library.
* It's instantiated by `DefaultMarkdownGenerator` to perform the core HTML to plain Markdown conversion.
* Its behavior is controlled by options passed via `html2text_options` in `generate_markdown` or `self.options` of the `DefaultMarkdownGenerator`. These options can include `body_width`, `ignore_links`, `ignore_images`, etc., influencing the final Markdown output. (Refer to `crawl4ai/html2text.py` for specific options).
## 4. Output Data Model: `MarkdownGenerationResult`
* 4.1. Purpose: `MarkdownGenerationResult` is a Pydantic `BaseModel` designed to structure and encapsulate the various Markdown outputs generated by any `MarkdownGenerationStrategy`. It provides a consistent way to access different versions of the converted content.
* 4.2. Source File: `crawl4ai/models.py`
* 4.3. Fields:
* 4.3.1. `raw_markdown` (`str`): The direct result of converting the input HTML to Markdown, before any citation processing or specific content filtering (by the generator itself) is applied. This represents the most basic Markdown version of the content.
* 4.3.2. `markdown_with_citations` (`str`): Markdown content where hyperlinks have been converted into a citation style (e.g., `[Link Text]^[1]^`). This is typically derived from `raw_markdown`.
* 4.3.3. `references_markdown` (`str`): A string containing a formatted list of references (e.g., numbered list of URLs) corresponding to the citations found in `markdown_with_citations`.
* 4.3.4. `fit_markdown` (`Optional[str]`, default: `None`): Markdown content generated from HTML that has been processed by a `RelevantContentFilter`. This version is intended to be more concise or focused on relevant parts of the original content. It is `None` if no content filter was applied or if the filter resulted in no content.
* 4.3.5. `fit_html` (`Optional[str]`, default: `None`): The HTML content that remains after being processed by a `RelevantContentFilter`. `fit_markdown` is generated from this `fit_html`. It is `None` if no content filter was applied or if the filter resulted in no content.
* 4.4. Methods:
* 4.4.1. `__str__(self) -> str`:
* Purpose: Defines the string representation of a `MarkdownGenerationResult` object.
* Signature: `__str__(self) -> str`
* Returns: (`str`) The content of the `raw_markdown` field.
## 5. Integration with Content Filtering (`RelevantContentFilter`)
* 5.1. Purpose of Integration: `DefaultMarkdownGenerator` allows integration with `RelevantContentFilter` strategies to produce a `fit_markdown` output. This enables generating Markdown from a version of the HTML that has been refined or focused based on relevance criteria defined by the filter (e.g., keywords, semantic similarity, or LLM-based assessment).
* 5.2. Mechanism:
* A `RelevantContentFilter` instance can be passed to `DefaultMarkdownGenerator` either during its initialization (via the `content_filter` parameter) or directly to its `generate_markdown` method. The filter passed to `generate_markdown` takes precedence if both are provided.
* When an active filter is present, `DefaultMarkdownGenerator.generate_markdown` calls the filter's `filter_content(input_html)` method. This method is expected to return a list of HTML string chunks deemed relevant.
* These chunks are then joined (typically with `\n` and wrapped in `<div>` tags) to form the `fit_html` string.
* This `fit_html` is then converted to Markdown using `CustomHTML2Text`, and the result is stored as `fit_markdown`.
* 5.3. Impact on `MarkdownGenerationResult`:
* If a `RelevantContentFilter` is successfully used:
* `MarkdownGenerationResult.fit_markdown` will contain the Markdown derived from the filtered HTML.
* `MarkdownGenerationResult.fit_html` will contain the actual filtered HTML string.
* If no filter is used, or if the filter returns an empty list of chunks (indicating no content passed the filter), `fit_markdown` and `fit_html` will be `None` (or potentially empty strings, depending on the exact implementation details of joining an empty list).
* 5.4. Supported Filter Types (High-Level Mention):
* `PruningContentFilter`: A filter that likely removes irrelevant HTML sections based on predefined rules or structural analysis (e.g., removing common boilerplate like headers, footers, navbars).
* `BM25ContentFilter`: A filter that uses the BM25 ranking algorithm to score and select HTML chunks based on their relevance to a user-provided query.
* `LLMContentFilter`: A filter that leverages a Large Language Model to assess the relevance of HTML chunks, potentially based on a user query or a general understanding of content importance.
* *Note: Detailed descriptions and usage of each filter strategy are covered in their respective documentation sections.*
## 6. Configuration via `CrawlerRunConfig`
* 6.1. `CrawlerRunConfig.markdown_generator`
* Purpose: This attribute of the `CrawlerRunConfig` class allows a user to specify a custom `MarkdownGenerationStrategy` instance to be used for the markdown conversion phase of a crawl. This provides flexibility in how HTML content is transformed into Markdown.
* Type: `MarkdownGenerationStrategy` (accepts any concrete implementation of this ABC).
* Default Value: If not specified, an instance of `DefaultMarkdownGenerator()` is used by default within the `AsyncWebCrawler`'s `aprocess_html` method when `config.markdown_generator` is `None`.
* Usage Example:
```python
from crawl4ai import CrawlerRunConfig, DefaultMarkdownGenerator, AsyncWebCrawler
from crawl4ai.content_filter_strategy import BM25ContentFilter
import asyncio
# Example: Configure a markdown generator with a BM25 filter
bm25_filter = BM25ContentFilter(user_query="Python programming language")
custom_md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
run_config_with_custom_md = CrawlerRunConfig(
markdown_generator=custom_md_generator,
# Other run configurations...
)
async def example_crawl():
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://en.wikipedia.org/wiki/Python_(programming_language)",
config=run_config_with_custom_md
)
if result.success and result.markdown:
print("Raw Markdown (snippet):", result.markdown.raw_markdown[:200])
if result.markdown.fit_markdown:
print("Fit Markdown (snippet):", result.markdown.fit_markdown[:200])
# asyncio.run(example_crawl())
```
## 7. Influencing Markdown Output for LLM Consumption
* 7.1. Role of `DefaultMarkdownGenerator.options` and `html2text_options`:
* The `options` parameter in `DefaultMarkdownGenerator.__init__` and the `html2text_options` parameter in its `generate_markdown` method are used to pass configuration settings directly to the underlying `CustomHTML2Text` instance.
* `html2text_options` provided to `generate_markdown` will take precedence over `self.options` set during initialization.
* These options control various aspects of the HTML-to-Markdown conversion, such as line wrapping, handling of links, images, and emphasis, which can be crucial for preparing text for LLMs.
* 7.2. Key `CustomHTML2Text` Options (via `html2text_options` or `DefaultMarkdownGenerator.options`):
* `body_width (int`, default: `0` when `DefaultMarkdownGenerator` calls `CustomHTML2Text` for `raw_markdown` and `fit_markdown` if not otherwise specified): Determines the width for wrapping lines. A value of `0` disables line wrapping, which is often preferred for LLM processing as it preserves sentence structure across lines.
* `ignore_links (bool`, default: `False` in `CustomHTML2Text`): If `True`, all hyperlinks (`<a>` tags) are removed from the output, leaving only their anchor text.
* `ignore_images (bool`, default: `False` in `CustomHTML2Text`): If `True`, all image tags (`<img>`) are removed from the output.
* `ignore_emphasis (bool`, default: `False` in `CustomHTML2Text`): If `True`, emphasized text (e.g., `<em>`, `<strong>`) is rendered as plain text without Markdown emphasis characters (like `*` or `_`).
* `bypass_tables (bool`, default: `False` in `CustomHTML2Text`): If `True`, tables are not formatted as Markdown tables but are rendered as a series of paragraphs, which might be easier for some LLMs to process.
* `default_image_alt (str`, default: `""` in `CustomHTML2Text`): Specifies a default alt text for images that do not have an `alt` attribute.
* `protect_links (bool`, default: `False` in `CustomHTML2Text`): If `True`, URLs in links are not processed or modified.
* `single_line_break (bool`, default: `True` in `CustomHTML2Text`): If `True`, single newlines in HTML are converted to Markdown line breaks (two spaces then a newline). This can help preserve some formatting.
* `mark_code (bool`, default: `True` in `CustomHTML2Text`): If `True`, `<code>` and `<pre>` blocks are appropriately marked in Markdown.
* `escape_snob (bool`, default: `False` in `CustomHTML2Text`): If `True`, more aggressive escaping of special Markdown characters is performed.
* *Note: This list is based on common `html2text` options; refer to `crawl4ai/html2text.py` for the exact implementation and default behaviors within `CustomHTML2Text`.*
* 7.3. Impact of `citations (bool)` in `generate_markdown`:
* When `citations=True` (default in `DefaultMarkdownGenerator.generate_markdown`):
* Standard Markdown links `[text](url)` are converted to `[text]^[citation_number]^`.
* A `references_markdown` string is generated, listing all unique URLs with their corresponding citation numbers. This helps LLMs trace information back to its source and can reduce token count if URLs are long or repetitive.
* When `citations=False`:
* Links remain in their original Markdown format `[text](url)`.
* `references_markdown` will be an empty string.
* This might be preferred if the LLM needs to directly process the URLs or if the citation format is not desired.
* 7.4. Role of `content_source` in `MarkdownGenerationStrategy`:
* This parameter (defaulting to `"cleaned_html"` in `DefaultMarkdownGenerator`) specifies which HTML version is used as the primary input for the `generate_markdown` method.
* `"cleaned_html"`: Typically refers to HTML that has undergone initial processing by the `ContentScrapingStrategy` (e.g., removal of scripts, styles, and potentially some boilerplate based on the scraping strategy's rules). This is usually the recommended source for general Markdown conversion.
* `"raw_html"`: The original, unmodified HTML content fetched from the web page. Using this source would bypass any initial cleaning done by the scraping strategy.
* `"fit_html"`: This source is relevant when a `RelevantContentFilter` is used. `fit_html` is the HTML output *after* the `RelevantContentFilter` has processed the `input_html` (which itself is determined by `content_source`). If `content_source` is, for example, `"cleaned_html"`, then `fit_html` is the result of filtering that cleaned HTML. `fit_markdown` is then generated from this `fit_html`.
* 7.5. `fit_markdown` vs. `raw_markdown`/`markdown_with_citations`:
* `raw_markdown` (or `markdown_with_citations` if `citations=True`) is generated from the HTML specified by `content_source` (e.g., `"cleaned_html"`). It represents a general conversion of that source.
* `fit_markdown` is generated *only if* a `RelevantContentFilter` is active (either set in `DefaultMarkdownGenerator` or passed to `generate_markdown`). It is derived from the `fit_html` (the output of the content filter).
* **Choosing which to use for LLMs:**
* Use `fit_markdown` when you need a concise, highly relevant subset of the page's content tailored to a specific query or set of criteria defined by the filter. This can reduce noise and token count for the LLM.
* Use `raw_markdown` or `markdown_with_citations` when you need a more comprehensive representation of the page's textual content, or when no specific filtering criteria are applied.
```

View File

@@ -1,760 +0,0 @@
```markdown
# Detailed Outline for crawl4ai - markdown Component
**Target Document Type:** reasoning
**Target Output Filename Suggestion:** `llm_reasoning_markdown_generation.md`
**Library Version Context:** 0.6.3
**Outline Generation Date:** 2025-05-24
---
## 1. Introduction to Markdown Generation in Crawl4AI
* 1.1. **Why Markdown Generation Matters for LLMs**
* 1.1.1. The role of clean, structured text for Large Language Model consumption.
* **Explanation:** LLMs perform significantly better when input data is well-structured and free of irrelevant noise (like HTML tags, scripts, or complex layouts not meant for textual understanding). Markdown, with its simple syntax, provides a human-readable and machine-parseable format that captures essential semantic structure (headings, lists, paragraphs, code blocks, tables) without the clutter of full HTML. This makes it easier for LLMs to understand the content's hierarchy, identify key information, and perform tasks like summarization, question-answering, or RAG (Retrieval Augmented Generation) more accurately and efficiently.
* 1.1.2. Benefits of Markdown: readability, structure preservation, common format.
* **Explanation:**
* **Readability:** Markdown is designed to be easily readable in its raw form, making it simple for developers and users to inspect and understand the crawled content.
* **Structure Preservation:** It effectively preserves the semantic structure of the original HTML (headings, lists, emphasis, etc.), which is crucial context for LLMs.
* **Common Format:** Markdown is a widely adopted standard, ensuring compatibility with a vast ecosystem of tools, editors, and LLM input pipelines.
* 1.1.3. How Crawl4AI's Markdown generation facilitates RAG and other LLM applications.
* **Explanation:** For RAG, Crawl4AI's Markdown output, especially when combined with content filtering, provides clean, relevant text chunks that can be easily embedded and indexed. This improves the quality of retrieved context for LLM prompts. For fine-tuning or direct prompting, the structured Markdown helps the LLM focus on the core content, leading to better quality responses and reducing token consumption by eliminating HTML overhead.
* 1.2. **Overview of Crawl4AI's Markdown Generation Pipeline**
* 1.2.1. High-level flow: HTML -> (Optional Filtering) -> Markdown Conversion -> (Optional Citation Handling).
* **Explanation:**
1. **Input HTML:** The process starts with either raw HTML from the crawled page or a cleaned/selected HTML segment.
2. **Optional Content Filtering:** Before Markdown conversion, a `RelevantContentFilter` can be applied to the HTML. This step aims to remove boilerplate, ads, or irrelevant sections, resulting in `fit_html`. This is crucial for generating `fit_markdown`.
3. **Markdown Conversion:** The selected HTML (either the original, cleaned, or filtered `fit_html`) is converted into Markdown using an underlying `html2text` library, specifically `CustomHTML2Text` in Crawl4AI for enhanced control.
4. **Optional Citation Handling:** If enabled, inline links in the generated Markdown are converted to a citation format (e.g., `text [^1^]`), and a separate list of references is created.
* 1.2.2. Key components involved: `MarkdownGenerationStrategy`, `DefaultMarkdownGenerator`, `CustomHTML2Text`, `RelevantContentFilter`.
* **Explanation:**
* **`MarkdownGenerationStrategy`:** An interface defining how Markdown should be generated. Allows for custom implementations.
* **`DefaultMarkdownGenerator`:** The standard implementation of `MarkdownGenerationStrategy`, using `CustomHTML2Text`. It orchestrates filtering (if provided) and citation handling.
* **`CustomHTML2Text`:** An enhanced version of the `html2text` library, providing fine-grained control over the HTML-to-Markdown conversion.
* **`RelevantContentFilter`:** An interface for strategies that filter HTML content before it's converted to Markdown, producing `fit_html` and consequently `fit_markdown`.
* 1.2.3. How `CrawlerRunConfig` ties these components together.
* **Explanation:** The `CrawlerRunConfig` object allows you to specify which `MarkdownGenerationStrategy` (and by extension, which filters and `CustomHTML2Text` options) should be used for a particular crawl run via its `markdown_generator` parameter. This provides run-specific control over the Markdown output.
* 1.3. **Goals of this Guide**
* 1.3.1. Understanding how to configure and customize Markdown output.
* **Explanation:** This guide will walk you through the various configuration options available, from choosing HTML sources and content filters to fine-tuning the `html2text` conversion itself.
* 1.3.2. Best practices for generating LLM-friendly Markdown.
* **Explanation:** We'll discuss tips and techniques to produce Markdown that is optimally structured and cleaned for consumption by Large Language Models.
* 1.3.3. Troubleshooting common Markdown generation issues.
* **Explanation:** We'll cover common problems encountered during Markdown generation (e.g., noisy output, missing content) and provide strategies for diagnosing and resolving them.
## 2. Core Concepts in Markdown Generation
* 2.1. **The `MarkdownGenerationStrategy` Interface**
* 2.1.1. **Purpose and Design Rationale:**
* Why use a strategy pattern for Markdown generation? (Flexibility, extensibility).
* **Explanation:** The strategy pattern allows Crawl4AI to define a common interface for Markdown generation while enabling different concrete implementations. This means users can easily swap out the default Markdown generator for a custom one without altering the core crawler logic. It promotes flexibility and makes the system extensible for future Markdown conversion needs or integration with other libraries.
* Core problem it solves: Decoupling Markdown generation logic from the crawler.
* **Explanation:** By abstracting Markdown generation into a strategy, the `AsyncWebCrawler` itself doesn't need to know the specifics of *how* Markdown is created. It simply delegates the task to the configured strategy. This separation of concerns makes the codebase cleaner and easier to maintain.
* 2.1.2. **When to Implement a Custom `MarkdownGenerationStrategy`:**
* Scenarios requiring completely different Markdown conversion logic.
* **Example:** If you need to convert HTML to a very specific dialect of Markdown not supported by `html2text`, or if you want to use a different underlying conversion library entirely.
* Integrating third-party Markdown conversion libraries.
* **Example:** If you prefer to use a library like `turndown` or `mistune` for its specific features or output style.
* Advanced pre/post-processing of Markdown.
* **Example:** If you need to perform complex transformations on the Markdown *after* initial generation, such as custom table formatting, complex footnote handling beyond standard citations, or domain-specific semantic tagging within the Markdown.
* 2.1.3. **How to Implement a Custom `MarkdownGenerationStrategy`:**
* Key methods to override (`generate_markdown`).
* **Explanation:** The primary method to implement is `generate_markdown(self, input_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult`. This method will receive the HTML (based on `content_source`), and it's responsible for returning a `MarkdownGenerationResult` object.
* Input parameters and expected output (`MarkdownGenerationResult`).
* **Explanation:** Your custom strategy will receive the `input_html`, the `base_url` (for resolving relative links if needed), `html2text_options` (which you can choose to use or ignore), an optional `content_filter`, and a `citations` flag. It must return an instance of `MarkdownGenerationResult` populated with the relevant Markdown strings.
* *Code Example:*
```python
from crawl4ai import MarkdownGenerationStrategy, MarkdownGenerationResult, RelevantContentFilter
from typing import Optional, Dict, Any
class MyCustomMarkdownStrategy(MarkdownGenerationStrategy):
def __init__(self, content_source: str = "cleaned_html", **kwargs):
super().__init__(content_source=content_source, **kwargs)
# Initialize any custom resources if needed
def generate_markdown(
self,
input_html: str,
base_url: str = "",
html2text_options: Optional[Dict[str, Any]] = None, # You can use or ignore these
content_filter: Optional[RelevantContentFilter] = None,
citations: bool = True, # You can decide how to handle this
**kwargs
) -> MarkdownGenerationResult:
# 1. Apply content filter if provided and desired
fit_html_output = ""
if content_filter:
# Assuming content_filter.filter_content returns a list of HTML strings
filtered_html_blocks = content_filter.filter_content(input_html)
fit_html_output = "\n".join(filtered_html_blocks)
# 2. Your custom HTML to Markdown conversion logic
# This is where you'd use your preferred library or custom logic
raw_markdown_text = f"# Custom Markdown for {base_url}\n\n{input_html[:200]}..." # Placeholder
markdown_with_citations_text = raw_markdown_text # Placeholder for citation logic
references_markdown_text = "" # Placeholder for references
# If you used a filter, also generate fit_markdown
fit_markdown_text = ""
if fit_html_output:
fit_markdown_text = f"# Custom Filtered Markdown\n\n{fit_html_output[:200]}..." # Placeholder
return MarkdownGenerationResult(
raw_markdown=raw_markdown_text,
markdown_with_citations=markdown_with_citations_text,
references_markdown=references_markdown_text,
fit_markdown=fit_markdown_text,
fit_html=fit_html_output
)
# Usage:
# custom_md_generator = MyCustomMarkdownStrategy()
# run_config = CrawlerRunConfig(markdown_generator=custom_md_generator)
```
* Common pitfalls when creating custom strategies.
* **Explanation:**
* Forgetting to handle all fields in `MarkdownGenerationResult` (even if some are empty strings).
* Incorrectly managing `base_url` for relative links if your custom converter doesn't handle it.
* Performance bottlenecks if your custom logic is inefficient.
* Not properly integrating with the `content_filter` if one is provided.
* 2.1.4. **Understanding `content_source` in `MarkdownGenerationStrategy`**
* 2.1.4.1. Purpose: What HTML source should be used for Markdown generation?
* **Explanation:** The `content_source` attribute of a `MarkdownGenerationStrategy` (including `DefaultMarkdownGenerator`) tells the strategy which version of the HTML to use as the primary input for generating `raw_markdown` and `markdown_with_citations`.
* 2.1.4.2. Available options: `"cleaned_html"`, `"raw_html"`, `"fit_html"`.
* **`"cleaned_html"` (Default):** This is the HTML after Crawl4AI's internal `ContentScrapingStrategy` (e.g., `WebScrapingStrategy` or `LXMLWebScrapingStrategy`) has processed it. This usually involves removing scripts, styles, and applying structural cleaning or selection based on `target_elements` or `css_selector` in `CrawlerRunConfig`.
* **`"raw_html"`:** The original, unmodified HTML fetched from the page. This is useful if you want to apply your own complete cleaning and Markdown conversion pipeline.
* **`"fit_html"`:** The HTML *after* a `RelevantContentFilter` (if provided to the `MarkdownGenerationStrategy`) has processed the input HTML (which would be `cleaned_html` or `raw_html` depending on the initial source). This option is powerful when you want Markdown generated *only* from the most relevant parts of the page.
* 2.1.4.3. **Decision Guide: Choosing the Right `content_source`**:
* **When to use `"cleaned_html"`:** This is the recommended default for most LLM use cases. It provides a good balance of structured content without excessive noise, as common boilerplate is often removed by the scraping strategy.
* **When to use `"raw_html"`:** Choose this if you need absolute control over the HTML input for your Markdown converter, or if Crawl4AI's default cleaning removes elements you wish to keep. Be aware that this might result in noisier Markdown.
* **When to use `"fit_html"`:** Opt for this when you are using a `RelevantContentFilter` with your `MarkdownGenerationStrategy` and you want the `raw_markdown` and `markdown_with_citations` to be based *only* on the filtered content. This is distinct from just using the `fit_markdown` field in the result, as it makes the filtered content the *primary* source for all main Markdown outputs.
* **Impact on performance and output quality:**
* `"raw_html"` might be slightly faster if Crawl4AI's cleaning is complex, but could lead to lower quality Markdown due to more noise.
* `"cleaned_html"` offers a good trade-off.
* `"fit_html"` depends on the performance of the `RelevantContentFilter` itself.
* 2.1.4.4. *Example Scenarios:*
* **General Summarization:** `"cleaned_html"` is usually best.
* **Highly Specific Q&A on a Section:** Use a `RelevantContentFilter` to produce `fit_html`, then set `content_source="fit_html"` (or just use the `fit_markdown` from the result if `raw_markdown` from `"cleaned_html"` is also desired).
* **Archiving Raw Structure:** `"raw_html"` might be chosen if the goal is to convert the entire, unmodified page structure to Markdown, perhaps for later, more nuanced processing.
* 2.2. **The `MarkdownGenerationResult` Model**
* 2.2.1. **Understanding its Purpose:** Why a structured result object?
* **Explanation:** A structured object like `MarkdownGenerationResult` is used instead of a single Markdown string to provide different views or versions of the generated Markdown, catering to various use cases. This allows users to pick the representation that best suits their needs (e.g., with or without citations, raw vs. filtered) without re-processing. It also clearly separates the main content from metadata like references or the intermediate `fit_html`.
* 2.2.2. **Deep Dive into `MarkdownGenerationResult` Fields:**
* `raw_markdown`:
* **What it is:** This is the direct, primary Markdown output generated from the `content_source` (e.g., `cleaned_html`) defined in the `MarkdownGenerationStrategy`. It does *not* have inline links converted to citation format.
* **How to use it:** Use this when you need the most "vanilla" Markdown, perhaps for LLMs that are sensitive to citation formats or if you plan to implement your own link/reference handling.
* **When it's useful:** For direct input to LLMs that don't require source attribution within the text, or as a base for further custom Markdown processing.
* `markdown_with_citations`:
* **What it is:** This takes the `raw_markdown` and converts its inline links (e.g., `[link text](http://example.com)`) into a citation format (e.g., `link text [^1^]`).
* **How it's generated:** The `DefaultMarkdownGenerator` (via `CustomHTML2Text`) scans `raw_markdown` for links, assigns unique numerical IDs to each unique URL, replaces the inline link with the text and citation marker, and populates `references_markdown`.
* **How to use it:** This is often the most useful Markdown for LLM tasks requiring RAG or for generating human-readable documents where sources are important. Combine it with `references_markdown`.
* *Example:*
```html
<!-- Input HTML fragment -->
<p>Crawl4AI is an <a href="https://github.com/unclecode/crawl4ai">open-source</a> library.</p>
```
```markdown
// Resulting markdown_with_citations (simplified)
Crawl4AI is an open-source [^1^] library.
```
* `references_markdown`:
* **What it is:** A separate Markdown string that lists all unique URLs found and converted to citations, formatted typically as a numbered list.
* **How to use it:** Append this string to the end of `markdown_with_citations` to create a complete document with a bibliography or reference section.
* **Why it's separate:** This provides flexibility. You can choose to display references at the end, in a sidebar, or not at all.
* *Example:*
```markdown
## References
[^1^]: https://github.com/unclecode/crawl4ai
```
* `fit_markdown`:
* **What it is:** This is Markdown generated *exclusively* from the `fit_html`. `fit_html` itself is the output of a `RelevantContentFilter` if one was provided to the `MarkdownGenerationStrategy`. If no filter was used, `fit_markdown` will likely be empty or reflect the `raw_markdown`.
* **How to use it:** When your primary goal is to feed an LLM with the most relevant, filtered content. This is excellent for tasks like generating concise summaries or providing highly focused context for RAG.
* **Relationship with `raw_markdown`:** If a filter is active, `fit_markdown` is based on a *subset* or *transformed version* of the HTML that `raw_markdown` was based on (assuming `content_source` wasn't `"fit_html"`). If `content_source` *was* `"fit_html"`, then `raw_markdown` and `fit_markdown` would be derived from the same filtered HTML, but `fit_markdown` might still undergo different processing if the strategy handles it distinctly.
* *Example:* Imagine a news article page. `raw_markdown` might contain the article, comments, ads, and navigation. If a `BM25ContentFilter` is used with a query about "stock market impact", `fit_markdown` would ideally only contain paragraphs related to that topic, stripped of other page elements.
* `fit_html`:
* **What it is:** The actual HTML string *after* a `RelevantContentFilter` (like `PruningContentFilter` or `LLMContentFilter`) has processed the input HTML. If no filter is applied, this field will be empty.
* **How to use it:** Primarily for debugging your content filters. You can inspect `fit_html` to see exactly what HTML content was deemed "relevant" by your filter before it was converted to `fit_markdown`. It can also be useful if you need this filtered HTML for purposes other than Markdown generation.
* **Why it's included:** It provides transparency into the filtering process and allows advanced users to work with the intermediate filtered HTML directly.
## 3. The `DefaultMarkdownGenerator` - Your Go-To Solution
* 3.1. **Understanding the `DefaultMarkdownGenerator`**
* 3.1.1. **Purpose and Design:** The `DefaultMarkdownGenerator` is Crawl4AI's standard, out-of-the-box mechanism for converting HTML content into various Markdown representations. It's designed to be a robust and generally applicable solution for most common use cases, especially when targeting LLM consumption.
* 3.1.2. Core Functionality: Its primary task is to orchestrate the HTML-to-Markdown conversion. It internally uses an instance of `CustomHTML2Text` (Crawl4AI's enhanced `html2text` wrapper) to perform the actual conversion.
* 3.1.3. How it handles citations and references by default.
* **Explanation:** If the `citations` parameter in its `generate_markdown` method is `True` (which it is by default), `DefaultMarkdownGenerator` will post-process the initially generated Markdown to convert inline links into citation markers (e.g., `[^1^]`) and generate a corresponding `references_markdown` block. This is done by its internal `CustomHTML2Text` instance.
* 3.2. **Configuring `DefaultMarkdownGenerator`**
* 3.2.1. **Initialization Options:**
* `content_filter (Optional[RelevantContentFilter])`:
* **Why use it:** To refine the HTML *before* it's converted to Markdown. This is essential if you want `fit_markdown` (and consequently `fit_html`) to contain only the most relevant parts of the page, leading to a more focused Markdown output.
* **How it integrates:** When `generate_markdown` is called, if a `content_filter` is present, `DefaultMarkdownGenerator` first passes the `input_html` (determined by `content_source`) to this filter. The filter returns a list of HTML strings (or a single string if merged). This filtered HTML becomes the `fit_html`. Then, `fit_markdown` is generated from this `fit_html`. The `raw_markdown` and `markdown_with_citations` are still generated from the original `content_source` unless `content_source` itself is set to `"fit_html"`.
* *Impact:* Directly influences `fit_markdown` and `fit_html` fields in `MarkdownGenerationResult`. Can significantly reduce the noise and improve the relevance of the final Markdown for LLMs.
* *Code Example:*
```python
from crawl4ai import DefaultMarkdownGenerator, CrawlerRunConfig
from crawl4ai.content_filter_strategy import PruningContentFilter
# Initialize a filter
pruning_filter = PruningContentFilter(threshold_type="fixed", threshold=0.5)
# Initialize DefaultMarkdownGenerator with the filter
md_generator_with_filter = DefaultMarkdownGenerator(content_filter=pruning_filter)
# This generator will now produce 'fit_markdown' based on pruning.
# run_config = CrawlerRunConfig(markdown_generator=md_generator_with_filter)
# result = await crawler.arun(url="...", config=run_config)
# print(result.markdown.fit_markdown)
```
* `options (Optional[Dict[str, Any]])`:
* **What it is:** This dictionary allows you to pass configuration options directly to the underlying `CustomHTML2Text` instance. These options control the specifics of the HTML-to-Markdown conversion process.
* **How to use it:** Provide a dictionary where keys are `html2text` option names (e.g., `body_width`, `ignore_links`) and values are their desired settings.
* *See Section 6: Mastering `CustomHTML2Text` for detailed options.*
* `content_source (str)`:
* **Reiteration:** As discussed in section 2.1.4, this determines the primary HTML input for `raw_markdown` and `markdown_with_citations`.
* **How it interacts with `content_filter`:**
* If `content_source` is, for example, `"cleaned_html"` and a `content_filter` is also provided, the `content_filter` will process this `"cleaned_html"` to produce `fit_html`. The `fit_markdown` field in `MarkdownGenerationResult` will be based on this `fit_html`.
* However, `raw_markdown` and `markdown_with_citations` will still be based on the original `"cleaned_html"` (unless `content_source` was explicitly set to `"fit_html"`). This allows you to have both a "fuller" Markdown and a "filtered" Markdown from a single generation step.
* 3.3. **Common Workflows with `DefaultMarkdownGenerator`**
* 3.3.1. **Workflow: Generating Basic Markdown with Citations**
* Steps: Instantiate `DefaultMarkdownGenerator` (or use the crawler's default). The crawler calls its `generate_markdown` method. Access `result.markdown.markdown_with_citations` and `result.markdown.references_markdown`.
* *Code Example:*
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
async def basic_markdown_workflow():
# DefaultMarkdownGenerator is used implicitly if none is specified in CrawlerRunConfig
# Or explicitly:
md_generator = DefaultMarkdownGenerator()
run_config = CrawlerRunConfig(markdown_generator=md_generator)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://example.com", config=run_config)
if result.success:
print("--- Markdown with Citations ---")
print(result.markdown.markdown_with_citations[:500]) # Show first 500 chars
print("\n--- References ---")
print(result.markdown.references_markdown)
else:
print(f"Crawl failed: {result.error_message}")
```
* 3.3.2. **Workflow: Generating Focused Markdown using a Content Filter**
* Steps:
1. Choose and instantiate a `RelevantContentFilter` (e.g., `BM25ContentFilter`).
2. Instantiate `DefaultMarkdownGenerator`, passing the filter to its `content_filter` parameter.
3. Set this `DefaultMarkdownGenerator` instance in `CrawlerRunConfig.markdown_generator`.
4. After crawling, access `result.markdown.fit_markdown`.
* Key configuration considerations for the filter and generator:
* For `BM25ContentFilter`, ensure you provide a relevant `user_query`.
* Adjust filter thresholds (e.g., `bm25_threshold`) as needed.
* The HTML selected by the `DefaultMarkdownGenerator`'s `content_source` (e.g., `"cleaned_html"`) will be the input passed to the filter.
* *Code Example:*
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
from crawl4ai.content_filter_strategy import BM25ContentFilter
async def filtered_markdown_workflow():
user_query = "information about Crawl4AI library"
bm25_filter = BM25ContentFilter(user_query=user_query, bm25_threshold=0.1)
md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
run_config = CrawlerRunConfig(
markdown_generator=md_generator,
cache_mode=CacheMode.BYPASS # For consistent demo results
)
async with AsyncWebCrawler() as crawler:
# Using a page that hopefully has content related to the query
result = await crawler.arun(url="https://github.com/unclecode/crawl4ai", config=run_config)
if result.success:
print("--- Fit Markdown (BM25 Filtered) ---")
print(result.markdown.fit_markdown) # This is the key output
# You can also inspect fit_html to see what the filter selected
# print("\n--- Fit HTML ---")
# print(result.markdown.fit_html[:500])
else:
print(f"Crawl failed: {result.error_message}")
```
* 3.3.3. **Workflow: Customizing Markdown Style via `html2text_options`**
* Steps: Instantiate `DefaultMarkdownGenerator` passing a dictionary of `html2text` options to its `options` parameter.
* *Code Example:*
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
async def custom_style_markdown_workflow():
# Example: Disable line wrapping and ignore images
html2text_opts = {
"body_width": 0, # Disable line wrapping
"ignore_images": True # Don't include image markdown ![alt](src)
}
md_generator = DefaultMarkdownGenerator(options=html2text_opts)
run_config = CrawlerRunConfig(markdown_generator=md_generator)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://example.com", config=run_config)
if result.success:
print("--- Custom Styled Markdown (No Wrap, No Images) ---")
print(result.markdown.raw_markdown[:500]) # raw_markdown will reflect these options
else:
print(f"Crawl failed: {result.error_message}")
```
* 3.4. **Best Practices for `DefaultMarkdownGenerator`**
* **When to use `DefaultMarkdownGenerator` vs. a custom strategy:**
* Use `DefaultMarkdownGenerator` for most cases. It's robust and highly configurable through `content_filter` and `html2text_options`.
* Opt for a custom strategy only if you need fundamentally different conversion logic or integration with external Markdown libraries that `CustomHTML2Text` doesn't cover.
* **Tips for choosing the right `content_source` and `content_filter`:**
* Start with `content_source="cleaned_html"` (default) and no filter.
* If the output is too noisy, introduce a `RelevantContentFilter`. `PruningContentFilter` is a good first step for general boilerplate. Use `BM25ContentFilter` or `LLMContentFilter` for more targeted filtering based on semantic relevance.
* If your filter is very effective and you *only* want Markdown from the filtered content, consider setting `content_source="fit_html"` in your `DefaultMarkdownGenerator` instance.
* **How to leverage `MarkdownGenerationResult` effectively:**
* For LLM input where source attribution is important, use `markdown_with_citations` + `references_markdown`.
* For tasks needing maximum conciseness based on relevance, use `fit_markdown` (after configuring a `content_filter`).
* Use `raw_markdown` if you need the "purest" Markdown conversion without citation processing.
* Inspect `fit_html` to debug your content filters.
## 4. Integrating Content Filters for Smarter Markdown (`fit_markdown`)
* 4.1. **The "Why": Purpose of Content Filtering Before Markdown Generation**
* 4.1.1. Reducing noise and improving relevance for LLMs.
* **Explanation:** Web pages often contain much more than just the main article content (e.g., navigation, ads, footers, related articles). These can be detrimental to LLM performance, increasing token count, processing time, and potentially confusing the model. Content filters aim to isolate the core, relevant information.
* 4.1.2. Generating more concise and focused Markdown (`fit_markdown`).
* **Explanation:** By filtering the HTML *before* converting it to Markdown, the resulting `fit_markdown` is inherently more concise and focused on what the filter deemed important. This is ideal for tasks where brevity and relevance are key.
* 4.1.3. How `fit_html` is generated and its role.
* **Explanation:** When a `RelevantContentFilter` is used with a `MarkdownGenerationStrategy`, the strategy first passes the input HTML (e.g., `cleaned_html`) to the filter's `filter_content` method. This method returns a list of HTML strings (or a single merged string). This output is stored as `fit_html` in the `MarkdownGenerationResult`. `fit_markdown` is then generated by converting this `fit_html` to Markdown.
* 4.2. **Overview of `RelevantContentFilter` Strategies**
* 4.2.1. **`PruningContentFilter`**:
* **How it works:** Applies heuristic rules to remove common boilerplate. For example, it might remove elements with very short text content, elements with a high link-to-text ratio, or elements matching common boilerplate CSS classes/IDs (like "footer", "nav", "sidebar").
* **When to use it:** A good first-pass filter for general-purpose cleaning. It's fast and doesn't require LLM calls or complex configuration.
* **Impact on `fit_markdown`:** Typically good at removing obvious non-content sections, resulting in a cleaner, more article-focused Markdown.
* 4.2.2. **`BM25ContentFilter`**:
* **How it works:** This filter uses the BM25 algorithm, a classical information retrieval technique. It tokenizes the HTML content into chunks and scores each chunk's relevance against a `user_query`. Chunks exceeding a `bm25_threshold` are kept.
* **When to use it:** When you want to extract content specifically related to a user's query from a larger page. Excellent for targeted information retrieval.
* **Impact on `fit_markdown`:** The output will be highly tailored to the query. If the query is "Tell me about Crawl4AI's caching", `fit_markdown` should primarily contain sections discussing caching.
* 4.2.3. **`LLMContentFilter`**:
* **How it works:** This is the most powerful and flexible filter. It chunks the input HTML and sends each chunk (or a summary) to an LLM with specific `instructions` (e.g., "Extract only the paragraphs discussing financial results"). The LLM decides which chunks are relevant.
* **When to use it:** For complex filtering criteria that are hard to express with rules or keywords, or when nuanced understanding of content is required.
* **Impact on `fit_markdown`:** Can produce very precise and contextually relevant Markdown. However, it's generally slower and can be more expensive due to LLM API calls.
* 4.3. **Decision Guide: Choosing the Right `RelevantContentFilter`**
* *Table:*
| Filter | Speed | Cost (LLM API) | Accuracy/Nuance | Use Case Examples | Configuration Complexity |
|-----------------------|------------|----------------|-----------------|----------------------------------------------------|--------------------------|
| `PruningContentFilter`| Very Fast | None | Low-Medium | General boilerplate removal, quick cleaning. | Low |
| `BM25ContentFilter` | Fast | None | Medium | Query-focused extraction, finding relevant sections. | Medium (query, threshold)|
            | `LLMContentFilter`    | Slow       | Potentially High | High            | Complex criteria, nuanced extraction, summarization. | High (prompt engineering) |
* Factors to consider:
* **Desired Output Quality:** For the highest semantic relevance, `LLMContentFilter` is often best, but at a cost.
* **Performance Constraints:** If speed is critical, `PruningContentFilter` or `BM25ContentFilter` are preferred.
* **Nature of the HTML Content:** For well-structured articles, `PruningContentFilter` might be sufficient. For diverse content or Q&A, `BM25ContentFilter` or `LLMContentFilter` might be better.
* **Specificity of Task:** If you have a clear query, `BM25ContentFilter` excels. If you have complex instructions, `LLMContentFilter` is suitable.
* 4.4. **Code Examples: Combining Filters with `DefaultMarkdownGenerator`**
* 4.4.1. *Example:* [Using `PruningContentFilter` to generate `fit_markdown`].
```python
from crawl4ai import DefaultMarkdownGenerator, CrawlerRunConfig, AsyncWebCrawler, CacheMode
from crawl4ai.content_filter_strategy import PruningContentFilter
async def pruning_filter_example():
pruning_filter = PruningContentFilter(threshold=0.4, threshold_type="fixed") # Adjust threshold as needed
md_generator = DefaultMarkdownGenerator(content_filter=pruning_filter)
run_config = CrawlerRunConfig(markdown_generator=md_generator, cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://en.wikipedia.org/wiki/Python_(programming_language)", config=run_config)
if result.success:
print("--- Fit Markdown (Pruned) ---")
print(result.markdown.fit_markdown[:1000]) # Show first 1000 chars
# print("\n--- Original Raw Markdown (for comparison) ---")
# print(result.markdown.raw_markdown[:1000])
```
* 4.4.2. *Example:* [Using `BM25ContentFilter` with a query to generate query-focused `fit_markdown`].
```python
from crawl4ai import DefaultMarkdownGenerator, CrawlerRunConfig, AsyncWebCrawler, CacheMode
from crawl4ai.content_filter_strategy import BM25ContentFilter
async def bm25_filter_example():
user_query = "Python syntax and semantics"
bm25_filter = BM25ContentFilter(user_query=user_query, bm25_threshold=0.1)
md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
run_config = CrawlerRunConfig(markdown_generator=md_generator, cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://en.wikipedia.org/wiki/Python_(programming_language)", config=run_config)
if result.success:
print(f"--- Fit Markdown (BM25 Filtered for query: '{user_query}') ---")
print(result.markdown.fit_markdown)
```
* 4.4.3. *Example:* [Using `LLMContentFilter` for nuanced content selection before Markdown generation].
```python
from crawl4ai import DefaultMarkdownGenerator, CrawlerRunConfig, AsyncWebCrawler, LLMConfig, CacheMode
from crawl4ai.content_filter_strategy import LLMContentFilter
import os
async def llm_filter_example():
# Ensure OPENAI_API_KEY is set in your environment
if not os.getenv("OPENAI_API_KEY"):
print("OPENAI_API_KEY not set. Skipping LLMContentFilter example.")
return
llm_config_obj = LLMConfig(provider="openai/gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
instruction = "Extract only the sections that discuss Python's history and its creator."
llm_filter = LLMContentFilter(
llm_config=llm_config_obj,
instruction=instruction,
# chunk_token_threshold=1000 # Adjust as needed
)
md_generator = DefaultMarkdownGenerator(content_filter=llm_filter, content_source="cleaned_html")
run_config = CrawlerRunConfig(markdown_generator=md_generator, cache_mode=CacheMode.BYPASS)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://en.wikipedia.org/wiki/Python_(programming_language)", config=run_config)
if result.success:
print(f"--- Fit Markdown (LLM Filtered with instruction: '{instruction}') ---")
print(result.markdown.fit_markdown)
llm_filter.show_usage() # Display token usage
else:
print(f"Crawl failed: {result.error_message}")
```
* 4.5. **Best Practices for Content Filtering for Markdown**
* **Start Simple:** Begin with `PruningContentFilter` for general cleanup. It's fast and often effective for removing common boilerplate.
* **Query-Specific Tasks:** If your goal is to extract information relevant to a specific query, `BM25ContentFilter` is a great, cost-effective choice.
* **Nuanced Selection:** Reserve `LLMContentFilter` for tasks requiring deeper semantic understanding or complex filtering logic that rules-based or keyword-based approaches can't handle. Be mindful of its cost and latency.
* **Iterate and Test:** Content filtering is often an iterative process. Test your filter configurations on various pages to ensure they behave as expected. Inspect `fit_html` to understand what the filter is selecting/discarding.
* **Combine with `content_source`:** Remember that `fit_markdown` is derived from the output of the filter. If you also need Markdown from the pre-filtered content, ensure your `MarkdownGenerationStrategy`'s `content_source` is set appropriately (e.g., `"cleaned_html"`) so that `raw_markdown` reflects that, while `fit_markdown` reflects the filtered version.
## 5. Customizing Markdown Output via `CrawlerRunConfig`
* 5.1. **The Role of `CrawlerRunConfig.markdown_generator`**
* 5.1.1. How it allows specifying a custom Markdown generation strategy for a crawl run.
* **Explanation:** The `markdown_generator` parameter within the `CrawlerRunConfig` object is the primary way to control how Markdown is generated for a specific crawl operation (i.e., a call to `crawler.arun()` or tasks within `crawler.arun_many()`). You can assign an instance of any class that adheres to the `MarkdownGenerationStrategy` interface to it.
* 5.1.2. Overriding the default Markdown generation behavior.
* **Explanation:** If `CrawlerRunConfig.markdown_generator` is not set (i.e., it's `None`), Crawl4AI will use a default instance of `DefaultMarkdownGenerator` with its standard settings. By providing your own `MarkdownGenerationStrategy` instance (be it a configured `DefaultMarkdownGenerator` or a custom class), you override this default behavior for that particular run.
* 5.2. **Scenarios for Using `CrawlerRunConfig.markdown_generator`**
* 5.2.1. Applying a pre-configured `DefaultMarkdownGenerator` with specific filters or options.
* **Why:** You might want different filtering logic or `html2text` options for different URLs or types of content you're crawling, even within the same `AsyncWebCrawler` instance.
* 5.2.2. Plugging in a completely custom `MarkdownGenerationStrategy`.
* **Why:** As discussed in section 2.1.2, if you have unique Markdown requirements or want to use a different conversion library.
* 5.2.3. Disabling Markdown generation entirely by setting it to `None` (if applicable, or by using a "NoOp" strategy).
* **Why:** If, for a specific crawl, you only need the HTML or extracted structured data and don't require Markdown output, you can pass `markdown_generator=None` (or a strategy that does nothing) to save processing time.
* *Note:* To truly disable Markdown generation and its associated `CustomHTML2Text` processing, you might need a "NoOpMarkdownGenerator". If `markdown_generator` is `None`, the crawler might still fall back to a default. A NoOp strategy would explicitly do nothing.
```python
# class NoOpMarkdownGenerator(MarkdownGenerationStrategy):
# def generate_markdown(self, input_html: str, **kwargs) -> MarkdownGenerationResult:
# return MarkdownGenerationResult(raw_markdown="", markdown_with_citations="", references_markdown="")
# run_config = CrawlerRunConfig(markdown_generator=NoOpMarkdownGenerator())
```
* 5.3. **Code Examples:**
* 5.3.1. *Example:* [Setting a `DefaultMarkdownGenerator` with a `PruningContentFilter` in `CrawlerRunConfig`].
```python
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
DefaultMarkdownGenerator,
CacheMode
)
from crawl4ai.content_filter_strategy import PruningContentFilter
async def run_with_specific_md_generator():
# Configure a specific markdown generator
pruning_filter = PruningContentFilter(threshold=0.6)
specific_md_generator = DefaultMarkdownGenerator(
content_filter=pruning_filter,
options={"body_width": 0, "ignore_links": True}
)
# Configure the crawl run to use this generator
run_config = CrawlerRunConfig(
markdown_generator=specific_md_generator,
cache_mode=CacheMode.BYPASS
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url="https://example.com/article1", config=run_config)
if result.success:
print("--- Markdown from Article 1 (Pruned, No Links, No Wrap) ---")
print(result.markdown.fit_markdown[:500])
# raw_markdown would also reflect no-wrap and no-links from html2text_options
# For another URL, you could use a different (or default) generator
# default_run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
# result2 = await crawler.arun(url="https://example.com/article2", config=default_run_config)
# asyncio.run(run_with_specific_md_generator())
```
    *   5.3.2. *Example:* [Setting a custom `MyCustomMarkdownStrategy` in `CrawlerRunConfig` (assuming `MyCustomMarkdownStrategy` from 2.1.3)].
```python
# Assuming MyCustomMarkdownStrategy is defined as in section 2.1.3
# from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
# from your_module import MyCustomMarkdownStrategy # If it's in another file
# async def run_with_custom_md_strategy():
# custom_strategy = MyCustomMarkdownStrategy(content_source="raw_html")
# run_config_custom = CrawlerRunConfig(
# markdown_generator=custom_strategy,
# cache_mode=CacheMode.BYPASS
# )
# async with AsyncWebCrawler() as crawler:
# result = await crawler.arun(url="https://example.com", config=run_config_custom)
# if result.success:
# print("--- Markdown from Custom Strategy ---")
# print(result.markdown.raw_markdown) # Or other fields your strategy populates
# asyncio.run(run_with_custom_md_strategy())
```
* 5.4. **Interaction with Global vs. Run-Specific Configurations**
* **Explanation:** `AsyncWebCrawler` itself does not have a global `markdown_generator` setting during its initialization. Markdown generation is configured *per run* via `CrawlerRunConfig`. This design choice provides maximum flexibility, allowing different Markdown strategies for different URLs or tasks within the same crawler instance lifecycle. If `CrawlerRunConfig.markdown_generator` is not provided, a default `DefaultMarkdownGenerator` instance is used for that specific run.
## 6. Mastering `CustomHTML2Text` for Fine-Grained Control
* 6.1. **Understanding `CustomHTML2Text`**
* 6.1.1. **Purpose:** Why Crawl4AI includes its own `html2text` extension.
* **Enhanced control:** `CustomHTML2Text` is a subclass of the standard `html2text.HTML2Text` library. Crawl4AI uses this custom version to gain more precise control over the HTML-to-Markdown conversion process, particularly to make the output more suitable for LLMs.
* **Specific adaptations:** It includes logic for handling Crawl4AI's citation and reference generation (`convert_links_to_citations`), and potentially other tweaks that improve the quality and utility of the Markdown output for AI applications.
* 6.1.2. **How it's used by `DefaultMarkdownGenerator`**.
* **Explanation:** `DefaultMarkdownGenerator` instantiates `CustomHTML2Text` internally. When you pass `options` to `DefaultMarkdownGenerator`, these are ultimately used to configure this `CustomHTML2Text` instance. The `handle()` method of `CustomHTML2Text` is what performs the core HTML to Markdown conversion.
* 6.2. **Key `html2text_options` and Their Impact**
* (These options are passed via `DefaultMarkdownGenerator(options=...)`)
* 6.2.1. `body_width`:
* **What it does:** Controls the maximum width of lines in the generated Markdown before wrapping.
* **Why configure it:** For LLM consumption, it's often best to disable automatic line wrapping to allow the LLM to process text based on natural paragraph breaks. Setting `body_width=0` achieves this.
* *Example:*
* `body_width=80` (default-ish for some tools):
```markdown
This is a longer sentence that will be wrapped by html2text if the body_width is
set to a value like 80 characters.
```
* `body_width=0`:
```markdown
This is a longer sentence that will not be wrapped by html2text if body_width is 0, allowing the LLM to handle line breaks.
```
* 6.2.2. `ignore_links`:
* **What it does:** If `True`, all hyperlink information (`[text](url)`) is removed, leaving only the link text.
* **Why configure it:** Set to `True` if links are considered noise for your LLM task and you don't need source attribution. If `False` (default for Crawl4AI's `CustomHTML2Text` unless overridden), links are preserved and can then be converted to citations by `DefaultMarkdownGenerator`.
* *Example:*
* `ignore_links=False` (then processed for citations): `Visit [Crawl4AI](https://crawl4ai.com)` -> `Visit Crawl4AI [^1^]`
* `ignore_links=True`: `Visit [Crawl4AI](https://crawl4ai.com)` -> `Visit Crawl4AI`
* 6.2.3. `ignore_images`:
* **What it does:** If `True`, image tags (`<img>`) are completely ignored, and no Markdown image syntax (`![alt](src)`) is generated.
* **Why configure it:** Useful if image information is irrelevant to your LLM task and you want cleaner, more text-focused Markdown.
* *Example:*
* HTML: `<img src="logo.png" alt="My Logo">`
* `ignore_images=False`: `![My Logo](logo.png)`
* `ignore_images=True`: (nothing is output for the image)
* 6.2.4. `protect_links`:
* **What it does:** If `True`, surrounds link URLs with `<` and `>`. E.g., `[text](<url>)`.
* **Why configure it:** This can sometimes help Markdown parsers that might misinterpret URLs containing special characters. However, with Crawl4AI's citation handling, this is generally not needed, as the raw URLs are moved to the reference section.
* 6.2.5. `mark_code`:
* **What it does:** Controls how `<pre>` and `<code>` tags are handled. If `True`, it attempts to use Markdown code block syntax (backticks).
* **Why configure it:** Essential for preserving code snippets correctly. Usually, you'd want this to be `True`.
* 6.2.6. `default_image_alt`:
* **What it does:** Provides a default alt text string if an `<img>` tag is missing an `alt` attribute.
* **Why configure it:** Can make Markdown more consistent if you choose to include images.
* 6.2.7. `bypass_tables`:
* **What it does:** If `True`, `<table>` elements are not converted into Markdown table syntax. Their content might be rendered as plain text or omitted, depending on other settings.
* **Why configure it:** Standard Markdown table syntax is limited and may not handle complex tables (with `colspan`, `rowspan`, nested tables) well. If you encounter mangled tables, setting this to `True` and processing the table HTML separately (e.g., by extracting the `<table>` HTML and using a specialized table-to-text or table-to-JSON library) might be a better approach.
* 6.2.8. `pad_tables`:
* **What it does:** If `True`, adds padding spaces around cell content in Markdown tables for better visual alignment in raw Markdown.
* **Why configure it:** Mostly an aesthetic choice for human readability of the raw Markdown; LLMs typically don't care about this padding.
* *Other relevant options identified from `CustomHTML2Text` (or base `html2text`) source:*
* `escape_snob`: If `True`, escapes `>` and `&` characters. Default is `False`.
* `skip_internal_links`: If `True`, ignores links that start with `#`. Default is `False`.
* `links_each_paragraph`: If `True`, puts a link list after each paragraph. Default is `False`. Crawl4AI's citation system provides a better alternative.
* `unicode_snob`: If `True`, uses Unicode characters instead of ASCII approximations. Default is `False` in base `html2text`, but `CustomHTML2Text` might behave differently or Crawl4AI ensures UTF-8 handling.
* 6.3. **Best Practices for Configuring `CustomHTML2Text`**
* 6.3.1. **General recommendations for LLM-friendly output:**
* Set `body_width=0` to disable line wrapping and let paragraphs flow naturally.
* Consider `ignore_images=True` if images are not relevant to the LLM's task.
* Usually, keep `ignore_links=False` (Crawl4AI default) to allow `DefaultMarkdownGenerator` to handle citations properly.
* 6.3.2. **How to balance information preservation with conciseness:**
* Be selective with `ignore_*` options. Removing too much might discard useful context.
* Use content filters (Section 4) for semantic reduction rather than relying solely on `html2text` options to remove large irrelevant sections.
* 6.3.3. **Experimenting with options to achieve desired Markdown style:**
* Create a small test HTML snippet.
* Instantiate `DefaultMarkdownGenerator` with different `options` dictionaries.
* Call its `generate_markdown` method directly (or `_html_to_markdown` on its internal `CustomHTML2Text` instance if you want to bypass citation logic for testing) and observe the output.
* 6.4. **Handling Citations and References (`convert_links_to_citations` method in `CustomHTML2Text`)**
* 6.4.1. **How it works:**
* The `convert_links_to_citations` method (called by `DefaultMarkdownGenerator` if citations are enabled) iterates through the Markdown produced by `html2text.handle()`.
* It uses a regular expression (`LINK_PATTERN`) to find all Markdown links (`[text](url "optional title")`).
* For each unique URL, it assigns an incremental citation number.
* It replaces the original Markdown link with `text [^N^]` (or `![text][^N^]` for images if not ignored).
* It builds up a list of reference strings like `[^N^]: url "optional title - text if different from title"`.
* 6.4.2. **When it's called:** This method is invoked by `DefaultMarkdownGenerator.generate_markdown()` *after* the initial HTML-to-Markdown conversion by `CustomHTML2Text.handle()` if the `citations` flag is `True`.
* 6.4.3. **Impact on `MarkdownGenerationResult` fields:**
* The modified Markdown (with `[^N^]` markers) is stored in `markdown_with_citations`.
* The collected reference list is stored in `references_markdown`.
* `raw_markdown` remains the version *before* citation processing.
* 6.4.4. **Customizing Citation Behavior (if possible through options or by subclassing)**.
* **Explanation:** Direct customization of the citation format (e.g., changing `[^N^]` to `(N)`) via options is not explicitly provided in `CustomHTML2Text`.
* To change this, you would need to:
1. Create your own class inheriting from `DefaultMarkdownGenerator`.
2. Override the `generate_markdown` method.
3. In your override, you could either:
* Call the parent's `generate_markdown`, get the `MarkdownGenerationResult`, and then post-process `markdown_with_citations` and `references_markdown` to your desired format.
* Or, more invasively, replicate the logic but modify the citation generation part. This might involve creating a custom version of `CustomHTML2Text` or its `convert_links_to_citations` method.
* For most users, the default citation format is standard and widely accepted.
## 7. Advanced Markdown Generation Techniques & Best Practices
* 7.1. **Achieving LLM-Friendly Markdown Output**
* 7.1.1. Prioritizing semantic structure (headings, lists, paragraphs).
* **Why:** LLMs leverage structural cues to understand context and hierarchy. Ensure your `html2text_options` (e.g., for headings, list indentation) preserve this structure faithfully.
* **How:** Rely on `CustomHTML2Text`'s default handling of semantic HTML tags. If specific tags are problematic, consider pre-processing the HTML.
* 7.1.2. Handling complex HTML structures (nested tables, complex layouts).
* **Strategies for simplifying or selectively extracting from them:**
* **Tables:** For very complex tables, consider `html2text_options={'bypass_tables': True}`. Then, extract the table HTML separately (e.g., using `CrawlResult.html` and a CSS selector for the table) and process it with a specialized table parsing library or even an LLM call focused just on table interpretation.
* **Layouts:** Aggressive `RelevantContentFilter` strategies can help. If parts of a complex layout are consistently noise, use `CrawlerRunConfig.excluded_selector` to remove them before they even reach the Markdown generator.
* 7.1.3. When to prefer `fit_markdown` over `raw_markdown` (or `markdown_with_citations`).
* **Reasoning:**
* **`fit_markdown`:** Best for tasks requiring high relevance and conciseness (e.g., RAG context, focused summarization). It reflects the output of your content filtering.
* **`raw_markdown` / `markdown_with_citations`:** Better when you need a broader representation of the page's textual content, or when the filtering might be too aggressive and discard potentially useful context. Also, if your `content_source` is already very clean (e.g., from a targeted CSS selector), the difference might be minimal.
* 7.1.4. Balancing detail vs. conciseness for different LLM tasks (e.g., summarization vs. Q&A).
* **Summarization:** `fit_markdown` from a well-configured `LLMContentFilter` or `BM25ContentFilter` is often ideal. You might also use more aggressive `html2text_options` to remove minor elements.
* **Q&A / RAG:** You might prefer a slightly less aggressive filter or even `raw_markdown` (if `content_source` is clean) to ensure all potentially relevant details are available. Citations (`markdown_with_citations` and `references_markdown`) are crucial here for source tracking.
* 7.2. **Pre-processing HTML for Better Markdown**
* 7.2.1. Using `CrawlerRunConfig.excluded_tags` or `excluded_selector` to remove noise before Markdown generation.
* **How:** These parameters in `CrawlerRunConfig` are applied by the `ContentScrapingStrategy` *before* the HTML even reaches the `MarkdownGenerationStrategy`.
* **Why:** This is the most efficient way to remove large, consistently irrelevant sections (like global headers, footers, sidebars, ad blocks) across all outputs (HTML, Markdown, etc.).
* *Code Example:*
```python
# In CrawlerRunConfig
# config = CrawlerRunConfig(
# excluded_tags=["nav", "footer", "script", "style"],
# excluded_selector=".ads, #social-share-buttons"
# )
```
* 7.2.2. The role of `ContentScrapingStrategy` (e.g., `LXMLWebScrapingStrategy` or the default `WebScrapingStrategy` using BeautifulSoup) in preparing the HTML that `DefaultMarkdownGenerator` receives.
* **Explanation:** The `ContentScrapingStrategy` is responsible for the initial cleaning of the HTML. Its output (what becomes `cleaned_html`) is the direct input to `DefaultMarkdownGenerator` if `content_source` is `"cleaned_html"`. Understanding how your chosen scraping strategy cleans HTML is key to predicting the input for Markdown generation. `LXMLWebScrapingStrategy` is generally faster and can be more robust for heavily malformed HTML.
* 7.3. **Post-processing Generated Markdown**
* 7.3.1. When and why you might need to further process Markdown from `MarkdownGenerationResult`.
* **Scenarios:**
* Custom formatting not achievable with `html2text` options (e.g., specific table styles, unique list markers).
* Domain-specific transformations (e.g., converting certain patterns to custom shortcodes).
* Further cleaning or condensing based on rules `html2text` or content filters don't cover.
* 7.3.2. *Example:* [Python snippet for custom regex replacements or structural adjustments on `raw_markdown`].
```python
import re
def custom_post_process_markdown(markdown_text):
    """Apply custom transformations to raw Markdown text.

    Two example transformations: bold every occurrence of the product
    name, and insert a horizontal rule after each H2 heading.
    """
    # Bold every literal occurrence of "Crawl4AI".
    emphasized = markdown_text.replace("Crawl4AI", "**Crawl4AI**")
    # Append a horizontal rule after each line starting with "## ".
    h2_pattern = re.compile(r"(^## .*)", re.MULTILINE)
    return h2_pattern.sub(r"\1\n\n---", emphasized)
# result = await crawler.arun(...)
# if result.success:
# final_markdown = custom_post_process_markdown(result.markdown.raw_markdown)
# print(final_markdown)
```
* 7.4. **Combining Different Strategies for Optimal Results**
* 7.4.1. *Scenario:* Using a `RelevantContentFilter` to get `fit_html`, then passing `fit_html` to a custom Markdown generator that expects highly focused input.
* **How:**
1. Instantiate your filter (e.g., `LLMContentFilter`).
2. Instantiate your custom Markdown generator (`MyCustomMarkdownStrategy`).
3. In `CrawlerRunConfig`, set `markdown_generator` to your custom generator.
4. Crucially, within your custom generator's `generate_markdown` method, ensure you *first* apply the `content_filter` (passed as an argument) to the `input_html` to get the `fit_html`, and then process this `fit_html` with your custom logic. Or, configure your custom generator's `content_source="fit_html"` and pass the filter during its initialization.
* 7.4.2. *Scenario:* Using one set of `html2text_options` for `raw_markdown` and another for generating an alternative Markdown representation (perhaps for a different LLM or purpose).
* **How:** This would typically require two separate calls to `crawler.arun()` with different `CrawlerRunConfig` objects, each specifying a `DefaultMarkdownGenerator` with different `options`. Alternatively, a custom `MarkdownGenerationStrategy` could internally generate multiple Markdown versions with different settings and include them in custom fields within `MarkdownGenerationResult` (though this would require modifying or extending `MarkdownGenerationResult`).
## 8. Troubleshooting Common Markdown Generation Issues
* 8.1. **Problem: Markdown is too noisy / includes boilerplate**
* 8.1.1. **Solutions:**
* **Use a `RelevantContentFilter`**:
* Start with `PruningContentFilter`. It's fast and good for common boilerplate.
```python
# from crawl4ai.content_filter_strategy import PruningContentFilter
# from crawl4ai import DefaultMarkdownGenerator
# md_generator = DefaultMarkdownGenerator(content_filter=PruningContentFilter(threshold=0.5))
```
* If more precision is needed, try `BM25ContentFilter` with a relevant query or `LLMContentFilter` with clear instructions.
* **Refine `excluded_tags` or `excluded_selector` in `CrawlerRunConfig`**: This removes elements *before* any Markdown strategy sees them.
```python
# run_config = CrawlerRunConfig(
# excluded_tags=["nav", "footer", "aside", "script"],
# excluded_selector=".ad-banner, #social-links"
# )
```
* **Adjust `html2text_options`**: Options like `ignore_links`, `ignore_images`, `skip_internal_links` can reduce clutter.
```python
# from crawl4ai import DefaultMarkdownGenerator
# md_generator = DefaultMarkdownGenerator(options={"ignore_images": True, "ignore_links": True})
```
* 8.2. **Problem: Important content is missing from Markdown**
* 8.2.1. **Solutions:**
* **Check if `content_filter` is too aggressive**: If using a filter, try lowering its threshold (e.g., `bm25_threshold` for `BM25ContentFilter`) or simplifying instructions for `LLMContentFilter`. Temporarily disable the filter to see if the content appears in `raw_markdown`.
* **Ensure `word_count_threshold` in `CrawlerRunConfig` (or scraping strategy) is not too high**: The default `WebScrapingStrategy` might have its own cleaning. If `CrawlerRunConfig.word_count_threshold` is too high, it might remove short but important paragraphs.
* **Verify `html2text_options` are not inadvertently removing desired content**: For example, if `ignore_links=True` is set, link text itself might still be there, but the link URL will be gone.
* **Examine `cleaned_html` or `fit_html`**: Inspect `result.markdown.fit_html` (if a filter was used) or `result.cleaned_html` (if no filter and `content_source` was `cleaned_html`). If the content is missing here, the issue is with HTML cleaning or filtering, not the Markdown conversion itself. If it's present in these HTML versions but not in the final Markdown, the issue is likely with `html2text_options` or the conversion process.
* 8.3. **Problem: Tables are mangled or poorly formatted**
* 8.3.1. **Solutions:**
* **Try `html2text_options={'bypass_tables': True}`**: This tells `html2text` to skip converting tables.
```python
# from crawl4ai import DefaultMarkdownGenerator
# md_generator = DefaultMarkdownGenerator(options={"bypass_tables": True})
# run_config = CrawlerRunConfig(markdown_generator=md_generator)
# result = await crawler.arun(...)
# # Now result.markdown.raw_markdown will not have Markdown tables.
# # You'd need to parse tables from result.cleaned_html or result.markdown.fit_html
```
You can then extract the table HTML directly from `result.cleaned_html` (or `result.markdown.fit_html`) using BeautifulSoup or lxml and parse it with a library better suited for complex tables (e.g., pandas `read_html`, or a custom parser).
* **Experiment with other `html2text` table formatting options**: Options like `pad_tables` might slightly improve appearance, but won't fix fundamentally complex table structures.
* **Consider if the table is truly a data table or a layout table**: Layout tables are often problematic for Markdown conversion and should ideally be filtered out by `PruningContentFilter` or more aggressive cleaning.
* 8.4. **Problem: Citations or references are incorrect/missing**
* 8.4.1. **Solutions:**
* **Ensure links are present in the HTML input to `DefaultMarkdownGenerator`**: If the links were removed during an earlier HTML cleaning stage (e.g., by an aggressive `ContentScrapingStrategy` or `excluded_tags`), they can't be converted to citations.
* **Verify `ignore_links` is not `True` in `html2text_options`**: `DefaultMarkdownGenerator` relies on `CustomHTML2Text` to see the links to convert them. If `ignore_links=True`, the links are stripped before citation processing can occur.
* **Check for unusual link structures in the HTML**: Very non-standard link formats (e.g., heavily JavaScript-driven links without `href` attributes) might not be picked up. `CustomHTML2Text` primarily looks for standard `<a href="...">` tags.
* 8.5. **Problem: Markdown formatting is not ideal for a specific LLM**
* 8.5.1. **Solutions:**
* **Fine-tune `html2text_options` extensively**: This is the first line of defense. Experiment with all available options (see Section 6.2) to control aspects like heading styles, list formatting, code block rendering, etc.
* **Consider a custom `MarkdownGenerationStrategy`**: If `html2text` options are insufficient, you might need to build your own strategy, possibly using a different Markdown conversion library or implementing custom transformation logic (see Section 2.1.3).
* **Implement post-processing steps**: After getting the Markdown from `MarkdownGenerationResult`, apply your own Python scripts (e.g., using regex) to further refine the formatting (see Section 7.3.2).
* 8.6. **Debugging Workflow**
* 8.6.1. **Start with `raw_html` from `CrawlResult`**: `print(result.html)` This is the very first HTML fetched, before any processing. Is your target content even here?
* 8.6.2. **Examine `cleaned_html` (or `fit_html`)**:
* If no content filter is used in `MarkdownGenerationStrategy`, inspect `result.cleaned_html`. This is what `DefaultMarkdownGenerator` (with `content_source="cleaned_html"`) will use.
* If a content filter *is* used, inspect `result.markdown.fit_html`. This is what `DefaultMarkdownGenerator` will use to produce `fit_markdown`.
* Is your target content present in these intermediate HTML stages?
* 8.6.3. **Isolate the issue**:
* **HTML Cleaning/Scraping:** If content is missing from `cleaned_html` (but present in `raw_html`), the issue lies with the `ContentScrapingStrategy` or `CrawlerRunConfig` parameters like `excluded_tags`, `css_selector`, `target_elements`.
* **Content Filtering:** If content is in `cleaned_html` but missing from `fit_html`, the issue is with your `RelevantContentFilter` configuration.
* **Markdown Conversion:** If content is in `cleaned_html`/`fit_html` but malformed or missing in the final Markdown fields (`raw_markdown`, `fit_markdown`), the issue is likely with `html2text_options` or the `CustomHTML2Text` conversion process.
* 8.6.4. **Use `verbose=True` in relevant configs**: Set `verbose=True` in `BrowserConfig` and `CrawlerRunConfig` for more detailed logging output from Crawl4AI, which can provide clues.
## 9. Conclusion and Next Steps
* 9.1. Recap of key strategies for effective Markdown generation.
* **Summary:** Crawl4AI provides a flexible Markdown generation pipeline. Start with `DefaultMarkdownGenerator`. Use `html2text_options` for stylistic control. Employ `RelevantContentFilter` strategies (`PruningContentFilter`, `BM25ContentFilter`, `LLMContentFilter`) to create focused `fit_markdown` for LLMs. Choose the appropriate `content_source` based on your needs. For highly custom requirements, implement your own `MarkdownGenerationStrategy`.
* 9.2. Pointers to other relevant documentation sections (e.g., `RelevantContentFilter` deep dive, `CustomHTML2Text` options in API reference).
* **Suggestion:** For a detailed breakdown of each `RelevantContentFilter`, see the "Content Filtering Strategies" guide. For an exhaustive list of `html2text` options, refer to the `CustomHTML2Text` API documentation or the original `html2text` library's documentation.
* 9.3. Encouragement for experimentation and community contributions.
* **Call to Action:** The best way to master Markdown generation is to experiment with different configurations and content types. If you develop useful custom strategies or identify improvements, consider contributing them back to the Crawl4AI community!
---
```

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -1,188 +0,0 @@
Okay, I have read the "vibe" description for `crawl4ai`. Based on this, and adhering to the "memory" document type requirements, here is the detailed Markdown outline:
```markdown
# Detailed Outline for crawl4ai - vibe Component
**Target Document Type:** memory
**Target Output Filename Suggestion:** `llm_memory_vibe_coding.md`
**Library Version Context:** 0.6.3
**Outline Generation Date:** 2025-05-24
---
## 1. Vibe Coding with Crawl4AI: Core Concept
* 1.1. Purpose:
* Provides a conceptual framework for interacting with the `crawl4ai` library, particularly when using AI coding assistants.
* Aims to simplify the process of building web data applications by focusing on high-level capabilities and key building blocks, enabling users to guide AI assistants effectively even with limited direct `crawl4ai` API knowledge.
* 1.2. Principle:
* Describes how users can communicate their web scraping and data extraction goals to an AI assistant, which then translates these "vibes" or high-level intentions into `crawl4ai` Python code by leveraging knowledge of the library's core components and configurations.
## 2. `crawl4ai` High-Level Capabilities (for Vibe Prompts)
* 2.1. Fetching Webpages
* 2.1.1. Description: The library can retrieve content from specified web URLs.
* 2.2. Converting Web Content to Clean Markdown
* 2.2.1. Description: The library can process raw HTML content and convert it into a cleaned, structured Markdown format.
* 2.2.2. Applications: Suitable for content summarization, input for Question & Answering systems, and as a pre-processing step for other LLMs.
* 2.3. Extracting Specific Information (JSON)
* 2.3.1. Description: The library can extract targeted data elements from webpages and organize them into a JSON structure.
* 2.3.2. Examples: Can be used to extract product names, prices from e-commerce sites, article headlines, author names, etc.
* 2.4. Crawling Multiple Pages
* 2.4.1. Description: The library supports concurrent fetching and processing of a list of URLs.
* 2.5. Taking Screenshots and Generating PDFs
* 2.5.1. Description: The library can capture visual representations of webpages as PNG screenshots or generate PDF documents.
* 2.6. Handling Simple Page Interactions
* 2.6.1. Description: The library can execute JavaScript to simulate basic user interactions on a webpage, such as clicking buttons (e.g., "load more") or scrolling.
## 3. Key `crawl4ai` Building Blocks (API Reference for Vibe Coding Context)
* 3.1. Class `AsyncWebCrawler`
* 3.1.1. Purpose: The primary entry point and main tool within `crawl4ai` for orchestrating web crawling and data extraction tasks.
* 3.1.2. Initialization (`__init__`):
* Signature: `AsyncWebCrawler(self, crawler_strategy: Optional[AsyncCrawlerStrategy] = None, config: Optional[BrowserConfig] = None, base_directory: str = ..., thread_safe: bool = False, logger: Optional[AsyncLoggerBase] = None, **kwargs)`
* Parameters:
* `crawler_strategy (Optional[AsyncCrawlerStrategy])`: The underlying strategy for web crawling (e.g., `AsyncPlaywrightCrawlerStrategy`). Defaults to `AsyncPlaywrightCrawlerStrategy`.
* `config (Optional[BrowserConfig])`: Configuration for the browser instance. See section 3.5 for details.
* Other parameters are generally handled by defaults for vibe coding.
* 3.2. Method `AsyncWebCrawler.arun()`
* 3.2.1. Purpose: Executes a crawl operation on a single URL or resource.
* 3.2.2. Signature: `async def arun(self, url: str, config: Optional[CrawlerRunConfig] = None, **kwargs) -> RunManyReturn`
* 3.2.3. Parameters:
* `url (str)`: The target resource.
* Description: Can be a standard web URL (e.g., "https://example.com"), a local file path (e.g., "file:///path/to/file.html"), or raw HTML content (e.g., "raw:<html>...</html>").
* `config (Optional[CrawlerRunConfig])`: An instance of `CrawlerRunConfig` specifying how this particular crawl run should be executed. See section 3.4 for details.
* 3.3. Method `AsyncWebCrawler.arun_many()`
* 3.3.1. Purpose: Executes crawl operations on a list of URLs or resources, often concurrently.
* 3.3.2. Signature: `async def arun_many(self, urls: List[str], config: Optional[CrawlerRunConfig] = None, dispatcher: Optional[BaseDispatcher] = None, **kwargs) -> RunManyReturn`
* 3.3.3. Parameters:
* `urls (List[str])`: A list of target resources (URLs, file paths, raw HTML strings).
* `config (Optional[CrawlerRunConfig])`: An instance of `CrawlerRunConfig` applied to all URLs in the list. See section 3.4 for details.
* 3.4. Class `CrawlerRunConfig`
* 3.4.1. Purpose: Configuration object for individual crawl runs, controlling aspects like content extraction, page interaction, and output formats.
* 3.4.2. Key Parameters for Vibe Coding Context:
* `markdown_generator (Optional[MarkdownGenerationStrategy])`:
* Description: Specifies the strategy for generating Markdown.
* Default: An instance of `DefaultMarkdownGenerator`.
* Note for Vibe Coding: Can be `DefaultMarkdownGenerator(content_filter=PruningContentFilter())` for cleaner output.
* `extraction_strategy (Optional[ExtractionStrategy])`:
* Description: Specifies the strategy for extracting structured data.
* Supported Strategies (for Vibe Coding):
* `JsonCssExtractionStrategy`: For extracting data based on CSS selectors from structured HTML. Requires a `schema` dictionary.
* `LLMExtractionStrategy`: For extracting data using an LLM, often for complex or unstructured HTML. Requires an `LLMConfig` and an `instruction` or Pydantic model defining the desired output.
* `js_code (Optional[Union[str, List[str]]])`:
* Description: JavaScript code (or a list of code snippets) to be executed on the page after it loads.
* `wait_for (Optional[str])`:
* Description: A CSS selector or JavaScript expression. The crawler will wait for this condition to be met after `js_code` execution before proceeding.
* `session_id (Optional[str])`:
* Description: An identifier used to maintain the state of a browser page across multiple `arun` calls. Essential for multi-step interactions on the same page.
* `js_only (bool)`:
* Description: If `True` (and `session_id` is used), only executes `js_code` on the existing page without a full navigation/reload. Default is `False`.
* `screenshot (bool)`:
* Description: If `True`, captures a screenshot of the page. Result in `CrawlResult.screenshot`. Default is `False`.
* `pdf (bool)`:
* Description: If `True`, generates a PDF of the page. Result in `CrawlResult.pdf`. Default is `False`.
* `cache_mode (Optional[CacheMode])`:
* Description: Controls caching behavior.
* Type: `crawl4ai.cache_context.CacheMode` (Enum).
* Common Values: `CacheMode.ENABLED`, `CacheMode.BYPASS`.
* 3.5. Class `BrowserConfig`
* 3.5.1. Purpose: Configures persistent browser-level settings for an `AsyncWebCrawler` instance.
* 3.5.2. Key Parameters for Vibe Coding Context:
* `headless (bool)`:
* Description: If `True`, the browser runs without a visible UI. If `False`, the browser UI is shown.
* Default: `True`.
* `proxy_config (Optional[Union[ProxyConfig, Dict[str, str]]])`:
* Description: Configuration for using a proxy server.
* Structure (if dict): `{"server": "http://<host>:<port>", "username": "<user>", "password": "<pass>"}`.
* `user_agent (Optional[str])`:
* Description: Custom User-Agent string to be used by the browser.
* 3.6. Class `LLMConfig`
* 3.6.1. Purpose: Configures settings for interacting with Large Language Models, used by `LLMExtractionStrategy`.
* 3.6.2. Key Parameters:
* `provider (str)`:
* Description: Specifies the LLM provider and model identifier.
* Examples: "openai/gpt-4o-mini", "ollama/llama3", "anthropic/claude-3-opus-20240229".
* `api_token (Optional[str])`:
* Description: API key for the LLM provider. Can be the actual key or an environment variable reference (e.g., "env:OPENAI_API_KEY").
* 3.7. Class `CrawlResult`
* 3.7.1. Purpose: The data object returned by `crawl4ai` operations, containing the results and metadata of a crawl.
* 3.7.2. Key Attributes:
* `success (bool)`: `True` if the crawl was successful, `False` otherwise.
* `markdown (MarkdownGenerationResult)`: Object containing Markdown representations.
* `markdown.raw_markdown (str)`: Markdown generated directly from the cleaned HTML.
* `markdown.fit_markdown (str)`: Markdown potentially further processed by content filters.
* `extracted_content (Optional[str])`: JSON string of structured data if an `ExtractionStrategy` was used and successful.
* `links (Links)`: Object containing `internal` and `external` lists of `Link` objects. Each `Link` object has `href`, `text`, `title`.
* `media (Media)`: Object containing lists of `MediaItem` for `images`, `videos`, `audios`, and `tables`. Each `MediaItem` has `src`, `alt`, `score`, etc.
* `screenshot (Optional[str])`: Base64 encoded string of the PNG screenshot, if `screenshot=True`.
* `pdf (Optional[bytes])`: Raw bytes of the PDF document, if `pdf=True`.
* `error_message (Optional[str])`: Description of the error if `success` is `False`.
## 4. Common `crawl4ai` Usage Patterns (Vibe Recipes Mapped to Components)
* 4.1. Task: Get Clean Markdown from a Page
* 4.1.1. Description: Fetch a single webpage and convert its main content into clean Markdown.
* 4.1.2. Key `crawl4ai` elements:
* `AsyncWebCrawler`
* `arun()` method.
* `CrawlerRunConfig`:
* `markdown_generator`: Typically `DefaultMarkdownGenerator()`. For very clean output, `DefaultMarkdownGenerator(content_filter=PruningContentFilter())`.
* 4.2. Task: Extract All Product Names and Prices from an E-commerce Category Page
* 4.2.1. Description: Scrape structured data (e.g., product names, prices) from a page with repeating elements.
* 4.2.2. Key `crawl4ai` elements:
* `AsyncWebCrawler`
* `arun()` method.
* `CrawlerRunConfig`:
* `extraction_strategy`: `JsonCssExtractionStrategy(schema={"name_field": "h2.product-title", "price_field": "span.price"})`. The schema's CSS selectors identify where to find the data.
* 4.3. Task: Extract Key Information from an Article using an LLM
* 4.3.1. Description: Use an LLM to parse an article and extract specific fields like author, date, and a summary into a JSON format.
* 4.3.2. Key `crawl4ai` elements:
* `AsyncWebCrawler`
* `arun()` method.
* `CrawlerRunConfig`:
* `extraction_strategy`: `LLMExtractionStrategy(llm_config=..., instruction=..., schema=...)`.
* `LLMConfig`: Instance specifying `provider` (e.g., "openai/gpt-4o-mini") and `api_token`.
* Schema for `LLMExtractionStrategy`: Can be a Pydantic model definition or a dictionary describing the target JSON structure.
* 4.4. Task: Crawl Multiple Pages of a Blog (Clicking "Next Page")
* 4.4.1. Description: Navigate through paginated content by simulating clicks on "Next Page" or similar links, collecting data from each page.
* 4.4.2. Key `crawl4ai` elements:
* `AsyncWebCrawler`
* Multiple sequential calls to `arun()` (typically in a loop).
* `CrawlerRunConfig` (reused or cloned for each step):
* `session_id`: A consistent identifier (e.g., "blog_pagination_session") to maintain the browser state across `arun` calls.
* `js_code`: JavaScript to trigger the "Next Page" action (e.g., `document.querySelector('a.next-page-link').click();`).
* `wait_for`: A CSS selector or JavaScript condition to ensure the new page content has loaded before proceeding.
* `js_only=True`: For subsequent `arun` calls after the initial page load to indicate only JS interaction without full navigation.
* 4.5. Task: Get Screenshots of a List of URLs
* 4.5.1. Description: Capture screenshots for a batch of URLs.
* 4.5.2. Key `crawl4ai` elements:
* `AsyncWebCrawler`
* `arun_many()` method.
* `CrawlerRunConfig`:
* `screenshot=True`.
## 5. Key Input Considerations for `crawl4ai` Operations (Inferred from Vibe Prompting Tips)
* 5.1. Clear Objective: `crawl4ai` operations are guided by the configuration. The configuration should reflect the user's goal (e.g., Markdown generation, specific data extraction, media capture).
* 5.2. URL Input: The `arun` method requires a single `url` string. `arun_many` requires a `List[str]` of URLs.
* 5.3. Structured Data Extraction Guidance:
* For `JsonCssExtractionStrategy`, the `schema` parameter (a dictionary mapping desired field names to CSS selectors) is essential.
* For `LLMExtractionStrategy`, the `instruction` parameter (natural language description of desired data) and/or a `schema` (Pydantic model or dictionary) are crucial, along with a configured `LLMConfig`.
* 5.4. LLM Configuration: When `LLMExtractionStrategy` is used, an `LLMConfig` instance specifying `provider` and `api_token` (if applicable) must be provided.
* 5.5. Dynamic Page Handling: For pages requiring interaction, `CrawlerRunConfig` parameters like `js_code`, `wait_for`, `session_id`, and `js_only` are used.
## 6. Expected Output Data from `crawl4ai` Operations (Accessing `CrawlResult`)
* 6.1. Generated Python Code: When using an AI assistant with `crawl4ai` context, the AI is expected to generate Python code that utilizes `crawl4ai` classes and methods.
* 6.2. `CrawlResult` Object: The primary output of `arun()` and `arun_many()` calls.
* `result.success (bool)`: Indicates if the individual crawl operation was successful.
* `result.markdown.raw_markdown (str)` / `result.markdown.fit_markdown (str)`: Contains the generated Markdown content.
* `result.extracted_content (Optional[str])`: Contains the JSON string of structured data if an extraction strategy was successful.
* `result.links (Links)`: Provides access to lists of internal and external links.
* `result.media (Media)`: Provides access to lists of images, videos, audio files, and tables.
* `result.screenshot (Optional[str])`: Base64 encoded screenshot data.
* `result.pdf (Optional[bytes])`: Raw PDF data.
* `result.error_message (Optional[str])`: Error details if `success` is `False`.
* 6.3. Files on Disk: Operations like screenshot or PDF generation, or custom code within an AI-generated script, might save files to the local disk (e.g., PNGs, PDFs, JSON files). The paths depend on the configuration or the custom code.
```

View File

@@ -1,643 +0,0 @@
```markdown
# Detailed Outline for crawl4ai - vibe Component
**Target Document Type:** reasoning
**Target Output Filename Suggestion:** `llm_reasoning_vibe.md`
**Library Version Context:** 0.6.3
**Outline Generation Date:** 2025-05-24
---
# Vibe Coding with Crawl4AI: A Guide for AI Coding Assistants
## 1. Introduction: What is "Vibe Coding" with Crawl4AI?
* **1.1. Explaining the Core Idea:**
"Vibe Coding" with Crawl4AI is about empowering users, even those with limited direct coding experience, to build powerful web data applications. Instead of meticulously writing every line of Python, you focus on clearly describing your data extraction or web interaction goals to an AI coding assistant. This guide teaches you how to provide that AI assistant with the *right context* about Crawl4AI, so it can generate effective and efficient code for you. The goal is to translate your "vibe" your high-level intent into working Crawl4AI solutions quickly.
* **1.2. Who is this Guide For?**
This guide is designed for:
* **Users new to web scraping or `crawl4ai`:** If you prefer to articulate your needs in natural language and have an AI assistant handle the code generation, this guide is for you.
* **Data analysts, researchers, and product managers:** Anyone who needs web data but doesn't want to get bogged down in the intricacies of web scraping libraries.
* **Developers looking for rapid prototyping:** Even experienced developers can use "vibe coding" to quickly generate boilerplate or test ideas with `crawl4ai` before refining the code.
* **AI Coding Assistant Users:** This guide helps you understand what information to feed your AI to get the best `crawl4ai` code.
* **1.3. How this Guide Helps You (and Your AI Assistant):**
By understanding the concepts in this guide, you (and by extension, your AI assistant) will:
* Grasp the high-level capabilities of `crawl4ai` that are most relevant for prompting an AI.
* Learn the key terminology and building blocks of `crawl4ai` to include in your prompts for precise code generation.
* Discover common "vibe recipes"—typical data extraction tasks—and how to prompt an AI to solve them using `crawl4ai`.
* Pick up effective prompting patterns to maximize the quality of AI-generated `crawl4ai` code.
## 2. High-Level Capabilities of Crawl4AI (What to Tell Your AI Assistant Crawl4AI Can Do)
When you're "vibe coding" with your AI assistant, you don't need to explain every nuance of `crawl4ai`. Instead, focus on what it *can do* for you. Here's a high-level overview of capabilities you can confidently tell your AI assistant about:
* **2.1. Fetching Any Webpage:**
* **How to tell your AI:** "Crawl4AI can fetch the content of any webpage, whether it's a simple static page or a complex JavaScript-heavy application."
* **Why it's important:** This establishes the fundamental capability—getting the raw HTML from a target URL.
* **2.2. Converting Web Content into Clean Markdown:**
* **How to tell your AI:** "Crawl4AI is great at turning messy web pages into clean, readable Markdown. This is perfect if I need to summarize an article, feed content into another LLM for Q&A, or just get the main text."
* **Why it's important:** Markdown is often the desired end-format for LLM-based tasks, and `crawl4ai` simplifies this conversion.
* **2.3. Extracting Specific Pieces of Information (Structured Data/JSON):**
* **How to tell your AI:** "If I need specific data from a page, like all the product names and prices from an e-commerce site, or all the headlines from a news page, Crawl4AI can extract that and give it to me as structured JSON."
* **Why it's important:** This highlights `crawl4ai`'s ability to go beyond simple text extraction and pull out specific, targeted information.
* **2.4. Crawling Multiple Pages at Once:**
* **How to tell your AI:** "If I have a list of URLs, Crawl4AI can process them all efficiently, often in parallel, instead of me having to do them one by one."
* **Why it's important:** This points to the library's capability for batch processing, saving significant time for larger tasks.
* **2.5. Taking Screenshots or Making PDFs of Pages:**
* **How to tell your AI:** "Crawl4AI can take a screenshot of a webpage or save it as a PDF document. This is useful for archiving, visual checks, or sharing."
* **Why it's important:** Visual capture and PDF generation are common requirements that `crawl4ai` handles easily.
* **2.6. Handling Simple Page Interactions:**
* **How to tell your AI:** "For pages that need a bit of interaction, like clicking a 'load more' button to reveal more content or scrolling down, Crawl4AI can execute JavaScript to do that."
* **Why it's important:** This shows that `crawl4ai` isn't limited to static content and can handle basic dynamic page elements to access more data.
## 3. Key Building Blocks: Communicating Crawl4AI Concepts to Your AI Assistant
To get the best results from your AI assistant, you'll want to mention the key components of `crawl4ai` it should use. Think of these as the essential vocabulary for your "vibe" prompts.
* **3.1. `AsyncWebCrawler`**: The Main Tool
* **3.1.1. Explaining its Role:**
* **What to tell your AI:** "The main tool we'll use in `crawl4ai` is called `AsyncWebCrawler`. We always need to create an instance of this first. It's best to use it with an `async with` statement to make sure resources are managed correctly."
* **Why it's important for the AI:** This directs the AI to the central class for all crawling operations and hints at the asynchronous nature and proper resource management.
* **3.1.2. Basic Usage Pattern:**
* **Vibe Prompt:** "Let's start a `crawl4ai` script. We'll need an `AsyncWebCrawler`."
* **Expected AI action:** The AI should generate the basic setup:
```python
import asyncio
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig # Common imports
async def main():
# browser_config = BrowserConfig(...) # Optional, if needed
async with AsyncWebCrawler(# config=browser_config
) as crawler:
# ... your crawl logic will go here ...
pass
if __name__ == "__main__":
asyncio.run(main())
```
* **3.2. `arun(url, config)`**: Crawling a Single Page
* **3.2.1. Explaining its Purpose:**
* **What to tell your AI:** "To crawl a single page, we'll use the `arun` method of the `AsyncWebCrawler` instance. It takes the URL and an optional configuration object."
* **Why it's important for the AI:** This points to the primary method for individual page crawls.
* **3.2.2. Guiding the AI on `url` Specification:**
* **What to tell your AI:** "The `url` for `arun` can be a regular web address (like 'https://example.com'), a local file path (if it starts with `file://`), or even raw HTML content (if it starts with `raw:`)."
* **Vibe Prompts & AI Interpretation:**
* "Crawl this URL: https://my-target-site.com" -> AI uses `url="https://my-target-site.com"`
* "Process this local HTML file: /path/to/my/page.html" -> AI uses `url="file:///path/to/my/page.html"`
* "I have some raw HTML I want to parse: `<html><body><h1>Test</h1></body></html>`" -> AI uses `url="raw:<html><body><h1>Test</h1></body></html>"`
* **3.2.3. Emphasizing `config` (`CrawlerRunConfig`):**
* **What to tell your AI:** "The most important part for customizing a crawl is the `config` parameter of `arun`. This is where we'll use `CrawlerRunConfig` to tell `crawl4ai` *how* to crawl, what to extract, and what to do with the page."
* **Vibe Prompt:** "...and when you call `arun`, pass in a `CrawlerRunConfig` object with these settings: [describe settings]."
* **3.3. `arun_many(urls, config)`**: Crawling Multiple Pages
* **3.3.1. Explaining its Purpose:**
* **What to tell your AI:** "If I have a list of URLs, we should use `arun_many` instead of calling `arun` in a loop. It's more efficient for batch crawling."
* **Why it's important for the AI:** Guides the AI to use the optimized method for multiple URLs, which handles concurrency internally.
* **3.3.2. Vibe Prompt:**
* "Crawl all these pages: https://example.com/page1, https://example.com/page2, https://example.com/page3."
* **Expected AI action:**
```python
# ... inside async def main() and async with AsyncWebCrawler ...
list_of_urls = [
"https://example.com/page1",
"https://example.com/page2",
"https://example.com/page3"
]
# run_config = CrawlerRunConfig(...) # Define config for all URLs
results = await crawler.arun_many(urls=list_of_urls, config=run_config)
for result in results:
# process each result
pass
```
* **3.4. `CrawlerRunConfig`**: Tailoring the Crawl (Simplified for Vibe Coding)
* **3.4.1. Getting Markdown:**
* **Basic Request Vibe:** "Get the Markdown for this page [URL]."
* **AI's understanding:** The AI should know that `crawl4ai` produces Markdown by default, so a very simple `CrawlerRunConfig()` or even `None` might suffice if no other settings are needed.
* **For Cleaner Markdown Vibe:** "I need very clean Markdown from [URL], suitable for feeding into another LLM. Remove boilerplate like headers, footers, and navigation."
* **AI's understanding:** "Ah, the user wants cleaner output. I should suggest or use `DefaultMarkdownGenerator` combined with `PruningContentFilter`. The `PruningContentFilter` heuristically tries to remove common non-content sections."
* **Code Snippet AI might generate:**
```python
from crawl4ai import CrawlerRunConfig, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter
markdown_generator = DefaultMarkdownGenerator(
content_filter=PruningContentFilter()
)
run_config = CrawlerRunConfig(
markdown_generator=markdown_generator
)
# result = await crawler.arun(url="...", config=run_config)
# print(result.markdown.fit_markdown) # Access the filtered markdown
```
* **Reasoning to convey (implicitly or explicitly):** `PruningContentFilter` is a good first step for general boilerplate removal. `DefaultMarkdownGenerator` then converts the filtered HTML to Markdown. Accessing `result.markdown.fit_markdown` gives this cleaner version.
* **3.4.2. Extracting Specific Data (JSON):**
* **3.4.2.1. Using `JsonCssExtractionStrategy` for Structured Pages:**
* **When to suggest it (Vibe):** "The page I want to crawl ([URL]) has a list of items, like products on an e-commerce category page, and each item has a similar layout. I want to extract these repeating items into a JSON list."
* **Information to provide the AI (Vibe):** "For each item, I want to get the 'product_name', which is usually in an `<h2>` tag, and the 'price', which seems to be in a `<span>` tag with a class like 'price-tag' or 'current-price'."
* **AI's Role & Reasoning:** The AI should recognize this pattern and suggest `JsonCssExtractionStrategy`. It understands that the user is describing a schema. The AI's job is to translate "name from h2" into `{"name": "product_name", "selector": "h2", "type": "text"}` within the `fields` list of a schema dictionary, and the overall repeating item selector into `baseSelector`. The AI should also know to set `extraction_type="schema"` on `LLMExtractionStrategy` if it were using that for schema generation, but here it's direct CSS.
* **Code Snippet AI might generate:**
```python
from crawl4ai import CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
# AI would help construct this schema based on user's description
schema = {
"name": "ProductList",
"baseSelector": "div.product-item", # Example selector for each product block
"fields": [
{"name": "product_name", "selector": "h2.product-title", "type": "text"},
{"name": "price", "selector": "span.price-tag", "type": "text"}
]
}
extraction_strategy = JsonCssExtractionStrategy(schema=schema)
run_config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
# result = await crawler.arun(url="...", config=run_config)
# if result.success and result.extracted_content:
# products = json.loads(result.extracted_content)
# for product in products:
# print(f"Name: {product.get('product_name')}, Price: {product.get('price')}")
```
* **3.4.2.2. Using `LLMExtractionStrategy` for Complex/Unclear Structures:**
* **When to suggest it (Vibe):** "The page ([URL]) has the information I want, but it's not in a clear, repeating list, or it's mixed in with a lot of text. I need the AI to understand the content to pull out specific details." Or, "I want to extract information that requires some interpretation, like summarizing a paragraph."
* **Information to provide the AI (Vibe):**
* "Use `LLMExtractionStrategy` for this."
* "The LLM I want to use is [LLM provider/model, e.g., 'openai/gpt-4o-mini'] and my API key is [YOUR_API_KEY_OR_ENV_VAR_NAME] (or tell it to look for an env var)."
* **Option A (Describing fields):** "I need a JSON object with the following fields: 'author_name', 'article_publish_date', and a 'short_summary' (about 2 sentences)."
* **Option B (Example JSON):** "The JSON output should look something like this: `{\"author\": \"Jane Doe\", \"published_on\": \"2024-05-23\", \"summary\": \"This article discusses...\"}`."
* **Option C (Pydantic Model - more advanced but best for AI):** "Here's a Pydantic model that defines the structure I want: [Pydantic Class Code Snippet]. Use this for the schema."
* **AI's Role & Reasoning:** The AI needs to construct an `LLMConfig` and an `LLMExtractionStrategy`. If the user provides field descriptions or an example JSON, the AI can generate a simple schema dictionary. If a Pydantic model is provided, the AI should use `MyPydanticModel.model_json_schema()` to create the schema for `LLMExtractionStrategy`. This strategy is powerful because it leverages the LLM's understanding.
* **Code Snippet AI might generate (with Pydantic example):**
```python
from crawl4ai import CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
from pydantic import BaseModel, Field # Assuming user might provide this
# User might provide this, or AI generates it from description
class ArticleInfo(BaseModel):
author_name: str = Field(description="The main author of the article")
publication_date: str = Field(description="The date the article was published, e.g., YYYY-MM-DD")
short_summary: str = Field(description="A concise 2-3 sentence summary of the article")
llm_config = LLMConfig(
provider="openai/gpt-4o-mini", # Or user's choice
api_token="env:OPENAI_API_KEY" # Or direct key if user insists and understands risk
)
extraction_strategy = LLMExtractionStrategy(
llm_config=llm_config,
schema=ArticleInfo.model_json_schema(),
# instruction="Extract author, publication date, and a summary." # Could also be used
extraction_type="schema" # Important for Pydantic/JSON schema
)
run_config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
# result = await crawler.arun(url="...", config=run_config)
# if result.success and result.extracted_content:
# article_data = json.loads(result.extracted_content) # Or ArticleInfo.model_validate_json(result.extracted_content)
# print(article_data)
```
* **3.4.3. Interacting with Pages (Dynamic Content):**
* **How to tell your AI (Vibe):** "This page ([URL]) loads more content when you scroll down, or when you click a 'Show More' button. `crawl4ai` needs to perform this interaction."
* **For clicking (Vibe):** "To get all the data, we need to click the button with text 'Load All Comments'."
* **AI's understanding:** This requires `js_code` to find and click the button. The AI should be guided that finding elements by text might involve more complex JS like `Array.from(document.querySelectorAll('button')).find(btn => btn.textContent.includes('Load All Comments')).click();`.
* **For scrolling (Vibe):** "Scroll to the bottom of the page to make sure everything loads."
* **AI's understanding:** `js_code` like `window.scrollTo(0, document.body.scrollHeight);`
* **Ensuring actions complete (Vibe):** "After clicking 'Load More', wait for the new items to appear. They usually show up in a `div` with class `comment-list` and we expect more than 10 comments."
* **AI's understanding:** Use `wait_for`. This can be a CSS selector (e.g., `wait_for="css:.comment-list .comment-item:nth-child(11)"`) or a JS condition (e.g., `wait_for="js:() => document.querySelectorAll('.comment-item').length > 10"`).
* **For multi-step interactions on the same page (Vibe):** "I need to first click 'Agree to Cookies', then click 'Show Details', then extract the text. Make sure these happen on the same page view."
* **AI's understanding:** "This requires a `session_id` to persist the page state across multiple `arun` calls. Each `arun` call will perform one step of the interaction."
* **If only JS interaction is needed (Vibe):** "After the first page load, the next actions (like clicking 'Next Page') only update part of the page with JavaScript, they don't reload everything."
* **AI's understanding:** "For these subsequent `arun` calls within the same session, set `js_only=True` in `CrawlerRunConfig` to prevent unnecessary full page navigations, making it faster."
* **3.4.4. Taking Screenshots or PDFs:**
* **Simple request (Vibe):** "Get me a screenshot of [URL]."
* **AI's understanding:** Set `screenshot=True` in `CrawlerRunConfig`.
* **Or (Vibe):** "I need a PDF version of [URL]."
* **AI's understanding:** Set `pdf=True` in `CrawlerRunConfig`.
* **3.4.5. Caching for Speed and Freshness:**
* **For faster testing/development (Vibe):** "I'm testing my script for [URL]. Can you make it use the cache so it's faster after the first run?"
* **AI's understanding:** Use `cache_mode=CacheMode.ENABLED`.
* **Reasoning:** This saves time during development by re-using previously fetched content.
* **To get the latest data (Vibe):** "I need the absolute latest version of [URL], don't use any cached data."
* **AI's understanding:** Use `cache_mode=CacheMode.BYPASS`.
* **Reasoning:** Ensures `crawl4ai` fetches a fresh copy from the web, ignoring any local cache. This is crucial for production or when data changes frequently.
* **3.5. `BrowserConfig`**: Browser-Level Settings (Simplified for Vibe Coding)
* **3.5.1. When it's usually not needed:**
* **What to tell your AI (or what it should infer):** "For most simple crawls where you just need to fetch a page and get its content, you probably don't need to worry about `BrowserConfig`. `crawl4ai` uses sensible defaults."
* **3.5.2. Headless Mode:**
* **Default behavior (Vibe):** "By default, `crawl4ai` runs 'headless', which means you won't see a browser window pop up. This is usually what we want for automated scripts."
* **To see the browser (Vibe):** "I'm having trouble with [URL], and I want to see what the browser is doing. Can you make it run in non-headless mode?"
* **AI's understanding:** The user wants to visually debug. Set `headless=False` in `BrowserConfig`.
* **3.5.3. Using a Proxy:**
* **How to ask (Vibe):** "I need to make `crawl4ai` use a proxy server for this crawl."
* **Information to provide (Vibe):** "The proxy server address is `http://myproxy.server.com:8080`. If it needs a username and password, they are `myuser` and `mypass`. Make sure to use the `proxy_config` setting in `BrowserConfig`."
* **AI's understanding:** The AI should construct a `ProxyConfig` object (or dictionary that `BrowserConfig` can handle) and pass it to `BrowserConfig`.
* **3.5.4. Changing User Agent:**
* **How to ask (Vibe):** "The website [URL] might be blocking default user agents. Can we make `crawl4ai` look like it's Firefox on a Mac?"
* **Information to provide (Vibe):** "You can set a custom `user_agent` string in `BrowserConfig`. For example, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/115.0'."
* **AI's understanding:** The AI should pass the provided string to the `user_agent` parameter of `BrowserConfig`.
* **3.6. `LLMConfig`**: Configuring Language Models (Simplified for Vibe Coding)
* **3.6.1. When it's needed:**
* **What to tell your AI:** "If we're using `LLMExtractionStrategy` to extract structured data or `LLMContentFilter` to clean up content, we need to tell `crawl4ai` which language model to use. This is done with an `LLMConfig` object."
* **3.6.2. Information to provide the AI (Vibe):**
* **Model choice:** "For this task, let's use the `provider` called 'openai/gpt-4o-mini'." (Other examples: 'ollama/llama3', 'anthropic/claude-3-opus-20240229').
* **API Key:** "My `api_token` for this provider is [YOUR_API_KEY_PLACEHOLDER]. (Best practice is to tell the AI to get it from an environment variable, e.g., 'env:OPENAI_API_KEY')."
* **AI's understanding:** The AI will create an `LLMConfig(provider="...", api_token="...")` and pass it to the relevant strategy.
* **Code Snippet AI might generate:**
```python
from crawl4ai import LLMConfig
# For OpenAI
llm_conf = LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY")
# For Ollama (locally running Llama3)
# llm_conf = LLMConfig(provider="ollama/llama3") # api_token often not needed for local Ollama
```
* **3.7. The `CrawlResult`**: Understanding What You Get Back
* **3.7.1. Checking for Success:**
* **What to tell your AI (Crucial Vibe):** "When `crawl4ai` finishes an `arun` or `arun_many` call, the most important first step is to check if it was successful. Tell the AI to always generate code that checks `result.success`. This will be `True` or `False`."
* **If `False` (Vibe):** "If `result.success` is `False`, the AI should print or log `result.error_message` to tell us what went wrong."
* **3.7.2. Accessing Markdown Content:**
* **Raw Markdown (Vibe):** "The main text content of the page, converted to Markdown, is usually in `result.markdown.raw_markdown`."
* **Filtered Markdown (Vibe):** "If we used a content filter (like `PruningContentFilter`), the cleaner, more focused Markdown will be in `result.markdown.fit_markdown`."
* **3.7.3. Accessing Extracted Structured Data (JSON):**
* **Where to find it (Vibe):** "If we asked `crawl4ai` to extract specific structured data (using `JsonCssExtractionStrategy` or `LLMExtractionStrategy`), that data will be in `result.extracted_content`."
* **How to use it (Vibe):** "The `result.extracted_content` is a JSON string. To use it in Python, tell the AI to parse it using `json.loads()`."
* **3.7.4. Accessing Links:**
* **What it contains (Vibe):** "`result.links` is a dictionary. It usually has two keys: 'internal' and 'external'. Each of these is a list of links found on the page, with details like the link text and URL."
* **3.7.5. Accessing Media Information:**
* **What it contains (Vibe):** "`result.media` is another dictionary. It can contain lists of 'images', 'videos', or 'audios' found on the page, along with their URLs and sometimes alt text or descriptions."
* **3.7.6. Accessing Screenshots and PDFs:**
* **Screenshots (Vibe):** "If we set `screenshot=True` in the config, the screenshot image data (which is a base64 encoded string) will be in `result.screenshot`."
* **PDFs (Vibe):** "If we set `pdf=True`, the PDF data (as raw bytes) will be in `result.pdf`."
## 4. Common "Vibe" Recipes & Prompting Patterns for AI Assistants
Here are some common scenarios and how you can prompt your AI assistant to use `crawl4ai` to solve them.
* **4.1. Recipe 1: Get Clean Markdown from a Page**
* **4.1.1. User Vibe Prompt to AI:**
* *Example 1 (Basic):* "Hey AI, can you use `crawl4ai` to get me the clean Markdown content of `https://blog.example.com/my-latest-post`?"
* *Example 2 (Requesting more cleaning):* "AI, I need *very* clean Markdown from `https://some.news.site/article123`. It's for an LLM, so please try to remove ads, navigation, and footers."
* **4.1.2. Key `crawl4ai` elements the AI should use (and why):**
* **`AsyncWebCrawler`:** The starting point for any crawl.
* **`arun`:** Since it's a single URL.
* **`CrawlerRunConfig`:**
* For Example 1, the default `DefaultMarkdownGenerator` might be sufficient.
* For Example 2 ("very clean"), the AI should be guided to use `DefaultMarkdownGenerator(content_filter=PruningContentFilter())`.
* **Reasoning:** `PruningContentFilter` is designed to heuristically remove common boilerplate like headers, footers, and navigation bars before Markdown conversion, leading to cleaner text suitable for LLMs.
* **4.1.3. Expected Output from AI-generated code:**
* A Python script that initializes `AsyncWebCrawler`, calls `arun` with the appropriate URL and config.
* The script should then access and print (or save) `result.markdown.raw_markdown` (for basic) or `result.markdown.fit_markdown` (if `PruningContentFilter` was used).
* **Code Example (for "very clean"):**
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter
async def get_clean_markdown(url_to_crawl):
markdown_generator = DefaultMarkdownGenerator(
content_filter=PruningContentFilter()
)
run_config = CrawlerRunConfig(
markdown_generator=markdown_generator,
cache_mode="BYPASS" # Ensure fresh crawl for demo
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url=url_to_crawl, config=run_config)
if result.success:
print(f"--- Fit Markdown for {url_to_crawl} ---")
print(result.markdown.fit_markdown)
# You might also want to see raw_markdown to compare
# print(f"--- Raw Markdown for {url_to_crawl} ---")
# print(result.markdown.raw_markdown)
else:
print(f"Failed to crawl {url_to_crawl}: {result.error_message}")
# asyncio.run(get_clean_markdown("https://en.wikipedia.org/wiki/Python_(programming_language)"))
```
* **4.2. Recipe 2: Extract All Product Names and Prices from an E-commerce Category Page**
* **4.2.1. User Vibe Prompt to AI:**
* *Example:* "AI, I need to use `crawl4ai` to get all product names and their prices from `https://www.example-store.com/laptops`. On that page, product names look like they are in `<h3>` tags with a class `product-title`, and prices are in `<span>` elements with the class `final-price`."
* **4.2.2. Key `crawl4ai` elements AI should use (and why):**
* **`AsyncWebCrawler`**, **`arun`**.
* **`CrawlerRunConfig`** with **`JsonCssExtractionStrategy`**.
* **Reasoning:** The user described a page with repeating structured items. `JsonCssExtractionStrategy` is ideal for this as it uses CSS selectors to pinpoint the data. The AI's task is to translate the user's description of element locations into a valid schema for the strategy.
* The AI needs to understand that `baseSelector` in the schema should target the container for each product, and `fields` will target individual pieces of data within that container.
* **4.2.3. Expected Output from AI-generated code:**
* A Python script that defines the schema dictionary.
* Initializes `JsonCssExtractionStrategy` with this schema.
* Passes the strategy to `CrawlerRunConfig`.
* After `arun`, it parses `result.extracted_content` using `json.loads()` and likely iterates through the list of extracted product dictionaries.
* **Code Example:**
```python
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
async def extract_products(url_to_crawl):
# AI helps create this schema based on user's description
product_schema = {
"name": "LaptopList",
"baseSelector": "div.product-listing-item", # Hypothetical selector for each product's container
"fields": [
{"name": "product_name", "selector": "h3.product-title", "type": "text"},
{"name": "price", "selector": "span.final-price", "type": "text"}
]
}
extraction_strategy = JsonCssExtractionStrategy(schema=product_schema)
run_config = CrawlerRunConfig(
extraction_strategy=extraction_strategy,
cache_mode="BYPASS"
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url=url_to_crawl, config=run_config)
if result.success and result.extracted_content:
products = json.loads(result.extracted_content)
print(f"Found {len(products)} products:")
for i, product in enumerate(products[:3]): # Print first 3
print(f" Product {i+1}: Name='{product.get('product_name')}', Price='{product.get('price')}'")
else:
print(f"Failed to extract products from {url_to_crawl}: {result.error_message}")
# asyncio.run(extract_products("https://www.example-store.com/laptops")) # Replace with a real URL for testing
```
* **4.3. Recipe 3: Extract Key Information from an Article using an LLM**
* **4.3.1. User Vibe Prompt to AI:**
* *Example:* "AI, I want `crawl4ai` to read this article: `https://example.com/news/ai-breakthrough`. Use `openai/gpt-4o-mini` to extract the author's name, the publication date, and a short (2-3 sentence) summary. The output should be JSON. My OpenAI API key is in the `OPENAI_API_KEY` environment variable."
* **4.3.2. Key `crawl4ai` elements AI should use (and why):**
* **`AsyncWebCrawler`**, **`arun`**.
* **`CrawlerRunConfig`** with **`LLMExtractionStrategy`**.
* **`LLMConfig`**: To specify the `provider` ("openai/gpt-4o-mini") and `api_token` ("env:OPENAI_API_KEY").
* **Reasoning:** The task requires understanding and summarization, making `LLMExtractionStrategy` suitable. The AI needs to construct a schema (either a simple dictionary or a Pydantic model `model_json_schema()`) that tells the LLM what fields to populate. The instruction to the LLM will be implicitly derived from the schema field descriptions or can be explicitly provided.
* **4.3.3. Expected Output from AI-generated code:**
* Python script that defines a Pydantic model (or a dictionary schema).
* Initializes `LLMConfig` and `LLMExtractionStrategy`.
* Parses `result.extracted_content`.
* **Code Example (using Pydantic):**
```python
import asyncio
import json
import os
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
from crawl4ai.extraction_strategy import LLMExtractionStrategy
class ArticleDetails(BaseModel):
author_name: str = Field(..., description="The main author of the article.")
publication_date: str = Field(..., description="The date the article was published (e.g., YYYY-MM-DD).")
summary: str = Field(..., description="A concise 2-3 sentence summary of the article.")
async def extract_article_info_llm(url_to_crawl):
if not os.getenv("OPENAI_API_KEY"): # Or your specific key variable
print("API key environment variable not set. Skipping LLM extraction.")
return
llm_config = LLMConfig(
provider="openai/gpt-4o-mini", # Use a cost-effective model for demos
api_token="env:OPENAI_API_KEY"
)
extraction_strategy = LLMExtractionStrategy(
llm_config=llm_config,
schema=ArticleDetails.model_json_schema(),
extraction_type="schema" # Crucial for Pydantic/JSON schema
)
run_config = CrawlerRunConfig(
extraction_strategy=extraction_strategy,
cache_mode="BYPASS"
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(url=url_to_crawl, config=run_config)
if result.success and result.extracted_content:
try:
article_data = ArticleDetails.model_validate_json(result.extracted_content)
print(f"Extracted Article Info for {url_to_crawl}:")
print(json.dumps(article_data.model_dump(), indent=2))
except Exception as e:
print(f"Error parsing LLM output: {e}")
print(f"Raw LLM output: {result.extracted_content}")
else:
print(f"Failed to extract article info from {url_to_crawl}: {result.error_message}")
# asyncio.run(extract_article_info_llm("https://www.example.com/news/ai-breakthrough")) # Replace with real article
```
* **4.4. Recipe 4: Crawl the first 3 pages of a blog (clicking "Next Page")**
* **4.4.1. User Vibe Prompt to AI:**
* *Example:* "AI, can you use `crawl4ai` to get the Markdown from the first 3 pages of `https://myblog.example.com/archive`? To get to the next page, I think you need to click a link that says 'Older Posts'."
* **4.4.2. Key `crawl4ai` elements AI should use (and why):**
* **`AsyncWebCrawler`**.
* **Multiple `arun` calls** in a loop (3 iterations).
* **`CrawlerRunConfig`** with:
* `session_id="blog_session"`: **Crucial** for maintaining the browser state (cookies, current page) across the multiple clicks.
* `js_code`: JavaScript to find and click the "Older Posts" link. The AI might need to generate robust JS like:
`Array.from(document.querySelectorAll('a')).find(a => a.textContent.trim() === 'Older Posts')?.click();`
* `wait_for`: After clicking, wait for a condition that indicates the next page has loaded (e.g., a specific element on the new page, or a change in an existing element). This can be tricky and might require some iteration. A simple `wait_for` for a few seconds could also be a starting point, like `wait_for=3000` (milliseconds).
* `js_only=True`: For the second and third `arun` calls, after the initial page load. This tells `crawl4ai` to only execute the JS and not perform a full new navigation to the original URL.
* **4.4.3. Expected Output from AI-generated code:**
* A Python script with a loop that calls `arun` three times.
* The script should collect and potentially print or save the Markdown from each page.
* **Code Example:**
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
async def crawl_blog_pages(start_url, num_pages=3):
session_id = "my_blog_crawl_session"
all_markdowns = []
# JavaScript to find and click "Older Posts" (example)
js_click_older_posts = """
(() => {
const links = Array.from(document.querySelectorAll('a'));
const olderPostsLink = links.find(a => a.textContent.trim().toLowerCase() === 'older posts');
if (olderPostsLink) {
olderPostsLink.click();
return true; // Indicate click was attempted
}
return false; // Indicate link not found
})();
"""
async with AsyncWebCrawler() as crawler:
current_url = start_url
for i in range(num_pages):
print(f"Crawling page {i+1}...")
run_config_dict = {
"session_id": session_id,
"cache_mode": CacheMode.BYPASS,
"wait_for": 2000 # Wait 2s for content to potentially load after click
}
if i > 0: # For subsequent pages, click and don't re-navigate
run_config_dict["js_code"] = js_click_older_posts
run_config_dict["js_only"] = True
run_config = CrawlerRunConfig(**run_config_dict)
result = await crawler.arun(url=current_url, config=run_config) # URL is mainly for context in js_only
if result.success:
print(f" Page {i+1} ({result.url}) - Markdown length: {len(result.markdown.raw_markdown)}")
all_markdowns.append({"url": result.url, "markdown": result.markdown.raw_markdown})
if i < num_pages - 1 and i > 0 and not run_config_dict.get("js_code_executed_successfully", True): # Hypothetical flag
print(f" 'Older Posts' link might not have been found or clicked on page {i+1}. Stopping.")
break
else:
print(f" Failed to crawl page {i+1}: {result.error_message}")
break
# Important: Clean up the session
await crawler.crawler_strategy.kill_session(session_id)
print(f"\nCollected markdown for {len(all_markdowns)} pages.")
# For demo, print first 100 chars of each
# for i, md_data in enumerate(all_markdowns):
# print(f"\n--- Page {i+1} URL: {md_data['url']} ---")
# print(md_data['markdown'][:100] + "...")
# asyncio.run(crawl_blog_pages("YOUR_BLOG_START_URL_HERE"))
```
* **4.5. Recipe 5: Get Screenshots of a List of URLs**
* **4.5.1. User Vibe Prompt to AI:**
* *Example:* "AI, use `crawl4ai` to take a screenshot of each of these pages: `https://example.com`, `https://crawl4ai.com`, `https://github.com`. Save them as `example_com.png`, `crawl4ai_com.png`, and `github_com.png`."
* **4.5.2. Key `crawl4ai` elements AI should use (and why):**
* **`AsyncWebCrawler`**.
* **`arun_many`**: Efficient for processing a list of URLs.
* **`CrawlerRunConfig`** with `screenshot=True`.
* **Reasoning:** `arun_many` will process each URL with the same config. The AI needs to add logic to iterate through the results and save each `result.screenshot` (which is base64 data) to a uniquely named file.
* **4.5.3. Expected Output from AI-generated code:**
* Python script.
* PNG files saved to the current directory or a specified output directory.
* **Code Example:**
```python
import asyncio
import base64
import os
from urllib.parse import urlparse
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
async def take_screenshots(urls_to_screenshot):
run_config = CrawlerRunConfig(
screenshot=True,
cache_mode=CacheMode.BYPASS # Get fresh screenshots
)
output_dir = "screenshots_output"
os.makedirs(output_dir, exist_ok=True)
async with AsyncWebCrawler() as crawler:
results = await crawler.arun_many(urls=urls_to_screenshot, config=run_config)
for result in results:
if result.success and result.screenshot:
# Create a filename from the URL
parsed_url = urlparse(result.url)
filename = "".join(c if c.isalnum() else '_' for c in parsed_url.netloc + parsed_url.path)
if not filename or filename == "_": # Handle root path or empty paths
filename = "homepage"
filepath = os.path.join(output_dir, f"{filename}.png")
try:
screenshot_data = base64.b64decode(result.screenshot)
with open(filepath, "wb") as f:
f.write(screenshot_data)
print(f"Screenshot saved to {filepath}")
except Exception as e:
print(f"Error saving screenshot for {result.url}: {e}")
elif not result.success:
print(f"Failed to crawl {result.url}: {result.error_message}")
elif not result.screenshot:
print(f"Crawled {result.url} but no screenshot data was returned.")
# urls = ["https://example.com", "https://crawl4ai.com", "https://github.com"]
# asyncio.run(take_screenshots(urls))
```
## 5. Tips for Effective Prompting Your AI Assistant for Crawl4AI Tasks
To get the best code from your AI assistant when working with `crawl4ai`, consider these prompting tips:
* **5.1. Be Clear About Your Goal:**
* Start with a high-level objective. Instead of just "Crawl a page," say "I need to extract all article titles from the homepage of this news site," or "Get the main content of this blog post as clean Markdown," or "Take full-page screenshots of these product pages." This helps the AI choose the right strategies and configurations.
* **5.2. Always Provide the URL(s):**
* This seems obvious, but be precise. If it's a list, provide the list.
* Remember to use the `file:///` prefix for local files (e.g., `file:///Users/me/Documents/mypage.html`) and `raw:` for inline HTML (e.g., `raw:<html><body>...</body></html>`). The AI might not always infer this correctly without a hint.
* **5.3. Describe Data for Extraction (Especially for `JsonCssExtractionStrategy` or `LLMExtractionStrategy`):**
* **What you want:** List the specific pieces of information you need (e.g., "product name," "price," "author," "publication_date," "article summary").
* **Where to find it (for CSS/XPath):** If you have an idea of the HTML structure, share it. "Product names seem to be in `<h2>` tags with class `item-title`." "The price is always in a `<span>` element right after a `<strong>` tag that says 'Price:'." This helps the AI generate accurate CSS selectors or XPath expressions for `JsonCssExtractionStrategy`.
* **Desired structure (for LLM):** For `LLMExtractionStrategy`, tell the AI the desired JSON structure. "I want a list of objects, where each object has a 'title' and a 'link'." Or even better, "Can you define a Pydantic model for me that has 'title' as a string and 'link' as a string, and then use that for extraction?"
* **5.4. Specify LLM Details for LLM Extraction or Filtering:**
* **Model/Provider:** "Use `openai/gpt-4o-mini` for this extraction." or "I want to use my local Ollama model, `ollama/llama3`."
* **API Key:** Clearly state where the API key should come from. "My API key is in the environment variable `OPENAI_API_KEY`." (This is safer than putting the key directly in the prompt). If you must provide it directly, be aware of the security implications.
* **5.5. Mention Page Dynamics and Interactions:**
* "This page loads more items when you scroll down."
* "You need to click the 'View All Reviews' button to see all the reviews."
* "The data I want only appears after selecting 'Category X' from a dropdown."
* This signals to the AI that `js_code`, `wait_for`, and possibly `session_id` will be necessary. You might need to guide it on *how* to identify the elements to interact with (e.g., "The 'Load More' button has the ID `load-more-btn`").
* **5.6. Iterative Refinement is Key:**
* Your first prompt might not yield perfect code. That's okay!
* Treat it as a conversation. If the AI-generated code misses something or makes a mistake:
* "That was close, but it missed extracting the product ratings. Ratings seem to be in a `div` with class `star-rating` inside each product item."
* "The script timed out. Can we increase the `page_timeout` in `CrawlerRunConfig` to 90 seconds?"
* "It didn't click the 'Next' button correctly. The button actually has the text '>>' instead of 'Next Page'."
* Provide the error messages or incorrect output back to the AI for context.
## 6. What to Expect as Output (From AI-Generated Code)
When you use "Vibe Coding" with an AI assistant for `crawl4ai`, you should generally expect the following:
* **6.1. Python Code:**
* The primary output will be a Python script that uses the `crawl4ai` library.
* It should include necessary imports like `asyncio`, `AsyncWebCrawler`, `CrawlerRunConfig`, etc.
* It will typically define an `async def main():` function and run it with `asyncio.run(main())`.
* **6.2. Accessing the `CrawlResult`:**
* The core of the script will involve one or more calls to `crawler.arun(...)` or `crawler.arun_many(...)`.
* These calls return `CrawlResult` objects (or a list of them for `arun_many`).
* The AI-generated code should then show you how to access the specific data you asked for from these `CrawlResult` objects. For example:
* `print(result.markdown.raw_markdown)` or `print(result.markdown.fit_markdown)`
* `data = json.loads(result.extracted_content)`
* `screenshot_data = base64.b64decode(result.screenshot)`
* `if not result.success: print(result.error_message)`
* **6.3. Files Saved to Disk (if requested):**
* If your vibe prompt included saving data (e.g., "save the screenshots as PNG files," "write the extracted JSON to `output.json`"), the AI-generated code should include the Python logic to perform these file operations.
* **Example for saving a screenshot:**
```python
import base64
# ... inside your async function, after getting 'result' ...
if result.success and result.screenshot:
with open("myscreenshot.png", "wb") as f:
f.write(base64.b64decode(result.screenshot))
print("Screenshot saved to myscreenshot.png")
```
## 7. Conclusion: Vibe Your Way to Web Data!
* **7.1. Recap of "Vibe Coding" Benefits with `crawl4ai`:**
"Vibe Coding" empowers you to leverage the full capabilities of `crawl4ai` without needing to memorize every API detail. By understanding the high-level concepts and key building blocks outlined in this guide, you can effectively communicate your data extraction and web interaction needs to an AI coding assistant. This leads to faster prototyping, easier access to web data for non-programmers, and a more intuitive way to build data-driven applications.
* **7.2. Encouragement to experiment with different prompts and `crawl4ai` features:**
The key to successful "Vibe Coding" is experimentation. Try different ways of describing your goals to your AI assistant. If the first attempt doesn't yield the perfect `crawl4ai` code, refine your prompt with more specific details or hints. Don't be afraid to mention `crawl4ai`-specific terms like `CrawlerRunConfig`, `js_code`, or `LLMExtractionStrategy` — this guide has equipped you with the essential vocabulary. The more context you provide, the better the AI can assist you.
* **7.3. Pointers to more detailed `crawl4ai` documentation for users who want to learn direct coding or advanced configurations:**
While "Vibe Coding" is a great way to get started and be productive quickly, you might eventually want to dive deeper into `crawl4ai`'s capabilities or fine-tune the generated code yourself. For that, refer to:
* **The Official Crawl4AI API Reference:** (Assuming this exists or will exist - replace with actual link if available, e.g., `https://docs.crawl4ai.com/api/`) For detailed information on all classes, methods, and parameters.
* **Specific "Reasoning & Problem-Solving" Guides:** Check the `crawl4ai` documentation for other guides that delve into specific components like advanced `CrawlerRunConfig` options, deep crawling strategies, or custom extraction techniques.
Happy Vibe Coding, and may your web data adventures be fruitful!
```

View File

@@ -0,0 +1,144 @@
// ==== File: assets/toc.js ====
// Builds a table-of-contents sidebar from the page's h2-h4 headings and
// highlights the entry for the section currently in view (scroll spy).
// Also relocates the page footer (and its preceding <hr>) to the end of
// the main content column.
document.addEventListener('DOMContentLoaded', () => {
  const mainContent = document.getElementById('terminal-mkdocs-main-content');
  const tocContainer = document.getElementById('toc-sidebar');
  const mainGrid = document.querySelector('.terminal-mkdocs-main-grid'); // Flex container the ToC lives in

  if (!mainContent) {
    console.warn("TOC Generator: Main content area '#terminal-mkdocs-main-content' not found.");
    return;
  }

  // --- Create ToC container if it doesn't exist ---
  let tocElement = tocContainer;
  if (!tocElement) {
    if (!mainGrid) {
      console.warn("TOC Generator: Flex container '.terminal-mkdocs-main-grid' not found to append ToC.");
      return;
    }
    tocElement = document.createElement('aside');
    tocElement.id = 'toc-sidebar';
    tocElement.style.display = 'none'; // Keep hidden until populated
    // Append it as the last child of the flex grid
    mainGrid.appendChild(tocElement);
    console.info("TOC Generator: Created '#toc-sidebar' element.");
  }

  // --- Find headings (h2, h3, h4 are the usual ToC depth) ---
  const headings = mainContent.querySelectorAll('h2, h3, h4');
  if (headings.length === 0) {
    console.info("TOC Generator: No headings found on this page. ToC not generated.");
    tocElement.style.display = 'none'; // Ensure it's hidden
    return;
  }

  // --- Generate ToC list ---
  const tocList = document.createElement('ul');
  const observerTargets = []; // Headings to watch with the IntersectionObserver
  headings.forEach((heading, index) => {
    // Every heading needs an id so the ToC link can target it;
    // synthesize a simple slug-like one when missing.
    if (!heading.id) {
      heading.id = `toc-heading-${index}-${heading.textContent.toLowerCase().replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, '')}`;
    }
    const listItem = document.createElement('li');
    const link = document.createElement('a');
    link.href = `#${heading.id}`;
    link.textContent = heading.textContent;
    // Class per heading level (2, 3, or 4) drives indentation in CSS.
    const level = Number.parseInt(heading.tagName.substring(1), 10);
    listItem.classList.add(`toc-level-${level}`);
    listItem.appendChild(link);
    tocList.appendChild(listItem);
    observerTargets.push(heading);
  });

  // --- Populate and show ToC ---
  const tocTitle = document.createElement('h4');
  tocTitle.textContent = 'On this page'; // Customize title if needed
  tocElement.innerHTML = ''; // Clear previous content if any
  tocElement.appendChild(tocTitle);
  tocElement.appendChild(tocList);
  tocElement.style.display = ''; // Show the ToC container
  console.info(`TOC Generator: Generated ToC with ${headings.length} items.`);

  // --- Scroll spy using IntersectionObserver ---
  let activeLink = null; // Currently highlighted ToC link
  // FIX: fall back to 0px when --header-height is unset; an empty value
  // would yield an invalid rootMargin and make the IntersectionObserver
  // constructor throw.
  const headerHeight =
    getComputedStyle(document.documentElement).getPropertyValue('--header-height').trim() || '0px';
  const observerOptions = {
    // Negative top margin pushes the intersection trigger point below the
    // fixed header; negative bottom margin lets headings low on the screen
    // trigger before they exit.
    rootMargin: `-${headerHeight} 0px -60% 0px`,
    threshold: 0, // Trigger as soon as any part enters/exits the boundary
  };

  const observerCallback = (entries) => {
    // Among the intersecting headings, pick the one closest to the top edge
    // (within the rootMargin) and activate its ToC link.
    let topmostVisibleHeading = null;
    entries.forEach((entry) => {
      const link = tocElement.querySelector(`a[href="#${entry.target.id}"]`);
      if (!link) return;
      if (entry.isIntersecting) {
        if (!topmostVisibleHeading || entry.boundingClientRect.top < topmostVisibleHeading.boundingClientRect.top) {
          topmostVisibleHeading = entry.target;
        }
      }
    });
    if (topmostVisibleHeading) {
      const newActiveLink = tocElement.querySelector(`a[href="#${topmostVisibleHeading.id}"]`);
      if (newActiveLink && newActiveLink !== activeLink) {
        // Swap the 'active' highlight from the previous link to the new one.
        if (activeLink) {
          activeLink.classList.remove('active');
          activeLink.parentElement.classList.remove('active-parent'); // Optional parent styling
        }
        newActiveLink.classList.add('active');
        newActiveLink.parentElement.classList.add('active-parent'); // Optional parent styling
        activeLink = newActiveLink;
        // Optional: scroll the ToC sidebar to keep the active link visible
        // newActiveLink.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
      }
    }
    // When nothing intersects (e.g. scrolled past the last heading) the
    // previous link intentionally stays active.
  };

  const observer = new IntersectionObserver(observerCallback, observerOptions);
  observerTargets.forEach((heading) => observer.observe(heading));

  // Initial pass shortly after load (slight delay for accurate layout),
  // in case a heading is already in view.
  setTimeout(() => {
    observerCallback(observer.takeRecords());
  }, 100);

  // --- Move the footer (and the <hr> right before it) into the main content ---
  // FIX: guard against a missing <footer>; the original dereferenced
  // footer.previousElementSibling unconditionally and threw a TypeError on
  // pages without a footer element.
  const footer = document.querySelector('footer');
  if (footer) {
    const hr = footer.previousElementSibling;
    if (hr && hr.tagName === 'HR') {
      mainContent.appendChild(hr);
    }
    mainContent.appendChild(footer);
    console.info("TOC Generator: Footer moved to the end of the main content.");
  }
});