feat: add Script Builder to Chrome Extension and reorganize LLM context files
This commit introduces significant enhancements to the Crawl4AI ecosystem: Chrome Extension - Script Builder (Alpha): - Add recording functionality to capture user interactions (clicks, typing, scrolling) - Implement smart event grouping for cleaner script generation - Support export to both JavaScript and C4A script formats - Add timeline view for visualizing and editing recorded actions - Include wait commands (time-based and element-based) - Add saved flows functionality for reusing automation scripts - Update UI with consistent dark terminal theme (Dank Mono font, green/pink accents) - Release new extension versions: v1.1.0, v1.2.0, v1.2.1 LLM Context Builder Improvements: - Reorganize context files from llmtxt/ to llm.txt/ with better structure - Separate diagram templates from text content (diagrams/ and txt/ subdirectories) - Add comprehensive context files for all major Crawl4AI components - Improve file naming convention for better discoverability Documentation Updates: - Update apps index page to match main documentation theme - Standardize color scheme: "Available" tags use primary color (#50ffff) - Change "Coming Soon" tags to dark gray for better visual hierarchy - Add interactive two-column layout for extension landing page - Include code examples for both Schema Builder and Script Builder features Technical Improvements: - Enhance event capture mechanism with better element selection - Add support for contenteditable elements and complex form interactions - Implement proper scroll event handling for both window and element scrolling - Add meta key support for keyboard shortcuts - Improve selector generation for more reliable element targeting The Script Builder is released as Alpha, acknowledging potential bugs while providing early access to this powerful automation recording feature.
This commit is contained in:
@@ -8,7 +8,15 @@
|
||||
"Bash(mkdir:*)",
|
||||
"Bash(cp:*)",
|
||||
"Bash(rm:*)",
|
||||
"Bash(true)"
|
||||
"Bash(true)",
|
||||
"Bash(./package-extension.sh:*)",
|
||||
"Bash(find:*)",
|
||||
"Bash(chmod:*)",
|
||||
"Bash(rg:*)",
|
||||
"Bash(/Users/unclecode/.npm-global/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -A 5 -B 5 \"Script Builder\" docs/md_v2/apps/crawl4ai-assistant/)",
|
||||
"Bash(/Users/unclecode/.npm-global/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg -A 30 \"generateCode\\(events, format\\)\" docs/md_v2/apps/crawl4ai-assistant/content/content.js)",
|
||||
"Bash(/Users/unclecode/.npm-global/lib/node_modules/@anthropic-ai/claude-code/vendor/ripgrep/arm64-darwin/rg \"<style>\" docs/md_v2/apps/crawl4ai-assistant/index.html -A 5)",
|
||||
"Bash(git checkout:*)"
|
||||
]
|
||||
},
|
||||
"enableAllProjectMcpServers": false
|
||||
|
||||
@@ -384,16 +384,20 @@ code {
|
||||
|
||||
.coming-features {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(350px, 1fr));
|
||||
gap: 1.5rem;
|
||||
margin-top: 2rem;
|
||||
}
|
||||
|
||||
.coming-feature {
|
||||
background: var(--bg-tertiary);
|
||||
padding: 2rem;
|
||||
padding: 1.5rem;
|
||||
border-radius: 12px;
|
||||
border: 1px solid var(--border-color);
|
||||
transition: all 0.3s ease;
|
||||
height: 100%;
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
}
|
||||
|
||||
.coming-feature:hover {
|
||||
@@ -429,16 +433,18 @@ code {
|
||||
color: var(--text-secondary);
|
||||
margin-bottom: 1rem;
|
||||
line-height: 1.6;
|
||||
flex-grow: 1;
|
||||
}
|
||||
|
||||
.feature-preview {
|
||||
background: var(--bg-secondary);
|
||||
padding: 1rem;
|
||||
padding: 0.75rem 1rem;
|
||||
border-radius: 6px;
|
||||
font-family: var(--font-code);
|
||||
font-size: 0.875rem;
|
||||
font-size: 0.8125rem;
|
||||
color: var(--text-accent);
|
||||
border: 1px solid var(--border-color);
|
||||
margin-top: auto;
|
||||
}
|
||||
|
||||
.stay-tuned {
|
||||
@@ -537,3 +543,487 @@ code {
|
||||
font-size: 1.5rem;
|
||||
}
|
||||
}
|
||||
|
||||
/* Interactive Tools Section */
|
||||
.interactive-tools {
|
||||
margin: 3rem 0;
|
||||
}
|
||||
|
||||
.interactive-tools h2 {
|
||||
font-size: 2rem;
|
||||
margin-bottom: 2rem;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.tools-container {
|
||||
display: grid;
|
||||
grid-template-columns: 300px 1fr;
|
||||
gap: 2rem;
|
||||
min-height: 400px;
|
||||
}
|
||||
|
||||
/* Tool Selector Panel */
|
||||
.tools-panel {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
.tool-selector {
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border-color);
|
||||
border-radius: 12px;
|
||||
padding: 1.5rem;
|
||||
cursor: pointer;
|
||||
transition: all 0.3s ease;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 1rem;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
.tool-selector:hover {
|
||||
border-color: var(--primary-green);
|
||||
transform: translateX(4px);
|
||||
}
|
||||
|
||||
.tool-selector.active {
|
||||
background: var(--bg-secondary);
|
||||
border-color: var(--primary-green);
|
||||
box-shadow: 0 0 20px rgba(15, 187, 170, 0.3);
|
||||
}
|
||||
|
||||
.tool-icon {
|
||||
font-size: 2.5rem;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.tool-info h3 {
|
||||
margin: 0;
|
||||
font-size: 1.125rem;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.tool-info p {
|
||||
margin: 0.25rem 0 0;
|
||||
font-size: 0.875rem;
|
||||
color: var(--text-secondary);
|
||||
}
|
||||
|
||||
.tool-status {
|
||||
position: absolute;
|
||||
top: 1rem;
|
||||
right: 1rem;
|
||||
font-size: 0.75rem;
|
||||
padding: 0.25rem 0.75rem;
|
||||
border-radius: 20px;
|
||||
background: var(--primary-green);
|
||||
color: var(--bg-dark);
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
.tool-status.alpha {
|
||||
background: var(--primary-pink);
|
||||
}
|
||||
|
||||
/* Tool Details Panel */
|
||||
.tool-details {
|
||||
background: var(--bg-secondary);
|
||||
border: 1px solid var(--border-color);
|
||||
border-radius: 12px;
|
||||
padding: 2rem;
|
||||
position: relative;
|
||||
overflow: hidden;
|
||||
}
|
||||
|
||||
.tool-content {
|
||||
display: none;
|
||||
animation: fadeIn 0.4s ease;
|
||||
}
|
||||
|
||||
.tool-content.active {
|
||||
display: block;
|
||||
}
|
||||
|
||||
@keyframes fadeIn {
|
||||
from {
|
||||
opacity: 0;
|
||||
transform: translateY(10px);
|
||||
}
|
||||
to {
|
||||
opacity: 1;
|
||||
transform: translateY(0);
|
||||
}
|
||||
}
|
||||
|
||||
.tool-header {
|
||||
margin-bottom: 2rem;
|
||||
}
|
||||
|
||||
.tool-header h3 {
|
||||
font-size: 1.75rem;
|
||||
margin: 0;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.tool-tagline {
|
||||
color: var(--text-secondary);
|
||||
font-size: 1rem;
|
||||
margin-top: 0.5rem;
|
||||
display: block;
|
||||
}
|
||||
|
||||
/* Tool Steps */
|
||||
.tool-steps {
|
||||
display: flex;
|
||||
flex-direction: column;
|
||||
gap: 1.5rem;
|
||||
margin-bottom: 2rem;
|
||||
}
|
||||
|
||||
.step-item {
|
||||
display: flex;
|
||||
gap: 1.5rem;
|
||||
align-items: flex-start;
|
||||
}
|
||||
|
||||
.step-number {
|
||||
background: var(--primary-green);
|
||||
color: var(--bg-dark);
|
||||
width: 40px;
|
||||
height: 40px;
|
||||
border-radius: 50%;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
font-weight: bold;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.step-content h4 {
|
||||
margin: 0 0 0.5rem;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.step-content p {
|
||||
margin: 0 0 0.5rem;
|
||||
color: var(--text-secondary);
|
||||
font-size: 0.875rem;
|
||||
}
|
||||
|
||||
.step-visual {
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
font-size: 0.875rem;
|
||||
color: var(--text-secondary);
|
||||
}
|
||||
|
||||
.highlight-green {
|
||||
color: var(--primary-green);
|
||||
font-size: 1.25rem;
|
||||
}
|
||||
|
||||
.highlight-pink {
|
||||
color: var(--primary-pink);
|
||||
font-size: 1.25rem;
|
||||
}
|
||||
|
||||
.highlight-accent {
|
||||
color: var(--primary-green);
|
||||
font-size: 1.25rem;
|
||||
}
|
||||
|
||||
.recording-dot {
|
||||
color: #ff3c74;
|
||||
font-size: 1.25rem;
|
||||
animation: pulse 1.5s ease-in-out infinite;
|
||||
}
|
||||
|
||||
@keyframes pulse {
|
||||
0%, 100% { opacity: 1; }
|
||||
50% { opacity: 0.5; }
|
||||
}
|
||||
|
||||
.action-icon {
|
||||
font-size: 1.25rem;
|
||||
margin: 0 0.25rem;
|
||||
}
|
||||
|
||||
/* Tool Features */
|
||||
.tool-features {
|
||||
display: flex;
|
||||
flex-wrap: wrap;
|
||||
gap: 0.75rem;
|
||||
margin-top: 2rem;
|
||||
}
|
||||
|
||||
.feature-tag {
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border-color);
|
||||
padding: 0.5rem 1rem;
|
||||
border-radius: 20px;
|
||||
font-size: 0.875rem;
|
||||
color: var(--text-secondary);
|
||||
}
|
||||
|
||||
.feature-tag.alpha-tag {
|
||||
border-color: var(--primary-pink);
|
||||
color: var(--primary-pink);
|
||||
}
|
||||
|
||||
/* Code Showcase Section */
|
||||
.code-showcase {
|
||||
margin: 3rem 0;
|
||||
}
|
||||
|
||||
.code-showcase h2 {
|
||||
font-size: 2rem;
|
||||
margin-bottom: 2rem;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
/* Code Tabs */
|
||||
.code-tabs {
|
||||
display: flex;
|
||||
gap: 1rem;
|
||||
margin-bottom: 2rem;
|
||||
}
|
||||
|
||||
.code-tab {
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border-color);
|
||||
padding: 0.75rem 1.5rem;
|
||||
border-radius: 8px;
|
||||
font-size: 1rem;
|
||||
color: var(--text-secondary);
|
||||
cursor: pointer;
|
||||
transition: all 0.3s ease;
|
||||
font-family: var(--font-primary);
|
||||
}
|
||||
|
||||
.code-tab:hover {
|
||||
border-color: var(--primary-green);
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.code-tab.active {
|
||||
background: var(--primary-green);
|
||||
color: var(--bg-dark);
|
||||
border-color: var(--primary-green);
|
||||
font-weight: 600;
|
||||
}
|
||||
|
||||
/* Code Examples */
|
||||
.code-examples {
|
||||
position: relative;
|
||||
min-height: 500px;
|
||||
}
|
||||
|
||||
.code-example {
|
||||
position: absolute;
|
||||
width: 100%;
|
||||
opacity: 0;
|
||||
visibility: hidden;
|
||||
transition: opacity 0.4s ease, visibility 0.4s ease;
|
||||
}
|
||||
|
||||
.code-example.active {
|
||||
opacity: 1;
|
||||
visibility: visible;
|
||||
position: relative;
|
||||
}
|
||||
|
||||
/* Copy Button */
|
||||
.copy-button {
|
||||
position: absolute;
|
||||
right: 1rem;
|
||||
top: 50%;
|
||||
transform: translateY(-50%);
|
||||
background: var(--bg-tertiary);
|
||||
border: 1px solid var(--border-color);
|
||||
color: var(--text-secondary);
|
||||
padding: 0.25rem 0.75rem;
|
||||
border-radius: 4px;
|
||||
font-size: 0.75rem;
|
||||
cursor: pointer;
|
||||
transition: all 0.2s ease;
|
||||
font-family: var(--font-primary);
|
||||
}
|
||||
|
||||
.copy-button:hover {
|
||||
background: var(--primary-green);
|
||||
color: var(--bg-dark);
|
||||
border-color: var(--primary-green);
|
||||
}
|
||||
|
||||
.copy-button.copied {
|
||||
background: var(--primary-green);
|
||||
color: var(--bg-dark);
|
||||
}
|
||||
|
||||
/* Responsive Updates */
|
||||
@media (max-width: 768px) {
|
||||
.tools-container {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.tools-panel {
|
||||
flex-direction: row;
|
||||
overflow-x: auto;
|
||||
padding-bottom: 0.5rem;
|
||||
}
|
||||
|
||||
.tool-selector {
|
||||
min-width: 250px;
|
||||
}
|
||||
|
||||
.code-tabs {
|
||||
flex-wrap: wrap;
|
||||
}
|
||||
}
|
||||
|
||||
/* Script Builder Section */
|
||||
.script-builder-section {
|
||||
margin: 4rem 0;
|
||||
}
|
||||
|
||||
.script-builder-section h2 {
|
||||
font-size: 2rem;
|
||||
margin-bottom: 2rem;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.script-builder-section h2 span {
|
||||
color: var(--primary-pink);
|
||||
font-size: 0.875rem;
|
||||
font-weight: normal;
|
||||
margin-left: 0.5rem;
|
||||
}
|
||||
|
||||
.script-features-grid {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(250px, 1fr));
|
||||
gap: 1.5rem;
|
||||
margin-top: 2rem;
|
||||
}
|
||||
|
||||
.script-feature {
|
||||
background: var(--bg-tertiary);
|
||||
padding: 1.5rem;
|
||||
border-radius: 8px;
|
||||
border: 1px solid var(--border-color);
|
||||
text-align: center;
|
||||
transition: all 0.2s ease;
|
||||
}
|
||||
|
||||
.script-feature:hover {
|
||||
border-color: var(--primary-green);
|
||||
transform: translateY(-2px);
|
||||
box-shadow: 0 4px 16px rgba(15, 187, 170, 0.2);
|
||||
}
|
||||
|
||||
.script-feature .feature-icon {
|
||||
font-size: 2.5rem;
|
||||
margin-bottom: 1rem;
|
||||
display: block;
|
||||
}
|
||||
|
||||
.script-feature h4 {
|
||||
font-size: 1.125rem;
|
||||
margin-bottom: 0.5rem;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.script-feature p {
|
||||
font-size: 0.875rem;
|
||||
color: var(--text-secondary);
|
||||
}
|
||||
|
||||
.script-workflow {
|
||||
margin-top: 2rem;
|
||||
}
|
||||
|
||||
.workflow-step {
|
||||
display: flex;
|
||||
gap: 1.5rem;
|
||||
margin-bottom: 2rem;
|
||||
align-items: flex-start;
|
||||
}
|
||||
|
||||
.workflow-step .step-number {
|
||||
background: var(--primary-pink);
|
||||
color: var(--bg-dark);
|
||||
width: 40px;
|
||||
height: 40px;
|
||||
border-radius: 50%;
|
||||
display: flex;
|
||||
align-items: center;
|
||||
justify-content: center;
|
||||
font-weight: bold;
|
||||
flex-shrink: 0;
|
||||
}
|
||||
|
||||
.workflow-step .step-content h4 {
|
||||
margin-bottom: 0.5rem;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.workflow-step .step-content p {
|
||||
color: var(--text-secondary);
|
||||
}
|
||||
|
||||
.action-types {
|
||||
display: grid;
|
||||
grid-template-columns: repeat(auto-fit, minmax(280px, 1fr));
|
||||
gap: 1rem;
|
||||
margin-top: 1.5rem;
|
||||
}
|
||||
|
||||
.action-type {
|
||||
background: var(--bg-tertiary);
|
||||
padding: 1rem;
|
||||
border-radius: 6px;
|
||||
font-size: 0.875rem;
|
||||
color: var(--text-secondary);
|
||||
border: 1px solid var(--border-color);
|
||||
font-family: var(--font-code);
|
||||
}
|
||||
|
||||
.action-type code {
|
||||
color: var(--primary-green);
|
||||
font-weight: 600;
|
||||
margin-right: 0.5rem;
|
||||
}
|
||||
|
||||
.alpha-note {
|
||||
background: rgba(243, 128, 245, 0.1);
|
||||
border: 1px solid var(--primary-pink);
|
||||
border-radius: 8px;
|
||||
padding: 1.5rem;
|
||||
margin-top: 2rem;
|
||||
color: var(--text-primary);
|
||||
}
|
||||
|
||||
.alpha-note strong {
|
||||
color: var(--primary-pink);
|
||||
}
|
||||
|
||||
@media (max-width: 768px) {
|
||||
.script-features-grid {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.workflow-step {
|
||||
flex-direction: column;
|
||||
gap: 1rem;
|
||||
}
|
||||
|
||||
.action-types {
|
||||
grid-template-columns: 1fr;
|
||||
}
|
||||
|
||||
.coming-soon-section h2 {
|
||||
font-size: 1.5rem;
|
||||
}
|
||||
}
|
||||
@@ -2,7 +2,7 @@
|
||||
|
||||
// Handle messages from content script
|
||||
chrome.runtime.onMessage.addListener((message, sender, sendResponse) => {
|
||||
if (message.action === 'downloadCode') {
|
||||
if (message.action === 'downloadCode' || message.action === 'downloadScript') {
|
||||
try {
|
||||
// Create a data URL for the Python code
|
||||
const dataUrl = 'data:text/plain;charset=utf-8,' + encodeURIComponent(message.code);
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
BIN
docs/md_v2/apps/crawl4ai-assistant/crawl4ai-assistant-v1.1.0.zip
Normal file
BIN
docs/md_v2/apps/crawl4ai-assistant/crawl4ai-assistant-v1.1.0.zip
Normal file
Binary file not shown.
BIN
docs/md_v2/apps/crawl4ai-assistant/crawl4ai-assistant-v1.2.0.zip
Normal file
BIN
docs/md_v2/apps/crawl4ai-assistant/crawl4ai-assistant-v1.2.0.zip
Normal file
Binary file not shown.
BIN
docs/md_v2/apps/crawl4ai-assistant/crawl4ai-assistant-v1.2.1.zip
Normal file
BIN
docs/md_v2/apps/crawl4ai-assistant/crawl4ai-assistant-v1.2.1.zip
Normal file
Binary file not shown.
@@ -43,23 +43,23 @@
|
||||
<span class="terminal-title">About Crawl4AI Assistant</span>
|
||||
</div>
|
||||
<div class="terminal-content">
|
||||
<p>Transform any website into structured data with just a few clicks! The Crawl4AI Assistant Chrome Extension lets you visually select elements on any webpage and automatically generates Python code for web scraping.</p>
|
||||
<p>Transform any website into structured data with just a few clicks! The Crawl4AI Assistant Chrome Extension provides two powerful tools for web scraping and automation.</p>
|
||||
|
||||
<div class="features-grid">
|
||||
<div class="feature-card">
|
||||
<span class="feature-icon">🎯</span>
|
||||
<h3>Visual Selection</h3>
|
||||
<p>Click on any element to select it - no CSS selectors needed</p>
|
||||
<h3>Schema Builder</h3>
|
||||
<p>Click to select elements and build extraction schemas visually</p>
|
||||
</div>
|
||||
<div class="feature-card">
|
||||
<span class="feature-icon">📊</span>
|
||||
<h3>Schema Builder</h3>
|
||||
<p>Build extraction schemas by clicking on container and field elements</p>
|
||||
<span class="feature-icon">🔴</span>
|
||||
<h3>Script Builder <span style="color: #f380f5; font-size: 0.75rem;">(Alpha)</span></h3>
|
||||
<p>Record browser actions to create automation scripts</p>
|
||||
</div>
|
||||
<div class="feature-card">
|
||||
<span class="feature-icon">🐍</span>
|
||||
<h3>Python Code</h3>
|
||||
<p>Get production-ready Crawl4AI code with LLM extraction</p>
|
||||
<p>Get production-ready Crawl4AI code instantly</p>
|
||||
</div>
|
||||
<div class="feature-card">
|
||||
<span class="feature-icon">🎨</span>
|
||||
@@ -85,9 +85,9 @@
|
||||
<div class="step-content">
|
||||
<h4>Download the Extension</h4>
|
||||
<p>Get the latest release from GitHub or use the button below</p>
|
||||
<a href="crawl4ai-assistant-v1.0.1.zip" class="download-button" download>
|
||||
<a href="crawl4ai-assistant-v1.2.1.zip" class="download-button" download>
|
||||
<span class="button-icon">↓</span>
|
||||
Download Extension (v1.0.1)
|
||||
Download Extension (v1.2.1)
|
||||
</a>
|
||||
</div>
|
||||
</div>
|
||||
@@ -110,64 +110,152 @@
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Usage Guide -->
|
||||
<section class="usage-section">
|
||||
<h2>How to Use</h2>
|
||||
<div class="terminal-window">
|
||||
<div class="terminal-header">
|
||||
<span class="terminal-title">Step-by-Step Guide</span>
|
||||
<!-- Interactive Tools Section -->
|
||||
<section class="interactive-tools">
|
||||
<h2>Explore Our Tools</h2>
|
||||
|
||||
<div class="tools-container">
|
||||
<!-- Left Panel - Tool Selector -->
|
||||
<div class="tools-panel">
|
||||
<div class="tool-selector active" data-tool="schema-builder">
|
||||
<div class="tool-icon">📊</div>
|
||||
<div class="tool-info">
|
||||
<h3>Schema Builder</h3>
|
||||
<p>Visual data extraction</p>
|
||||
</div>
|
||||
<div class="terminal-content">
|
||||
<div class="usage-flow">
|
||||
<div class="usage-step">
|
||||
<div class="usage-header">
|
||||
<span class="usage-icon">1️⃣</span>
|
||||
<h4>Start Schema Builder</h4>
|
||||
</div>
|
||||
<p>Click the extension icon and select "Schema Builder" to begin</p>
|
||||
<div class="tool-status">Available</div>
|
||||
</div>
|
||||
|
||||
<div class="usage-step">
|
||||
<div class="usage-header">
|
||||
<span class="usage-icon">2️⃣</span>
|
||||
<div class="tool-selector" data-tool="script-builder">
|
||||
<div class="tool-icon">🔴</div>
|
||||
<div class="tool-info">
|
||||
<h3>Script Builder</h3>
|
||||
<p>Browser automation</p>
|
||||
</div>
|
||||
<div class="tool-status alpha">Alpha</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Right Panel - Tool Details -->
|
||||
<div class="tool-details">
|
||||
<!-- Schema Builder Details -->
|
||||
<div class="tool-content active" id="schema-builder">
|
||||
<div class="tool-header">
|
||||
<h3>📊 Schema Builder</h3>
|
||||
<span class="tool-tagline">Click to extract data visually</span>
|
||||
</div>
|
||||
|
||||
<div class="tool-steps">
|
||||
<div class="step-item">
|
||||
<div class="step-number">1</div>
|
||||
<div class="step-content">
|
||||
<h4>Select Container</h4>
|
||||
<p>Click on any repeating element like product cards or articles</p>
|
||||
<div class="step-visual">
|
||||
<span class="highlight-green">■</span> Elements highlighted in green
|
||||
</div>
|
||||
<p>Click on a container element (e.g., product card, article, listing)</p>
|
||||
<div class="code-snippet">
|
||||
<span class="comment"># Container will be highlighted in green</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="usage-step">
|
||||
<div class="usage-header">
|
||||
<span class="usage-icon">3️⃣</span>
|
||||
<h4>Select Fields</h4>
|
||||
<div class="step-item">
|
||||
<div class="step-number">2</div>
|
||||
<div class="step-content">
|
||||
<h4>Mark Fields</h4>
|
||||
<p>Click on data fields inside the container</p>
|
||||
<div class="step-visual">
|
||||
<span class="highlight-pink">■</span> Fields highlighted in pink
|
||||
</div>
|
||||
<p>Click on individual fields inside the container and name them</p>
|
||||
<div class="code-snippet">
|
||||
<span class="comment"># Fields will be highlighted in pink</span>
|
||||
<span class="comment"># Examples: title, price, description, image</span>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="usage-step">
|
||||
<div class="usage-header">
|
||||
<span class="usage-icon">4️⃣</span>
|
||||
<h4>Generate Code</h4>
|
||||
<div class="step-item">
|
||||
<div class="step-number">3</div>
|
||||
<div class="step-content">
|
||||
<h4>Generate & Extract</h4>
|
||||
<p>Get your CSS selectors and Python code instantly</p>
|
||||
<div class="step-visual">
|
||||
<span class="highlight-accent">⚡</span> Ready to use code
|
||||
</div>
|
||||
<p>Click "Stop & Generate" to create your Python extraction code</p>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="tool-features">
|
||||
<div class="feature-tag">No CSS knowledge needed</div>
|
||||
<div class="feature-tag">Smart selector generation</div>
|
||||
<div class="feature-tag">LLM-ready schemas</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<!-- Script Builder Details -->
|
||||
<div class="tool-content" id="script-builder">
|
||||
<div class="tool-header">
|
||||
<h3>🔴 Script Builder</h3>
|
||||
<span class="tool-tagline">Record actions, generate automation</span>
|
||||
</div>
|
||||
|
||||
<div class="tool-steps">
|
||||
<div class="step-item">
|
||||
<div class="step-number">1</div>
|
||||
<div class="step-content">
|
||||
<h4>Hit Record</h4>
|
||||
<p>Start capturing your browser interactions</p>
|
||||
<div class="step-visual">
|
||||
<span class="recording-dot">●</span> Recording indicator
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="step-item">
|
||||
<div class="step-number">2</div>
|
||||
<div class="step-content">
|
||||
<h4>Interact Naturally</h4>
|
||||
<p>Click, type, scroll - everything is captured</p>
|
||||
<div class="step-visual">
|
||||
<span class="action-icon">🖱️</span> <span class="action-icon">⌨️</span> <span class="action-icon">📜</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="step-item">
|
||||
<div class="step-number">3</div>
|
||||
<div class="step-content">
|
||||
<h4>Export Script</h4>
|
||||
<p>Get JavaScript for Crawl4AI's js_code parameter</p>
|
||||
<div class="step-visual">
|
||||
<span class="highlight-accent">📝</span> Automation ready
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="tool-features">
|
||||
<div class="feature-tag">Smart action grouping</div>
|
||||
<div class="feature-tag">Wait detection</div>
|
||||
<div class="feature-tag">Keyboard shortcuts</div>
|
||||
<div class="feature-tag alpha-tag">Alpha version</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
<!-- Generated Code Example -->
|
||||
<section class="code-section">
|
||||
<h2>Generated Code Example</h2>
|
||||
<!-- Interactive Code Examples -->
|
||||
<section class="code-showcase">
|
||||
<h2>See the Generated Code</h2>
|
||||
|
||||
<div class="code-tabs">
|
||||
<button class="code-tab active" data-example="schema">📊 Schema Builder</button>
|
||||
<button class="code-tab" data-example="script">🔴 Script Builder</button>
|
||||
</div>
|
||||
|
||||
<div class="code-examples">
|
||||
<!-- Schema Builder Code -->
|
||||
<div class="code-example active" id="code-schema">
|
||||
<div class="terminal-window">
|
||||
<div class="terminal-header">
|
||||
<span class="terminal-title">example_extraction.py</span>
|
||||
<span class="terminal-title">schema_extraction.py</span>
|
||||
<button class="copy-button" data-code="schema">Copy</button>
|
||||
</div>
|
||||
<div class="terminal-content">
|
||||
<pre><code><span class="keyword">import</span> asyncio
|
||||
@@ -191,26 +279,17 @@
|
||||
<span class="string">"selector"</span>: <span class="string">"span.price"</span>,
|
||||
<span class="string">"type"</span>: <span class="string">"text"</span>
|
||||
},
|
||||
{
|
||||
<span class="string">"name"</span>: <span class="string">"description"</span>,
|
||||
<span class="string">"selector"</span>: <span class="string">"p.description"</span>,
|
||||
<span class="string">"type"</span>: <span class="string">"text"</span>
|
||||
},
|
||||
{
|
||||
<span class="string">"name"</span>: <span class="string">"image"</span>,
|
||||
<span class="string">"selector"</span>: <span class="string">"img.product-image"</span>,
|
||||
<span class="string">"selector"</span>: <span class="string">"img.product-img"</span>,
|
||||
<span class="string">"type"</span>: <span class="string">"attribute"</span>,
|
||||
<span class="string">"attribute"</span>: <span class="string">"src"</span>
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
<span class="comment"># Create extraction strategy</span>
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema, verbose=<span class="keyword">True</span>)
|
||||
|
||||
<span class="comment"># Configure the crawler</span>
|
||||
config = CrawlerRunConfig(
|
||||
extraction_strategy=extraction_strategy
|
||||
extraction_strategy=JsonCssExtractionStrategy(schema)
|
||||
)
|
||||
|
||||
<span class="keyword">async</span> <span class="keyword">with</span> AsyncWebCrawler() <span class="keyword">as</span> crawler:
|
||||
@@ -218,24 +297,76 @@
|
||||
url=<span class="string">"https://example.com/products"</span>,
|
||||
config=config
|
||||
)
|
||||
<span class="keyword">return</span> json.loads(result.extracted_content)
|
||||
|
||||
<span class="comment"># Parse the extracted data</span>
|
||||
products = json.loads(result.extracted_content)
|
||||
<span class="keyword">print</span>(<span class="string">f"Extracted {len(products)} products"</span>)
|
||||
asyncio.run(extract_products())</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<span class="comment"># Display first product</span>
|
||||
<span class="keyword">if</span> products:
|
||||
<span class="keyword">print</span>(json.dumps(products[0], indent=2))
|
||||
<!-- Script Builder Code -->
|
||||
<div class="code-example" id="code-script">
|
||||
<div class="terminal-window">
|
||||
<div class="terminal-header">
|
||||
<span class="terminal-title">automation_script.py</span>
|
||||
<button class="copy-button" data-code="script">Copy</button>
|
||||
</div>
|
||||
<div class="terminal-content">
|
||||
<pre><code><span class="keyword">import</span> asyncio
|
||||
<span class="keyword">from</span> crawl4ai <span class="keyword">import</span> AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
<span class="keyword">return</span> products
|
||||
<span class="comment"># JavaScript generated from your recorded actions</span>
|
||||
js_script = <span class="string">"""
|
||||
// Search for products
|
||||
document.querySelector('button.search-toggle').click();
|
||||
await new Promise(r => setTimeout(r, 500));
|
||||
|
||||
<span class="comment"># Run the extraction</span>
|
||||
<span class="keyword">if</span> __name__ == <span class="string">"__main__"</span>:
|
||||
asyncio.run(extract_products())</code></pre>
|
||||
// Type search query
|
||||
const searchInput = document.querySelector('input#search');
|
||||
searchInput.value = 'wireless headphones';
|
||||
searchInput.dispatchEvent(new Event('input', {bubbles: true}));
|
||||
|
||||
// Submit search
|
||||
searchInput.dispatchEvent(new KeyboardEvent('keydown', {
|
||||
key: 'Enter', keyCode: 13, bubbles: true
|
||||
}));
|
||||
|
||||
// Wait for results
|
||||
await new Promise(r => setTimeout(r, 2000));
|
||||
|
||||
// Click first product
|
||||
document.querySelector('.product-item:first-child').click();
|
||||
|
||||
// Wait for product page
|
||||
await new Promise(r => setTimeout(r, 1000));
|
||||
|
||||
// Add to cart
|
||||
document.querySelector('button.add-to-cart').click();
|
||||
"""</span>
|
||||
|
||||
<span class="keyword">async</span> <span class="keyword">def</span> <span class="function">automate_shopping</span>():
|
||||
config = CrawlerRunConfig(
|
||||
js_code=js_script,
|
||||
wait_for=<span class="string">"css:.cart-confirmation"</span>,
|
||||
screenshot=<span class="keyword">True</span>
|
||||
)
|
||||
|
||||
<span class="keyword">async</span> <span class="keyword">with</span> AsyncWebCrawler() <span class="keyword">as</span> crawler:
|
||||
result = <span class="keyword">await</span> crawler.arun(
|
||||
url=<span class="string">"https://shop.example.com"</span>,
|
||||
config=config
|
||||
)
|
||||
<span class="keyword">print</span>(<span class="string">f"✓ Automation complete: {result.url}"</span>)
|
||||
<span class="keyword">return</span> result
|
||||
|
||||
asyncio.run(automate_shopping())</code></pre>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
</section>
|
||||
|
||||
|
||||
<!-- Coming Soon Section -->
|
||||
<section class="coming-soon-section">
|
||||
<h2>Coming Soon: Even More Power</h2>
|
||||
@@ -279,17 +410,6 @@
|
||||
<code>🤖 Auto-detect fields • Smart naming • Pattern recognition</code>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="coming-feature">
|
||||
<div class="feature-header">
|
||||
<span class="feature-badge">Script</span>
|
||||
<h3>C4A Script Builder</h3>
|
||||
</div>
|
||||
<p>Visual automation script builder for complex interactions - fill forms, click buttons, handle pagination, all without writing code.</p>
|
||||
<div class="feature-preview">
|
||||
<code>🎯 Visual automation • Record & replay • Export as C4A script</code>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="stay-tuned">
|
||||
@@ -324,5 +444,61 @@
|
||||
</footer>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<script>
|
||||
// Tool Selector Interaction
|
||||
document.querySelectorAll('.tool-selector').forEach(selector => {
|
||||
selector.addEventListener('click', function() {
|
||||
// Remove active class from all selectors
|
||||
document.querySelectorAll('.tool-selector').forEach(s => s.classList.remove('active'));
|
||||
document.querySelectorAll('.tool-content').forEach(c => c.classList.remove('active'));
|
||||
|
||||
// Add active class to clicked selector
|
||||
this.classList.add('active');
|
||||
|
||||
// Show corresponding content
|
||||
const toolId = this.getAttribute('data-tool');
|
||||
document.getElementById(toolId).classList.add('active');
|
||||
});
|
||||
});
|
||||
|
||||
// Code Tab Interaction
|
||||
document.querySelectorAll('.code-tab').forEach(tab => {
|
||||
tab.addEventListener('click', function() {
|
||||
// Remove active class from all tabs
|
||||
document.querySelectorAll('.code-tab').forEach(t => t.classList.remove('active'));
|
||||
document.querySelectorAll('.code-example').forEach(e => e.classList.remove('active'));
|
||||
|
||||
// Add active class to clicked tab
|
||||
this.classList.add('active');
|
||||
|
||||
// Show corresponding code
|
||||
const exampleId = this.getAttribute('data-example');
|
||||
document.getElementById('code-' + exampleId).classList.add('active');
|
||||
});
|
||||
});
|
||||
|
||||
// Copy Button Functionality
|
||||
document.querySelectorAll('.copy-button').forEach(button => {
|
||||
button.addEventListener('click', async function() {
|
||||
const codeType = this.getAttribute('data-code');
|
||||
const codeElement = document.getElementById('code-' + codeType).querySelector('pre code');
|
||||
const codeText = codeElement.textContent;
|
||||
|
||||
try {
|
||||
await navigator.clipboard.writeText(codeText);
|
||||
this.textContent = 'Copied!';
|
||||
this.classList.add('copied');
|
||||
|
||||
setTimeout(() => {
|
||||
this.textContent = 'Copy';
|
||||
this.classList.remove('copied');
|
||||
}, 2000);
|
||||
} catch (err) {
|
||||
console.error('Failed to copy code:', err);
|
||||
}
|
||||
});
|
||||
});
|
||||
</script>
|
||||
</body>
|
||||
</html>
|
||||
@@ -1,8 +1,8 @@
|
||||
{
|
||||
"manifest_version": 3,
|
||||
"name": "Crawl4AI Assistant",
|
||||
"version": "1.0.1",
|
||||
"description": "Visual schema and script builder for Crawl4AI - Build extraction schemas by clicking on elements",
|
||||
"version": "1.2.1",
|
||||
"description": "Visual schema and script builder for Crawl4AI - Build extraction schemas and automation scripts by clicking and recording actions",
|
||||
"permissions": [
|
||||
"activeTab",
|
||||
"storage",
|
||||
|
||||
@@ -30,11 +30,11 @@
|
||||
</div>
|
||||
</button>
|
||||
|
||||
<button id="script-mode" class="mode-button script" disabled>
|
||||
<button id="script-mode" class="mode-button script">
|
||||
<div class="icon">🎯</div>
|
||||
<div class="mode-info">
|
||||
<h3>Script Builder</h3>
|
||||
<p>Coming soon - Build automation scripts</p>
|
||||
<h3>Script Builder <span style="color: #ff3c74; font-size: 10px;">(Alpha)</span></h3>
|
||||
<p>Record actions to build automation scripts</p>
|
||||
</div>
|
||||
</button>
|
||||
</div>
|
||||
|
||||
@@ -18,6 +18,10 @@ document.addEventListener('DOMContentLoaded', () => {
|
||||
startSchemaCapture();
|
||||
});
|
||||
|
||||
document.getElementById('script-mode').addEventListener('click', () => {
|
||||
startScriptCapture();
|
||||
});
|
||||
|
||||
// Session actions
|
||||
document.getElementById('generate-code').addEventListener('click', () => {
|
||||
generateCode();
|
||||
@@ -62,6 +66,19 @@ function startSchemaCapture() {
|
||||
});
|
||||
}
|
||||
|
||||
function startScriptCapture() {
|
||||
chrome.tabs.query({ active: true, currentWindow: true }, (tabs) => {
|
||||
chrome.tabs.sendMessage(tabs[0].id, {
|
||||
action: 'startScriptCapture'
|
||||
}, (response) => {
|
||||
if (response && response.success) {
|
||||
// Close the popup to let user interact with the page
|
||||
window.close();
|
||||
}
|
||||
});
|
||||
});
|
||||
}
|
||||
|
||||
function showActiveSession(stats) {
|
||||
document.querySelector('.mode-selector').style.display = 'none';
|
||||
document.getElementById('active-session').classList.remove('hidden');
|
||||
|
||||
@@ -11,8 +11,8 @@ Welcome to the Crawl4AI Apps Hub - your gateway to interactive tools and demos t
|
||||
}
|
||||
|
||||
.app-card {
|
||||
background: var(--md-code-bg-color);
|
||||
border: 1px solid var(--md-default-fg-color--lightest);
|
||||
background: #3f3f44;
|
||||
border: 1px solid #3f3f44;
|
||||
border-radius: 8px;
|
||||
padding: 1.5rem;
|
||||
transition: all 0.3s ease;
|
||||
@@ -23,7 +23,7 @@ Welcome to the Crawl4AI Apps Hub - your gateway to interactive tools and demos t
|
||||
.app-card:hover {
|
||||
transform: translateY(-4px);
|
||||
box-shadow: 0 8px 16px rgba(0, 0, 0, 0.3);
|
||||
border-color: var(--md-primary-fg-color);
|
||||
border-color: #50ffff;
|
||||
}
|
||||
|
||||
.app-card h3 {
|
||||
@@ -31,36 +31,38 @@ Welcome to the Crawl4AI Apps Hub - your gateway to interactive tools and demos t
|
||||
display: flex;
|
||||
align-items: center;
|
||||
gap: 0.5rem;
|
||||
color: #e8e9ed;
|
||||
}
|
||||
|
||||
.app-status {
|
||||
display: inline-block;
|
||||
padding: 0.2rem 0.6rem;
|
||||
border-radius: 4px;
|
||||
padding: 0.25rem 0.75rem;
|
||||
border-radius: 20px;
|
||||
font-size: 0.7rem;
|
||||
font-weight: bold;
|
||||
font-weight: 600;
|
||||
text-transform: uppercase;
|
||||
margin-bottom: 1rem;
|
||||
}
|
||||
|
||||
.status-available {
|
||||
background: #22c55e;
|
||||
color: #000;
|
||||
background: #50ffff;
|
||||
color: #070708;
|
||||
}
|
||||
|
||||
.status-beta {
|
||||
background: #f59e0b;
|
||||
color: #000;
|
||||
color: #070708;
|
||||
}
|
||||
|
||||
.status-coming-soon {
|
||||
background: var(--md-default-fg-color--lightest);
|
||||
color: var(--md-default-bg-color);
|
||||
background: #2a2a2a;
|
||||
color: #888;
|
||||
}
|
||||
|
||||
.app-description {
|
||||
margin: 1rem 0;
|
||||
line-height: 1.6;
|
||||
color: #a3abba;
|
||||
}
|
||||
|
||||
.app-features {
|
||||
@@ -73,13 +75,15 @@ Welcome to the Crawl4AI Apps Hub - your gateway to interactive tools and demos t
|
||||
padding-left: 1.5rem;
|
||||
position: relative;
|
||||
margin-bottom: 0.5rem;
|
||||
color: #d5cec0;
|
||||
font-size: 0.9rem;
|
||||
}
|
||||
|
||||
.app-features li:before {
|
||||
content: "✓";
|
||||
content: "▸";
|
||||
position: absolute;
|
||||
left: 0;
|
||||
color: var(--md-primary-fg-color);
|
||||
color: #50ffff;
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
@@ -89,35 +93,49 @@ Welcome to the Crawl4AI Apps Hub - your gateway to interactive tools and demos t
|
||||
|
||||
.app-btn {
|
||||
display: inline-block;
|
||||
padding: 0.8rem 1.5rem;
|
||||
background: var(--md-primary-fg-color);
|
||||
color: var(--md-primary-bg-color);
|
||||
padding: 0.75rem 1.5rem;
|
||||
background: #50ffff;
|
||||
color: #070708;
|
||||
text-decoration: none;
|
||||
border-radius: 6px;
|
||||
font-weight: bold;
|
||||
font-weight: 600;
|
||||
transition: all 0.2s ease;
|
||||
font-family: dm, Monaco, monospace;
|
||||
}
|
||||
|
||||
.app-btn:hover {
|
||||
background: var(--md-primary-fg-color--dark);
|
||||
background: #09b5a5;
|
||||
transform: scale(1.05);
|
||||
color: #070708;
|
||||
}
|
||||
|
||||
.app-btn.disabled {
|
||||
background: var(--md-default-fg-color--lightest);
|
||||
background: #2a2a2a;
|
||||
color: #666;
|
||||
cursor: not-allowed;
|
||||
transform: none;
|
||||
}
|
||||
|
||||
.app-btn.disabled:hover {
|
||||
background: #2a2a2a;
|
||||
transform: none;
|
||||
}
|
||||
|
||||
.intro-section {
|
||||
background: var(--md-code-bg-color);
|
||||
background: #3f3f44;
|
||||
border-radius: 8px;
|
||||
padding: 2rem;
|
||||
margin-bottom: 3rem;
|
||||
border: 1px solid #3f3f44;
|
||||
}
|
||||
|
||||
.intro-section h2 {
|
||||
margin-top: 0;
|
||||
color: #50ffff;
|
||||
}
|
||||
|
||||
.intro-section p {
|
||||
color: #d5cec0;
|
||||
}
|
||||
</style>
|
||||
|
||||
|
||||
@@ -71,33 +71,6 @@
|
||||
</section>
|
||||
|
||||
<section class="builder">
|
||||
<div class="special-contexts">
|
||||
<h2>Quick Presets</h2>
|
||||
<div class="preset-options">
|
||||
<label class="preset-option">
|
||||
<input type="radio" name="preset" value="vibe" id="preset-vibe">
|
||||
<div class="preset-card">
|
||||
<h3>🎯 Vibe Coding</h3>
|
||||
<p>Curated context for general AI prompting - perfect for exploring capabilities</p>
|
||||
</div>
|
||||
</label>
|
||||
<label class="preset-option">
|
||||
<input type="radio" name="preset" value="all" id="preset-all">
|
||||
<div class="preset-card">
|
||||
<h3>📚 Complete Library</h3>
|
||||
<p>Comprehensive context including all components and perspectives</p>
|
||||
</div>
|
||||
</label>
|
||||
<label class="preset-option">
|
||||
<input type="radio" name="preset" value="custom" id="preset-custom" checked>
|
||||
<div class="preset-card">
|
||||
<h3>🔧 Custom Selection</h3>
|
||||
<p>Choose specific components and context types</p>
|
||||
</div>
|
||||
</label>
|
||||
</div>
|
||||
</div>
|
||||
|
||||
<div class="component-selector" id="component-selector">
|
||||
<h2>Select Components & Context Types</h2>
|
||||
<div class="select-all-controls">
|
||||
@@ -111,9 +84,9 @@
|
||||
<tr>
|
||||
<th width="50"></th>
|
||||
<th>Component</th>
|
||||
<th class="clickable-header" data-type="memory">Memory</th>
|
||||
<th class="clickable-header" data-type="reasoning">Reasoning</th>
|
||||
<th class="clickable-header" data-type="examples">Examples</th>
|
||||
<th class="clickable-header" data-type="memory">Memory<br><span class="header-subtitle">Full Content</span></th>
|
||||
<th class="clickable-header" data-type="reasoning">Reasoning<br><span class="header-subtitle">Diagrams</span></th>
|
||||
<th class="clickable-header" data-type="examples">Examples<br><span class="header-subtitle">Code</span></th>
|
||||
</tr>
|
||||
</thead>
|
||||
<tbody id="components-tbody">
|
||||
@@ -124,6 +97,10 @@
|
||||
</div>
|
||||
|
||||
<div class="action-area">
|
||||
<div class="token-summary" id="token-summary">
|
||||
<span class="token-label">Estimated Tokens:</span>
|
||||
<span class="token-count" id="total-tokens">0</span>
|
||||
</div>
|
||||
<button class="download-btn" id="download-btn">
|
||||
<span class="icon">⬇</span> Generate & Download Context
|
||||
</button>
|
||||
|
||||
@@ -363,6 +363,15 @@ body {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
.header-subtitle {
|
||||
font-size: 10px;
|
||||
color: var(--tertiary-color);
|
||||
text-transform: none;
|
||||
font-weight: normal;
|
||||
display: block;
|
||||
margin-top: 2px;
|
||||
}
|
||||
|
||||
.component-selection-table th.clickable-header {
|
||||
cursor: pointer;
|
||||
user-select: none;
|
||||
@@ -374,6 +383,16 @@ body {
|
||||
color: var(--background-color);
|
||||
}
|
||||
|
||||
.component-selection-table th.clickable-header[data-type="examples"] {
|
||||
cursor: default;
|
||||
opacity: 0.5;
|
||||
}
|
||||
|
||||
.component-selection-table th.clickable-header[data-type="examples"]:hover {
|
||||
background-color: var(--hover-bg);
|
||||
color: var(--primary-color);
|
||||
}
|
||||
|
||||
.component-selection-table th:nth-child(3),
|
||||
.component-selection-table th:nth-child(4),
|
||||
.component-selection-table th:nth-child(5) {
|
||||
@@ -400,12 +419,25 @@ body {
|
||||
font-weight: bold;
|
||||
}
|
||||
|
||||
/* Token display in table cells */
|
||||
.token-info {
|
||||
display: block;
|
||||
font-size: 11px;
|
||||
color: var(--tertiary-color);
|
||||
margin-top: 2px;
|
||||
}
|
||||
|
||||
.component-selection-table input[type="checkbox"] {
|
||||
cursor: pointer;
|
||||
width: 16px;
|
||||
height: 16px;
|
||||
}
|
||||
|
||||
.component-selection-table input[type="checkbox"]:disabled {
|
||||
cursor: not-allowed;
|
||||
opacity: 0.3;
|
||||
}
|
||||
|
||||
/* Disabled row state */
|
||||
.component-selection-table tr.disabled td:not(:first-child) {
|
||||
opacity: 0.5;
|
||||
@@ -418,6 +450,30 @@ body {
|
||||
margin: 40px 0;
|
||||
}
|
||||
|
||||
/* Token Summary */
|
||||
.token-summary {
|
||||
margin-bottom: 20px;
|
||||
font-size: 16px;
|
||||
}
|
||||
|
||||
.token-label {
|
||||
color: var(--tertiary-color);
|
||||
margin-right: 10px;
|
||||
}
|
||||
|
||||
.token-count {
|
||||
color: var(--primary-color);
|
||||
font-weight: bold;
|
||||
font-size: 20px;
|
||||
}
|
||||
|
||||
.token-count::after {
|
||||
content: " est.";
|
||||
font-size: 12px;
|
||||
color: var(--tertiary-color);
|
||||
margin-left: 4px;
|
||||
}
|
||||
|
||||
.download-btn {
|
||||
background-color: var(--primary-dimmed);
|
||||
color: var(--background-color);
|
||||
|
||||
@@ -1,48 +1,61 @@
|
||||
// Crawl4AI LLM Context Builder JavaScript
|
||||
|
||||
// Component definitions
|
||||
// Component definitions - order matters
|
||||
const components = [
|
||||
{
|
||||
id: 'all',
|
||||
name: 'All Components',
|
||||
description: 'All components with all context types',
|
||||
special: true
|
||||
id: 'installation',
|
||||
name: 'Installation',
|
||||
description: 'Setup and installation options'
|
||||
},
|
||||
{
|
||||
id: 'core',
|
||||
name: 'Core Functionality',
|
||||
description: 'Basic crawling and scraping features'
|
||||
id: 'simple_crawling',
|
||||
name: 'Simple Crawling',
|
||||
description: 'Basic web crawling operations'
|
||||
},
|
||||
{
|
||||
id: 'config_objects',
|
||||
name: 'Configuration Objects',
|
||||
description: 'Browser and crawler configuration'
|
||||
},
|
||||
{
|
||||
id: 'deep_crawling',
|
||||
name: 'Deep Crawling',
|
||||
description: 'Multi-page crawling strategies'
|
||||
},
|
||||
{
|
||||
id: 'deployment',
|
||||
name: 'Deployment',
|
||||
description: 'Installation and Docker setup'
|
||||
},
|
||||
{
|
||||
id: 'extraction',
|
||||
name: 'Data Extraction',
|
||||
description: 'Structured data extraction strategies'
|
||||
},
|
||||
{
|
||||
id: 'markdown',
|
||||
name: 'Markdown Generation',
|
||||
description: 'Content-to-markdown conversion'
|
||||
id: 'multi_urls_crawling',
|
||||
name: 'Multi URLs Crawling',
|
||||
description: 'Crawling multiple URLs efficiently'
|
||||
},
|
||||
{
|
||||
id: 'vibe',
|
||||
name: 'Vibe Coding',
|
||||
description: 'General-purpose AI context',
|
||||
special: false
|
||||
id: 'deep_crawling',
|
||||
name: 'Deep Crawling',
|
||||
description: 'Multi-page crawling strategies'
|
||||
},
|
||||
{
|
||||
id: 'docker',
|
||||
name: 'Docker',
|
||||
description: 'Docker deployment and configuration'
|
||||
},
|
||||
{
|
||||
id: 'cli',
|
||||
name: 'CLI',
|
||||
description: 'Command-line interface usage'
|
||||
},
|
||||
{
|
||||
id: 'http_based_crawler_strategy',
|
||||
name: 'HTTP-based Crawler',
|
||||
description: 'HTTP crawler strategy implementation'
|
||||
},
|
||||
{
|
||||
id: 'url_seeder',
|
||||
name: 'URL Seeder',
|
||||
description: 'URL seeding and discovery'
|
||||
},
|
||||
{
|
||||
id: 'deep_crawl_advanced_filters_scorers',
|
||||
name: 'Advanced Filters & Scorers',
|
||||
description: 'Deep crawl filtering and scoring'
|
||||
}
|
||||
];
|
||||
|
||||
@@ -51,45 +64,47 @@ const contextTypes = ['memory', 'reasoning', 'examples'];
|
||||
|
||||
// State management
|
||||
const state = {
|
||||
preset: 'custom',
|
||||
selectedComponents: new Set(),
|
||||
selectedContextTypes: new Map()
|
||||
selectedContextTypes: new Map(),
|
||||
tokenCounts: new Map() // Store token counts for each file
|
||||
};
|
||||
|
||||
// Initialize the application
|
||||
document.addEventListener('DOMContentLoaded', () => {
|
||||
setupPresetHandlers();
|
||||
renderComponents();
|
||||
renderReferenceTable();
|
||||
setupActionHandlers();
|
||||
setupColumnHeaderHandlers();
|
||||
|
||||
// Initialize only core component as selected with all context types
|
||||
state.selectedComponents.add('core');
|
||||
state.selectedContextTypes.set('core', new Set(contextTypes));
|
||||
// Initialize first component as selected with available context types
|
||||
const firstComponent = components[0];
|
||||
state.selectedComponents.add(firstComponent.id);
|
||||
state.selectedContextTypes.set(firstComponent.id, new Set(['memory', 'reasoning']));
|
||||
updateComponentUI();
|
||||
});
|
||||
|
||||
// Setup preset radio button handlers
|
||||
function setupPresetHandlers() {
|
||||
const presetRadios = document.querySelectorAll('input[name="preset"]');
|
||||
presetRadios.forEach(radio => {
|
||||
radio.addEventListener('change', (e) => {
|
||||
state.preset = e.target.value;
|
||||
updatePresetSelection();
|
||||
});
|
||||
});
|
||||
// Helper function to count tokens (words × 2.5)
|
||||
function estimateTokens(text) {
|
||||
if (!text) return 0;
|
||||
const words = text.trim().split(/\s+/).length;
|
||||
return Math.round(words * 2.5);
|
||||
}
|
||||
|
||||
// Update UI based on preset selection
|
||||
function updatePresetSelection() {
|
||||
const componentSelector = document.getElementById('component-selector');
|
||||
// Update total token count display
|
||||
function updateTotalTokenCount() {
|
||||
let totalTokens = 0;
|
||||
|
||||
if (state.preset === 'custom') {
|
||||
componentSelector.style.display = 'block';
|
||||
} else {
|
||||
componentSelector.style.display = 'none';
|
||||
state.selectedComponents.forEach(compId => {
|
||||
const types = state.selectedContextTypes.get(compId);
|
||||
if (types) {
|
||||
types.forEach(type => {
|
||||
const key = `${compId}-${type}`;
|
||||
totalTokens += state.tokenCounts.get(key) || 0;
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
document.getElementById('total-tokens').textContent = totalTokens.toLocaleString();
|
||||
}
|
||||
|
||||
// Render component selection table
|
||||
@@ -97,10 +112,13 @@ function renderComponents() {
|
||||
const tbody = document.getElementById('components-tbody');
|
||||
tbody.innerHTML = '';
|
||||
|
||||
components.filter(c => !c.special).forEach(component => {
|
||||
components.forEach(component => {
|
||||
const row = createComponentRow(component);
|
||||
tbody.appendChild(row);
|
||||
});
|
||||
|
||||
// Fetch token counts for all files
|
||||
fetchAllTokenCounts();
|
||||
}
|
||||
|
||||
// Create a component table row
|
||||
@@ -124,9 +142,17 @@ function createComponentRow(component) {
|
||||
// Context type cells
|
||||
contextTypes.forEach(type => {
|
||||
const td = document.createElement('td');
|
||||
const key = `${component.id}-${type}`;
|
||||
const tokenCount = state.tokenCounts.get(key) || 0;
|
||||
const isDisabled = type === 'examples' ? 'disabled' : '';
|
||||
|
||||
td.innerHTML = `
|
||||
<input type="checkbox" id="check-${component.id}-${type}"
|
||||
data-component="${component.id}" data-type="${type}">
|
||||
data-component="${component.id}" data-type="${type}"
|
||||
${isDisabled}>
|
||||
<span class="token-info" id="tokens-${component.id}-${type}">
|
||||
${tokenCount > 0 ? `${tokenCount.toLocaleString()} tokens` : ''}
|
||||
</span>
|
||||
`;
|
||||
tr.appendChild(td);
|
||||
});
|
||||
@@ -140,9 +166,11 @@ function createComponentRow(component) {
|
||||
// Add event listeners for context type checkboxes
|
||||
contextTypes.forEach(type => {
|
||||
const typeCheckbox = tr.querySelector(`#check-${component.id}-${type}`);
|
||||
if (!typeCheckbox.disabled) {
|
||||
typeCheckbox.addEventListener('change', (e) => {
|
||||
handleContextTypeToggle(component.id, type, e.target.checked);
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return tr;
|
||||
@@ -152,12 +180,12 @@ function createComponentRow(component) {
|
||||
function handleComponentToggle(componentId, checked) {
|
||||
if (checked) {
|
||||
state.selectedComponents.add(componentId);
|
||||
// Select all context types when component is selected
|
||||
// Select only available context types when component is selected
|
||||
if (!state.selectedContextTypes.has(componentId)) {
|
||||
state.selectedContextTypes.set(componentId, new Set(contextTypes));
|
||||
state.selectedContextTypes.set(componentId, new Set(['memory', 'reasoning']));
|
||||
} else {
|
||||
// If component was already partially selected, select all
|
||||
state.selectedContextTypes.set(componentId, new Set(contextTypes));
|
||||
// If component was already partially selected, select all available
|
||||
state.selectedContextTypes.set(componentId, new Set(['memory', 'reasoning']));
|
||||
}
|
||||
} else {
|
||||
state.selectedComponents.delete(componentId);
|
||||
@@ -195,8 +223,10 @@ function handleContextTypeToggle(componentId, type, checked) {
|
||||
|
||||
// Update UI to reflect current state
|
||||
function updateComponentUI() {
|
||||
components.filter(c => !c.special).forEach(component => {
|
||||
components.forEach(component => {
|
||||
const row = document.getElementById(`component-${component.id}`);
|
||||
if (!row) return;
|
||||
|
||||
const mainCheckbox = row.querySelector(`#check-${component.id}`);
|
||||
const hasSelection = state.selectedComponents.has(component.id);
|
||||
const selectedTypes = state.selectedContextTypes.get(component.id) || new Set();
|
||||
@@ -213,15 +243,93 @@ function updateComponentUI() {
|
||||
typeCheckbox.checked = selectedTypes.has(type);
|
||||
});
|
||||
});
|
||||
|
||||
updateTotalTokenCount();
|
||||
}
|
||||
|
||||
// Fetch token counts for all files
|
||||
async function fetchAllTokenCounts() {
|
||||
const promises = [];
|
||||
|
||||
components.forEach(component => {
|
||||
contextTypes.forEach(type => {
|
||||
promises.push(fetchTokenCount(component.id, type));
|
||||
});
|
||||
});
|
||||
|
||||
await Promise.all(promises);
|
||||
updateComponentUI();
|
||||
renderReferenceTable(); // Update reference table with token counts
|
||||
}
|
||||
|
||||
// Fetch token count for a specific file
|
||||
async function fetchTokenCount(componentId, type) {
|
||||
const key = `${componentId}-${type}`;
|
||||
|
||||
try {
|
||||
const fileName = getFileName(componentId, type);
|
||||
const baseUrl = getBaseUrl(type);
|
||||
const response = await fetch(baseUrl + fileName);
|
||||
|
||||
if (response.ok) {
|
||||
const content = await response.text();
|
||||
const tokens = estimateTokens(content);
|
||||
state.tokenCounts.set(key, tokens);
|
||||
|
||||
// Update UI
|
||||
const tokenSpan = document.getElementById(`tokens-${componentId}-${type}`);
|
||||
if (tokenSpan) {
|
||||
tokenSpan.textContent = `${tokens.toLocaleString()} tokens`;
|
||||
}
|
||||
} else if (type === 'examples') {
|
||||
// Examples might not exist yet
|
||||
state.tokenCounts.set(key, 0);
|
||||
const tokenSpan = document.getElementById(`tokens-${componentId}-${type}`);
|
||||
if (tokenSpan) {
|
||||
tokenSpan.textContent = '';
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn(`Failed to fetch token count for ${componentId}-${type}`);
|
||||
if (type === 'examples') {
|
||||
const tokenSpan = document.getElementById(`tokens-${componentId}-${type}`);
|
||||
if (tokenSpan) {
|
||||
tokenSpan.textContent = '';
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Get file name based on component and type
|
||||
function getFileName(componentId, type) {
|
||||
// For new structure, all files are just [componentId].txt
|
||||
return `${componentId}.txt`;
|
||||
}
|
||||
|
||||
// Get base URL based on context type
|
||||
function getBaseUrl(type) {
|
||||
// For MkDocs, we need to go up to the root level
|
||||
const basePrefix = window.location.pathname.includes('/apps/') ? '../../' : '/';
|
||||
|
||||
switch(type) {
|
||||
case 'memory':
|
||||
return basePrefix + 'assets/llm.txt/txt/';
|
||||
case 'reasoning':
|
||||
return basePrefix + 'assets/llm.txt/diagrams/';
|
||||
case 'examples':
|
||||
return basePrefix + 'assets/llm.txt/examples/'; // Will return 404 for now
|
||||
default:
|
||||
return basePrefix + 'assets/llm.txt/txt/';
|
||||
}
|
||||
}
|
||||
|
||||
// Setup action button handlers
|
||||
function setupActionHandlers() {
|
||||
// Select/Deselect all buttons
|
||||
document.getElementById('select-all').addEventListener('click', () => {
|
||||
components.filter(c => !c.special).forEach(comp => {
|
||||
components.forEach(comp => {
|
||||
state.selectedComponents.add(comp.id);
|
||||
state.selectedContextTypes.set(comp.id, new Set(contextTypes));
|
||||
state.selectedContextTypes.set(comp.id, new Set(['memory', 'reasoning']));
|
||||
});
|
||||
updateComponentUI();
|
||||
});
|
||||
@@ -249,9 +357,12 @@ function setupColumnHeaderHandlers() {
|
||||
|
||||
// Toggle all checkboxes in a column
|
||||
function toggleColumnSelection(type) {
|
||||
// Don't toggle examples column
|
||||
if (type === 'examples') return;
|
||||
|
||||
// Check if all are currently selected
|
||||
let allSelected = true;
|
||||
components.filter(c => !c.special).forEach(comp => {
|
||||
components.forEach(comp => {
|
||||
const types = state.selectedContextTypes.get(comp.id);
|
||||
if (!types || !types.has(type)) {
|
||||
allSelected = false;
|
||||
@@ -259,7 +370,7 @@ function toggleColumnSelection(type) {
|
||||
});
|
||||
|
||||
// Toggle all
|
||||
components.filter(c => !c.special).forEach(comp => {
|
||||
components.forEach(comp => {
|
||||
if (!state.selectedContextTypes.has(comp.id)) {
|
||||
state.selectedContextTypes.set(comp.id, new Set());
|
||||
}
|
||||
@@ -314,46 +425,50 @@ async function handleDownload() {
|
||||
function getSelectedFiles() {
|
||||
const files = [];
|
||||
|
||||
if (state.preset === 'vibe') {
|
||||
files.push('crawl4ai_vibe.llm.full.md');
|
||||
} else if (state.preset === 'all') {
|
||||
// Use the dedicated aggregated files for all components
|
||||
files.push('crawl4ai_all_memory_content.llm.md');
|
||||
files.push('crawl4ai_all_reasoning_content.llm.md');
|
||||
files.push('crawl4ai_all_examples_content.llm.md');
|
||||
} else {
|
||||
// Custom selection
|
||||
// Build list of selected files with their context info
|
||||
state.selectedComponents.forEach(compId => {
|
||||
const types = state.selectedContextTypes.get(compId);
|
||||
if (types) {
|
||||
types.forEach(type => {
|
||||
files.push(`crawl4ai_${compId}_${type}_content.llm.md`);
|
||||
files.push({
|
||||
componentId: compId,
|
||||
type: type,
|
||||
fileName: getFileName(compId, type),
|
||||
baseUrl: getBaseUrl(type)
|
||||
});
|
||||
});
|
||||
}
|
||||
});
|
||||
}
|
||||
|
||||
return files;
|
||||
}
|
||||
|
||||
// Fetch multiple files
|
||||
async function fetchFiles(fileNames) {
|
||||
// Use /assets/llmtxt/ path with .txt extension
|
||||
const baseUrl = '/assets/llmtxt/';
|
||||
const promises = fileNames.map(async (fileName) => {
|
||||
// Convert .md to .txt for fetching
|
||||
const txtFileName = fileName.replace('.md', '.txt');
|
||||
async function fetchFiles(fileInfos) {
|
||||
const promises = fileInfos.map(async (fileInfo) => {
|
||||
try {
|
||||
const response = await fetch(baseUrl + txtFileName);
|
||||
const response = await fetch(fileInfo.baseUrl + fileInfo.fileName);
|
||||
if (!response.ok) {
|
||||
console.warn(`Failed to fetch ${txtFileName} from ${baseUrl + txtFileName}`);
|
||||
return { fileName, content: `<!-- Failed to load ${fileName} -->` };
|
||||
if (fileInfo.type === 'examples') {
|
||||
return {
|
||||
fileInfo,
|
||||
content: `<!-- Examples for ${fileInfo.componentId} coming soon -->\n\nExamples are currently being developed for this component.`
|
||||
};
|
||||
}
|
||||
console.warn(`Failed to fetch ${fileInfo.fileName} from ${fileInfo.baseUrl + fileInfo.fileName}`);
|
||||
return { fileInfo, content: `<!-- Failed to load ${fileInfo.fileName} -->` };
|
||||
}
|
||||
const content = await response.text();
|
||||
return { fileName, content };
|
||||
return { fileInfo, content };
|
||||
} catch (error) {
|
||||
console.warn(`Error fetching ${txtFileName} from ${baseUrl + txtFileName}:`, error);
|
||||
return { fileName, content: `<!-- Error loading ${fileName} -->` };
|
||||
if (fileInfo.type === 'examples') {
|
||||
return {
|
||||
fileInfo,
|
||||
content: `<!-- Examples for ${fileInfo.componentId} coming soon -->\n\nExamples are currently being developed for this component.`
|
||||
};
|
||||
}
|
||||
console.warn(`Error fetching ${fileInfo.fileName}:`, error);
|
||||
return { fileInfo, content: `<!-- Error loading ${fileInfo.fileName} -->` };
|
||||
}
|
||||
});
|
||||
|
||||
@@ -362,20 +477,31 @@ async function fetchFiles(fileNames) {
|
||||
|
||||
// Combine file contents with headers
|
||||
function combineContents(fileContents) {
|
||||
// Calculate total tokens
|
||||
let totalTokens = 0;
|
||||
fileContents.forEach(({ content }) => {
|
||||
totalTokens += estimateTokens(content);
|
||||
});
|
||||
|
||||
const header = `# Crawl4AI Custom LLM Context
|
||||
Generated on: ${new Date().toISOString()}
|
||||
Total files: ${fileContents.length}
|
||||
Estimated tokens: ${totalTokens.toLocaleString()}
|
||||
|
||||
---
|
||||
|
||||
`;
|
||||
|
||||
const sections = fileContents.map(({ fileName, content }) => {
|
||||
const componentName = extractComponentName(fileName);
|
||||
const contextType = extractContextType(fileName);
|
||||
const sections = fileContents.map(({ fileInfo, content }) => {
|
||||
const component = components.find(c => c.id === fileInfo.componentId);
|
||||
const componentName = component ? component.name : fileInfo.componentId;
|
||||
const contextType = getContextTypeName(fileInfo.type);
|
||||
const tokens = estimateTokens(content);
|
||||
|
||||
return `## ${componentName} - ${contextType}
|
||||
Source: ${fileName}
|
||||
Component ID: ${fileInfo.componentId}
|
||||
Context Type: ${fileInfo.type}
|
||||
Estimated tokens: ${tokens.toLocaleString()}
|
||||
|
||||
${content}
|
||||
|
||||
@@ -387,25 +513,14 @@ ${content}
|
||||
return header + sections.join('\n');
|
||||
}
|
||||
|
||||
// Extract component name from filename
|
||||
function extractComponentName(fileName) {
|
||||
// Pattern: crawl4ai_{component}_{type}_content.llm.md
|
||||
const match = fileName.match(/crawl4ai_(.+?)_(memory|reasoning|examples|llm\.full)/);
|
||||
if (match) {
|
||||
const compId = match[1];
|
||||
const component = components.find(c => c.id === compId);
|
||||
return component ? component.name : compId.replace(/_/g, ' ').replace(/\b\w/g, l => l.toUpperCase());
|
||||
// Get display name for context type
|
||||
function getContextTypeName(type) {
|
||||
switch(type) {
|
||||
case 'memory': return 'Full Content';
|
||||
case 'reasoning': return 'Diagrams & Workflows';
|
||||
case 'examples': return 'Code Examples';
|
||||
default: return type;
|
||||
}
|
||||
return 'Unknown Component';
|
||||
}
|
||||
|
||||
// Extract context type from filename
|
||||
function extractContextType(fileName) {
|
||||
if (fileName.includes('_memory_')) return 'Memory';
|
||||
if (fileName.includes('_reasoning_')) return 'Reasoning';
|
||||
if (fileName.includes('_examples_')) return 'Examples';
|
||||
if (fileName.includes('.llm.full')) return 'Complete Context';
|
||||
return 'Context';
|
||||
}
|
||||
|
||||
// Download file to user's computer
|
||||
@@ -426,33 +541,35 @@ function renderReferenceTable() {
|
||||
const tbody = document.getElementById('reference-table-body');
|
||||
tbody.innerHTML = '';
|
||||
|
||||
// Since vibe is no longer special, just show all components the same way
|
||||
// Get base path for links
|
||||
const basePrefix = window.location.pathname.includes('/apps/') ? '../../' : '/';
|
||||
|
||||
components.forEach(component => {
|
||||
const row = document.createElement('tr');
|
||||
const memoryTokens = state.tokenCounts.get(`${component.id}-memory`) || 0;
|
||||
const reasoningTokens = state.tokenCounts.get(`${component.id}-reasoning`) || 0;
|
||||
const examplesTokens = state.tokenCounts.get(`${component.id}-examples`) || 0;
|
||||
|
||||
row.innerHTML = `
|
||||
<td><strong>${component.name}</strong></td>
|
||||
<td><a href="/assets/llmtxt/crawl4ai_${component.id}_memory_content.llm.txt" class="file-link" target="_blank">Memory</a></td>
|
||||
<td><a href="/assets/llmtxt/crawl4ai_${component.id}_reasoning_content.llm.txt" class="file-link" target="_blank">Reasoning</a></td>
|
||||
<td><a href="/assets/llmtxt/crawl4ai_${component.id}_examples_content.llm.txt" class="file-link" target="_blank">Examples</a></td>
|
||||
<td><a href="/assets/llmtxt/crawl4ai_${component.id}.llm.full.txt" class="file-link" target="_blank">Full</a></td>
|
||||
<td>
|
||||
<a href="${basePrefix}assets/llm.txt/txt/${component.id}.txt" class="file-link" target="_blank">Memory</a>
|
||||
${memoryTokens > 0 ? `<span class="file-size">${memoryTokens.toLocaleString()} tokens</span>` : ''}
|
||||
</td>
|
||||
<td>
|
||||
<a href="${basePrefix}assets/llm.txt/diagrams/${component.id}.txt" class="file-link" target="_blank">Reasoning</a>
|
||||
${reasoningTokens > 0 ? `<span class="file-size">${reasoningTokens.toLocaleString()} tokens</span>` : ''}
|
||||
</td>
|
||||
<td>
|
||||
${examplesTokens > 0
|
||||
? `<a href="${basePrefix}assets/llm.txt/examples/${component.id}.txt" class="file-link" target="_blank">Examples</a>
|
||||
<span class="file-size">${examplesTokens.toLocaleString()} tokens</span>`
|
||||
: '-'
|
||||
}
|
||||
</td>
|
||||
<td>-</td>
|
||||
`;
|
||||
tbody.appendChild(row);
|
||||
});
|
||||
}
|
||||
|
||||
// Check if examples file exists (all components have examples)
|
||||
function hasExamplesFile(componentId) {
|
||||
// All components have examples files
|
||||
return true;
|
||||
}
|
||||
|
||||
// Check if full file exists (all components have full files)
|
||||
function hasFullFile(componentId) {
|
||||
// All components have full files
|
||||
return true;
|
||||
}
|
||||
|
||||
// Utility function to capitalize first letter
|
||||
function capitalizeFirst(str) {
|
||||
return str.charAt(0).toUpperCase() + str.slice(1);
|
||||
}
|
||||
425
docs/md_v2/assets/llm.txt/diagrams/cli.txt
Normal file
425
docs/md_v2/assets/llm.txt/diagrams/cli.txt
Normal file
@@ -0,0 +1,425 @@
|
||||
## CLI Workflows and Profile Management
|
||||
|
||||
Visual representations of command-line interface operations, browser profile management, and identity-based crawling workflows.
|
||||
|
||||
### CLI Command Flow Architecture
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[crwl command] --> B{Command Type?}
|
||||
|
||||
B -->|URL Crawling| C[Parse URL & Options]
|
||||
B -->|Profile Management| D[profiles subcommand]
|
||||
B -->|CDP Browser| E[cdp subcommand]
|
||||
B -->|Browser Control| F[browser subcommand]
|
||||
B -->|Configuration| G[config subcommand]
|
||||
|
||||
C --> C1{Output Format?}
|
||||
C1 -->|Default| C2[HTML/Markdown]
|
||||
C1 -->|JSON| C3[Structured Data]
|
||||
C1 -->|markdown| C4[Clean Markdown]
|
||||
C1 -->|markdown-fit| C5[Filtered Content]
|
||||
|
||||
C --> C6{Authentication?}
|
||||
C6 -->|Profile Specified| C7[Load Browser Profile]
|
||||
C6 -->|No Profile| C8[Anonymous Session]
|
||||
|
||||
C7 --> C9[Launch with User Data]
|
||||
C8 --> C10[Launch Clean Browser]
|
||||
|
||||
C9 --> C11[Execute Crawl]
|
||||
C10 --> C11
|
||||
|
||||
C11 --> C12{Success?}
|
||||
C12 -->|Yes| C13[Return Results]
|
||||
C12 -->|No| C14[Error Handling]
|
||||
|
||||
D --> D1[Interactive Profile Menu]
|
||||
D1 --> D2{Menu Choice?}
|
||||
D2 -->|Create| D3[Open Browser for Setup]
|
||||
D2 -->|List| D4[Show Existing Profiles]
|
||||
D2 -->|Delete| D5[Remove Profile]
|
||||
D2 -->|Use| D6[Crawl with Profile]
|
||||
|
||||
E --> E1[Launch CDP Browser]
|
||||
E1 --> E2[Remote Debugging Active]
|
||||
|
||||
F --> F1{Browser Action?}
|
||||
F1 -->|start| F2[Start Builtin Browser]
|
||||
F1 -->|stop| F3[Stop Builtin Browser]
|
||||
F1 -->|status| F4[Check Browser Status]
|
||||
F1 -->|view| F5[Open Browser Window]
|
||||
|
||||
G --> G1{Config Action?}
|
||||
G1 -->|list| G2[Show All Settings]
|
||||
G1 -->|set| G3[Update Setting]
|
||||
G1 -->|get| G4[Read Setting]
|
||||
|
||||
style A fill:#e1f5fe
|
||||
style C13 fill:#c8e6c9
|
||||
style C14 fill:#ffcdd2
|
||||
style D3 fill:#fff3e0
|
||||
style E2 fill:#f3e5f5
|
||||
```
|
||||
|
||||
### Profile Management Workflow
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant User
|
||||
participant CLI
|
||||
participant ProfileManager
|
||||
participant Browser
|
||||
participant FileSystem
|
||||
|
||||
User->>CLI: crwl profiles
|
||||
CLI->>ProfileManager: Initialize profile manager
|
||||
ProfileManager->>FileSystem: Scan for existing profiles
|
||||
FileSystem-->>ProfileManager: Profile list
|
||||
ProfileManager-->>CLI: Show interactive menu
|
||||
CLI-->>User: Display options
|
||||
|
||||
Note over User: User selects "Create new profile"
|
||||
|
||||
User->>CLI: Create profile "linkedin-auth"
|
||||
CLI->>ProfileManager: create_profile("linkedin-auth")
|
||||
ProfileManager->>FileSystem: Create profile directory
|
||||
ProfileManager->>Browser: Launch with new user data dir
|
||||
Browser-->>User: Opens browser window
|
||||
|
||||
Note over User: User manually logs in to LinkedIn
|
||||
|
||||
User->>Browser: Navigate and authenticate
|
||||
Browser->>FileSystem: Save cookies, session data
|
||||
User->>CLI: Press 'q' to save profile
|
||||
CLI->>ProfileManager: finalize_profile()
|
||||
ProfileManager->>FileSystem: Lock profile settings
|
||||
ProfileManager-->>CLI: Profile saved
|
||||
CLI-->>User: Profile "linkedin-auth" created
|
||||
|
||||
Note over User: Later usage
|
||||
|
||||
User->>CLI: crwl https://linkedin.com/feed -p linkedin-auth
|
||||
CLI->>ProfileManager: load_profile("linkedin-auth")
|
||||
ProfileManager->>FileSystem: Read profile data
|
||||
FileSystem-->>ProfileManager: User data directory
|
||||
ProfileManager-->>CLI: Profile configuration
|
||||
CLI->>Browser: Launch with existing profile
|
||||
Browser-->>CLI: Authenticated session ready
|
||||
CLI->>Browser: Navigate to target URL
|
||||
Browser-->>CLI: Crawl results with auth context
|
||||
CLI-->>User: Authenticated content
|
||||
```
|
||||
|
||||
### Browser Management State Machine
|
||||
|
||||
```mermaid
|
||||
stateDiagram-v2
|
||||
[*] --> Stopped: Initial state
|
||||
|
||||
Stopped --> Starting: crwl browser start
|
||||
Starting --> Running: Browser launched
|
||||
Running --> Viewing: crwl browser view
|
||||
Viewing --> Running: Close window
|
||||
Running --> Stopping: crwl browser stop
|
||||
Stopping --> Stopped: Cleanup complete
|
||||
|
||||
Running --> Restarting: crwl browser restart
|
||||
Restarting --> Running: New browser instance
|
||||
|
||||
Stopped --> CDP_Mode: crwl cdp
|
||||
CDP_Mode --> CDP_Running: Remote debugging active
|
||||
CDP_Running --> CDP_Mode: Manual close
|
||||
CDP_Mode --> Stopped: Exit CDP
|
||||
|
||||
Running --> StatusCheck: crwl browser status
|
||||
StatusCheck --> Running: Return status
|
||||
|
||||
note right of Running : Port 9222 active\nBuiltin browser available
|
||||
note right of CDP_Running : Remote debugging\nManual control enabled
|
||||
note right of Viewing : Visual browser window\nDirect interaction
|
||||
```
|
||||
|
||||
### Authentication Workflow for Protected Sites
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Protected Site Access Needed] --> B[Create Profile Strategy]
|
||||
|
||||
B --> C{Existing Profile?}
|
||||
C -->|Yes| D[Test Profile Validity]
|
||||
C -->|No| E[Create New Profile]
|
||||
|
||||
D --> D1{Profile Valid?}
|
||||
D1 -->|Yes| F[Use Existing Profile]
|
||||
D1 -->|No| E
|
||||
|
||||
E --> E1[crwl profiles]
|
||||
E1 --> E2[Select Create New Profile]
|
||||
E2 --> E3[Enter Profile Name]
|
||||
E3 --> E4[Browser Opens for Auth]
|
||||
|
||||
E4 --> E5{Authentication Method?}
|
||||
E5 -->|Login Form| E6[Fill Username/Password]
|
||||
E5 -->|OAuth| E7[OAuth Flow]
|
||||
E5 -->|2FA| E8[Handle 2FA]
|
||||
E5 -->|Session Cookie| E9[Import Cookies]
|
||||
|
||||
E6 --> E10[Manual Login Process]
|
||||
E7 --> E10
|
||||
E8 --> E10
|
||||
E9 --> E10
|
||||
|
||||
E10 --> E11[Verify Authentication]
|
||||
E11 --> E12{Auth Successful?}
|
||||
E12 -->|Yes| E13[Save Profile - Press q]
|
||||
E12 -->|No| E10
|
||||
|
||||
E13 --> F
|
||||
F --> G[Execute Authenticated Crawl]
|
||||
|
||||
G --> H[crwl URL -p profile-name]
|
||||
H --> I[Load Profile Data]
|
||||
I --> J[Launch Browser with Auth]
|
||||
J --> K[Navigate to Protected Content]
|
||||
K --> L[Extract Authenticated Data]
|
||||
L --> M[Return Results]
|
||||
|
||||
style E4 fill:#fff3e0
|
||||
style E10 fill:#e3f2fd
|
||||
style F fill:#e8f5e8
|
||||
style M fill:#c8e6c9
|
||||
```
|
||||
|
||||
### CDP Browser Architecture
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "CLI Layer"
|
||||
A[crwl cdp command] --> B[CDP Manager]
|
||||
B --> C[Port Configuration]
|
||||
B --> D[Profile Selection]
|
||||
end
|
||||
|
||||
subgraph "Browser Process"
|
||||
E[Chromium/Firefox] --> F[Remote Debugging]
|
||||
F --> G[WebSocket Endpoint]
|
||||
G --> H[ws://localhost:9222]
|
||||
end
|
||||
|
||||
subgraph "Client Connections"
|
||||
I[Manual Browser Control] --> H
|
||||
J[DevTools Interface] --> H
|
||||
K[External Automation] --> H
|
||||
L[Crawl4AI Crawler] --> H
|
||||
end
|
||||
|
||||
subgraph "Profile Data"
|
||||
M[User Data Directory] --> E
|
||||
N[Cookies & Sessions] --> M
|
||||
O[Extensions] --> M
|
||||
P[Browser State] --> M
|
||||
end
|
||||
|
||||
A --> E
|
||||
C --> H
|
||||
D --> M
|
||||
|
||||
style H fill:#e3f2fd
|
||||
style E fill:#f3e5f5
|
||||
style M fill:#e8f5e8
|
||||
```
|
||||
|
||||
### Configuration Management Hierarchy
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
subgraph "Global Configuration"
|
||||
A[~/.crawl4ai/config.yml] --> B[Default Settings]
|
||||
B --> C[LLM Providers]
|
||||
B --> D[Browser Defaults]
|
||||
B --> E[Output Preferences]
|
||||
end
|
||||
|
||||
subgraph "Profile Configuration"
|
||||
F[Profile Directory] --> G[Browser State]
|
||||
F --> H[Authentication Data]
|
||||
F --> I[Site-Specific Settings]
|
||||
end
|
||||
|
||||
subgraph "Command-Line Overrides"
|
||||
J[-b browser_config] --> K[Runtime Browser Settings]
|
||||
L[-c crawler_config] --> M[Runtime Crawler Settings]
|
||||
N[-o output_format] --> O[Runtime Output Format]
|
||||
end
|
||||
|
||||
subgraph "Configuration Files"
|
||||
P[browser.yml] --> Q[Browser Config Template]
|
||||
R[crawler.yml] --> S[Crawler Config Template]
|
||||
T[extract.yml] --> U[Extraction Config]
|
||||
end
|
||||
|
||||
subgraph "Resolution Order"
|
||||
V[Command Line Args] --> W[Config Files]
|
||||
W --> X[Profile Settings]
|
||||
X --> Y[Global Defaults]
|
||||
end
|
||||
|
||||
J --> V
|
||||
L --> V
|
||||
N --> V
|
||||
P --> W
|
||||
R --> W
|
||||
T --> W
|
||||
F --> X
|
||||
A --> Y
|
||||
|
||||
style V fill:#ffcdd2
|
||||
style W fill:#fff3e0
|
||||
style X fill:#e3f2fd
|
||||
style Y fill:#e8f5e8
|
||||
```
|
||||
|
||||
### Identity-Based Crawling Decision Tree
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Target Website Assessment] --> B{Authentication Required?}
|
||||
|
||||
B -->|No| C[Standard Anonymous Crawl]
|
||||
B -->|Yes| D{Authentication Type?}
|
||||
|
||||
D -->|Login Form| E[Create Login Profile]
|
||||
D -->|OAuth/SSO| F[Create OAuth Profile]
|
||||
D -->|API Key/Token| G[Use Headers/Config]
|
||||
D -->|Session Cookies| H[Import Cookie Profile]
|
||||
|
||||
E --> E1[crwl profiles → Manual login]
|
||||
F --> F1[crwl profiles → OAuth flow]
|
||||
G --> G1[Configure headers in crawler config]
|
||||
H --> H1[Import cookies to profile]
|
||||
|
||||
E1 --> I[Test Authentication]
|
||||
F1 --> I
|
||||
G1 --> I
|
||||
H1 --> I
|
||||
|
||||
I --> J{Auth Test Success?}
|
||||
J -->|Yes| K[Production Crawl Setup]
|
||||
J -->|No| L[Debug Authentication]
|
||||
|
||||
L --> L1{Common Issues?}
|
||||
L1 -->|Rate Limiting| L2[Add delays/user simulation]
|
||||
L1 -->|Bot Detection| L3[Enable stealth mode]
|
||||
L1 -->|Session Expired| L4[Refresh authentication]
|
||||
L1 -->|CAPTCHA| L5[Manual intervention needed]
|
||||
|
||||
L2 --> M[Retry with Adjustments]
|
||||
L3 --> M
|
||||
L4 --> E1
|
||||
L5 --> N[Semi-automated approach]
|
||||
|
||||
M --> I
|
||||
N --> O[Manual auth + automated crawl]
|
||||
|
||||
K --> P[Automated Authenticated Crawling]
|
||||
O --> P
|
||||
C --> P
|
||||
|
||||
P --> Q[Monitor & Maintain Profiles]
|
||||
|
||||
style I fill:#fff3e0
|
||||
style K fill:#e8f5e8
|
||||
style P fill:#c8e6c9
|
||||
style L fill:#ffcdd2
|
||||
style N fill:#f3e5f5
|
||||
```
|
||||
|
||||
### CLI Usage Patterns and Best Practices
|
||||
|
||||
```mermaid
|
||||
timeline
|
||||
title CLI Workflow Evolution
|
||||
|
||||
section Setup Phase
|
||||
Installation : pip install crawl4ai
|
||||
: crawl4ai-setup
|
||||
Basic Test : crwl https://example.com
|
||||
Config Setup : crwl config set defaults
|
||||
|
||||
section Profile Creation
|
||||
Site Analysis : Identify auth requirements
|
||||
Profile Creation : crwl profiles
|
||||
Manual Login : Authenticate in browser
|
||||
Profile Save : Press 'q' to save
|
||||
|
||||
section Development Phase
|
||||
Test Crawls : crwl URL -p profile -v
|
||||
Config Tuning : Adjust browser/crawler settings
|
||||
Output Testing : Try different output formats
|
||||
Error Handling : Debug authentication issues
|
||||
|
||||
section Production Phase
|
||||
Automated Crawls : crwl URL -p profile -o json
|
||||
Batch Processing : Multiple URLs with same profile
|
||||
Monitoring : Check profile validity
|
||||
Maintenance : Update profiles as needed
|
||||
```
|
||||
|
||||
### Multi-Profile Management Strategy
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
subgraph "Profile Categories"
|
||||
A[Social Media Profiles]
|
||||
B[Work/Enterprise Profiles]
|
||||
C[E-commerce Profiles]
|
||||
D[Research Profiles]
|
||||
end
|
||||
|
||||
subgraph "Social Media"
|
||||
A --> A1[linkedin-personal]
|
||||
A --> A2[twitter-monitor]
|
||||
A --> A3[facebook-research]
|
||||
A --> A4[instagram-brand]
|
||||
end
|
||||
|
||||
subgraph "Enterprise"
|
||||
B --> B1[company-intranet]
|
||||
B --> B2[github-enterprise]
|
||||
B --> B3[confluence-docs]
|
||||
B --> B4[jira-tickets]
|
||||
end
|
||||
|
||||
subgraph "E-commerce"
|
||||
C --> C1[amazon-seller]
|
||||
C --> C2[shopify-admin]
|
||||
C --> C3[ebay-monitor]
|
||||
C --> C4[marketplace-competitor]
|
||||
end
|
||||
|
||||
subgraph "Research"
|
||||
D --> D1[academic-journals]
|
||||
D --> D2[data-platforms]
|
||||
D --> D3[survey-tools]
|
||||
D --> D4[government-portals]
|
||||
end
|
||||
|
||||
subgraph "Usage Patterns"
|
||||
E[Daily Monitoring] --> A2
|
||||
E --> B1
|
||||
F[Weekly Reports] --> C3
|
||||
F --> D2
|
||||
G[On-Demand Research] --> D1
|
||||
G --> D4
|
||||
H[Competitive Analysis] --> C4
|
||||
H --> A4
|
||||
end
|
||||
|
||||
style A1 fill:#e3f2fd
|
||||
style B1 fill:#f3e5f5
|
||||
style C1 fill:#e8f5e8
|
||||
style D1 fill:#fff3e0
|
||||
```
|
||||
|
||||
**📖 Learn more:** [CLI Reference](https://docs.crawl4ai.com/core/cli/), [Identity-Based Crawling](https://docs.crawl4ai.com/advanced/identity-based-crawling/), [Profile Management](https://docs.crawl4ai.com/advanced/session-management/), [Authentication Strategies](https://docs.crawl4ai.com/advanced/hooks-auth/)
|
||||
1421
docs/md_v2/assets/llm.txt/diagrams/config_objects.txt
Normal file
1421
docs/md_v2/assets/llm.txt/diagrams/config_objects.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,401 @@
|
||||
## Deep Crawling Filters & Scorers Architecture
|
||||
|
||||
Visual representations of advanced URL filtering, scoring strategies, and performance optimization workflows for intelligent deep crawling.
|
||||
|
||||
### Filter Chain Processing Pipeline
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[URL Input] --> B{Domain Filter}
|
||||
B -->|✓ Pass| C{Pattern Filter}
|
||||
B -->|✗ Fail| X1[Reject: Invalid Domain]
|
||||
|
||||
C -->|✓ Pass| D{Content Type Filter}
|
||||
C -->|✗ Fail| X2[Reject: Pattern Mismatch]
|
||||
|
||||
D -->|✓ Pass| E{SEO Filter}
|
||||
D -->|✗ Fail| X3[Reject: Wrong Content Type]
|
||||
|
||||
E -->|✓ Pass| F{Content Relevance Filter}
|
||||
E -->|✗ Fail| X4[Reject: Low SEO Score]
|
||||
|
||||
F -->|✓ Pass| G[URL Accepted]
|
||||
F -->|✗ Fail| X5[Reject: Low Relevance]
|
||||
|
||||
G --> H[Add to Crawl Queue]
|
||||
|
||||
subgraph "Fast Filters"
|
||||
B
|
||||
C
|
||||
D
|
||||
end
|
||||
|
||||
subgraph "Slow Filters"
|
||||
E
|
||||
F
|
||||
end
|
||||
|
||||
style A fill:#e3f2fd
|
||||
style G fill:#c8e6c9
|
||||
style H fill:#e8f5e8
|
||||
style X1 fill:#ffcdd2
|
||||
style X2 fill:#ffcdd2
|
||||
style X3 fill:#ffcdd2
|
||||
style X4 fill:#ffcdd2
|
||||
style X5 fill:#ffcdd2
|
||||
```
|
||||
|
||||
### URL Scoring System Architecture
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "Input URL"
|
||||
A[https://python.org/tutorial/2024/ml-guide.html]
|
||||
end
|
||||
|
||||
subgraph "Individual Scorers"
|
||||
B[Keyword Relevance Scorer]
|
||||
C[Path Depth Scorer]
|
||||
D[Content Type Scorer]
|
||||
E[Freshness Scorer]
|
||||
F[Domain Authority Scorer]
|
||||
end
|
||||
|
||||
subgraph "Scoring Process"
|
||||
B --> B1[Keywords: python, tutorial, ml<br/>Score: 0.85]
|
||||
C --> C1[Depth: 4 levels<br/>Optimal: 3<br/>Score: 0.75]
|
||||
D --> D1[Content: HTML<br/>Score: 1.0]
|
||||
E --> E1[Year: 2024<br/>Score: 1.0]
|
||||
F --> F1[Domain: python.org<br/>Score: 1.0]
|
||||
end
|
||||
|
||||
subgraph "Composite Scoring"
|
||||
G[Weighted Combination]
|
||||
B1 --> G
|
||||
C1 --> G
|
||||
D1 --> G
|
||||
E1 --> G
|
||||
F1 --> G
|
||||
end
|
||||
|
||||
subgraph "Final Result"
|
||||
H[Composite Score: 0.92]
|
||||
I{Score > Threshold?}
|
||||
J[Accept URL]
|
||||
K[Reject URL]
|
||||
end
|
||||
|
||||
A --> B
|
||||
A --> C
|
||||
A --> D
|
||||
A --> E
|
||||
A --> F
|
||||
|
||||
G --> H
|
||||
H --> I
|
||||
I -->|✓ 0.92 > 0.6| J
|
||||
I -->|✗ Score too low| K
|
||||
|
||||
style A fill:#e3f2fd
|
||||
style G fill:#fff3e0
|
||||
style H fill:#e8f5e8
|
||||
style J fill:#c8e6c9
|
||||
style K fill:#ffcdd2
|
||||
```
|
||||
|
||||
### Filter vs Scorer Decision Matrix
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[URL Processing Decision] --> B{Binary Decision Needed?}
|
||||
|
||||
B -->|Yes - Include/Exclude| C[Use Filters]
|
||||
B -->|No - Quality Rating| D[Use Scorers]
|
||||
|
||||
C --> C1{Filter Type Needed?}
|
||||
C1 -->|Domain Control| C2[DomainFilter]
|
||||
C1 -->|Pattern Matching| C3[URLPatternFilter]
|
||||
C1 -->|Content Type| C4[ContentTypeFilter]
|
||||
C1 -->|SEO Quality| C5[SEOFilter]
|
||||
C1 -->|Content Relevance| C6[ContentRelevanceFilter]
|
||||
|
||||
D --> D1{Scoring Criteria?}
|
||||
D1 -->|Keyword Relevance| D2[KeywordRelevanceScorer]
|
||||
D1 -->|URL Structure| D3[PathDepthScorer]
|
||||
D1 -->|Content Quality| D4[ContentTypeScorer]
|
||||
D1 -->|Time Sensitivity| D5[FreshnessScorer]
|
||||
D1 -->|Source Authority| D6[DomainAuthorityScorer]
|
||||
|
||||
C2 --> E[Chain Filters]
|
||||
C3 --> E
|
||||
C4 --> E
|
||||
C5 --> E
|
||||
C6 --> E
|
||||
|
||||
D2 --> F[Composite Scorer]
|
||||
D3 --> F
|
||||
D4 --> F
|
||||
D5 --> F
|
||||
D6 --> F
|
||||
|
||||
E --> G[Binary Output: Pass/Fail]
|
||||
F --> H[Numeric Score: 0.0-1.0]
|
||||
|
||||
G --> I[Apply to URL Queue]
|
||||
H --> J[Priority Ranking]
|
||||
|
||||
style C fill:#e8f5e8
|
||||
style D fill:#fff3e0
|
||||
style E fill:#f3e5f5
|
||||
style F fill:#e3f2fd
|
||||
style G fill:#c8e6c9
|
||||
style H fill:#ffecb3
|
||||
```
|
||||
|
||||
### Performance Optimization Strategy
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant Queue as URL Queue
|
||||
participant Fast as Fast Filters
|
||||
participant Slow as Slow Filters
|
||||
participant Score as Scorers
|
||||
participant Output as Filtered URLs
|
||||
|
||||
Note over Queue, Output: Batch Processing (1000 URLs)
|
||||
|
||||
Queue->>Fast: Apply Domain Filter
|
||||
Fast-->>Queue: 60% passed (600 URLs)
|
||||
|
||||
Queue->>Fast: Apply Pattern Filter
|
||||
Fast-->>Queue: 70% passed (420 URLs)
|
||||
|
||||
Queue->>Fast: Apply Content Type Filter
|
||||
Fast-->>Queue: 90% passed (378 URLs)
|
||||
|
||||
Note over Fast: Fast filters eliminate 62% of URLs
|
||||
|
||||
Queue->>Slow: Apply SEO Filter (378 URLs)
|
||||
Slow-->>Queue: 80% passed (302 URLs)
|
||||
|
||||
Queue->>Slow: Apply Relevance Filter
|
||||
Slow-->>Queue: 75% passed (227 URLs)
|
||||
|
||||
Note over Slow: Content analysis on remaining URLs
|
||||
|
||||
Queue->>Score: Calculate Composite Scores
|
||||
Score-->>Queue: Scored and ranked
|
||||
|
||||
Queue->>Output: Top 100 URLs by score
|
||||
Output-->>Queue: Processing complete
|
||||
|
||||
Note over Queue, Output: Total: 90% filtered out, 10% high-quality URLs retained
|
||||
```
|
||||
|
||||
### Custom Filter Implementation Flow
|
||||
|
||||
```mermaid
|
||||
stateDiagram-v2
|
||||
[*] --> Planning
|
||||
|
||||
Planning --> IdentifyNeeds: Define filtering criteria
|
||||
IdentifyNeeds --> ChooseType: Binary vs Scoring decision
|
||||
|
||||
ChooseType --> FilterImpl: Binary decision needed
|
||||
ChooseType --> ScorerImpl: Quality rating needed
|
||||
|
||||
FilterImpl --> InheritURLFilter: Extend URLFilter base class
|
||||
ScorerImpl --> InheritURLScorer: Extend URLScorer base class
|
||||
|
||||
InheritURLFilter --> ImplementApply: def apply(url) -> bool
|
||||
InheritURLScorer --> ImplementScore: def _calculate_score(url) -> float
|
||||
|
||||
ImplementApply --> AddLogic: Add custom filtering logic
|
||||
ImplementScore --> AddLogic
|
||||
|
||||
AddLogic --> TestFilter: Unit testing
|
||||
TestFilter --> OptimizePerf: Performance optimization
|
||||
|
||||
OptimizePerf --> Integration: Integrate with FilterChain
|
||||
Integration --> Production: Deploy to production
|
||||
|
||||
Production --> Monitor: Monitor performance
|
||||
Monitor --> Tune: Tune parameters
|
||||
Tune --> Production
|
||||
|
||||
note right of Planning : Consider performance impact
|
||||
note right of AddLogic : Handle edge cases
|
||||
note right of OptimizePerf : Cache frequently accessed data
|
||||
```
|
||||
|
||||
### Filter Chain Optimization Patterns
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "Naive Approach - Poor Performance"
|
||||
A1[All URLs] --> B1[Slow Filter 1]
|
||||
B1 --> C1[Slow Filter 2]
|
||||
C1 --> D1[Fast Filter 1]
|
||||
D1 --> E1[Fast Filter 2]
|
||||
E1 --> F1[Final Results]
|
||||
|
||||
B1 -.->|High CPU| G1[Performance Issues]
|
||||
C1 -.->|Network Calls| G1
|
||||
end
|
||||
|
||||
subgraph "Optimized Approach - High Performance"
|
||||
A2[All URLs] --> B2[Fast Filter 1]
|
||||
B2 --> C2[Fast Filter 2]
|
||||
C2 --> D2[Batch Process]
|
||||
D2 --> E2[Slow Filter 1]
|
||||
E2 --> F2[Slow Filter 2]
|
||||
F2 --> G2[Final Results]
|
||||
|
||||
D2 --> H2[Concurrent Processing]
|
||||
H2 --> I2[Semaphore Control]
|
||||
end
|
||||
|
||||
subgraph "Performance Metrics"
|
||||
J[Processing Time]
|
||||
K[Memory Usage]
|
||||
L[CPU Utilization]
|
||||
M[Network Requests]
|
||||
end
|
||||
|
||||
G1 -.-> J
|
||||
G1 -.-> K
|
||||
G1 -.-> L
|
||||
G1 -.-> M
|
||||
|
||||
G2 -.-> J
|
||||
G2 -.-> K
|
||||
G2 -.-> L
|
||||
G2 -.-> M
|
||||
|
||||
style A1 fill:#ffcdd2
|
||||
style G1 fill:#ffcdd2
|
||||
style A2 fill:#c8e6c9
|
||||
style G2 fill:#c8e6c9
|
||||
style H2 fill:#e8f5e8
|
||||
style I2 fill:#e8f5e8
|
||||
```
|
||||
|
||||
### Composite Scoring Weight Distribution
|
||||
|
||||
```mermaid
|
||||
pie title Composite Scorer Weight Distribution
|
||||
"Keyword Relevance (30%)" : 30
|
||||
"Domain Authority (25%)" : 25
|
||||
"Content Type (20%)" : 20
|
||||
"Freshness (15%)" : 15
|
||||
"Path Depth (10%)" : 10
|
||||
```
|
||||
|
||||
### Deep Crawl Integration Architecture
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
subgraph "Deep Crawl Strategy"
|
||||
A[Start URL] --> B[Extract Links]
|
||||
B --> C[Apply Filter Chain]
|
||||
C --> D[Calculate Scores]
|
||||
D --> E[Priority Queue]
|
||||
E --> F[Crawl Next URL]
|
||||
F --> B
|
||||
end
|
||||
|
||||
subgraph "Filter Chain Components"
|
||||
C --> C1[Domain Filter]
|
||||
C --> C2[Pattern Filter]
|
||||
C --> C3[Content Filter]
|
||||
C --> C4[SEO Filter]
|
||||
C --> C5[Relevance Filter]
|
||||
end
|
||||
|
||||
subgraph "Scoring Components"
|
||||
D --> D1[Keyword Scorer]
|
||||
D --> D2[Depth Scorer]
|
||||
D --> D3[Freshness Scorer]
|
||||
D --> D4[Authority Scorer]
|
||||
D --> D5[Composite Score]
|
||||
end
|
||||
|
||||
subgraph "Queue Management"
|
||||
E --> E1{Score > Threshold?}
|
||||
E1 -->|Yes| E2[High Priority Queue]
|
||||
E1 -->|No| E3[Low Priority Queue]
|
||||
E2 --> F
|
||||
E3 --> G[Delayed Processing]
|
||||
end
|
||||
|
||||
subgraph "Control Flow"
|
||||
H{Max Depth Reached?}
|
||||
I{Max Pages Reached?}
|
||||
J[Stop Crawling]
|
||||
end
|
||||
|
||||
F --> H
|
||||
H -->|No| I
|
||||
H -->|Yes| J
|
||||
I -->|No| B
|
||||
I -->|Yes| J
|
||||
|
||||
style A fill:#e3f2fd
|
||||
style E2 fill:#c8e6c9
|
||||
style E3 fill:#fff3e0
|
||||
style J fill:#ffcdd2
|
||||
```
|
||||
|
||||
### Filter Performance Comparison
|
||||
|
||||
```mermaid
|
||||
xychart-beta
|
||||
title "Filter Performance Comparison (1000 URLs)"
|
||||
x-axis [Domain, Pattern, ContentType, SEO, Relevance]
|
||||
y-axis "Processing Time (ms)" 0 --> 1000
|
||||
bar [50, 80, 45, 300, 800]
|
||||
```
|
||||
|
||||
### Scoring Algorithm Workflow
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Input URL] --> B[Parse URL Components]
|
||||
B --> C[Extract Features]
|
||||
|
||||
C --> D[Domain Analysis]
|
||||
C --> E[Path Analysis]
|
||||
C --> F[Content Type Detection]
|
||||
C --> G[Keyword Extraction]
|
||||
C --> H[Freshness Detection]
|
||||
|
||||
D --> I[Domain Authority Score]
|
||||
E --> J[Path Depth Score]
|
||||
F --> K[Content Type Score]
|
||||
G --> L[Keyword Relevance Score]
|
||||
H --> M[Freshness Score]
|
||||
|
||||
I --> N[Apply Weights]
|
||||
J --> N
|
||||
K --> N
|
||||
L --> N
|
||||
M --> N
|
||||
|
||||
N --> O[Normalize Scores]
|
||||
O --> P[Calculate Final Score]
|
||||
P --> Q{Score >= Threshold?}
|
||||
|
||||
Q -->|Yes| R[Accept for Crawling]
|
||||
Q -->|No| S[Reject URL]
|
||||
|
||||
R --> T[Add to Priority Queue]
|
||||
S --> U[Log Rejection Reason]
|
||||
|
||||
style A fill:#e3f2fd
|
||||
style P fill:#fff3e0
|
||||
style R fill:#c8e6c9
|
||||
style S fill:#ffcdd2
|
||||
style T fill:#e8f5e8
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Deep Crawling Strategy](https://docs.crawl4ai.com/core/deep-crawling/), [Performance Optimization](https://docs.crawl4ai.com/advanced/performance-tuning/), [Custom Implementations](https://docs.crawl4ai.com/advanced/custom-filters/)
|
||||
428
docs/md_v2/assets/llm.txt/diagrams/deep_crawling.txt
Normal file
428
docs/md_v2/assets/llm.txt/diagrams/deep_crawling.txt
Normal file
@@ -0,0 +1,428 @@
|
||||
## Deep Crawling Workflows and Architecture
|
||||
|
||||
Visual representations of multi-level website exploration, filtering strategies, and intelligent crawling patterns.
|
||||
|
||||
### Deep Crawl Strategy Overview
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Start Deep Crawl] --> B{Strategy Selection}
|
||||
|
||||
B -->|Explore All Levels| C[BFS Strategy]
|
||||
B -->|Dive Deep Fast| D[DFS Strategy]
|
||||
B -->|Smart Prioritization| E[Best-First Strategy]
|
||||
|
||||
C --> C1[Breadth-First Search]
|
||||
C1 --> C2[Process all depth 0 links]
|
||||
C2 --> C3[Process all depth 1 links]
|
||||
C3 --> C4[Continue by depth level]
|
||||
|
||||
D --> D1[Depth-First Search]
|
||||
D1 --> D2[Follow first link deeply]
|
||||
D2 --> D3[Backtrack when max depth reached]
|
||||
D3 --> D4[Continue with next branch]
|
||||
|
||||
E --> E1[Best-First Search]
|
||||
E1 --> E2[Score all discovered URLs]
|
||||
E2 --> E3[Process highest scoring URLs first]
|
||||
E3 --> E4[Continuously re-prioritize queue]
|
||||
|
||||
C4 --> F[Apply Filters]
|
||||
D4 --> F
|
||||
E4 --> F
|
||||
|
||||
F --> G{Filter Chain Processing}
|
||||
G -->|Domain Filter| G1[Check allowed/blocked domains]
|
||||
G -->|URL Pattern Filter| G2[Match URL patterns]
|
||||
G -->|Content Type Filter| G3[Verify content types]
|
||||
G -->|SEO Filter| G4[Evaluate SEO quality]
|
||||
G -->|Content Relevance| G5[Score content relevance]
|
||||
|
||||
G1 --> H{Passed All Filters?}
|
||||
G2 --> H
|
||||
G3 --> H
|
||||
G4 --> H
|
||||
G5 --> H
|
||||
|
||||
H -->|Yes| I[Add to Crawl Queue]
|
||||
H -->|No| J[Discard URL]
|
||||
|
||||
I --> K{Processing Mode}
|
||||
K -->|Streaming| L[Process Immediately]
|
||||
K -->|Batch| M[Collect All Results]
|
||||
|
||||
L --> N[Stream Result to User]
|
||||
M --> O[Return Complete Result Set]
|
||||
|
||||
J --> P{More URLs in Queue?}
|
||||
N --> P
|
||||
O --> P
|
||||
|
||||
P -->|Yes| Q{Within Limits?}
|
||||
P -->|No| R[Deep Crawl Complete]
|
||||
|
||||
Q -->|Max Depth OK| S{Max Pages OK}
|
||||
Q -->|Max Depth Exceeded| T[Skip Deeper URLs]
|
||||
|
||||
S -->|Under Limit| U[Continue Crawling]
|
||||
S -->|Limit Reached| R
|
||||
|
||||
T --> P
|
||||
U --> F
|
||||
|
||||
style A fill:#e1f5fe
|
||||
style R fill:#c8e6c9
|
||||
style C fill:#fff3e0
|
||||
style D fill:#f3e5f5
|
||||
style E fill:#e8f5e8
|
||||
```
|
||||
|
||||
### Deep Crawl Strategy Comparison
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "BFS - Breadth-First Search"
|
||||
BFS1[Level 0: Start URL]
|
||||
BFS2[Level 1: All direct links]
|
||||
BFS3[Level 2: All second-level links]
|
||||
BFS4[Level 3: All third-level links]
|
||||
|
||||
BFS1 --> BFS2
|
||||
BFS2 --> BFS3
|
||||
BFS3 --> BFS4
|
||||
|
||||
BFS_NOTE[Complete each depth before going deeper<br/>Good for site mapping<br/>Memory intensive for wide sites]
|
||||
end
|
||||
|
||||
subgraph "DFS - Depth-First Search"
|
||||
DFS1[Start URL]
|
||||
DFS2[First Link → Deep]
|
||||
DFS3[Follow until max depth]
|
||||
DFS4[Backtrack and try next]
|
||||
|
||||
DFS1 --> DFS2
|
||||
DFS2 --> DFS3
|
||||
DFS3 --> DFS4
|
||||
DFS4 --> DFS2
|
||||
|
||||
DFS_NOTE[Go deep on first path<br/>Memory efficient<br/>May miss important pages]
|
||||
end
|
||||
|
||||
subgraph "Best-First - Priority Queue"
|
||||
BF1[Start URL]
|
||||
BF2[Score all discovered links]
|
||||
BF3[Process highest scoring first]
|
||||
BF4[Continuously re-prioritize]
|
||||
|
||||
BF1 --> BF2
|
||||
BF2 --> BF3
|
||||
BF3 --> BF4
|
||||
BF4 --> BF2
|
||||
|
||||
BF_NOTE[Intelligent prioritization<br/>Finds relevant content fast<br/>Recommended for most use cases]
|
||||
end
|
||||
|
||||
style BFS1 fill:#e3f2fd
|
||||
style DFS1 fill:#f3e5f5
|
||||
style BF1 fill:#e8f5e8
|
||||
style BFS_NOTE fill:#fff3e0
|
||||
style DFS_NOTE fill:#fff3e0
|
||||
style BF_NOTE fill:#fff3e0
|
||||
```
|
||||
|
||||
### Filter Chain Processing Sequence
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant URL as Discovered URL
|
||||
participant Chain as Filter Chain
|
||||
participant Domain as Domain Filter
|
||||
participant Pattern as URL Pattern Filter
|
||||
participant Content as Content Type Filter
|
||||
participant SEO as SEO Filter
|
||||
participant Relevance as Content Relevance Filter
|
||||
participant Queue as Crawl Queue
|
||||
|
||||
URL->>Chain: Process URL
|
||||
Chain->>Domain: Check domain rules
|
||||
|
||||
alt Domain Allowed
|
||||
Domain-->>Chain: ✓ Pass
|
||||
Chain->>Pattern: Check URL patterns
|
||||
|
||||
alt Pattern Matches
|
||||
Pattern-->>Chain: ✓ Pass
|
||||
Chain->>Content: Check content type
|
||||
|
||||
alt Content Type Valid
|
||||
Content-->>Chain: ✓ Pass
|
||||
Chain->>SEO: Evaluate SEO quality
|
||||
|
||||
alt SEO Score Above Threshold
|
||||
SEO-->>Chain: ✓ Pass
|
||||
Chain->>Relevance: Score content relevance
|
||||
|
||||
alt Relevance Score High
|
||||
Relevance-->>Chain: ✓ Pass
|
||||
Chain->>Queue: Add to crawl queue
|
||||
Queue-->>URL: Queued for crawling
|
||||
else Relevance Score Low
|
||||
Relevance-->>Chain: ✗ Reject
|
||||
Chain-->>URL: Filtered out - Low relevance
|
||||
end
|
||||
else SEO Score Low
|
||||
SEO-->>Chain: ✗ Reject
|
||||
Chain-->>URL: Filtered out - Poor SEO
|
||||
end
|
||||
else Invalid Content Type
|
||||
Content-->>Chain: ✗ Reject
|
||||
Chain-->>URL: Filtered out - Wrong content type
|
||||
end
|
||||
else Pattern Mismatch
|
||||
Pattern-->>Chain: ✗ Reject
|
||||
Chain-->>URL: Filtered out - Pattern mismatch
|
||||
end
|
||||
else Domain Blocked
|
||||
Domain-->>Chain: ✗ Reject
|
||||
Chain-->>URL: Filtered out - Blocked domain
|
||||
end
|
||||
```
|
||||
|
||||
### URL Lifecycle State Machine
|
||||
|
||||
```mermaid
|
||||
stateDiagram-v2
|
||||
[*] --> Discovered: Found on page
|
||||
|
||||
Discovered --> FilterPending: Enter filter chain
|
||||
|
||||
FilterPending --> DomainCheck: Apply domain filter
|
||||
DomainCheck --> PatternCheck: Domain allowed
|
||||
DomainCheck --> Rejected: Domain blocked
|
||||
|
||||
PatternCheck --> ContentCheck: Pattern matches
|
||||
PatternCheck --> Rejected: Pattern mismatch
|
||||
|
||||
ContentCheck --> SEOCheck: Content type valid
|
||||
ContentCheck --> Rejected: Invalid content
|
||||
|
||||
SEOCheck --> RelevanceCheck: SEO score sufficient
|
||||
SEOCheck --> Rejected: Poor SEO score
|
||||
|
||||
RelevanceCheck --> Scored: Relevance score calculated
|
||||
RelevanceCheck --> Rejected: Low relevance
|
||||
|
||||
Scored --> Queued: Added to priority queue
|
||||
|
||||
Queued --> Crawling: Selected for processing
|
||||
Crawling --> Success: Page crawled successfully
|
||||
Crawling --> Failed: Crawl failed
|
||||
|
||||
Success --> LinkExtraction: Extract new links
|
||||
LinkExtraction --> [*]: Process complete
|
||||
|
||||
Failed --> [*]: Record failure
|
||||
Rejected --> [*]: Log rejection reason
|
||||
|
||||
note right of Scored : Score determines priority<br/>in Best-First strategy
|
||||
|
||||
note right of Failed : Errors logged with<br/>depth and reason
|
||||
```
|
||||
|
||||
### Streaming vs Batch Processing Architecture
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "Input"
|
||||
A[Start URL] --> B[Deep Crawl Strategy]
|
||||
end
|
||||
|
||||
subgraph "Crawl Engine"
|
||||
B --> C[URL Discovery]
|
||||
C --> D[Filter Chain]
|
||||
D --> E[Priority Queue]
|
||||
E --> F[Page Processor]
|
||||
end
|
||||
|
||||
subgraph "Streaming Mode stream=True"
|
||||
F --> G1[Process Page]
|
||||
G1 --> H1[Extract Content]
|
||||
H1 --> I1[Yield Result Immediately]
|
||||
I1 --> J1[async for result]
|
||||
J1 --> K1[Real-time Processing]
|
||||
|
||||
G1 --> L1[Extract Links]
|
||||
L1 --> M1[Add to Queue]
|
||||
M1 --> F
|
||||
end
|
||||
|
||||
subgraph "Batch Mode stream=False"
|
||||
F --> G2[Process Page]
|
||||
G2 --> H2[Extract Content]
|
||||
H2 --> I2[Store Result]
|
||||
I2 --> N2[Result Collection]
|
||||
|
||||
G2 --> L2[Extract Links]
|
||||
L2 --> M2[Add to Queue]
|
||||
M2 --> O2{More URLs?}
|
||||
O2 -->|Yes| F
|
||||
O2 -->|No| P2[Return All Results]
|
||||
P2 --> Q2[Batch Processing]
|
||||
end
|
||||
|
||||
style I1 fill:#e8f5e8
|
||||
style K1 fill:#e8f5e8
|
||||
style P2 fill:#e3f2fd
|
||||
style Q2 fill:#e3f2fd
|
||||
```
|
||||
|
||||
### Advanced Scoring and Prioritization System
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
subgraph "URL Discovery"
|
||||
A[Page Links] --> B[Extract URLs]
|
||||
B --> C[Normalize URLs]
|
||||
end
|
||||
|
||||
subgraph "Scoring System"
|
||||
C --> D[Keyword Relevance Scorer]
|
||||
D --> D1[URL Text Analysis]
|
||||
D --> D2[Keyword Matching]
|
||||
D --> D3[Calculate Base Score]
|
||||
|
||||
D3 --> E[Additional Scoring Factors]
|
||||
E --> E1[URL Structure weight: 0.2]
|
||||
E --> E2[Link Context weight: 0.3]
|
||||
E --> E3[Page Depth Penalty weight: 0.1]
|
||||
E --> E4[Domain Authority weight: 0.4]
|
||||
|
||||
D1 --> F[Combined Score]
|
||||
D2 --> F
|
||||
D3 --> F
|
||||
E1 --> F
|
||||
E2 --> F
|
||||
E3 --> F
|
||||
E4 --> F
|
||||
end
|
||||
|
||||
subgraph "Prioritization"
|
||||
F --> G{Score Threshold}
|
||||
G -->|Above Threshold| H[Priority Queue]
|
||||
G -->|Below Threshold| I[Discard URL]
|
||||
|
||||
H --> J[Best-First Selection]
|
||||
J --> K[Highest Score First]
|
||||
K --> L[Process Page]
|
||||
|
||||
L --> M[Update Scores]
|
||||
M --> N[Re-prioritize Queue]
|
||||
N --> J
|
||||
end
|
||||
|
||||
style F fill:#fff3e0
|
||||
style H fill:#e8f5e8
|
||||
style L fill:#e3f2fd
|
||||
```
|
||||
|
||||
### Deep Crawl Performance and Limits
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
subgraph "Crawl Constraints"
|
||||
A[Max Depth: 2] --> A1[Prevents infinite crawling]
|
||||
B[Max Pages: 50] --> B1[Controls resource usage]
|
||||
C[Score Threshold: 0.3] --> C1[Quality filtering]
|
||||
D[Domain Limits] --> D1[Scope control]
|
||||
end
|
||||
|
||||
subgraph "Performance Monitoring"
|
||||
E[Pages Crawled] --> F[Depth Distribution]
|
||||
E --> G[Success Rate]
|
||||
E --> H[Average Score]
|
||||
E --> I[Processing Time]
|
||||
|
||||
F --> J[Performance Report]
|
||||
G --> J
|
||||
H --> J
|
||||
I --> J
|
||||
end
|
||||
|
||||
subgraph "Resource Management"
|
||||
K[Memory Usage] --> L{Memory Threshold}
|
||||
L -->|Under Limit| M[Continue Crawling]
|
||||
L -->|Over Limit| N[Reduce Concurrency]
|
||||
|
||||
O[CPU Usage] --> P{CPU Threshold}
|
||||
P -->|Normal| M
|
||||
P -->|High| Q[Add Delays]
|
||||
|
||||
R[Network Load] --> S{Rate Limits}
|
||||
S -->|OK| M
|
||||
S -->|Exceeded| T[Throttle Requests]
|
||||
end
|
||||
|
||||
M --> U[Optimal Performance]
|
||||
N --> V[Reduced Performance]
|
||||
Q --> V
|
||||
T --> V
|
||||
|
||||
style U fill:#c8e6c9
|
||||
style V fill:#fff3e0
|
||||
style J fill:#e3f2fd
|
||||
```
|
||||
|
||||
### Error Handling and Recovery Flow
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant Strategy as Deep Crawl Strategy
|
||||
participant Queue as Priority Queue
|
||||
participant Crawler as Page Crawler
|
||||
participant Error as Error Handler
|
||||
participant Result as Result Collector
|
||||
|
||||
Strategy->>Queue: Get next URL
|
||||
Queue-->>Strategy: Return highest priority URL
|
||||
|
||||
Strategy->>Crawler: Crawl page
|
||||
|
||||
alt Successful Crawl
|
||||
Crawler-->>Strategy: Return page content
|
||||
Strategy->>Result: Store successful result
|
||||
Strategy->>Strategy: Extract new links
|
||||
Strategy->>Queue: Add new URLs to queue
|
||||
else Network Error
|
||||
Crawler-->>Error: Network timeout/failure
|
||||
Error->>Error: Log error with details
|
||||
Error->>Queue: Mark URL as failed
|
||||
Error-->>Strategy: Skip to next URL
|
||||
else Parse Error
|
||||
Crawler-->>Error: HTML parsing failed
|
||||
Error->>Error: Log parse error
|
||||
Error->>Result: Store failed result
|
||||
Error-->>Strategy: Continue with next URL
|
||||
else Rate Limit Hit
|
||||
Crawler-->>Error: Rate limit exceeded
|
||||
Error->>Error: Apply backoff strategy
|
||||
Error->>Queue: Re-queue URL with delay
|
||||
Error-->>Strategy: Wait before retry
|
||||
else Depth Limit
|
||||
Strategy->>Strategy: Check depth constraint
|
||||
Strategy-->>Queue: Skip URL - too deep
|
||||
else Page Limit
|
||||
Strategy->>Strategy: Check page count
|
||||
Strategy-->>Result: Stop crawling - limit reached
|
||||
end
|
||||
|
||||
Strategy->>Queue: Request next URL
|
||||
Queue-->>Strategy: More URLs available?
|
||||
|
||||
alt Queue Empty
|
||||
Queue-->>Result: Crawl complete
|
||||
else Queue Has URLs
|
||||
Queue-->>Strategy: Continue crawling
|
||||
end
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Deep Crawling Strategies](https://docs.crawl4ai.com/core/deep-crawling/), [Content Filtering](https://docs.crawl4ai.com/core/content-selection/), [Advanced Crawling Patterns](https://docs.crawl4ai.com/advanced/advanced-features/)
|
||||
603
docs/md_v2/assets/llm.txt/diagrams/docker.txt
Normal file
603
docs/md_v2/assets/llm.txt/diagrams/docker.txt
Normal file
@@ -0,0 +1,603 @@
|
||||
## Docker Deployment Architecture and Workflows
|
||||
|
||||
Visual representations of Crawl4AI Docker deployment, API architecture, configuration management, and service interactions.
|
||||
|
||||
### Docker Deployment Decision Flow
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Start Docker Deployment] --> B{Deployment Type?}
|
||||
|
||||
B -->|Quick Start| C[Pre-built Image]
|
||||
B -->|Development| D[Docker Compose]
|
||||
B -->|Custom Build| E[Manual Build]
|
||||
B -->|Production| F[Production Setup]
|
||||
|
||||
C --> C1[docker pull unclecode/crawl4ai]
|
||||
C1 --> C2{Need LLM Support?}
|
||||
C2 -->|Yes| C3[Setup .llm.env]
|
||||
C2 -->|No| C4[Basic run]
|
||||
C3 --> C5[docker run with --env-file]
|
||||
C4 --> C6[docker run basic]
|
||||
|
||||
D --> D1[git clone repository]
|
||||
D1 --> D2[cp .llm.env.example .llm.env]
|
||||
D2 --> D3{Build Type?}
|
||||
D3 -->|Pre-built| D4[IMAGE=latest docker compose up]
|
||||
D3 -->|Local Build| D5[docker compose up --build]
|
||||
D3 -->|All Features| D6[INSTALL_TYPE=all docker compose up]
|
||||
|
||||
E --> E1[docker buildx build]
|
||||
E1 --> E2{Architecture?}
|
||||
E2 -->|Single| E3[--platform linux/amd64]
|
||||
E2 -->|Multi| E4[--platform linux/amd64,linux/arm64]
|
||||
E3 --> E5[Build complete]
|
||||
E4 --> E5
|
||||
|
||||
F --> F1[Production configuration]
|
||||
F1 --> F2[Custom config.yml]
|
||||
F2 --> F3[Resource limits]
|
||||
F3 --> F4[Health monitoring]
|
||||
F4 --> F5[Production ready]
|
||||
|
||||
C5 --> G[Service running on :11235]
|
||||
C6 --> G
|
||||
D4 --> G
|
||||
D5 --> G
|
||||
D6 --> G
|
||||
E5 --> H[docker run custom image]
|
||||
H --> G
|
||||
F5 --> I[Production deployment]
|
||||
|
||||
G --> J[Access playground at /playground]
|
||||
G --> K[Health check at /health]
|
||||
I --> L[Production monitoring]
|
||||
|
||||
style A fill:#e1f5fe
|
||||
style G fill:#c8e6c9
|
||||
style I fill:#c8e6c9
|
||||
style J fill:#fff3e0
|
||||
style K fill:#fff3e0
|
||||
style L fill:#e8f5e8
|
||||
```
|
||||
|
||||
### Docker Container Architecture
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "Host Environment"
|
||||
A[Docker Engine] --> B[Crawl4AI Container]
|
||||
C[.llm.env] --> B
|
||||
D[Custom config.yml] --> B
|
||||
E[Port 11235] --> B
|
||||
F[Shared Memory 1GB+] --> B
|
||||
end
|
||||
|
||||
subgraph "Container Services"
|
||||
B --> G[FastAPI Server :8020]
|
||||
B --> H[Gunicorn WSGI]
|
||||
B --> I[Supervisord Process Manager]
|
||||
B --> J[Redis Cache :6379]
|
||||
|
||||
G --> K[REST API Endpoints]
|
||||
G --> L[WebSocket Connections]
|
||||
G --> M[MCP Protocol]
|
||||
|
||||
H --> N[Worker Processes]
|
||||
I --> O[Service Monitoring]
|
||||
J --> P[Request Caching]
|
||||
end
|
||||
|
||||
subgraph "Browser Management"
|
||||
B --> Q[Playwright Framework]
|
||||
Q --> R[Chromium Browser]
|
||||
Q --> S[Firefox Browser]
|
||||
Q --> T[WebKit Browser]
|
||||
|
||||
R --> U[Browser Pool]
|
||||
S --> U
|
||||
T --> U
|
||||
|
||||
U --> V[Page Sessions]
|
||||
U --> W[Context Management]
|
||||
end
|
||||
|
||||
subgraph "External Services"
|
||||
X[OpenAI API] -.-> K
|
||||
Y[Anthropic Claude] -.-> K
|
||||
Z[Local Ollama] -.-> K
|
||||
AA[Groq API] -.-> K
|
||||
BB[Google Gemini] -.-> K
|
||||
end
|
||||
|
||||
subgraph "Client Interactions"
|
||||
CC[Python SDK] --> K
|
||||
DD[REST API Calls] --> K
|
||||
EE[MCP Clients] --> M
|
||||
FF[Web Browser] --> G
|
||||
GG[Monitoring Tools] --> K
|
||||
end
|
||||
|
||||
style B fill:#e3f2fd
|
||||
style G fill:#f3e5f5
|
||||
style Q fill:#e8f5e8
|
||||
style K fill:#fff3e0
|
||||
```
|
||||
|
||||
### API Endpoints Architecture
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
subgraph "Core Endpoints"
|
||||
A["/crawl"] --> A1[Single URL crawl]
|
||||
A2["/crawl/stream"] --> A3[Streaming multi-URL]
|
||||
A4["/crawl/job"] --> A5[Async job submission]
|
||||
A6["/crawl/job/{id}"] --> A7[Job status check]
|
||||
end
|
||||
|
||||
subgraph "Specialized Endpoints"
|
||||
B["/html"] --> B1[Preprocessed HTML]
|
||||
B2["/screenshot"] --> B3[PNG capture]
|
||||
B4["/pdf"] --> B5[PDF generation]
|
||||
B6["/execute_js"] --> B7[JavaScript execution]
|
||||
B8["/md"] --> B9[Markdown extraction]
|
||||
end
|
||||
|
||||
subgraph "Utility Endpoints"
|
||||
C["/health"] --> C1[Service status]
|
||||
C2["/metrics"] --> C3[Prometheus metrics]
|
||||
C4["/schema"] --> C5[API documentation]
|
||||
C6["/playground"] --> C7[Interactive testing]
|
||||
end
|
||||
|
||||
subgraph "LLM Integration"
|
||||
D["/llm/{url}"] --> D1["Q&amp;A over URL"]
|
||||
D2["/ask"] --> D3[Library context search]
|
||||
D4["/config/dump"] --> D5[Config validation]
|
||||
end
|
||||
|
||||
subgraph "MCP Protocol"
|
||||
E["/mcp/sse"] --> E1[Server-Sent Events]
|
||||
E2["/mcp/ws"] --> E3[WebSocket connection]
|
||||
E4["/mcp/schema"] --> E5[MCP tool definitions]
|
||||
end
|
||||
|
||||
style A fill:#e3f2fd
|
||||
style B fill:#f3e5f5
|
||||
style C fill:#e8f5e8
|
||||
style D fill:#fff3e0
|
||||
style E fill:#fce4ec
|
||||
```
|
||||
|
||||
### Request Processing Flow
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant Client
|
||||
participant FastAPI
|
||||
participant RequestValidator
|
||||
participant BrowserPool
|
||||
participant Playwright
|
||||
participant ExtractionEngine
|
||||
participant LLMProvider
|
||||
|
||||
Client->>FastAPI: POST /crawl with config
|
||||
FastAPI->>RequestValidator: Validate JSON structure
|
||||
|
||||
alt Valid Request
|
||||
RequestValidator-->>FastAPI: ✓ Validated
|
||||
FastAPI->>BrowserPool: Request browser instance
|
||||
BrowserPool->>Playwright: Launch browser/reuse session
|
||||
Playwright-->>BrowserPool: Browser ready
|
||||
BrowserPool-->>FastAPI: Browser allocated
|
||||
|
||||
FastAPI->>Playwright: Navigate to URL
|
||||
Playwright->>Playwright: Execute JS, wait conditions
|
||||
Playwright-->>FastAPI: Page content ready
|
||||
|
||||
FastAPI->>ExtractionEngine: Process content
|
||||
|
||||
alt LLM Extraction
|
||||
ExtractionEngine->>LLMProvider: Send content + schema
|
||||
LLMProvider-->>ExtractionEngine: Structured data
|
||||
else CSS Extraction
|
||||
ExtractionEngine->>ExtractionEngine: Apply CSS selectors
|
||||
end
|
||||
|
||||
ExtractionEngine-->>FastAPI: Extraction complete
|
||||
FastAPI->>BrowserPool: Release browser
|
||||
FastAPI-->>Client: CrawlResult response
|
||||
|
||||
else Invalid Request
|
||||
RequestValidator-->>FastAPI: ✗ Validation error
|
||||
FastAPI-->>Client: 400 Bad Request
|
||||
end
|
||||
```
|
||||
|
||||
### Configuration Management Flow
|
||||
|
||||
```mermaid
|
||||
stateDiagram-v2
|
||||
[*] --> ConfigLoading
|
||||
|
||||
ConfigLoading --> DefaultConfig: Load default config.yml
|
||||
ConfigLoading --> CustomConfig: Custom config mounted
|
||||
ConfigLoading --> EnvOverrides: Environment variables
|
||||
|
||||
DefaultConfig --> ConfigMerging
|
||||
CustomConfig --> ConfigMerging
|
||||
EnvOverrides --> ConfigMerging
|
||||
|
||||
ConfigMerging --> ConfigValidation
|
||||
|
||||
ConfigValidation --> Valid: Schema validation passes
|
||||
ConfigValidation --> Invalid: Validation errors
|
||||
|
||||
Invalid --> ConfigError: Log errors and exit
|
||||
ConfigError --> [*]
|
||||
|
||||
Valid --> ServiceInitialization
|
||||
ServiceInitialization --> FastAPISetup
|
||||
ServiceInitialization --> BrowserPoolInit
|
||||
ServiceInitialization --> CacheSetup
|
||||
|
||||
FastAPISetup --> Running
|
||||
BrowserPoolInit --> Running
|
||||
CacheSetup --> Running
|
||||
|
||||
Running --> ConfigReload: Config change detected
|
||||
ConfigReload --> ConfigValidation
|
||||
|
||||
Running --> [*]: Service shutdown
|
||||
|
||||
note right of ConfigMerging : Priority: ENV > Custom > Default
|
||||
note right of ServiceInitialization : All services must initialize successfully
|
||||
```
|
||||
|
||||
### Multi-Architecture Build Process
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Developer Push] --> B[GitHub Repository]
|
||||
|
||||
B --> C[Docker Buildx]
|
||||
C --> D{Build Strategy}
|
||||
|
||||
D -->|Multi-arch| E[Parallel Builds]
|
||||
D -->|Single-arch| F[Platform-specific Build]
|
||||
|
||||
E --> G[AMD64 Build]
|
||||
E --> H[ARM64 Build]
|
||||
|
||||
F --> I[Target Platform Build]
|
||||
|
||||
subgraph "AMD64 Build Process"
|
||||
G --> G1[Ubuntu base image]
|
||||
G1 --> G2[Python 3.11 install]
|
||||
G2 --> G3[System dependencies]
|
||||
G3 --> G4[Crawl4AI installation]
|
||||
G4 --> G5[Playwright setup]
|
||||
G5 --> G6[FastAPI configuration]
|
||||
G6 --> G7[AMD64 image ready]
|
||||
end
|
||||
|
||||
subgraph "ARM64 Build Process"
|
||||
H --> H1[Ubuntu ARM64 base]
|
||||
H1 --> H2[Python 3.11 install]
|
||||
H2 --> H3[ARM-specific deps]
|
||||
H3 --> H4[Crawl4AI installation]
|
||||
H4 --> H5[Playwright setup]
|
||||
H5 --> H6[FastAPI configuration]
|
||||
H6 --> H7[ARM64 image ready]
|
||||
end
|
||||
|
||||
subgraph "Single Architecture"
|
||||
I --> I1[Base image selection]
|
||||
I1 --> I2[Platform dependencies]
|
||||
I2 --> I3[Application setup]
|
||||
I3 --> I4[Platform image ready]
|
||||
end
|
||||
|
||||
G7 --> J[Multi-arch Manifest]
|
||||
H7 --> J
|
||||
I4 --> K[Platform Image]
|
||||
|
||||
J --> L[Docker Hub Registry]
|
||||
K --> L
|
||||
|
||||
L --> M[Image Pull Auto-selects Architecture]
|
||||
|
||||
style A fill:#e1f5fe
|
||||
style J fill:#c8e6c9
|
||||
style K fill:#c8e6c9
|
||||
style L fill:#f3e5f5
|
||||
style M fill:#e8f5e8
|
||||
```
|
||||
|
||||
### MCP Integration Architecture
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "MCP Client Applications"
|
||||
A[Claude Code] --> B[MCP Protocol]
|
||||
C[Cursor IDE] --> B
|
||||
D[Windsurf] --> B
|
||||
E[Custom MCP Client] --> B
|
||||
end
|
||||
|
||||
subgraph "Crawl4AI MCP Server"
|
||||
B --> F[MCP Endpoint Router]
|
||||
F --> G[SSE Transport /mcp/sse]
|
||||
F --> H[WebSocket Transport /mcp/ws]
|
||||
F --> I[Schema Endpoint /mcp/schema]
|
||||
|
||||
G --> J[MCP Tool Handler]
|
||||
H --> J
|
||||
|
||||
J --> K[Tool: md]
|
||||
J --> L[Tool: html]
|
||||
J --> M[Tool: screenshot]
|
||||
J --> N[Tool: pdf]
|
||||
J --> O[Tool: execute_js]
|
||||
J --> P[Tool: crawl]
|
||||
J --> Q[Tool: ask]
|
||||
end
|
||||
|
||||
subgraph "Crawl4AI Core Services"
|
||||
K --> R[Markdown Generator]
|
||||
L --> S[HTML Preprocessor]
|
||||
M --> T[Screenshot Service]
|
||||
N --> U[PDF Generator]
|
||||
O --> V[JavaScript Executor]
|
||||
P --> W[Batch Crawler]
|
||||
Q --> X[Context Search]
|
||||
|
||||
R --> Y[Browser Pool]
|
||||
S --> Y
|
||||
T --> Y
|
||||
U --> Y
|
||||
V --> Y
|
||||
W --> Y
|
||||
X --> Z[Knowledge Base]
|
||||
end
|
||||
|
||||
subgraph "External Resources"
|
||||
Y --> AA[Playwright Browsers]
|
||||
Z --> BB[Library Documentation]
|
||||
Z --> CC[Code Examples]
|
||||
AA --> DD[Web Pages]
|
||||
end
|
||||
|
||||
style B fill:#e3f2fd
|
||||
style J fill:#f3e5f5
|
||||
style Y fill:#e8f5e8
|
||||
style Z fill:#fff3e0
|
||||
```
|
||||
|
||||
### API Request/Response Flow Patterns
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant Client
|
||||
participant LoadBalancer
|
||||
participant FastAPI
|
||||
participant ConfigValidator
|
||||
participant BrowserManager
|
||||
participant CrawlEngine
|
||||
participant ResponseBuilder
|
||||
|
||||
Note over Client,ResponseBuilder: Basic Crawl Request
|
||||
|
||||
Client->>LoadBalancer: POST /crawl
|
||||
LoadBalancer->>FastAPI: Route request
|
||||
|
||||
FastAPI->>ConfigValidator: Validate browser_config
|
||||
ConfigValidator-->>FastAPI: ✓ Valid BrowserConfig
|
||||
|
||||
FastAPI->>ConfigValidator: Validate crawler_config
|
||||
ConfigValidator-->>FastAPI: ✓ Valid CrawlerRunConfig
|
||||
|
||||
FastAPI->>BrowserManager: Allocate browser
|
||||
BrowserManager-->>FastAPI: Browser instance
|
||||
|
||||
FastAPI->>CrawlEngine: Execute crawl
|
||||
|
||||
Note over CrawlEngine: Page processing
|
||||
CrawlEngine->>CrawlEngine: Navigate & wait
|
||||
CrawlEngine->>CrawlEngine: Extract content
|
||||
CrawlEngine->>CrawlEngine: Apply strategies
|
||||
|
||||
CrawlEngine-->>FastAPI: CrawlResult
|
||||
|
||||
FastAPI->>ResponseBuilder: Format response
|
||||
ResponseBuilder-->>FastAPI: JSON response
|
||||
|
||||
FastAPI->>BrowserManager: Release browser
|
||||
FastAPI-->>LoadBalancer: Response ready
|
||||
LoadBalancer-->>Client: 200 OK + CrawlResult
|
||||
|
||||
Note over Client,ResponseBuilder: Streaming Request
|
||||
|
||||
Client->>FastAPI: POST /crawl/stream
|
||||
FastAPI-->>Client: 200 OK (stream start)
|
||||
|
||||
loop For each URL
|
||||
FastAPI->>CrawlEngine: Process URL
|
||||
CrawlEngine-->>FastAPI: Result ready
|
||||
FastAPI-->>Client: NDJSON line
|
||||
end
|
||||
|
||||
FastAPI-->>Client: Stream completed
|
||||
```
|
||||
|
||||
### Configuration Validation Workflow
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Client Request] --> B[JSON Payload]
|
||||
B --> C{Pre-validation}
|
||||
|
||||
C -->|✓ Valid JSON| D[Extract Configurations]
|
||||
C -->|✗ Invalid JSON| E[Return 400 Bad Request]
|
||||
|
||||
D --> F[BrowserConfig Validation]
|
||||
D --> G[CrawlerRunConfig Validation]
|
||||
|
||||
F --> H{BrowserConfig Valid?}
|
||||
G --> I{CrawlerRunConfig Valid?}
|
||||
|
||||
H -->|✓ Valid| J[Browser Setup]
|
||||
H -->|✗ Invalid| K[Log Browser Config Errors]
|
||||
|
||||
I -->|✓ Valid| L[Crawler Setup]
|
||||
I -->|✗ Invalid| M[Log Crawler Config Errors]
|
||||
|
||||
K --> N[Collect All Errors]
|
||||
M --> N
|
||||
N --> O[Return 422 Validation Error]
|
||||
|
||||
J --> P{Both Configs Valid?}
|
||||
L --> P
|
||||
|
||||
P -->|✓ Yes| Q[Proceed to Crawling]
|
||||
P -->|✗ No| O
|
||||
|
||||
Q --> R[Execute Crawl Pipeline]
|
||||
R --> S[Return CrawlResult]
|
||||
|
||||
E --> T[Client Error Response]
|
||||
O --> T
|
||||
S --> U[Client Success Response]
|
||||
|
||||
style A fill:#e1f5fe
|
||||
style Q fill:#c8e6c9
|
||||
style S fill:#c8e6c9
|
||||
style U fill:#c8e6c9
|
||||
style E fill:#ffcdd2
|
||||
style O fill:#ffcdd2
|
||||
style T fill:#ffcdd2
|
||||
```
|
||||
|
||||
### Production Deployment Architecture
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "Load Balancer Layer"
|
||||
A[NGINX/HAProxy] --> B[Health Check]
|
||||
A --> C[Request Routing]
|
||||
A --> D[SSL Termination]
|
||||
end
|
||||
|
||||
subgraph "Application Layer"
|
||||
C --> E[Crawl4AI Instance 1]
|
||||
C --> F[Crawl4AI Instance 2]
|
||||
C --> G[Crawl4AI Instance N]
|
||||
|
||||
E --> H[FastAPI Server]
|
||||
F --> I[FastAPI Server]
|
||||
G --> J[FastAPI Server]
|
||||
|
||||
H --> K[Browser Pool 1]
|
||||
I --> L[Browser Pool 2]
|
||||
J --> M[Browser Pool N]
|
||||
end
|
||||
|
||||
subgraph "Shared Services"
|
||||
N[Redis Cluster] --> E
|
||||
N --> F
|
||||
N --> G
|
||||
|
||||
O[Monitoring Stack] --> P[Prometheus]
|
||||
O --> Q[Grafana]
|
||||
O --> R[AlertManager]
|
||||
|
||||
P --> E
|
||||
P --> F
|
||||
P --> G
|
||||
end
|
||||
|
||||
subgraph "External Dependencies"
|
||||
S[OpenAI API] -.-> H
|
||||
T[Anthropic API] -.-> I
|
||||
U[Local LLM Cluster] -.-> J
|
||||
end
|
||||
|
||||
subgraph "Persistent Storage"
|
||||
V[Configuration Volume] --> E
|
||||
V --> F
|
||||
V --> G
|
||||
|
||||
W[Cache Volume] --> N
|
||||
X[Logs Volume] --> O
|
||||
end
|
||||
|
||||
style A fill:#e3f2fd
|
||||
style E fill:#f3e5f5
|
||||
style F fill:#f3e5f5
|
||||
style G fill:#f3e5f5
|
||||
style N fill:#e8f5e8
|
||||
style O fill:#fff3e0
|
||||
```
|
||||
|
||||
### Docker Resource Management
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
subgraph "Resource Allocation"
|
||||
A[Host Resources] --> B[CPU Cores]
|
||||
A --> C[Memory GB]
|
||||
A --> D[Disk Space]
|
||||
A --> E[Network Bandwidth]
|
||||
|
||||
B --> F[Container Limits]
|
||||
C --> F
|
||||
D --> F
|
||||
E --> F
|
||||
end
|
||||
|
||||
subgraph "Container Configuration"
|
||||
F --> G[--cpus=4]
|
||||
F --> H[--memory=8g]
|
||||
F --> I[--shm-size=2g]
|
||||
F --> J[Volume Mounts]
|
||||
|
||||
G --> K[Browser Processes]
|
||||
H --> L[Browser Memory]
|
||||
I --> M[Shared Memory for Browsers]
|
||||
J --> N[Config & Cache Storage]
|
||||
end
|
||||
|
||||
subgraph "Monitoring & Scaling"
|
||||
O[Resource Monitor] --> P[CPU Usage %]
|
||||
O --> Q[Memory Usage %]
|
||||
O --> R[Request Queue Length]
|
||||
|
||||
P --> S{CPU > 80%?}
|
||||
Q --> T{Memory > 90%?}
|
||||
R --> U{Queue > 100?}
|
||||
|
||||
S -->|Yes| V[Scale Up]
|
||||
T -->|Yes| V
|
||||
U -->|Yes| V
|
||||
|
||||
V --> W[Add Container Instance]
|
||||
W --> X[Update Load Balancer]
|
||||
end
|
||||
|
||||
subgraph "Performance Optimization"
|
||||
Y[Browser Pool Tuning] --> Z[Max Pages: 40]
|
||||
Y --> AA[Idle TTL: 30min]
|
||||
Y --> BB[Concurrency Limits]
|
||||
|
||||
Z --> CC[Memory Efficiency]
|
||||
AA --> DD[Resource Cleanup]
|
||||
BB --> EE[Throughput Control]
|
||||
end
|
||||
|
||||
style A fill:#e1f5fe
|
||||
style F fill:#f3e5f5
|
||||
style O fill:#e8f5e8
|
||||
style Y fill:#fff3e0
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Docker Deployment Guide](https://docs.crawl4ai.com/core/docker-deployment/), [API Reference](https://docs.crawl4ai.com/api/), [MCP Integration](https://docs.crawl4ai.com/core/docker-deployment/#mcp-model-context-protocol-support), [Production Configuration](https://docs.crawl4ai.com/core/docker-deployment/#production-deployment)
|
||||
478
docs/md_v2/assets/llm.txt/diagrams/extraction.txt
Normal file
478
docs/md_v2/assets/llm.txt/diagrams/extraction.txt
Normal file
@@ -0,0 +1,478 @@
|
||||
## Extraction Strategy Workflows and Architecture
|
||||
|
||||
Visual representations of Crawl4AI's data extraction approaches, strategy selection, and processing workflows.
|
||||
|
||||
### Extraction Strategy Decision Tree
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Content to Extract] --> B{Content Type?}
|
||||
|
||||
B -->|Simple Patterns| C[Common Data Types]
|
||||
B -->|Structured HTML| D[Predictable Structure]
|
||||
B -->|Complex Content| E[Requires Reasoning]
|
||||
B -->|Mixed Content| F[Multiple Data Types]
|
||||
|
||||
C --> C1{Pattern Type?}
|
||||
C1 -->|Email, Phone, URLs| C2[Built-in Regex Patterns]
|
||||
C1 -->|Custom Patterns| C3[Custom Regex Strategy]
|
||||
C1 -->|LLM-Generated| C4[One-time Pattern Generation]
|
||||
|
||||
D --> D1{Selector Type?}
|
||||
D1 -->|CSS Selectors| D2[JsonCssExtractionStrategy]
|
||||
D1 -->|XPath Expressions| D3[JsonXPathExtractionStrategy]
|
||||
D1 -->|Need Schema?| D4[Auto-generate Schema with LLM]
|
||||
|
||||
E --> E1{LLM Provider?}
|
||||
E1 -->|OpenAI/Anthropic| E2[Cloud LLM Strategy]
|
||||
E1 -->|Local Ollama| E3[Local LLM Strategy]
|
||||
E1 -->|Cost-sensitive| E4[Hybrid: Generate Schema Once]
|
||||
|
||||
F --> F1[Multi-Strategy Approach]
|
||||
F1 --> F2[1. Regex for Patterns]
|
||||
F1 --> F3[2. CSS for Structure]
|
||||
F1 --> F4[3. LLM for Complex Analysis]
|
||||
|
||||
C2 --> G[Fast Extraction ⚡]
|
||||
C3 --> G
|
||||
C4 --> H[Cached Pattern Reuse]
|
||||
|
||||
D2 --> I[Schema-based Extraction 🏗️]
|
||||
D3 --> I
|
||||
D4 --> J[Generated Schema Cache]
|
||||
|
||||
E2 --> K[Intelligent Parsing 🧠]
|
||||
E3 --> K
|
||||
E4 --> L[Hybrid Cost-Effective]
|
||||
|
||||
F2 --> M[Comprehensive Results 📊]
|
||||
F3 --> M
|
||||
F4 --> M
|
||||
|
||||
style G fill:#c8e6c9
|
||||
style I fill:#e3f2fd
|
||||
style K fill:#fff3e0
|
||||
style M fill:#f3e5f5
|
||||
style H fill:#e8f5e8
|
||||
style J fill:#e8f5e8
|
||||
style L fill:#ffecb3
|
||||
```
|
||||
|
||||
### LLM Extraction Strategy Workflow
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant User
|
||||
participant Crawler
|
||||
participant LLMStrategy
|
||||
participant Chunker
|
||||
participant LLMProvider
|
||||
participant Parser
|
||||
|
||||
User->>Crawler: Configure LLMExtractionStrategy
|
||||
User->>Crawler: arun(url, config)
|
||||
|
||||
Crawler->>Crawler: Navigate to URL
|
||||
Crawler->>Crawler: Extract content (HTML/Markdown)
|
||||
Crawler->>LLMStrategy: Process content
|
||||
|
||||
LLMStrategy->>LLMStrategy: Check content size
|
||||
|
||||
alt Content > chunk_threshold
|
||||
LLMStrategy->>Chunker: Split into chunks with overlap
|
||||
Chunker-->>LLMStrategy: Return chunks[]
|
||||
|
||||
loop For each chunk
|
||||
LLMStrategy->>LLMProvider: Send chunk + schema + instruction
|
||||
LLMProvider-->>LLMStrategy: Return structured JSON
|
||||
end
|
||||
|
||||
LLMStrategy->>LLMStrategy: Merge chunk results
|
||||
else Content <= threshold
|
||||
LLMStrategy->>LLMProvider: Send full content + schema
|
||||
LLMProvider-->>LLMStrategy: Return structured JSON
|
||||
end
|
||||
|
||||
LLMStrategy->>Parser: Validate JSON schema
|
||||
Parser-->>LLMStrategy: Validated data
|
||||
|
||||
LLMStrategy->>LLMStrategy: Track token usage
|
||||
LLMStrategy-->>Crawler: Return extracted_content
|
||||
|
||||
Crawler-->>User: CrawlResult with JSON data
|
||||
|
||||
User->>LLMStrategy: show_usage()
|
||||
LLMStrategy-->>User: Token count & estimated cost
|
||||
```
|
||||
|
||||
### Schema-Based Extraction Architecture
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "Schema Definition"
|
||||
A[JSON Schema] --> A1[baseSelector]
|
||||
A --> A2["fields[]"]
|
||||
A --> A3[nested structures]
|
||||
|
||||
A2 --> A4[CSS/XPath selectors]
|
||||
A2 --> A5[Data types: text, html, attribute]
|
||||
A2 --> A6[Default values]
|
||||
|
||||
A3 --> A7[nested objects]
|
||||
A3 --> A8[nested_list arrays]
|
||||
A3 --> A9[simple lists]
|
||||
end
|
||||
|
||||
subgraph "Extraction Engine"
|
||||
B[HTML Content] --> C[Selector Engine]
|
||||
C --> C1[CSS Selector Parser]
|
||||
C --> C2[XPath Evaluator]
|
||||
|
||||
C1 --> D[Element Matcher]
|
||||
C2 --> D
|
||||
|
||||
D --> E[Type Converter]
|
||||
E --> E1[Text Extraction]
|
||||
E --> E2[HTML Preservation]
|
||||
E --> E3[Attribute Extraction]
|
||||
E --> E4[Nested Processing]
|
||||
end
|
||||
|
||||
subgraph "Result Processing"
|
||||
F[Raw Extracted Data] --> G[Structure Builder]
|
||||
G --> G1[Object Construction]
|
||||
G --> G2[Array Assembly]
|
||||
G --> G3[Type Validation]
|
||||
|
||||
G1 --> H[JSON Output]
|
||||
G2 --> H
|
||||
G3 --> H
|
||||
end
|
||||
|
||||
A --> C
|
||||
E --> F
|
||||
H --> I[extracted_content]
|
||||
|
||||
style A fill:#e3f2fd
|
||||
style C fill:#f3e5f5
|
||||
style G fill:#e8f5e8
|
||||
style H fill:#c8e6c9
|
||||
```
|
||||
|
||||
### Automatic Schema Generation Process
|
||||
|
||||
```mermaid
|
||||
stateDiagram-v2
|
||||
[*] --> CheckCache
|
||||
|
||||
CheckCache --> CacheHit: Schema exists
|
||||
CheckCache --> SamplePage: Schema missing
|
||||
|
||||
CacheHit --> LoadSchema
|
||||
LoadSchema --> FastExtraction
|
||||
|
||||
SamplePage --> ExtractHTML: Crawl sample URL
|
||||
ExtractHTML --> LLMAnalysis: Send HTML to LLM
|
||||
LLMAnalysis --> GenerateSchema: Create CSS/XPath selectors
|
||||
GenerateSchema --> ValidateSchema: Test generated schema
|
||||
|
||||
ValidateSchema --> SchemaWorks: Valid selectors
|
||||
ValidateSchema --> RefineSchema: Invalid selectors
|
||||
|
||||
RefineSchema --> LLMAnalysis: Iterate with feedback
|
||||
|
||||
SchemaWorks --> CacheSchema: Save for reuse
|
||||
CacheSchema --> FastExtraction: Use cached schema
|
||||
|
||||
FastExtraction --> [*]: No more LLM calls needed
|
||||
|
||||
note right of CheckCache : One-time LLM cost
|
||||
note right of FastExtraction : Unlimited fast reuse
|
||||
note right of CacheSchema : JSON file storage
|
||||
```
|
||||
|
||||
### Multi-Strategy Extraction Pipeline
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
A[Web Page Content] --> B[Strategy Pipeline]
|
||||
|
||||
subgraph B["Extraction Pipeline"]
|
||||
B1[Stage 1: Regex Patterns]
|
||||
B2[Stage 2: Schema-based CSS]
|
||||
B3[Stage 3: LLM Analysis]
|
||||
|
||||
B1 --> B1a[Email addresses]
|
||||
B1 --> B1b[Phone numbers]
|
||||
B1 --> B1c[URLs and links]
|
||||
B1 --> B1d[Currency amounts]
|
||||
|
||||
B2 --> B2a[Structured products]
|
||||
B2 --> B2b[Article metadata]
|
||||
B2 --> B2c[User reviews]
|
||||
B2 --> B2d[Navigation links]
|
||||
|
||||
B3 --> B3a[Sentiment analysis]
|
||||
B3 --> B3b[Key topics]
|
||||
B3 --> B3c[Entity recognition]
|
||||
B3 --> B3d[Content summary]
|
||||
end
|
||||
|
||||
B1a --> C[Result Merger]
|
||||
B1b --> C
|
||||
B1c --> C
|
||||
B1d --> C
|
||||
|
||||
B2a --> C
|
||||
B2b --> C
|
||||
B2c --> C
|
||||
B2d --> C
|
||||
|
||||
B3a --> C
|
||||
B3b --> C
|
||||
B3c --> C
|
||||
B3d --> C
|
||||
|
||||
C --> D[Combined JSON Output]
|
||||
D --> E[Final CrawlResult]
|
||||
|
||||
style B1 fill:#c8e6c9
|
||||
style B2 fill:#e3f2fd
|
||||
style B3 fill:#fff3e0
|
||||
style C fill:#f3e5f5
|
||||
```
|
||||
|
||||
### Performance Comparison Matrix
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
subgraph "Strategy Performance"
|
||||
A[Extraction Strategy Comparison]
|
||||
|
||||
subgraph "Speed ⚡"
|
||||
S1[Regex: ~10ms]
|
||||
S2[CSS Schema: ~50ms]
|
||||
S3[XPath: ~100ms]
|
||||
S4[LLM: ~2-10s]
|
||||
end
|
||||
|
||||
subgraph "Accuracy 🎯"
|
||||
A1[Regex: Pattern-dependent]
|
||||
A2[CSS: High for structured]
|
||||
A3[XPath: Very high]
|
||||
A4[LLM: Excellent for complex]
|
||||
end
|
||||
|
||||
subgraph "Cost 💰"
|
||||
C1[Regex: Free]
|
||||
C2[CSS: Free]
|
||||
C3[XPath: Free]
|
||||
C4[LLM: $0.001-0.01 per page]
|
||||
end
|
||||
|
||||
subgraph "Complexity 🔧"
|
||||
X1[Regex: Simple patterns only]
|
||||
X2[CSS: Structured HTML]
|
||||
X3[XPath: Complex selectors]
|
||||
X4[LLM: Any content type]
|
||||
end
|
||||
end
|
||||
|
||||
style S1 fill:#c8e6c9
|
||||
style S2 fill:#e8f5e8
|
||||
style S3 fill:#fff3e0
|
||||
style S4 fill:#ffcdd2
|
||||
|
||||
style A2 fill:#e8f5e8
|
||||
style A3 fill:#c8e6c9
|
||||
style A4 fill:#c8e6c9
|
||||
|
||||
style C1 fill:#c8e6c9
|
||||
style C2 fill:#c8e6c9
|
||||
style C3 fill:#c8e6c9
|
||||
style C4 fill:#fff3e0
|
||||
|
||||
style X1 fill:#ffcdd2
|
||||
style X2 fill:#e8f5e8
|
||||
style X3 fill:#c8e6c9
|
||||
style X4 fill:#c8e6c9
|
||||
```
|
||||
|
||||
### Regex Pattern Strategy Flow
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Regex Extraction] --> B{Pattern Source?}
|
||||
|
||||
B -->|Built-in| C[Use Predefined Patterns]
|
||||
B -->|Custom| D[Define Custom Regex]
|
||||
B -->|LLM-Generated| E[Generate with AI]
|
||||
|
||||
C --> C1[Email Pattern]
|
||||
C --> C2[Phone Pattern]
|
||||
C --> C3[URL Pattern]
|
||||
C --> C4[Currency Pattern]
|
||||
C --> C5[Date Pattern]
|
||||
|
||||
D --> D1[Write Custom Regex]
|
||||
D --> D2[Test Pattern]
|
||||
D --> D3{Pattern Works?}
|
||||
D3 -->|No| D1
|
||||
D3 -->|Yes| D4[Use Pattern]
|
||||
|
||||
E --> E1[Provide Sample Content]
|
||||
E --> E2[LLM Analyzes Content]
|
||||
E --> E3[Generate Optimized Regex]
|
||||
E --> E4[Cache Pattern for Reuse]
|
||||
|
||||
C1 --> F[Pattern Matching]
|
||||
C2 --> F
|
||||
C3 --> F
|
||||
C4 --> F
|
||||
C5 --> F
|
||||
D4 --> F
|
||||
E4 --> F
|
||||
|
||||
F --> G[Extract Matches]
|
||||
G --> H[Group by Pattern Type]
|
||||
H --> I[JSON Output with Labels]
|
||||
|
||||
style C fill:#e8f5e8
|
||||
style D fill:#e3f2fd
|
||||
style E fill:#fff3e0
|
||||
style F fill:#f3e5f5
|
||||
```
|
||||
|
||||
### Complex Schema Structure Visualization
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "E-commerce Schema Example"
|
||||
A[Category baseSelector] --> B[Category Fields]
|
||||
A --> C[Products nested_list]
|
||||
|
||||
B --> B1[category_name]
|
||||
B --> B2[category_id attribute]
|
||||
B --> B3[category_url attribute]
|
||||
|
||||
C --> C1[Product baseSelector]
|
||||
C1 --> C2[name text]
|
||||
C1 --> C3[price text]
|
||||
C1 --> C4[Details nested object]
|
||||
C1 --> C5[Features list]
|
||||
C1 --> C6[Reviews nested_list]
|
||||
|
||||
C4 --> C4a[brand text]
|
||||
C4 --> C4b[model text]
|
||||
C4 --> C4c[specs html]
|
||||
|
||||
C5 --> C5a[feature text array]
|
||||
|
||||
C6 --> C6a[reviewer text]
|
||||
C6 --> C6b[rating attribute]
|
||||
C6 --> C6c[comment text]
|
||||
C6 --> C6d[date attribute]
|
||||
end
|
||||
|
||||
subgraph "JSON Output Structure"
|
||||
D[categories array] --> D1[category object]
|
||||
D1 --> D2[category_name]
|
||||
D1 --> D3[category_id]
|
||||
D1 --> D4[products array]
|
||||
|
||||
D4 --> D5[product object]
|
||||
D5 --> D6[name, price]
|
||||
D5 --> D7[details object]
|
||||
D5 --> D8[features array]
|
||||
D5 --> D9[reviews array]
|
||||
|
||||
D7 --> D7a[brand, model, specs]
|
||||
D8 --> D8a[feature strings]
|
||||
D9 --> D9a[review objects]
|
||||
end
|
||||
|
||||
A -.-> D
|
||||
B1 -.-> D2
|
||||
C2 -.-> D6
|
||||
C4 -.-> D7
|
||||
C5 -.-> D8
|
||||
C6 -.-> D9
|
||||
|
||||
style A fill:#e3f2fd
|
||||
style C fill:#f3e5f5
|
||||
style C4 fill:#e8f5e8
|
||||
style D fill:#fff3e0
|
||||
```
|
||||
|
||||
### Error Handling and Fallback Strategy
|
||||
|
||||
```mermaid
|
||||
stateDiagram-v2
|
||||
[*] --> PrimaryStrategy
|
||||
|
||||
PrimaryStrategy --> Success: Extraction successful
|
||||
PrimaryStrategy --> ValidationFailed: Invalid data
|
||||
PrimaryStrategy --> ExtractionFailed: No matches found
|
||||
PrimaryStrategy --> TimeoutError: LLM timeout
|
||||
|
||||
ValidationFailed --> FallbackStrategy: Try alternative
|
||||
ExtractionFailed --> FallbackStrategy: Try alternative
|
||||
TimeoutError --> FallbackStrategy: Try alternative
|
||||
|
||||
FallbackStrategy --> FallbackSuccess: Fallback works
|
||||
FallbackStrategy --> FallbackFailed: All strategies failed
|
||||
|
||||
FallbackSuccess --> Success: Return results
|
||||
FallbackFailed --> ErrorReport: Log failure details
|
||||
|
||||
Success --> [*]: Complete
|
||||
ErrorReport --> [*]: Return empty results
|
||||
|
||||
note right of PrimaryStrategy : Try fastest/most accurate first
|
||||
note right of FallbackStrategy : Use simpler but reliable method
|
||||
note left of ErrorReport : Provide debugging information
|
||||
```
|
||||
|
||||
### Token Usage and Cost Optimization
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[LLM Extraction Request] --> B{Content Size Check}
|
||||
|
||||
B -->|Small ≤ 1200 tokens| C[Single LLM Call]
|
||||
B -->|Large > 1200 tokens| D[Chunking Strategy]
|
||||
|
||||
C --> C1[Send full content]
|
||||
C1 --> C2[Parse JSON response]
|
||||
C2 --> C3[Track token usage]
|
||||
|
||||
D --> D1[Split into chunks]
|
||||
D1 --> D2[Add overlap between chunks]
|
||||
D2 --> D3[Process chunks in parallel]
|
||||
|
||||
D3 --> D4[Chunk 1 → LLM]
|
||||
D3 --> D5[Chunk 2 → LLM]
|
||||
D3 --> D6[Chunk N → LLM]
|
||||
|
||||
D4 --> D7[Merge results]
|
||||
D5 --> D7
|
||||
D6 --> D7
|
||||
|
||||
D7 --> D8[Deduplicate data]
|
||||
D8 --> D9[Aggregate token usage]
|
||||
|
||||
C3 --> E[Cost Calculation]
|
||||
D9 --> E
|
||||
|
||||
E --> F[Usage Report]
|
||||
F --> F1[Prompt tokens: X]
|
||||
F --> F2[Completion tokens: Y]
|
||||
F --> F3[Total cost: $Z]
|
||||
|
||||
style C fill:#c8e6c9
|
||||
style D fill:#fff3e0
|
||||
style E fill:#e3f2fd
|
||||
style F fill:#f3e5f5
|
||||
```
|
||||
|
||||
**📖 Learn more:** [LLM Strategies](https://docs.crawl4ai.com/extraction/llm-strategies/), [Schema-Based Extraction](https://docs.crawl4ai.com/extraction/no-llm-strategies/), [Pattern Matching](https://docs.crawl4ai.com/extraction/no-llm-strategies/#regexextractionstrategy), [Performance Optimization](https://docs.crawl4ai.com/advanced/multi-url-crawling/)
|
||||
@@ -0,0 +1,472 @@
|
||||
## HTTP Crawler Strategy Workflows
|
||||
|
||||
Visual representations of HTTP-based crawling architecture, request flows, and performance characteristics compared to browser-based strategies.
|
||||
|
||||
### HTTP vs Browser Strategy Decision Tree
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Content Crawling Need] --> B{Content Type Analysis}
|
||||
|
||||
B -->|Static HTML| C{JavaScript Required?}
|
||||
B -->|Dynamic SPA| D[Browser Strategy Required]
|
||||
B -->|API Endpoints| E[HTTP Strategy Optimal]
|
||||
B -->|Mixed Content| F{Primary Content Source?}
|
||||
|
||||
C -->|No JS Needed| G[HTTP Strategy Recommended]
|
||||
C -->|JS Required| H[Browser Strategy Required]
|
||||
C -->|Unknown| I{Performance Priority?}
|
||||
|
||||
I -->|Speed Critical| J[Try HTTP First]
|
||||
I -->|Accuracy Critical| K[Use Browser Strategy]
|
||||
|
||||
F -->|Mostly Static| G
|
||||
F -->|Mostly Dynamic| D
|
||||
|
||||
G --> L{Resource Constraints?}
|
||||
L -->|Memory Limited| M[HTTP Strategy - Lightweight]
|
||||
L -->|CPU Limited| N[HTTP Strategy - No Browser]
|
||||
L -->|Network Limited| O[HTTP Strategy - Efficient]
|
||||
L -->|No Constraints| P[Either Strategy Works]
|
||||
|
||||
J --> Q[Test HTTP Results]
|
||||
Q --> R{Content Complete?}
|
||||
R -->|Yes| S[Continue with HTTP]
|
||||
R -->|No| T[Switch to Browser Strategy]
|
||||
|
||||
D --> U[Browser Strategy Features]
|
||||
H --> U
|
||||
K --> U
|
||||
T --> U
|
||||
|
||||
U --> V[JavaScript Execution]
|
||||
U --> W[Screenshots/PDFs]
|
||||
U --> X[Complex Interactions]
|
||||
U --> Y[Session Management]
|
||||
|
||||
M --> Z[HTTP Strategy Benefits]
|
||||
N --> Z
|
||||
O --> Z
|
||||
S --> Z
|
||||
|
||||
Z --> AA[10x Faster Processing]
|
||||
Z --> BB[Lower Memory Usage]
|
||||
Z --> CC[Higher Concurrency]
|
||||
Z --> DD[Simpler Deployment]
|
||||
|
||||
style G fill:#c8e6c9
|
||||
style M fill:#c8e6c9
|
||||
style N fill:#c8e6c9
|
||||
style O fill:#c8e6c9
|
||||
style S fill:#c8e6c9
|
||||
style D fill:#e3f2fd
|
||||
style H fill:#e3f2fd
|
||||
style K fill:#e3f2fd
|
||||
style T fill:#e3f2fd
|
||||
style U fill:#e3f2fd
|
||||
```
|
||||
|
||||
### HTTP Request Lifecycle Sequence
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant Client
|
||||
participant HTTPStrategy as HTTP Strategy
|
||||
participant Session as HTTP Session
|
||||
participant Server as Target Server
|
||||
participant Processor as Content Processor
|
||||
|
||||
Client->>HTTPStrategy: crawl(url, config)
|
||||
HTTPStrategy->>HTTPStrategy: validate_url()
|
||||
|
||||
alt URL Type Check
|
||||
HTTPStrategy->>HTTPStrategy: handle_file_url()
|
||||
Note over HTTPStrategy: file:// URLs
|
||||
else Raw Content
|
||||
HTTPStrategy->>HTTPStrategy: handle_raw_content()
|
||||
Note over HTTPStrategy: raw:// content
|
||||
else Network Request
|
||||
HTTPStrategy->>Session: prepare_request()
|
||||
Session->>Session: apply_config()
|
||||
Session->>Session: set_headers()
|
||||
Session->>Session: setup_auth()
|
||||
|
||||
Session->>Server: HTTP Request
|
||||
Note over Session,Server: GET/POST/PUT with headers
|
||||
|
||||
alt Success Response
|
||||
Server-->>Session: HTTP 200 + Content
|
||||
Session-->>HTTPStrategy: response_data
|
||||
else Redirect Response
|
||||
Server-->>Session: HTTP 3xx + Location
|
||||
Session->>Server: Follow redirect
|
||||
Server-->>Session: HTTP 200 + Content
|
||||
Session-->>HTTPStrategy: final_response
|
||||
else Error Response
|
||||
Server-->>Session: HTTP 4xx/5xx
|
||||
Session-->>HTTPStrategy: error_response
|
||||
end
|
||||
end
|
||||
|
||||
HTTPStrategy->>Processor: process_content()
|
||||
Processor->>Processor: clean_html()
|
||||
Processor->>Processor: extract_metadata()
|
||||
Processor->>Processor: generate_markdown()
|
||||
Processor-->>HTTPStrategy: processed_result
|
||||
|
||||
HTTPStrategy-->>Client: CrawlResult
|
||||
|
||||
Note over Client,Processor: Fast, lightweight processing
|
||||
Note over HTTPStrategy: No browser overhead
|
||||
```
|
||||
|
||||
### HTTP Strategy Architecture
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "HTTP Crawler Strategy"
|
||||
A[AsyncHTTPCrawlerStrategy] --> B[Session Manager]
|
||||
A --> C[Request Builder]
|
||||
A --> D[Response Handler]
|
||||
A --> E[Error Manager]
|
||||
|
||||
B --> B1[Connection Pool]
|
||||
B --> B2[DNS Cache]
|
||||
B --> B3[SSL Context]
|
||||
|
||||
C --> C1[Headers Builder]
|
||||
C --> C2[Auth Handler]
|
||||
C --> C3[Payload Encoder]
|
||||
|
||||
D --> D1[Content Decoder]
|
||||
D --> D2[Redirect Handler]
|
||||
D --> D3[Status Validator]
|
||||
|
||||
E --> E1[Retry Logic]
|
||||
E --> E2[Timeout Handler]
|
||||
E --> E3[Exception Mapper]
|
||||
end
|
||||
|
||||
subgraph "Content Processing"
|
||||
F[Raw HTML] --> G[HTML Cleaner]
|
||||
G --> H[Markdown Generator]
|
||||
H --> I[Link Extractor]
|
||||
I --> J[Media Extractor]
|
||||
J --> K[Metadata Parser]
|
||||
end
|
||||
|
||||
subgraph "External Resources"
|
||||
L[Target Websites]
|
||||
M[Local Files]
|
||||
N[Raw Content]
|
||||
end
|
||||
|
||||
subgraph "Output"
|
||||
O[CrawlResult]
|
||||
O --> O1[HTML Content]
|
||||
O --> O2[Markdown Text]
|
||||
O --> O3[Extracted Links]
|
||||
O --> O4[Media References]
|
||||
O --> O5[Status Information]
|
||||
end
|
||||
|
||||
A --> F
|
||||
L --> A
|
||||
M --> A
|
||||
N --> A
|
||||
K --> O
|
||||
|
||||
style A fill:#e3f2fd
|
||||
style B fill:#f3e5f5
|
||||
style F fill:#e8f5e8
|
||||
style O fill:#fff3e0
|
||||
```
|
||||
|
||||
### Performance Comparison Flow
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
subgraph "HTTP Strategy Performance"
|
||||
A1[Request Start] --> A2[DNS Lookup: 50ms]
|
||||
A2 --> A3[TCP Connect: 100ms]
|
||||
A3 --> A4[HTTP Request: 200ms]
|
||||
A4 --> A5[Content Download: 300ms]
|
||||
A5 --> A6[Processing: 50ms]
|
||||
A6 --> A7[Total: ~700ms]
|
||||
end
|
||||
|
||||
subgraph "Browser Strategy Performance"
|
||||
B1[Request Start] --> B2[Browser Launch: 2000ms]
|
||||
B2 --> B3[Page Navigation: 1000ms]
|
||||
B3 --> B4[JS Execution: 500ms]
|
||||
B4 --> B5[Content Rendering: 300ms]
|
||||
B5 --> B6[Processing: 100ms]
|
||||
B6 --> B7[Total: ~3900ms]
|
||||
end
|
||||
|
||||
subgraph "Resource Usage"
|
||||
C1[HTTP Memory: ~50MB]
|
||||
C2[Browser Memory: ~500MB]
|
||||
C3[HTTP CPU: Low]
|
||||
C4[Browser CPU: High]
|
||||
C5[HTTP Concurrency: 100+]
|
||||
C6[Browser Concurrency: 10-20]
|
||||
end
|
||||
|
||||
A7 --> D[5.5x Faster]
|
||||
B7 --> D
|
||||
C1 --> E[10x Less Memory]
|
||||
C2 --> E
|
||||
C5 --> F[5x More Concurrent]
|
||||
C6 --> F
|
||||
|
||||
style A7 fill:#c8e6c9
|
||||
style B7 fill:#ffcdd2
|
||||
style C1 fill:#c8e6c9
|
||||
style C2 fill:#ffcdd2
|
||||
style C5 fill:#c8e6c9
|
||||
style C6 fill:#ffcdd2
|
||||
```
|
||||
|
||||
### HTTP Request Types and Configuration
|
||||
|
||||
```mermaid
|
||||
stateDiagram-v2
|
||||
[*] --> HTTPConfigSetup
|
||||
|
||||
HTTPConfigSetup --> MethodSelection
|
||||
|
||||
MethodSelection --> GET: Simple data retrieval
|
||||
MethodSelection --> POST: Form submission
|
||||
MethodSelection --> PUT: Data upload
|
||||
MethodSelection --> DELETE: Resource removal
|
||||
|
||||
GET --> HeaderSetup: Set Accept headers
|
||||
POST --> PayloadSetup: JSON or form data
|
||||
PUT --> PayloadSetup: File or data upload
|
||||
DELETE --> AuthSetup: Authentication required
|
||||
|
||||
PayloadSetup --> JSONPayload: application/json
|
||||
PayloadSetup --> FormPayload: form-data
|
||||
PayloadSetup --> RawPayload: custom content
|
||||
|
||||
JSONPayload --> HeaderSetup
|
||||
FormPayload --> HeaderSetup
|
||||
RawPayload --> HeaderSetup
|
||||
|
||||
HeaderSetup --> AuthSetup
|
||||
AuthSetup --> SSLSetup
|
||||
SSLSetup --> RedirectSetup
|
||||
RedirectSetup --> RequestExecution
|
||||
|
||||
RequestExecution --> [*]: Request complete
|
||||
|
||||
note right of GET : Default method for most crawling
|
||||
note right of POST : API interactions, form submissions
|
||||
note right of JSONPayload : Structured data transmission
|
||||
note right of HeaderSetup : User-Agent, Accept, Custom headers
|
||||
```
|
||||
|
||||
### Error Handling and Retry Workflow
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[HTTP Request] --> B{Response Received?}
|
||||
|
||||
B -->|No| C[Connection Error]
|
||||
B -->|Yes| D{Status Code Check}
|
||||
|
||||
C --> C1{Timeout Error?}
|
||||
C1 -->|Yes| C2[ConnectionTimeoutError]
|
||||
C1 -->|No| C3[Network Error]
|
||||
|
||||
D -->|2xx| E[Success Response]
|
||||
D -->|3xx| F[Redirect Response]
|
||||
D -->|4xx| G[Client Error]
|
||||
D -->|5xx| H[Server Error]
|
||||
|
||||
F --> F1{Follow Redirects?}
|
||||
F1 -->|Yes| F2[Follow Redirect]
|
||||
F1 -->|No| F3[Return Redirect Response]
|
||||
F2 --> A
|
||||
|
||||
G --> G1{Retry on 4xx?}
|
||||
G1 -->|No| G2[HTTPStatusError]
|
||||
G1 -->|Yes| I[Check Retry Count]
|
||||
|
||||
H --> H1{Retry on 5xx?}
|
||||
H1 -->|Yes| I
|
||||
H1 -->|No| H2[HTTPStatusError]
|
||||
|
||||
C2 --> I
|
||||
C3 --> I
|
||||
|
||||
I --> J{Retries < Max?}
|
||||
J -->|No| K[Final Error]
|
||||
J -->|Yes| L[Calculate Backoff]
|
||||
|
||||
L --> M[Wait Backoff Time]
|
||||
M --> N[Increment Retry Count]
|
||||
N --> A
|
||||
|
||||
E --> O[Process Content]
|
||||
F3 --> O
|
||||
O --> P[Return CrawlResult]
|
||||
|
||||
G2 --> Q[Error CrawlResult]
|
||||
H2 --> Q
|
||||
K --> Q
|
||||
|
||||
style E fill:#c8e6c9
|
||||
style P fill:#c8e6c9
|
||||
style G2 fill:#ffcdd2
|
||||
style H2 fill:#ffcdd2
|
||||
style K fill:#ffcdd2
|
||||
style Q fill:#ffcdd2
|
||||
```
|
||||
|
||||
### Batch Processing Architecture
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant Client
|
||||
participant BatchManager as Batch Manager
|
||||
participant HTTPPool as Connection Pool
|
||||
participant Workers as HTTP Workers
|
||||
participant Targets as Target Servers
|
||||
|
||||
Client->>BatchManager: batch_crawl(urls)
|
||||
BatchManager->>BatchManager: create_semaphore(max_concurrent)
|
||||
|
||||
loop For each URL batch
|
||||
BatchManager->>HTTPPool: acquire_connection()
|
||||
HTTPPool->>Workers: assign_worker()
|
||||
|
||||
par Concurrent Processing
|
||||
Workers->>Targets: HTTP Request 1
|
||||
Workers->>Targets: HTTP Request 2
|
||||
Workers->>Targets: HTTP Request N
|
||||
end
|
||||
|
||||
par Response Handling
|
||||
Targets-->>Workers: Response 1
|
||||
Targets-->>Workers: Response 2
|
||||
Targets-->>Workers: Response N
|
||||
end
|
||||
|
||||
Workers->>HTTPPool: return_connection()
|
||||
HTTPPool->>BatchManager: batch_results()
|
||||
end
|
||||
|
||||
BatchManager->>BatchManager: aggregate_results()
|
||||
BatchManager-->>Client: final_results()
|
||||
|
||||
Note over Workers,Targets: 20-100 concurrent connections
|
||||
Note over BatchManager: Memory-efficient processing
|
||||
Note over HTTPPool: Connection reuse optimization
|
||||
```
|
||||
|
||||
### Content Type Processing Pipeline
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
A[HTTP Response] --> B{Content-Type Detection}
|
||||
|
||||
B -->|text/html| C[HTML Processing]
|
||||
B -->|application/json| D[JSON Processing]
|
||||
B -->|text/plain| E[Text Processing]
|
||||
B -->|application/xml| F[XML Processing]
|
||||
B -->|Other| G[Binary Processing]
|
||||
|
||||
C --> C1[Parse HTML Structure]
|
||||
C1 --> C2[Extract Text Content]
|
||||
C2 --> C3[Generate Markdown]
|
||||
C3 --> C4[Extract Links/Media]
|
||||
|
||||
D --> D1[Parse JSON Structure]
|
||||
D1 --> D2[Extract Data Fields]
|
||||
D2 --> D3[Format as Readable Text]
|
||||
|
||||
E --> E1[Clean Text Content]
|
||||
E1 --> E2[Basic Formatting]
|
||||
|
||||
F --> F1[Parse XML Structure]
|
||||
F1 --> F2[Extract Text Nodes]
|
||||
F2 --> F3[Convert to Markdown]
|
||||
|
||||
G --> G1[Save Binary Content]
|
||||
G1 --> G2[Generate Metadata]
|
||||
|
||||
C4 --> H[Content Analysis]
|
||||
D3 --> H
|
||||
E2 --> H
|
||||
F3 --> H
|
||||
G2 --> H
|
||||
|
||||
H --> I[Link Extraction]
|
||||
H --> J[Media Detection]
|
||||
H --> K[Metadata Parsing]
|
||||
|
||||
I --> L[CrawlResult Assembly]
|
||||
J --> L
|
||||
K --> L
|
||||
|
||||
L --> M[Final Output]
|
||||
|
||||
style C fill:#e8f5e8
|
||||
style H fill:#fff3e0
|
||||
style L fill:#e3f2fd
|
||||
style M fill:#c8e6c9
|
||||
```
|
||||
|
||||
### Integration with Processing Strategies
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
subgraph "HTTP Strategy Core"
|
||||
A[HTTP Request] --> B[Raw Content]
|
||||
B --> C[Content Decoder]
|
||||
end
|
||||
|
||||
subgraph "Processing Pipeline"
|
||||
C --> D[HTML Cleaner]
|
||||
D --> E[Markdown Generator]
|
||||
E --> F{Content Filter?}
|
||||
|
||||
F -->|Yes| G[Pruning Filter]
|
||||
F -->|Yes| H[BM25 Filter]
|
||||
F -->|No| I[Raw Markdown]
|
||||
|
||||
G --> J[Fit Markdown]
|
||||
H --> J
|
||||
end
|
||||
|
||||
subgraph "Extraction Strategies"
|
||||
I --> K[CSS Extraction]
|
||||
J --> K
|
||||
I --> L[XPath Extraction]
|
||||
J --> L
|
||||
I --> M[LLM Extraction]
|
||||
J --> M
|
||||
end
|
||||
|
||||
subgraph "Output Generation"
|
||||
K --> N[Structured JSON]
|
||||
L --> N
|
||||
M --> N
|
||||
|
||||
I --> O[Clean Markdown]
|
||||
J --> P[Filtered Content]
|
||||
|
||||
N --> Q[Final CrawlResult]
|
||||
O --> Q
|
||||
P --> Q
|
||||
end
|
||||
|
||||
style A fill:#e3f2fd
|
||||
style C fill:#f3e5f5
|
||||
style E fill:#e8f5e8
|
||||
style Q fill:#c8e6c9
|
||||
```
|
||||
|
||||
**📖 Learn more:** [HTTP vs Browser Strategies](https://docs.crawl4ai.com/core/browser-crawler-config/), [Performance Optimization](https://docs.crawl4ai.com/advanced/multi-url-crawling/), [Error Handling](https://docs.crawl4ai.com/api/async-webcrawler/)
|
||||
368
docs/md_v2/assets/llm.txt/diagrams/installation.txt
Normal file
368
docs/md_v2/assets/llm.txt/diagrams/installation.txt
Normal file
@@ -0,0 +1,368 @@
|
||||
## Installation Workflows and Architecture
|
||||
|
||||
Visual representations of Crawl4AI installation processes, deployment options, and system interactions.
|
||||
|
||||
### Installation Decision Flow
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Start Installation] --> B{Environment Type?}
|
||||
|
||||
B -->|Local Development| C[Basic Python Install]
|
||||
B -->|Production| D[Docker Deployment]
|
||||
B -->|Research/Testing| E[Google Colab]
|
||||
B -->|CI/CD Pipeline| F[Automated Setup]
|
||||
|
||||
C --> C1[pip install crawl4ai]
|
||||
C1 --> C2[crawl4ai-setup]
|
||||
C2 --> C3{Need Advanced Features?}
|
||||
|
||||
C3 -->|No| C4[Basic Installation Complete]
|
||||
C3 -->|Text Clustering| C5[pip install crawl4ai with torch]
|
||||
C3 -->|Transformers| C6[pip install crawl4ai with transformer]
|
||||
C3 -->|All Features| C7[pip install crawl4ai with all]
|
||||
|
||||
C5 --> C8[crawl4ai-download-models]
|
||||
C6 --> C8
|
||||
C7 --> C8
|
||||
C8 --> C9[Advanced Installation Complete]
|
||||
|
||||
D --> D1{Deployment Method?}
|
||||
D1 -->|Pre-built Image| D2[docker pull unclecode/crawl4ai]
|
||||
D1 -->|Docker Compose| D3[Clone repo + docker compose]
|
||||
D1 -->|Custom Build| D4[docker buildx build]
|
||||
|
||||
D2 --> D5[Configure .llm.env]
|
||||
D3 --> D5
|
||||
D4 --> D5
|
||||
D5 --> D6[docker run with ports]
|
||||
D6 --> D7[Docker Deployment Complete]
|
||||
|
||||
E --> E1[Colab pip install]
|
||||
E1 --> E2[playwright install chromium]
|
||||
E2 --> E3[Test basic crawl]
|
||||
E3 --> E4[Colab Setup Complete]
|
||||
|
||||
F --> F1[Automated pip install]
|
||||
F1 --> F2[Automated setup scripts]
|
||||
F2 --> F3[CI/CD Integration Complete]
|
||||
|
||||
C4 --> G[Verify with crawl4ai-doctor]
|
||||
C9 --> G
|
||||
D7 --> H[Health check via API]
|
||||
E4 --> I[Run test crawl]
|
||||
F3 --> G
|
||||
|
||||
G --> J[Installation Verified]
|
||||
H --> J
|
||||
I --> J
|
||||
|
||||
style A fill:#e1f5fe
|
||||
style J fill:#c8e6c9
|
||||
style C4 fill:#fff3e0
|
||||
style C9 fill:#fff3e0
|
||||
style D7 fill:#f3e5f5
|
||||
style E4 fill:#fce4ec
|
||||
style F3 fill:#e8f5e8
|
||||
```
|
||||
|
||||
### Basic Installation Sequence
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant User
|
||||
participant PyPI
|
||||
participant System
|
||||
participant Playwright
|
||||
participant Crawler
|
||||
|
||||
User->>PyPI: pip install crawl4ai
|
||||
PyPI-->>User: Package downloaded
|
||||
|
||||
User->>System: crawl4ai-setup
|
||||
System->>Playwright: Install browser binaries
|
||||
Playwright-->>System: Chromium, Firefox installed
|
||||
System-->>User: Setup complete
|
||||
|
||||
User->>System: crawl4ai-doctor
|
||||
System->>System: Check Python version
|
||||
System->>System: Verify Playwright installation
|
||||
System->>System: Test browser launch
|
||||
System-->>User: Diagnostics report
|
||||
|
||||
User->>Crawler: Basic crawl test
|
||||
Crawler->>Playwright: Launch browser
|
||||
Playwright-->>Crawler: Browser ready
|
||||
Crawler->>Crawler: Navigate to test URL
|
||||
Crawler-->>User: Success confirmation
|
||||
```
|
||||
|
||||
### Docker Deployment Architecture
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "Host System"
|
||||
A[Docker Engine] --> B[Crawl4AI Container]
|
||||
C[.llm.env File] --> B
|
||||
D[Port 11235] --> B
|
||||
end
|
||||
|
||||
subgraph "Container Environment"
|
||||
B --> E[FastAPI Server]
|
||||
B --> F[Playwright Browsers]
|
||||
B --> G[Python Runtime]
|
||||
|
||||
E --> H[/crawl Endpoint]
|
||||
E --> I[/playground Interface]
|
||||
E --> J[/health Monitoring]
|
||||
E --> K[/metrics Prometheus]
|
||||
|
||||
F --> L[Chromium Browser]
|
||||
F --> M[Firefox Browser]
|
||||
F --> N[WebKit Browser]
|
||||
end
|
||||
|
||||
subgraph "External Services"
|
||||
O[OpenAI API] --> B
|
||||
P[Anthropic API] --> B
|
||||
Q[Local LLM Ollama] --> B
|
||||
end
|
||||
|
||||
subgraph "Client Applications"
|
||||
R[Python SDK] --> H
|
||||
S[REST API Calls] --> H
|
||||
T[Web Browser] --> I
|
||||
U[Monitoring Tools] --> J
|
||||
V[Prometheus] --> K
|
||||
end
|
||||
|
||||
style B fill:#e3f2fd
|
||||
style E fill:#f3e5f5
|
||||
style F fill:#e8f5e8
|
||||
style G fill:#fff3e0
|
||||
```
|
||||
|
||||
### Advanced Features Installation Flow
|
||||
|
||||
```mermaid
|
||||
stateDiagram-v2
|
||||
[*] --> BasicInstall
|
||||
|
||||
BasicInstall --> FeatureChoice: crawl4ai installed
|
||||
|
||||
FeatureChoice --> TorchInstall: Need text clustering
|
||||
FeatureChoice --> TransformerInstall: Need HuggingFace models
|
||||
FeatureChoice --> AllInstall: Need everything
|
||||
FeatureChoice --> Complete: Basic features sufficient
|
||||
|
||||
TorchInstall --> TorchSetup: pip install crawl4ai with torch
|
||||
TransformerInstall --> TransformerSetup: pip install crawl4ai with transformer
|
||||
AllInstall --> AllSetup: pip install crawl4ai with all
|
||||
|
||||
TorchSetup --> ModelDownload: crawl4ai-setup
|
||||
TransformerSetup --> ModelDownload: crawl4ai-setup
|
||||
AllSetup --> ModelDownload: crawl4ai-setup
|
||||
|
||||
ModelDownload --> PreDownload: crawl4ai-download-models
|
||||
PreDownload --> Complete: All models cached
|
||||
|
||||
Complete --> Verification: crawl4ai-doctor
|
||||
Verification --> [*]: Installation verified
|
||||
|
||||
note right of TorchInstall : PyTorch for semantic operations
|
||||
note right of TransformerInstall : HuggingFace for LLM features
|
||||
note right of AllInstall : Complete feature set
|
||||
```
|
||||
|
||||
### Platform-Specific Installation Matrix
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
subgraph "Installation Methods"
|
||||
A[Python Package] --> A1[pip install]
|
||||
B[Docker Image] --> B1[docker pull]
|
||||
C[Source Build] --> C1[git clone + build]
|
||||
D[Cloud Platform] --> D1[Colab/Kaggle]
|
||||
end
|
||||
|
||||
subgraph "Operating Systems"
|
||||
E[Linux x86_64]
|
||||
F[Linux ARM64]
|
||||
G[macOS Intel]
|
||||
H[macOS Apple Silicon]
|
||||
I[Windows x86_64]
|
||||
end
|
||||
|
||||
subgraph "Feature Sets"
|
||||
J[Basic crawling]
|
||||
K[Text clustering torch]
|
||||
L[LLM transformers]
|
||||
M[All features]
|
||||
end
|
||||
|
||||
A1 --> E
|
||||
A1 --> F
|
||||
A1 --> G
|
||||
A1 --> H
|
||||
A1 --> I
|
||||
|
||||
B1 --> E
|
||||
B1 --> F
|
||||
B1 --> G
|
||||
B1 --> H
|
||||
|
||||
C1 --> E
|
||||
C1 --> F
|
||||
C1 --> G
|
||||
C1 --> H
|
||||
C1 --> I
|
||||
|
||||
D1 --> E
|
||||
D1 --> I
|
||||
|
||||
E --> J
|
||||
E --> K
|
||||
E --> L
|
||||
E --> M
|
||||
|
||||
F --> J
|
||||
F --> K
|
||||
F --> L
|
||||
F --> M
|
||||
|
||||
G --> J
|
||||
G --> K
|
||||
G --> L
|
||||
G --> M
|
||||
|
||||
H --> J
|
||||
H --> K
|
||||
H --> L
|
||||
H --> M
|
||||
|
||||
I --> J
|
||||
I --> K
|
||||
I --> L
|
||||
I --> M
|
||||
|
||||
style A1 fill:#e3f2fd
|
||||
style B1 fill:#f3e5f5
|
||||
style C1 fill:#e8f5e8
|
||||
style D1 fill:#fff3e0
|
||||
```
|
||||
|
||||
### Docker Multi-Stage Build Process
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant Dev as Developer
|
||||
participant Git as GitHub Repo
|
||||
participant Docker as Docker Engine
|
||||
participant Registry as Docker Hub
|
||||
participant User as End User
|
||||
|
||||
Dev->>Git: Push code changes
|
||||
|
||||
Docker->>Git: Clone repository
|
||||
Docker->>Docker: Stage 1 - Base Python image
|
||||
Docker->>Docker: Stage 2 - Install dependencies
|
||||
Docker->>Docker: Stage 3 - Install Playwright
|
||||
Docker->>Docker: Stage 4 - Copy application code
|
||||
Docker->>Docker: Stage 5 - Setup FastAPI server
|
||||
|
||||
Note over Docker: Multi-architecture build
|
||||
Docker->>Docker: Build for linux/amd64
|
||||
Docker->>Docker: Build for linux/arm64
|
||||
|
||||
Docker->>Registry: Push multi-arch manifest
|
||||
Registry-->>Docker: Build complete
|
||||
|
||||
User->>Registry: docker pull unclecode/crawl4ai
|
||||
Registry-->>User: Download appropriate architecture
|
||||
|
||||
User->>Docker: docker run with configuration
|
||||
Docker->>Docker: Start container
|
||||
Docker->>Docker: Initialize FastAPI server
|
||||
Docker->>Docker: Setup Playwright browsers
|
||||
Docker-->>User: Service ready on port 11235
|
||||
```
|
||||
|
||||
### Installation Verification Workflow
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Installation Complete] --> B[Run crawl4ai-doctor]
|
||||
|
||||
B --> C{Python Version Check}
|
||||
C -->|✓ 3.10+| D{Playwright Check}
|
||||
C -->|✗ < 3.10| C1[Upgrade Python]
|
||||
C1 --> D
|
||||
|
||||
D -->|✓ Installed| E{Browser Binaries}
|
||||
D -->|✗ Missing| D1[Run crawl4ai-setup]
|
||||
D1 --> E
|
||||
|
||||
E -->|✓ Available| F{Test Browser Launch}
|
||||
E -->|✗ Missing| E1[playwright install]
|
||||
E1 --> F
|
||||
|
||||
F -->|✓ Success| G[Test Basic Crawl]
|
||||
F -->|✗ Failed| F1[Check system dependencies]
|
||||
F1 --> F
|
||||
|
||||
G --> H{Crawl Test Result}
|
||||
H -->|✓ Success| I[Installation Verified ✓]
|
||||
H -->|✗ Failed| H1[Check network/permissions]
|
||||
H1 --> G
|
||||
|
||||
I --> J[Ready for Production Use]
|
||||
|
||||
style I fill:#c8e6c9
|
||||
style J fill:#e8f5e8
|
||||
style C1 fill:#ffcdd2
|
||||
style D1 fill:#fff3e0
|
||||
style E1 fill:#fff3e0
|
||||
style F1 fill:#ffcdd2
|
||||
style H1 fill:#ffcdd2
|
||||
```
|
||||
|
||||
### Resource Requirements by Installation Type
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
subgraph "Basic Installation"
|
||||
A1[Memory: 512MB]
|
||||
A2[Disk: 2GB]
|
||||
A3[CPU: 1 core]
|
||||
A4[Network: Required for setup]
|
||||
end
|
||||
|
||||
subgraph "Advanced Features torch"
|
||||
B1[Memory: 2GB+]
|
||||
B2[Disk: 5GB+]
|
||||
B3[CPU: 2+ cores]
|
||||
B4[GPU: Optional CUDA]
|
||||
end
|
||||
|
||||
subgraph "All Features"
|
||||
C1[Memory: 4GB+]
|
||||
C2[Disk: 10GB+]
|
||||
C3[CPU: 4+ cores]
|
||||
C4[GPU: Recommended]
|
||||
end
|
||||
|
||||
subgraph "Docker Deployment"
|
||||
D1[Memory: 1GB+]
|
||||
D2[Disk: 3GB+]
|
||||
D3[CPU: 2+ cores]
|
||||
D4[Ports: 11235]
|
||||
D5[Shared Memory: 1GB]
|
||||
end
|
||||
|
||||
style A1 fill:#e8f5e8
|
||||
style B1 fill:#fff3e0
|
||||
style C1 fill:#ffecb3
|
||||
style D1 fill:#e3f2fd
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Installation Guide](https://docs.crawl4ai.com/core/installation/), [Docker Deployment](https://docs.crawl4ai.com/core/docker-deployment/), [System Requirements](https://docs.crawl4ai.com/core/installation/#prerequisites)
|
||||
5912
docs/md_v2/assets/llm.txt/diagrams/llms-diagram.txt
Normal file
5912
docs/md_v2/assets/llm.txt/diagrams/llms-diagram.txt
Normal file
File diff suppressed because it is too large
Load Diff
392
docs/md_v2/assets/llm.txt/diagrams/multi_urls_crawling.txt
Normal file
392
docs/md_v2/assets/llm.txt/diagrams/multi_urls_crawling.txt
Normal file
@@ -0,0 +1,392 @@
|
||||
## Multi-URL Crawling Workflows and Architecture
|
||||
|
||||
Visual representations of concurrent crawling patterns, resource management, and monitoring systems for handling multiple URLs efficiently.
|
||||
|
||||
### Multi-URL Processing Modes
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Multi-URL Crawling Request] --> B{Processing Mode?}
|
||||
|
||||
B -->|Batch Mode| C[Collect All URLs]
|
||||
B -->|Streaming Mode| D[Process URLs Individually]
|
||||
|
||||
C --> C1[Queue All URLs]
|
||||
C1 --> C2[Execute Concurrently]
|
||||
C2 --> C3[Wait for All Completion]
|
||||
C3 --> C4[Return Complete Results Array]
|
||||
|
||||
D --> D1[Queue URLs]
|
||||
D1 --> D2[Start First Batch]
|
||||
D2 --> D3[Yield Results as Available]
|
||||
D3 --> D4{More URLs?}
|
||||
D4 -->|Yes| D5[Start Next URLs]
|
||||
D4 -->|No| D6[Stream Complete]
|
||||
D5 --> D3
|
||||
|
||||
C4 --> E[Process Results]
|
||||
D6 --> E
|
||||
|
||||
E --> F[Success/Failure Analysis]
|
||||
F --> G[End]
|
||||
|
||||
style C fill:#e3f2fd
|
||||
style D fill:#f3e5f5
|
||||
style C4 fill:#c8e6c9
|
||||
style D6 fill:#c8e6c9
|
||||
```
|
||||
|
||||
### Memory-Adaptive Dispatcher Flow
|
||||
|
||||
```mermaid
|
||||
stateDiagram-v2
|
||||
[*] --> Initializing
|
||||
|
||||
Initializing --> MonitoringMemory: Start dispatcher
|
||||
|
||||
MonitoringMemory --> CheckingMemory: Every check_interval
|
||||
CheckingMemory --> MemoryOK: Memory < threshold
|
||||
CheckingMemory --> MemoryHigh: Memory >= threshold
|
||||
|
||||
MemoryOK --> DispatchingTasks: Start new crawls
|
||||
MemoryHigh --> WaitingForMemory: Pause dispatching
|
||||
|
||||
DispatchingTasks --> TaskRunning: Launch crawler
|
||||
TaskRunning --> TaskCompleted: Crawl finished
|
||||
TaskRunning --> TaskFailed: Crawl error
|
||||
|
||||
TaskCompleted --> MonitoringMemory: Update stats
|
||||
TaskFailed --> MonitoringMemory: Update stats
|
||||
|
||||
WaitingForMemory --> CheckingMemory: Wait timeout
|
||||
WaitingForMemory --> MonitoringMemory: Memory freed
|
||||
|
||||
note right of MemoryHigh: Prevents OOM crashes
|
||||
note right of DispatchingTasks: Respects max_session_permit
|
||||
note right of WaitingForMemory: Configurable timeout
|
||||
```
|
||||
|
||||
### Concurrent Crawling Architecture
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "URL Queue Management"
|
||||
A[URL Input List] --> B[URL Queue]
|
||||
B --> C[Priority Scheduler]
|
||||
C --> D[Batch Assignment]
|
||||
end
|
||||
|
||||
subgraph "Dispatcher Layer"
|
||||
E[Memory Adaptive Dispatcher]
|
||||
F[Semaphore Dispatcher]
|
||||
G[Rate Limiter]
|
||||
H[Resource Monitor]
|
||||
|
||||
E --> I[Memory Checker]
|
||||
F --> J[Concurrency Controller]
|
||||
G --> K[Delay Calculator]
|
||||
H --> L[System Stats]
|
||||
end
|
||||
|
||||
subgraph "Crawler Pool"
|
||||
M[Crawler Instance 1]
|
||||
N[Crawler Instance 2]
|
||||
O[Crawler Instance 3]
|
||||
P[Crawler Instance N]
|
||||
|
||||
M --> Q[Browser Session 1]
|
||||
N --> R[Browser Session 2]
|
||||
O --> S[Browser Session 3]
|
||||
P --> T[Browser Session N]
|
||||
end
|
||||
|
||||
subgraph "Result Processing"
|
||||
U[Result Collector]
|
||||
V[Success Handler]
|
||||
W[Error Handler]
|
||||
X[Retry Queue]
|
||||
Y[Final Results]
|
||||
end
|
||||
|
||||
D --> E
|
||||
D --> F
|
||||
E --> M
|
||||
F --> N
|
||||
G --> O
|
||||
H --> P
|
||||
|
||||
Q --> U
|
||||
R --> U
|
||||
S --> U
|
||||
T --> U
|
||||
|
||||
U --> V
|
||||
U --> W
|
||||
W --> X
|
||||
X --> B
|
||||
V --> Y
|
||||
|
||||
style E fill:#e3f2fd
|
||||
style F fill:#f3e5f5
|
||||
style G fill:#e8f5e8
|
||||
style H fill:#fff3e0
|
||||
```
|
||||
|
||||
### Rate Limiting and Backoff Strategy
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant C as Crawler
|
||||
participant RL as Rate Limiter
|
||||
participant S as Server
|
||||
participant D as Dispatcher
|
||||
|
||||
C->>RL: Request to crawl URL
|
||||
RL->>RL: Calculate delay
|
||||
RL->>RL: Apply base delay (1-3s)
|
||||
RL->>C: Delay applied
|
||||
|
||||
C->>S: HTTP Request
|
||||
|
||||
alt Success Response
|
||||
S-->>C: 200 OK + Content
|
||||
C->>RL: Report success
|
||||
RL->>RL: Reset failure count
|
||||
C->>D: Return successful result
|
||||
else Rate Limited
|
||||
S-->>C: 429 Too Many Requests
|
||||
C->>RL: Report rate limit
|
||||
RL->>RL: Exponential backoff
|
||||
RL->>RL: Increase delay (up to max_delay)
|
||||
RL->>C: Apply longer delay
|
||||
C->>S: Retry request after delay
|
||||
else Server Error
|
||||
S-->>C: 503 Service Unavailable
|
||||
C->>RL: Report server error
|
||||
RL->>RL: Moderate backoff
|
||||
RL->>C: Retry with backoff
|
||||
else Max Retries Exceeded
|
||||
RL->>C: Stop retrying
|
||||
C->>D: Return failed result
|
||||
end
|
||||
```
|
||||
|
||||
### Large-Scale Crawling Workflow
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Load URL List 10k+ URLs] --> B[Initialize Dispatcher]
|
||||
|
||||
B --> C{Select Dispatcher Type}
|
||||
C -->|Memory Constrained| D[Memory Adaptive]
|
||||
C -->|Fixed Resources| E[Semaphore Based]
|
||||
|
||||
D --> F[Set Memory Threshold 70%]
|
||||
E --> G[Set Concurrency Limit]
|
||||
|
||||
F --> H[Configure Monitoring]
|
||||
G --> H
|
||||
|
||||
H --> I[Start Crawling Process]
|
||||
I --> J[Monitor System Resources]
|
||||
|
||||
J --> K{Memory Usage?}
|
||||
K -->|< Threshold| L[Continue Dispatching]
|
||||
K -->|>= Threshold| M[Pause New Tasks]
|
||||
|
||||
L --> N[Process Results Stream]
|
||||
M --> O[Wait for Memory]
|
||||
O --> K
|
||||
|
||||
N --> P{Result Type?}
|
||||
P -->|Success| Q[Save to Database]
|
||||
P -->|Failure| R[Log Error]
|
||||
|
||||
Q --> S[Update Progress Counter]
|
||||
R --> S
|
||||
|
||||
S --> T{More URLs?}
|
||||
T -->|Yes| U[Get Next Batch]
|
||||
T -->|No| V[Generate Final Report]
|
||||
|
||||
U --> L
|
||||
V --> W[Analysis Complete]
|
||||
|
||||
style A fill:#e1f5fe
|
||||
style D fill:#e8f5e8
|
||||
style E fill:#f3e5f5
|
||||
style V fill:#c8e6c9
|
||||
style W fill:#a5d6a7
|
||||
```
|
||||
|
||||
### Real-Time Monitoring Dashboard Flow
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
subgraph "Data Collection"
|
||||
A[Crawler Tasks] --> B[Performance Metrics]
|
||||
A --> C[Memory Usage]
|
||||
A --> D[Success/Failure Rates]
|
||||
A --> E[Response Times]
|
||||
end
|
||||
|
||||
subgraph "Monitor Processing"
|
||||
F[CrawlerMonitor] --> G[Aggregate Statistics]
|
||||
F --> H[Display Formatter]
|
||||
F --> I[Update Scheduler]
|
||||
end
|
||||
|
||||
subgraph "Display Modes"
|
||||
J[DETAILED Mode]
|
||||
K[AGGREGATED Mode]
|
||||
|
||||
J --> L[Individual Task Status]
|
||||
J --> M[Task-Level Metrics]
|
||||
K --> N[Summary Statistics]
|
||||
K --> O[Overall Progress]
|
||||
end
|
||||
|
||||
subgraph "Output Interface"
|
||||
P[Console Display]
|
||||
Q[Progress Bars]
|
||||
R[Status Tables]
|
||||
S[Real-time Updates]
|
||||
end
|
||||
|
||||
B --> F
|
||||
C --> F
|
||||
D --> F
|
||||
E --> F
|
||||
|
||||
G --> J
|
||||
G --> K
|
||||
H --> J
|
||||
H --> K
|
||||
I --> J
|
||||
I --> K
|
||||
|
||||
L --> P
|
||||
M --> Q
|
||||
N --> R
|
||||
O --> S
|
||||
|
||||
style F fill:#e3f2fd
|
||||
style J fill:#f3e5f5
|
||||
style K fill:#e8f5e8
|
||||
```
|
||||
|
||||
### Error Handling and Recovery Pattern
|
||||
|
||||
```mermaid
|
||||
stateDiagram-v2
|
||||
[*] --> ProcessingURL
|
||||
|
||||
ProcessingURL --> CrawlAttempt: Start crawl
|
||||
|
||||
CrawlAttempt --> Success: HTTP 200
|
||||
CrawlAttempt --> NetworkError: Connection failed
|
||||
CrawlAttempt --> RateLimit: HTTP 429
|
||||
CrawlAttempt --> ServerError: HTTP 5xx
|
||||
CrawlAttempt --> Timeout: Request timeout
|
||||
|
||||
Success --> [*]: Return result
|
||||
|
||||
NetworkError --> RetryCheck: Check retry count
|
||||
RateLimit --> BackoffWait: Apply exponential backoff
|
||||
ServerError --> RetryCheck: Check retry count
|
||||
Timeout --> RetryCheck: Check retry count
|
||||
|
||||
BackoffWait --> RetryCheck: After delay
|
||||
|
||||
RetryCheck --> CrawlAttempt: retries < max_retries
|
||||
RetryCheck --> Failed: retries >= max_retries
|
||||
|
||||
Failed --> ErrorLog: Log failure details
|
||||
ErrorLog --> [*]: Return failed result
|
||||
|
||||
note right of BackoffWait: Exponential backoff for rate limits
|
||||
note right of RetryCheck: Configurable max_retries
|
||||
note right of ErrorLog: Detailed error tracking
|
||||
```
|
||||
|
||||
### Resource Management Timeline
|
||||
|
||||
```mermaid
|
||||
gantt
|
||||
title Multi-URL Crawling Resource Management
|
||||
dateFormat X
|
||||
axisFormat %s
|
||||
|
||||
section Memory Usage
|
||||
Initialize Dispatcher :0, 1
|
||||
Memory Monitoring :1, 10
|
||||
Peak Usage Period :3, 7
|
||||
Memory Cleanup :7, 9
|
||||
|
||||
section Task Execution
|
||||
URL Queue Setup :0, 2
|
||||
Batch 1 Processing :2, 5
|
||||
Batch 2 Processing :4, 7
|
||||
Batch 3 Processing :6, 9
|
||||
Final Results :9, 10
|
||||
|
||||
section Rate Limiting
|
||||
Normal Delays :2, 4
|
||||
Backoff Period :4, 6
|
||||
Recovery Period :6, 8
|
||||
|
||||
section Monitoring
|
||||
System Health Check :0, 10
|
||||
Progress Updates :1, 9
|
||||
Performance Metrics :2, 8
|
||||
```
|
||||
|
||||
### Concurrent Processing Performance Matrix
|
||||
|
||||
```mermaid
|
||||
graph TD
|
||||
subgraph "Input Factors"
|
||||
A[Number of URLs]
|
||||
B[Concurrency Level]
|
||||
C[Memory Threshold]
|
||||
D[Rate Limiting]
|
||||
end
|
||||
|
||||
subgraph "Processing Characteristics"
|
||||
A --> E[Low 1-100 URLs]
|
||||
A --> F[Medium 100-1k URLs]
|
||||
A --> G[High 1k-10k URLs]
|
||||
A --> H[Very High 10k+ URLs]
|
||||
|
||||
B --> I[Conservative 1-5]
|
||||
B --> J[Moderate 5-15]
|
||||
B --> K[Aggressive 15-30]
|
||||
|
||||
C --> L[Strict 60-70%]
|
||||
C --> M[Balanced 70-80%]
|
||||
C --> N[Relaxed 80-90%]
|
||||
end
|
||||
|
||||
subgraph "Recommended Configurations"
|
||||
E --> O[Simple Semaphore]
|
||||
F --> P[Memory Adaptive Basic]
|
||||
G --> Q[Memory Adaptive Advanced]
|
||||
H --> R[Memory Adaptive + Monitoring]
|
||||
|
||||
I --> O
|
||||
J --> P
|
||||
K --> Q
|
||||
K --> R
|
||||
|
||||
L --> Q
|
||||
M --> P
|
||||
N --> O
|
||||
end
|
||||
|
||||
style O fill:#c8e6c9
|
||||
style P fill:#fff3e0
|
||||
style Q fill:#ffecb3
|
||||
style R fill:#ffcdd2
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Multi-URL Crawling Guide](https://docs.crawl4ai.com/advanced/multi-url-crawling/), [Dispatcher Configuration](https://docs.crawl4ai.com/advanced/crawl-dispatcher/), [Performance Optimization](https://docs.crawl4ai.com/advanced/multi-url-crawling/#performance-optimization)
|
||||
411
docs/md_v2/assets/llm.txt/diagrams/simple_crawling.txt
Normal file
411
docs/md_v2/assets/llm.txt/diagrams/simple_crawling.txt
Normal file
@@ -0,0 +1,411 @@
|
||||
## Simple Crawling Workflows and Data Flow
|
||||
|
||||
Visual representations of basic web crawling operations, configuration patterns, and result processing workflows.
|
||||
|
||||
### Basic Crawling Sequence
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant User
|
||||
participant Crawler as AsyncWebCrawler
|
||||
participant Browser as Browser Instance
|
||||
participant Page as Web Page
|
||||
participant Processor as Content Processor
|
||||
|
||||
User->>Crawler: Create with BrowserConfig
|
||||
Crawler->>Browser: Launch browser instance
|
||||
Browser-->>Crawler: Browser ready
|
||||
|
||||
User->>Crawler: arun(url, CrawlerRunConfig)
|
||||
Crawler->>Browser: Create new page/context
|
||||
Browser->>Page: Navigate to URL
|
||||
Page-->>Browser: Page loaded
|
||||
|
||||
Browser->>Processor: Extract raw HTML
|
||||
Processor->>Processor: Clean HTML
|
||||
Processor->>Processor: Generate markdown
|
||||
Processor->>Processor: Extract media/links
|
||||
Processor-->>Crawler: CrawlResult created
|
||||
|
||||
Crawler-->>User: Return CrawlResult
|
||||
|
||||
Note over User,Processor: All processing happens asynchronously
|
||||
```
|
||||
|
||||
### Crawling Configuration Flow
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Start Crawling] --> B{Browser Config Set?}
|
||||
|
||||
B -->|No| B1[Use Default BrowserConfig]
|
||||
B -->|Yes| B2[Custom BrowserConfig]
|
||||
|
||||
B1 --> C[Launch Browser]
|
||||
B2 --> C
|
||||
|
||||
C --> D{Crawler Run Config Set?}
|
||||
|
||||
D -->|No| D1[Use Default CrawlerRunConfig]
|
||||
D -->|Yes| D2[Custom CrawlerRunConfig]
|
||||
|
||||
D1 --> E[Navigate to URL]
|
||||
D2 --> E
|
||||
|
||||
E --> F{Page Load Success?}
|
||||
F -->|No| F1[Return Error Result]
|
||||
F -->|Yes| G[Apply Content Filters]
|
||||
|
||||
G --> G1{excluded_tags set?}
|
||||
G1 -->|Yes| G2[Remove specified tags]
|
||||
G1 -->|No| G3[Keep all tags]
|
||||
G2 --> G4{css_selector set?}
|
||||
G3 --> G4
|
||||
|
||||
G4 -->|Yes| G5[Extract selected elements]
|
||||
G4 -->|No| G6[Process full page]
|
||||
G5 --> H[Generate Markdown]
|
||||
G6 --> H
|
||||
|
||||
H --> H1{markdown_generator set?}
|
||||
H1 -->|Yes| H2[Use custom generator]
|
||||
H1 -->|No| H3[Use default generator]
|
||||
H2 --> I[Extract Media and Links]
|
||||
H3 --> I
|
||||
|
||||
I --> I1{process_iframes?}
|
||||
I1 -->|Yes| I2[Include iframe content]
|
||||
I1 -->|No| I3[Skip iframes]
|
||||
I2 --> J[Create CrawlResult]
|
||||
I3 --> J
|
||||
|
||||
J --> K[Return Result]
|
||||
|
||||
style A fill:#e1f5fe
|
||||
style K fill:#c8e6c9
|
||||
style F1 fill:#ffcdd2
|
||||
```
|
||||
|
||||
### CrawlResult Data Structure
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "CrawlResult Object"
|
||||
A[CrawlResult] --> B[Basic Info]
|
||||
A --> C[Content Variants]
|
||||
A --> D[Extracted Data]
|
||||
A --> E[Media Assets]
|
||||
A --> F[Optional Outputs]
|
||||
|
||||
B --> B1[url: Final URL]
|
||||
B --> B2[success: Boolean]
|
||||
B --> B3[status_code: HTTP Status]
|
||||
B --> B4[error_message: Error Details]
|
||||
|
||||
C --> C1[html: Raw HTML]
|
||||
C --> C2[cleaned_html: Sanitized HTML]
|
||||
C --> C3[markdown: MarkdownGenerationResult]
|
||||
|
||||
C3 --> C3A[raw_markdown: Basic conversion]
|
||||
C3 --> C3B[markdown_with_citations: With references]
|
||||
C3 --> C3C[fit_markdown: Filtered content]
|
||||
C3 --> C3D[references_markdown: Citation list]
|
||||
|
||||
D --> D1[links: Internal/External]
|
||||
D --> D2[media: Images/Videos/Audio]
|
||||
D --> D3[metadata: Page info]
|
||||
D --> D4[extracted_content: JSON data]
|
||||
D --> D5[tables: Structured table data]
|
||||
|
||||
E --> E1[screenshot: Base64 image]
|
||||
E --> E2[pdf: PDF bytes]
|
||||
E --> E3[mhtml: Archive file]
|
||||
E --> E4[downloaded_files: File paths]
|
||||
|
||||
F --> F1[session_id: Browser session]
|
||||
F --> F2[ssl_certificate: Security info]
|
||||
F --> F3[response_headers: HTTP headers]
|
||||
F --> F4[network_requests: Traffic log]
|
||||
F --> F5[console_messages: Browser logs]
|
||||
end
|
||||
|
||||
style A fill:#e3f2fd
|
||||
style C3 fill:#f3e5f5
|
||||
style D5 fill:#e8f5e8
|
||||
```
|
||||
|
||||
### Content Processing Pipeline
|
||||
|
||||
```mermaid
|
||||
flowchart LR
|
||||
subgraph "Input Sources"
|
||||
A1[Web URL]
|
||||
A2[Raw HTML]
|
||||
A3[Local File]
|
||||
end
|
||||
|
||||
A1 --> B[Browser Navigation]
|
||||
A2 --> C[Direct Processing]
|
||||
A3 --> C
|
||||
|
||||
B --> D[Raw HTML Capture]
|
||||
C --> D
|
||||
|
||||
D --> E{Content Filtering}
|
||||
|
||||
E --> E1[Remove Scripts/Styles]
|
||||
E --> E2[Apply excluded_tags]
|
||||
E --> E3[Apply css_selector]
|
||||
E --> E4[Remove overlay elements]
|
||||
|
||||
E1 --> F[Cleaned HTML]
|
||||
E2 --> F
|
||||
E3 --> F
|
||||
E4 --> F
|
||||
|
||||
F --> G{Markdown Generation}
|
||||
|
||||
G --> G1[HTML to Markdown]
|
||||
G --> G2[Apply Content Filter]
|
||||
G --> G3[Generate Citations]
|
||||
|
||||
G1 --> H[MarkdownGenerationResult]
|
||||
G2 --> H
|
||||
G3 --> H
|
||||
|
||||
F --> I{Media Extraction}
|
||||
I --> I1[Find Images]
|
||||
I --> I2[Find Videos/Audio]
|
||||
I --> I3[Score Relevance]
|
||||
I1 --> J[Media Dictionary]
|
||||
I2 --> J
|
||||
I3 --> J
|
||||
|
||||
F --> K{Link Extraction}
|
||||
K --> K1[Internal Links]
|
||||
K --> K2[External Links]
|
||||
K --> K3[Apply Link Filters]
|
||||
K1 --> L[Links Dictionary]
|
||||
K2 --> L
|
||||
K3 --> L
|
||||
|
||||
H --> M[Final CrawlResult]
|
||||
J --> M
|
||||
L --> M
|
||||
|
||||
style D fill:#e3f2fd
|
||||
style F fill:#f3e5f5
|
||||
style H fill:#e8f5e8
|
||||
style M fill:#c8e6c9
|
||||
```
|
||||
|
||||
### Table Extraction Workflow
|
||||
|
||||
```mermaid
|
||||
stateDiagram-v2
|
||||
[*] --> DetectTables
|
||||
|
||||
DetectTables --> ScoreTables: Find table elements
|
||||
|
||||
ScoreTables --> EvaluateThreshold: Calculate quality scores
|
||||
EvaluateThreshold --> PassThreshold: score >= table_score_threshold
|
||||
EvaluateThreshold --> RejectTable: score < threshold
|
||||
|
||||
PassThreshold --> ExtractHeaders: Parse table structure
|
||||
ExtractHeaders --> ExtractRows: Get header cells
|
||||
ExtractRows --> ExtractMetadata: Get data rows
|
||||
ExtractMetadata --> CreateTableObject: Get caption/summary
|
||||
|
||||
CreateTableObject --> AddToResult: {headers, rows, caption, summary}
|
||||
AddToResult --> [*]: Table extraction complete
|
||||
|
||||
RejectTable --> [*]: Table skipped
|
||||
|
||||
    note right of ScoreTables: Factors: header presence, data density, structure quality
|
||||
    note right of EvaluateThreshold: Threshold 1-10, higher = stricter
|
||||
```
|
||||
|
||||
### Error Handling Decision Tree
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Start Crawl] --> B[Navigate to URL]
|
||||
|
||||
B --> C{Navigation Success?}
|
||||
C -->|Network Error| C1[Set error_message: Network failure]
|
||||
C -->|Timeout| C2[Set error_message: Page timeout]
|
||||
C -->|Invalid URL| C3[Set error_message: Invalid URL format]
|
||||
C -->|Success| D[Process Page Content]
|
||||
|
||||
C1 --> E[success = False]
|
||||
C2 --> E
|
||||
C3 --> E
|
||||
|
||||
D --> F{Content Processing OK?}
|
||||
F -->|Parser Error| F1[Set error_message: HTML parsing failed]
|
||||
F -->|Memory Error| F2[Set error_message: Insufficient memory]
|
||||
F -->|Success| G[Generate Outputs]
|
||||
|
||||
F1 --> E
|
||||
F2 --> E
|
||||
|
||||
G --> H{Output Generation OK?}
|
||||
H -->|Markdown Error| H1[Partial success with warnings]
|
||||
H -->|Extraction Error| H2[Partial success with warnings]
|
||||
H -->|Success| I[success = True]
|
||||
|
||||
H1 --> I
|
||||
H2 --> I
|
||||
|
||||
E --> J[Return Failed CrawlResult]
|
||||
I --> K[Return Successful CrawlResult]
|
||||
|
||||
J --> L[User Error Handling]
|
||||
K --> M[User Result Processing]
|
||||
|
||||
L --> L1{Check error_message}
|
||||
L1 -->|Network| L2[Retry with different config]
|
||||
L1 -->|Timeout| L3[Increase page_timeout]
|
||||
L1 -->|Parser| L4[Try different scraping_strategy]
|
||||
|
||||
style E fill:#ffcdd2
|
||||
style I fill:#c8e6c9
|
||||
style J fill:#ffcdd2
|
||||
style K fill:#c8e6c9
|
||||
```
|
||||
|
||||
### Configuration Impact Matrix
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "Configuration Categories"
|
||||
A[Content Processing]
|
||||
B[Page Interaction]
|
||||
C[Output Generation]
|
||||
D[Performance]
|
||||
end
|
||||
|
||||
subgraph "Configuration Options"
|
||||
A --> A1[word_count_threshold]
|
||||
A --> A2[excluded_tags]
|
||||
A --> A3[css_selector]
|
||||
A --> A4[exclude_external_links]
|
||||
|
||||
B --> B1[process_iframes]
|
||||
B --> B2[remove_overlay_elements]
|
||||
B --> B3[scan_full_page]
|
||||
B --> B4[wait_for]
|
||||
|
||||
C --> C1[screenshot]
|
||||
C --> C2[pdf]
|
||||
C --> C3[markdown_generator]
|
||||
C --> C4[table_score_threshold]
|
||||
|
||||
D --> D1[cache_mode]
|
||||
D --> D2[verbose]
|
||||
D --> D3[page_timeout]
|
||||
D --> D4[semaphore_count]
|
||||
end
|
||||
|
||||
subgraph "Result Impact"
|
||||
A1 --> R1[Filters short text blocks]
|
||||
A2 --> R2[Removes specified HTML tags]
|
||||
A3 --> R3[Focuses on selected content]
|
||||
A4 --> R4[Cleans links dictionary]
|
||||
|
||||
B1 --> R5[Includes iframe content]
|
||||
B2 --> R6[Removes popups/modals]
|
||||
B3 --> R7[Loads dynamic content]
|
||||
B4 --> R8[Waits for specific elements]
|
||||
|
||||
C1 --> R9[Adds screenshot field]
|
||||
C2 --> R10[Adds pdf field]
|
||||
C3 --> R11[Custom markdown processing]
|
||||
C4 --> R12[Filters table quality]
|
||||
|
||||
D1 --> R13[Controls caching behavior]
|
||||
D2 --> R14[Detailed logging output]
|
||||
D3 --> R15[Prevents timeout errors]
|
||||
D4 --> R16[Limits concurrent operations]
|
||||
end
|
||||
|
||||
style A fill:#e3f2fd
|
||||
style B fill:#f3e5f5
|
||||
style C fill:#e8f5e8
|
||||
style D fill:#fff3e0
|
||||
```
|
||||
|
||||
### Raw HTML and Local File Processing
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant User
|
||||
participant Crawler
|
||||
participant Processor
|
||||
participant FileSystem
|
||||
|
||||
Note over User,FileSystem: Raw HTML Processing
|
||||
User->>Crawler: arun("raw://html_content")
|
||||
Crawler->>Processor: Parse raw HTML directly
|
||||
Processor->>Processor: Apply same content filters
|
||||
Processor-->>Crawler: Standard CrawlResult
|
||||
Crawler-->>User: Result with markdown
|
||||
|
||||
Note over User,FileSystem: Local File Processing
|
||||
User->>Crawler: arun("file:///path/to/file.html")
|
||||
Crawler->>FileSystem: Read local file
|
||||
FileSystem-->>Crawler: File content
|
||||
Crawler->>Processor: Process file HTML
|
||||
Processor->>Processor: Apply content processing
|
||||
Processor-->>Crawler: Standard CrawlResult
|
||||
Crawler-->>User: Result with markdown
|
||||
|
||||
Note over User,FileSystem: Both return identical CrawlResult structure
|
||||
```
|
||||
|
||||
### Comprehensive Processing Example Flow
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[Input: example.com] --> B[Create Configurations]
|
||||
|
||||
B --> B1[BrowserConfig verbose=True]
|
||||
B --> B2[CrawlerRunConfig with filters]
|
||||
|
||||
B1 --> C[Launch AsyncWebCrawler]
|
||||
B2 --> C
|
||||
|
||||
C --> D[Navigate and Process]
|
||||
|
||||
D --> E{Check Success}
|
||||
E -->|Failed| E1[Print Error Message]
|
||||
E -->|Success| F[Extract Content Summary]
|
||||
|
||||
F --> F1[Get Page Title]
|
||||
F --> F2[Get Content Preview]
|
||||
F --> F3[Process Media Items]
|
||||
F --> F4[Process Links]
|
||||
|
||||
F3 --> F3A[Count Images]
|
||||
F3 --> F3B[Show First 3 Images]
|
||||
|
||||
F4 --> F4A[Count Internal Links]
|
||||
F4 --> F4B[Show First 3 Links]
|
||||
|
||||
F1 --> G[Display Results]
|
||||
F2 --> G
|
||||
F3A --> G
|
||||
F3B --> G
|
||||
F4A --> G
|
||||
F4B --> G
|
||||
|
||||
E1 --> H[End with Error]
|
||||
G --> I[End with Success]
|
||||
|
||||
style E1 fill:#ffcdd2
|
||||
style G fill:#c8e6c9
|
||||
style H fill:#ffcdd2
|
||||
style I fill:#c8e6c9
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Simple Crawling Guide](https://docs.crawl4ai.com/core/simple-crawling/), [Configuration Options](https://docs.crawl4ai.com/core/browser-crawler-config/), [Result Processing](https://docs.crawl4ai.com/core/crawler-result/), [Table Extraction](https://docs.crawl4ai.com/extraction/no-llm-strategies/)
|
||||
441
docs/md_v2/assets/llm.txt/diagrams/url_seeder.txt
Normal file
441
docs/md_v2/assets/llm.txt/diagrams/url_seeder.txt
Normal file
@@ -0,0 +1,441 @@
|
||||
## URL Seeding Workflows and Architecture
|
||||
|
||||
Visual representations of URL discovery strategies, filtering pipelines, and smart crawling workflows.
|
||||
|
||||
### URL Seeding vs Deep Crawling Strategy Comparison
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "Deep Crawling Approach"
|
||||
A1[Start URL] --> A2[Load Page]
|
||||
A2 --> A3[Extract Links]
|
||||
A3 --> A4{More Links?}
|
||||
A4 -->|Yes| A5[Queue Next Page]
|
||||
A5 --> A2
|
||||
A4 -->|No| A6[Complete]
|
||||
|
||||
A7[⏱️ Real-time Discovery]
|
||||
A8[🐌 Sequential Processing]
|
||||
A9[🔍 Limited by Page Structure]
|
||||
A10[💾 High Memory Usage]
|
||||
end
|
||||
|
||||
subgraph "URL Seeding Approach"
|
||||
B1[Domain Input] --> B2[Query Sitemap]
|
||||
B1 --> B3[Query Common Crawl]
|
||||
B2 --> B4[Merge Results]
|
||||
B3 --> B4
|
||||
B4 --> B5[Apply Filters]
|
||||
B5 --> B6[Score Relevance]
|
||||
B6 --> B7[Rank Results]
|
||||
B7 --> B8[Select Top URLs]
|
||||
|
||||
B9[⚡ Instant Discovery]
|
||||
B10[🚀 Parallel Processing]
|
||||
B11[🎯 Pattern-based Filtering]
|
||||
B12[💡 Smart Relevance Scoring]
|
||||
end
|
||||
|
||||
style A1 fill:#ffecb3
|
||||
style B1 fill:#e8f5e8
|
||||
style A6 fill:#ffcdd2
|
||||
style B8 fill:#c8e6c9
|
||||
```
|
||||
|
||||
### URL Discovery Data Flow
|
||||
|
||||
```mermaid
|
||||
sequenceDiagram
|
||||
participant User
|
||||
participant Seeder as AsyncUrlSeeder
|
||||
participant SM as Sitemap
|
||||
participant CC as Common Crawl
|
||||
participant Filter as URL Filter
|
||||
participant Scorer as BM25 Scorer
|
||||
|
||||
User->>Seeder: urls("example.com", config)
|
||||
|
||||
par Parallel Data Sources
|
||||
Seeder->>SM: Fetch sitemap.xml
|
||||
SM-->>Seeder: 500 URLs
|
||||
and
|
||||
Seeder->>CC: Query Common Crawl
|
||||
CC-->>Seeder: 2000 URLs
|
||||
end
|
||||
|
||||
Seeder->>Seeder: Merge and deduplicate
|
||||
Note over Seeder: 2200 unique URLs
|
||||
|
||||
Seeder->>Filter: Apply pattern filter
|
||||
Filter-->>Seeder: 800 matching URLs
|
||||
|
||||
alt extract_head=True
|
||||
loop For each URL
|
||||
Seeder->>Seeder: Extract <head> metadata
|
||||
end
|
||||
Note over Seeder: Title, description, keywords
|
||||
end
|
||||
|
||||
alt query provided
|
||||
Seeder->>Scorer: Calculate relevance scores
|
||||
Scorer-->>Seeder: Scored URLs
|
||||
Seeder->>Seeder: Filter by score_threshold
|
||||
Note over Seeder: 200 relevant URLs
|
||||
end
|
||||
|
||||
Seeder->>Seeder: Sort by relevance
|
||||
Seeder->>Seeder: Apply max_urls limit
|
||||
Seeder-->>User: Top 100 URLs ready for crawling
|
||||
```
|
||||
|
||||
### SeedingConfig Decision Tree
|
||||
|
||||
```mermaid
|
||||
flowchart TD
|
||||
A[SeedingConfig Setup] --> B{Data Source Strategy?}
|
||||
|
||||
B -->|Fast & Official| C[source="sitemap"]
|
||||
B -->|Comprehensive| D[source="cc"]
|
||||
B -->|Maximum Coverage| E[source="sitemap+cc"]
|
||||
|
||||
C --> F{Need Filtering?}
|
||||
D --> F
|
||||
E --> F
|
||||
|
||||
F -->|Yes| G[Set URL Pattern]
|
||||
F -->|No| H[pattern="*"]
|
||||
|
||||
G --> I{Pattern Examples}
|
||||
I --> I1[pattern="*/blog/*"]
|
||||
I --> I2[pattern="*/docs/api/*"]
|
||||
I --> I3[pattern="*.pdf"]
|
||||
I --> I4[pattern="*/product/*"]
|
||||
|
||||
H --> J{Need Metadata?}
|
||||
I1 --> J
|
||||
I2 --> J
|
||||
I3 --> J
|
||||
I4 --> J
|
||||
|
||||
J -->|Yes| K[extract_head=True]
|
||||
J -->|No| L[extract_head=False]
|
||||
|
||||
K --> M{Need Validation?}
|
||||
L --> M
|
||||
|
||||
M -->|Yes| N[live_check=True]
|
||||
M -->|No| O[live_check=False]
|
||||
|
||||
N --> P{Need Relevance Scoring?}
|
||||
O --> P
|
||||
|
||||
P -->|Yes| Q[Set Query + BM25]
|
||||
P -->|No| R[Skip Scoring]
|
||||
|
||||
Q --> S[query="search terms"]
|
||||
S --> T[scoring_method="bm25"]
|
||||
T --> U[score_threshold=0.3]
|
||||
|
||||
R --> V[Performance Tuning]
|
||||
U --> V
|
||||
|
||||
V --> W[Set max_urls]
|
||||
W --> X[Set concurrency]
|
||||
X --> Y[Set hits_per_sec]
|
||||
Y --> Z[Configuration Complete]
|
||||
|
||||
style A fill:#e3f2fd
|
||||
style Z fill:#c8e6c9
|
||||
style K fill:#fff3e0
|
||||
style N fill:#fff3e0
|
||||
style Q fill:#f3e5f5
|
||||
```
|
||||
|
||||
### BM25 Relevance Scoring Pipeline
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "Text Corpus Preparation"
|
||||
A1[URL Collection] --> A2[Extract Metadata]
|
||||
A2 --> A3[Title + Description + Keywords]
|
||||
A3 --> A4[Tokenize Text]
|
||||
A4 --> A5[Remove Stop Words]
|
||||
A5 --> A6[Create Document Corpus]
|
||||
end
|
||||
|
||||
subgraph "BM25 Algorithm"
|
||||
B1[Query Terms] --> B2[Term Frequency Calculation]
|
||||
A6 --> B2
|
||||
B2 --> B3[Inverse Document Frequency]
|
||||
B3 --> B4[BM25 Score Calculation]
|
||||
    B4 --> B5[Score = Σ(IDF × TF × (k1+1))/(TF + k1×(1-b+b×|d|/avgdl))]
|
||||
end
|
||||
|
||||
subgraph "Scoring Results"
|
||||
B5 --> C1[URL Relevance Scores]
|
||||
C1 --> C2{Score ≥ Threshold?}
|
||||
C2 -->|Yes| C3[Include in Results]
|
||||
C2 -->|No| C4[Filter Out]
|
||||
C3 --> C5[Sort by Score DESC]
|
||||
C5 --> C6[Return Top URLs]
|
||||
end
|
||||
|
||||
subgraph "Example Scores"
|
||||
D1["python async tutorial" → 0.85]
|
||||
D2["python documentation" → 0.72]
|
||||
D3["javascript guide" → 0.23]
|
||||
D4["contact us page" → 0.05]
|
||||
end
|
||||
|
||||
style B5 fill:#e3f2fd
|
||||
style C6 fill:#c8e6c9
|
||||
style D1 fill:#c8e6c9
|
||||
style D2 fill:#c8e6c9
|
||||
style D3 fill:#ffecb3
|
||||
style D4 fill:#ffcdd2
|
||||
```
|
||||
|
||||
### Multi-Domain Discovery Architecture
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "Input Layer"
|
||||
A1[Domain List]
|
||||
A2[SeedingConfig]
|
||||
A3[Query Terms]
|
||||
end
|
||||
|
||||
subgraph "Discovery Engine"
|
||||
B1[AsyncUrlSeeder]
|
||||
B2[Parallel Workers]
|
||||
B3[Rate Limiter]
|
||||
B4[Memory Manager]
|
||||
end
|
||||
|
||||
subgraph "Data Sources"
|
||||
C1[Sitemap Fetcher]
|
||||
C2[Common Crawl API]
|
||||
C3[Live URL Checker]
|
||||
C4[Metadata Extractor]
|
||||
end
|
||||
|
||||
subgraph "Processing Pipeline"
|
||||
D1[URL Deduplication]
|
||||
D2[Pattern Filtering]
|
||||
D3[Relevance Scoring]
|
||||
D4[Quality Assessment]
|
||||
end
|
||||
|
||||
subgraph "Output Layer"
|
||||
E1[Scored URL Lists]
|
||||
E2[Domain Statistics]
|
||||
E3[Performance Metrics]
|
||||
E4[Cache Storage]
|
||||
end
|
||||
|
||||
A1 --> B1
|
||||
A2 --> B1
|
||||
A3 --> B1
|
||||
|
||||
B1 --> B2
|
||||
B2 --> B3
|
||||
B3 --> B4
|
||||
|
||||
B2 --> C1
|
||||
B2 --> C2
|
||||
B2 --> C3
|
||||
B2 --> C4
|
||||
|
||||
C1 --> D1
|
||||
C2 --> D1
|
||||
C3 --> D2
|
||||
C4 --> D3
|
||||
|
||||
D1 --> D2
|
||||
D2 --> D3
|
||||
D3 --> D4
|
||||
|
||||
D4 --> E1
|
||||
B4 --> E2
|
||||
B3 --> E3
|
||||
D1 --> E4
|
||||
|
||||
style B1 fill:#e3f2fd
|
||||
style D3 fill:#f3e5f5
|
||||
style E1 fill:#c8e6c9
|
||||
```
|
||||
|
||||
### Complete Discovery-to-Crawl Pipeline
|
||||
|
||||
```mermaid
|
||||
stateDiagram-v2
|
||||
[*] --> Discovery
|
||||
|
||||
Discovery --> SourceSelection: Configure data sources
|
||||
SourceSelection --> Sitemap: source="sitemap"
|
||||
SourceSelection --> CommonCrawl: source="cc"
|
||||
SourceSelection --> Both: source="sitemap+cc"
|
||||
|
||||
Sitemap --> URLCollection
|
||||
CommonCrawl --> URLCollection
|
||||
Both --> URLCollection
|
||||
|
||||
URLCollection --> Filtering: Apply patterns
|
||||
Filtering --> MetadataExtraction: extract_head=True
|
||||
Filtering --> LiveValidation: extract_head=False
|
||||
|
||||
MetadataExtraction --> LiveValidation: live_check=True
|
||||
MetadataExtraction --> RelevanceScoring: live_check=False
|
||||
LiveValidation --> RelevanceScoring
|
||||
|
||||
RelevanceScoring --> ResultRanking: query provided
|
||||
RelevanceScoring --> ResultLimiting: no query
|
||||
|
||||
ResultRanking --> ResultLimiting: apply score_threshold
|
||||
ResultLimiting --> URLSelection: apply max_urls
|
||||
|
||||
URLSelection --> CrawlPreparation: URLs ready
|
||||
CrawlPreparation --> CrawlExecution: AsyncWebCrawler
|
||||
|
||||
CrawlExecution --> StreamProcessing: stream=True
|
||||
CrawlExecution --> BatchProcessing: stream=False
|
||||
|
||||
StreamProcessing --> [*]
|
||||
BatchProcessing --> [*]
|
||||
|
||||
note right of Discovery : 🔍 Smart URL Discovery
|
||||
note right of URLCollection : 📚 Merge & Deduplicate
|
||||
note right of RelevanceScoring : 🎯 BM25 Algorithm
|
||||
note right of CrawlExecution : 🕷️ High-Performance Crawling
|
||||
```
|
||||
|
||||
### Performance Optimization Strategies
|
||||
|
||||
```mermaid
|
||||
graph LR
|
||||
subgraph "Input Optimization"
|
||||
A1[Smart Source Selection] --> A2[Sitemap First]
|
||||
A2 --> A3[Add CC if Needed]
|
||||
A3 --> A4[Pattern Filtering Early]
|
||||
end
|
||||
|
||||
subgraph "Processing Optimization"
|
||||
B1[Parallel Workers] --> B2[Bounded Queues]
|
||||
B2 --> B3[Rate Limiting]
|
||||
B3 --> B4[Memory Management]
|
||||
B4 --> B5[Lazy Evaluation]
|
||||
end
|
||||
|
||||
subgraph "Output Optimization"
|
||||
C1[Relevance Threshold] --> C2[Max URL Limits]
|
||||
C2 --> C3[Caching Strategy]
|
||||
C3 --> C4[Streaming Results]
|
||||
end
|
||||
|
||||
subgraph "Performance Metrics"
|
||||
D1[URLs/Second: 100-1000]
|
||||
D2[Memory Usage: Bounded]
|
||||
D3[Network Efficiency: 95%+]
|
||||
D4[Cache Hit Rate: 80%+]
|
||||
end
|
||||
|
||||
A4 --> B1
|
||||
B5 --> C1
|
||||
C4 --> D1
|
||||
|
||||
style A2 fill:#e8f5e8
|
||||
style B2 fill:#e3f2fd
|
||||
style C3 fill:#f3e5f5
|
||||
style D3 fill:#c8e6c9
|
||||
```
|
||||
|
||||
### URL Discovery vs Traditional Crawling Comparison
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "Traditional Approach"
|
||||
T1[Start URL] --> T2[Crawl Page]
|
||||
T2 --> T3[Extract Links]
|
||||
T3 --> T4[Queue New URLs]
|
||||
T4 --> T2
|
||||
T5[❌ Time: Hours/Days]
|
||||
T6[❌ Resource Heavy]
|
||||
T7[❌ Depth Limited]
|
||||
T8[❌ Discovery Bias]
|
||||
end
|
||||
|
||||
subgraph "URL Seeding Approach"
|
||||
S1[Domain Input] --> S2[Query All Sources]
|
||||
S2 --> S3[Pattern Filter]
|
||||
S3 --> S4[Relevance Score]
|
||||
S4 --> S5[Select Best URLs]
|
||||
S5 --> S6[Ready to Crawl]
|
||||
|
||||
S7[✅ Time: Seconds/Minutes]
|
||||
S8[✅ Resource Efficient]
|
||||
S9[✅ Complete Coverage]
|
||||
S10[✅ Quality Focused]
|
||||
end
|
||||
|
||||
subgraph "Use Case Decision Matrix"
|
||||
U1[Small Sites < 1000 pages] --> U2[Use Deep Crawling]
|
||||
U3[Large Sites > 10000 pages] --> U4[Use URL Seeding]
|
||||
U5[Unknown Structure] --> U6[Start with Seeding]
|
||||
U7[Real-time Discovery] --> U8[Use Deep Crawling]
|
||||
U9[Quality over Quantity] --> U10[Use URL Seeding]
|
||||
end
|
||||
|
||||
style S6 fill:#c8e6c9
|
||||
style S7 fill:#c8e6c9
|
||||
style S8 fill:#c8e6c9
|
||||
style S9 fill:#c8e6c9
|
||||
style S10 fill:#c8e6c9
|
||||
style T5 fill:#ffcdd2
|
||||
style T6 fill:#ffcdd2
|
||||
style T7 fill:#ffcdd2
|
||||
style T8 fill:#ffcdd2
|
||||
```
|
||||
|
||||
### Data Source Characteristics and Selection
|
||||
|
||||
```mermaid
|
||||
graph TB
|
||||
subgraph "Sitemap Source"
|
||||
SM1[📋 Official URL List]
|
||||
SM2[⚡ Fast Response]
|
||||
SM3[📅 Recently Updated]
|
||||
SM4[🎯 High Quality URLs]
|
||||
SM5[❌ May Miss Some Pages]
|
||||
end
|
||||
|
||||
subgraph "Common Crawl Source"
|
||||
CC1[🌐 Comprehensive Coverage]
|
||||
CC2[📚 Historical Data]
|
||||
CC3[🔍 Deep Discovery]
|
||||
CC4[⏳ Slower Response]
|
||||
CC5[🧹 May Include Noise]
|
||||
end
|
||||
|
||||
subgraph "Combined Strategy"
|
||||
CB1[🚀 Best of Both]
|
||||
CB2[📊 Maximum Coverage]
|
||||
CB3[✨ Automatic Deduplication]
|
||||
CB4[⚖️ Balanced Performance]
|
||||
end
|
||||
|
||||
subgraph "Selection Guidelines"
|
||||
G1[Speed Critical → Sitemap Only]
|
||||
G2[Coverage Critical → Common Crawl]
|
||||
G3[Best Quality → Combined]
|
||||
G4[Unknown Domain → Combined]
|
||||
end
|
||||
|
||||
style SM2 fill:#c8e6c9
|
||||
style SM4 fill:#c8e6c9
|
||||
style CC1 fill:#e3f2fd
|
||||
style CC3 fill:#e3f2fd
|
||||
style CB1 fill:#f3e5f5
|
||||
style CB3 fill:#f3e5f5
|
||||
```
|
||||
|
||||
**📖 Learn more:** [URL Seeding Guide](https://docs.crawl4ai.com/core/url-seeding/), [Performance Optimization](https://docs.crawl4ai.com/advanced/optimization/), [Multi-URL Crawling](https://docs.crawl4ai.com/advanced/multi-url-crawling/)
|
||||
295
docs/md_v2/assets/llm.txt/txt/cli.txt
Normal file
295
docs/md_v2/assets/llm.txt/txt/cli.txt
Normal file
@@ -0,0 +1,295 @@
|
||||
## CLI & Identity-Based Browsing
|
||||
|
||||
Command-line interface for web crawling with persistent browser profiles, authentication, and identity management.
|
||||
|
||||
### Basic CLI Usage
|
||||
|
||||
```bash
|
||||
# Simple crawling
|
||||
crwl https://example.com
|
||||
|
||||
# Get markdown output
|
||||
crwl https://example.com -o markdown
|
||||
|
||||
# JSON output with cache bypass
|
||||
crwl https://example.com -o json --bypass-cache
|
||||
|
||||
# Verbose mode with specific browser settings
|
||||
crwl https://example.com -b "headless=false,viewport_width=1280" -v
|
||||
```
|
||||
|
||||
### Profile Management Commands
|
||||
|
||||
```bash
|
||||
# Launch interactive profile manager
|
||||
crwl profiles
|
||||
|
||||
# Create, list, and manage browser profiles
|
||||
# This opens a menu where you can:
|
||||
# 1. List existing profiles
|
||||
# 2. Create new profile (opens browser for setup)
|
||||
# 3. Delete profiles
|
||||
# 4. Use profile to crawl a website
|
||||
|
||||
# Use a specific profile for crawling
|
||||
crwl https://example.com -p my-profile-name
|
||||
|
||||
# Example workflow for authenticated sites:
|
||||
# 1. Create profile and log in
|
||||
crwl profiles # Select "Create new profile"
|
||||
# 2. Use profile for crawling authenticated content
|
||||
crwl https://site-requiring-login.com/dashboard -p my-profile-name
|
||||
```
|
||||
|
||||
### CDP Browser Management
|
||||
|
||||
```bash
|
||||
# Launch browser with CDP debugging (default port 9222)
|
||||
crwl cdp
|
||||
|
||||
# Use specific profile and custom port
|
||||
crwl cdp -p my-profile -P 9223
|
||||
|
||||
# Launch headless browser with CDP
|
||||
crwl cdp --headless
|
||||
|
||||
# Launch in incognito mode (ignores profile)
|
||||
crwl cdp --incognito
|
||||
|
||||
# Use custom user data directory
|
||||
crwl cdp --user-data-dir ~/my-browser-data --port 9224
|
||||
```
|
||||
|
||||
### Builtin Browser Management
|
||||
|
||||
```bash
|
||||
# Start persistent browser instance
|
||||
crwl browser start
|
||||
|
||||
# Check browser status
|
||||
crwl browser status
|
||||
|
||||
# Open visible window to see the browser
|
||||
crwl browser view --url https://example.com
|
||||
|
||||
# Stop the browser
|
||||
crwl browser stop
|
||||
|
||||
# Restart with different options
|
||||
crwl browser restart --browser-type chromium --port 9223 --no-headless
|
||||
|
||||
# Use builtin browser in crawling
|
||||
crwl https://example.com -b "browser_mode=builtin"
|
||||
```
|
||||
|
||||
### Authentication Workflow Examples
|
||||
|
||||
```bash
|
||||
# Complete workflow for LinkedIn scraping
|
||||
# 1. Create authenticated profile
|
||||
crwl profiles
|
||||
# Select "Create new profile" → login to LinkedIn in browser → press 'q' to save
|
||||
|
||||
# 2. Use profile for crawling
|
||||
crwl https://linkedin.com/in/someone -p linkedin-profile -o markdown
|
||||
|
||||
# 3. Extract structured data with authentication
|
||||
crwl https://linkedin.com/search/results/people/ \
|
||||
-p linkedin-profile \
|
||||
-j "Extract people profiles with names, titles, and companies" \
|
||||
-b "headless=false"
|
||||
|
||||
# GitHub authenticated crawling
|
||||
crwl profiles # Create github-profile
|
||||
crwl https://github.com/settings/profile -p github-profile
|
||||
|
||||
# Twitter/X authenticated access
|
||||
crwl profiles # Create twitter-profile
|
||||
crwl https://twitter.com/home -p twitter-profile -o markdown
|
||||
```
|
||||
|
||||
### Advanced CLI Configuration
|
||||
|
||||
```bash
|
||||
# Complex crawling with multiple configs
|
||||
crwl https://example.com \
|
||||
-B browser.yml \
|
||||
-C crawler.yml \
|
||||
-e extract_llm.yml \
|
||||
-s llm_schema.json \
|
||||
-p my-auth-profile \
|
||||
-o json \
|
||||
-v
|
||||
|
||||
# Quick LLM extraction with authentication
|
||||
crwl https://private-site.com/dashboard \
|
||||
-p auth-profile \
|
||||
-j "Extract user dashboard data including metrics and notifications" \
|
||||
-b "headless=true,viewport_width=1920"
|
||||
|
||||
# Content filtering with authentication
|
||||
crwl https://members-only-site.com \
|
||||
-p member-profile \
|
||||
-f filter_bm25.yml \
|
||||
-c "css_selector=.member-content,scan_full_page=true" \
|
||||
-o markdown-fit
|
||||
```
|
||||
|
||||
### Configuration Files for Identity Browsing
|
||||
|
||||
```yaml
|
||||
# browser_auth.yml
|
||||
headless: false
|
||||
use_managed_browser: true
|
||||
user_data_dir: "/path/to/profile"
|
||||
viewport_width: 1280
|
||||
viewport_height: 720
|
||||
simulate_user: true
|
||||
override_navigator: true
|
||||
|
||||
# crawler_auth.yml
|
||||
magic: true
|
||||
remove_overlay_elements: true
|
||||
simulate_user: true
|
||||
wait_for: "css:.authenticated-content"
|
||||
page_timeout: 60000
|
||||
delay_before_return_html: 2
|
||||
scan_full_page: true
|
||||
```
|
||||
|
||||
### Global Configuration Management
|
||||
|
||||
```bash
|
||||
# List all configuration settings
|
||||
crwl config list
|
||||
|
||||
# Set default LLM provider
|
||||
crwl config set DEFAULT_LLM_PROVIDER "anthropic/claude-3-sonnet"
|
||||
crwl config set DEFAULT_LLM_PROVIDER_TOKEN "your-api-token"
|
||||
|
||||
# Set browser defaults
|
||||
crwl config set BROWSER_HEADLESS false # Always show browser
|
||||
crwl config set USER_AGENT_MODE random # Random user agents
|
||||
|
||||
# Enable verbose mode globally
|
||||
crwl config set VERBOSE true
|
||||
```
|
||||
|
||||
### Q&A with Authenticated Content
|
||||
|
||||
```bash
|
||||
# Ask questions about authenticated content
|
||||
crwl https://private-dashboard.com -p dashboard-profile \
|
||||
-q "What are the key metrics shown in my dashboard?"
|
||||
|
||||
# Multiple questions workflow
|
||||
crwl https://company-intranet.com -p work-profile -o markdown # View content
|
||||
crwl https://company-intranet.com -p work-profile \
|
||||
-q "Summarize this week's announcements"
|
||||
crwl https://company-intranet.com -p work-profile \
|
||||
-q "What are the upcoming deadlines?"
|
||||
```
|
||||
|
||||
### Profile Creation Programmatically
|
||||
|
||||
```python
|
||||
# Create profiles via Python API
|
||||
import asyncio
|
||||
from crawl4ai import BrowserProfiler
|
||||
|
||||
async def create_auth_profile():
|
||||
profiler = BrowserProfiler()
|
||||
|
||||
# Create profile interactively (opens browser)
|
||||
profile_path = await profiler.create_profile("linkedin-auth")
|
||||
print(f"Profile created at: {profile_path}")
|
||||
|
||||
# List all profiles
|
||||
profiles = profiler.list_profiles()
|
||||
for profile in profiles:
|
||||
print(f"Profile: {profile['name']} at {profile['path']}")
|
||||
|
||||
# Use profile for crawling
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
use_managed_browser=True,
|
||||
user_data_dir=profile_path
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun("https://linkedin.com/feed")
|
||||
return result
|
||||
|
||||
# asyncio.run(create_auth_profile())
|
||||
```
|
||||
|
||||
### Identity Browsing Best Practices
|
||||
|
||||
```bash
|
||||
# 1. Create specific profiles for different sites
|
||||
crwl profiles # Create "linkedin-work"
|
||||
crwl profiles # Create "github-personal"
|
||||
crwl profiles # Create "company-intranet"
|
||||
|
||||
# 2. Use descriptive profile names
|
||||
crwl https://site1.com -p site1-admin-account
|
||||
crwl https://site2.com -p site2-user-account
|
||||
|
||||
# 3. Combine with appropriate browser settings
|
||||
crwl https://secure-site.com \
|
||||
-p secure-profile \
|
||||
-b "headless=false,simulate_user=true,magic=true" \
|
||||
-c "wait_for=.logged-in-indicator,page_timeout=30000"
|
||||
|
||||
# 4. Test profile before automated crawling
|
||||
crwl cdp -p test-profile # Manually verify login status
|
||||
crwl https://test-url.com -p test-profile -v # Verbose test crawl
|
||||
```
|
||||
|
||||
### Troubleshooting Authentication Issues
|
||||
|
||||
```bash
|
||||
# Debug authentication problems
|
||||
crwl https://auth-site.com -p auth-profile \
|
||||
-b "headless=false,verbose=true" \
|
||||
-c "verbose=true,page_timeout=60000" \
|
||||
-v
|
||||
|
||||
# Check profile status
|
||||
crwl profiles # List profiles and check creation dates
|
||||
|
||||
# Recreate problematic profiles
|
||||
crwl profiles # Delete old profile, create new one
|
||||
|
||||
# Test with visible browser
|
||||
crwl https://problem-site.com -p profile-name \
|
||||
-b "headless=false" \
|
||||
-c "delay_before_return_html=5"
|
||||
```
|
||||
|
||||
### Common Use Cases
|
||||
|
||||
```bash
|
||||
# Social media monitoring (after authentication)
|
||||
crwl https://twitter.com/home -p twitter-monitor \
|
||||
-j "Extract latest tweets with sentiment and engagement metrics"
|
||||
|
||||
# E-commerce competitor analysis (with account access)
|
||||
crwl https://competitor-site.com/products -p competitor-account \
|
||||
-j "Extract product prices, availability, and descriptions"
|
||||
|
||||
# Company dashboard monitoring
|
||||
crwl https://company-dashboard.com -p work-profile \
|
||||
-c "css_selector=.dashboard-content" \
|
||||
-q "What alerts or notifications need attention?"
|
||||
|
||||
# Research data collection (authenticated access)
|
||||
crwl https://research-platform.com/data -p research-profile \
|
||||
-e extract_research.yml \
|
||||
-s research_schema.json \
|
||||
-o json
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Identity-Based Crawling Documentation](https://docs.crawl4ai.com/advanced/identity-based-crawling/), [Browser Profile Management](https://docs.crawl4ai.com/advanced/session-management/), [CLI Examples](https://docs.crawl4ai.com/core/cli/)
|
||||
1171
docs/md_v2/assets/llm.txt/txt/config_objects.txt
Normal file
1171
docs/md_v2/assets/llm.txt/txt/config_objects.txt
Normal file
File diff suppressed because it is too large
Load Diff
@@ -0,0 +1,446 @@
|
||||
## Deep Crawling Filters & Scorers
|
||||
|
||||
Advanced URL filtering and scoring strategies for intelligent deep crawling with performance optimization.
|
||||
|
||||
### URL Filters - Content and Domain Control
|
||||
|
||||
```python
|
||||
from crawl4ai.deep_crawling.filters import (
|
||||
URLPatternFilter, DomainFilter, ContentTypeFilter,
|
||||
FilterChain, ContentRelevanceFilter, SEOFilter
|
||||
)
|
||||
|
||||
# Pattern-based filtering
|
||||
pattern_filter = URLPatternFilter(
|
||||
patterns=[
|
||||
"*.html", # HTML pages only
|
||||
"*/blog/*", # Blog posts
|
||||
"*/articles/*", # Article pages
|
||||
"*2024*", # Recent content
|
||||
"^https://example.com/docs/.*" # Regex pattern
|
||||
],
|
||||
use_glob=True,
|
||||
reverse=False # False = include matching, True = exclude matching
|
||||
)
|
||||
|
||||
# Domain filtering with subdomains
|
||||
domain_filter = DomainFilter(
|
||||
allowed_domains=["example.com", "docs.example.com"],
|
||||
blocked_domains=["ads.example.com", "tracker.com"]
|
||||
)
|
||||
|
||||
# Content type filtering
|
||||
content_filter = ContentTypeFilter(
|
||||
allowed_types=["text/html", "application/pdf"],
|
||||
check_extension=True
|
||||
)
|
||||
|
||||
# Apply individual filters
|
||||
url = "https://example.com/blog/2024/article.html"
|
||||
print(f"Pattern filter: {pattern_filter.apply(url)}")
|
||||
print(f"Domain filter: {domain_filter.apply(url)}")
|
||||
print(f"Content filter: {content_filter.apply(url)}")
|
||||
```
|
||||
|
||||
### Filter Chaining - Combine Multiple Filters
|
||||
|
||||
```python
|
||||
# Create filter chain for comprehensive filtering
|
||||
filter_chain = FilterChain([
|
||||
DomainFilter(allowed_domains=["example.com"]),
|
||||
URLPatternFilter(patterns=["*/blog/*", "*/docs/*"]),
|
||||
ContentTypeFilter(allowed_types=["text/html"])
|
||||
])
|
||||
|
||||
# Apply chain to URLs
|
||||
urls = [
|
||||
"https://example.com/blog/post1.html",
|
||||
"https://spam.com/content.html",
|
||||
"https://example.com/blog/image.jpg",
|
||||
"https://example.com/docs/guide.html"
|
||||
]
|
||||
|
||||
async def filter_urls(urls, filter_chain):
|
||||
filtered = []
|
||||
for url in urls:
|
||||
if await filter_chain.apply(url):
|
||||
filtered.append(url)
|
||||
return filtered
|
||||
|
||||
# Usage
|
||||
filtered_urls = await filter_urls(urls, filter_chain)
|
||||
print(f"Filtered URLs: {filtered_urls}")
|
||||
|
||||
# Check filter statistics
|
||||
for filter_obj in filter_chain.filters:
|
||||
stats = filter_obj.stats
|
||||
print(f"{filter_obj.name}: {stats.passed_urls}/{stats.total_urls} passed")
|
||||
```
|
||||
|
||||
### Advanced Content Filters
|
||||
|
||||
```python
|
||||
# BM25-based content relevance filtering
|
||||
relevance_filter = ContentRelevanceFilter(
|
||||
query="python machine learning tutorial",
|
||||
threshold=0.5, # Minimum relevance score
|
||||
k1=1.2, # TF saturation parameter
|
||||
b=0.75, # Length normalization
|
||||
avgdl=1000 # Average document length
|
||||
)
|
||||
|
||||
# SEO quality filtering
|
||||
seo_filter = SEOFilter(
|
||||
threshold=0.65, # Minimum SEO score
|
||||
keywords=["python", "tutorial", "guide"],
|
||||
weights={
|
||||
"title_length": 0.15,
|
||||
"title_kw": 0.18,
|
||||
"meta_description": 0.12,
|
||||
"canonical": 0.10,
|
||||
"robot_ok": 0.20,
|
||||
"schema_org": 0.10,
|
||||
"url_quality": 0.15
|
||||
}
|
||||
)
|
||||
|
||||
# Apply advanced filters
|
||||
url = "https://example.com/python-ml-tutorial"
|
||||
relevance_score = await relevance_filter.apply(url)
|
||||
seo_score = await seo_filter.apply(url)
|
||||
|
||||
print(f"Relevance: {relevance_score}, SEO: {seo_score}")
|
||||
```
|
||||
|
||||
### URL Scorers - Quality and Relevance Scoring
|
||||
|
||||
```python
|
||||
from crawl4ai.deep_crawling.scorers import (
|
||||
KeywordRelevanceScorer, PathDepthScorer, ContentTypeScorer,
|
||||
FreshnessScorer, DomainAuthorityScorer, CompositeScorer
|
||||
)
|
||||
|
||||
# Keyword relevance scoring
|
||||
keyword_scorer = KeywordRelevanceScorer(
|
||||
keywords=["python", "tutorial", "guide", "machine", "learning"],
|
||||
weight=1.0,
|
||||
case_sensitive=False
|
||||
)
|
||||
|
||||
# Path depth scoring (optimal depth = 3)
|
||||
depth_scorer = PathDepthScorer(
|
||||
optimal_depth=3, # /category/subcategory/article
|
||||
weight=0.8
|
||||
)
|
||||
|
||||
# Content type scoring
|
||||
content_type_scorer = ContentTypeScorer(
|
||||
type_weights={
|
||||
"html": 1.0, # Highest priority
|
||||
"pdf": 0.8, # Medium priority
|
||||
"txt": 0.6, # Lower priority
|
||||
"doc": 0.4 # Lowest priority
|
||||
},
|
||||
weight=0.9
|
||||
)
|
||||
|
||||
# Freshness scoring
|
||||
freshness_scorer = FreshnessScorer(
|
||||
weight=0.7,
|
||||
current_year=2024
|
||||
)
|
||||
|
||||
# Domain authority scoring
|
||||
domain_scorer = DomainAuthorityScorer(
|
||||
domain_weights={
|
||||
"python.org": 1.0,
|
||||
"github.com": 0.9,
|
||||
"stackoverflow.com": 0.85,
|
||||
"medium.com": 0.7,
|
||||
"personal-blog.com": 0.3
|
||||
},
|
||||
default_weight=0.5,
|
||||
weight=1.0
|
||||
)
|
||||
|
||||
# Score individual URLs
|
||||
url = "https://python.org/tutorial/2024/machine-learning.html"
|
||||
scores = {
|
||||
"keyword": keyword_scorer.score(url),
|
||||
"depth": depth_scorer.score(url),
|
||||
"content": content_type_scorer.score(url),
|
||||
"freshness": freshness_scorer.score(url),
|
||||
"domain": domain_scorer.score(url)
|
||||
}
|
||||
|
||||
print(f"Individual scores: {scores}")
|
||||
```
|
||||
|
||||
### Composite Scoring - Combine Multiple Scorers
|
||||
|
||||
```python
|
||||
# Create composite scorer combining all strategies
|
||||
composite_scorer = CompositeScorer(
|
||||
scorers=[
|
||||
KeywordRelevanceScorer(["python", "tutorial"], weight=1.5),
|
||||
PathDepthScorer(optimal_depth=3, weight=1.0),
|
||||
ContentTypeScorer({"html": 1.0, "pdf": 0.8}, weight=1.2),
|
||||
FreshnessScorer(weight=0.8, current_year=2024),
|
||||
DomainAuthorityScorer({
|
||||
"python.org": 1.0,
|
||||
"github.com": 0.9
|
||||
}, weight=1.3)
|
||||
],
|
||||
normalize=True # Normalize by number of scorers
|
||||
)
|
||||
|
||||
# Score multiple URLs
|
||||
urls_to_score = [
|
||||
"https://python.org/tutorial/2024/basics.html",
|
||||
"https://github.com/user/python-guide/blob/main/README.md",
|
||||
"https://random-blog.com/old/2018/python-stuff.html",
|
||||
"https://python.org/docs/deep/nested/advanced/guide.html"
|
||||
]
|
||||
|
||||
scored_urls = []
|
||||
for url in urls_to_score:
|
||||
score = composite_scorer.score(url)
|
||||
scored_urls.append((url, score))
|
||||
|
||||
# Sort by score (highest first)
|
||||
scored_urls.sort(key=lambda x: x[1], reverse=True)
|
||||
|
||||
for url, score in scored_urls:
|
||||
print(f"Score: {score:.3f} - {url}")
|
||||
|
||||
# Check scorer statistics
|
||||
print(f"\nScoring statistics:")
|
||||
print(f"URLs scored: {composite_scorer.stats._urls_scored}")
|
||||
print(f"Average score: {composite_scorer.stats.get_average():.3f}")
|
||||
```
|
||||
|
||||
### Advanced Filter Patterns
|
||||
|
||||
```python
|
||||
# Complex pattern matching
|
||||
advanced_patterns = URLPatternFilter(
|
||||
patterns=[
|
||||
r"^https://docs\.python\.org/\d+/", # Python docs with version
|
||||
r".*/tutorial/.*\.html$", # Tutorial pages
|
||||
r".*/guide/(?!deprecated).*", # Guides but not deprecated
|
||||
"*/blog/{2020,2021,2022,2023,2024}/*", # Recent blog posts
|
||||
"**/{api,reference}/**/*.html" # API/reference docs
|
||||
],
|
||||
use_glob=True
|
||||
)
|
||||
|
||||
# Exclude patterns (reverse=True)
|
||||
exclude_filter = URLPatternFilter(
|
||||
patterns=[
|
||||
"*/admin/*",
|
||||
"*/login/*",
|
||||
"*/private/*",
|
||||
"**/.*", # Hidden files
|
||||
"*.{jpg,png,gif,css,js}" # Media and assets
|
||||
],
|
||||
reverse=True # Exclude matching patterns
|
||||
)
|
||||
|
||||
# Content type with extension mapping
|
||||
detailed_content_filter = ContentTypeFilter(
|
||||
allowed_types=["text", "application"],
|
||||
check_extension=True,
|
||||
ext_map={
|
||||
"html": "text/html",
|
||||
"htm": "text/html",
|
||||
"md": "text/markdown",
|
||||
"pdf": "application/pdf",
|
||||
"doc": "application/msword",
|
||||
"docx": "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
|
||||
}
|
||||
)
|
||||
```
|
||||
|
||||
### Performance-Optimized Filtering
|
||||
|
||||
```python
|
||||
# High-performance filter chain for large-scale crawling
|
||||
class OptimizedFilterChain:
|
||||
def __init__(self):
|
||||
# Fast filters first (domain, patterns)
|
||||
self.fast_filters = [
|
||||
DomainFilter(
|
||||
allowed_domains=["example.com", "docs.example.com"],
|
||||
blocked_domains=["ads.example.com"]
|
||||
),
|
||||
URLPatternFilter([
|
||||
"*.html", "*.pdf", "*/blog/*", "*/docs/*"
|
||||
])
|
||||
]
|
||||
|
||||
# Slower filters last (content analysis)
|
||||
self.slow_filters = [
|
||||
ContentRelevanceFilter(
|
||||
query="important content",
|
||||
threshold=0.3
|
||||
)
|
||||
]
|
||||
|
||||
async def apply_optimized(self, url: str) -> bool:
|
||||
# Apply fast filters first
|
||||
for filter_obj in self.fast_filters:
|
||||
if not filter_obj.apply(url):
|
||||
return False
|
||||
|
||||
# Only apply slow filters if fast filters pass
|
||||
for filter_obj in self.slow_filters:
|
||||
if not await filter_obj.apply(url):
|
||||
return False
|
||||
|
||||
return True
|
||||
|
||||
# Batch filtering with concurrency
|
||||
async def batch_filter_urls(urls, filter_chain, max_concurrent=50):
|
||||
import asyncio
|
||||
semaphore = asyncio.Semaphore(max_concurrent)
|
||||
|
||||
async def filter_single(url):
|
||||
async with semaphore:
|
||||
return await filter_chain.apply(url), url
|
||||
|
||||
tasks = [filter_single(url) for url in urls]
|
||||
results = await asyncio.gather(*tasks)
|
||||
|
||||
return [url for passed, url in results if passed]
|
||||
|
||||
# Usage with 1000 URLs
|
||||
large_url_list = [f"https://example.com/page{i}.html" for i in range(1000)]
|
||||
optimized_chain = OptimizedFilterChain()
|
||||
filtered = await batch_filter_urls(large_url_list, optimized_chain)
|
||||
```
|
||||
|
||||
### Custom Filter Implementation
|
||||
|
||||
```python
|
||||
from crawl4ai.deep_crawling.filters import URLFilter
|
||||
import re
|
||||
|
||||
class CustomLanguageFilter(URLFilter):
|
||||
"""Filter URLs by language indicators"""
|
||||
|
||||
def __init__(self, allowed_languages=["en"], weight=1.0):
|
||||
super().__init__()
|
||||
self.allowed_languages = set(allowed_languages)
|
||||
self.lang_patterns = {
|
||||
"en": re.compile(r"/en/|/english/|lang=en"),
|
||||
"es": re.compile(r"/es/|/spanish/|lang=es"),
|
||||
"fr": re.compile(r"/fr/|/french/|lang=fr"),
|
||||
"de": re.compile(r"/de/|/german/|lang=de")
|
||||
}
|
||||
|
||||
def apply(self, url: str) -> bool:
|
||||
# Default to English if no language indicators
|
||||
if not any(pattern.search(url) for pattern in self.lang_patterns.values()):
|
||||
result = "en" in self.allowed_languages
|
||||
self._update_stats(result)
|
||||
return result
|
||||
|
||||
# Check for allowed languages
|
||||
for lang in self.allowed_languages:
|
||||
if lang in self.lang_patterns:
|
||||
if self.lang_patterns[lang].search(url):
|
||||
self._update_stats(True)
|
||||
return True
|
||||
|
||||
self._update_stats(False)
|
||||
return False
|
||||
|
||||
# Custom scorer implementation
|
||||
from crawl4ai.deep_crawling.scorers import URLScorer
|
||||
|
||||
class CustomComplexityScorer(URLScorer):
|
||||
"""Score URLs by content complexity indicators"""
|
||||
|
||||
def __init__(self, weight=1.0):
|
||||
super().__init__(weight)
|
||||
self.complexity_indicators = {
|
||||
"tutorial": 0.9,
|
||||
"guide": 0.8,
|
||||
"example": 0.7,
|
||||
"reference": 0.6,
|
||||
"api": 0.5
|
||||
}
|
||||
|
||||
def _calculate_score(self, url: str) -> float:
|
||||
url_lower = url.lower()
|
||||
max_score = 0.0
|
||||
|
||||
for indicator, score in self.complexity_indicators.items():
|
||||
if indicator in url_lower:
|
||||
max_score = max(max_score, score)
|
||||
|
||||
return max_score
|
||||
|
||||
# Use custom filters and scorers
|
||||
custom_filter = CustomLanguageFilter(allowed_languages=["en", "es"])
|
||||
custom_scorer = CustomComplexityScorer(weight=1.2)
|
||||
|
||||
url = "https://example.com/en/tutorial/advanced-guide.html"
|
||||
passes_filter = custom_filter.apply(url)
|
||||
complexity_score = custom_scorer.score(url)
|
||||
|
||||
print(f"Passes language filter: {passes_filter}")
|
||||
print(f"Complexity score: {complexity_score}")
|
||||
```
|
||||
|
||||
### Integration with Deep Crawling
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.deep_crawling import DeepCrawlStrategy
|
||||
|
||||
async def deep_crawl_with_filtering():
|
||||
# Create comprehensive filter chain
|
||||
filter_chain = FilterChain([
|
||||
DomainFilter(allowed_domains=["python.org"]),
|
||||
URLPatternFilter(["*/tutorial/*", "*/guide/*", "*/docs/*"]),
|
||||
ContentTypeFilter(["text/html"]),
|
||||
SEOFilter(threshold=0.6, keywords=["python", "programming"])
|
||||
])
|
||||
|
||||
# Create composite scorer
|
||||
scorer = CompositeScorer([
|
||||
KeywordRelevanceScorer(["python", "tutorial"], weight=1.5),
|
||||
FreshnessScorer(weight=0.8),
|
||||
PathDepthScorer(optimal_depth=3, weight=1.0)
|
||||
], normalize=True)
|
||||
|
||||
# Configure deep crawl strategy with filters and scorers
|
||||
deep_strategy = DeepCrawlStrategy(
|
||||
max_depth=3,
|
||||
max_pages=100,
|
||||
url_filter=filter_chain,
|
||||
url_scorer=scorer,
|
||||
score_threshold=0.6 # Only crawl URLs scoring above 0.6
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=deep_strategy,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://python.org",
|
||||
config=config
|
||||
)
|
||||
|
||||
print(f"Deep crawl completed: {result.success}")
|
||||
if hasattr(result, 'deep_crawl_results'):
|
||||
print(f"Pages crawled: {len(result.deep_crawl_results)}")
|
||||
|
||||
# Run the deep crawl
|
||||
await deep_crawl_with_filtering()
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Deep Crawling Strategy](https://docs.crawl4ai.com/core/deep-crawling/), [Custom Filter Development](https://docs.crawl4ai.com/advanced/custom-filters/), [Performance Optimization](https://docs.crawl4ai.com/advanced/performance-tuning/)
|
||||
348
docs/md_v2/assets/llm.txt/txt/deep_crawling.txt
Normal file
348
docs/md_v2/assets/llm.txt/txt/deep_crawling.txt
Normal file
@@ -0,0 +1,348 @@
|
||||
## Deep Crawling
|
||||
|
||||
Multi-level website exploration with intelligent filtering, scoring, and prioritization strategies.
|
||||
|
||||
### Basic Deep Crawl Setup
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
|
||||
from crawl4ai.content_scraping_strategy import LXMLWebScrapingStrategy
|
||||
|
||||
# Basic breadth-first deep crawling
|
||||
async def basic_deep_crawl():
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=2, # Initial page + 2 levels
|
||||
include_external=False # Stay within same domain
|
||||
),
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
verbose=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun("https://docs.crawl4ai.com", config=config)
|
||||
|
||||
# Group results by depth
|
||||
pages_by_depth = {}
|
||||
for result in results:
|
||||
depth = result.metadata.get("depth", 0)
|
||||
if depth not in pages_by_depth:
|
||||
pages_by_depth[depth] = []
|
||||
pages_by_depth[depth].append(result.url)
|
||||
|
||||
print(f"Crawled {len(results)} pages total")
|
||||
for depth, urls in sorted(pages_by_depth.items()):
|
||||
print(f"Depth {depth}: {len(urls)} pages")
|
||||
```
|
||||
|
||||
### Deep Crawl Strategies
|
||||
|
||||
```python
|
||||
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy, DFSDeepCrawlStrategy, BestFirstCrawlingStrategy
|
||||
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
|
||||
|
||||
# Breadth-First Search - explores all links at one depth before going deeper
|
||||
bfs_strategy = BFSDeepCrawlStrategy(
|
||||
max_depth=2,
|
||||
include_external=False,
|
||||
max_pages=50, # Limit total pages
|
||||
score_threshold=0.3 # Minimum score for URLs
|
||||
)
|
||||
|
||||
# Depth-First Search - explores as deep as possible before backtracking
|
||||
dfs_strategy = DFSDeepCrawlStrategy(
|
||||
max_depth=2,
|
||||
include_external=False,
|
||||
max_pages=30,
|
||||
score_threshold=0.5
|
||||
)
|
||||
|
||||
# Best-First - prioritizes highest scoring pages (recommended)
|
||||
keyword_scorer = KeywordRelevanceScorer(
|
||||
keywords=["crawl", "example", "async", "configuration"],
|
||||
weight=0.7
|
||||
)
|
||||
|
||||
best_first_strategy = BestFirstCrawlingStrategy(
|
||||
max_depth=2,
|
||||
include_external=False,
|
||||
url_scorer=keyword_scorer,
|
||||
max_pages=25 # No score_threshold needed - naturally prioritizes
|
||||
)
|
||||
|
||||
# Usage
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=best_first_strategy, # Choose your strategy
|
||||
scraping_strategy=LXMLWebScrapingStrategy()
|
||||
)
|
||||
```
|
||||
|
||||
### Streaming vs Batch Processing
|
||||
|
||||
```python
|
||||
# Batch mode - wait for all results
|
||||
async def batch_deep_crawl():
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1),
|
||||
stream=False # Default - collect all results first
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun("https://example.com", config=config)
|
||||
|
||||
# Process all results at once
|
||||
for result in results:
|
||||
print(f"Batch processed: {result.url}")
|
||||
|
||||
# Streaming mode - process results as they arrive
|
||||
async def streaming_deep_crawl():
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(max_depth=1),
|
||||
stream=True # Process results immediately
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async for result in await crawler.arun("https://example.com", config=config):
|
||||
depth = result.metadata.get("depth", 0)
|
||||
print(f"Stream processed depth {depth}: {result.url}")
|
||||
```
|
||||
|
||||
### Filtering with Filter Chains
|
||||
|
||||
```python
|
||||
from crawl4ai.deep_crawling.filters import (
|
||||
FilterChain,
|
||||
URLPatternFilter,
|
||||
DomainFilter,
|
||||
ContentTypeFilter,
|
||||
SEOFilter,
|
||||
ContentRelevanceFilter
|
||||
)
|
||||
|
||||
# Single URL pattern filter
|
||||
url_filter = URLPatternFilter(patterns=["*core*", "*guide*"])
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=1,
|
||||
filter_chain=FilterChain([url_filter])
|
||||
)
|
||||
)
|
||||
|
||||
# Multiple filters in chain
|
||||
advanced_filter_chain = FilterChain([
|
||||
# Domain filtering
|
||||
DomainFilter(
|
||||
allowed_domains=["docs.example.com"],
|
||||
blocked_domains=["old.docs.example.com", "staging.example.com"]
|
||||
),
|
||||
|
||||
# URL pattern matching
|
||||
URLPatternFilter(patterns=["*tutorial*", "*guide*", "*blog*"]),
|
||||
|
||||
# Content type filtering
|
||||
ContentTypeFilter(allowed_types=["text/html"]),
|
||||
|
||||
# SEO quality filter
|
||||
SEOFilter(
|
||||
threshold=0.5,
|
||||
keywords=["tutorial", "guide", "documentation"]
|
||||
),
|
||||
|
||||
# Content relevance filter
|
||||
ContentRelevanceFilter(
|
||||
query="Web crawling and data extraction with Python",
|
||||
threshold=0.7
|
||||
)
|
||||
])
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=2,
|
||||
filter_chain=advanced_filter_chain
|
||||
)
|
||||
)
|
||||
```
|
||||
|
||||
### Intelligent Crawling with Scorers
|
||||
|
||||
```python
|
||||
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
|
||||
|
||||
# Keyword relevance scoring
|
||||
async def scored_deep_crawl():
|
||||
keyword_scorer = KeywordRelevanceScorer(
|
||||
keywords=["browser", "crawler", "web", "automation"],
|
||||
weight=1.0
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BestFirstCrawlingStrategy(
|
||||
max_depth=2,
|
||||
include_external=False,
|
||||
url_scorer=keyword_scorer
|
||||
),
|
||||
stream=True, # Recommended with BestFirst
|
||||
verbose=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async for result in await crawler.arun("https://docs.crawl4ai.com", config=config):
|
||||
score = result.metadata.get("score", 0)
|
||||
depth = result.metadata.get("depth", 0)
|
||||
print(f"Depth: {depth} | Score: {score:.2f} | {result.url}")
|
||||
```
|
||||
|
||||
### Limiting Crawl Size
|
||||
|
||||
```python
|
||||
# Max pages limitation across strategies
|
||||
async def limited_crawls():
|
||||
# BFS with page limit
|
||||
bfs_config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=2,
|
||||
max_pages=5, # Only crawl 5 pages total
|
||||
url_scorer=KeywordRelevanceScorer(keywords=["browser", "crawler"], weight=1.0)
|
||||
)
|
||||
)
|
||||
|
||||
# DFS with score threshold
|
||||
dfs_config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=DFSDeepCrawlStrategy(
|
||||
max_depth=2,
|
||||
score_threshold=0.7, # Only URLs with scores above 0.7
|
||||
max_pages=10,
|
||||
url_scorer=KeywordRelevanceScorer(keywords=["web", "automation"], weight=1.0)
|
||||
)
|
||||
)
|
||||
|
||||
# Best-First with both constraints
|
||||
bf_config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BestFirstCrawlingStrategy(
|
||||
max_depth=2,
|
||||
max_pages=7, # Automatically gets highest scored pages
|
||||
url_scorer=KeywordRelevanceScorer(keywords=["crawl", "example"], weight=1.0)
|
||||
),
|
||||
stream=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Use any of the configs
|
||||
async for result in await crawler.arun("https://docs.crawl4ai.com", config=bf_config):
|
||||
score = result.metadata.get("score", 0)
|
||||
print(f"Score: {score:.2f} | {result.url}")
|
||||
```
|
||||
|
||||
### Complete Advanced Deep Crawler
|
||||
|
||||
```python
|
||||
async def comprehensive_deep_crawl():
|
||||
# Sophisticated filter chain
|
||||
filter_chain = FilterChain([
|
||||
DomainFilter(
|
||||
allowed_domains=["docs.crawl4ai.com"],
|
||||
blocked_domains=["old.docs.crawl4ai.com"]
|
||||
),
|
||||
URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
|
||||
ContentTypeFilter(allowed_types=["text/html"]),
|
||||
SEOFilter(threshold=0.4, keywords=["crawl", "tutorial", "guide"])
|
||||
])
|
||||
|
||||
# Multi-keyword scorer
|
||||
keyword_scorer = KeywordRelevanceScorer(
|
||||
keywords=["crawl", "example", "async", "configuration", "browser"],
|
||||
weight=0.8
|
||||
)
|
||||
|
||||
# Complete configuration
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BestFirstCrawlingStrategy(
|
||||
max_depth=2,
|
||||
include_external=False,
|
||||
filter_chain=filter_chain,
|
||||
url_scorer=keyword_scorer,
|
||||
max_pages=20
|
||||
),
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
stream=True,
|
||||
verbose=True,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
# Execute and analyze
|
||||
results = []
|
||||
start_time = time.time()
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async for result in await crawler.arun("https://docs.crawl4ai.com", config=config):
|
||||
results.append(result)
|
||||
score = result.metadata.get("score", 0)
|
||||
depth = result.metadata.get("depth", 0)
|
||||
print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
|
||||
|
||||
# Performance analysis
|
||||
duration = time.time() - start_time
|
||||
avg_score = sum(r.metadata.get('score', 0) for r in results) / len(results)
|
||||
|
||||
print(f"✅ Crawled {len(results)} pages in {duration:.2f}s")
|
||||
print(f"✅ Average relevance score: {avg_score:.2f}")
|
||||
|
||||
# Depth distribution
|
||||
depth_counts = {}
|
||||
for result in results:
|
||||
depth = result.metadata.get("depth", 0)
|
||||
depth_counts[depth] = depth_counts.get(depth, 0) + 1
|
||||
|
||||
for depth, count in sorted(depth_counts.items()):
|
||||
print(f"📊 Depth {depth}: {count} pages")
|
||||
```
|
||||
|
||||
### Error Handling and Robustness
|
||||
|
||||
```python
|
||||
async def robust_deep_crawl():
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BestFirstCrawlingStrategy(
|
||||
max_depth=2,
|
||||
max_pages=15,
|
||||
url_scorer=KeywordRelevanceScorer(keywords=["guide", "tutorial"])
|
||||
),
|
||||
stream=True,
|
||||
page_timeout=30000 # 30 second timeout per page
|
||||
)
|
||||
|
||||
successful_pages = []
|
||||
failed_pages = []
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async for result in await crawler.arun("https://docs.crawl4ai.com", config=config):
|
||||
if result.success:
|
||||
successful_pages.append(result)
|
||||
depth = result.metadata.get("depth", 0)
|
||||
score = result.metadata.get("score", 0)
|
||||
print(f"✅ Depth {depth} | Score: {score:.2f} | {result.url}")
|
||||
else:
|
||||
failed_pages.append({
|
||||
'url': result.url,
|
||||
'error': result.error_message,
|
||||
'depth': result.metadata.get("depth", 0)
|
||||
})
|
||||
print(f"❌ Failed: {result.url} - {result.error_message}")
|
||||
|
||||
print(f"📊 Results: {len(successful_pages)} successful, {len(failed_pages)} failed")
|
||||
|
||||
# Analyze failures by depth
|
||||
if failed_pages:
|
||||
failure_by_depth = {}
|
||||
for failure in failed_pages:
|
||||
depth = failure['depth']
|
||||
failure_by_depth[depth] = failure_by_depth.get(depth, 0) + 1
|
||||
|
||||
print("❌ Failures by depth:")
|
||||
for depth, count in sorted(failure_by_depth.items()):
|
||||
print(f" Depth {depth}: {count} failures")
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Deep Crawling Guide](https://docs.crawl4ai.com/core/deep-crawling/), [Filter Documentation](https://docs.crawl4ai.com/core/content-selection/), [Scoring Strategies](https://docs.crawl4ai.com/advanced/advanced-features/)
|
||||
826
docs/md_v2/assets/llm.txt/txt/docker.txt
Normal file
826
docs/md_v2/assets/llm.txt/txt/docker.txt
Normal file
@@ -0,0 +1,826 @@
|
||||
## Docker Deployment
|
||||
|
||||
Complete Docker deployment guide with pre-built images, API endpoints, configuration, and MCP integration.
|
||||
|
||||
### Quick Start with Pre-built Images
|
||||
|
||||
```bash
|
||||
# Pull latest image
|
||||
docker pull unclecode/crawl4ai:latest
|
||||
|
||||
# Setup LLM API keys
|
||||
cat > .llm.env << EOL
|
||||
OPENAI_API_KEY=sk-your-key
|
||||
ANTHROPIC_API_KEY=your-anthropic-key
|
||||
GROQ_API_KEY=your-groq-key
|
||||
GEMINI_API_TOKEN=your-gemini-token
|
||||
EOL
|
||||
|
||||
# Run with LLM support
|
||||
docker run -d \
|
||||
-p 11235:11235 \
|
||||
--name crawl4ai \
|
||||
--env-file .llm.env \
|
||||
--shm-size=1g \
|
||||
unclecode/crawl4ai:latest
|
||||
|
||||
# Basic run (no LLM)
|
||||
docker run -d \
|
||||
-p 11235:11235 \
|
||||
--name crawl4ai \
|
||||
--shm-size=1g \
|
||||
unclecode/crawl4ai:latest
|
||||
|
||||
# Check health
|
||||
curl http://localhost:11235/health
|
||||
```
|
||||
|
||||
### Docker Compose Deployment
|
||||
|
||||
```bash
|
||||
# Clone and setup
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai
|
||||
cp deploy/docker/.llm.env.example .llm.env
|
||||
# Edit .llm.env with your API keys
|
||||
|
||||
# Run pre-built image
|
||||
IMAGE=unclecode/crawl4ai:latest docker compose up -d
|
||||
|
||||
# Build locally
|
||||
docker compose up --build -d
|
||||
|
||||
# Build with all features
|
||||
INSTALL_TYPE=all docker compose up --build -d
|
||||
|
||||
# Build with GPU support
|
||||
ENABLE_GPU=true docker compose up --build -d
|
||||
|
||||
# Stop service
|
||||
docker compose down
|
||||
```
|
||||
|
||||
### Manual Build with Multi-Architecture
|
||||
|
||||
```bash
|
||||
# Clone repository
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai
|
||||
|
||||
# Build for current architecture
|
||||
docker buildx build -t crawl4ai-local:latest --load .
|
||||
|
||||
# Build for multiple architectures
|
||||
docker buildx build --platform linux/amd64,linux/arm64 \
|
||||
-t crawl4ai-local:latest --load .
|
||||
|
||||
# Build with specific features
|
||||
docker buildx build \
|
||||
--build-arg INSTALL_TYPE=all \
|
||||
--build-arg ENABLE_GPU=false \
|
||||
-t crawl4ai-local:latest --load .
|
||||
|
||||
# Run custom build
|
||||
docker run -d \
|
||||
-p 11235:11235 \
|
||||
--name crawl4ai-custom \
|
||||
--env-file .llm.env \
|
||||
--shm-size=1g \
|
||||
crawl4ai-local:latest
|
||||
```
|
||||
|
||||
### Build Arguments
|
||||
|
||||
```bash
|
||||
# Available build options
|
||||
docker buildx build \
|
||||
--build-arg INSTALL_TYPE=all \ # default|all|torch|transformer
|
||||
--build-arg ENABLE_GPU=true \ # true|false
|
||||
--build-arg APP_HOME=/app \ # Install path
|
||||
--build-arg USE_LOCAL=true \ # Use local source
|
||||
--build-arg GITHUB_REPO=url \ # Git repo if USE_LOCAL=false
|
||||
--build-arg GITHUB_BRANCH=main \ # Git branch
|
||||
-t crawl4ai-custom:latest --load .
|
||||
```
|
||||
|
||||
### Core API Endpoints
|
||||
|
||||
```python
|
||||
# Main crawling endpoints
|
||||
import requests
|
||||
import json
|
||||
|
||||
# Basic crawl
|
||||
payload = {
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}}
|
||||
}
|
||||
response = requests.post("http://localhost:11235/crawl", json=payload)
|
||||
|
||||
# Streaming crawl
|
||||
payload["crawler_config"]["params"]["stream"] = True
|
||||
response = requests.post("http://localhost:11235/crawl/stream", json=payload)
|
||||
|
||||
# Health check
|
||||
response = requests.get("http://localhost:11235/health")
|
||||
|
||||
# API schema
|
||||
response = requests.get("http://localhost:11235/schema")
|
||||
|
||||
# Metrics (Prometheus format)
|
||||
response = requests.get("http://localhost:11235/metrics")
|
||||
```
|
||||
|
||||
### Specialized Endpoints
|
||||
|
||||
```python
|
||||
# HTML extraction (preprocessed for schema)
|
||||
response = requests.post("http://localhost:11235/html",
|
||||
json={"url": "https://example.com"})
|
||||
|
||||
# Screenshot capture
|
||||
response = requests.post("http://localhost:11235/screenshot", json={
|
||||
"url": "https://example.com",
|
||||
"screenshot_wait_for": 2,
|
||||
"output_path": "/path/to/save/screenshot.png"
|
||||
})
|
||||
|
||||
# PDF generation
|
||||
response = requests.post("http://localhost:11235/pdf", json={
|
||||
"url": "https://example.com",
|
||||
"output_path": "/path/to/save/document.pdf"
|
||||
})
|
||||
|
||||
# JavaScript execution
|
||||
response = requests.post("http://localhost:11235/execute_js", json={
|
||||
"url": "https://example.com",
|
||||
"scripts": [
|
||||
"return document.title",
|
||||
"return Array.from(document.querySelectorAll('a')).map(a => a.href)"
|
||||
]
|
||||
})
|
||||
|
||||
# Markdown generation
|
||||
response = requests.post("http://localhost:11235/md", json={
|
||||
"url": "https://example.com",
|
||||
"f": "fit", # raw|fit|bm25|llm
|
||||
"q": "extract main content", # query for filtering
|
||||
"c": "0" # cache: 0=bypass, 1=use
|
||||
})
|
||||
|
||||
# LLM Q&A
|
||||
response = requests.get("http://localhost:11235/llm/https://example.com?q=What is this page about?")
|
||||
|
||||
# Library context (for AI assistants)
|
||||
response = requests.get("http://localhost:11235/ask", params={
|
||||
"context_type": "all", # code|doc|all
|
||||
"query": "how to use extraction strategies",
|
||||
"score_ratio": 0.5,
|
||||
"max_results": 20
|
||||
})
|
||||
```
|
||||
|
||||
### Python SDK Usage
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
|
||||
async def main():
|
||||
async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
|
||||
# Non-streaming crawl
|
||||
results = await client.crawl(
|
||||
["https://example.com"],
|
||||
browser_config=BrowserConfig(headless=True),
|
||||
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
)
|
||||
|
||||
for result in results:
|
||||
print(f"URL: {result.url}, Success: {result.success}")
|
||||
print(f"Content length: {len(result.markdown)}")
|
||||
|
||||
# Streaming crawl
|
||||
stream_config = CrawlerRunConfig(stream=True, cache_mode=CacheMode.BYPASS)
|
||||
async for result in await client.crawl(
|
||||
["https://example.com", "https://python.org"],
|
||||
browser_config=BrowserConfig(headless=True),
|
||||
crawler_config=stream_config
|
||||
):
|
||||
print(f"Streamed: {result.url} - {result.success}")
|
||||
|
||||
# Get API schema
|
||||
schema = await client.get_schema()
|
||||
print(f"Schema available: {bool(schema)}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### Advanced API Configuration
|
||||
|
||||
```python
|
||||
# Complex extraction with LLM
|
||||
payload = {
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": {
|
||||
"type": "BrowserConfig",
|
||||
"params": {
|
||||
"headless": True,
|
||||
"viewport": {"type": "dict", "value": {"width": 1200, "height": 800}}
|
||||
}
|
||||
},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"extraction_strategy": {
|
||||
"type": "LLMExtractionStrategy",
|
||||
"params": {
|
||||
"llm_config": {
|
||||
"type": "LLMConfig",
|
||||
"params": {
|
||||
"provider": "openai/gpt-4o-mini",
|
||||
"api_token": "env:OPENAI_API_KEY"
|
||||
}
|
||||
},
|
||||
"schema": {
|
||||
"type": "dict",
|
||||
"value": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"title": {"type": "string"},
|
||||
"content": {"type": "string"}
|
||||
}
|
||||
}
|
||||
},
|
||||
"instruction": "Extract title and main content"
|
||||
}
|
||||
},
|
||||
"markdown_generator": {
|
||||
"type": "DefaultMarkdownGenerator",
|
||||
"params": {
|
||||
"content_filter": {
|
||||
"type": "PruningContentFilter",
|
||||
"params": {"threshold": 0.6}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
response = requests.post("http://localhost:11235/crawl", json=payload)
|
||||
```
|
||||
|
||||
### CSS Extraction Strategy
|
||||
|
||||
```python
|
||||
# CSS-based structured extraction
|
||||
schema = {
|
||||
"name": "ProductList",
|
||||
"baseSelector": ".product",
|
||||
"fields": [
|
||||
{"name": "title", "selector": "h2", "type": "text"},
|
||||
{"name": "price", "selector": ".price", "type": "text"},
|
||||
{"name": "link", "selector": "a", "type": "attribute", "attribute": "href"}
|
||||
]
|
||||
}
|
||||
|
||||
payload = {
|
||||
"urls": ["https://example-shop.com"],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"extraction_strategy": {
|
||||
"type": "JsonCssExtractionStrategy",
|
||||
"params": {
|
||||
"schema": {"type": "dict", "value": schema}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
response = requests.post("http://localhost:11235/crawl", json=payload)
|
||||
data = response.json()
|
||||
extracted = json.loads(data["results"][0]["extracted_content"])
|
||||
```
|
||||
|
||||
### MCP (Model Context Protocol) Integration
|
||||
|
||||
```bash
|
||||
# Add Crawl4AI as MCP provider to Claude Code
|
||||
claude mcp add --transport sse c4ai-sse http://localhost:11235/mcp/sse
|
||||
|
||||
# List MCP providers
|
||||
claude mcp list
|
||||
|
||||
# Test MCP connection
|
||||
python tests/mcp/test_mcp_socket.py
|
||||
|
||||
# Available MCP endpoints
|
||||
# SSE: http://localhost:11235/mcp/sse
|
||||
# WebSocket: ws://localhost:11235/mcp/ws
|
||||
# Schema: http://localhost:11235/mcp/schema
|
||||
```
|
||||
|
||||
Available MCP tools:
|
||||
- `md` - Generate markdown from web content
|
||||
- `html` - Extract preprocessed HTML
|
||||
- `screenshot` - Capture webpage screenshots
|
||||
- `pdf` - Generate PDF documents
|
||||
- `execute_js` - Run JavaScript on web pages
|
||||
- `crawl` - Perform multi-URL crawling
|
||||
- `ask` - Query Crawl4AI library context
|
||||
|
||||
### Configuration Management
|
||||
|
||||
```yaml
|
||||
# config.yml structure
|
||||
app:
|
||||
title: "Crawl4AI API"
|
||||
version: "1.0.0"
|
||||
host: "0.0.0.0"
|
||||
port: 11235
|
||||
timeout_keep_alive: 300
|
||||
|
||||
llm:
|
||||
provider: "openai/gpt-4o-mini"
|
||||
api_key_env: "OPENAI_API_KEY"
|
||||
|
||||
security:
|
||||
enabled: false
|
||||
jwt_enabled: false
|
||||
trusted_hosts: ["*"]
|
||||
|
||||
crawler:
|
||||
memory_threshold_percent: 95.0
|
||||
rate_limiter:
|
||||
base_delay: [1.0, 2.0]
|
||||
timeouts:
|
||||
stream_init: 30.0
|
||||
batch_process: 300.0
|
||||
pool:
|
||||
max_pages: 40
|
||||
idle_ttl_sec: 1800
|
||||
|
||||
rate_limiting:
|
||||
enabled: true
|
||||
default_limit: "1000/minute"
|
||||
storage_uri: "memory://"
|
||||
|
||||
logging:
|
||||
level: "INFO"
|
||||
format: "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
|
||||
```
|
||||
|
||||
### Custom Configuration Deployment
|
||||
|
||||
```bash
|
||||
# Method 1: Mount custom config
|
||||
docker run -d -p 11235:11235 \
|
||||
--name crawl4ai-custom \
|
||||
--env-file .llm.env \
|
||||
--shm-size=1g \
|
||||
-v $(pwd)/my-config.yml:/app/config.yml \
|
||||
unclecode/crawl4ai:latest
|
||||
|
||||
# Method 2: Build with custom config
|
||||
# Edit deploy/docker/config.yml then build
|
||||
docker buildx build -t crawl4ai-custom:latest --load .
|
||||
```
|
||||
|
||||
### Monitoring and Health Checks
|
||||
|
||||
```bash
|
||||
# Health endpoint
|
||||
curl http://localhost:11235/health
|
||||
|
||||
# Prometheus metrics
|
||||
curl http://localhost:11235/metrics
|
||||
|
||||
# Configuration validation
|
||||
curl -X POST http://localhost:11235/config/dump \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{"code": "CrawlerRunConfig(cache_mode=\"BYPASS\", screenshot=True)"}'
|
||||
```
|
||||
|
||||
### Playground Interface
|
||||
|
||||
Access the interactive playground at `http://localhost:11235/playground` for:
|
||||
- Testing configurations with visual interface
|
||||
- Generating JSON payloads for REST API
|
||||
- Converting Python config to JSON format
|
||||
- Testing crawl operations directly in browser
|
||||
|
||||
### Async Job Processing
|
||||
|
||||
```python
|
||||
# Submit job for async processing
|
||||
import time
|
||||
|
||||
# Submit crawl job
|
||||
response = requests.post("http://localhost:11235/crawl/job", json=payload)
|
||||
task_id = response.json()["task_id"]
|
||||
|
||||
# Poll for completion
|
||||
while True:
|
||||
result = requests.get(f"http://localhost:11235/crawl/job/{task_id}")
|
||||
status = result.json()
|
||||
|
||||
if status["status"] in ["COMPLETED", "FAILED"]:
|
||||
break
|
||||
time.sleep(1.5)
|
||||
|
||||
print("Final result:", status)
|
||||
```
|
||||
|
||||
### Production Deployment
|
||||
|
||||
```bash
|
||||
# Production-ready deployment
|
||||
docker run -d \
|
||||
--name crawl4ai-prod \
|
||||
--restart unless-stopped \
|
||||
-p 11235:11235 \
|
||||
--env-file .llm.env \
|
||||
--shm-size=2g \
|
||||
--memory=8g \
|
||||
--cpus=4 \
|
||||
-v /path/to/custom-config.yml:/app/config.yml \
|
||||
unclecode/crawl4ai:latest
|
||||
|
||||
# With Docker Compose for production
|
||||
version: '3.8'
|
||||
services:
|
||||
crawl4ai:
|
||||
image: unclecode/crawl4ai:latest
|
||||
ports:
|
||||
- "11235:11235"
|
||||
environment:
|
||||
- OPENAI_API_KEY=${OPENAI_API_KEY}
|
||||
volumes:
|
||||
- ./config.yml:/app/config.yml
|
||||
shm_size: 2g
|
||||
deploy:
|
||||
resources:
|
||||
limits:
|
||||
memory: 8G
|
||||
cpus: '4'
|
||||
restart: unless-stopped
|
||||
```
|
||||
|
||||
### Configuration Validation and JSON Structure
|
||||
|
||||
```python
|
||||
# Method 1: Create config objects and dump to see expected JSON structure
|
||||
from crawl4ai import BrowserConfig, CrawlerRunConfig, LLMConfig, CacheMode
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy
|
||||
import json
|
||||
|
||||
# Create browser config and see JSON structure
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
viewport_width=1280,
|
||||
viewport_height=720,
|
||||
proxy="http://user:pass@proxy:8080"
|
||||
)
|
||||
|
||||
# Get JSON structure
|
||||
browser_json = browser_config.dump()
|
||||
print("BrowserConfig JSON structure:")
|
||||
print(json.dumps(browser_json, indent=2))
|
||||
|
||||
# Create crawler config with extraction strategy
|
||||
schema = {
|
||||
"name": "Articles",
|
||||
"baseSelector": ".article",
|
||||
"fields": [
|
||||
{"name": "title", "selector": "h2", "type": "text"},
|
||||
{"name": "content", "selector": ".content", "type": "html"}
|
||||
]
|
||||
}
|
||||
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
screenshot=True,
|
||||
extraction_strategy=JsonCssExtractionStrategy(schema),
|
||||
js_code=["window.scrollTo(0, document.body.scrollHeight);"],
|
||||
wait_for="css:.loaded"
|
||||
)
|
||||
|
||||
crawler_json = crawler_config.dump()
|
||||
print("\nCrawlerRunConfig JSON structure:")
|
||||
print(json.dumps(crawler_json, indent=2))
|
||||
```
|
||||
|
||||
### Reverse Validation - JSON to Objects
|
||||
|
||||
```python
|
||||
# Method 2: Load JSON back to config objects for validation
|
||||
from crawl4ai.async_configs import from_serializable_dict
|
||||
|
||||
# Test JSON structure by converting back to objects
|
||||
test_browser_json = {
|
||||
"type": "BrowserConfig",
|
||||
"params": {
|
||||
"headless": True,
|
||||
"viewport_width": 1280,
|
||||
"proxy": "http://user:pass@proxy:8080"
|
||||
}
|
||||
}
|
||||
|
||||
try:
|
||||
# Convert JSON back to object
|
||||
restored_browser = from_serializable_dict(test_browser_json)
|
||||
print(f"✅ Valid BrowserConfig: {type(restored_browser)}")
|
||||
print(f"Headless: {restored_browser.headless}")
|
||||
print(f"Proxy: {restored_browser.proxy}")
|
||||
except Exception as e:
|
||||
print(f"❌ Invalid BrowserConfig JSON: {e}")
|
||||
|
||||
# Test complex crawler config JSON
|
||||
test_crawler_json = {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"cache_mode": "bypass",
|
||||
"screenshot": True,
|
||||
"extraction_strategy": {
|
||||
"type": "JsonCssExtractionStrategy",
|
||||
"params": {
|
||||
"schema": {
|
||||
"type": "dict",
|
||||
"value": {
|
||||
"name": "Products",
|
||||
"baseSelector": ".product",
|
||||
"fields": [
|
||||
{"name": "title", "selector": "h3", "type": "text"}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
try:
|
||||
restored_crawler = from_serializable_dict(test_crawler_json)
|
||||
print(f"✅ Valid CrawlerRunConfig: {type(restored_crawler)}")
|
||||
print(f"Cache mode: {restored_crawler.cache_mode}")
|
||||
print(f"Has extraction strategy: {restored_crawler.extraction_strategy is not None}")
|
||||
except Exception as e:
|
||||
print(f"❌ Invalid CrawlerRunConfig JSON: {e}")
|
||||
```
|
||||
|
||||
### Using Server's /config/dump Endpoint for Validation
|
||||
|
||||
```python
|
||||
import requests
|
||||
|
||||
# Method 3: Use server endpoint to validate configuration syntax
|
||||
def validate_config_with_server(config_code: str) -> dict:
|
||||
"""Validate configuration using server's /config/dump endpoint"""
|
||||
response = requests.post(
|
||||
"http://localhost:11235/config/dump",
|
||||
json={"code": config_code}
|
||||
)
|
||||
|
||||
if response.status_code == 200:
|
||||
print("✅ Valid configuration syntax")
|
||||
return response.json()
|
||||
else:
|
||||
print(f"❌ Invalid configuration: {response.status_code}")
|
||||
print(response.json())
|
||||
return None
|
||||
|
||||
# Test valid configuration
|
||||
valid_config = """
|
||||
CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
screenshot=True,
|
||||
js_code=["window.scrollTo(0, document.body.scrollHeight);"],
|
||||
wait_for="css:.content-loaded"
|
||||
)
|
||||
"""
|
||||
|
||||
result = validate_config_with_server(valid_config)
|
||||
if result:
|
||||
print("Generated JSON structure:")
|
||||
print(json.dumps(result, indent=2))
|
||||
|
||||
# Test invalid configuration (should fail)
|
||||
invalid_config = """
|
||||
CrawlerRunConfig(
|
||||
cache_mode="invalid_mode",
|
||||
screenshot=True,
|
||||
js_code=some_function() # This will fail
|
||||
)
|
||||
"""
|
||||
|
||||
validate_config_with_server(invalid_config)
|
||||
```
|
||||
|
||||
### Configuration Builder Helper
|
||||
|
||||
```python
|
||||
def build_and_validate_request(urls, browser_params=None, crawler_params=None):
|
||||
"""Helper to build and validate complete request payload"""
|
||||
|
||||
# Create configurations
|
||||
browser_config = BrowserConfig(**(browser_params or {}))
|
||||
crawler_config = CrawlerRunConfig(**(crawler_params or {}))
|
||||
|
||||
# Build complete request payload
|
||||
payload = {
|
||||
"urls": urls if isinstance(urls, list) else [urls],
|
||||
"browser_config": browser_config.dump(),
|
||||
"crawler_config": crawler_config.dump()
|
||||
}
|
||||
|
||||
print("✅ Complete request payload:")
|
||||
print(json.dumps(payload, indent=2))
|
||||
|
||||
# Validate by attempting to reconstruct
|
||||
try:
|
||||
test_browser = from_serializable_dict(payload["browser_config"])
|
||||
test_crawler = from_serializable_dict(payload["crawler_config"])
|
||||
print("✅ Payload validation successful")
|
||||
return payload
|
||||
except Exception as e:
|
||||
print(f"❌ Payload validation failed: {e}")
|
||||
return None
|
||||
|
||||
# Example usage
|
||||
payload = build_and_validate_request(
|
||||
urls=["https://example.com"],
|
||||
browser_params={"headless": True, "viewport_width": 1280},
|
||||
crawler_params={
|
||||
"cache_mode": CacheMode.BYPASS,
|
||||
"screenshot": True,
|
||||
"word_count_threshold": 10
|
||||
}
|
||||
)
|
||||
|
||||
if payload:
|
||||
# Send to server
|
||||
response = requests.post("http://localhost:11235/crawl", json=payload)
|
||||
print(f"Server response: {response.status_code}")
|
||||
```
|
||||
|
||||
### Common JSON Structure Patterns
|
||||
|
||||
```python
|
||||
# Pattern 1: Simple primitive values
|
||||
simple_config = {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"cache_mode": "bypass", # String enum value
|
||||
"screenshot": True, # Boolean
|
||||
"page_timeout": 60000 # Integer
|
||||
}
|
||||
}
|
||||
|
||||
# Pattern 2: Nested objects
|
||||
nested_config = {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"extraction_strategy": {
|
||||
"type": "LLMExtractionStrategy",
|
||||
"params": {
|
||||
"llm_config": {
|
||||
"type": "LLMConfig",
|
||||
"params": {
|
||||
"provider": "openai/gpt-4o-mini",
|
||||
"api_token": "env:OPENAI_API_KEY"
|
||||
}
|
||||
},
|
||||
"instruction": "Extract main content"
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Pattern 3: Dictionary values (must use type: dict wrapper)
|
||||
dict_config = {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"extraction_strategy": {
|
||||
"type": "JsonCssExtractionStrategy",
|
||||
"params": {
|
||||
"schema": {
|
||||
"type": "dict", # Required wrapper
|
||||
"value": { # Actual dictionary content
|
||||
"name": "Products",
|
||||
"baseSelector": ".product",
|
||||
"fields": [
|
||||
{"name": "title", "selector": "h2", "type": "text"}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# Pattern 4: Lists and arrays
|
||||
list_config = {
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"js_code": [ # Lists are handled directly
|
||||
"window.scrollTo(0, document.body.scrollHeight);",
|
||||
"document.querySelector('.load-more')?.click();"
|
||||
],
|
||||
"excluded_tags": ["script", "style", "nav"]
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
### Troubleshooting Common JSON Errors
|
||||
|
||||
```python
|
||||
def diagnose_json_errors():
|
||||
"""Common JSON structure errors and fixes"""
|
||||
|
||||
# ❌ WRONG: Missing type wrapper for objects
|
||||
wrong_config = {
|
||||
"browser_config": {
|
||||
"headless": True # Missing type wrapper
|
||||
}
|
||||
}
|
||||
|
||||
# ✅ CORRECT: Proper type wrapper
|
||||
correct_config = {
|
||||
"browser_config": {
|
||||
"type": "BrowserConfig",
|
||||
"params": {
|
||||
"headless": True
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# ❌ WRONG: Dictionary without type: dict wrapper
|
||||
wrong_dict = {
|
||||
"schema": {
|
||||
"name": "Products" # Raw dict, should be wrapped
|
||||
}
|
||||
}
|
||||
|
||||
# ✅ CORRECT: Dictionary with proper wrapper
|
||||
correct_dict = {
|
||||
"schema": {
|
||||
"type": "dict",
|
||||
"value": {
|
||||
"name": "Products"
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
# ❌ WRONG: Invalid enum string
|
||||
wrong_enum = {
|
||||
"cache_mode": "DISABLED" # Wrong case/value
|
||||
}
|
||||
|
||||
# ✅ CORRECT: Valid enum string
|
||||
correct_enum = {
|
||||
"cache_mode": "bypass" # or "enabled", "disabled", etc.
|
||||
}
|
||||
|
||||
print("Common error patterns documented above")
|
||||
|
||||
# Validate your JSON structure before sending
|
||||
def pre_flight_check(payload):
|
||||
"""Run checks before sending to server"""
|
||||
required_keys = ["urls", "browser_config", "crawler_config"]
|
||||
|
||||
for key in required_keys:
|
||||
if key not in payload:
|
||||
print(f"❌ Missing required key: {key}")
|
||||
return False
|
||||
|
||||
# Check type wrappers
|
||||
for config_key in ["browser_config", "crawler_config"]:
|
||||
config = payload[config_key]
|
||||
if not isinstance(config, dict) or "type" not in config:
|
||||
print(f"❌ {config_key} missing type wrapper")
|
||||
return False
|
||||
if "params" not in config:
|
||||
print(f"❌ {config_key} missing params")
|
||||
return False
|
||||
|
||||
print("✅ Pre-flight check passed")
|
||||
return True
|
||||
|
||||
# Example usage
|
||||
payload = {
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}}
|
||||
}
|
||||
|
||||
if pre_flight_check(payload):
|
||||
# Safe to send to server
|
||||
pass
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Complete Docker Guide](https://docs.crawl4ai.com/core/docker-deployment/), [API Reference](https://docs.crawl4ai.com/api/), [MCP Integration](https://docs.crawl4ai.com/core/docker-deployment/#mcp-model-context-protocol-support), [Configuration Options](https://docs.crawl4ai.com/core/docker-deployment/#server-configuration)
|
||||
788
docs/md_v2/assets/llm.txt/txt/extraction.txt
Normal file
788
docs/md_v2/assets/llm.txt/txt/extraction.txt
Normal file
@@ -0,0 +1,788 @@
|
||||
## Extraction Strategies
|
||||
|
||||
Powerful data extraction from web pages using LLM-based intelligent parsing or fast schema/pattern-based approaches.
|
||||
|
||||
### LLM-Based Extraction - Intelligent Content Understanding
|
||||
|
||||
```python
|
||||
import os
|
||||
import asyncio
|
||||
import json
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
|
||||
# Define structured data model
|
||||
class Product(BaseModel):
|
||||
name: str = Field(description="Product name")
|
||||
price: str = Field(description="Product price")
|
||||
description: str = Field(description="Product description")
|
||||
features: List[str] = Field(description="List of product features")
|
||||
rating: float = Field(description="Product rating out of 5")
|
||||
|
||||
# Configure LLM provider
|
||||
llm_config = LLMConfig(
|
||||
provider="openai/gpt-4o-mini", # or "ollama/llama3.3", "anthropic/claude-3-5-sonnet"
|
||||
api_token=os.getenv("OPENAI_API_KEY"), # or "env:OPENAI_API_KEY"
|
||||
temperature=0.1,
|
||||
max_tokens=2000
|
||||
)
|
||||
|
||||
# Create LLM extraction strategy
|
||||
llm_strategy = LLMExtractionStrategy(
|
||||
llm_config=llm_config,
|
||||
schema=Product.model_json_schema(),
|
||||
extraction_type="schema", # or "block" for freeform text
|
||||
instruction="""
|
||||
Extract product information from the webpage content.
|
||||
Focus on finding complete product details including:
|
||||
- Product name and price
|
||||
- Detailed description
|
||||
- All listed features
|
||||
- Customer rating if available
|
||||
Return valid JSON array of products.
|
||||
""",
|
||||
chunk_token_threshold=1200, # Split content if too large
|
||||
overlap_rate=0.1, # 10% overlap between chunks
|
||||
apply_chunking=True, # Enable automatic chunking
|
||||
input_format="markdown", # "html", "fit_markdown", or "markdown"
|
||||
extra_args={"temperature": 0.0, "max_tokens": 800},
|
||||
verbose=True
|
||||
)
|
||||
|
||||
async def extract_with_llm():
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
|
||||
crawl_config = CrawlerRunConfig(
|
||||
extraction_strategy=llm_strategy,
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
word_count_threshold=10
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com/products",
|
||||
config=crawl_config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
# Parse extracted JSON
|
||||
products = json.loads(result.extracted_content)
|
||||
print(f"Extracted {len(products)} products")
|
||||
|
||||
for product in products[:3]: # Show first 3
|
||||
print(f"Product: {product['name']}")
|
||||
print(f"Price: {product['price']}")
|
||||
print(f"Rating: {product.get('rating', 'N/A')}")
|
||||
|
||||
# Show token usage and cost
|
||||
llm_strategy.show_usage()
|
||||
else:
|
||||
print(f"Extraction failed: {result.error_message}")
|
||||
|
||||
asyncio.run(extract_with_llm())
|
||||
```
|
||||
|
||||
### LLM Strategy Advanced Configuration
|
||||
|
||||
```python
|
||||
# Multiple provider configurations
|
||||
providers = {
|
||||
"openai": LLMConfig(
|
||||
provider="openai/gpt-4o",
|
||||
api_token="env:OPENAI_API_KEY",
|
||||
temperature=0.1
|
||||
),
|
||||
"anthropic": LLMConfig(
|
||||
provider="anthropic/claude-3-5-sonnet-20240620",
|
||||
api_token="env:ANTHROPIC_API_KEY",
|
||||
max_tokens=4000
|
||||
),
|
||||
"ollama": LLMConfig(
|
||||
provider="ollama/llama3.3",
|
||||
api_token=None, # Not needed for Ollama
|
||||
base_url="http://localhost:11434"
|
||||
),
|
||||
"groq": LLMConfig(
|
||||
provider="groq/llama3-70b-8192",
|
||||
api_token="env:GROQ_API_KEY"
|
||||
)
|
||||
}
|
||||
|
||||
# Advanced chunking for large content
|
||||
large_content_strategy = LLMExtractionStrategy(
|
||||
llm_config=providers["openai"],
|
||||
schema=YourModel.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="Extract detailed information...",
|
||||
|
||||
# Chunking parameters
|
||||
chunk_token_threshold=2000, # Larger chunks for complex content
|
||||
overlap_rate=0.15, # More overlap for context preservation
|
||||
apply_chunking=True,
|
||||
|
||||
# Input format selection
|
||||
input_format="fit_markdown", # Use filtered content if available
|
||||
|
||||
# LLM parameters
|
||||
extra_args={
|
||||
"temperature": 0.0, # Deterministic output
|
||||
"top_p": 0.9,
|
||||
"frequency_penalty": 0.1,
|
||||
"presence_penalty": 0.1,
|
||||
"max_tokens": 1500
|
||||
},
|
||||
verbose=True
|
||||
)
|
||||
|
||||
# Knowledge graph extraction
|
||||
class Entity(BaseModel):
|
||||
name: str
|
||||
type: str # "person", "organization", "location", etc.
|
||||
description: str
|
||||
|
||||
class Relationship(BaseModel):
|
||||
source: str
|
||||
target: str
|
||||
relationship: str
|
||||
confidence: float
|
||||
|
||||
class KnowledgeGraph(BaseModel):
|
||||
entities: List[Entity]
|
||||
relationships: List[Relationship]
|
||||
summary: str
|
||||
|
||||
knowledge_strategy = LLMExtractionStrategy(
|
||||
llm_config=providers["anthropic"],
|
||||
schema=KnowledgeGraph.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="""
|
||||
Create a knowledge graph from the content by:
|
||||
1. Identifying key entities (people, organizations, locations, concepts)
|
||||
2. Finding relationships between entities
|
||||
3. Providing confidence scores for relationships
|
||||
4. Summarizing the main topics
|
||||
""",
|
||||
input_format="html", # Use HTML for better structure preservation
|
||||
apply_chunking=True,
|
||||
chunk_token_threshold=1500
|
||||
)
|
||||
```
|
||||
|
||||
### JSON CSS Extraction - Fast Schema-Based Extraction
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
import json
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
# Basic CSS extraction schema
|
||||
simple_schema = {
|
||||
"name": "Product Listings",
|
||||
"baseSelector": "div.product-card",
|
||||
"fields": [
|
||||
{
|
||||
"name": "title",
|
||||
"selector": "h2.product-title",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "price",
|
||||
"selector": ".price",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "image_url",
|
||||
"selector": "img.product-image",
|
||||
"type": "attribute",
|
||||
"attribute": "src"
|
||||
},
|
||||
{
|
||||
"name": "product_url",
|
||||
"selector": "a.product-link",
|
||||
"type": "attribute",
|
||||
"attribute": "href"
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
# Complex nested schema with multiple data types
|
||||
complex_schema = {
|
||||
"name": "E-commerce Product Catalog",
|
||||
"baseSelector": "div.category",
|
||||
"baseFields": [
|
||||
{
|
||||
"name": "category_id",
|
||||
"type": "attribute",
|
||||
"attribute": "data-category-id"
|
||||
},
|
||||
{
|
||||
"name": "category_url",
|
||||
"type": "attribute",
|
||||
"attribute": "data-url"
|
||||
}
|
||||
],
|
||||
"fields": [
|
||||
{
|
||||
"name": "category_name",
|
||||
"selector": "h2.category-title",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "products",
|
||||
"selector": "div.product",
|
||||
"type": "nested_list", # Array of complex objects
|
||||
"fields": [
|
||||
{
|
||||
"name": "name",
|
||||
"selector": "h3.product-name",
|
||||
"type": "text",
|
||||
"default": "Unknown Product"
|
||||
},
|
||||
{
|
||||
"name": "price",
|
||||
"selector": "span.price",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "details",
|
||||
"selector": "div.product-details",
|
||||
"type": "nested", # Single complex object
|
||||
"fields": [
|
||||
{
|
||||
"name": "brand",
|
||||
"selector": "span.brand",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "model",
|
||||
"selector": "span.model",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "specs",
|
||||
"selector": "div.specifications",
|
||||
"type": "html" # Preserve HTML structure
|
||||
}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "features",
|
||||
"selector": "ul.features li",
|
||||
"type": "list", # Simple array of strings
|
||||
"fields": [
|
||||
{"name": "feature", "type": "text"}
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "reviews",
|
||||
"selector": "div.review",
|
||||
"type": "nested_list",
|
||||
"fields": [
|
||||
{
|
||||
"name": "reviewer",
|
||||
"selector": "span.reviewer-name",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "rating",
|
||||
"selector": "span.rating",
|
||||
"type": "attribute",
|
||||
"attribute": "data-rating"
|
||||
},
|
||||
{
|
||||
"name": "comment",
|
||||
"selector": "p.review-text",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "date",
|
||||
"selector": "time.review-date",
|
||||
"type": "attribute",
|
||||
"attribute": "datetime"
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
async def extract_with_css_schema():
|
||||
strategy = JsonCssExtractionStrategy(complex_schema, verbose=True)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
extraction_strategy=strategy,
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
# Enable dynamic content loading if needed
|
||||
js_code="window.scrollTo(0, document.body.scrollHeight);",
|
||||
wait_for="css:.product:nth-child(10)", # Wait for products to load
|
||||
process_iframes=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com/catalog",
|
||||
config=config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
data = json.loads(result.extracted_content)
|
||||
print(f"Extracted {len(data)} categories")
|
||||
|
||||
for category in data:
|
||||
print(f"Category: {category['category_name']}")
|
||||
print(f"Products: {len(category.get('products', []))}")
|
||||
|
||||
# Show first product details
|
||||
if category.get('products'):
|
||||
product = category['products'][0]
|
||||
print(f" First product: {product.get('name')}")
|
||||
print(f" Features: {len(product.get('features', []))}")
|
||||
print(f" Reviews: {len(product.get('reviews', []))}")
|
||||
|
||||
asyncio.run(extract_with_css_schema())
|
||||
```
|
||||
|
||||
### Automatic Schema Generation - One-Time LLM, Unlimited Use
|
||||
|
||||
```python
|
||||
import json
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
async def generate_and_use_schema():
|
||||
"""
|
||||
1. Use LLM once to generate schema from sample HTML
|
||||
2. Cache the schema for reuse
|
||||
3. Use cached schema for fast extraction without LLM calls
|
||||
"""
|
||||
|
||||
cache_dir = Path("./schema_cache")
|
||||
cache_dir.mkdir(exist_ok=True)
|
||||
schema_file = cache_dir / "ecommerce_schema.json"
|
||||
|
||||
# Step 1: Generate or load cached schema
|
||||
if schema_file.exists():
|
||||
schema = json.load(schema_file.open())
|
||||
print("Using cached schema")
|
||||
else:
|
||||
print("Generating schema using LLM...")
|
||||
|
||||
# Configure LLM for schema generation
|
||||
llm_config = LLMConfig(
|
||||
provider="openai/gpt-4o", # or "ollama/llama3.3" for local
|
||||
api_token="env:OPENAI_API_KEY"
|
||||
)
|
||||
|
||||
# Get sample HTML from target site
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
sample_result = await crawler.arun(
|
||||
url="https://example.com/products",
|
||||
config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
)
|
||||
sample_html = sample_result.cleaned_html[:5000] # Use first 5k chars
|
||||
|
||||
# Generate schema automatically (ONE-TIME LLM COST)
|
||||
schema = JsonCssExtractionStrategy.generate_schema(
|
||||
html=sample_html,
|
||||
schema_type="css",
|
||||
llm_config=llm_config,
|
||||
instruction="Extract product information including name, price, description, and features"
|
||||
)
|
||||
|
||||
# Cache schema for future use (NO MORE LLM CALLS)
|
||||
json.dump(schema, schema_file.open("w"), indent=2)
|
||||
print("Schema generated and cached")
|
||||
|
||||
# Step 2: Use schema for fast extraction (NO LLM CALLS)
|
||||
strategy = JsonCssExtractionStrategy(schema, verbose=True)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
extraction_strategy=strategy,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
# Step 3: Extract from multiple pages using same schema
|
||||
urls = [
|
||||
"https://example.com/products",
|
||||
"https://example.com/electronics",
|
||||
"https://example.com/books"
|
||||
]
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
for url in urls:
|
||||
result = await crawler.arun(url=url, config=config)
|
||||
|
||||
if result.success:
|
||||
data = json.loads(result.extracted_content)
|
||||
print(f"{url}: Extracted {len(data)} items")
|
||||
else:
|
||||
print(f"{url}: Failed - {result.error_message}")
|
||||
|
||||
asyncio.run(generate_and_use_schema())
|
||||
```
|
||||
|
||||
### XPath Extraction Strategy
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import JsonXPathExtractionStrategy
|
||||
|
||||
# XPath-based schema (alternative to CSS)
|
||||
xpath_schema = {
|
||||
"name": "News Articles",
|
||||
"baseSelector": "//article[@class='news-item']",
|
||||
"baseFields": [
|
||||
{
|
||||
"name": "article_id",
|
||||
"type": "attribute",
|
||||
"attribute": "data-id"
|
||||
}
|
||||
],
|
||||
"fields": [
|
||||
{
|
||||
"name": "headline",
|
||||
"selector": ".//h2[@class='headline']",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "author",
|
||||
"selector": ".//span[@class='author']/text()",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "publish_date",
|
||||
"selector": ".//time/@datetime",
|
||||
"type": "text"
|
||||
},
|
||||
{
|
||||
"name": "content",
|
||||
"selector": ".//div[@class='article-body']",
|
||||
"type": "html"
|
||||
},
|
||||
{
|
||||
"name": "tags",
|
||||
"selector": ".//div[@class='tags']/span[@class='tag']",
|
||||
"type": "list",
|
||||
"fields": [
|
||||
{"name": "tag", "type": "text"}
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
# Generate XPath schema automatically
|
||||
async def generate_xpath_schema():
|
||||
llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None)
|
||||
|
||||
sample_html = """
|
||||
<article class="news-item" data-id="123">
|
||||
<h2 class="headline">Breaking News</h2>
|
||||
<span class="author">John Doe</span>
|
||||
<time datetime="2024-01-01">Today</time>
|
||||
<div class="article-body"><p>Content here...</p></div>
|
||||
</article>
|
||||
"""
|
||||
|
||||
schema = JsonXPathExtractionStrategy.generate_schema(
|
||||
html=sample_html,
|
||||
schema_type="xpath",
|
||||
llm_config=llm_config
|
||||
)
|
||||
|
||||
return schema
|
||||
|
||||
# Use XPath strategy
|
||||
xpath_strategy = JsonXPathExtractionStrategy(xpath_schema, verbose=True)
|
||||
```
|
||||
|
||||
### Regex Extraction Strategy - Pattern-Based Fast Extraction
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import RegexExtractionStrategy
|
||||
|
||||
# Built-in patterns for common data types
|
||||
async def extract_with_builtin_patterns():
|
||||
# Use multiple built-in patterns
|
||||
strategy = RegexExtractionStrategy(
|
||||
pattern=(
|
||||
RegexExtractionStrategy.Email |
|
||||
RegexExtractionStrategy.PhoneUS |
|
||||
RegexExtractionStrategy.Url |
|
||||
RegexExtractionStrategy.Currency |
|
||||
RegexExtractionStrategy.DateIso
|
||||
)
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(extraction_strategy=strategy)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com/contact",
|
||||
config=config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
matches = json.loads(result.extracted_content)
|
||||
|
||||
# Group by pattern type
|
||||
by_type = {}
|
||||
for match in matches:
|
||||
label = match['label']
|
||||
if label not in by_type:
|
||||
by_type[label] = []
|
||||
by_type[label].append(match['value'])
|
||||
|
||||
for pattern_type, values in by_type.items():
|
||||
print(f"{pattern_type}: {len(values)} matches")
|
||||
for value in values[:3]: # Show first 3
|
||||
print(f" {value}")
|
||||
|
||||
# Custom regex patterns
|
||||
custom_patterns = {
|
||||
"product_code": r"SKU-\d{4,6}",
|
||||
"discount": r"\d{1,2}%\s*off",
|
||||
"model_number": r"Model:\s*([A-Z0-9-]+)"
|
||||
}
|
||||
|
||||
async def extract_with_custom_patterns():
|
||||
strategy = RegexExtractionStrategy(custom=custom_patterns)
|
||||
|
||||
config = CrawlerRunConfig(extraction_strategy=strategy)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com/products",
|
||||
config=config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
data = json.loads(result.extracted_content)
|
||||
for item in data:
|
||||
print(f"{item['label']}: {item['value']}")
|
||||
|
||||
# LLM-generated patterns (one-time cost)
|
||||
async def generate_custom_patterns():
|
||||
cache_file = Path("./patterns/price_patterns.json")
|
||||
|
||||
if cache_file.exists():
|
||||
patterns = json.load(cache_file.open())
|
||||
else:
|
||||
llm_config = LLMConfig(
|
||||
provider="openai/gpt-4o-mini",
|
||||
api_token="env:OPENAI_API_KEY"
|
||||
)
|
||||
|
||||
# Get sample content
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://example.com/pricing")
|
||||
sample_html = result.cleaned_html
|
||||
|
||||
# Generate optimized patterns
|
||||
patterns = RegexExtractionStrategy.generate_pattern(
|
||||
label="pricing_info",
|
||||
html=sample_html,
|
||||
query="Extract all pricing information including discounts and special offers",
|
||||
llm_config=llm_config
|
||||
)
|
||||
|
||||
# Cache for reuse
|
||||
cache_file.parent.mkdir(exist_ok=True)
|
||||
json.dump(patterns, cache_file.open("w"), indent=2)
|
||||
|
||||
# Use cached patterns (no more LLM calls)
|
||||
strategy = RegexExtractionStrategy(custom=patterns)
|
||||
return strategy
|
||||
|
||||
asyncio.run(extract_with_builtin_patterns())
|
||||
asyncio.run(extract_with_custom_patterns())
|
||||
```
|
||||
|
||||
### Complete Extraction Workflow - Combining Strategies
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.extraction_strategy import (
|
||||
JsonCssExtractionStrategy,
|
||||
RegexExtractionStrategy,
|
||||
LLMExtractionStrategy
|
||||
)
|
||||
|
||||
async def multi_strategy_extraction():
|
||||
"""
|
||||
Demonstrate using multiple extraction strategies in sequence:
|
||||
1. Fast regex for common patterns
|
||||
2. Schema-based for structured data
|
||||
3. LLM for complex reasoning
|
||||
"""
|
||||
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
|
||||
# Strategy 1: Fast regex extraction
|
||||
regex_strategy = RegexExtractionStrategy(
|
||||
pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS
|
||||
)
|
||||
|
||||
# Strategy 2: Schema-based structured extraction
|
||||
product_schema = {
|
||||
"name": "Products",
|
||||
"baseSelector": "div.product",
|
||||
"fields": [
|
||||
{"name": "name", "selector": "h3", "type": "text"},
|
||||
{"name": "price", "selector": ".price", "type": "text"},
|
||||
{"name": "rating", "selector": ".rating", "type": "attribute", "attribute": "data-rating"}
|
||||
]
|
||||
}
|
||||
css_strategy = JsonCssExtractionStrategy(product_schema)
|
||||
|
||||
# Strategy 3: LLM for complex analysis
|
||||
llm_strategy = LLMExtractionStrategy(
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"),
|
||||
schema={
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"sentiment": {"type": "string"},
|
||||
"key_topics": {"type": "array", "items": {"type": "string"}},
|
||||
"summary": {"type": "string"}
|
||||
}
|
||||
},
|
||||
extraction_type="schema",
|
||||
instruction="Analyze the content sentiment, extract key topics, and provide a summary"
|
||||
)
|
||||
|
||||
url = "https://example.com/product-reviews"
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
# Extract contact info with regex
|
||||
regex_config = CrawlerRunConfig(extraction_strategy=regex_strategy)
|
||||
regex_result = await crawler.arun(url=url, config=regex_config)
|
||||
|
||||
# Extract structured product data
|
||||
css_config = CrawlerRunConfig(extraction_strategy=css_strategy)
|
||||
css_result = await crawler.arun(url=url, config=css_config)
|
||||
|
||||
# Extract insights with LLM
|
||||
llm_config = CrawlerRunConfig(extraction_strategy=llm_strategy)
|
||||
llm_result = await crawler.arun(url=url, config=llm_config)
|
||||
|
||||
# Combine results
|
||||
results = {
|
||||
"contacts": json.loads(regex_result.extracted_content) if regex_result.success else [],
|
||||
"products": json.loads(css_result.extracted_content) if css_result.success else [],
|
||||
"analysis": json.loads(llm_result.extracted_content) if llm_result.success else {}
|
||||
}
|
||||
|
||||
print(f"Found {len(results['contacts'])} contact entries")
|
||||
print(f"Found {len(results['products'])} products")
|
||||
print(f"Sentiment: {results['analysis'].get('sentiment', 'N/A')}")
|
||||
|
||||
return results
|
||||
|
||||
# Performance comparison
|
||||
async def compare_extraction_performance():
|
||||
"""Compare speed and accuracy of different strategies"""
|
||||
import time
|
||||
|
||||
url = "https://example.com/large-catalog"
|
||||
|
||||
strategies = {
|
||||
"regex": RegexExtractionStrategy(pattern=RegexExtractionStrategy.Currency),
|
||||
"css": JsonCssExtractionStrategy({
|
||||
"name": "Prices",
|
||||
"baseSelector": ".price",
|
||||
"fields": [{"name": "amount", "selector": "span", "type": "text"}]
|
||||
}),
|
||||
"llm": LLMExtractionStrategy(
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY"),
|
||||
instruction="Extract all prices from the content",
|
||||
extraction_type="block"
|
||||
)
|
||||
}
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
for name, strategy in strategies.items():
|
||||
start_time = time.time()
|
||||
|
||||
config = CrawlerRunConfig(extraction_strategy=strategy)
|
||||
result = await crawler.arun(url=url, config=config)
|
||||
|
||||
duration = time.time() - start_time
|
||||
|
||||
if result.success:
|
||||
data = json.loads(result.extracted_content)
|
||||
print(f"{name}: {len(data)} items in {duration:.2f}s")
|
||||
else:
|
||||
print(f"{name}: Failed in {duration:.2f}s")
|
||||
|
||||
asyncio.run(multi_strategy_extraction())
|
||||
asyncio.run(compare_extraction_performance())
|
||||
```
|
||||
|
||||
### Best Practices and Strategy Selection
|
||||
|
||||
```python
|
||||
# Strategy selection guide
|
||||
def choose_extraction_strategy(use_case):
|
||||
"""
|
||||
Guide for selecting the right extraction strategy
|
||||
"""
|
||||
|
||||
strategies = {
|
||||
# Fast pattern matching for common data types
|
||||
"contact_info": RegexExtractionStrategy(
|
||||
pattern=RegexExtractionStrategy.Email | RegexExtractionStrategy.PhoneUS
|
||||
),
|
||||
|
||||
# Structured data from consistent HTML
|
||||
"product_catalogs": JsonCssExtractionStrategy,
|
||||
|
||||
# Complex reasoning and semantic understanding
|
||||
"content_analysis": LLMExtractionStrategy,
|
||||
|
||||
# Mixed approach for comprehensive extraction
|
||||
"complete_site_analysis": "multi_strategy"
|
||||
}
|
||||
|
||||
recommendations = {
|
||||
"speed_priority": "Use RegexExtractionStrategy for simple patterns, JsonCssExtractionStrategy for structured data",
|
||||
"accuracy_priority": "Use LLMExtractionStrategy for complex content, JsonCssExtractionStrategy for predictable structure",
|
||||
"cost_priority": "Avoid LLM strategies, use schema generation once then JsonCssExtractionStrategy",
|
||||
"scale_priority": "Cache schemas, use regex for simple patterns, avoid LLM for high-volume extraction"
|
||||
}
|
||||
|
||||
return recommendations.get(use_case, "Combine strategies based on content complexity")
|
||||
|
||||
# Error handling and validation
|
||||
async def robust_extraction():
|
||||
strategies = [
|
||||
RegexExtractionStrategy(pattern=RegexExtractionStrategy.Email),
|
||||
JsonCssExtractionStrategy(simple_schema),
|
||||
# LLM as fallback for complex cases
|
||||
]
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
for strategy in strategies:
|
||||
try:
|
||||
config = CrawlerRunConfig(extraction_strategy=strategy)
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
|
||||
if result.success and result.extracted_content:
|
||||
data = json.loads(result.extracted_content)
|
||||
if data: # Validate non-empty results
|
||||
print(f"Success with {strategy.__class__.__name__}")
|
||||
return data
|
||||
|
||||
except Exception as e:
|
||||
print(f"Strategy {strategy.__class__.__name__} failed: {e}")
|
||||
continue
|
||||
|
||||
print("All strategies failed")
|
||||
return None
|
||||
```
|
||||
|
||||
**📖 Learn more:** [LLM Strategies Deep Dive](https://docs.crawl4ai.com/extraction/llm-strategies/), [Schema-Based Extraction](https://docs.crawl4ai.com/extraction/no-llm-strategies/), [Regex Patterns](https://docs.crawl4ai.com/extraction/no-llm-strategies/#regexextractionstrategy), [Performance Optimization](https://docs.crawl4ai.com/advanced/multi-url-crawling/)
|
||||
388
docs/md_v2/assets/llm.txt/txt/http_based_crawler_strategy.txt
Normal file
388
docs/md_v2/assets/llm.txt/txt/http_based_crawler_strategy.txt
Normal file
@@ -0,0 +1,388 @@
|
||||
## HTTP Crawler Strategy
|
||||
|
||||
Fast, lightweight HTTP-only crawling without browser overhead for cases where JavaScript execution isn't needed.
|
||||
|
||||
### Basic HTTP Crawler Setup
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, HTTPCrawlerConfig, CacheMode
|
||||
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
async def main():
|
||||
# Initialize HTTP strategy
|
||||
http_strategy = AsyncHTTPCrawlerStrategy(
|
||||
browser_config=HTTPCrawlerConfig(
|
||||
method="GET",
|
||||
verify_ssl=True,
|
||||
follow_redirects=True
|
||||
),
|
||||
logger=AsyncLogger(verbose=True)
|
||||
)
|
||||
|
||||
# Use with AsyncWebCrawler
|
||||
async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Content: {len(result.html)} chars")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### HTTP Request Types
|
||||
|
||||
```python
|
||||
# GET request (default)
|
||||
http_config = HTTPCrawlerConfig(
|
||||
method="GET",
|
||||
headers={"Accept": "application/json"}
|
||||
)
|
||||
|
||||
# POST with JSON data
|
||||
http_config = HTTPCrawlerConfig(
|
||||
method="POST",
|
||||
json={"key": "value", "data": [1, 2, 3]},
|
||||
headers={"Content-Type": "application/json"}
|
||||
)
|
||||
|
||||
# POST with form data
|
||||
http_config = HTTPCrawlerConfig(
|
||||
method="POST",
|
||||
data={"username": "user", "password": "pass"},
|
||||
headers={"Content-Type": "application/x-www-form-urlencoded"}
|
||||
)
|
||||
|
||||
# Advanced configuration
|
||||
http_config = HTTPCrawlerConfig(
|
||||
method="GET",
|
||||
headers={"User-Agent": "Custom Bot/1.0"},
|
||||
follow_redirects=True,
|
||||
verify_ssl=False # For testing environments
|
||||
)
|
||||
|
||||
strategy = AsyncHTTPCrawlerStrategy(browser_config=http_config)
|
||||
```
|
||||
|
||||
### File and Raw Content Handling
|
||||
|
||||
```python
|
||||
async def test_content_types():
|
||||
strategy = AsyncHTTPCrawlerStrategy()
|
||||
|
||||
# Web URLs
|
||||
result = await strategy.crawl("https://httpbin.org/get")
|
||||
print(f"Web content: {result.status_code}")
|
||||
|
||||
# Local files
|
||||
result = await strategy.crawl("file:///path/to/local/file.html")
|
||||
print(f"File content: {len(result.html)}")
|
||||
|
||||
# Raw HTML content
|
||||
raw_html = "raw://<html><body><h1>Test</h1><p>Content</p></body></html>"
|
||||
result = await strategy.crawl(raw_html)
|
||||
print(f"Raw content: {result.html}")
|
||||
|
||||
# Raw content with complex HTML
|
||||
complex_html = """raw://<!DOCTYPE html>
|
||||
<html>
|
||||
<head><title>Test Page</title></head>
|
||||
<body>
|
||||
<div class="content">
|
||||
<h1>Main Title</h1>
|
||||
<p>Paragraph content</p>
|
||||
<ul><li>Item 1</li><li>Item 2</li></ul>
|
||||
</div>
|
||||
</body>
|
||||
</html>"""
|
||||
result = await strategy.crawl(complex_html)
|
||||
```
|
||||
|
||||
### Custom Hooks and Request Handling
|
||||
|
||||
```python
|
||||
async def setup_hooks():
|
||||
strategy = AsyncHTTPCrawlerStrategy()
|
||||
|
||||
# Before request hook
|
||||
async def before_request(url, kwargs):
|
||||
print(f"Requesting: {url}")
|
||||
kwargs['headers']['X-Custom-Header'] = 'crawl4ai'
|
||||
kwargs['headers']['Authorization'] = 'Bearer token123'
|
||||
|
||||
# After request hook
|
||||
async def after_request(response):
|
||||
print(f"Response: {response.status_code}")
|
||||
if hasattr(response, 'redirected_url'):
|
||||
print(f"Redirected to: {response.redirected_url}")
|
||||
|
||||
# Error handling hook
|
||||
async def on_error(error):
|
||||
print(f"Request failed: {error}")
|
||||
|
||||
# Set hooks
|
||||
strategy.set_hook('before_request', before_request)
|
||||
strategy.set_hook('after_request', after_request)
|
||||
strategy.set_hook('on_error', on_error)
|
||||
|
||||
# Use with hooks
|
||||
result = await strategy.crawl("https://httpbin.org/headers")
|
||||
return result
|
||||
```
|
||||
|
||||
### Performance Configuration
|
||||
|
||||
```python
|
||||
# High-performance setup
|
||||
strategy = AsyncHTTPCrawlerStrategy(
|
||||
max_connections=50, # Concurrent connections
|
||||
dns_cache_ttl=300, # DNS cache timeout
|
||||
chunk_size=128 * 1024 # 128KB chunks for large files
|
||||
)
|
||||
|
||||
# Memory-efficient setup for large files
|
||||
strategy = AsyncHTTPCrawlerStrategy(
|
||||
max_connections=10,
|
||||
chunk_size=32 * 1024, # Smaller chunks
|
||||
dns_cache_ttl=600
|
||||
)
|
||||
|
||||
# Custom timeout configuration
|
||||
config = CrawlerRunConfig(
|
||||
page_timeout=30000, # 30 second timeout
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
result = await strategy.crawl("https://slow-server.com", config=config)
|
||||
```
|
||||
|
||||
### Error Handling and Retries
|
||||
|
||||
```python
|
||||
from crawl4ai.async_crawler_strategy import (
|
||||
ConnectionTimeoutError,
|
||||
HTTPStatusError,
|
||||
HTTPCrawlerError
|
||||
)
|
||||
|
||||
async def robust_crawling():
|
||||
strategy = AsyncHTTPCrawlerStrategy()
|
||||
|
||||
urls = [
|
||||
"https://example.com",
|
||||
"https://httpbin.org/status/404",
|
||||
"https://nonexistent.domain.test"
|
||||
]
|
||||
|
||||
for url in urls:
|
||||
try:
|
||||
result = await strategy.crawl(url)
|
||||
print(f"✓ {url}: {result.status_code}")
|
||||
|
||||
except HTTPStatusError as e:
|
||||
print(f"✗ {url}: HTTP {e.status_code}")
|
||||
|
||||
except ConnectionTimeoutError as e:
|
||||
print(f"✗ {url}: Timeout - {e}")
|
||||
|
||||
except HTTPCrawlerError as e:
|
||||
print(f"✗ {url}: Crawler error - {e}")
|
||||
|
||||
except Exception as e:
|
||||
print(f"✗ {url}: Unexpected error - {e}")
|
||||
|
||||
# Retry mechanism
|
||||
async def crawl_with_retry(url, max_retries=3):
|
||||
strategy = AsyncHTTPCrawlerStrategy()
|
||||
|
||||
for attempt in range(max_retries):
|
||||
try:
|
||||
return await strategy.crawl(url)
|
||||
except (ConnectionTimeoutError, HTTPCrawlerError) as e:
|
||||
if attempt == max_retries - 1:
|
||||
raise
|
||||
print(f"Retry {attempt + 1}/{max_retries}: {e}")
|
||||
await asyncio.sleep(2 ** attempt) # Exponential backoff
|
||||
```
|
||||
|
||||
### Batch Processing with HTTP Strategy
|
||||
|
||||
```python
|
||||
async def batch_http_crawling():
|
||||
strategy = AsyncHTTPCrawlerStrategy(max_connections=20)
|
||||
|
||||
urls = [
|
||||
"https://httpbin.org/get",
|
||||
"https://httpbin.org/user-agent",
|
||||
"https://httpbin.org/headers",
|
||||
"https://example.com",
|
||||
"https://httpbin.org/json"
|
||||
]
|
||||
|
||||
# Sequential processing
|
||||
results = []
|
||||
async with strategy:
|
||||
for url in urls:
|
||||
try:
|
||||
result = await strategy.crawl(url)
|
||||
results.append((url, result.status_code, len(result.html)))
|
||||
except Exception as e:
|
||||
results.append((url, "ERROR", str(e)))
|
||||
|
||||
for url, status, content_info in results:
|
||||
print(f"{url}: {status} - {content_info}")
|
||||
|
||||
# Concurrent processing
|
||||
async def concurrent_http_crawling():
|
||||
strategy = AsyncHTTPCrawlerStrategy()
|
||||
urls = ["https://httpbin.org/delay/1"] * 5
|
||||
|
||||
async def crawl_single(url):
|
||||
try:
|
||||
result = await strategy.crawl(url)
|
||||
return f"✓ {result.status_code}"
|
||||
except Exception as e:
|
||||
return f"✗ {e}"
|
||||
|
||||
async with strategy:
|
||||
tasks = [crawl_single(url) for url in urls]
|
||||
results = await asyncio.gather(*tasks, return_exceptions=True)
|
||||
|
||||
for i, result in enumerate(results):
|
||||
print(f"URL {i+1}: {result}")
|
||||
```
|
||||
|
||||
### Integration with Content Processing
|
||||
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator, PruningContentFilter
|
||||
|
||||
async def http_with_processing():
|
||||
# HTTP strategy with content processing
|
||||
http_strategy = AsyncHTTPCrawlerStrategy(
|
||||
browser_config=HTTPCrawlerConfig(verify_ssl=True)
|
||||
)
|
||||
|
||||
# Configure markdown generation
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(
|
||||
threshold=0.48,
|
||||
threshold_type="fixed",
|
||||
min_word_threshold=10
|
||||
)
|
||||
),
|
||||
word_count_threshold=5,
|
||||
excluded_tags=['script', 'style', 'nav'],
|
||||
exclude_external_links=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=crawler_config
|
||||
)
|
||||
|
||||
print(f"Status: {result.status_code}")
|
||||
print(f"Raw HTML: {len(result.html)} chars")
|
||||
if result.markdown:
|
||||
print(f"Markdown: {len(result.markdown.raw_markdown)} chars")
|
||||
if result.markdown.fit_markdown:
|
||||
print(f"Filtered: {len(result.markdown.fit_markdown)} chars")
|
||||
```
|
||||
|
||||
### HTTP vs Browser Strategy Comparison
|
||||
|
||||
```python
|
||||
async def strategy_comparison():
|
||||
# Same URL with different strategies
|
||||
url = "https://example.com"
|
||||
|
||||
# HTTP Strategy (fast, no JS)
|
||||
http_strategy = AsyncHTTPCrawlerStrategy()
|
||||
start_time = time.time()
|
||||
http_result = await http_strategy.crawl(url)
|
||||
http_time = time.time() - start_time
|
||||
|
||||
# Browser Strategy (full features)
|
||||
from crawl4ai import BrowserConfig
|
||||
browser_config = BrowserConfig(headless=True)
|
||||
start_time = time.time()
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
browser_result = await crawler.arun(url)
|
||||
browser_time = time.time() - start_time
|
||||
|
||||
print(f"HTTP Strategy:")
|
||||
print(f" Time: {http_time:.2f}s")
|
||||
print(f" Content: {len(http_result.html)} chars")
|
||||
print(f" Features: Fast, lightweight, no JS")
|
||||
|
||||
print(f"Browser Strategy:")
|
||||
print(f" Time: {browser_time:.2f}s")
|
||||
print(f" Content: {len(browser_result.html)} chars")
|
||||
print(f" Features: Full browser, JS, screenshots, etc.")
|
||||
|
||||
# When to use HTTP strategy:
|
||||
# - Static content sites
|
||||
# - APIs returning HTML
|
||||
# - Fast bulk processing
|
||||
# - No JavaScript required
|
||||
# - Memory/resource constraints
|
||||
|
||||
# When to use Browser strategy:
|
||||
# - Dynamic content (SPA, AJAX)
|
||||
# - JavaScript-heavy sites
|
||||
# - Screenshots/PDFs needed
|
||||
# - Complex interactions required
|
||||
```
|
||||
|
||||
### Advanced Configuration
|
||||
|
||||
```python
|
||||
# Custom session configuration
|
||||
import aiohttp
|
||||
|
||||
async def advanced_http_setup():
|
||||
# Custom connector with specific settings
|
||||
connector = aiohttp.TCPConnector(
|
||||
limit=100, # Connection pool size
|
||||
ttl_dns_cache=600, # DNS cache TTL
|
||||
use_dns_cache=True, # Enable DNS caching
|
||||
keepalive_timeout=30, # Keep-alive timeout
|
||||
force_close=False # Reuse connections
|
||||
)
|
||||
|
||||
strategy = AsyncHTTPCrawlerStrategy(
|
||||
max_connections=50,
|
||||
dns_cache_ttl=600,
|
||||
chunk_size=64 * 1024
|
||||
)
|
||||
|
||||
# Custom headers for all requests
|
||||
http_config = HTTPCrawlerConfig(
|
||||
headers={
|
||||
"User-Agent": "Crawl4AI-HTTP/1.0",
|
||||
"Accept": "text/html,application/xhtml+xml",
|
||||
"Accept-Language": "en-US,en;q=0.9",
|
||||
"Accept-Encoding": "gzip, deflate, br",
|
||||
"DNT": "1"
|
||||
},
|
||||
verify_ssl=True,
|
||||
follow_redirects=True
|
||||
)
|
||||
|
||||
strategy.browser_config = http_config
|
||||
|
||||
# Use with custom timeout
|
||||
config = CrawlerRunConfig(
|
||||
page_timeout=45000, # 45 seconds
|
||||
cache_mode=CacheMode.ENABLED
|
||||
)
|
||||
|
||||
result = await strategy.crawl("https://example.com", config=config)
|
||||
await strategy.close()
|
||||
```
|
||||
|
||||
**📖 Learn more:** [AsyncWebCrawler API](https://docs.crawl4ai.com/api/async-webcrawler/), [Browser vs HTTP Strategy](https://docs.crawl4ai.com/core/browser-crawler-config/), [Performance Optimization](https://docs.crawl4ai.com/advanced/multi-url-crawling/)
|
||||
docs/md_v2/assets/llm.txt/txt/installation.txt (new file, 231 lines) @@ -0,0 +1,231 @@
|
||||
## Installation
|
||||
|
||||
Multiple installation options for different environments and use cases.
|
||||
|
||||
### Basic Installation
|
||||
|
||||
```bash
|
||||
# Install core library
|
||||
pip install crawl4ai
|
||||
|
||||
# Initial setup (installs Playwright browsers)
|
||||
crawl4ai-setup
|
||||
|
||||
# Verify installation
|
||||
crawl4ai-doctor
|
||||
```
|
||||
|
||||
### Quick Verification
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
print(result.markdown[:300])
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Basic Usage Guide](https://docs.crawl4ai.com/core/quickstart.md)
|
||||
|
||||
### Advanced Features (Optional)
|
||||
|
||||
```bash
|
||||
# PyTorch-based features (text clustering, semantic chunking)
|
||||
pip install crawl4ai[torch]
|
||||
crawl4ai-setup
|
||||
|
||||
# Transformers (Hugging Face models)
|
||||
pip install crawl4ai[transformer]
|
||||
crawl4ai-setup
|
||||
|
||||
# All features (large download)
|
||||
pip install crawl4ai[all]
|
||||
crawl4ai-setup
|
||||
|
||||
# Pre-download models (optional)
|
||||
crawl4ai-download-models
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Advanced Features Documentation](https://docs.crawl4ai.com/extraction/llm-strategies.md)
|
||||
|
||||
### Docker Deployment
|
||||
|
||||
```bash
|
||||
# Pull pre-built image (specify platform for consistency)
|
||||
docker pull --platform linux/amd64 unclecode/crawl4ai:latest
|
||||
# For ARM (M1/M2 Macs): docker pull --platform linux/arm64 unclecode/crawl4ai:latest
|
||||
|
||||
# Setup environment for LLM support
|
||||
cat > .llm.env << EOL
|
||||
OPENAI_API_KEY=sk-your-key
|
||||
ANTHROPIC_API_KEY=your-anthropic-key
|
||||
EOL
|
||||
|
||||
# Run with LLM support (specify platform)
|
||||
docker run -d \
|
||||
--platform linux/amd64 \
|
||||
-p 11235:11235 \
|
||||
--name crawl4ai \
|
||||
--env-file .llm.env \
|
||||
--shm-size=1g \
|
||||
unclecode/crawl4ai:latest
|
||||
|
||||
# For ARM Macs, use: --platform linux/arm64
|
||||
|
||||
# Basic run (no LLM)
|
||||
docker run -d \
|
||||
--platform linux/amd64 \
|
||||
-p 11235:11235 \
|
||||
--name crawl4ai \
|
||||
--shm-size=1g \
|
||||
unclecode/crawl4ai:latest
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Complete Docker Guide](https://docs.crawl4ai.com/core/docker-deployment.md)
|
||||
|
||||
### Docker Compose
|
||||
|
||||
```bash
|
||||
# Clone repository
|
||||
git clone https://github.com/unclecode/crawl4ai.git
|
||||
cd crawl4ai
|
||||
|
||||
# Copy environment template
|
||||
cp deploy/docker/.llm.env.example .llm.env
|
||||
# Edit .llm.env with your API keys
|
||||
|
||||
# Run pre-built image
|
||||
IMAGE=unclecode/crawl4ai:latest docker compose up -d
|
||||
|
||||
# Build and run locally
|
||||
docker compose up --build -d
|
||||
|
||||
# Build with all features
|
||||
INSTALL_TYPE=all docker compose up --build -d
|
||||
|
||||
# Stop service
|
||||
docker compose down
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Docker Compose Configuration](https://docs.crawl4ai.com/core/docker-deployment.md#option-2-using-docker-compose)
|
||||
|
||||
### Manual Docker Build
|
||||
|
||||
```bash
|
||||
# Build multi-architecture image (specify platform)
|
||||
docker buildx build --platform linux/amd64 -t crawl4ai-local:latest --load .
|
||||
# For ARM: docker buildx build --platform linux/arm64 -t crawl4ai-local:latest --load .
|
||||
|
||||
# Build with specific features
|
||||
docker buildx build \
|
||||
--platform linux/amd64 \
|
||||
--build-arg INSTALL_TYPE=all \
|
||||
--build-arg ENABLE_GPU=false \
|
||||
-t crawl4ai-local:latest --load .
|
||||
|
||||
# Run custom build (specify platform)
|
||||
docker run -d \
|
||||
--platform linux/amd64 \
|
||||
-p 11235:11235 \
|
||||
--name crawl4ai-custom \
|
||||
--env-file .llm.env \
|
||||
--shm-size=1g \
|
||||
crawl4ai-local:latest
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Manual Build Guide](https://docs.crawl4ai.com/core/docker-deployment.md#option-3-manual-local-build--run)
|
||||
|
||||
### Google Colab
|
||||
|
||||
```python
|
||||
# Install in Colab
|
||||
!pip install crawl4ai
|
||||
!crawl4ai-setup
|
||||
|
||||
# If setup fails, manually install Playwright browsers
|
||||
!playwright install chromium
|
||||
|
||||
# Install with all features (may take 5-10 minutes)
|
||||
!pip install crawl4ai[all]
|
||||
!crawl4ai-setup
|
||||
!crawl4ai-download-models
|
||||
|
||||
# If still having issues, force Playwright install
|
||||
!playwright install chromium --force
|
||||
|
||||
# Quick test
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def test_crawl():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
print("✅ Installation successful!")
|
||||
print(f"Content length: {len(result.markdown)}")
|
||||
|
||||
# Run test in Colab
|
||||
await test_crawl()
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Colab Examples Notebook](https://colab.research.google.com/github/unclecode/crawl4ai/blob/main/docs/examples/quickstart.ipynb)
|
||||
|
||||
### Docker API Usage
|
||||
|
||||
```python
|
||||
# Using Docker SDK
|
||||
import asyncio
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
from crawl4ai import BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
|
||||
async def main():
|
||||
async with Crawl4aiDockerClient(base_url="http://localhost:11235") as client:
|
||||
results = await client.crawl(
|
||||
["https://example.com"],
|
||||
browser_config=BrowserConfig(headless=True),
|
||||
crawler_config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
)
|
||||
for result in results:
|
||||
print(f"Success: {result.success}, Length: {len(result.markdown)}")
|
||||
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Docker Client API](https://docs.crawl4ai.com/core/docker-deployment.md#python-sdk)
|
||||
|
||||
### Direct API Calls
|
||||
|
||||
```python
|
||||
# REST API example
|
||||
import requests
|
||||
|
||||
payload = {
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": {"type": "BrowserConfig", "params": {"headless": True}},
|
||||
"crawler_config": {"type": "CrawlerRunConfig", "params": {"cache_mode": "bypass"}}
|
||||
}
|
||||
|
||||
response = requests.post("http://localhost:11235/crawl", json=payload)
|
||||
print(response.json())
|
||||
```
|
||||
|
||||
**📖 Learn more:** [REST API Reference](https://docs.crawl4ai.com/core/docker-deployment.md#rest-api-examples)
|
||||
|
||||
### Health Check
|
||||
|
||||
```bash
|
||||
# Check Docker service
|
||||
curl http://localhost:11235/health
|
||||
|
||||
# Access playground
|
||||
open http://localhost:11235/playground
|
||||
|
||||
# View metrics
|
||||
curl http://localhost:11235/metrics
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Monitoring & Metrics](https://docs.crawl4ai.com/core/docker-deployment.md#metrics--monitoring)
|
||||
docs/md_v2/assets/llm.txt/txt/llms-full.txt (new file, 5929 lines — diff suppressed because it is too large)
docs/md_v2/assets/llm.txt/txt/multi_urls_crawling.txt (new file, 339 lines) @@ -0,0 +1,339 @@
|
||||
## Multi-URL Crawling
|
||||
|
||||
Concurrent crawling of multiple URLs with intelligent resource management, rate limiting, and real-time monitoring.
|
||||
|
||||
### Basic Multi-URL Crawling
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||
|
||||
# Batch processing (default) - get all results at once
|
||||
async def batch_crawl():
|
||||
urls = [
|
||||
"https://example.com/page1",
|
||||
"https://example.com/page2",
|
||||
"https://example.com/page3"
|
||||
]
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
stream=False # Default: batch mode
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(urls, config=config)
|
||||
|
||||
for result in results:
|
||||
if result.success:
|
||||
print(f"✅ {result.url}: {len(result.markdown)} chars")
|
||||
else:
|
||||
print(f"❌ {result.url}: {result.error_message}")
|
||||
|
||||
# Streaming processing - handle results as they complete
|
||||
async def streaming_crawl():
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
stream=True # Enable streaming
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Process results as they become available
|
||||
async for result in await crawler.arun_many(urls, config=config):
|
||||
if result.success:
|
||||
print(f"🔥 Just completed: {result.url}")
|
||||
await process_result_immediately(result)
|
||||
else:
|
||||
print(f"❌ Failed: {result.url}")
|
||||
```
|
||||
|
||||
### Memory-Adaptive Dispatching
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, MemoryAdaptiveDispatcher, CrawlerMonitor, DisplayMode
|
||||
|
||||
# Automatically manages concurrency based on system memory
|
||||
async def memory_adaptive_crawl():
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=80.0, # Pause if memory exceeds 80%
|
||||
check_interval=1.0, # Check memory every second
|
||||
max_session_permit=15, # Max concurrent tasks
|
||||
memory_wait_timeout=300.0 # Wait up to 5 minutes for memory
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
word_count_threshold=50
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=large_url_list,
|
||||
config=config,
|
||||
dispatcher=dispatcher
|
||||
)
|
||||
|
||||
# Each result includes dispatch information
|
||||
for result in results:
|
||||
if result.dispatch_result:
|
||||
dr = result.dispatch_result
|
||||
print(f"Memory used: {dr.memory_usage:.1f}MB")
|
||||
print(f"Duration: {dr.end_time - dr.start_time}")
|
||||
```
|
||||
|
||||
### Rate-Limited Crawling
|
||||
|
||||
```python
|
||||
from crawl4ai import RateLimiter, SemaphoreDispatcher
|
||||
|
||||
# Control request pacing and handle server rate limits
|
||||
async def rate_limited_crawl():
|
||||
rate_limiter = RateLimiter(
|
||||
base_delay=(1.0, 3.0), # Random delay 1-3 seconds
|
||||
max_delay=60.0, # Cap backoff at 60 seconds
|
||||
max_retries=3, # Retry failed requests 3 times
|
||||
rate_limit_codes=[429, 503] # Handle these status codes
|
||||
)
|
||||
|
||||
dispatcher = SemaphoreDispatcher(
|
||||
max_session_permit=5, # Fixed concurrency limit
|
||||
rate_limiter=rate_limiter
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
user_agent_mode="random", # Randomize user agents
|
||||
simulate_user=True # Simulate human behavior
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async for result in await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=config,
|
||||
dispatcher=dispatcher
|
||||
):
|
||||
print(f"Processed: {result.url}")
|
||||
```
|
||||
|
||||
### Real-Time Monitoring
|
||||
|
||||
```python
|
||||
from crawl4ai import CrawlerMonitor, DisplayMode
|
||||
|
||||
# Monitor crawling progress in real-time
|
||||
async def monitored_crawl():
|
||||
monitor = CrawlerMonitor(
|
||||
max_visible_rows=20, # Show 20 tasks in display
|
||||
display_mode=DisplayMode.DETAILED # Show individual task details
|
||||
)
|
||||
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=75.0,
|
||||
max_session_permit=10,
|
||||
monitor=monitor # Attach monitor to dispatcher
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
dispatcher=dispatcher
|
||||
)
|
||||
```
|
||||
|
||||
### Advanced Dispatcher Configurations
|
||||
|
||||
```python
|
||||
# Memory-adaptive with comprehensive monitoring
|
||||
memory_dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=85.0, # Higher memory tolerance
|
||||
check_interval=0.5, # Check memory more frequently
|
||||
max_session_permit=20, # More concurrent tasks
|
||||
memory_wait_timeout=600.0, # Wait longer for memory
|
||||
rate_limiter=RateLimiter(
|
||||
base_delay=(0.5, 1.5),
|
||||
max_delay=30.0,
|
||||
max_retries=5
|
||||
),
|
||||
monitor=CrawlerMonitor(
|
||||
max_visible_rows=15,
|
||||
display_mode=DisplayMode.AGGREGATED # Summary view
|
||||
)
|
||||
)
|
||||
|
||||
# Simple semaphore-based dispatcher
|
||||
semaphore_dispatcher = SemaphoreDispatcher(
|
||||
max_session_permit=8, # Fixed concurrency
|
||||
rate_limiter=RateLimiter(
|
||||
base_delay=(1.0, 2.0),
|
||||
max_delay=20.0
|
||||
)
|
||||
)
|
||||
|
||||
# Usage with custom dispatcher
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=config,
|
||||
dispatcher=memory_dispatcher # or semaphore_dispatcher
|
||||
)
|
||||
```
|
||||
|
||||
### Handling Large-Scale Crawling
|
||||
|
||||
```python
|
||||
async def large_scale_crawl():
|
||||
# For thousands of URLs
|
||||
urls = load_urls_from_file("large_url_list.txt") # 10,000+ URLs
|
||||
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=70.0, # Conservative memory usage
|
||||
max_session_permit=25, # Higher concurrency
|
||||
rate_limiter=RateLimiter(
|
||||
base_delay=(0.1, 0.5), # Faster for large batches
|
||||
max_retries=2 # Fewer retries for speed
|
||||
),
|
||||
monitor=CrawlerMonitor(display_mode=DisplayMode.AGGREGATED)
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.ENABLED, # Use caching for efficiency
|
||||
stream=True, # Stream for memory efficiency
|
||||
word_count_threshold=100, # Skip short content
|
||||
exclude_external_links=True # Reduce processing overhead
|
||||
)
|
||||
|
||||
successful_crawls = 0
|
||||
failed_crawls = 0
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async for result in await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=config,
|
||||
dispatcher=dispatcher
|
||||
):
|
||||
if result.success:
|
||||
successful_crawls += 1
|
||||
await save_result_to_database(result)
|
||||
else:
|
||||
failed_crawls += 1
|
||||
await log_failure(result.url, result.error_message)
|
||||
|
||||
# Progress reporting
|
||||
if (successful_crawls + failed_crawls) % 100 == 0:
|
||||
print(f"Progress: {successful_crawls + failed_crawls}/{len(urls)}")
|
||||
|
||||
print(f"Completed: {successful_crawls} successful, {failed_crawls} failed")
|
||||
```
|
||||
|
||||
### Robots.txt Compliance
|
||||
|
||||
```python
|
||||
async def compliant_crawl():
|
||||
config = CrawlerRunConfig(
|
||||
check_robots_txt=True, # Respect robots.txt
|
||||
user_agent="MyBot/1.0", # Identify your bot
|
||||
mean_delay=2.0, # Be polite with delays
|
||||
max_range=1.0
|
||||
)
|
||||
|
||||
dispatcher = SemaphoreDispatcher(
|
||||
max_session_permit=3, # Conservative concurrency
|
||||
rate_limiter=RateLimiter(
|
||||
base_delay=(2.0, 5.0), # Slower, more respectful
|
||||
max_retries=1
|
||||
)
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async for result in await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=config,
|
||||
dispatcher=dispatcher
|
||||
):
|
||||
if result.success:
|
||||
print(f"✅ Crawled: {result.url}")
|
||||
elif "robots.txt" in result.error_message:
|
||||
print(f"🚫 Blocked by robots.txt: {result.url}")
|
||||
else:
|
||||
print(f"❌ Error: {result.url}")
|
||||
```
|
||||
|
||||
### Performance Analysis
|
||||
|
||||
```python
|
||||
async def analyze_crawl_performance():
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=80.0,
|
||||
max_session_permit=12,
|
||||
monitor=CrawlerMonitor(display_mode=DisplayMode.DETAILED)
|
||||
)
|
||||
|
||||
start_time = time.time()
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
dispatcher=dispatcher
|
||||
)
|
||||
|
||||
end_time = time.time()
|
||||
|
||||
# Analyze results
|
||||
successful = [r for r in results if r.success]
|
||||
failed = [r for r in results if not r.success]
|
||||
|
||||
print(f"Total time: {end_time - start_time:.2f}s")
|
||||
print(f"Success rate: {len(successful)}/{len(results)} ({len(successful)/len(results)*100:.1f}%)")
|
||||
print(f"Avg time per URL: {(end_time - start_time)/len(results):.2f}s")
|
||||
|
||||
# Memory usage analysis
|
||||
if successful and successful[0].dispatch_result:
|
||||
memory_usage = [r.dispatch_result.memory_usage for r in successful if r.dispatch_result]
|
||||
peak_memory = [r.dispatch_result.peak_memory for r in successful if r.dispatch_result]
|
||||
|
||||
print(f"Avg memory usage: {sum(memory_usage)/len(memory_usage):.1f}MB")
|
||||
print(f"Peak memory usage: {max(peak_memory):.1f}MB")
|
||||
```
|
||||
|
||||
### Error Handling and Recovery
|
||||
|
||||
```python
|
||||
async def robust_multi_crawl():
|
||||
failed_urls = []
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
stream=True,
|
||||
page_timeout=30000 # 30 second timeout
|
||||
)
|
||||
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=85.0,
|
||||
max_session_permit=10
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async for result in await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=config,
|
||||
dispatcher=dispatcher
|
||||
):
|
||||
if result.success:
|
||||
await process_successful_result(result)
|
||||
else:
|
||||
failed_urls.append({
|
||||
'url': result.url,
|
||||
'error': result.error_message,
|
||||
'status_code': result.status_code
|
||||
})
|
||||
|
||||
# Retry logic for specific errors
|
||||
if result.status_code in [503, 429]: # Server errors
|
||||
await schedule_retry(result.url)
|
||||
|
||||
# Report failures
|
||||
if failed_urls:
|
||||
print(f"Failed to crawl {len(failed_urls)} URLs:")
|
||||
for failure in failed_urls[:10]: # Show first 10
|
||||
print(f" {failure['url']}: {failure['error']}")
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Advanced Multi-URL Crawling](https://docs.crawl4ai.com/advanced/multi-url-crawling/), [Crawl Dispatcher](https://docs.crawl4ai.com/advanced/crawl-dispatcher/), [arun_many() API Reference](https://docs.crawl4ai.com/api/arun_many/)
|
||||
docs/md_v2/assets/llm.txt/txt/simple_crawling.txt (new file, 365 lines) @@ -0,0 +1,365 @@
|
||||
## Simple Crawling
|
||||
|
||||
Basic web crawling operations with AsyncWebCrawler, configurations, and response handling.
|
||||
|
||||
### Basic Setup
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
browser_config = BrowserConfig() # Default browser settings
|
||||
run_config = CrawlerRunConfig() # Default crawl settings
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=run_config
|
||||
)
|
||||
print(result.markdown)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
### Understanding CrawlResult
|
||||
|
||||
```python
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(threshold=0.6),
|
||||
options={"ignore_links": True}
|
||||
)
|
||||
)
|
||||
|
||||
result = await crawler.arun("https://example.com", config=config)
|
||||
|
||||
# Different content formats
|
||||
print(result.html) # Raw HTML
|
||||
print(result.cleaned_html) # Cleaned HTML
|
||||
print(result.markdown.raw_markdown) # Raw markdown
|
||||
print(result.markdown.fit_markdown) # Filtered markdown
|
||||
|
||||
# Status information
|
||||
print(result.success) # True/False
|
||||
print(result.status_code) # HTTP status (200, 404, etc.)
|
||||
|
||||
# Extracted content
|
||||
print(result.media) # Images, videos, audio
|
||||
print(result.links) # Internal/external links
|
||||
```
|
||||
|
||||
### Basic Configuration Options
|
||||
|
||||
```python
|
||||
run_config = CrawlerRunConfig(
|
||||
word_count_threshold=10, # Min words per block
|
||||
exclude_external_links=True, # Remove external links
|
||||
remove_overlay_elements=True, # Remove popups/modals
|
||||
process_iframes=True, # Process iframe content
|
||||
excluded_tags=['form', 'header'] # Skip these tags
|
||||
)
|
||||
|
||||
result = await crawler.arun("https://example.com", config=run_config)
|
||||
```
|
||||
|
||||
### Error Handling
|
||||
|
||||
```python
|
||||
result = await crawler.arun("https://example.com", config=run_config)
|
||||
|
||||
if not result.success:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
print(f"Status code: {result.status_code}")
|
||||
else:
|
||||
print(f"Success! Content length: {len(result.markdown)}")
|
||||
```
|
||||
|
||||
### Debugging with Verbose Logging
|
||||
|
||||
```python
|
||||
browser_config = BrowserConfig(verbose=True)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun("https://example.com")
|
||||
# Detailed logging output will be displayed
|
||||
```
|
||||
|
||||
### Complete Example
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
|
||||
async def comprehensive_crawl():
|
||||
browser_config = BrowserConfig(verbose=True)
|
||||
|
||||
run_config = CrawlerRunConfig(
|
||||
# Content filtering
|
||||
word_count_threshold=10,
|
||||
excluded_tags=['form', 'header', 'nav'],
|
||||
exclude_external_links=True,
|
||||
|
||||
# Content processing
|
||||
process_iframes=True,
|
||||
remove_overlay_elements=True,
|
||||
|
||||
# Cache control
|
||||
cache_mode=CacheMode.ENABLED
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=run_config
|
||||
)
|
||||
|
||||
if result.success:
|
||||
# Display content summary
|
||||
print(f"Title: {result.metadata.get('title', 'No title')}")
|
||||
print(f"Content: {result.markdown[:500]}...")
|
||||
|
||||
# Process media
|
||||
images = result.media.get("images", [])
|
||||
print(f"Found {len(images)} images")
|
||||
for img in images[:3]: # First 3 images
|
||||
print(f" - {img.get('src', 'No src')}")
|
||||
|
||||
# Process links
|
||||
internal_links = result.links.get("internal", [])
|
||||
print(f"Found {len(internal_links)} internal links")
|
||||
for link in internal_links[:3]: # First 3 links
|
||||
print(f" - {link.get('href', 'No href')}")
|
||||
|
||||
else:
|
||||
print(f"❌ Crawl failed: {result.error_message}")
|
||||
print(f"Status: {result.status_code}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(comprehensive_crawl())
|
||||
```
|
||||
|
||||
### Working with Raw HTML and Local Files
|
||||
|
||||
```python
|
||||
# Crawl raw HTML
|
||||
raw_html = "<html><body><h1>Test</h1><p>Content</p></body></html>"
|
||||
result = await crawler.arun(f"raw://{raw_html}")
|
||||
|
||||
# Crawl local file
|
||||
result = await crawler.arun("file:///path/to/local/file.html")
|
||||
|
||||
# Both return standard CrawlResult objects
|
||||
print(result.markdown)
|
||||
```
|
||||
|
||||
## Table Extraction
|
||||
|
||||
Extract structured data from HTML tables with automatic detection and scoring.
|
||||
|
||||
### Basic Table Extraction
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
import pandas as pd
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||
|
||||
async def extract_tables():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
table_score_threshold=7, # Higher = stricter detection
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
result = await crawler.arun("https://example.com/tables", config=config)
|
||||
|
||||
if result.success and result.tables:
|
||||
# New tables field (v0.6+)
|
||||
for i, table in enumerate(result.tables):
|
||||
print(f"Table {i+1}:")
|
||||
print(f"Headers: {table['headers']}")
|
||||
print(f"Rows: {len(table['rows'])}")
|
||||
print(f"Caption: {table.get('caption', 'No caption')}")
|
||||
|
||||
# Convert to DataFrame
|
||||
df = pd.DataFrame(table['rows'], columns=table['headers'])
|
||||
print(df.head())
|
||||
|
||||
asyncio.run(extract_tables())
|
||||
```
|
||||
|
||||
### Advanced Table Processing
|
||||
|
||||
```python
|
||||
from crawl4ai import LXMLWebScrapingStrategy
|
||||
|
||||
async def process_financial_tables():
|
||||
config = CrawlerRunConfig(
|
||||
table_score_threshold=8, # Strict detection for data tables
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
keep_data_attributes=True,
|
||||
scan_full_page=True
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://coinmarketcap.com", config=config)
|
||||
|
||||
if result.tables:
|
||||
# Get the main data table (usually first/largest)
|
||||
main_table = result.tables[0]
|
||||
|
||||
# Create DataFrame
|
||||
df = pd.DataFrame(
|
||||
main_table['rows'],
|
||||
columns=main_table['headers']
|
||||
)
|
||||
|
||||
# Clean and process data
|
||||
df = clean_financial_data(df)
|
||||
|
||||
# Save for analysis
|
||||
df.to_csv("market_data.csv", index=False)
|
||||
return df
|
||||
|
||||
def clean_financial_data(df):
|
||||
"""Clean currency symbols, percentages, and large numbers"""
|
||||
for col in df.columns:
|
||||
if 'price' in col.lower():
|
||||
# Remove currency symbols
|
||||
df[col] = df[col].str.replace(r'[^\d.]', '', regex=True)
|
||||
df[col] = pd.to_numeric(df[col], errors='coerce')
|
||||
|
||||
elif '%' in str(df[col].iloc[0]):
|
||||
# Convert percentages
|
||||
df[col] = df[col].str.replace('%', '').astype(float) / 100
|
||||
|
||||
elif any(suffix in str(df[col].iloc[0]) for suffix in ['B', 'M', 'K']):
|
||||
# Handle large numbers (Billions, Millions, etc.)
|
||||
df[col] = df[col].apply(convert_large_numbers)
|
||||
|
||||
return df
|
||||
|
||||
def convert_large_numbers(value):
|
||||
"""Convert 1.5B -> 1500000000"""
|
||||
if pd.isna(value):
|
||||
return float('nan')
|
||||
|
||||
value = str(value)
|
||||
multiplier = 1
|
||||
if 'B' in value:
|
||||
multiplier = 1e9
|
||||
elif 'M' in value:
|
||||
multiplier = 1e6
|
||||
elif 'K' in value:
|
||||
multiplier = 1e3
|
||||
|
||||
number = float(re.sub(r'[^\d.]', '', value))
|
||||
return number * multiplier
|
||||
```
|
||||
|
||||
### Table Detection Configuration
|
||||
|
||||
```python
|
||||
# Strict table detection (data-heavy pages)
|
||||
strict_config = CrawlerRunConfig(
|
||||
table_score_threshold=9, # Only high-quality tables
|
||||
word_count_threshold=5, # Ignore sparse content
|
||||
excluded_tags=['nav', 'footer'] # Skip navigation tables
|
||||
)
|
||||
|
||||
# Lenient detection (mixed content pages)
|
||||
lenient_config = CrawlerRunConfig(
|
||||
table_score_threshold=5, # Include layout tables
|
||||
process_iframes=True, # Check embedded tables
|
||||
scan_full_page=True # Scroll to load dynamic tables
|
||||
)
|
||||
|
||||
# Financial/data site optimization
|
||||
financial_config = CrawlerRunConfig(
|
||||
table_score_threshold=8,
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
wait_for="css:table", # Wait for tables to load
|
||||
scan_full_page=True,
|
||||
scroll_delay=0.2
|
||||
)
|
||||
```
|
||||
|
||||
### Multi-Table Processing
|
||||
|
||||
```python
|
||||
async def extract_all_tables():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun("https://example.com/data", config=config)
|
||||
|
||||
tables_data = {}
|
||||
|
||||
for i, table in enumerate(result.tables):
|
||||
# Create meaningful names based on content
|
||||
table_name = (
|
||||
table.get('caption') or
|
||||
f"table_{i+1}_{table['headers'][0]}"
|
||||
).replace(' ', '_').lower()
|
||||
|
||||
df = pd.DataFrame(table['rows'], columns=table['headers'])
|
||||
|
||||
# Store with metadata
|
||||
tables_data[table_name] = {
|
||||
'dataframe': df,
|
||||
'headers': table['headers'],
|
||||
'row_count': len(table['rows']),
|
||||
'caption': table.get('caption'),
|
||||
'summary': table.get('summary')
|
||||
}
|
||||
|
||||
return tables_data
|
||||
|
||||
# Usage
|
||||
tables = await extract_all_tables()
|
||||
for name, data in tables.items():
|
||||
print(f"{name}: {data['row_count']} rows")
|
||||
data['dataframe'].to_csv(f"{name}.csv")
|
||||
```
|
||||
|
||||
### Backward Compatibility
|
||||
|
||||
```python
|
||||
# Support both new and old table formats
|
||||
def get_tables(result):
|
||||
# New format (v0.6+)
|
||||
if hasattr(result, 'tables') and result.tables:
|
||||
return result.tables
|
||||
|
||||
# Fallback to media.tables (older versions)
|
||||
return result.media.get('tables', [])
|
||||
|
||||
# Usage in existing code
|
||||
result = await crawler.arun(url, config=config)
|
||||
tables = get_tables(result)
|
||||
|
||||
for table in tables:
|
||||
df = pd.DataFrame(table['rows'], columns=table['headers'])
|
||||
# Process table data...
|
||||
```
|
||||
|
||||
### Table Quality Scoring
|
||||
|
||||
```python
|
||||
# Understanding table_score_threshold values:
|
||||
# 10: Only perfect data tables (headers + data rows)
|
||||
# 8-9: High-quality tables (recommended for financial/data sites)
|
||||
# 6-7: Mixed content tables (news sites, wikis)
|
||||
# 4-5: Layout tables included (broader detection)
|
||||
# 1-3: All table-like structures (very permissive)
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
table_score_threshold=8, # Balanced detection
|
||||
verbose=True # See scoring details in logs
|
||||
)
|
||||
```
|
||||
|
||||
|
||||
**📖 Learn more:** [CrawlResult API Reference](https://docs.crawl4ai.com/api/crawl-result/), [Browser & Crawler Configuration](https://docs.crawl4ai.com/core/browser-crawler-config/), [Cache Modes](https://docs.crawl4ai.com/core/cache-modes/)
|
||||
655
docs/md_v2/assets/llm.txt/txt/url_seeder.txt
Normal file
@@ -0,0 +1,655 @@
|
||||
## URL Seeding
|
||||
|
||||
Smart URL discovery for efficient large-scale crawling. Discover thousands of URLs instantly, filter by relevance, then crawl only what matters.
|
||||
|
||||
### Why URL Seeding vs Deep Crawling
|
||||
|
||||
```python
|
||||
# Deep Crawling: Real-time discovery (page by page)
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.deep_crawling import BFSDeepCrawlStrategy
|
||||
|
||||
async def deep_crawl_example():
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=2,
|
||||
include_external=False,
|
||||
max_pages=50
|
||||
)
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun("https://example.com", config=config)
|
||||
print(f"Discovered {len(results)} pages dynamically")
|
||||
|
||||
# URL Seeding: Bulk discovery (thousands instantly)
|
||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||
|
||||
async def url_seeding_example():
|
||||
config = SeedingConfig(
|
||||
source="sitemap+cc",
|
||||
pattern="*/docs/*",
|
||||
extract_head=True,
|
||||
query="API documentation",
|
||||
scoring_method="bm25",
|
||||
max_urls=1000
|
||||
)
|
||||
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
urls = await seeder.urls("example.com", config)
|
||||
print(f"Discovered {len(urls)} URLs instantly")
|
||||
# Now crawl only the most relevant ones
|
||||
```
|
||||
|
||||
### Basic URL Discovery
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncUrlSeeder, SeedingConfig
|
||||
|
||||
async def basic_discovery():
|
||||
# Context manager handles cleanup automatically
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
|
||||
# Simple discovery from sitemaps
|
||||
config = SeedingConfig(source="sitemap")
|
||||
urls = await seeder.urls("example.com", config)
|
||||
|
||||
print(f"Found {len(urls)} URLs from sitemap")
|
||||
for url in urls[:5]:
|
||||
print(f" - {url['url']} (status: {url['status']})")
|
||||
|
||||
# Manual cleanup (if needed)
|
||||
async def manual_cleanup():
|
||||
seeder = AsyncUrlSeeder()
|
||||
try:
|
||||
config = SeedingConfig(source="cc") # Common Crawl
|
||||
urls = await seeder.urls("example.com", config)
|
||||
print(f"Found {len(urls)} URLs from Common Crawl")
|
||||
finally:
|
||||
await seeder.close()
|
||||
|
||||
asyncio.run(basic_discovery())
|
||||
```
|
||||
|
||||
### Data Sources and Patterns
|
||||
|
||||
```python
|
||||
# Different data sources
|
||||
configs = [
|
||||
SeedingConfig(source="sitemap"), # Fastest, official URLs
|
||||
SeedingConfig(source="cc"), # Most comprehensive
|
||||
SeedingConfig(source="sitemap+cc"), # Maximum coverage
|
||||
]
|
||||
|
||||
# URL pattern filtering
|
||||
patterns = [
|
||||
SeedingConfig(pattern="*/blog/*"), # Blog posts only
|
||||
SeedingConfig(pattern="*.html"), # HTML files only
|
||||
SeedingConfig(pattern="*/product/*"), # Product pages
|
||||
SeedingConfig(pattern="*/docs/api/*"), # API documentation
|
||||
SeedingConfig(pattern="*"), # Everything
|
||||
]
|
||||
|
||||
# Advanced pattern usage
|
||||
async def pattern_filtering():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# Find all blog posts from 2024
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
pattern="*/blog/2024/*.html",
|
||||
max_urls=100
|
||||
)
|
||||
|
||||
blog_urls = await seeder.urls("example.com", config)
|
||||
|
||||
# Further filter by keywords in URL
|
||||
python_posts = [
|
||||
url for url in blog_urls
|
||||
if "python" in url['url'].lower()
|
||||
]
|
||||
|
||||
print(f"Found {len(python_posts)} Python blog posts")
|
||||
```
|
||||
|
||||
### SeedingConfig Parameters
|
||||
|
||||
```python
|
||||
from crawl4ai import SeedingConfig
|
||||
|
||||
# Comprehensive configuration
|
||||
config = SeedingConfig(
|
||||
# Data sources
|
||||
source="sitemap+cc", # "sitemap", "cc", "sitemap+cc"
|
||||
pattern="*/docs/*", # URL pattern filter
|
||||
|
||||
# Metadata extraction
|
||||
extract_head=True, # Get <head> metadata
|
||||
live_check=True, # Verify URLs are accessible
|
||||
|
||||
# Performance controls
|
||||
max_urls=1000, # Limit results (-1 = unlimited)
|
||||
concurrency=20, # Parallel workers
|
||||
hits_per_sec=10, # Rate limiting
|
||||
|
||||
# Relevance scoring
|
||||
query="API documentation guide", # Search query
|
||||
scoring_method="bm25", # Scoring algorithm
|
||||
score_threshold=0.3, # Minimum relevance (0.0-1.0)
|
||||
|
||||
# Cache and filtering
|
||||
force=False, # Bypass cache
|
||||
filter_nonsense_urls=True, # Remove utility URLs
|
||||
verbose=True # Debug output
|
||||
)
|
||||
|
||||
# Quick configurations for common use cases
|
||||
blog_config = SeedingConfig(
|
||||
source="sitemap",
|
||||
pattern="*/blog/*",
|
||||
extract_head=True
|
||||
)
|
||||
|
||||
api_docs_config = SeedingConfig(
|
||||
source="sitemap+cc",
|
||||
pattern="*/docs/*",
|
||||
query="API reference documentation",
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.5
|
||||
)
|
||||
|
||||
product_pages_config = SeedingConfig(
|
||||
source="cc",
|
||||
pattern="*/product/*",
|
||||
live_check=True,
|
||||
max_urls=500
|
||||
)
|
||||
```
|
||||
|
||||
### Metadata Extraction and Analysis
|
||||
|
||||
```python
|
||||
async def metadata_extraction():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True, # Extract <head> metadata
|
||||
pattern="*/blog/*",
|
||||
max_urls=50
|
||||
)
|
||||
|
||||
urls = await seeder.urls("example.com", config)
|
||||
|
||||
# Analyze extracted metadata
|
||||
for url in urls[:5]:
|
||||
head_data = url['head_data']
|
||||
print(f"\nURL: {url['url']}")
|
||||
print(f"Title: {head_data.get('title', 'No title')}")
|
||||
|
||||
# Standard meta tags
|
||||
meta = head_data.get('meta', {})
|
||||
print(f"Description: {meta.get('description', 'N/A')}")
|
||||
print(f"Keywords: {meta.get('keywords', 'N/A')}")
|
||||
print(f"Author: {meta.get('author', 'N/A')}")
|
||||
|
||||
# Open Graph data
|
||||
print(f"OG Image: {meta.get('og:image', 'N/A')}")
|
||||
print(f"OG Type: {meta.get('og:type', 'N/A')}")
|
||||
|
||||
# JSON-LD structured data
|
||||
jsonld = head_data.get('jsonld', [])
|
||||
if jsonld:
|
||||
print(f"Structured data: {len(jsonld)} items")
|
||||
for item in jsonld[:2]:
|
||||
if isinstance(item, dict):
|
||||
print(f" Type: {item.get('@type', 'Unknown')}")
|
||||
print(f" Name: {item.get('name', 'N/A')}")
|
||||
|
||||
# Filter by metadata
|
||||
async def metadata_filtering():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
max_urls=100
|
||||
)
|
||||
|
||||
urls = await seeder.urls("news.example.com", config)
|
||||
|
||||
# Filter by publication date (from JSON-LD)
|
||||
from datetime import datetime, timedelta
|
||||
recent_cutoff = datetime.now() - timedelta(days=7)
|
||||
|
||||
recent_articles = []
|
||||
for url in urls:
|
||||
for jsonld in url['head_data'].get('jsonld', []):
|
||||
if isinstance(jsonld, dict) and 'datePublished' in jsonld:
|
||||
try:
|
||||
pub_date = datetime.fromisoformat(
|
||||
jsonld['datePublished'].replace('Z', '+00:00')
|
||||
)
|
||||
if pub_date > recent_cutoff:
|
||||
recent_articles.append(url)
|
||||
break
|
||||
except:
|
||||
continue
|
||||
|
||||
print(f"Found {len(recent_articles)} recent articles")
|
||||
```
|
||||
|
||||
### BM25 Relevance Scoring
|
||||
|
||||
```python
|
||||
async def relevance_scoring():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# Find pages about Python async programming
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True, # Required for content-based scoring
|
||||
query="python async await concurrency",
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.3, # Only 30%+ relevant pages
|
||||
max_urls=20
|
||||
)
|
||||
|
||||
urls = await seeder.urls("docs.python.org", config)
|
||||
|
||||
# Results are automatically sorted by relevance
|
||||
print("Most relevant Python async content:")
|
||||
for url in urls[:5]:
|
||||
score = url['relevance_score']
|
||||
title = url['head_data'].get('title', 'No title')
|
||||
print(f"[{score:.2f}] {title}")
|
||||
print(f" {url['url']}")
|
||||
|
||||
# URL-based scoring (when extract_head=False)
|
||||
async def url_based_scoring():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=False, # Fast URL-only scoring
|
||||
query="machine learning tutorial",
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2
|
||||
)
|
||||
|
||||
urls = await seeder.urls("example.com", config)
|
||||
|
||||
# Scoring based on URL structure, domain, path segments
|
||||
for url in urls[:5]:
|
||||
print(f"[{url['relevance_score']:.2f}] {url['url']}")
|
||||
|
||||
# Multi-concept queries
|
||||
async def complex_queries():
|
||||
queries = [
|
||||
"data science pandas numpy visualization",
|
||||
"web scraping automation selenium",
|
||||
"machine learning tensorflow pytorch",
|
||||
"api documentation rest graphql"
|
||||
]
|
||||
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
all_results = []
|
||||
|
||||
for query in queries:
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query=query,
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.4,
|
||||
max_urls=10
|
||||
)
|
||||
|
||||
urls = await seeder.urls("learning-site.com", config)
|
||||
all_results.extend(urls)
|
||||
|
||||
# Remove duplicates while preserving order
|
||||
seen = set()
|
||||
unique_results = []
|
||||
for url in all_results:
|
||||
if url['url'] not in seen:
|
||||
seen.add(url['url'])
|
||||
unique_results.append(url)
|
||||
|
||||
print(f"Found {len(unique_results)} unique pages across all topics")
|
||||
```
|
||||
|
||||
### Live URL Validation
|
||||
|
||||
```python
|
||||
async def url_validation():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
live_check=True, # Verify URLs are accessible
|
||||
concurrency=15, # Parallel HEAD requests
|
||||
hits_per_sec=8, # Rate limiting
|
||||
max_urls=100
|
||||
)
|
||||
|
||||
urls = await seeder.urls("example.com", config)
|
||||
|
||||
# Analyze results
|
||||
valid_urls = [u for u in urls if u['status'] == 'valid']
|
||||
invalid_urls = [u for u in urls if u['status'] == 'not_valid']
|
||||
|
||||
print(f"✅ Valid URLs: {len(valid_urls)}")
|
||||
print(f"❌ Invalid URLs: {len(invalid_urls)}")
|
||||
print(f"📊 Success rate: {len(valid_urls)/len(urls)*100:.1f}%")
|
||||
|
||||
# Show some invalid URLs for debugging
|
||||
if invalid_urls:
|
||||
print("\nSample invalid URLs:")
|
||||
for url in invalid_urls[:3]:
|
||||
print(f" - {url['url']}")
|
||||
|
||||
# Combined validation and metadata
|
||||
async def comprehensive_validation():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
live_check=True, # Verify accessibility
|
||||
extract_head=True, # Get metadata
|
||||
query="tutorial guide", # Relevance scoring
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.2,
|
||||
concurrency=10,
|
||||
max_urls=50
|
||||
)
|
||||
|
||||
urls = await seeder.urls("docs.example.com", config)
|
||||
|
||||
# Filter for valid, relevant tutorials
|
||||
good_tutorials = [
|
||||
url for url in urls
|
||||
if url['status'] == 'valid' and
|
||||
url['relevance_score'] > 0.3 and
|
||||
'tutorial' in url['head_data'].get('title', '').lower()
|
||||
]
|
||||
|
||||
print(f"Found {len(good_tutorials)} high-quality tutorials")
|
||||
```
|
||||
|
||||
### Multi-Domain Discovery
|
||||
|
||||
```python
|
||||
async def multi_domain_research():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# Research Python tutorials across multiple sites
|
||||
domains = [
|
||||
"docs.python.org",
|
||||
"realpython.com",
|
||||
"python-course.eu",
|
||||
"tutorialspoint.com"
|
||||
]
|
||||
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
query="python beginner tutorial basics",
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.3,
|
||||
max_urls=15 # Per domain
|
||||
)
|
||||
|
||||
# Discover across all domains in parallel
|
||||
results = await seeder.many_urls(domains, config)
|
||||
|
||||
# Collect and rank all tutorials
|
||||
all_tutorials = []
|
||||
for domain, urls in results.items():
|
||||
for url in urls:
|
||||
url['domain'] = domain
|
||||
all_tutorials.append(url)
|
||||
|
||||
# Sort by relevance across all domains
|
||||
all_tutorials.sort(key=lambda x: x['relevance_score'], reverse=True)
|
||||
|
||||
print(f"Top 10 Python tutorials across {len(domains)} sites:")
|
||||
for i, tutorial in enumerate(all_tutorials[:10], 1):
|
||||
score = tutorial['relevance_score']
|
||||
title = tutorial['head_data'].get('title', 'No title')[:60]
|
||||
domain = tutorial['domain']
|
||||
print(f"{i:2d}. [{score:.2f}] {title}")
|
||||
print(f" {domain}")
|
||||
|
||||
# Competitor analysis
|
||||
async def competitor_analysis():
|
||||
competitors = ["competitor1.com", "competitor2.com", "competitor3.com"]
|
||||
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
pattern="*/blog/*",
|
||||
max_urls=50
|
||||
)
|
||||
|
||||
results = await seeder.many_urls(competitors, config)
|
||||
|
||||
# Analyze content strategies
|
||||
for domain, urls in results.items():
|
||||
content_types = {}
|
||||
|
||||
for url in urls:
|
||||
# Extract content type from metadata
|
||||
meta = url['head_data'].get('meta', {})
|
||||
og_type = meta.get('og:type', 'unknown')
|
||||
content_types[og_type] = content_types.get(og_type, 0) + 1
|
||||
|
||||
print(f"\n{domain} content distribution:")
|
||||
for ctype, count in sorted(content_types.items(),
|
||||
key=lambda x: x[1], reverse=True):
|
||||
print(f" {ctype}: {count}")
|
||||
```
|
||||
|
||||
### Complete Pipeline: Discovery → Filter → Crawl
|
||||
|
||||
```python
|
||||
async def smart_research_pipeline():
|
||||
"""Complete pipeline: discover URLs, filter by relevance, crawl top results"""
|
||||
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# Step 1: Discover relevant URLs
|
||||
print("🔍 Discovering URLs...")
|
||||
config = SeedingConfig(
|
||||
source="sitemap+cc",
|
||||
extract_head=True,
|
||||
query="machine learning deep learning tutorial",
|
||||
scoring_method="bm25",
|
||||
score_threshold=0.4,
|
||||
max_urls=100
|
||||
)
|
||||
|
||||
urls = await seeder.urls("example.com", config)
|
||||
print(f" Found {len(urls)} relevant URLs")
|
||||
|
||||
# Step 2: Select top articles
|
||||
top_articles = sorted(urls,
|
||||
key=lambda x: x['relevance_score'],
|
||||
reverse=True)[:10]
|
||||
|
||||
print(f" Selected top {len(top_articles)} for crawling")
|
||||
|
||||
# Step 3: Show what we're about to crawl
|
||||
print("\n📋 Articles to crawl:")
|
||||
for i, article in enumerate(top_articles, 1):
|
||||
score = article['relevance_score']
|
||||
title = article['head_data'].get('title', 'No title')[:60]
|
||||
print(f" {i}. [{score:.2f}] {title}")
|
||||
|
||||
# Step 4: Crawl selected articles
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
print(f"\n🕷️ Crawling {len(top_articles)} articles...")
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
config = CrawlerRunConfig(
|
||||
only_text=True,
|
||||
word_count_threshold=200,
|
||||
stream=True # Process results as they come
|
||||
)
|
||||
|
||||
# Extract URLs and crawl
|
||||
article_urls = [article['url'] for article in top_articles]
|
||||
|
||||
crawled_count = 0
|
||||
async for result in await crawler.arun_many(article_urls, config=config):
|
||||
if result.success:
|
||||
crawled_count += 1
|
||||
word_count = len(result.markdown.raw_markdown.split())
|
||||
print(f" ✅ [{crawled_count}/{len(article_urls)}] "
|
||||
f"{word_count} words from {result.url[:50]}...")
|
||||
else:
|
||||
print(f" ❌ Failed: {result.url[:50]}...")
|
||||
|
||||
print(f"\n✨ Successfully crawled {crawled_count} articles!")
|
||||
|
||||
asyncio.run(smart_research_pipeline())
|
||||
```
|
||||
|
||||
### Advanced Features and Performance
|
||||
|
||||
```python
|
||||
# Cache management
|
||||
async def cache_management():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# First run - populate cache
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
force=True # Bypass cache, fetch fresh
|
||||
)
|
||||
urls = await seeder.urls("example.com", config)
|
||||
|
||||
# Subsequent runs - use cache (much faster)
|
||||
config = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
force=False # Use cache
|
||||
)
|
||||
urls = await seeder.urls("example.com", config)
|
||||
|
||||
# Performance optimization
|
||||
async def performance_tuning():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# High-performance configuration
|
||||
config = SeedingConfig(
|
||||
source="cc",
|
||||
concurrency=50, # Many parallel workers
|
||||
hits_per_sec=20, # High rate limit
|
||||
max_urls=10000, # Large dataset
|
||||
extract_head=False, # Skip metadata for speed
|
||||
filter_nonsense_urls=True # Auto-filter utility URLs
|
||||
)
|
||||
|
||||
import time
|
||||
start = time.time()
|
||||
urls = await seeder.urls("large-site.com", config)
|
||||
elapsed = time.time() - start
|
||||
|
||||
print(f"Processed {len(urls)} URLs in {elapsed:.2f}s")
|
||||
print(f"Speed: {len(urls)/elapsed:.0f} URLs/second")
|
||||
|
||||
# Memory-safe processing for large domains
|
||||
async def large_domain_processing():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# Safe for domains with 1M+ URLs
|
||||
config = SeedingConfig(
|
||||
source="cc+sitemap",
|
||||
concurrency=50, # Bounded queue adapts to this
|
||||
max_urls=100000, # Process in batches
|
||||
filter_nonsense_urls=True
|
||||
)
|
||||
|
||||
# The seeder automatically manages memory by:
|
||||
# - Using bounded queues (prevents RAM spikes)
|
||||
# - Applying backpressure when queue is full
|
||||
# - Processing URLs as they're discovered
|
||||
urls = await seeder.urls("huge-site.com", config)
|
||||
|
||||
# Configuration cloning and reuse
|
||||
config_base = SeedingConfig(
|
||||
source="sitemap",
|
||||
extract_head=True,
|
||||
concurrency=20
|
||||
)
|
||||
|
||||
# Create variations
|
||||
blog_config = config_base.clone(pattern="*/blog/*")
|
||||
docs_config = config_base.clone(
|
||||
pattern="*/docs/*",
|
||||
query="API documentation",
|
||||
scoring_method="bm25"
|
||||
)
|
||||
fast_config = config_base.clone(
|
||||
extract_head=False,
|
||||
concurrency=100,
|
||||
hits_per_sec=50
|
||||
)
|
||||
```
|
||||
|
||||
### Troubleshooting and Best Practices
|
||||
|
||||
```python
|
||||
# Common issues and solutions
|
||||
async def troubleshooting_guide():
|
||||
async with AsyncUrlSeeder() as seeder:
|
||||
# Issue: No URLs found
|
||||
try:
|
||||
config = SeedingConfig(source="sitemap", pattern="*/nonexistent/*")
|
||||
urls = await seeder.urls("example.com", config)
|
||||
if not urls:
|
||||
# Solution: Try broader pattern or different source
|
||||
config = SeedingConfig(source="cc+sitemap", pattern="*")
|
||||
urls = await seeder.urls("example.com", config)
|
||||
except Exception as e:
|
||||
print(f"Discovery failed: {e}")
|
||||
|
||||
# Issue: Slow performance
|
||||
config = SeedingConfig(
|
||||
source="sitemap", # Faster than CC
|
||||
concurrency=10, # Reduce if hitting rate limits
|
||||
hits_per_sec=5, # Add rate limiting
|
||||
extract_head=False # Skip if metadata not needed
|
||||
)
|
||||
|
||||
# Issue: Low relevance scores
|
||||
config = SeedingConfig(
|
||||
query="specific detailed query terms",
|
||||
score_threshold=0.1, # Lower threshold
|
||||
scoring_method="bm25"
|
||||
)
|
||||
|
||||
# Issue: Memory issues with large sites
|
||||
config = SeedingConfig(
|
||||
max_urls=10000, # Limit results
|
||||
concurrency=20, # Reduce concurrency
|
||||
source="sitemap" # Use sitemap only
|
||||
)
|
||||
|
||||
# Performance benchmarks
|
||||
print("""
|
||||
Typical performance on standard connection:
|
||||
- Sitemap discovery: 100-1,000 URLs/second
|
||||
- Common Crawl discovery: 50-500 URLs/second
|
||||
- HEAD checking: 10-50 URLs/second
|
||||
- Head extraction: 5-20 URLs/second
|
||||
- BM25 scoring: 10,000+ URLs/second
|
||||
""")
|
||||
|
||||
# Best practices
|
||||
best_practices = """
|
||||
✅ Use context manager: async with AsyncUrlSeeder() as seeder
|
||||
✅ Start with sitemaps (faster), add CC if needed
|
||||
✅ Use extract_head=True only when you need metadata
|
||||
✅ Set reasonable max_urls to limit processing
|
||||
✅ Add rate limiting for respectful crawling
|
||||
✅ Cache results with force=False for repeated operations
|
||||
✅ Filter nonsense URLs (enabled by default)
|
||||
✅ Use specific patterns to reduce irrelevant results
|
||||
"""
|
||||
```
|
||||
|
||||
**📖 Learn more:** [Complete URL Seeding Guide](https://docs.crawl4ai.com/core/url-seeding/), [SeedingConfig Reference](https://docs.crawl4ai.com/api/parameters/), [Multi-URL Crawling](https://docs.crawl4ai.com/advanced/multi-url-crawling/)
|
||||
File diff suppressed because one or more lines are too long
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,556 +0,0 @@
|
||||
# Detailed Outline for crawl4ai - config_objects Component
|
||||
|
||||
**Target Document Type:** memory
|
||||
**Target Output Filename Suggestion:** `llm_memory_config_objects.md`
|
||||
**Library Version Context:** 0.6.3
|
||||
**Outline Generation Date:** 2024-05-24
|
||||
---
|
||||
|
||||
## 1. Introduction to Configuration Objects in Crawl4ai
|
||||
|
||||
* **1.1. Purpose of Configuration Objects**
|
||||
* Explanation: Configuration objects in `crawl4ai` serve to centralize and manage settings for various components and behaviors of the library. This includes browser setup, individual crawl run parameters, LLM provider interactions, proxy settings, and more.
|
||||
* Benefit: This approach enhances code readability by grouping related settings, improves maintainability by providing a clear structure for configurations, and offers ease of customization for users to tailor the library's behavior to their specific needs.
|
||||
* **1.2. General Principles and Usage**
|
||||
* **1.2.1. Immutability/Cloning:**
|
||||
* Concept: Most configuration objects are designed with a `clone()` method, allowing users to create modified copies without altering the original configuration instance. This promotes safer state management, especially when reusing base configurations for multiple tasks.
|
||||
* Method: `clone(**kwargs)` on most configuration objects.
|
||||
* **1.2.2. Serialization and Deserialization:**
|
||||
* Concept: `crawl4ai` configuration objects can be serialized to dictionary format (e.g., for saving to JSON) and deserialized back into their respective class instances.
|
||||
* Methods:
|
||||
* `dump() -> dict`: Serializes the object to a dictionary suitable for JSON, often using the internal `to_serializable_dict` helper.
|
||||
* `load(data: dict) -> ConfigClass` (Static Method): Deserializes an object from a dictionary, often using the internal `from_serializable_dict` helper.
|
||||
* `to_dict() -> dict`: Converts the object to a standard Python dictionary.
|
||||
* `from_dict(data: dict) -> ConfigClass` (Static Method): Creates an instance from a standard Python dictionary.
|
||||
* Helper Functions:
|
||||
* `crawl4ai.async_configs.to_serializable_dict(obj: Any, ignore_default_value: bool = False) -> Dict`: Recursively converts objects into a serializable dictionary format, handling complex types like enums and nested objects.
|
||||
* `crawl4ai.async_configs.from_serializable_dict(data: Any) -> Any`: Reconstructs Python objects from the serializable dictionary format.
|
||||
* **1.3. Scope of this Document**
|
||||
* Statement: This document provides a factual API reference for the primary configuration objects within the `crawl4ai` library, detailing their purpose, initialization parameters, attributes, and key methods.
|
||||
|
||||
## 2. Core Configuration Objects
|
||||
|
||||
### 2.1. `BrowserConfig`
|
||||
Located in `crawl4ai.async_configs`.
|
||||
|
||||
* **2.1.1. Purpose:**
|
||||
* Description: The `BrowserConfig` class is used to configure the settings for a browser instance and its associated contexts when using browser-based crawler strategies like `AsyncPlaywrightCrawlerStrategy`. It centralizes all parameters that affect the creation and behavior of the browser.
|
||||
* **2.1.2. Initialization (`__init__`)**
|
||||
* Signature:
|
||||
```python
|
||||
class BrowserConfig:
|
||||
def __init__(
|
||||
self,
|
||||
browser_type: str = "chromium",
|
||||
headless: bool = True,
|
||||
browser_mode: str = "dedicated",
|
||||
use_managed_browser: bool = False,
|
||||
cdp_url: Optional[str] = None,
|
||||
use_persistent_context: bool = False,
|
||||
user_data_dir: Optional[str] = None,
|
||||
chrome_channel: Optional[str] = "chromium", # Note: 'channel' is preferred
|
||||
channel: Optional[str] = "chromium",
|
||||
proxy: Optional[str] = None,
|
||||
proxy_config: Optional[Union[ProxyConfig, dict]] = None,
|
||||
viewport_width: int = 1080,
|
||||
viewport_height: int = 600,
|
||||
viewport: Optional[dict] = None,
|
||||
accept_downloads: bool = False,
|
||||
downloads_path: Optional[str] = None,
|
||||
storage_state: Optional[Union[str, dict]] = None,
|
||||
ignore_https_errors: bool = True,
|
||||
java_script_enabled: bool = True,
|
||||
sleep_on_close: bool = False,
|
||||
verbose: bool = True,
|
||||
cookies: Optional[List[dict]] = None,
|
||||
headers: Optional[dict] = None,
|
||||
user_agent: Optional[str] = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36",
|
||||
user_agent_mode: Optional[str] = "",
|
||||
user_agent_generator_config: Optional[dict] = None, # Default is {} in __init__
|
||||
text_mode: bool = False,
|
||||
light_mode: bool = False,
|
||||
extra_args: Optional[List[str]] = None,
|
||||
debugging_port: int = 9222,
|
||||
host: str = "localhost"
|
||||
): ...
|
||||
```
|
||||
* Parameters:
|
||||
* `browser_type (str, default: "chromium")`: Specifies the browser engine to use. Supported values: `"chromium"`, `"firefox"`, `"webkit"`.
|
||||
* `headless (bool, default: True)`: If `True`, runs the browser without a visible GUI. Set to `False` for debugging or visual interaction.
|
||||
* `browser_mode (str, default: "dedicated")`: Defines how the browser is initialized. Options: `"builtin"` (uses built-in CDP), `"dedicated"` (new instance each time), `"cdp"` (connects to an existing CDP endpoint specified by `cdp_url`), `"docker"` (runs browser in a Docker container).
|
||||
    *   `use_managed_browser (bool, default: False)`: If `True`, launches the browser using a managed approach (e.g., via CDP or Docker), allowing for more advanced control. Automatically set to `True` if `browser_mode` is `"builtin"` or `"docker"`, if `cdp_url` is provided, or if `use_persistent_context` is `True`.
|
||||
* `cdp_url (Optional[str], default: None)`: The URL for the Chrome DevTools Protocol (CDP) endpoint. If not provided and `use_managed_browser` is active, it might be set by an internal browser manager.
|
||||
* `use_persistent_context (bool, default: False)`: If `True`, uses a persistent browser context (profile), saving cookies, localStorage, etc., across sessions. Requires `user_data_dir`. Sets `use_managed_browser=True`.
|
||||
* `user_data_dir (Optional[str], default: None)`: Path to a directory for storing user data for persistent sessions. If `None` and `use_persistent_context` is `True`, a temporary directory might be used.
|
||||
* `chrome_channel (Optional[str], default: "chromium")`: Specifies the Chrome channel (e.g., "chrome", "msedge", "chromium-beta"). Only applicable if `browser_type` is "chromium".
|
||||
* `channel (Optional[str], default: "chromium")`: Preferred alias for `chrome_channel`. Set to `""` for Firefox or WebKit.
|
||||
* `proxy (Optional[str], default: None)`: A string representing the proxy server URL (e.g., "http://username:password@proxy.example.com:8080").
|
||||
* `proxy_config (Optional[Union[ProxyConfig, dict]], default: None)`: A `ProxyConfig` object or a dictionary specifying detailed proxy settings. Overrides the `proxy` string if both are provided.
|
||||
* `viewport_width (int, default: 1080)`: Default width of the browser viewport in pixels.
|
||||
* `viewport_height (int, default: 600)`: Default height of the browser viewport in pixels.
|
||||
* `viewport (Optional[dict], default: None)`: A dictionary specifying viewport dimensions, e.g., `{"width": 1920, "height": 1080}`. If set, overrides `viewport_width` and `viewport_height`.
|
||||
* `accept_downloads (bool, default: False)`: If `True`, allows files to be downloaded by the browser.
|
||||
* `downloads_path (Optional[str], default: None)`: Directory path where downloaded files will be stored. Required if `accept_downloads` is `True`.
|
||||
* `storage_state (Optional[Union[str, dict]], default: None)`: Path to a JSON file or a dictionary containing the browser's storage state (cookies, localStorage, etc.) to load.
|
||||
* `ignore_https_errors (bool, default: True)`: If `True`, HTTPS certificate errors will be ignored.
|
||||
* `java_script_enabled (bool, default: True)`: If `True`, JavaScript execution is enabled on web pages.
|
||||
* `sleep_on_close (bool, default: False)`: If `True`, introduces a small delay before the browser is closed.
|
||||
* `verbose (bool, default: True)`: If `True`, enables verbose logging for browser operations.
|
||||
* `cookies (Optional[List[dict]], default: None)`: A list of cookie dictionaries to be set in the browser context. Each dictionary should conform to Playwright's cookie format.
|
||||
* `headers (Optional[dict], default: None)`: A dictionary of additional HTTP headers to be sent with every request made by the browser.
|
||||
* `user_agent (Optional[str], default: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.0.0 Safari/537.36")`: The User-Agent string the browser will use.
|
||||
* `user_agent_mode (Optional[str], default: "")`: Mode for generating the User-Agent string. If set (e.g., to "random"), `user_agent_generator_config` can be used.
|
||||
* `user_agent_generator_config (Optional[dict], default: {})`: Configuration dictionary for the User-Agent generator if `user_agent_mode` is active.
|
||||
* `text_mode (bool, default: False)`: If `True`, attempts to disable images and other rich content to potentially speed up loading for text-focused crawls.
|
||||
* `light_mode (bool, default: False)`: If `True`, disables certain background browser features for potential performance gains.
|
||||
* `extra_args (Optional[List[str]], default: None)`: A list of additional command-line arguments to pass to the browser executable upon launch.
|
||||
* `debugging_port (int, default: 9222)`: The port to use for the browser's remote debugging protocol (CDP).
|
||||
* `host (str, default: "localhost")`: The host on which the browser's remote debugging protocol will listen.
|
||||
* **2.1.3. Key Public Attributes/Properties:**
|
||||
* All parameters listed in `__init__` are available as public attributes with the same names and types.
|
||||
* `browser_hint (str)`: [Read-only] - A string representing client hints (Sec-CH-UA) generated based on the `user_agent` string. This is automatically set during initialization.
|
||||
* **2.1.4. Key Public Methods:**
|
||||
* `from_kwargs(cls, kwargs: dict) -> BrowserConfig` (Static Method):
|
||||
* Purpose: Creates a `BrowserConfig` instance from a dictionary of keyword arguments.
|
||||
* `to_dict(self) -> dict`:
|
||||
* Purpose: Converts the `BrowserConfig` instance into a dictionary representation.
|
||||
* `clone(self, **kwargs) -> BrowserConfig`:
|
||||
* Purpose: Creates a deep copy of the current `BrowserConfig` instance. Keyword arguments can be provided to override specific attributes in the new instance.
|
||||
* `dump(self) -> dict`:
|
||||
* Purpose: Serializes the `BrowserConfig` object into a dictionary format that is suitable for JSON storage or transmission, utilizing the `to_serializable_dict` helper.
|
||||
* `load(cls, data: dict) -> BrowserConfig` (Static Method):
|
||||
* Purpose: Deserializes a `BrowserConfig` object from a dictionary (typically one created by `dump()`), utilizing the `from_serializable_dict` helper.
|
||||
|
||||
### 2.2. `CrawlerRunConfig`
|
||||
Located in `crawl4ai.async_configs`.
|
||||
|
||||
* **2.2.1. Purpose:**
|
||||
* Description: The `CrawlerRunConfig` class encapsulates all settings that control the behavior of a single crawl operation performed by `AsyncWebCrawler.arun()` or multiple operations within `AsyncWebCrawler.arun_many()`. This includes parameters for content processing, page interaction, caching, and media handling.
|
||||
* **2.2.2. Initialization (`__init__`)**
|
||||
* Signature:
|
||||
```python
|
||||
class CrawlerRunConfig:
|
||||
def __init__(
|
||||
self,
|
||||
url: Optional[str] = None,
|
||||
word_count_threshold: int = MIN_WORD_THRESHOLD,
|
||||
extraction_strategy: Optional[ExtractionStrategy] = None,
|
||||
chunking_strategy: Optional[ChunkingStrategy] = RegexChunking(),
|
||||
markdown_generator: Optional[MarkdownGenerationStrategy] = DefaultMarkdownGenerator(),
|
||||
only_text: bool = False,
|
||||
css_selector: Optional[str] = None,
|
||||
target_elements: Optional[List[str]] = None, # Default is [] in __init__
|
||||
excluded_tags: Optional[List[str]] = None, # Default is [] in __init__
|
||||
excluded_selector: Optional[str] = "", # Default is "" in __init__
|
||||
keep_data_attributes: bool = False,
|
||||
keep_attrs: Optional[List[str]] = None, # Default is [] in __init__
|
||||
remove_forms: bool = False,
|
||||
prettify: bool = False,
|
||||
parser_type: str = "lxml",
|
||||
scraping_strategy: Optional[ContentScrapingStrategy] = None, # Instantiated with WebScrapingStrategy() if None
|
||||
proxy_config: Optional[Union[ProxyConfig, dict]] = None,
|
||||
proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
|
||||
locale: Optional[str] = None,
|
||||
timezone_id: Optional[str] = None,
|
||||
geolocation: Optional[GeolocationConfig] = None,
|
||||
fetch_ssl_certificate: bool = False,
|
||||
cache_mode: CacheMode = CacheMode.BYPASS,
|
||||
session_id: Optional[str] = None,
|
||||
shared_data: Optional[dict] = None,
|
||||
wait_until: str = "domcontentloaded",
|
||||
page_timeout: int = PAGE_TIMEOUT,
|
||||
wait_for: Optional[str] = None,
|
||||
wait_for_timeout: Optional[int] = None,
|
||||
wait_for_images: bool = False,
|
||||
delay_before_return_html: float = 0.1,
|
||||
mean_delay: float = 0.1,
|
||||
max_range: float = 0.3,
|
||||
semaphore_count: int = 5,
|
||||
js_code: Optional[Union[str, List[str]]] = None,
|
||||
js_only: bool = False,
|
||||
ignore_body_visibility: bool = True,
|
||||
scan_full_page: bool = False,
|
||||
scroll_delay: float = 0.2,
|
||||
process_iframes: bool = False,
|
||||
remove_overlay_elements: bool = False,
|
||||
simulate_user: bool = False,
|
||||
override_navigator: bool = False,
|
||||
magic: bool = False,
|
||||
adjust_viewport_to_content: bool = False,
|
||||
screenshot: bool = False,
|
||||
screenshot_wait_for: Optional[float] = None,
|
||||
screenshot_height_threshold: int = SCREENSHOT_HEIGHT_THRESHOLD,
|
||||
pdf: bool = False,
|
||||
capture_mhtml: bool = False,
|
||||
image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
||||
image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
|
||||
table_score_threshold: int = 7,
|
||||
exclude_external_images: bool = False,
|
||||
exclude_all_images: bool = False,
|
||||
exclude_social_media_domains: Optional[List[str]] = None, # Uses SOCIAL_MEDIA_DOMAINS if None
|
||||
exclude_external_links: bool = False,
|
||||
exclude_social_media_links: bool = False,
|
||||
exclude_domains: Optional[List[str]] = None, # Default is [] in __init__
|
||||
exclude_internal_links: bool = False,
|
||||
verbose: bool = True,
|
||||
log_console: bool = False,
|
||||
capture_network_requests: bool = False,
|
||||
capture_console_messages: bool = False,
|
||||
method: str = "GET",
|
||||
stream: bool = False,
|
||||
check_robots_txt: bool = False,
|
||||
user_agent: Optional[str] = None,
|
||||
user_agent_mode: Optional[str] = None,
|
||||
user_agent_generator_config: Optional[dict] = None, # Default is {} in __init__
|
||||
deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
|
||||
experimental: Optional[Dict[str, Any]] = None # Default is {} in __init__
|
||||
): ...
|
||||
```
|
||||
* Parameters:
|
||||
* `url (Optional[str], default: None)`: The target URL for this specific crawl run.
|
||||
* `word_count_threshold (int, default: MIN_WORD_THRESHOLD)`: Minimum word count for a text block to be considered significant during content processing.
|
||||
* `extraction_strategy (Optional[ExtractionStrategy], default: None)`: Strategy for extracting structured data from the page. If `None`, `NoExtractionStrategy` is used.
|
||||
* `chunking_strategy (Optional[ChunkingStrategy], default: RegexChunking())`: Strategy to split content into chunks before extraction.
|
||||
* `markdown_generator (Optional[MarkdownGenerationStrategy], default: DefaultMarkdownGenerator())`: Strategy for converting HTML to Markdown.
|
||||
* `only_text (bool, default: False)`: If `True`, attempts to extract only textual content, potentially ignoring structural elements beneficial for rich Markdown.
|
||||
* `css_selector (Optional[str], default: None)`: A CSS selector defining the primary region of the page to focus on for content extraction. The raw HTML is reduced to this region.
|
||||
* `target_elements (Optional[List[str]], default: [])`: A list of CSS selectors. If provided, only the content within these elements will be considered for Markdown generation and structured data extraction. Unlike `css_selector`, this does not reduce the raw HTML but scopes the processing.
|
||||
* `excluded_tags (Optional[List[str]], default: [])`: A list of HTML tag names (e.g., "nav", "footer") to be removed from the HTML before processing.
|
||||
* `excluded_selector (Optional[str], default: "")`: A CSS selector specifying elements to be removed from the HTML before processing.
|
||||
* `keep_data_attributes (bool, default: False)`: If `True`, `data-*` attributes on HTML elements are preserved during cleaning.
|
||||
* `keep_attrs (Optional[List[str]], default: [])`: A list of specific HTML attribute names to preserve during HTML cleaning.
|
||||
* `remove_forms (bool, default: False)`: If `True`, all `<form>` elements are removed from the HTML.
|
||||
* `prettify (bool, default: False)`: If `True`, the cleaned HTML output is "prettified" for better readability.
|
||||
* `parser_type (str, default: "lxml")`: The HTML parser to be used by the scraping strategy (e.g., "lxml", "html.parser").
|
||||
* `scraping_strategy (Optional[ContentScrapingStrategy], default: WebScrapingStrategy())`: The strategy for scraping content from the HTML.
|
||||
* `proxy_config (Optional[Union[ProxyConfig, dict]], default: None)`: Proxy configuration for this specific run. Overrides any proxy settings in `BrowserConfig`.
|
||||
* `proxy_rotation_strategy (Optional[ProxyRotationStrategy], default: None)`: Strategy to use for rotating proxies if multiple are available.
|
||||
* `locale (Optional[str], default: None)`: Locale to set for the browser context (e.g., "en-US", "fr-FR"). Affects `Accept-Language` header and JavaScript `navigator.language`.
|
||||
* `timezone_id (Optional[str], default: None)`: Timezone ID to set for the browser context (e.g., "America/New_York", "Europe/Paris"). Affects JavaScript `Date` objects.
|
||||
* `geolocation (Optional[GeolocationConfig], default: None)`: A `GeolocationConfig` object or dictionary to set the browser's mock geolocation.
|
||||
* `fetch_ssl_certificate (bool, default: False)`: If `True`, the SSL certificate information for the main URL will be fetched and included in the `CrawlResult`.
|
||||
* `cache_mode (CacheMode, default: CacheMode.BYPASS)`: Defines caching behavior for this run. See `CacheMode` enum for options.
|
||||
* `session_id (Optional[str], default: None)`: An identifier for a browser session. If provided, `crawl4ai` will attempt to reuse an existing page/context associated with this ID, or create a new one and associate it.
|
||||
* `shared_data (Optional[dict], default: None)`: A dictionary for passing custom data between hooks during the crawl lifecycle.
|
||||
* `wait_until (str, default: "domcontentloaded")`: Playwright's page navigation wait condition (e.g., "load", "domcontentloaded", "networkidle", "commit").
|
||||
* `page_timeout (int, default: PAGE_TIMEOUT)`: Maximum time in milliseconds for page navigation and other page operations.
|
||||
* `wait_for (Optional[str], default: None)`: A CSS selector or a JavaScript expression (prefixed with "js:"). The crawler will wait until this condition is met before proceeding.
|
||||
* `wait_for_timeout (Optional[int], default: None)`: Specific timeout in milliseconds for the `wait_for` condition. If `None`, `page_timeout` is used.
|
||||
* `wait_for_images (bool, default: False)`: If `True`, attempts to wait for all images on the page to finish loading.
|
||||
* `delay_before_return_html (float, default: 0.1)`: Delay in seconds to wait just before the final HTML content is retrieved from the page.
|
||||
* `mean_delay (float, default: 0.1)`: Used with `arun_many`. The mean base delay in seconds between processing URLs.
|
||||
* `max_range (float, default: 0.3)`: Used with `arun_many`. The maximum additional random delay (added to `mean_delay`) between processing URLs.
|
||||
* `semaphore_count (int, default: 5)`: Used with `arun_many` and semaphore-based dispatchers. The maximum number of concurrent crawl operations.
|
||||
* `js_code (Optional[Union[str, List[str]]], default: None)`: A string or list of strings containing JavaScript code to be executed on the page after it loads.
|
||||
* `js_only (bool, default: False)`: If `True`, indicates that this `arun` call is primarily for JavaScript execution on an already loaded page (within a session) and a full page navigation might not be needed.
|
||||
* `ignore_body_visibility (bool, default: True)`: If `True`, proceeds with content extraction even if the `<body>` element is not deemed visible by Playwright.
|
||||
* `scan_full_page (bool, default: False)`: If `True`, the crawler will attempt to scroll through the entire page to trigger lazy-loaded content.
|
||||
* `scroll_delay (float, default: 0.2)`: Delay in seconds between each scroll step when `scan_full_page` is `True`.
|
||||
* `process_iframes (bool, default: False)`: If `True`, attempts to extract and inline content from `<iframe>` elements.
|
||||
* `remove_overlay_elements (bool, default: False)`: If `True`, attempts to identify and remove common overlay elements (popups, cookie banners) before content extraction.
|
||||
* `simulate_user (bool, default: False)`: If `True`, enables heuristics to simulate user interactions (like mouse movements) to potentially bypass some anti-bot measures.
|
||||
* `override_navigator (bool, default: False)`: If `True`, overrides certain JavaScript `navigator` properties to appear more like a standard browser.
|
||||
* `magic (bool, default: False)`: If `True`, enables a combination of techniques (like `remove_overlay_elements`, `simulate_user`) to try and handle dynamic/obfuscated sites.
|
||||
* `adjust_viewport_to_content (bool, default: False)`: If `True`, attempts to adjust the browser viewport size to match the dimensions of the page content.
|
||||
* `screenshot (bool, default: False)`: If `True`, a screenshot of the page will be taken and included in `CrawlResult.screenshot`.
|
||||
* `screenshot_wait_for (Optional[float], default: None)`: Additional delay in seconds to wait before taking the screenshot.
|
||||
* `screenshot_height_threshold (int, default: SCREENSHOT_HEIGHT_THRESHOLD)`: If page height exceeds this, a full-page screenshot strategy might be different.
|
||||
* `pdf (bool, default: False)`: If `True`, a PDF version of the page will be generated and included in `CrawlResult.pdf`.
|
||||
* `capture_mhtml (bool, default: False)`: If `True`, an MHTML archive of the page will be captured and included in `CrawlResult.mhtml`.
|
||||
* `image_description_min_word_threshold (int, default: IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD)`: Minimum word count for surrounding text to be considered as an image description.
|
||||
* `image_score_threshold (int, default: IMAGE_SCORE_THRESHOLD)`: Heuristic score threshold for an image to be included in `CrawlResult.media`.
|
||||
* `table_score_threshold (int, default: 7)`: Heuristic score threshold for an HTML table to be considered a data table and included in `CrawlResult.media`.
|
||||
* `exclude_external_images (bool, default: False)`: If `True`, images hosted on different domains than the main page URL are excluded.
|
||||
* `exclude_all_images (bool, default: False)`: If `True`, all images are excluded from `CrawlResult.media`.
|
||||
* `exclude_social_media_domains (Optional[List[str]], default: SOCIAL_MEDIA_DOMAINS from config)`: List of social media domains whose links should be excluded.
|
||||
* `exclude_external_links (bool, default: False)`: If `True`, all links pointing to external domains are excluded from `CrawlResult.links`.
|
||||
* `exclude_social_media_links (bool, default: False)`: If `True`, links to domains in `exclude_social_media_domains` are excluded.
|
||||
* `exclude_domains (Optional[List[str]], default: [])`: A list of specific domains whose links should be excluded.
|
||||
* `exclude_internal_links (bool, default: False)`: If `True`, all links pointing to the same domain are excluded.
|
||||
* `verbose (bool, default: True)`: Enables verbose logging for this specific crawl run. Overrides `BrowserConfig.verbose`.
|
||||
* `log_console (bool, default: False)`: If `True`, browser console messages are captured (requires `capture_console_messages=True` to be effective).
|
||||
* `capture_network_requests (bool, default: False)`: If `True`, captures details of network requests and responses made by the page.
|
||||
* `capture_console_messages (bool, default: False)`: If `True`, captures messages logged to the browser's console.
|
||||
* `method (str, default: "GET")`: HTTP method to use, primarily for `AsyncHTTPCrawlerStrategy`.
|
||||
* `stream (bool, default: False)`: If `True` when using `arun_many`, results are yielded as an async generator instead of returned as a list at the end.
|
||||
* `check_robots_txt (bool, default: False)`: If `True`, `robots.txt` rules for the domain will be checked and respected.
|
||||
* `user_agent (Optional[str], default: None)`: User-Agent string for this specific run. Overrides `BrowserConfig.user_agent`.
|
||||
* `user_agent_mode (Optional[str], default: None)`: User-Agent generation mode for this specific run.
|
||||
* `user_agent_generator_config (Optional[dict], default: {})`: Configuration for User-Agent generator for this run.
|
||||
* `deep_crawl_strategy (Optional[DeepCrawlStrategy], default: None)`: Strategy to use for deep crawling beyond the initial URL.
|
||||
* `experimental (Optional[Dict[str, Any]], default: {})`: A dictionary for passing experimental or beta parameters.
|
||||
* **2.2.3. Key Public Attributes/Properties:**
|
||||
* All parameters listed in `__init__` are available as public attributes with the same names and types.
|
||||
* **2.2.4. Deprecated Property Handling (`__getattr__`, `_UNWANTED_PROPS`)**
|
||||
* Behavior: Attempting to access a deprecated property (e.g., `bypass_cache`, `disable_cache`, `no_cache_read`, `no_cache_write`) raises an `AttributeError`. The error message directs the user to use the `cache_mode` parameter with the appropriate `CacheMode` enum member instead.
|
||||
* List of Deprecated Properties and their `CacheMode` Equivalents:
|
||||
* `bypass_cache`: Use `cache_mode=CacheMode.BYPASS`.
|
||||
* `disable_cache`: Use `cache_mode=CacheMode.DISABLE`.
|
||||
* `no_cache_read`: Use `cache_mode=CacheMode.WRITE_ONLY`.
|
||||
* `no_cache_write`: Use `cache_mode=CacheMode.READ_ONLY`.
|
||||
* **2.2.5. Key Public Methods:**
|
||||
* `from_kwargs(cls, kwargs: dict) -> CrawlerRunConfig` (Static Method):
|
||||
* Purpose: Creates a `CrawlerRunConfig` instance from a dictionary of keyword arguments.
|
||||
* `dump(self) -> dict`:
|
||||
* Purpose: Serializes the `CrawlerRunConfig` object to a dictionary suitable for JSON storage, handling complex nested objects using `to_serializable_dict`.
|
||||
* `load(cls, data: dict) -> CrawlerRunConfig` (Static Method):
|
||||
* Purpose: Deserializes a `CrawlerRunConfig` object from a dictionary (typically one created by `dump()`), using `from_serializable_dict`.
|
||||
* `to_dict(self) -> dict`:
|
||||
* Purpose: Converts the `CrawlerRunConfig` instance into a dictionary representation. Complex objects like strategies are typically represented by their class name or a simplified form.
|
||||
* `clone(self, **kwargs) -> CrawlerRunConfig`:
|
||||
* Purpose: Creates a deep copy of the current `CrawlerRunConfig` instance. Keyword arguments can be provided to override specific attributes in the new instance.
|
||||
|
||||
### 2.3. `LLMConfig`
|
||||
Located in `crawl4ai.async_configs`.
|
||||
|
||||
* **2.3.1. Purpose:**
|
||||
* Description: The `LLMConfig` class provides configuration for interacting with Large Language Model (LLM) providers. It includes settings for the provider name, API token, base URL, and various model-specific parameters like temperature and max tokens.
|
||||
* **2.3.2. Initialization (`__init__`)**
|
||||
* Signature:
|
||||
```python
|
||||
class LLMConfig:
|
||||
def __init__(
|
||||
self,
|
||||
provider: str = DEFAULT_PROVIDER, # e.g., "openai/gpt-4o-mini"
|
||||
api_token: Optional[str] = None,
|
||||
base_url: Optional[str] = None,
|
||||
temperature: Optional[float] = None,
|
||||
max_tokens: Optional[int] = None,
|
||||
top_p: Optional[float] = None,
|
||||
frequency_penalty: Optional[float] = None,
|
||||
presence_penalty: Optional[float] = None,
|
||||
stop: Optional[List[str]] = None,
|
||||
n: Optional[int] = None,
|
||||
): ...
|
||||
```
|
||||
* Parameters:
|
||||
* `provider (str, default: DEFAULT_PROVIDER)`: The identifier for the LLM provider and model (e.g., "openai/gpt-4o-mini", "ollama/llama3.3", "gemini/gemini-1.5-pro").
|
||||
* `api_token (Optional[str], default: None)`: The API token for authenticating with the LLM provider. If `None`, it attempts to load from environment variables based on the provider (e.g., `OPENAI_API_KEY` for OpenAI, `GEMINI_API_KEY` for Gemini). Can also be set as "env:YOUR_ENV_VAR_NAME".
|
||||
* `base_url (Optional[str], default: None)`: A custom base URL for the LLM API endpoint, useful for self-hosted models or proxies.
|
||||
* `temperature (Optional[float], default: None)`: Controls the randomness of the LLM's output. Higher values (e.g., 0.8) make output more random, lower values (e.g., 0.2) make it more deterministic.
|
||||
* `max_tokens (Optional[int], default: None)`: The maximum number of tokens the LLM should generate in its response.
|
||||
* `top_p (Optional[float], default: None)`: Nucleus sampling parameter. The model considers only tokens with cumulative probability mass up to `top_p`.
|
||||
* `frequency_penalty (Optional[float], default: None)`: Penalizes new tokens based on their existing frequency in the text so far, decreasing the model's likelihood to repeat the same line verbatim.
|
||||
* `presence_penalty (Optional[float], default: None)`: Penalizes new tokens based on whether they have appeared in the text so far, increasing the model's likelihood to talk about new topics.
|
||||
* `stop (Optional[List[str]], default: None)`: A list of sequences where the API will stop generating further tokens.
|
||||
* `n (Optional[int], default: None)`: The number of completions to generate for each prompt.
|
||||
* **2.3.3. Key Public Attributes/Properties:**
|
||||
* All parameters listed in `__init__` are available as public attributes with the same names and types.
|
||||
* **2.3.4. Key Public Methods:**
|
||||
* `from_kwargs(cls, kwargs: dict) -> LLMConfig` (Static Method):
|
||||
* Purpose: Creates an `LLMConfig` instance from a dictionary of keyword arguments.
|
||||
* `to_dict(self) -> dict`:
|
||||
* Purpose: Converts the `LLMConfig` instance into a dictionary representation.
|
||||
* `clone(self, **kwargs) -> LLMConfig`:
|
||||
* Purpose: Creates a deep copy of the current `LLMConfig` instance. Keyword arguments can be provided to override specific attributes in the new instance.
|
||||
|
||||
### 2.4. `GeolocationConfig`
|
||||
Located in `crawl4ai.async_configs`.
|
||||
|
||||
* **2.4.1. Purpose:**
|
||||
* Description: The `GeolocationConfig` class stores settings for mocking the browser's geolocation, including latitude, longitude, and accuracy.
|
||||
* **2.4.2. Initialization (`__init__`)**
|
||||
* Signature:
|
||||
```python
|
||||
class GeolocationConfig:
|
||||
def __init__(
|
||||
self,
|
||||
latitude: float,
|
||||
longitude: float,
|
||||
accuracy: Optional[float] = 0.0
|
||||
): ...
|
||||
```
|
||||
* Parameters:
|
||||
* `latitude (float)`: The latitude coordinate (e.g., 37.7749 for San Francisco).
|
||||
* `longitude (float)`: The longitude coordinate (e.g., -122.4194 for San Francisco).
|
||||
* `accuracy (Optional[float], default: 0.0)`: The accuracy of the geolocation in meters.
|
||||
* **2.4.3. Key Public Attributes/Properties:**
|
||||
* `latitude (float)`: Stores the latitude.
|
||||
* `longitude (float)`: Stores the longitude.
|
||||
* `accuracy (Optional[float])`: Stores the accuracy.
|
||||
* **2.4.4. Key Public Methods:**
|
||||
* `from_dict(cls, geo_dict: dict) -> GeolocationConfig` (Static Method):
|
||||
* Purpose: Creates a `GeolocationConfig` instance from a dictionary.
|
||||
* `to_dict(self) -> dict`:
|
||||
* Purpose: Converts the `GeolocationConfig` instance to a dictionary: `{"latitude": ..., "longitude": ..., "accuracy": ...}`.
|
||||
* `clone(self, **kwargs) -> GeolocationConfig`:
|
||||
* Purpose: Creates a copy of the `GeolocationConfig` instance, allowing for overriding specific attributes with `kwargs`.
|
||||
|
||||
### 2.5. `ProxyConfig`
|
||||
Located in `crawl4ai.async_configs` (and `crawl4ai.proxy_strategy`).
|
||||
|
||||
* **2.5.1. Purpose:**
|
||||
* Description: The `ProxyConfig` class encapsulates the configuration for a single proxy server, including its address, authentication credentials (if any), and optionally its public IP address.
|
||||
* **2.5.2. Initialization (`__init__`)**
|
||||
* Signature:
|
||||
```python
|
||||
class ProxyConfig:
|
||||
def __init__(
|
||||
self,
|
||||
server: str,
|
||||
username: Optional[str] = None,
|
||||
password: Optional[str] = None,
|
||||
ip: Optional[str] = None,
|
||||
): ...
|
||||
```
|
||||
* Parameters:
|
||||
* `server (str)`: The proxy server URL, including protocol and port (e.g., "http://127.0.0.1:8080", "socks5://proxy.example.com:1080").
|
||||
* `username (Optional[str], default: None)`: The username for proxy authentication, if required.
|
||||
* `password (Optional[str], default: None)`: The password for proxy authentication, if required.
|
||||
* `ip (Optional[str], default: None)`: The public IP address of the proxy server. If not provided, it will be automatically extracted from the `server` string if possible.
|
||||
* **2.5.3. Key Public Attributes/Properties:**
|
||||
* `server (str)`: The proxy server URL.
|
||||
* `username (Optional[str])`: The username for proxy authentication.
|
||||
* `password (Optional[str])`: The password for proxy authentication.
|
||||
* `ip (Optional[str])`: The public IP address of the proxy. This is either user-provided or automatically extracted from the `server` string during initialization via the internal `_extract_ip_from_server` method.
|
||||
* **2.5.4. Key Public Methods:**
|
||||
* `_extract_ip_from_server(self) -> Optional[str]` (Internal method):
|
||||
* Purpose: Extracts the IP address component from the `self.server` URL string.
|
||||
* `from_string(cls, proxy_str: str) -> ProxyConfig` (Static Method):
|
||||
* Purpose: Creates a `ProxyConfig` instance from a string.
|
||||
* Formats:
|
||||
* `'ip:port:username:password'`
|
||||
* `'ip:port'` (no authentication)
|
||||
* `from_dict(cls, proxy_dict: dict) -> ProxyConfig` (Static Method):
|
||||
* Purpose: Creates a `ProxyConfig` instance from a dictionary with keys "server", "username", "password", and "ip".
|
||||
* `from_env(cls, env_var: str = "PROXIES") -> List[ProxyConfig]` (Static Method):
|
||||
* Purpose: Loads a list of `ProxyConfig` objects from a comma-separated environment variable. Each proxy string in the variable should conform to the format accepted by `from_string`.
|
||||
* `to_dict(self) -> dict`:
|
||||
* Purpose: Converts the `ProxyConfig` instance to a dictionary: `{"server": ..., "username": ..., "password": ..., "ip": ...}`.
|
||||
* `clone(self, **kwargs) -> ProxyConfig`:
|
||||
* Purpose: Creates a copy of the `ProxyConfig` instance, allowing for overriding specific attributes with `kwargs`.
|
||||
|
||||
### 2.6. `HTTPCrawlerConfig`
|
||||
Located in `crawl4ai.async_configs`.
|
||||
|
||||
* **2.6.1. Purpose:**
|
||||
* Description: The `HTTPCrawlerConfig` class holds configuration settings specific to direct HTTP-based crawling strategies (e.g., `AsyncHTTPCrawlerStrategy`), which do not use a full browser environment.
|
||||
* **2.6.2. Initialization (`__init__`)**
|
||||
* Signature:
|
||||
```python
|
||||
class HTTPCrawlerConfig:
|
||||
def __init__(
|
||||
self,
|
||||
method: str = "GET",
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
data: Optional[Dict[str, Any]] = None,
|
||||
json: Optional[Dict[str, Any]] = None,
|
||||
follow_redirects: bool = True,
|
||||
verify_ssl: bool = True,
|
||||
): ...
|
||||
```
|
||||
* Parameters:
|
||||
* `method (str, default: "GET")`: The HTTP method to use for the request (e.g., "GET", "POST", "PUT").
|
||||
* `headers (Optional[Dict[str, str]], default: None)`: A dictionary of custom HTTP headers to send with the request.
|
||||
* `data (Optional[Dict[str, Any]], default: None)`: Data to be sent in the body of the request, typically for "POST" or "PUT" requests (e.g., form data).
|
||||
* `json (Optional[Dict[str, Any]], default: None)`: JSON data to be sent in the body of the request. If provided, the `Content-Type` header is typically set to `application/json`.
|
||||
* `follow_redirects (bool, default: True)`: If `True`, the crawler will automatically follow HTTP redirects.
|
||||
* `verify_ssl (bool, default: True)`: If `True`, SSL certificates will be verified. Set to `False` to ignore SSL errors (use with caution).
|
||||
* **2.6.3. Key Public Attributes/Properties:**
|
||||
* All parameters listed in `__init__` are available as public attributes with the same names and types.
|
||||
* **2.6.4. Key Public Methods:**
|
||||
* `from_kwargs(cls, kwargs: dict) -> HTTPCrawlerConfig` (Static Method):
|
||||
* Purpose: Creates an `HTTPCrawlerConfig` instance from a dictionary of keyword arguments.
|
||||
* `to_dict(self) -> dict`:
|
||||
* Purpose: Converts the `HTTPCrawlerConfig` instance into a dictionary representation.
|
||||
* `clone(self, **kwargs) -> HTTPCrawlerConfig`:
|
||||
* Purpose: Creates a deep copy of the current `HTTPCrawlerConfig` instance. Keyword arguments can be provided to override specific attributes in the new instance.
|
||||
* `dump(self) -> dict`:
|
||||
* Purpose: Serializes the `HTTPCrawlerConfig` object to a dictionary.
|
||||
* `load(cls, data: dict) -> HTTPCrawlerConfig` (Static Method):
|
||||
* Purpose: Deserializes an `HTTPCrawlerConfig` object from a dictionary.
|
||||
|
||||
## 3. Enumerations and Helper Constants
|
||||
|
||||
### 3.1. `CacheMode` (Enum)
|
||||
Located in `crawl4ai.cache_context`.
|
||||
|
||||
* **3.1.1. Purpose:**
|
||||
* Description: The `CacheMode` enumeration defines the different caching behaviors that can be applied to a crawl operation. It is used in `CrawlerRunConfig` to control how results are read from and written to the cache.
|
||||
* **3.1.2. Enum Members:**
|
||||
* `ENABLE (str)`: Value: "ENABLE". Description: Enables normal caching behavior. The crawler will attempt to read from the cache first, and if a result is not found or is stale, it will perform the crawl and write the new result to the cache.
|
||||
* `DISABLE (str)`: Value: "DISABLE". Description: Disables all caching. The crawler will not read from or write to the cache. Every request will be a fresh crawl.
|
||||
* `READ_ONLY (str)`: Value: "READ_ONLY". Description: The crawler will only read from the cache, never write to it. If a cached result is found, it will be used. If not, a fresh crawl is performed for that URL, but the new result will not be written to the cache.
|
||||
* `WRITE_ONLY (str)`: Value: "WRITE_ONLY". Description: The crawler will not attempt to read from the cache. It will always perform a fresh crawl and then write the result to the cache.
|
||||
* `BYPASS (str)`: Value: "BYPASS". Description: The crawler will skip reading from the cache for this specific operation and will perform a fresh crawl. The result of this crawl *will* be written to the cache. This is the default `cache_mode` for `CrawlerRunConfig`.
|
||||
* **3.1.3. Usage:**
|
||||
* Example:
|
||||
```python
|
||||
from crawl4ai import CrawlerRunConfig, CacheMode
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.ENABLE) # Use cache fully
|
||||
config_bypass = CrawlerRunConfig(cache_mode=CacheMode.BYPASS) # Force fresh crawl, then cache
|
||||
```
|
||||
|
||||
## 4. Serialization Helper Functions
|
||||
Located in `crawl4ai.async_configs`.
|
||||
|
||||
### 4.1. `to_serializable_dict(obj: Any, ignore_default_value: bool = False) -> Dict`
|
||||
|
||||
* **4.1.1. Purpose:**
|
||||
* Description: This utility function recursively converts various Python objects, including `crawl4ai` configuration objects, into a dictionary format that is suitable for JSON serialization. It uses a `{ "type": "ClassName", "params": { ... } }` structure for custom class instances to enable proper deserialization later.
|
||||
* **4.1.2. Parameters:**
|
||||
* `obj (Any)`: The Python object to be serialized.
|
||||
* `ignore_default_value (bool, default: False)`: If `True`, when serializing class instances, parameters whose current values match their `__init__` default values might be excluded from the "params" dictionary. (Note: The exact behavior depends on the availability of default values in the class signature and handling of empty/None values).
|
||||
* **4.1.3. Returns:**
|
||||
* `Dict`: A dictionary representation of the input object, structured for easy serialization (e.g., to JSON) and later deserialization by `from_serializable_dict`.
|
||||
* **4.1.4. Key Behaviors:**
|
||||
* **Basic Types:** `str`, `int`, `float`, `bool`, `None` are returned as is.
|
||||
* **Enums:** Serialized as `{"type": "EnumClassName", "params": enum_member.value}`.
|
||||
* **Datetime Objects:** Serialized to their ISO 8601 string representation.
|
||||
* **Lists, Tuples, Sets, Frozensets:** Serialized by recursively calling `to_serializable_dict` on each of their elements, returning a list.
|
||||
* **Plain Dictionaries:** Serialized as `{"type": "dict", "value": {key: serialized_value, ...}}`.
|
||||
* **Class Instances (e.g., Config Objects):**
|
||||
* The object's class name is stored in the "type" field.
|
||||
* Parameters from the `__init__` signature and attributes from `__slots__` (if defined) are collected.
|
||||
* Their current values are recursively serialized and stored in the "params" dictionary.
|
||||
* The structure is `{"type": "ClassName", "params": {"param_name": serialized_param_value, ...}}`.
|
||||
|
||||
### 4.2. `from_serializable_dict(data: Any) -> Any`
|
||||
|
||||
* **4.2.1. Purpose:**
|
||||
* Description: This utility function reconstructs Python objects, including `crawl4ai` configuration objects, from the serializable dictionary format previously created by `to_serializable_dict`.
|
||||
* **4.2.2. Parameters:**
|
||||
* `data (Any)`: The dictionary (or basic data type) to be deserialized. This is typically the output of `to_serializable_dict` after being, for example, loaded from a JSON string.
|
||||
* **4.2.3. Returns:**
|
||||
* `Any`: The reconstructed Python object (e.g., an instance of `BrowserConfig`, `LLMConfig`, a list, a plain dictionary, etc.).
|
||||
* **4.2.4. Key Behaviors:**
|
||||
* **Basic Types:** `str`, `int`, `float`, `bool`, `None` are returned as is.
|
||||
* **Typed Dictionaries (from `to_serializable_dict`):**
|
||||
* If `data` is a dictionary and contains a "type" key:
|
||||
* If `data["type"] == "dict"`, it reconstructs a plain Python dictionary from `data["value"]` by recursively deserializing its items.
|
||||
* Otherwise, it attempts to locate the class specified by `data["type"]` within the `crawl4ai` module.
|
||||
* If the class is an `Enum`, it instantiates the enum member using `data["params"]` (the enum value).
|
||||
* If it's a regular class, it recursively deserializes the items in `data["params"]` and uses them as keyword arguments (`**kwargs`) to instantiate the class.
|
||||
* **Lists:** If `data` is a list, it reconstructs a list by recursively calling `from_serializable_dict` on each of its elements.
|
||||
* **Legacy Dictionaries:** If `data` is a dictionary but does not conform to the "type" key structure (for backward compatibility), it attempts to deserialize its values.
|
||||
|
||||
## 5. Cross-References and Relationships
|
||||
|
||||
* **5.1. `BrowserConfig` Usage:**
|
||||
* Typically instantiated once and passed to the `AsyncWebCrawler` constructor via its `config` parameter.
|
||||
* `browser_config = BrowserConfig(headless=False)`
|
||||
* `crawler = AsyncWebCrawler(config=browser_config)`
|
||||
* It defines the global browser settings that will be used for all subsequent crawl operations unless overridden by `CrawlerRunConfig` on a per-run basis.
|
||||
* **5.2. `CrawlerRunConfig` Usage:**
|
||||
* Passed to the `arun()` or `arun_many()` methods of `AsyncWebCrawler`.
|
||||
* `run_config = CrawlerRunConfig(screenshot=True, cache_mode=CacheMode.BYPASS)`
|
||||
* `result = await crawler.arun(url="https://example.com", config=run_config)`
|
||||
* Allows for fine-grained control over individual crawl requests, overriding global settings from `BrowserConfig` or `AsyncWebCrawler`'s defaults where applicable (e.g., `user_agent`, `proxy_config`, `cache_mode`).
|
||||
* **5.3. `LLMConfig` Usage:**
|
||||
* Instantiated and passed to LLM-based extraction strategies (e.g., `LLMExtractionStrategy`) or content filters (`LLMContentFilter`) during their initialization.
|
||||
* `llm_conf = LLMConfig(provider="openai/gpt-4o-mini", api_token="sk-...")`
|
||||
* `extraction_strategy = LLMExtractionStrategy(llm_config=llm_conf, schema=my_schema)`
|
||||
* **5.4. `GeolocationConfig` and `ProxyConfig` Usage:**
|
||||
* `GeolocationConfig` is typically instantiated and assigned to the `geolocation` parameter of `CrawlerRunConfig`.
|
||||
* `geo_conf = GeolocationConfig(latitude=34.0522, longitude=-118.2437)`
|
||||
* `run_config = CrawlerRunConfig(geolocation=geo_conf)`
|
||||
* `ProxyConfig` can be assigned to the `proxy_config` parameter of `BrowserConfig` (for a global proxy applied to all contexts) or `CrawlerRunConfig` (for a proxy specific to a single crawl run).
|
||||
* `proxy_conf = ProxyConfig(server="http://myproxy:8080")`
|
||||
* `browser_config = BrowserConfig(proxy_config=proxy_conf)` (global)
|
||||
* `run_config = CrawlerRunConfig(proxy_config=proxy_conf)` (per-run)
|
||||
* **5.5. `HTTPCrawlerConfig` Usage:**
|
||||
* Used when the `crawler_strategy` for `AsyncWebCrawler` is set to `AsyncHTTPCrawlerStrategy` (for non-browser-based HTTP requests).
|
||||
* `http_conf = HTTPCrawlerConfig(method="POST", json={"key": "value"})`
|
||||
* `http_strategy = AsyncHTTPCrawlerStrategy(http_crawler_config=http_conf)`
|
||||
* `crawler = AsyncWebCrawler(crawler_strategy=http_strategy)`
|
||||
* Alternatively, parameters like `method`, `data`, `json` can be passed directly to `arun()` when using `AsyncHTTPCrawlerStrategy` if they are part of the `CrawlerRunConfig`.
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,356 +0,0 @@
|
||||
```markdown
|
||||
# Examples Outline for crawl4ai - core Component
|
||||
|
||||
**Target Document Type:** Examples Collection
|
||||
**Target Output Filename Suggestion:** `llm_examples_core.md`
|
||||
**Library Version Context:** 0.6.3
|
||||
**Outline Generation Date:** 2025-05-24 10:00:00
|
||||
---
|
||||
|
||||
This document provides a collection of runnable code examples for the `core` component of the `crawl4ai` library. Each example is designed to showcase a specific feature or configuration.
|
||||
|
||||
## 1. Basic `AsyncWebCrawler` Usage
|
||||
|
||||
### 1.1. Example: Simplest crawl of a single URL with default `BrowserConfig` and `CrawlerRunConfig`.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def simplest_crawl():
|
||||
# Uses default BrowserConfig and CrawlerRunConfig
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
if result.success:
|
||||
print("Crawl successful!")
|
||||
print(f"Markdown (first 300 chars):\n{result.markdown.raw_markdown[:300]}...")
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(simplest_crawl())
|
||||
```
|
||||
|
||||
---
|
||||
### 1.2. Example: Using `AsyncWebCrawler` as an asynchronous context manager (`async with`).
|
||||
|
||||
This is the recommended way to manage the crawler's lifecycle.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def context_manager_crawl():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
if result.success:
|
||||
print("Crawl successful using context manager!")
|
||||
print(f"Page title from metadata: {result.metadata.get('title')}")
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(context_manager_crawl())
|
||||
```
|
||||
|
||||
---
|
||||
### 1.3. Example: Explicitly starting and closing the `AsyncWebCrawler` using `start()` and `close()`.
|
||||
|
||||
Useful for scenarios where the crawler's lifecycle needs more manual control.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def explicit_lifecycle_crawl():
|
||||
crawler = AsyncWebCrawler()
|
||||
await crawler.start() # Explicitly start the crawler and browser
|
||||
try:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
if result.success:
|
||||
print("Crawl successful with explicit start/close!")
|
||||
print(f"Cleaned HTML (first 300 chars):\n{result.cleaned_html[:300]}...")
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
finally:
|
||||
await crawler.close() # Ensure the crawler is closed
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(explicit_lifecycle_crawl())
|
||||
```
|
||||
|
||||
---
|
||||
### 1.4. Example: Handling a failed crawl (e.g., non-existent URL, network error) and checking `CrawlResult.success` and `CrawlResult.error_message`.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def failed_crawl_handling():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Using a deliberately non-existent URL
|
||||
result = await crawler.arun(url="https://thissitedoesnotexist.crawl4ai")
|
||||
if not result.success:
|
||||
print(f"Crawl failed as expected for URL: {result.url}")
|
||||
print(f"Status Code: {result.status_code}")
|
||||
print(f"Error Message: {result.error_message}")
|
||||
else:
|
||||
print("Crawl unexpectedly succeeded!")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(failed_crawl_handling())
|
||||
```
|
||||
|
||||
---
|
||||
### 1.5. Example: Processing raw HTML content directly using `crawler.aprocess_html()`.
|
||||
|
||||
This is useful if you already have HTML content and want to use Crawl4ai's processing capabilities.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
async def process_raw_html_directly():
|
||||
raw_html_content = """
|
||||
<html>
|
||||
<head><title>My Test Page</title></head>
|
||||
<body>
|
||||
<h1>Welcome!</h1>
|
||||
<p>This is a paragraph with a <a href="https://example.com">link</a>.</p>
|
||||
<script>console.log("This should be removed");</script>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
# No need for BrowserConfig as we are not navigating
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Use CrawlerRunConfig if you need specific processing options
|
||||
config = CrawlerRunConfig()
|
||||
result = await crawler.aprocess_html(
|
||||
url="raw://my_virtual_page", # Provide a conceptual URL
|
||||
html=raw_html_content,
|
||||
config=config
|
||||
)
|
||||
if result.success:
|
||||
print("Raw HTML processed successfully!")
|
||||
print(f"Markdown:\n{result.markdown.raw_markdown}")
|
||||
print(f"Cleaned HTML:\n{result.cleaned_html}")
|
||||
else:
|
||||
print(f"HTML processing failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(process_raw_html_directly())
|
||||
```
|
||||
|
||||
---
|
||||
### 1.6. Example: Crawling a local HTML file using the `file:///` prefix.
|
||||
|
||||
First, create a dummy HTML file named `local_test.html` in the same directory as your script.
|
||||
|
||||
```python
|
||||
# local_test.html
|
||||
# <!DOCTYPE html>
|
||||
# <html>
|
||||
# <head>
|
||||
# <title>Local Test File</title>
|
||||
# </head>
|
||||
# <body>
|
||||
# <h1>Hello from a local file!</h1>
|
||||
# <p>This content is loaded from the local filesystem.</p>
|
||||
# </body>
|
||||
# </html>
|
||||
```
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
import os
|
||||
from pathlib import Path
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def crawl_local_file():
|
||||
# Create a dummy local HTML file for the example
|
||||
script_dir = Path(__file__).parent
|
||||
local_file_path = script_dir / "local_test_for_crawl.html"
|
||||
with open(local_file_path, "w", encoding="utf-8") as f:
|
||||
f.write("<!DOCTYPE html><html><head><title>Local Test</title></head><body><h1>Local Content</h1></body></html>")
|
||||
|
||||
file_url = f"file:///{local_file_path.resolve()}"
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url=file_url)
|
||||
if result.success:
|
||||
print(f"Successfully crawled local file: {file_url}")
|
||||
print(f"Markdown (first 100 chars): {result.markdown.raw_markdown[:100]}...")
|
||||
else:
|
||||
print(f"Failed to crawl local file: {result.error_message}")
|
||||
|
||||
# Clean up the dummy file
|
||||
if os.path.exists(local_file_path):
|
||||
os.remove(local_file_path)
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(crawl_local_file())
|
||||
```
|
||||
|
||||
---
|
||||
### 1.7. Example: Accessing basic fields from `CrawlResult` (e.g., `url`, `html`, `markdown.raw_markdown`, `status_code`, `response_headers`).
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def access_crawl_result_fields():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
if result.success:
|
||||
print(f"URL Crawled: {result.url}")
|
||||
print(f"Status Code: {result.status_code}")
|
||||
|
||||
print("\n--- Response Headers (sample) ---")
|
||||
if result.response_headers:
|
||||
for key, value in list(result.response_headers.items())[:3]: # Print first 3 headers
|
||||
print(f"{key}: {value}")
|
||||
|
||||
print(f"\n--- Raw HTML (first 100 chars) ---\n{result.html[:100]}...")
|
||||
print(f"\n--- Cleaned HTML (first 100 chars) ---\n{result.cleaned_html[:100]}...")
|
||||
|
||||
if result.markdown:
|
||||
print(f"\n--- Raw Markdown (first 100 chars) ---\n{result.markdown.raw_markdown[:100]}...")
|
||||
|
||||
print(f"\n--- Metadata (sample) ---")
|
||||
if result.metadata:
|
||||
for key, value in list(result.metadata.items())[:3]: # Print first 3 metadata items
|
||||
print(f"{key}: {value}")
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(access_crawl_result_fields())
|
||||
```
|
||||
|
||||
---
|
||||
## 2. Configuring the Browser (`BrowserConfig`)
|
||||
|
||||
### 2.1. Example: Initializing `AsyncWebCrawler` with a custom `BrowserConfig` object.
|
||||
|
||||
This example sets the browser to run in non-headless mode and uses Firefox.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
|
||||
async def custom_browser_config_init():
|
||||
# Configure browser to be Firefox and visible
|
||||
browser_config = BrowserConfig(
|
||||
browser_type="firefox",
|
||||
headless=False # Set to True to run without UI
|
||||
)
|
||||
|
||||
# Pass the custom config to the crawler
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
if result.success:
|
||||
print(f"Crawl successful with custom BrowserConfig (Firefox, visible)!")
|
||||
print(f"Page title: {result.metadata.get('title')}")
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
# This example might open a visible browser window.
|
||||
# Ensure Firefox is installed if you run this.
|
||||
# asyncio.run(custom_browser_config_init())
|
||||
print("Skipping custom_browser_config_init example in automated run to avoid GUI interaction.")
|
||||
```
|
||||
|
||||
---
|
||||
### 2.2. Browser Type and Headless Mode
|
||||
|
||||
#### 2.2.1. Example: Using Chromium browser (default).
|
||||
|
||||
This shows the default behavior if no `browser_type` is specified.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
|
||||
async def chromium_default_crawl():
|
||||
# Chromium is the default, but we can explicitly set it
|
||||
browser_config = BrowserConfig(browser_type="chromium", headless=True)
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
if result.success:
|
||||
print("Crawl successful with Chromium (default)!")
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(chromium_default_crawl())
|
||||
```
|
||||
|
||||
---
|
||||
#### 2.2.2. Example: Using Firefox browser (`browser_type="firefox"`).
|
||||
|
||||
Ensure Firefox is installed on your system for this example to run.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
|
||||
async def firefox_crawl():
|
||||
browser_config = BrowserConfig(browser_type="firefox", headless=True)
|
||||
try:
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
if result.success:
|
||||
print("Crawl successful with Firefox!")
|
||||
else:
|
||||
print(f"Crawl failed with Firefox: {result.error_message}")
|
||||
except Exception as e:
|
||||
print(f"Error running Firefox example: {e}. Ensure Firefox is installed and Playwright browsers are set up (`crawl4ai-setup`).")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# asyncio.run(firefox_crawl())
|
||||
print("Skipping Firefox example in automated run. Uncomment to run if Firefox is installed.")
|
||||
```
|
||||
|
||||
---
|
||||
#### 2.2.3. Example: Using WebKit browser (`browser_type="webkit"`).
|
||||
|
||||
Ensure WebKit (Safari's engine) is installed via Playwright.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
|
||||
async def webkit_crawl():
|
||||
browser_config = BrowserConfig(browser_type="webkit", headless=True)
|
||||
try:
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
if result.success:
|
||||
print("Crawl successful with WebKit!")
|
||||
else:
|
||||
print(f"Crawl failed with WebKit: {result.error_message}")
|
||||
except Exception as e:
|
||||
print(f"Error running WebKit example: {e}. Ensure WebKit is installed and Playwright browsers are set up (`crawl4ai-setup`).")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
# asyncio.run(webkit_crawl())
|
||||
print("Skipping WebKit example in automated run. Uncomment to run if WebKit is installed.")
|
||||
```
|
||||
|
||||
---
|
||||
#### 2.2.4. Example: Running the browser in non-headless mode (`headless=False`) for visual debugging.
|
||||
|
||||
This will open a visible browser window.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
|
||||
async def non_headless_crawl():
|
||||
browser_config = BrowserConfig(headless=False) # Browser window will be visible
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
result = await crawler.arun(url="https://example.com")
|
||||
```
|
||||
@@ -1,890 +0,0 @@
|
||||
|
||||
|
||||
```markdown
|
||||
# Detailed Outline for crawl4ai - core Component
|
||||
|
||||
**Target Document Type:** memory
|
||||
**Target Output Filename Suggestion:** `llm_memory_core.md`
|
||||
**Library Version Context:** 0.6.3
|
||||
**Outline Generation Date:** 2025-05-24
|
||||
---
|
||||
|
||||
## 1. Introduction to Core Components
|
||||
* 1.1. Purpose: Provides the foundational classes, configurations, and data models for web crawling and scraping operations within the `crawl4ai` library.
|
||||
* 1.2. Key Functionalities:
|
||||
* Orchestration of asynchronous web crawling (`AsyncWebCrawler`).
|
||||
* Configuration of browser behavior and specific crawl runs (`BrowserConfig`, `CrawlerRunConfig`).
|
||||
* Standardized data structures for crawl results and associated data (`CrawlResult`, `Media`, `Links`, etc.).
|
||||
* Strategies for fetching web content (`AsyncPlaywrightCrawlerStrategy`, `AsyncHTTPCrawlerStrategy`).
|
||||
* Management of browser instances and sessions (`BrowserManager`, `ManagedBrowser`).
|
||||
* Asynchronous logging (`AsyncLogger`).
|
||||
* 1.3. Relationship with other `crawl4ai` components:
|
||||
* The `core` component serves as the foundation upon which specialized strategies (e.g., PDF processing, Markdown generation, content extraction, chunking, content filtering) are built and integrated.
|
||||
|
||||
## 2. Main Class: `AsyncWebCrawler`
|
||||
* 2.1. Purpose: The primary class for orchestrating asynchronous web crawling operations. It manages browser instances (via a `BrowserManager`), applies crawling strategies, and processes HTML content to produce structured results.
|
||||
* 2.2. Initialization (`__init__`)
|
||||
* 2.2.1. Signature:
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
crawler_strategy: Optional[AsyncCrawlerStrategy] = None,
|
||||
config: Optional[BrowserConfig] = None,
|
||||
base_directory: str = str(os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())),
|
||||
thread_safe: bool = False,
|
||||
logger: Optional[AsyncLoggerBase] = None,
|
||||
**kwargs,
|
||||
):
|
||||
```
|
||||
* 2.2.2. Parameters:
|
||||
* `crawler_strategy (Optional[AsyncCrawlerStrategy])`: The strategy to use for fetching web content. If `None`, defaults to `AsyncPlaywrightCrawlerStrategy` initialized with `config` and `logger`.
|
||||
* `config (Optional[BrowserConfig])`: Configuration object for browser settings. If `None`, a default `BrowserConfig()` is created.
|
||||
* `base_directory (str)`: The base directory for storing crawl4ai related files, such as cache and logs. Defaults to `os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home())`.
|
||||
* `thread_safe (bool)`: If `True`, uses an `asyncio.Lock` for thread-safe operations, particularly relevant for `arun_many`. Default: `False`.
|
||||
* `logger (Optional[AsyncLoggerBase])`: An instance of a logger. If `None`, a default `AsyncLogger` is initialized using `base_directory` and `config.verbose`.
|
||||
* `**kwargs`: Additional keyword arguments, primarily for backward compatibility, passed to the `AsyncPlaywrightCrawlerStrategy` if `crawler_strategy` is not provided.
|
||||
* 2.3. Key Public Attributes/Properties:
|
||||
* `browser_config (BrowserConfig)`: Read-only. The browser configuration object used by the crawler.
|
||||
* `crawler_strategy (AsyncCrawlerStrategy)`: Read-only. The active crawling strategy instance.
|
||||
* `logger (AsyncLoggerBase)`: Read-only. The logger instance used by the crawler.
|
||||
* `ready (bool)`: Read-only. `True` if the crawler has been started and is ready to perform crawl operations, `False` otherwise.
|
||||
* 2.4. Lifecycle Methods:
|
||||
* 2.4.1. `async start() -> AsyncWebCrawler`:
|
||||
* Purpose: Asynchronously initializes the crawler strategy (e.g., launches the browser). This must be called before `arun` or `arun_many` if the crawler is not used as an asynchronous context manager.
|
||||
* Returns: The `AsyncWebCrawler` instance (`self`).
|
||||
* 2.4.2. `async close() -> None`:
|
||||
* Purpose: Asynchronously closes the crawler strategy and cleans up resources (e.g., closes the browser). This should be called if `start()` was used explicitly.
|
||||
* 2.4.3. `async __aenter__() -> AsyncWebCrawler`:
|
||||
* Purpose: Entry point for asynchronous context management. Calls `self.start()`.
|
||||
* Returns: The `AsyncWebCrawler` instance (`self`).
|
||||
* 2.4.4. `async __aexit__(exc_type, exc_val, exc_tb) -> None`:
|
||||
* Purpose: Exit point for asynchronous context management. Calls `self.close()`.
|
||||
* 2.5. Primary Crawl Methods:
|
||||
* 2.5.1. `async arun(url: str, config: Optional[CrawlerRunConfig] = None, **kwargs) -> RunManyReturn`:
|
||||
* Purpose: Performs a single crawl operation for the given URL or raw HTML content.
|
||||
* Parameters:
|
||||
* `url (str)`: The URL to crawl (e.g., "http://example.com", "file:///path/to/file.html") or raw HTML content prefixed with "raw:" (e.g., "raw:<html>...</html>").
|
||||
* `config (Optional[CrawlerRunConfig])`: Configuration for this specific crawl run. If `None`, a default `CrawlerRunConfig()` is used.
|
||||
* `**kwargs`: Additional parameters passed to the underlying `aprocess_html` method, can be used to override settings in `config`.
|
||||
* Returns: `RunManyReturn` (which resolves to `CrawlResultContainer` containing a single `CrawlResult`).
|
||||
* 2.5.2. `async arun_many(urls: List[str], config: Optional[CrawlerRunConfig] = None, dispatcher: Optional[BaseDispatcher] = None, **kwargs) -> RunManyReturn`:
|
||||
* Purpose: Crawls multiple URLs concurrently using a specified or default dispatcher strategy.
|
||||
* Parameters:
|
||||
* `urls (List[str])`: A list of URLs to crawl.
|
||||
* `config (Optional[CrawlerRunConfig])`: Configuration applied to all crawl runs in this batch.
|
||||
* `dispatcher (Optional[BaseDispatcher])`: The dispatcher strategy to manage concurrent crawls. Defaults to `MemoryAdaptiveDispatcher`.
|
||||
* `**kwargs`: Additional parameters passed to the underlying `arun` method for each URL.
|
||||
* Returns: `RunManyReturn`. If `config.stream` is `True`, returns an `AsyncGenerator[CrawlResult, None]`. Otherwise, returns a `CrawlResultContainer` (list-like) of `CrawlResult` objects.
|
||||
* 2.6. Internal Processing Method (User-Facing Effects):
|
||||
* 2.6.1. `async aprocess_html(url: str, html: str, extracted_content: Optional[str], config: CrawlerRunConfig, screenshot_data: Optional[str], pdf_data: Optional[bytes], verbose: bool, **kwargs) -> CrawlResult`:
|
||||
* Purpose: Processes the fetched HTML content. This method is called internally by `arun` after content is fetched (either from a live crawl or cache). It applies scraping strategies, content filtering, and Markdown generation based on the `config`.
|
||||
* Parameters:
|
||||
* `url (str)`: The URL of the content being processed.
|
||||
* `html (str)`: The raw HTML content.
|
||||
* `extracted_content (Optional[str])`: Pre-extracted content from a previous step or cache.
|
||||
* `config (CrawlerRunConfig)`: Configuration for this processing run.
|
||||
* `screenshot_data (Optional[str])`: Base64 encoded screenshot data, if available.
|
||||
* `pdf_data (Optional[bytes])`: PDF data, if available.
|
||||
* `verbose (bool)`: Verbosity setting for logging during processing.
|
||||
* `**kwargs`: Additional parameters, including `is_raw_html` and `redirected_url`.
|
||||
* Returns: A `CrawlResult` object containing the processed data.
|
||||
|
||||
## 3. Core Configuration Objects
|
||||
|
||||
* 3.1. Class `BrowserConfig` (from `crawl4ai.async_configs`)
|
||||
* 3.1.1. Purpose: Configures the browser instance launched by Playwright, including its type, mode, display settings, proxy, user agent, and persistent storage options.
|
||||
* 3.1.2. Initialization (`__init__`)
|
||||
* Signature:
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
browser_type: str = "chromium",
|
||||
headless: bool = True,
|
||||
browser_mode: str = "dedicated",
|
||||
use_managed_browser: bool = False,
|
||||
cdp_url: Optional[str] = None,
|
||||
use_persistent_context: bool = False,
|
||||
user_data_dir: Optional[str] = None,
|
||||
channel: Optional[str] = "chromium", # Note: 'channel' from code, outline had 'chrome_channel'
|
||||
proxy: Optional[str] = None, # Note: 'proxy' from code, outline had 'proxy_config' for this level
|
||||
proxy_config: Optional[Union[ProxyConfig, dict, None]] = None,
|
||||
viewport_width: int = 1080,
|
||||
viewport_height: int = 600,
|
||||
viewport: Optional[dict] = None,
|
||||
accept_downloads: bool = False,
|
||||
downloads_path: Optional[str] = None,
|
||||
storage_state: Optional[Union[str, dict, None]] = None,
|
||||
ignore_https_errors: bool = True,
|
||||
java_script_enabled: bool = True,
|
||||
sleep_on_close: bool = False,
|
||||
verbose: bool = True,
|
||||
cookies: Optional[list] = None,
|
||||
headers: Optional[dict] = None,
|
||||
user_agent: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36",
|
||||
user_agent_mode: str = "",
|
||||
user_agent_generator_config: Optional[dict] = None, # Note: 'user_agent_generator_config' from code
|
||||
text_mode: bool = False,
|
||||
light_mode: bool = False,
|
||||
extra_args: Optional[list] = None,
|
||||
debugging_port: int = 9222,
|
||||
host: str = "localhost",
|
||||
):
|
||||
```
|
||||
* Key Parameters:
|
||||
* `browser_type (str)`: Type of browser to launch ("chromium", "firefox", "webkit"). Default: "chromium".
|
||||
* `headless (bool)`: Whether to run the browser in headless mode. Default: `True`.
|
||||
* `browser_mode (str)`: How the browser should be initialized ("builtin", "dedicated", "cdp", "docker"). Default: "dedicated".
|
||||
* `use_managed_browser (bool)`: Whether to launch the browser using a managed approach (e.g., via CDP). Default: `False`.
|
||||
* `cdp_url (Optional[str])`: URL for Chrome DevTools Protocol endpoint. Default: `None`.
|
||||
* `use_persistent_context (bool)`: Use a persistent browser context (profile). Default: `False`.
|
||||
* `user_data_dir (Optional[str])`: Path to user data directory for persistent sessions. Default: `None`.
|
||||
* `channel (Optional[str])`: Browser channel (e.g., "chromium", "chrome", "msedge"). Default: "chromium".
|
||||
* `proxy (Optional[str])`: Simple proxy server URL string.
|
||||
* `proxy_config (Optional[Union[ProxyConfig, dict, None]])`: Detailed proxy configuration object or dictionary. Takes precedence over `proxy`.
|
||||
* `viewport_width (int)`: Default viewport width. Default: `1080`.
|
||||
* `viewport_height (int)`: Default viewport height. Default: `600`.
|
||||
* `viewport (Optional[dict])`: Dictionary to set viewport dimensions, overrides `viewport_width` and `viewport_height` if set (e.g., `{"width": 1920, "height": 1080}`). Default: `None`.
|
||||
* `accept_downloads (bool)`: Whether to allow file downloads. Default: `False`.
|
||||
* `downloads_path (Optional[str])`: Directory to store downloaded files. Default: `None`.
|
||||
* `storage_state (Optional[Union[str, dict, None]])`: Path to a file or a dictionary containing browser storage state (cookies, localStorage). Default: `None`.
|
||||
* `ignore_https_errors (bool)`: Ignore HTTPS certificate errors. Default: `True`.
|
||||
* `java_script_enabled (bool)`: Enable JavaScript execution. Default: `True`.
|
||||
* `user_agent (str)`: Custom User-Agent string. Default: "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36".
|
||||
* `user_agent_mode (str)`: Mode for generating User-Agent (e.g., "random"). Default: `""` (uses provided `user_agent`).
|
||||
* `user_agent_generator_config (Optional[dict])`: Configuration for User-Agent generation if `user_agent_mode` is active. Default: `{}`.
|
||||
* `text_mode (bool)`: If `True`, disables images and rich content for faster loading. Default: `False`.
|
||||
* `light_mode (bool)`: Disables certain background features for performance. Default: `False`.
|
||||
* `extra_args (Optional[list])`: Additional command-line arguments for the browser. Default: `None` (resolves to `[]`).
|
||||
* `debugging_port (int)`: Port for browser debugging protocol. Default: `9222`.
|
||||
* `host (str)`: Host for browser debugging protocol. Default: "localhost".
|
||||
* 3.1.3. Key Public Methods:
|
||||
* `clone(**kwargs) -> BrowserConfig`: Creates a new `BrowserConfig` instance as a copy of the current one, with specified keyword arguments overriding existing values.
|
||||
* `to_dict() -> dict`: Returns a dictionary representation of the configuration object's attributes.
|
||||
* `dump() -> dict`: Serializes the configuration object to a JSON-serializable dictionary, including nested objects.
|
||||
* `static load(data: dict) -> BrowserConfig`: Deserializes a `BrowserConfig` instance from a dictionary (previously created by `dump`).
|
||||
* `static from_kwargs(kwargs: dict) -> BrowserConfig`: Creates a `BrowserConfig` instance directly from a dictionary of keyword arguments.
|
||||
|
||||
* 3.2. Class `CrawlerRunConfig` (from `crawl4ai.async_configs`)
|
||||
* 3.2.1. Purpose: Specifies settings for an individual crawl operation initiated by `arun()` or `arun_many()`. These settings can override or augment the global `BrowserConfig`.
|
||||
* 3.2.2. Initialization (`__init__`)
|
||||
* Signature:
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
# Content Processing Parameters
|
||||
word_count_threshold: int = MIN_WORD_THRESHOLD,
|
||||
extraction_strategy: Optional[ExtractionStrategy] = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
markdown_generator: MarkdownGenerationStrategy = DefaultMarkdownGenerator(),
|
||||
only_text: bool = False,
|
||||
css_selector: Optional[str] = None,
|
||||
target_elements: Optional[List[str]] = None,
|
||||
excluded_tags: Optional[list] = None,
|
||||
excluded_selector: Optional[str] = None,
|
||||
keep_data_attributes: bool = False,
|
||||
keep_attrs: Optional[list] = None,
|
||||
remove_forms: bool = False,
|
||||
prettify: bool = False,
|
||||
parser_type: str = "lxml",
|
||||
scraping_strategy: ContentScrapingStrategy = None, # Will default to WebScrapingStrategy
|
||||
proxy_config: Optional[Union[ProxyConfig, dict, None]] = None,
|
||||
proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
|
||||
# Browser Location and Identity Parameters
|
||||
locale: Optional[str] = None,
|
||||
timezone_id: Optional[str] = None,
|
||||
geolocation: Optional[GeolocationConfig] = None,
|
||||
# SSL Parameters
|
||||
fetch_ssl_certificate: bool = False,
|
||||
# Caching Parameters
|
||||
cache_mode: CacheMode = CacheMode.BYPASS,
|
||||
session_id: Optional[str] = None,
|
||||
bypass_cache: bool = False, # Legacy
|
||||
disable_cache: bool = False, # Legacy
|
||||
no_cache_read: bool = False, # Legacy
|
||||
no_cache_write: bool = False, # Legacy
|
||||
shared_data: Optional[dict] = None,
|
||||
# Page Navigation and Timing Parameters
|
||||
wait_until: str = "domcontentloaded",
|
||||
page_timeout: int = PAGE_TIMEOUT,
|
||||
wait_for: Optional[str] = None,
|
||||
wait_for_timeout: Optional[int] = None,
|
||||
wait_for_images: bool = False,
|
||||
delay_before_return_html: float = 0.1,
|
||||
mean_delay: float = 0.1,
|
||||
max_range: float = 0.3,
|
||||
semaphore_count: int = 5,
|
||||
# Page Interaction Parameters
|
||||
js_code: Optional[Union[str, List[str]]] = None,
|
||||
js_only: bool = False,
|
||||
ignore_body_visibility: bool = True,
|
||||
scan_full_page: bool = False,
|
||||
scroll_delay: float = 0.2,
|
||||
process_iframes: bool = False,
|
||||
remove_overlay_elements: bool = False,
|
||||
simulate_user: bool = False,
|
||||
override_navigator: bool = False,
|
||||
magic: bool = False,
|
||||
adjust_viewport_to_content: bool = False,
|
||||
# Media Handling Parameters
|
||||
screenshot: bool = False,
|
||||
screenshot_wait_for: Optional[float] = None,
|
||||
screenshot_height_threshold: int = SCREENSHOT_HEIGHT_THRESHOLD,
|
||||
pdf: bool = False,
|
||||
capture_mhtml: bool = False,
|
||||
image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
||||
image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
|
||||
table_score_threshold: int = 7,
|
||||
exclude_external_images: bool = False,
|
||||
exclude_all_images: bool = False,
|
||||
# Link and Domain Handling Parameters
|
||||
exclude_social_media_domains: Optional[list] = None, # Note: 'exclude_social_media_domains' from code
|
||||
exclude_external_links: bool = False,
|
||||
exclude_social_media_links: bool = False,
|
||||
exclude_domains: Optional[list] = None,
|
||||
exclude_internal_links: bool = False,
|
||||
# Debugging and Logging Parameters
|
||||
verbose: bool = True,
|
||||
log_console: bool = False,
|
||||
# Network and Console Capturing Parameters
|
||||
capture_network_requests: bool = False,
|
||||
capture_console_messages: bool = False,
|
||||
# Connection Parameters (for HTTPCrawlerStrategy)
|
||||
method: str = "GET",
|
||||
stream: bool = False,
|
||||
url: Optional[str] = None,
|
||||
# Robots.txt Handling
|
||||
check_robots_txt: bool = False,
|
||||
# User Agent Parameters
|
||||
user_agent: Optional[str] = None,
|
||||
user_agent_mode: Optional[str] = None,
|
||||
user_agent_generator_config: Optional[dict] = None, # Note: 'user_agent_generator_config' from code
|
||||
# Deep Crawl Parameters
|
||||
deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
|
||||
# Experimental Parameters
|
||||
experimental: Optional[Dict[str, Any]] = None,
|
||||
):
|
||||
```
|
||||
* Key Parameters:
|
||||
* `word_count_threshold (int)`: Minimum word count for a content block to be considered. Default: `MIN_WORD_THRESHOLD` (200).
|
||||
* `extraction_strategy (Optional[ExtractionStrategy])`: Strategy for structured data extraction (e.g., `LLMExtractionStrategy`, `JsonCssExtractionStrategy`). Default: `None` (falls back to `NoExtractionStrategy`).
|
||||
* `chunking_strategy (ChunkingStrategy)`: Strategy for splitting content into chunks before extraction. Default: `RegexChunking()`.
|
||||
* `markdown_generator (MarkdownGenerationStrategy)`: Strategy for converting HTML to Markdown. Default: `DefaultMarkdownGenerator()`.
|
||||
* `cache_mode (CacheMode)`: Caching behavior for this run. Default: `CacheMode.BYPASS`.
|
||||
* `session_id (Optional[str])`: ID for session persistence (reusing browser tabs/contexts). Default: `None`.
|
||||
* `js_code (Optional[Union[str, List[str]]])`: JavaScript code snippets to execute on the page. Default: `None`.
|
||||
* `wait_for (Optional[str])`: CSS selector or JS condition (prefixed with "js:") to wait for before proceeding. Default: `None`.
|
||||
* `page_timeout (int)`: Timeout for page operations (e.g., navigation) in milliseconds. Default: `PAGE_TIMEOUT` (60000ms).
|
||||
* `screenshot (bool)`: If `True`, capture a screenshot of the page. Default: `False`.
|
||||
* `pdf (bool)`: If `True`, generate a PDF of the page. Default: `False`.
|
||||
* `capture_mhtml (bool)`: If `True`, capture an MHTML snapshot of the page. Default: `False`.
|
||||
* `exclude_external_links (bool)`: If `True`, exclude external links from results. Default: `False`.
|
||||
* `stream (bool)`: If `True` (used with `arun_many`), results are yielded as an `AsyncGenerator`. Default: `False`.
|
||||
* `check_robots_txt (bool)`: If `True`, crawler will check and respect `robots.txt` rules. Default: `False`.
|
||||
* `user_agent (Optional[str])`: Override the browser's User-Agent for this specific run.
|
||||
* 3.2.3. Key Public Methods:
|
||||
* `clone(**kwargs) -> CrawlerRunConfig`: Creates a new `CrawlerRunConfig` instance as a copy of the current one, with specified keyword arguments overriding existing values.
|
||||
* `to_dict() -> dict`: Returns a dictionary representation of the configuration object's attributes.
|
||||
* `dump() -> dict`: Serializes the configuration object to a JSON-serializable dictionary, including nested objects.
|
||||
* `static load(data: dict) -> CrawlerRunConfig`: Deserializes a `CrawlerRunConfig` instance from a dictionary (previously created by `dump`).
|
||||
* `static from_kwargs(kwargs: dict) -> CrawlerRunConfig`: Creates a `CrawlerRunConfig` instance directly from a dictionary of keyword arguments.
|
||||
|
||||
* 3.3. Supporting Configuration Objects (from `crawl4ai.async_configs`)
|
||||
* 3.3.1. Class `GeolocationConfig`
|
||||
* Purpose: Defines geolocation (latitude, longitude, accuracy) to be emulated by the browser.
|
||||
* Initialization (`__init__`):
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
latitude: float,
|
||||
longitude: float,
|
||||
accuracy: Optional[float] = 0.0
|
||||
):
|
||||
```
|
||||
* Parameters:
|
||||
* `latitude (float)`: Latitude coordinate (e.g., 37.7749).
|
||||
* `longitude (float)`: Longitude coordinate (e.g., -122.4194).
|
||||
* `accuracy (Optional[float])`: Accuracy in meters. Default: `0.0`.
|
||||
* Methods:
|
||||
* `static from_dict(geo_dict: Dict) -> GeolocationConfig`: Creates an instance from a dictionary.
|
||||
* `to_dict() -> Dict`: Converts the instance to a dictionary.
|
||||
* `clone(**kwargs) -> GeolocationConfig`: Creates a copy with updated values.
|
||||
* 3.3.2. Class `ProxyConfig`
|
||||
* Purpose: Defines the settings for a single proxy server, including server address, authentication credentials, and optional IP.
|
||||
* Initialization (`__init__`):
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
server: str,
|
||||
username: Optional[str] = None,
|
||||
password: Optional[str] = None,
|
||||
ip: Optional[str] = None,
|
||||
):
|
||||
```
|
||||
* Parameters:
|
||||
* `server (str)`: Proxy server URL (e.g., "http://127.0.0.1:8080", "socks5://user:pass@host:port").
|
||||
* `username (Optional[str])`: Username for proxy authentication.
|
||||
* `password (Optional[str])`: Password for proxy authentication.
|
||||
* `ip (Optional[str])`: Optional IP address associated with the proxy for verification.
|
||||
* Methods:
|
||||
* `static from_string(proxy_str: str) -> ProxyConfig`: Creates an instance from a string (e.g., "ip:port:username:password" or "ip:port").
|
||||
* `static from_dict(proxy_dict: Dict) -> ProxyConfig`: Creates an instance from a dictionary.
|
||||
* `static from_env(env_var: str = "PROXIES") -> List[ProxyConfig]`: Loads a list of proxies from a comma-separated environment variable.
|
||||
* `to_dict() -> Dict`: Converts the instance to a dictionary.
|
||||
* `clone(**kwargs) -> ProxyConfig`: Creates a copy with updated values.
|
||||
* 3.3.3. Class `HTTPCrawlerConfig`
|
||||
* Purpose: Configuration for the `AsyncHTTPCrawlerStrategy`, specifying HTTP method, headers, data/JSON payload, and redirect/SSL verification behavior.
|
||||
* Initialization (`__init__`):
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
method: str = "GET",
|
||||
headers: Optional[Dict[str, str]] = None,
|
||||
data: Optional[Dict[str, Any]] = None,
|
||||
json: Optional[Dict[str, Any]] = None,
|
||||
follow_redirects: bool = True,
|
||||
verify_ssl: bool = True,
|
||||
):
|
||||
```
|
||||
* Parameters:
|
||||
* `method (str)`: HTTP method (e.g., "GET", "POST"). Default: "GET".
|
||||
* `headers (Optional[Dict[str, str]])`: Dictionary of HTTP request headers. Default: `None`.
|
||||
* `data (Optional[Dict[str, Any]])`: Dictionary of form data to send in the request body. Default: `None`.
|
||||
* `json (Optional[Dict[str, Any]])`: JSON data to send in the request body. Default: `None`.
|
||||
* `follow_redirects (bool)`: Whether to automatically follow HTTP redirects. Default: `True`.
|
||||
* `verify_ssl (bool)`: Whether to verify SSL certificates. Default: `True`.
|
||||
* Methods:
|
||||
* `static from_kwargs(kwargs: dict) -> HTTPCrawlerConfig`: Creates an instance from keyword arguments.
|
||||
* `to_dict() -> dict`: Converts config to a dictionary.
|
||||
* `clone(**kwargs) -> HTTPCrawlerConfig`: Creates a copy with updated values.
|
||||
* `dump() -> dict`: Serializes the config to a dictionary.
|
||||
* `static load(data: dict) -> HTTPCrawlerConfig`: Deserializes from a dictionary.
|
||||
* 3.3.4. Class `LLMConfig`
|
||||
* Purpose: Configures settings for interacting with Large Language Models, including provider choice, API credentials, and generation parameters.
|
||||
* Initialization (`__init__`):
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
provider: str = DEFAULT_PROVIDER,
|
||||
api_token: Optional[str] = None,
|
||||
base_url: Optional[str] = None,
|
||||
temperature: Optional[float] = None,
|
||||
max_tokens: Optional[int] = None,
|
||||
top_p: Optional[float] = None,
|
||||
frequency_penalty: Optional[float] = None,
|
||||
presence_penalty: Optional[float] = None,
|
||||
stop: Optional[List[str]] = None,
|
||||
n: Optional[int] = None,
|
||||
):
|
||||
```
|
||||
* Key Parameters:
|
||||
* `provider (str)`: Name of the LLM provider (e.g., "openai/gpt-4o", "ollama/llama3.3", "groq/llama3-8b-8192"). Default: `DEFAULT_PROVIDER` (from `crawl4ai.config`).
|
||||
* `api_token (Optional[str])`: API token for the LLM provider. If prefixed with "env:", it reads from the specified environment variable (e.g., "env:OPENAI_API_KEY"). If not provided, it attempts to load from default environment variables based on the provider.
|
||||
* `base_url (Optional[str])`: Custom base URL for the LLM API endpoint.
|
||||
* `temperature (Optional[float])`: Sampling temperature for generation.
|
||||
* `max_tokens (Optional[int])`: Maximum number of tokens to generate.
|
||||
* `top_p (Optional[float])`: Nucleus sampling parameter.
|
||||
* `frequency_penalty (Optional[float])`: Penalty for token frequency.
|
||||
* `presence_penalty (Optional[float])`: Penalty for token presence.
|
||||
* `stop (Optional[List[str]])`: List of stop sequences for generation.
|
||||
* `n (Optional[int])`: Number of completions to generate.
|
||||
* Methods:
|
||||
* `static from_kwargs(kwargs: dict) -> LLMConfig`: Creates an instance from keyword arguments.
|
||||
* `to_dict() -> dict`: Converts config to a dictionary.
|
||||
* `clone(**kwargs) -> LLMConfig`: Creates a copy with updated values.
|
||||
|
||||
## 4. Core Data Models (Results & Payloads from `crawl4ai.models`)
|
||||
|
||||
* 4.1. Class `CrawlResult(BaseModel)`
|
||||
* Purpose: A Pydantic model representing the comprehensive result of a single crawl and processing operation.
|
||||
* Key Fields:
|
||||
* `url (str)`: The final URL that was crawled (after any redirects).
|
||||
* `html (str)`: The raw HTML content fetched from the URL.
|
||||
* `success (bool)`: `True` if the crawl operation (fetching and initial processing) was successful, `False` otherwise.
|
||||
* `cleaned_html (Optional[str])`: HTML content after sanitization and removal of unwanted tags/attributes as per configuration. Default: `None`.
|
||||
* `_markdown (Optional[MarkdownGenerationResult])`: (Private Attribute) Holds the `MarkdownGenerationResult` object if Markdown generation was performed. Use the `markdown` property to access. Default: `None`.
|
||||
* `markdown (Optional[Union[str, MarkdownGenerationResult]])`: (Property) Provides access to Markdown content. Behaves as a string (raw markdown) by default but allows access to `MarkdownGenerationResult` attributes (e.g., `result.markdown.fit_markdown`).
|
||||
* `extracted_content (Optional[str])`: JSON string representation of structured data extracted by an `ExtractionStrategy`. Default: `None`.
|
||||
* `media (Media)`: An object containing lists of `MediaItem` for images, videos, audio, and extracted tables. Default: `Media()`.
|
||||
* `links (Links)`: An object containing lists of `Link` for internal and external hyperlinks found on the page. Default: `Links()`.
|
||||
* `downloaded_files (Optional[List[str]])`: A list of file paths if any files were downloaded during the crawl. Default: `None`.
|
||||
* `js_execution_result (Optional[Dict[str, Any]])`: The result of any JavaScript code executed on the page. Default: `None`.
|
||||
* `screenshot (Optional[str])`: Base64 encoded string of the page screenshot, if `screenshot=True` was set. Default: `None`.
|
||||
* `pdf (Optional[bytes])`: Raw bytes of the PDF generated from the page, if `pdf=True` was set. Default: `None`.
|
||||
* `mhtml (Optional[str])`: MHTML snapshot of the page, if `capture_mhtml=True` was set. Default: `None`.
|
||||
* `metadata (Optional[dict])`: Dictionary of metadata extracted from the page (e.g., title, description, OpenGraph tags, Twitter card data). Default: `None`.
|
||||
* `error_message (Optional[str])`: A message describing the error if `success` is `False`. Default: `None`.
|
||||
* `session_id (Optional[str])`: The session ID used for this crawl, if applicable. Default: `None`.
|
||||
* `response_headers (Optional[dict])`: HTTP response headers from the server. Default: `None`.
|
||||
* `status_code (Optional[int])`: HTTP status code of the response. Default: `None`.
|
||||
* `ssl_certificate (Optional[SSLCertificate])`: Information about the SSL certificate if `fetch_ssl_certificate=True`. Default: `None`.
|
||||
* `dispatch_result (Optional[DispatchResult])`: Metadata about the task execution from the dispatcher (e.g., timings, memory usage). Default: `None`.
|
||||
* `redirected_url (Optional[str])`: The original URL if the request was redirected. Default: `None`.
|
||||
* `network_requests (Optional[List[Dict[str, Any]]])`: List of captured network requests if `capture_network_requests=True`. Default: `None`.
|
||||
* `console_messages (Optional[List[Dict[str, Any]]])`: List of captured browser console messages if `capture_console_messages=True`. Default: `None`.
|
||||
* Methods:
|
||||
* `model_dump(*args, **kwargs)`: Serializes the `CrawlResult` model to a dictionary, ensuring the `_markdown` private attribute is correctly handled and included as "markdown" in the output if present.
|
||||
|
||||
* 4.2. Class `MarkdownGenerationResult(BaseModel)`
|
||||
* Purpose: A Pydantic model that holds various forms of Markdown generated from HTML content.
|
||||
* Fields:
|
||||
* `raw_markdown (str)`: The basic, direct conversion of HTML to Markdown.
|
||||
* `markdown_with_citations (str)`: Markdown content with inline citations (e.g., [^1^]) and a references section.
|
||||
* `references_markdown (str)`: The Markdown content for the "References" section, listing all cited links.
|
||||
* `fit_markdown (Optional[str])`: Markdown generated specifically from content deemed "relevant" by a content filter (like `PruningContentFilter` or `LLMContentFilter`), if such a filter was applied. Default: `None`.
|
||||
* `fit_html (Optional[str])`: The filtered HTML content that was used to generate `fit_markdown`. Default: `None`.
|
||||
* Methods:
|
||||
* `__str__(self) -> str`: Returns `self.raw_markdown` when the object is cast to a string.
|
||||
|
||||
* 4.3. Class `ScrapingResult(BaseModel)`
|
||||
* Purpose: A Pydantic model representing a standardized output from content scraping strategies.
|
||||
* Fields:
|
||||
* `cleaned_html (str)`: The primary sanitized and processed HTML content.
|
||||
* `success (bool)`: Indicates if the scraping operation was successful.
|
||||
* `media (Media)`: A `Media` object containing extracted images, videos, audio, and tables.
|
||||
* `links (Links)`: A `Links` object containing extracted internal and external links.
|
||||
* `metadata (Dict[str, Any])`: A dictionary of metadata extracted from the page (e.g., title, description).
|
||||
|
||||
* 4.4. Class `MediaItem(BaseModel)`
|
||||
* Purpose: A Pydantic model representing a generic media item like an image, video, or audio file.
|
||||
* Fields:
|
||||
* `src (Optional[str])`: The source URL of the media item. Default: `""`.
|
||||
* `data (Optional[str])`: Base64 encoded data for inline media. Default: `""`.
|
||||
* `alt (Optional[str])`: Alternative text for the media item (e.g., image alt text). Default: `""`.
|
||||
* `desc (Optional[str])`: A description or surrounding text related to the media item. Default: `""`.
|
||||
* `score (Optional[int])`: A relevance or importance score, if calculated by a strategy. Default: `0`.
|
||||
* `type (str)`: The type of media (e.g., "image", "video", "audio"). Default: "image".
|
||||
* `group_id (Optional[int])`: An identifier to group related media variants (e.g., different resolutions of the same image from a srcset). Default: `0`.
|
||||
* `format (Optional[str])`: The detected file format (e.g., "jpeg", "png", "mp4"). Default: `None`.
|
||||
* `width (Optional[int])`: The width of the media item in pixels, if available. Default: `None`.
|
||||
|
||||
* 4.5. Class `Link(BaseModel)`
|
||||
* Purpose: A Pydantic model representing an extracted hyperlink.
|
||||
* Fields:
|
||||
* `href (Optional[str])`: The URL (href attribute) of the link. Default: `""`.
|
||||
* `text (Optional[str])`: The anchor text of the link. Default: `""`.
|
||||
* `title (Optional[str])`: The title attribute of the link, if present. Default: `""`.
|
||||
* `base_domain (Optional[str])`: The base domain extracted from the `href`. Default: `""`.
|
||||
|
||||
* 4.6. Class `Media(BaseModel)`
|
||||
* Purpose: A Pydantic model that acts as a container for lists of different types of media items found on a page.
|
||||
* Fields:
|
||||
* `images (List[MediaItem])`: A list of `MediaItem` objects representing images. Default: `[]`.
|
||||
* `videos (List[MediaItem])`: A list of `MediaItem` objects representing videos. Default: `[]`.
|
||||
* `audios (List[MediaItem])`: A list of `MediaItem` objects representing audio files. Default: `[]`.
|
||||
* `tables (List[Dict])`: A list of dictionaries, where each dictionary represents an extracted HTML table with keys like "headers", "rows", "caption", "summary". Default: `[]`.
|
||||
|
||||
* 4.7. Class `Links(BaseModel)`
|
||||
* Purpose: A Pydantic model that acts as a container for lists of internal and external links.
|
||||
* Fields:
|
||||
* `internal (List[Link])`: A list of `Link` objects considered internal to the crawled site. Default: `[]`.
|
||||
* `external (List[Link])`: A list of `Link` objects pointing to external sites. Default: `[]`.
|
||||
|
||||
* 4.8. Class `AsyncCrawlResponse(BaseModel)`
|
||||
* Purpose: A Pydantic model representing the raw response from a crawler strategy's `crawl` method. This data is then processed further to create a `CrawlResult`.
|
||||
* Fields:
|
||||
* `html (str)`: The raw HTML content of the page.
|
||||
* `response_headers (Dict[str, str])`: A dictionary of HTTP response headers.
|
||||
* `js_execution_result (Optional[Dict[str, Any]])`: The result from any JavaScript code executed on the page. Default: `None`.
|
||||
* `status_code (int)`: The HTTP status code of the response.
|
||||
* `screenshot (Optional[str])`: Base64 encoded screenshot data, if captured. Default: `None`.
|
||||
* `pdf_data (Optional[bytes])`: Raw PDF data, if captured. Default: `None`.
|
||||
* `mhtml_data (Optional[str])`: MHTML snapshot data, if captured. Default: `None`.
|
||||
* `downloaded_files (Optional[List[str]])`: A list of local file paths for any files downloaded during the crawl. Default: `None`.
|
||||
* `ssl_certificate (Optional[SSLCertificate])`: SSL certificate information for the site. Default: `None`.
|
||||
* `redirected_url (Optional[str])`: The original URL requested if the final URL is a result of redirection. Default: `None`.
|
||||
* `network_requests (Optional[List[Dict[str, Any]]])`: Captured network requests if enabled. Default: `None`.
|
||||
* `console_messages (Optional[List[Dict[str, Any]]])`: Captured console messages if enabled. Default: `None`.
|
||||
|
||||
* 4.9. Class `TokenUsage(BaseModel)`
|
||||
* Purpose: A Pydantic model to track token usage statistics for interactions with Large Language Models.
|
||||
* Fields:
|
||||
* `completion_tokens (int)`: Number of tokens used for the LLM's completion/response. Default: `0`.
|
||||
* `prompt_tokens (int)`: Number of tokens used for the input prompt to the LLM. Default: `0`.
|
||||
* `total_tokens (int)`: Total number of tokens used (prompt + completion). Default: `0`.
|
||||
* `completion_tokens_details (Optional[dict])`: Provider-specific detailed breakdown of completion tokens. Default: `None`.
|
||||
* `prompt_tokens_details (Optional[dict])`: Provider-specific detailed breakdown of prompt tokens. Default: `None`.
|
||||
|
||||
* 4.10. Class `SSLCertificate(dict)` (from `crawl4ai.ssl_certificate`)
|
||||
* Purpose: Represents an SSL certificate's information, behaving like a dictionary for direct JSON serialization and easy access to its fields.
|
||||
* Key Fields (accessed as dictionary keys):
|
||||
* `subject (dict)`: Dictionary of subject fields (e.g., `{"CN": "example.com", "O": "Example Inc."}`).
|
||||
* `issuer (dict)`: Dictionary of issuer fields.
|
||||
* `version (int)`: Certificate version number.
|
||||
* `serial_number (str)`: Certificate serial number (hexadecimal string).
|
||||
* `not_before (str)`: Validity start date and time (ASN.1/UTC format string, e.g., "YYYYMMDDHHMMSSZ").
|
||||
* `not_after (str)`: Validity end date and time (ASN.1/UTC format string).
|
||||
* `fingerprint (str)`: SHA-256 fingerprint of the certificate (lowercase hex string).
|
||||
* `signature_algorithm (str)`: The algorithm used to sign the certificate (e.g., "sha256WithRSAEncryption").
|
||||
* `raw_cert (str)`: Base64 encoded string of the raw DER-encoded certificate.
|
||||
* `extensions (List[dict])`: A list of dictionaries, each representing a certificate extension with "name" and "value" keys.
|
||||
* Static Methods:
|
||||
* `from_url(url: str, timeout: int = 10) -> Optional[SSLCertificate]`: Fetches the SSL certificate from the given URL and returns an `SSLCertificate` instance, or `None` on failure.
|
||||
* Instance Methods:
|
||||
* `to_json(filepath: Optional[str] = None) -> Optional[str]`: Exports the certificate information as a JSON string. If `filepath` is provided, writes to the file and returns `None`.
|
||||
* `to_pem(filepath: Optional[str] = None) -> Optional[str]`: Exports the certificate in PEM format as a string. If `filepath` is provided, writes to the file and returns `None`.
|
||||
* `to_der(filepath: Optional[str] = None) -> Optional[bytes]`: Exports the raw certificate in DER format as bytes. If `filepath` is provided, writes to the file and returns `None`.
|
||||
* Example:
|
||||
```python
|
||||
# Assuming 'cert' is an SSLCertificate instance
|
||||
# print(cert["subject"]["CN"])
|
||||
# cert.to_pem("my_cert.pem")
|
||||
```
|
||||
|
||||
* 4.11. Class `DispatchResult(BaseModel)`
|
||||
* Purpose: Contains metadata about a task's execution when processed by a dispatcher (e.g., in `arun_many`).
|
||||
* Fields:
|
||||
* `task_id (str)`: A unique identifier for the dispatched task.
|
||||
* `memory_usage (float)`: Memory usage (in MB) recorded during the task's execution.
|
||||
* `peak_memory (float)`: Peak memory usage (in MB) recorded during the task's execution.
|
||||
* `start_time (Union[datetime, float])`: The start time of the task (can be a `datetime` object or a Unix timestamp float).
|
||||
* `end_time (Union[datetime, float])`: The end time of the task.
|
||||
* `error_message (str)`: Any error message if the task failed during dispatch or execution. Default: `""`.
|
||||
|
||||
* 4.12. `CrawlResultContainer(Generic[CrawlResultT])`
|
||||
* Purpose: A generic container for `CrawlResult` objects, primarily used as the return type for `arun_many` when `stream=False`. It behaves like a list, allowing iteration, indexing, and length checking.
|
||||
* Methods:
|
||||
* `__iter__(self)`: Allows iteration over the contained `CrawlResult` objects.
|
||||
* `__getitem__(self, index)`: Allows accessing `CrawlResult` objects by index.
|
||||
* `__len__(self)`: Returns the number of `CrawlResult` objects contained.
|
||||
* `__repr__(self)`: Provides a string representation of the container.
|
||||
* Attribute:
|
||||
* `_results (List[CrawlResultT])`: The internal list holding the `CrawlResult` objects.
|
||||
|
||||
* 4.13. `RunManyReturn` (Type Alias from `crawl4ai.models`)
|
||||
* Purpose: A type alias defining the possible return types for the `arun_many` method of `AsyncWebCrawler`.
|
||||
* Definition: `Union[CrawlResultContainer[CrawlResult], AsyncGenerator[CrawlResult, None]]`
|
||||
* This means `arun_many` will return a `CrawlResultContainer` (a list-like object of all `CrawlResult` instances) if `CrawlerRunConfig.stream` is `False` (the default).
|
||||
* It will return an `AsyncGenerator` yielding individual `CrawlResult` instances if `CrawlerRunConfig.stream` is `True`.
|
||||
|
||||
## 5. Core Crawler Strategies (from `crawl4ai.async_crawler_strategy`)
|
||||
|
||||
* 5.1. Abstract Base Class `AsyncCrawlerStrategy(ABC)`
|
||||
* Purpose: Defines the common interface that all asynchronous crawler strategies must implement. This allows `AsyncWebCrawler` to use different fetching mechanisms (e.g., Playwright, HTTP requests) interchangeably.
|
||||
* Initialization (`__init__`):
|
||||
```python
|
||||
def __init__(self, browser_config: BrowserConfig, logger: AsyncLoggerBase):
|
||||
```
|
||||
* Parameters:
|
||||
* `browser_config (BrowserConfig)`: The browser configuration to be used by the strategy.
|
||||
* `logger (AsyncLoggerBase)`: The logger instance for logging strategy-specific events.
|
||||
* Key Abstract Methods (must be implemented by concrete subclasses):
|
||||
* `async crawl(self, url: str, config: CrawlerRunConfig) -> AsyncCrawlResponse`:
|
||||
* Purpose: Fetches the content from the given URL according to the `config`.
|
||||
* Returns: An `AsyncCrawlResponse` object containing the raw fetched data.
|
||||
* `async __aenter__(self)`:
|
||||
* Purpose: Asynchronous context manager entry, typically for initializing resources (e.g., launching a browser).
|
||||
* `async __aexit__(self, exc_type, exc_val, exc_tb)`:
|
||||
* Purpose: Asynchronous context manager exit, for cleaning up resources.
|
||||
* Key Concrete Methods (available to all strategies):
|
||||
* `set_custom_headers(self, headers: dict) -> None`:
|
||||
* Purpose: Sets custom HTTP headers to be used by the strategy for subsequent requests.
|
||||
* `update_user_agent(self, user_agent: str) -> None`:
|
||||
* Purpose: Updates the User-Agent string used by the strategy.
|
||||
* `set_hook(self, hook_name: str, callback: Callable) -> None`:
|
||||
* Purpose: Registers a callback function for a specific hook point in the crawling lifecycle.
|
||||
* `async run_hook(self, hook_name: str, *args, **kwargs) -> Any`:
|
||||
* Purpose: Executes a registered hook with the given arguments.
|
||||
* `async get_default_context(self) -> BrowserContext`:
|
||||
* Purpose: Retrieves the default browser context (Playwright specific, might raise `NotImplementedError` in non-Playwright strategies).
|
||||
* `async create_new_page(self, context: BrowserContext) -> Page`:
|
||||
* Purpose: Creates a new page within a given browser context (Playwright specific).
|
||||
* `async get_page(self, url: str, config: CrawlerRunConfig, session_id: Optional[str]) -> Tuple[Page, BrowserContext]`:
|
||||
* Purpose: Gets an existing page/context for a session or creates a new one (Playwright specific, managed by `BrowserManager`).
|
||||
* `async close_page(self, page: Page, session_id: Optional[str]) -> None`:
|
||||
* Purpose: Closes a page, potentially keeping the associated context/session alive (Playwright specific).
|
||||
* `async kill_session(self, session_id: str) -> None`:
|
||||
* Purpose: Kills (closes) a specific browser session, including its page and context (Playwright specific).
|
||||
|
||||
* 5.2. Class `AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy)`
|
||||
* Purpose: The default crawler strategy, using Playwright to control a web browser for fetching and interacting with web pages. It supports complex JavaScript execution and provides hooks for various stages of the crawl.
|
||||
* Initialization (`__init__`):
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
browser_config: Optional[BrowserConfig] = None,
|
||||
logger: Optional[AsyncLoggerBase] = None,
|
||||
browser_manager: Optional[BrowserManager] = None
|
||||
):
|
||||
```
|
||||
* Parameters:
|
||||
* `browser_config (Optional[BrowserConfig])`: Browser configuration. Defaults to a new `BrowserConfig()` if not provided.
|
||||
* `logger (Optional[AsyncLoggerBase])`: Logger instance. Defaults to a new `AsyncLogger()`.
|
||||
* `browser_manager (Optional[BrowserManager])`: An instance of `BrowserManager` to manage browser lifecycles and contexts. If `None`, a new `BrowserManager` is created internally.
|
||||
* Key Overridden/Implemented Methods:
|
||||
* `async crawl(self, url: str, config: CrawlerRunConfig) -> AsyncCrawlResponse`:
|
||||
* Purpose: Implements the crawling logic using Playwright. It navigates to the URL, executes JavaScript if specified, waits for conditions, captures screenshots/PDFs if requested, and returns the page content and other metadata.
|
||||
* `async aprocess_html(self, url: str, html: str, config: CrawlerRunConfig, **kwargs) -> CrawlResult`:
|
||||
* Purpose: (Note: While `AsyncWebCrawler` calls this, the default implementation is in `AsyncPlaywrightCrawlerStrategy` for convenience, acting as a bridge to the scraping strategy.) Processes the fetched HTML to produce a `CrawlResult`. This involves using the `scraping_strategy` from the `config` (defaults to `WebScrapingStrategy`) to clean HTML, extract media/links, and then uses the `markdown_generator` to produce Markdown.
|
||||
* Specific Public Methods:
|
||||
* `async_create_new_context(self, config: Optional[CrawlerRunConfig] = None) -> BrowserContext`:
|
||||
* Purpose: Creates a new Playwright `BrowserContext` based on the global `BrowserConfig` and optional overrides from `CrawlerRunConfig`.
|
||||
* `async_setup_context_default(self, context: BrowserContext, config: Optional[CrawlerRunConfig] = None) -> None`:
|
||||
* Purpose: Applies default settings to a `BrowserContext`, such as viewport size, user agent, custom headers, locale, timezone, and geolocation, based on `BrowserConfig` and `CrawlerRunConfig`.
|
||||
* `async_setup_context_hooks(self, context: BrowserContext, config: CrawlerRunConfig) -> None`:
|
||||
* Purpose: Sets up event listeners on the context for capturing network requests and console messages if `config.capture_network_requests` or `config.capture_console_messages` is `True`.
|
||||
* `async_handle_storage_state(self, context: BrowserContext, config: CrawlerRunConfig) -> None`:
|
||||
* Purpose: Loads cookies and localStorage from a `storage_state` file or dictionary (specified in `BrowserConfig` or `CrawlerRunConfig`) into the given `BrowserContext`.
|
||||
* Hooks (Callable via `set_hook(hook_name, callback)` and executed by `async_run_hook`):
|
||||
* `on_browser_created`: Called after the Playwright browser instance is launched or connected. Callback receives `(browser, **kwargs)`.
|
||||
* `on_page_context_created`: Called after a new Playwright `BrowserContext` and `Page` are created. Callback receives `(page, context, **kwargs)`.
|
||||
* `before_goto`: Called just before `page.goto(url)` is executed. Callback receives `(page, context, url, **kwargs)`.
|
||||
* `after_goto`: Called after `page.goto(url)` completes successfully. Callback receives `(page, context, url, response, **kwargs)`.
|
||||
* `on_user_agent_updated`: Called when the User-Agent string is updated for a context. Callback receives `(page, context, user_agent, **kwargs)`.
|
||||
* `on_execution_started`: Called when `js_code` execution begins on a page. Callback receives `(page, context, **kwargs)`.
|
||||
* `before_retrieve_html`: Called just before the final HTML content is retrieved from the page. Callback receives `(page, context, **kwargs)`.
|
||||
* `before_return_html`: Called just before the `AsyncCrawlResponse` is returned by the `crawl()` method of the strategy. Callback receives `(page, context, html_content, **kwargs)`.
|
||||
|
||||
* 5.3. Class `AsyncHTTPCrawlerStrategy(AsyncCrawlerStrategy)`
|
||||
* Purpose: A lightweight crawler strategy that uses direct HTTP requests (via `httpx`) instead of a full browser. Suitable for static sites or when JavaScript execution is not needed.
|
||||
* Initialization (`__init__`):
|
||||
```python
|
||||
def __init__(self, http_config: Optional[HTTPCrawlerConfig] = None, logger: Optional[AsyncLoggerBase] = None):
|
||||
```
|
||||
* Parameters:
|
||||
* `http_config (Optional[HTTPCrawlerConfig])`: Configuration for HTTP requests (method, headers, data, etc.). Defaults to a new `HTTPCrawlerConfig()`.
|
||||
* `logger (Optional[AsyncLoggerBase])`: Logger instance. Defaults to a new `AsyncLogger()`.
|
||||
* Key Overridden/Implemented Methods:
|
||||
* `async crawl(self, url: str, http_config: Optional[HTTPCrawlerConfig] = None, **kwargs) -> AsyncCrawlResponse`:
|
||||
* Purpose: Fetches content from the URL using an HTTP GET or POST request via `httpx`. Does not execute JavaScript. Returns an `AsyncCrawlResponse` with HTML, status code, and headers. Screenshot, PDF, and MHTML capabilities are not available with this strategy.
|
||||
|
||||
## 6. Browser Management (from `crawl4ai.browser_manager`)
|
||||
|
||||
* 6.1. Class `BrowserManager`
|
||||
* Purpose: Manages the lifecycle of Playwright browser instances and their contexts. It handles launching/connecting to browsers, creating new contexts with specific configurations, managing sessions for page reuse, and cleaning up resources.
|
||||
* Initialization (`__init__`):
|
||||
```python
|
||||
def __init__(self, browser_config: BrowserConfig, logger: Optional[AsyncLoggerBase] = None):
|
||||
```
|
||||
* Parameters:
|
||||
* `browser_config (BrowserConfig)`: The global browser configuration settings.
|
||||
* `logger (Optional[AsyncLoggerBase])`: Logger instance for browser management events.
|
||||
* Key Methods:
|
||||
* `async start() -> None`: Initializes the Playwright instance and launches or connects to the browser based on `browser_config` (e.g., launches a new browser instance or connects to an existing CDP endpoint via `ManagedBrowser`).
|
||||
* `async create_browser_context(self, crawlerRunConfig: Optional[CrawlerRunConfig] = None) -> playwright.async_api.BrowserContext`: Creates a new browser context. If `crawlerRunConfig` is provided, its settings (e.g., locale, viewport, proxy) can override the global `BrowserConfig`.
|
||||
* `async setup_context(self, context: playwright.async_api.BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None, is_default: bool = False) -> None`: Applies various settings to a given browser context, including headers, cookies, viewport, geolocation, permissions, and storage state, based on `BrowserConfig` and `CrawlerRunConfig`.
|
||||
* `async get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[playwright.async_api.Page, playwright.async_api.BrowserContext]`: Retrieves an existing page and context for a given `session_id` (if present in `crawlerRunConfig` and the session is active) or creates a new page and context. Manages context reuse based on a signature derived from `CrawlerRunConfig` to ensure contexts with different core settings (like proxy, locale) are isolated.
|
||||
* `async kill_session(self, session_id: str) -> None`: Closes the page and browser context associated with the given `session_id`, effectively ending that session.
|
||||
* `async close() -> None`: Closes all managed browser contexts and the main browser instance.
|
||||
|
||||
* 6.2. Class `ManagedBrowser`
|
||||
* Purpose: Manages the lifecycle of a single, potentially persistent, browser process. It's used when `BrowserConfig.use_managed_browser` is `True` or `BrowserConfig.use_persistent_context` is `True`. It handles launching the browser with a specific user data directory and connecting via CDP.
|
||||
* Initialization (`__init__`):
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
browser_type: str = "chromium",
|
||||
user_data_dir: Optional[str] = None,
|
||||
headless: bool = False,
|
||||
logger=None,
|
||||
host: str = "localhost",
|
||||
debugging_port: int = 9222,
|
||||
cdp_url: Optional[str] = None, # Added as per code_analysis
|
||||
browser_config: Optional[BrowserConfig] = None # Added as per code_analysis
|
||||
):
|
||||
```
|
||||
* Parameters:
|
||||
* `browser_type (str)`: "chromium", "firefox", or "webkit". Default: "chromium".
|
||||
* `user_data_dir (Optional[str])`: Path to the user data directory for the browser profile. If `None`, a temporary directory might be created.
|
||||
* `headless (bool)`: Whether to launch the browser in headless mode. Default: `False` (typically for managed/persistent scenarios).
|
||||
* `logger`: Logger instance.
|
||||
* `host (str)`: Host for the debugging port. Default: "localhost".
|
||||
* `debugging_port (int)`: Port for the Chrome DevTools Protocol. Default: `9222`.
|
||||
* `cdp_url (Optional[str])`: If provided, attempts to connect to an existing browser at this CDP URL instead of launching a new one.
|
||||
* `browser_config (Optional[BrowserConfig])`: The `BrowserConfig` object providing overall browser settings.
|
||||
* Key Methods:
|
||||
* `async start() -> str`: Starts the browser process (if not connecting to an existing `cdp_url`). If a new browser is launched, it uses the specified `user_data_dir` and `debugging_port`.
|
||||
* Returns: The CDP endpoint URL (e.g., "http://localhost:9222").
|
||||
* `async cleanup() -> None`: Terminates the browser process (if launched by this instance) and removes any temporary user data directory created by it.
|
||||
* Static Methods:
|
||||
* `async create_profile(cls, browser_config: Optional[BrowserConfig] = None, profile_name: Optional[str] = None, logger=None) -> str`:
|
||||
* Purpose: Launches a browser instance with a new or existing user profile, allowing interactive setup (e.g., manual login, cookie acceptance). The browser remains open until the user closes it.
|
||||
* Parameters:
|
||||
* `browser_config (Optional[BrowserConfig])`: Optional browser configuration to use.
|
||||
* `profile_name (Optional[str])`: Name for the profile. If `None`, a default name is used.
|
||||
* `logger`: Logger instance.
|
||||
* Returns: The path to the created/used user data directory, which can then be passed to `BrowserConfig.user_data_dir`.
|
||||
* `list_profiles(cls) -> List[str]`:
|
||||
* Purpose: Lists the names of all browser profiles stored in the default Crawl4AI profiles directory (`~/.crawl4ai/profiles`).
|
||||
* Returns: A list of profile name strings.
|
||||
* `delete_profile(cls, profile_name_or_path: str) -> bool`:
|
||||
* Purpose: Deletes a browser profile either by its name (if in the default directory) or by its full path.
|
||||
* Returns: `True` if deletion was successful, `False` otherwise.
|
||||
|
||||
* 6.3. Function `clone_runtime_state(src: BrowserContext, dst: BrowserContext, crawlerRunConfig: Optional[CrawlerRunConfig] = None, browserConfig: Optional[BrowserConfig] = None) -> None`
|
||||
* Purpose: Asynchronously copies runtime state (cookies, localStorage, session storage) from a source `BrowserContext` to a destination `BrowserContext`. Can also apply headers and geolocation from `CrawlerRunConfig` or `BrowserConfig` to the destination context.
|
||||
* Parameters:
|
||||
* `src (BrowserContext)`: The source browser context.
|
||||
* `dst (BrowserContext)`: The destination browser context.
|
||||
* `crawlerRunConfig (Optional[CrawlerRunConfig])`: Optional run configuration to apply to `dst`.
|
||||
* `browserConfig (Optional[BrowserConfig])`: Optional browser configuration to apply to `dst`.
|
||||
|
||||
## 7. Proxy Rotation Strategies (from `crawl4ai.proxy_strategy`)
|
||||
|
||||
* 7.1. Abstract Base Class `ProxyRotationStrategy(ABC)`
|
||||
* Purpose: Defines the interface for strategies that provide a sequence of proxy configurations, enabling proxy rotation.
|
||||
* Abstract Methods:
|
||||
* `async get_next_proxy(self) -> Optional[ProxyConfig]`:
|
||||
* Purpose: Asynchronously retrieves the next `ProxyConfig` from the strategy.
|
||||
* Returns: A `ProxyConfig` object or `None` if no more proxies are available or an error occurs.
|
||||
* `add_proxies(self, proxies: List[ProxyConfig]) -> None`:
|
||||
* Purpose: Adds a list of `ProxyConfig` objects to the strategy's pool of proxies.
|
||||
|
||||
* 7.2. Class `RoundRobinProxyStrategy(ProxyRotationStrategy)`
|
||||
* Purpose: A simple proxy rotation strategy that cycles through a list of provided proxies in a round-robin fashion.
|
||||
* Initialization (`__init__`):
|
||||
```python
|
||||
def __init__(self, proxies: Optional[List[ProxyConfig]] = None):
|
||||
```
|
||||
* Parameters:
|
||||
* `proxies (Optional[List[ProxyConfig]])`: An initial list of `ProxyConfig` objects. If `None`, the list is empty and proxies must be added via `add_proxies`.
|
||||
* Methods:
|
||||
* `add_proxies(self, proxies: List[ProxyConfig]) -> None`: Adds new `ProxyConfig` objects to the internal list of proxies and reinitializes the cycle.
|
||||
* `async get_next_proxy(self) -> Optional[ProxyConfig]`: Returns the next `ProxyConfig` from the list, cycling back to the beginning when the end is reached. Returns `None` if the list is empty.
|
||||
|
||||
## 8. Logging (from `crawl4ai.async_logger`)
|
||||
|
||||
* 8.1. Abstract Base Class `AsyncLoggerBase(ABC)`
|
||||
* Purpose: Defines the basic interface for an asynchronous logger. Concrete implementations should provide methods for logging messages at different levels.
|
||||
* 8.2. Class `AsyncLogger(AsyncLoggerBase)`
|
||||
* Purpose: The default asynchronous logger for `crawl4ai`. It provides structured logging to both the console and optionally to a file, with customizable icons, colors, and verbosity levels.
|
||||
* Initialization (`__init__`):
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
log_file: Optional[str] = None,
|
||||
verbose: bool = True,
|
||||
tag_width: int = 15, # outline had 10, code has 15
|
||||
icons: Optional[Dict[str, str]] = None,
|
||||
colors: Optional[Dict[LogLevel, LogColor]] = None, # Corrected type annotation
|
||||
log_level: LogLevel = LogLevel.INFO # Assuming LogLevel.INFO is a typical default
|
||||
):
|
||||
```
|
||||
* Parameters:
|
||||
* `log_file (Optional[str])`: Path to a file where logs should be written. If `None`, logs only to console.
|
||||
* `verbose (bool)`: If `True`, enables more detailed logging (DEBUG level). Default: `True`.
|
||||
* `tag_width (int)`: Width for the tag part of the log message. Default: `15`.
|
||||
* `icons (Optional[Dict[str, str]])`: Custom icons for different log tags.
|
||||
* `colors (Optional[Dict[LogLevel, LogColor]])`: Custom colors for different log levels.
|
||||
* `log_level (LogLevel)`: Minimum log level to output.
|
||||
* Key Methods (for logging):
|
||||
* `info(self, message: str, tag: Optional[str] = None, **params) -> None`: Logs an informational message.
|
||||
* `warning(self, message: str, tag: Optional[str] = None, **params) -> None`: Logs a warning message.
|
||||
* `error(self, message: str, tag: Optional[str] = None, **params) -> None`: Logs an error message.
|
||||
* `debug(self, message: str, tag: Optional[str] = None, **params) -> None`: Logs a debug message (only if `verbose=True` or `log_level` is DEBUG).
|
||||
* `url_status(self, url: str, success: bool, timing: float, tag: str = "FETCH", **params) -> None`: Logs the status of a URL fetch operation, including success/failure and timing.
|
||||
* `error_status(self, url: str, error: str, tag: str = "ERROR", **params) -> None`: Logs an error encountered for a specific URL.
|
||||
|
||||
## 9. Core Utility Functions (from `crawl4ai.async_configs`)
|
||||
* 9.1. `to_serializable_dict(obj: Any, ignore_default_value: bool = False) -> Dict`
|
||||
* Purpose: Recursively converts a Python object (often a Pydantic model or a dataclass instance used for configuration) into a dictionary that is safe for JSON serialization. It handles nested objects, enums, and basic types.
|
||||
* Parameters:
|
||||
* `obj (Any)`: The object to be serialized.
|
||||
* `ignore_default_value (bool)`: If `True`, fields whose current value is the same as their default value (if applicable, e.g., for Pydantic models) might be omitted from the resulting dictionary. Default: `False`.
|
||||
* Returns: `Dict` - A JSON-serializable dictionary representation of the object.
|
||||
* 9.2. `from_serializable_dict(data: Any) -> Any`
|
||||
* Purpose: Recursively reconstructs Python objects from a dictionary representation (typically one created by `to_serializable_dict`). It attempts to instantiate classes based on a "type" key in the dictionary if present.
|
||||
* Parameters:
|
||||
* `data (Any)`: The dictionary (or basic type) to be deserialized.
|
||||
* Returns: `Any` - The reconstructed Python object or the original data if no special deserialization rule applies.
|
||||
* 9.3. `is_empty_value(value: Any) -> bool`
|
||||
* Purpose: Checks if a given value is considered "empty" (e.g., `None`, an empty string, an empty list, an empty dictionary).
|
||||
* Returns: `bool` - `True` if the value is empty, `False` otherwise.
|
||||
|
||||
## 10. Enumerations (Key Enums used in Core)
|
||||
* 10.1. `CacheMode` (from `crawl4ai.cache_context`, defined in `crawl4ai.async_configs` as per provided code)
|
||||
* Purpose: Defines the caching behavior for crawl operations.
|
||||
* Members:
|
||||
* `ENABLE`: (Value: "enable") Normal caching behavior; read from cache if available, write to cache after fetching.
|
||||
* `DISABLE`: (Value: "disable") No caching at all; always fetch fresh content and do not write to cache.
|
||||
* `READ_ONLY`: (Value: "read_only") Only read from the cache; do not write new or updated content to the cache.
|
||||
* `WRITE_ONLY`: (Value: "write_only") Only write to the cache after fetching; do not read from the cache.
|
||||
* `BYPASS`: (Value: "bypass") Skip the cache entirely for this specific operation; fetch fresh content and do not write to cache. This is often the default for individual `CrawlerRunConfig` instances.
|
||||
* 10.2. `DisplayMode` (from `crawl4ai.models`, used by `CrawlerMonitor`)
|
||||
* Purpose: Defines the display mode for the `CrawlerMonitor`.
|
||||
* Members:
|
||||
* `DETAILED`: Shows detailed information for each task.
|
||||
* `AGGREGATED`: Shows summary statistics and overall progress.
|
||||
* 10.3. `CrawlStatus` (from `crawl4ai.models`, used by `CrawlStats`)
|
||||
* Purpose: Represents the status of a crawl task.
|
||||
* Members:
|
||||
* `QUEUED`: Task is waiting to be processed.
|
||||
* `IN_PROGRESS`: Task is currently being processed.
|
||||
* `COMPLETED`: Task finished successfully.
|
||||
* `FAILED`: Task failed.
|
||||
|
||||
## 11. Versioning
|
||||
* 11.1. Accessing Library Version:
|
||||
* The current version of the `crawl4ai` library can be accessed programmatically via the `__version__` attribute of the top-level `crawl4ai` package.
|
||||
* Example:
|
||||
```python
|
||||
from crawl4ai import __version__ as crawl4ai_version
|
||||
print(f"Crawl4AI Version: {crawl4ai_version}")
|
||||
# Expected output based on provided code: Crawl4AI Version: 0.6.3
|
||||
```
|
||||
|
||||
## 12. Basic Usage Examples
|
||||
|
||||
* 12.1. Minimal Crawl:
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="http://example.com")
|
||||
if result.success:
|
||||
print("Markdown (first 300 chars):")
|
||||
print(result.markdown.raw_markdown[:300]) # Accessing raw_markdown
|
||||
else:
|
||||
print(f"Error: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
* 12.2. Crawl with Basic Configuration:
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
|
||||
async def main():
|
||||
browser_cfg = BrowserConfig(headless=True, browser_type="firefox")
|
||||
run_cfg = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
word_count_threshold=50
|
||||
)
|
||||
async with AsyncWebCrawler(config=browser_cfg) as crawler:
|
||||
result = await crawler.arun(url="http://example.com", config=run_cfg)
|
||||
if result.success:
|
||||
print(f"Status Code: {result.status_code}")
|
||||
print(f"Cleaned HTML length: {len(result.cleaned_html)}")
|
||||
else:
|
||||
print(f"Error: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
* 12.3. Accessing Links and Images from Result:
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
|
||||
async def main():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="http://example.com")
|
||||
if result.success:
|
||||
print(f"Found {len(result.links.internal)} internal links.")
|
||||
if result.links.internal:
|
||||
print(f"First internal link: {result.links.internal[0].href}")
|
||||
|
||||
print(f"Found {len(result.media.images)} images.")
|
||||
if result.media.images:
|
||||
print(f"First image src: {result.media.images[0].src}")
|
||||
else:
|
||||
print(f"Error: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
```
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,813 +0,0 @@
|
||||
Okay, I have read and processed the information. I will now generate the detailed "Foundational Memory" document for the `deep_crawling` component of `crawl4ai`, based on the outline from the previous step and the provided code snippets.
|
||||
|
||||
```markdown
|
||||
# Detailed Foundational Memory for crawl4ai - deep_crawling Component
|
||||
|
||||
**Target Document Type:** memory
|
||||
**Target Output Filename Suggestion:** `llm_memory_deep_crawling.md`
|
||||
**Library Version Context:** 0.6.3 (from `crawl4ai/__version__.py`)
|
||||
**Outline Generation Date:** 2024-05-24
|
||||
---
|
||||
|
||||
## 1. Introduction to Deep Crawling
|
||||
|
||||
* 1.1. Purpose: The `deep_crawling` component provides functionalities for recursively crawling web pages starting from an initial URL. It includes strategies for different traversal orders (BFS, DFS, Best-First), mechanisms for filtering which URLs to visit, and methods for scoring URLs to prioritize crawling.
|
||||
* 1.2. Core Concepts:
|
||||
* 1.2.1. Definition of Deep Crawling in Crawl4ai context: The process of discovering and fetching multiple web pages by following links from an initial set of URLs, adhering to specified depth, page limits, and filtering/scoring rules.
|
||||
* 1.2.2. Key Abstractions:
|
||||
* `DeepCrawlStrategy`: Defines the algorithm for traversing linked web pages (e.g., BFS, DFS).
|
||||
* `URLFilter`: Determines whether a discovered URL should be considered for crawling.
|
||||
* `URLScorer`: Assigns a score to URLs to influence crawling priority, especially in strategies like Best-First.
|
||||
|
||||
## 2. `DeepCrawlStrategy` Interface and Implementations
|
||||
|
||||
* **2.1. `DeepCrawlStrategy` (Abstract Base Class)**
|
||||
* Source: `crawl4ai/deep_crawling/base_strategy.py`
|
||||
* 2.1.1. Purpose: Defines the abstract base class for all deep crawling strategies, outlining the core methods required for traversal logic, resource management, URL validation, and link discovery.
|
||||
* 2.1.2. Key Abstract Methods:
|
||||
* `async def _arun_batch(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig) -> List[CrawlResult]`:
|
||||
* Description: Core logic for batch (non-streaming) deep crawling. Processes URLs level by level (or according to strategy) and returns all results once the crawl is complete or limits are met.
|
||||
* `async def _arun_stream(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig) -> AsyncGenerator[CrawlResult, None]`:
|
||||
* Description: Core logic for streaming deep crawling. Processes URLs and yields `CrawlResult` objects as they become available.
|
||||
* `async def shutdown(self) -> None`:
|
||||
* Description: Cleans up any resources used by the deep crawl strategy, such as signaling cancellation events.
|
||||
* `async def can_process_url(self, url: str, depth: int) -> bool`:
|
||||
* Description: Validates a given URL and current depth against configured filters and limits to decide if it should be processed.
|
||||
* `async def link_discovery(self, result: CrawlResult, source_url: str, current_depth: int, visited: Set[str], next_level: List[tuple], depths: Dict[str, int]) -> None`:
|
||||
* Description: Extracts links from a `CrawlResult`, validates them using `can_process_url`, optionally scores them, and appends valid URLs (and their parent references) to the `next_level` list. Updates the `depths` dictionary for newly discovered URLs.
|
||||
* 2.1.3. Key Concrete Methods:
|
||||
* `async def arun(self, start_url: str, crawler: AsyncWebCrawler, config: Optional[CrawlerRunConfig] = None) -> RunManyReturn`:
|
||||
* Description: Main entry point for initiating a deep crawl. It checks if a `CrawlerRunConfig` is provided and then delegates to either `_arun_stream` or `_arun_batch` based on the `config.stream` flag.
|
||||
* `def __call__(self, start_url: str, crawler: AsyncWebCrawler, config: CrawlerRunConfig)`:
|
||||
* Description: Makes the strategy instance callable, directly invoking the `arun` method.
|
||||
* 2.1.4. Attributes:
|
||||
* `_cancel_event (asyncio.Event)`: Event to signal cancellation of the crawl.
|
||||
* `_pages_crawled (int)`: Counter for the number of pages successfully crawled.
|
||||
|
||||
* **2.2. `BFSDeepCrawlStrategy`**
|
||||
* Source: `crawl4ai/deep_crawling/bfs_strategy.py`
|
||||
* 2.2.1. Purpose: Implements a Breadth-First Search (BFS) deep crawling strategy, exploring all URLs at the current depth level before moving to the next.
|
||||
* 2.2.2. Inheritance: `DeepCrawlStrategy`
|
||||
* 2.2.3. Initialization (`__init__`)
|
||||
* 2.2.3.1. Signature:
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
max_depth: int,
|
||||
filter_chain: FilterChain = FilterChain(),
|
||||
url_scorer: Optional[URLScorer] = None,
|
||||
include_external: bool = False,
|
||||
score_threshold: float = -float('inf'),
|
||||
max_pages: int = float('inf'),
|
||||
logger: Optional[logging.Logger] = None,
|
||||
):
|
||||
```
|
||||
* 2.2.3.2. Parameters:
|
||||
* `max_depth (int)`: Maximum depth to crawl relative to the `start_url`.
|
||||
* `filter_chain (FilterChain, default: FilterChain())`: A `FilterChain` instance to apply to discovered URLs.
|
||||
* `url_scorer (Optional[URLScorer], default: None)`: An optional `URLScorer` to score URLs. If provided, URLs below `score_threshold` are skipped, and for crawls exceeding `max_pages`, higher-scored URLs are prioritized.
|
||||
* `include_external (bool, default: False)`: If `True`, allows crawling of URLs from external domains.
|
||||
* `score_threshold (float, default: -float('inf'))`: Minimum score (if `url_scorer` is used) for a URL to be processed.
|
||||
* `max_pages (int, default: float('inf'))`: Maximum total number of pages to crawl.
|
||||
* `logger (Optional[logging.Logger], default: None)`: An optional logger instance. If `None`, a default logger is created.
|
||||
* 2.2.4. Key Implemented Methods:
|
||||
* `_arun_batch(...)`: Implements BFS traversal by processing URLs level by level. It collects all results from a level before discovering links for the next level. All results are returned as a list upon completion.
|
||||
* `_arun_stream(...)`: Implements BFS traversal, yielding `CrawlResult` objects as soon as they are processed within a level. Link discovery for the next level happens after all URLs in the current level are processed and their results yielded.
|
||||
* `can_process_url(...)`: Validates URL format, applies the `filter_chain`, and checks depth limits. For the start URL (depth 0), filtering is bypassed.
|
||||
* `link_discovery(...)`: Extracts internal (and optionally external) links, normalizes them, checks against `visited` set and `can_process_url`. If a `url_scorer` is present and `max_pages` limit is a concern, it scores and sorts valid links, selecting the top ones within `remaining_capacity`.
|
||||
* `shutdown(...)`: Sets an internal `_cancel_event` to signal graceful termination and records the end time in `stats`.
|
||||
* 2.2.5. Key Attributes/Properties:
|
||||
* `stats (TraversalStats)`: [Read-only] - Instance of `TraversalStats` tracking the progress and statistics of the crawl.
|
||||
* `max_depth (int)`: Maximum crawl depth.
|
||||
* `filter_chain (FilterChain)`: The filter chain used.
|
||||
* `url_scorer (Optional[URLScorer])`: The URL scorer used.
|
||||
* `include_external (bool)`: Flag for including external URLs.
|
||||
* `score_threshold (float)`: URL score threshold.
|
||||
* `max_pages (int)`: Maximum pages to crawl.
|
||||
|
||||
* **2.3. `DFSDeepCrawlStrategy`**
|
||||
* Source: `crawl4ai/deep_crawling/dfs_strategy.py`
|
||||
* 2.3.1. Purpose: Implements a Depth-First Search (DFS) deep crawling strategy, exploring as far as possible along each branch before backtracking.
|
||||
* 2.3.2. Inheritance: `BFSDeepCrawlStrategy` (Note: Leverages much of the `BFSDeepCrawlStrategy`'s infrastructure but overrides traversal logic to use a stack.)
|
||||
* 2.3.3. Initialization (`__init__`)
|
||||
* 2.3.3.1. Signature: (Same as `BFSDeepCrawlStrategy`)
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
max_depth: int,
|
||||
filter_chain: FilterChain = FilterChain(),
|
||||
url_scorer: Optional[URLScorer] = None,
|
||||
include_external: bool = False,
|
||||
score_threshold: float = -float('inf'),
|
||||
max_pages: int = float('inf'),
|
||||
logger: Optional[logging.Logger] = None,
|
||||
):
|
||||
```
|
||||
* 2.3.3.2. Parameters: Same as `BFSDeepCrawlStrategy`.
|
||||
* 2.3.4. Key Overridden/Implemented Methods:
|
||||
* `_arun_batch(...)`: Implements DFS traversal using a LIFO stack. Processes one URL at a time, discovers its links, and adds them to the stack (typically in reverse order of discovery to maintain a natural DFS path). Collects all results in a list.
|
||||
* `_arun_stream(...)`: Implements DFS traversal using a LIFO stack, yielding `CrawlResult` for each processed URL as it becomes available. Discovered links are added to the stack for subsequent processing.
|
||||
|
||||
* **2.4. `BestFirstCrawlingStrategy`**
|
||||
* Source: `crawl4ai/deep_crawling/bff_strategy.py`
|
||||
* 2.4.1. Purpose: Implements a Best-First Search deep crawling strategy, prioritizing URLs based on scores assigned by a `URLScorer`. It uses a priority queue to manage URLs to visit.
|
||||
* 2.4.2. Inheritance: `DeepCrawlStrategy`
|
||||
* 2.4.3. Initialization (`__init__`)
|
||||
* 2.4.3.1. Signature:
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
max_depth: int,
|
||||
filter_chain: FilterChain = FilterChain(),
|
||||
url_scorer: Optional[URLScorer] = None,
|
||||
include_external: bool = False,
|
||||
max_pages: int = float('inf'),
|
||||
logger: Optional[logging.Logger] = None,
|
||||
):
|
||||
```
|
||||
* 2.4.3.2. Parameters:
|
||||
* `max_depth (int)`: Maximum depth to crawl.
|
||||
* `filter_chain (FilterChain`, default: `FilterChain()`)`: Chain of filters to apply.
|
||||
* `url_scorer (Optional[URLScorer]`, default: `None`)`: Scorer to rank URLs. Crucial for this strategy; if not provided, URLs might effectively be processed in FIFO order (score 0).
|
||||
* `include_external (bool`, default: `False`)`: Whether to include external links.
|
||||
* `max_pages (int`, default: `float('inf')`)`: Maximum number of pages to crawl.
|
||||
* `logger (Optional[logging.Logger]`, default: `None`)`: Logger instance.
|
||||
* 2.4.4. Key Implemented Methods:
|
||||
* `_arun_batch(...)`: Aggregates results from `_arun_best_first` into a list.
|
||||
* `_arun_stream(...)`: Yields results from `_arun_best_first` as they are generated.
|
||||
* `_arun_best_first(...)`: Core logic for best-first traversal. Uses an `asyncio.PriorityQueue` where items are `(score, depth, url, parent_url)`. URLs are processed in batches (default size 10) from the priority queue. Discovered links are scored and added to the queue.
|
||||
* 2.4.5. Key Attributes/Properties:
|
||||
* `stats (TraversalStats)`: [Read-only] - Traversal statistics object.
|
||||
* `BATCH_SIZE (int)`: [Class constant, default: 10] - Number of URLs to process concurrently from the priority queue.
|
||||
|
||||
## 3. URL Filtering Mechanisms
|
||||
|
||||
* **3.1. `URLFilter` (Abstract Base Class)**
|
||||
* Source: `crawl4ai/deep_crawling/filters.py`
|
||||
* 3.1.1. Purpose: Defines the abstract base class for all URL filters, providing a common interface for deciding whether a URL should be processed.
|
||||
* 3.1.2. Key Abstract Methods:
|
||||
* `apply(self, url: str) -> bool`:
|
||||
* Description: Abstract method that must be implemented by subclasses. It takes a URL string and returns `True` if the URL passes the filter (should be processed), and `False` otherwise.
|
||||
* 3.1.3. Key Attributes/Properties:
|
||||
* `name (str)`: [Read-only] - The name of the filter, typically the class name.
|
||||
* `stats (FilterStats)`: [Read-only] - An instance of `FilterStats` to track how many URLs were processed, passed, and rejected by this filter.
|
||||
* `logger (logging.Logger)`: [Read-only] - A logger instance specific to this filter, initialized lazily.
|
||||
* 3.1.4. Key Concrete Methods:
|
||||
* `_update_stats(self, passed: bool) -> None`: Updates the `stats` object (total, passed, rejected counts).
|
||||
|
||||
* **3.2. `FilterChain`**
|
||||
* Source: `crawl4ai/deep_crawling/filters.py`
|
||||
* 3.2.1. Purpose: Manages a sequence of `URLFilter` instances. A URL must pass all filters in the chain to be considered valid.
|
||||
* 3.2.2. Initialization (`__init__`)
|
||||
* 3.2.2.1. Signature:
|
||||
```python
|
||||
def __init__(self, filters: List[URLFilter] = None):
|
||||
```
|
||||
* 3.2.2.2. Parameters:
|
||||
* `filters (List[URLFilter]`, default: `None`)`: An optional list of `URLFilter` instances to initialize the chain with. If `None`, an empty chain is created.
|
||||
* 3.2.3. Key Public Methods:
|
||||
* `add_filter(self, filter_: URLFilter) -> FilterChain`:
|
||||
* Description: Adds a new `URLFilter` instance to the end of the chain.
|
||||
* Returns: `(FilterChain)` - The `FilterChain` instance itself, allowing for method chaining.
|
||||
* `async def apply(self, url: str) -> bool`:
|
||||
* Description: Applies each filter in the chain to the given URL. If any filter returns `False` (rejects the URL), this method immediately returns `False`. If all filters pass, it returns `True`. Handles both synchronous and asynchronous `apply` methods of individual filters.
|
||||
* Returns: `(bool)` - `True` if the URL passes all filters, `False` otherwise.
|
||||
* 3.2.4. Key Attributes/Properties:
|
||||
* `filters (Tuple[URLFilter, ...])`: [Read-only] - An immutable tuple containing the `URLFilter` instances in the chain.
|
||||
* `stats (FilterStats)`: [Read-only] - An instance of `FilterStats` tracking the aggregated statistics for the entire chain (total URLs processed, passed, and rejected by the chain as a whole).
|
||||
|
||||
* **3.3. `URLPatternFilter`**
|
||||
* Source: `crawl4ai/deep_crawling/filters.py`
|
||||
* 3.3.1. Purpose: Filters URLs based on whether they match a list of specified string patterns. Supports glob-style wildcards and regular expressions.
|
||||
* 3.3.2. Inheritance: `URLFilter`
|
||||
* 3.3.3. Initialization (`__init__`)
|
||||
* 3.3.3.1. Signature:
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
patterns: Union[str, Pattern, List[Union[str, Pattern]]],
|
||||
use_glob: bool = True, # Deprecated, glob is always used for strings if not regex
|
||||
reverse: bool = False,
|
||||
):
|
||||
```
|
||||
* 3.3.3.2. Parameters:
|
||||
* `patterns (Union[str, Pattern, List[Union[str, Pattern]]])`: A single pattern string/compiled regex, or a list of such patterns. String patterns are treated as glob patterns by default unless they are identifiable as regex (e.g., start with `^`, end with `$`, contain `\d`).
|
||||
* `use_glob (bool`, default: `True`)`: [Deprecated] This parameter's functionality is now implicitly handled by pattern detection.
|
||||
* `reverse (bool`, default: `False`)`: If `True`, the filter rejects URLs that match any of the patterns. If `False` (default), it accepts URLs that match any pattern and rejects those that don't match any.
|
||||
* 3.3.4. Key Implemented Methods:
|
||||
* `apply(self, url: str) -> bool`:
|
||||
* Description: Checks if the URL matches any of the configured patterns. Simple suffix/prefix/domain patterns are checked first for performance. For more complex patterns, it uses `fnmatch.translate` (for glob-like strings) or compiled regex objects. The outcome is affected by the `reverse` flag.
|
||||
* 3.3.5. Internal Categorization:
|
||||
* `PATTERN_TYPES`: A dictionary mapping pattern types (SUFFIX, PREFIX, DOMAIN, PATH, REGEX) to integer constants.
|
||||
* `_simple_suffixes (Set[str])`: Stores simple suffix patterns (e.g., `.html`).
|
||||
* `_simple_prefixes (Set[str])`: Stores simple prefix patterns (e.g., `/blog/`).
|
||||
* `_domain_patterns (List[Pattern])`: Stores compiled regex for domain-specific patterns (e.g., `*.example.com`).
|
||||
* `_path_patterns (List[Pattern])`: Stores compiled regex for more general path patterns.
|
||||
|
||||
* **3.4. `ContentTypeFilter`**
|
||||
* Source: `crawl4ai/deep_crawling/filters.py`
|
||||
* 3.4.1. Purpose: Filters URLs based on their expected content type, primarily by inferring it from the file extension in the URL.
|
||||
* 3.4.2. Inheritance: `URLFilter`
|
||||
* 3.4.3. Initialization (`__init__`)
|
||||
* 3.4.3.1. Signature:
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
allowed_types: Union[str, List[str]],
|
||||
check_extension: bool = True,
|
||||
ext_map: Dict[str, str] = _MIME_MAP, # _MIME_MAP is internal
|
||||
):
|
||||
```
|
||||
* 3.4.3.2. Parameters:
|
||||
* `allowed_types (Union[str, List[str]])`: A single MIME type string (e.g., "text/html") or a list of allowed MIME types. Can also be partial types like "image/" to allow all image types.
|
||||
* `check_extension (bool`, default: `True`)`: If `True` (default), the filter attempts to determine the content type by looking at the URL's file extension. If `False`, all URLs pass this filter (unless `allowed_types` is empty).
|
||||
* `ext_map (Dict[str, str]`, default: `ContentTypeFilter._MIME_MAP`)`: A dictionary mapping file extensions to their corresponding MIME types. A comprehensive default map is provided.
|
||||
* 3.4.4. Key Implemented Methods:
|
||||
* `apply(self, url: str) -> bool`:
|
||||
* Description: Extracts the file extension from the URL. If `check_extension` is `True` and an extension is found, it checks if the inferred MIME type (or the extension itself if MIME type is unknown) is among the `allowed_types`. If no extension is found, it typically allows the URL (assuming it might be an HTML page or similar).
|
||||
* 3.4.5. Static Methods:
|
||||
* `_extract_extension(url: str) -> str`: [Cached] Extracts the file extension from a URL path, handling query parameters and fragments.
|
||||
* 3.4.6. Class Variables:
|
||||
* `_MIME_MAP (Dict[str, str])`: A class-level dictionary mapping common file extensions to MIME types.
|
||||
|
||||
* **3.5. `DomainFilter`**
|
||||
* Source: `crawl4ai/deep_crawling/filters.py`
|
||||
* 3.5.1. Purpose: Filters URLs based on a whitelist of allowed domains or a blacklist of blocked domains. Supports subdomain matching.
|
||||
* 3.5.2. Inheritance: `URLFilter`
|
||||
* 3.5.3. Initialization (`__init__`)
|
||||
* 3.5.3.1. Signature:
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
allowed_domains: Union[str, List[str]] = None,
|
||||
blocked_domains: Union[str, List[str]] = None,
|
||||
):
|
||||
```
|
||||
* 3.5.3.2. Parameters:
|
||||
* `allowed_domains (Union[str, List[str]]`, default: `None`)`: A single domain string or a list of domain strings. If provided, only URLs whose domain (or a subdomain thereof) is in this list will pass.
|
||||
* `blocked_domains (Union[str, List[str]]`, default: `None`)`: A single domain string or a list of domain strings. URLs whose domain (or a subdomain thereof) is in this list will be rejected.
|
||||
* 3.5.4. Key Implemented Methods:
|
||||
* `apply(self, url: str) -> bool`:
|
||||
* Description: Extracts the domain from the URL. First, checks if the domain is in `_blocked_domains` (rejects if true). Then, if `_allowed_domains` is specified, checks if the domain is in that list (accepts if true). If `_allowed_domains` is not specified and the URL was not blocked, it passes.
|
||||
* 3.5.5. Static Methods:
|
||||
* `_normalize_domains(domains: Union[str, List[str]]) -> Set[str]`: Converts input domains to a set of lowercase strings.
|
||||
* `_is_subdomain(domain: str, parent_domain: str) -> bool`: Checks if `domain` is a subdomain of (or equal to) `parent_domain`.
|
||||
* `_extract_domain(url: str) -> str`: [Cached] Extracts the domain name from a URL.
|
||||
|
||||
* **3.6. `ContentRelevanceFilter`**
|
||||
* Source: `crawl4ai/deep_crawling/filters.py`
|
||||
* 3.6.1. Purpose: Filters URLs by fetching their `<head>` section, extracting text content (title, meta tags), and scoring its relevance against a given query using the BM25 algorithm.
|
||||
* 3.6.2. Inheritance: `URLFilter`
|
||||
* 3.6.3. Initialization (`__init__`)
|
||||
* 3.6.3.1. Signature:
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
query: str,
|
||||
threshold: float,
|
||||
k1: float = 1.2,
|
||||
b: float = 0.75,
|
||||
avgdl: int = 1000,
|
||||
):
|
||||
```
|
||||
* 3.6.3.2. Parameters:
|
||||
* `query (str)`: The query string to assess relevance against.
|
||||
* `threshold (float)`: The minimum BM25 score required for the URL to be considered relevant and pass the filter.
|
||||
* `k1 (float`, default: `1.2`)`: BM25 k1 parameter (term frequency saturation).
|
||||
* `b (float`, default: `0.75`)`: BM25 b parameter (length normalization).
|
||||
* `avgdl (int`, default: `1000`)`: Assumed average document length for BM25 calculations (typically based on the head content).
|
||||
* 3.6.4. Key Implemented Methods:
|
||||
* `async def apply(self, url: str) -> bool`:
|
||||
* Description: Asynchronously fetches the HTML `<head>` content of the URL using `HeadPeeker.peek_html`. Extracts title and meta description/keywords. Calculates the BM25 score of this combined text against the `query`. Returns `True` if the score is >= `threshold`.
|
||||
* 3.6.5. Helper Methods:
|
||||
* `_build_document(self, fields: Dict) -> str`: Constructs a weighted document string from title and meta tags.
|
||||
* `_tokenize(self, text: str) -> List[str]`: Simple whitespace tokenizer.
|
||||
* `_bm25(self, document: str) -> float`: Calculates the BM25 score.
|
||||
|
||||
* **3.7. `SEOFilter`**
|
||||
* Source: `crawl4ai/deep_crawling/filters.py`
|
||||
* 3.7.1. Purpose: Filters URLs by performing a quantitative SEO quality assessment based on the content of their `<head>` section (e.g., title length, meta description presence, canonical tags, robots meta tags, schema.org markup).
|
||||
* 3.7.2. Inheritance: `URLFilter`
|
||||
* 3.7.3. Initialization (`__init__`)
|
||||
* 3.7.3.1. Signature:
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
threshold: float = 0.65,
|
||||
keywords: List[str] = None,
|
||||
weights: Dict[str, float] = None,
|
||||
):
|
||||
```
|
||||
* 3.7.3.2. Parameters:
|
||||
* `threshold (float`, default: `0.65`)`: The minimum aggregated SEO score (typically 0.0 to 1.0 range, though individual factor weights can exceed 1) required for the URL to pass.
|
||||
* `keywords (List[str]`, default: `None`)`: A list of keywords to check for presence in the title.
|
||||
* `weights (Dict[str, float]`, default: `None`)`: A dictionary to override default weights for various SEO factors (e.g., `{"title_length": 0.2, "canonical": 0.15}`).
|
||||
* 3.7.4. Key Implemented Methods:
|
||||
* `async def apply(self, url: str) -> bool`:
|
||||
* Description: Asynchronously fetches the HTML `<head>` content. Calculates scores for individual SEO factors (title length, keyword presence, meta description, canonical tag, robots meta tag, schema.org presence, URL quality). Aggregates these scores using the defined `weights`. Returns `True` if the total score is >= `threshold`.
|
||||
* 3.7.5. Helper Methods (Scoring Factors):
|
||||
* `_score_title_length(self, title: str) -> float`
|
||||
* `_score_keyword_presence(self, text: str) -> float`
|
||||
* `_score_meta_description(self, desc: str) -> float`
|
||||
* `_score_canonical(self, canonical: str, original: str) -> float`
|
||||
* `_score_schema_org(self, html: str) -> float`
|
||||
* `_score_url_quality(self, parsed_url) -> float`
|
||||
* 3.7.6. Class Variables:
|
||||
* `DEFAULT_WEIGHTS (Dict[str, float])`: Default weights for each SEO factor.
|
||||
|
||||
* **3.8. `FilterStats` Data Class**
|
||||
* Source: `crawl4ai/deep_crawling/filters.py`
|
||||
* 3.8.1. Purpose: A data class to track statistics for URL filtering operations, including total URLs processed, passed, and rejected.
|
||||
* 3.8.2. Fields:
|
||||
* `_counters (array.array)`: An array of unsigned integers storing counts for `[total, passed, rejected]`.
|
||||
* 3.8.3. Properties:
|
||||
* `total_urls (int)`: Returns the total number of URLs processed.
|
||||
* `passed_urls (int)`: Returns the number of URLs that passed the filter.
|
||||
* `rejected_urls (int)`: Returns the number of URLs that were rejected.
|
||||
|
||||
## 4. URL Scoring Mechanisms
|
||||
|
||||
* **4.1. `URLScorer` (Abstract Base Class)**
|
||||
* Source: `crawl4ai/deep_crawling/scorers.py`
|
||||
* 4.1.1. Purpose: Defines the abstract base class for all URL scorers. Scorers assign a numerical value to URLs, which can be used to prioritize crawling.
|
||||
* 4.1.2. Key Abstract Methods:
|
||||
* `_calculate_score(self, url: str) -> float`:
|
||||
* Description: Abstract method to be implemented by subclasses. It takes a URL string and returns a raw numerical score.
|
||||
* 4.1.3. Key Concrete Methods:
|
||||
* `score(self, url: str) -> float`:
|
||||
* Description: Calculates the final score for a URL by calling `_calculate_score` and multiplying the result by the scorer's `weight`. It also updates the internal `ScoringStats`.
|
||||
* Returns: `(float)` - The weighted score.
|
||||
* 4.1.4. Key Attributes/Properties:
|
||||
* `weight (ctypes.c_float)`: [Read-write] - The weight assigned to this scorer. The raw score calculated by `_calculate_score` will be multiplied by this weight. Default is 1.0. Stored as `ctypes.c_float` for memory efficiency.
|
||||
* `stats (ScoringStats)`: [Read-only] - An instance of `ScoringStats` that tracks statistics for this scorer (number of URLs scored, total score, min/max scores).
|
||||
|
||||
* **4.2. `KeywordRelevanceScorer`**
|
||||
* Source: `crawl4ai/deep_crawling/scorers.py`
|
||||
* 4.2.1. Purpose: Scores URLs based on the presence and frequency of specified keywords within the URL string itself.
|
||||
* 4.2.2. Inheritance: `URLScorer`
|
||||
* 4.2.3. Initialization (`__init__`)
|
||||
* 4.2.3.1. Signature:
|
||||
```python
|
||||
def __init__(self, keywords: List[str], weight: float = 1.0, case_sensitive: bool = False):
|
||||
```
|
||||
* 4.2.3.2. Parameters:
|
||||
* `keywords (List[str])`: A list of keyword strings to search for in the URL.
|
||||
* `weight (float`, default: `1.0`)`: The weight to apply to the calculated score.
|
||||
* `case_sensitive (bool`, default: `False`)`: If `True`, keyword matching is case-sensitive. Otherwise, both the URL and keywords are converted to lowercase for matching.
|
||||
* 4.2.4. Key Implemented Methods:
|
||||
* `_calculate_score(self, url: str) -> float`:
|
||||
* Description: Counts how many of the provided `keywords` are present in the `url`. The score is the ratio of matched keywords to the total number of keywords (0.0 to 1.0).
|
||||
* 4.2.5. Helper Methods:
|
||||
* `_url_bytes(self, url: str) -> bytes`: [Cached] Converts URL to bytes, lowercasing if not case-sensitive.
|
||||
|
||||
* **4.3. `PathDepthScorer`**
|
||||
* Source: `crawl4ai/deep_crawling/scorers.py`
|
||||
* 4.3.1. Purpose: Scores URLs based on their path depth (number of segments in the URL path). It favors URLs closer to an `optimal_depth`.
|
||||
* 4.3.2. Inheritance: `URLScorer`
|
||||
* 4.3.3. Initialization (`__init__`)
|
||||
* 4.3.3.1. Signature:
|
||||
```python
|
||||
def __init__(self, optimal_depth: int = 3, weight: float = 1.0):
|
||||
```
|
||||
* 4.3.3.2. Parameters:
|
||||
* `optimal_depth (int`, default: `3`)`: The path depth considered ideal. URLs at this depth get the highest score.
|
||||
* `weight (float`, default: `1.0`)`: The weight to apply to the calculated score.
|
||||
* 4.3.4. Key Implemented Methods:
|
||||
* `_calculate_score(self, url: str) -> float`:
|
||||
* Description: Calculates the path depth of the URL. The score is `1.0 / (1.0 + abs(depth - optimal_depth))`, meaning URLs at `optimal_depth` score 1.0, and scores decrease as depth deviates. Uses a lookup table for common small differences for speed.
|
||||
* 4.3.5. Static Methods:
|
||||
* `_quick_depth(path: str) -> int`: [Cached] Efficiently calculates path depth without full URL parsing.
|
||||
|
||||
* **4.4. `ContentTypeScorer`**
|
||||
* Source: `crawl4ai/deep_crawling/scorers.py`
|
||||
* 4.4.1. Purpose: Scores URLs based on their inferred content type, typically derived from the file extension.
|
||||
* 4.4.2. Inheritance: `URLScorer`
|
||||
* 4.4.3. Initialization (`__init__`)
|
||||
* 4.4.3.1. Signature:
|
||||
```python
|
||||
def __init__(self, type_weights: Dict[str, float], weight: float = 1.0):
|
||||
```
|
||||
* 4.4.3.2. Parameters:
|
||||
* `type_weights (Dict[str, float])`: A dictionary mapping file extensions (e.g., "html", "pdf") or MIME type patterns (e.g., "text/html", "image/") to scores. Patterns ending with '$' are treated as exact extension matches.
|
||||
* `weight (float`, default: `1.0`)`: The weight to apply to the calculated score.
|
||||
* 4.4.4. Key Implemented Methods:
|
||||
* `_calculate_score(self, url: str) -> float`:
|
||||
* Description: Extracts the file extension from the URL. Looks up the score in `type_weights` first by exact extension match (if pattern ends with '$'), then by general extension. If no direct match, it might try matching broader MIME type categories if defined in `type_weights`. Returns 0.0 if no match found.
|
||||
* 4.4.5. Static Methods:
|
||||
* `_quick_extension(url: str) -> str`: [Cached] Efficiently extracts file extension.
|
||||
|
||||
* **4.5. `FreshnessScorer`**
|
||||
* Source: `crawl4ai/deep_crawling/scorers.py`
|
||||
* 4.5.1. Purpose: Scores URLs based on dates found within the URL string, giving higher scores to more recent dates.
|
||||
* 4.5.2. Inheritance: `URLScorer`
|
||||
* 4.5.3. Initialization (`__init__`)
|
||||
* 4.5.3.1. Signature:
|
||||
```python
|
||||
        def __init__(self, weight: float = 1.0, current_year: int = datetime.date.today().year): # Actual default is dynamic
|
||||
```
|
||||
* 4.5.3.2. Parameters:
|
||||
* `weight (float`, default: `1.0`)`: The weight to apply to the calculated score.
|
||||
* `current_year (int`, default: `datetime.date.today().year`)`: The reference year to calculate freshness against.
|
||||
* 4.5.4. Key Implemented Methods:
|
||||
* `_calculate_score(self, url: str) -> float`:
|
||||
* Description: Uses a regex to find year patterns (YYYY) in the URL. If multiple years are found, it uses the latest valid year. The score is higher for years closer to `current_year`, using a predefined lookup for small differences or a decay function for larger differences. If no year is found, a default score (0.5) is returned.
|
||||
* 4.5.5. Helper Methods:
|
||||
* `_extract_year(self, url: str) -> Optional[int]`: [Cached] Extracts the most recent valid year from the URL.
|
||||
|
||||
* **4.6. `DomainAuthorityScorer`**
|
||||
* Source: `crawl4ai/deep_crawling/scorers.py`
|
||||
* 4.6.1. Purpose: Scores URLs based on a predefined list of domain authority weights. This allows prioritizing or de-prioritizing URLs from specific domains.
|
||||
* 4.6.2. Inheritance: `URLScorer`
|
||||
* 4.6.3. Initialization (`__init__`)
|
||||
* 4.6.3.1. Signature:
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
domain_weights: Dict[str, float],
|
||||
default_weight: float = 0.5,
|
||||
weight: float = 1.0,
|
||||
):
|
||||
```
|
||||
* 4.6.3.2. Parameters:
|
||||
* `domain_weights (Dict[str, float])`: A dictionary mapping domain names (e.g., "example.com") to their authority scores (typically between 0.0 and 1.0).
|
||||
* `default_weight (float`, default: `0.5`)`: The score to assign to URLs whose domain is not found in `domain_weights`.
|
||||
* `weight (float`, default: `1.0`)`: The overall weight to apply to the calculated score.
|
||||
* 4.6.4. Key Implemented Methods:
|
||||
* `_calculate_score(self, url: str) -> float`:
|
||||
* Description: Extracts the domain from the URL. If the domain is in `_domain_weights`, its corresponding score is returned. Otherwise, `_default_weight` is returned. Prioritizes top domains for faster lookup.
|
||||
* 4.6.5. Static Methods:
|
||||
* `_extract_domain(url: str) -> str`: [Cached] Efficiently extracts the domain from a URL.
|
||||
|
||||
* **4.7. `CompositeScorer`**
|
||||
* Source: `crawl4ai/deep_crawling/scorers.py`
|
||||
* 4.7.1. Purpose: Combines the scores from multiple `URLScorer` instances. Each constituent scorer contributes its weighted score to the final composite score.
|
||||
* 4.7.2. Inheritance: `URLScorer`
|
||||
* 4.7.3. Initialization (`__init__`)
|
||||
* 4.7.3.1. Signature:
|
||||
```python
|
||||
def __init__(self, scorers: List[URLScorer], normalize: bool = True):
|
||||
```
|
||||
* 4.7.3.2. Parameters:
|
||||
* `scorers (List[URLScorer])`: A list of `URLScorer` instances to be combined.
|
||||
* `normalize (bool`, default: `True`)`: If `True`, the final composite score is normalized by dividing the sum of weighted scores by the number of scorers. This can help keep scores in a more consistent range.
|
||||
* 4.7.4. Key Implemented Methods:
|
||||
* `_calculate_score(self, url: str) -> float`:
|
||||
* Description: Iterates through all scorers in its list, calls their `score(url)` method (which applies individual weights), and sums up these scores. If `normalize` is `True`, divides the total sum by the number of scorers.
|
||||
* 4.7.5. Key Concrete Methods (overrides `URLScorer.score`):
|
||||
* `score(self, url: str) -> float`:
|
||||
* Description: Calculates the composite score and updates its own `ScoringStats`. Note: The individual scorers' stats are updated when their `score` methods are called internally.
|
||||
|
||||
* **4.8. `ScoringStats` Data Class**
|
||||
* Source: `crawl4ai/deep_crawling/scorers.py`
|
||||
* 4.8.1. Purpose: A data class to track statistics for URL scoring operations, including the number of URLs scored, total score, and min/max scores.
|
||||
* 4.8.2. Fields:
|
||||
* `_urls_scored (int)`: Count of URLs scored.
|
||||
* `_total_score (float)`: Sum of all scores.
|
||||
* `_min_score (Optional[float])`: Minimum score encountered.
|
||||
* `_max_score (Optional[float])`: Maximum score encountered.
|
||||
* 4.8.3. Key Methods:
|
||||
* `update(self, score: float) -> None`: Updates the statistics with a new score.
|
||||
* `get_average(self) -> float`: Calculates and returns the average score.
|
||||
* `get_min(self) -> float`: Lazily initializes and returns the minimum score.
|
||||
* `get_max(self) -> float`: Lazily initializes and returns the maximum score.
|
||||
|
||||
## 5. `DeepCrawlDecorator`
|
||||
|
||||
* Source: `crawl4ai/deep_crawling/base_strategy.py`
|
||||
* 5.1. Purpose: A decorator class that transparently adds deep crawling functionality to the `AsyncWebCrawler.arun` method if a `deep_crawl_strategy` is specified in the `CrawlerRunConfig`.
|
||||
* 5.2. Initialization (`__init__`)
|
||||
* 5.2.1. Signature:
|
||||
```python
|
||||
def __init__(self, crawler: AsyncWebCrawler):
|
||||
```
|
||||
* 5.2.2. Parameters:
|
||||
* `crawler (AsyncWebCrawler)`: The `AsyncWebCrawler` instance whose `arun` method is to be decorated.
|
||||
* 5.3. `__call__` Method
|
||||
* 5.3.1. Signature:
|
||||
```python
|
||||
@wraps(original_arun)
|
||||
async def wrapped_arun(url: str, config: CrawlerRunConfig = None, **kwargs):
|
||||
```
|
||||
* 5.3.2. Functionality: This method wraps the original `arun` method of the `AsyncWebCrawler`.
|
||||
* It checks if `config` is provided, has a `deep_crawl_strategy` set, and if `DeepCrawlDecorator.deep_crawl_active` context variable is `False` (to prevent recursion).
|
||||
* If these conditions are met:
|
||||
* It sets `DeepCrawlDecorator.deep_crawl_active` to `True`.
|
||||
* It calls the `arun` method of the specified `config.deep_crawl_strategy`.
|
||||
* It handles potential streaming results from the strategy by wrapping them in an async generator.
|
||||
* Finally, it resets `DeepCrawlDecorator.deep_crawl_active` to `False`.
|
||||
* If the conditions are not met, it calls the original `arun` method of the crawler.
|
||||
* 5.4. Class Variable:
|
||||
* `deep_crawl_active (ContextVar)`:
|
||||
* Purpose: A `contextvars.ContextVar` used as a flag to indicate if a deep crawl is currently in progress for the current asynchronous context. This prevents the decorator from re-triggering deep crawling if the strategy itself calls the crawler's `arun` or `arun_many` methods.
|
||||
* Default Value: `False`.
|
||||
|
||||
## 6. `TraversalStats` Data Model
|
||||
|
||||
* Source: `crawl4ai/models.py`
|
||||
* 6.1. Purpose: A data class for storing and tracking statistics related to a deep crawl traversal.
|
||||
* 6.2. Fields:
|
||||
* `start_time (datetime)`: The timestamp (Python `datetime` object) when the traversal process began. Default: `datetime.now()`.
|
||||
* `end_time (Optional[datetime])`: The timestamp when the traversal process completed. Default: `None`.
|
||||
* `urls_processed (int)`: The total number of URLs that were successfully fetched and processed. Default: `0`.
|
||||
* `urls_failed (int)`: The total number of URLs that resulted in an error during fetching or processing. Default: `0`.
|
||||
* `urls_skipped (int)`: The total number of URLs that were skipped (e.g., due to filters, already visited, or depth limits). Default: `0`.
|
||||
* `total_depth_reached (int)`: The maximum depth reached from the start URL during the crawl. Default: `0`.
|
||||
* `current_depth (int)`: The current depth level being processed by the crawler (can fluctuate during the crawl, especially for BFS). Default: `0`.
|
||||
|
||||
## 7. Configuration for Deep Crawling (`CrawlerRunConfig`)
|
||||
|
||||
* Source: `crawl4ai/async_configs.py`
|
||||
* 7.1. Purpose: `CrawlerRunConfig` is the primary configuration object passed to `AsyncWebCrawler.arun()` and `AsyncWebCrawler.arun_many()`. It contains various settings that control the behavior of a single crawl run, including those specific to deep crawling.
|
||||
* 7.2. Relevant Fields:
|
||||
* `deep_crawl_strategy (Optional[DeepCrawlStrategy])`:
|
||||
* Type: `Optional[DeepCrawlStrategy]` (where `DeepCrawlStrategy` is the ABC from `crawl4ai.deep_crawling.base_strategy`)
|
||||
* Default: `None`
|
||||
* Description: Specifies the deep crawling strategy instance (e.g., `BFSDeepCrawlStrategy`, `DFSDeepCrawlStrategy`, `BestFirstCrawlingStrategy`) to be used for the crawl. If `None`, deep crawling is disabled, and only the initial URL(s) will be processed.
|
||||
* *Note: Parameters like `max_depth`, `max_pages`, `filter_chain`, `url_scorer`, `score_threshold`, and `include_external` are not direct attributes of `CrawlerRunConfig` for deep crawling. Instead, they are passed to the constructor of the chosen `DeepCrawlStrategy` instance, which is then assigned to `CrawlerRunConfig.deep_crawl_strategy`.*
|
||||
|
||||
## 8. Utility Functions
|
||||
|
||||
* **8.1. `normalize_url_for_deep_crawl(url: str, source_url: str) -> str`**
|
||||
* Source: `crawl4ai/deep_crawling/utils.py` (or `crawl4ai/utils.py` if it's a general utility)
|
||||
* 8.1.1. Purpose: Normalizes a URL found during deep crawling. This typically involves resolving relative URLs against the `source_url` to create absolute URLs and removing URL fragments (`#fragment`).
|
||||
* 8.1.2. Signature: `def normalize_url_for_deep_crawl(url: str, source_url: str) -> str:`
|
||||
* 8.1.3. Parameters:
|
||||
* `url (str)`: The URL string to be normalized.
|
||||
* `source_url (str)`: The URL of the page where the `url` was discovered. This is used as the base for resolving relative paths.
|
||||
* 8.1.4. Returns: `(str)` - The normalized, absolute URL without fragments.
|
||||
|
||||
* **8.2. `efficient_normalize_url_for_deep_crawl(url: str, source_url: str) -> str`**
|
||||
* Source: `crawl4ai/deep_crawling/utils.py` (or `crawl4ai/utils.py`)
|
||||
* 8.2.1. Purpose: Provides a potentially more performant version of URL normalization specifically for deep crawling scenarios, likely employing optimizations to avoid repeated or complex parsing operations. (Note: Based on the provided code, this appears to be the same as `normalize_url_for_deep_crawl` if only one is present, or it might contain specific internal optimizations not exposed differently at the API level but used by strategies).
|
||||
* 8.2.2. Signature: `def efficient_normalize_url_for_deep_crawl(url: str, source_url: str) -> str:`
|
||||
* 8.2.3. Parameters:
|
||||
* `url (str)`: The URL string to be normalized.
|
||||
* `source_url (str)`: The URL of the page where the `url` was discovered.
|
||||
* 8.2.4. Returns: `(str)` - The normalized, absolute URL, typically without fragments.
|
||||
|
||||
## 9. PDF Processing Integration (`crawl4ai.processors.pdf`)
|
||||
* 9.1. Overview of PDF processing in Crawl4ai: While not directly part of the `deep_crawling` package, PDF processing components can be used in conjunction if a deep crawl discovers PDF URLs and they need to be processed. The `PDFCrawlerStrategy` can fetch PDFs, and `PDFContentScrapingStrategy` can extract content from them.
|
||||
* **9.2. `PDFCrawlerStrategy`**
|
||||
* Source: `crawl4ai/processors/pdf/__init__.py`
|
||||
* 9.2.1. Purpose: An `AsyncCrawlerStrategy` designed to "crawl" PDF files. In practice, this usually means downloading the PDF content. It returns a minimal `AsyncCrawlResponse` that signals to a `ContentScrapingStrategy` (like `PDFContentScrapingStrategy`) that the content is a PDF.
|
||||
* 9.2.2. Inheritance: `AsyncCrawlerStrategy`
|
||||
* 9.2.3. Initialization (`__init__`)
|
||||
* 9.2.3.1. Signature: `def __init__(self, logger: AsyncLogger = None):`
|
||||
* 9.2.3.2. Parameters:
|
||||
* `logger (AsyncLogger`, default: `None`)`: An optional logger instance.
|
||||
* 9.2.4. Key Methods:
|
||||
* `async def crawl(self, url: str, **kwargs) -> AsyncCrawlResponse`:
|
||||
* Description: For a PDF URL, this method typically signifies that the URL points to a PDF. It constructs an `AsyncCrawlResponse` with a `Content-Type` header of `application/pdf` and a placeholder HTML. The actual PDF processing (downloading and content extraction) is usually handled by a subsequent scraping strategy.
|
||||
* **9.3. `PDFContentScrapingStrategy`**
|
||||
* Source: `crawl4ai/processors/pdf/__init__.py`
|
||||
* 9.3.1. Purpose: A `ContentScrapingStrategy` specialized in extracting text, images (optional), and metadata from PDF files. It uses a `PDFProcessorStrategy` (like `NaivePDFProcessorStrategy`) internally.
|
||||
* 9.3.2. Inheritance: `ContentScrapingStrategy`
|
||||
* 9.3.3. Initialization (`__init__`)
|
||||
* 9.3.3.1. Signature:
|
||||
```python
|
||||
def __init__(self,
|
||||
save_images_locally: bool = False,
|
||||
extract_images: bool = False,
|
||||
image_save_dir: str = None,
|
||||
batch_size: int = 4,
|
||||
logger: AsyncLogger = None):
|
||||
```
|
||||
* 9.3.3.2. Parameters:
|
||||
* `save_images_locally` (`bool`, default: `False`): If `True`, extracted images will be saved to the local disk.
|
||||
* `extract_images` (`bool`, default: `False`): If `True`, attempts to extract images from the PDF.
|
||||
* `image_save_dir` (`str`, default: `None`): The directory where extracted images will be saved if `save_images_locally` is `True`.
|
||||
* `batch_size` (`int`, default: `4`): The number of PDF pages to process in parallel batches (if the underlying processor supports it).
|
||||
* `logger` (`AsyncLogger`, default: `None`): An optional logger instance.
|
||||
* 9.3.4. Key Methods:
|
||||
* `scrape(self, url: str, html: str, **params) -> ScrapingResult`:
|
||||
* Description: Takes the URL (which should point to a PDF or a local PDF path) and processes it. It downloads the PDF if it's a remote URL, then uses the internal `pdf_processor` to extract content. It formats the extracted text into basic HTML and collects image and link information.
|
||||
* `async def ascrape(self, url: str, html: str, **kwargs) -> ScrapingResult`:
|
||||
* Description: Asynchronous version of the `scrape` method; typically implemented by running the synchronous `scrape` method in a separate thread.
|
||||
* 9.3.5. Helper Methods:
|
||||
* `_get_pdf_path(self, url: str) -> str`: Downloads a PDF from a URL to a temporary file if it's not a local path.
|
||||
* **9.4. `NaivePDFProcessorStrategy`**
|
||||
* Source: `crawl4ai/processors/pdf/processor.py`
|
||||
* 9.4.1. Purpose: A concrete implementation of `PDFProcessorStrategy` that uses `PyPDF2` (or similar libraries if extended) to extract text, images, and metadata from PDF documents page by page or in batches.
|
||||
* 9.4.2. Initialization (`__init__`)
|
||||
* Signature: `def __init__(self, image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True, save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4)`
|
||||
* Parameters: [Details parameters for image extraction quality, saving, and batch processing size.]
|
||||
* 9.4.3. Key Methods:
|
||||
* `process(self, pdf_path: Path) -> PDFProcessResult`:
|
||||
* Description: Processes a single PDF file sequentially, page by page. Extracts metadata, text, and optionally images from each page.
|
||||
* `process_batch(self, pdf_path: Path) -> PDFProcessResult`:
|
||||
* Description: Processes a PDF file by dividing its pages into batches and processing these batches in parallel using a thread pool, potentially speeding up extraction for large PDFs.
|
||||
* 9.4.4. Helper Methods:
|
||||
* `_process_page(self, page, image_dir: Optional[Path]) -> PDFPage`: Processes a single PDF page object.
|
||||
* `_extract_images(self, page, image_dir: Optional[Path]) -> List[Dict]`: Extracts images from a page.
|
||||
* `_extract_links(self, page) -> List[str]`: Extracts hyperlinks from a page.
|
||||
* `_extract_metadata(self, pdf_path: Path, reader=None) -> PDFMetadata`: Extracts metadata from the PDF.
|
||||
* **9.5. PDF Data Models**
|
||||
* Source: `crawl4ai/processors/pdf/processor.py`
|
||||
* 9.5.1. `PDFMetadata`:
|
||||
* Purpose: Stores metadata extracted from a PDF document.
|
||||
* Fields:
|
||||
* `title (Optional[str])`: The title of the PDF.
|
||||
* `author (Optional[str])`: The author(s) of the PDF.
|
||||
* `producer (Optional[str])`: The software used to produce the PDF.
|
||||
* `created (Optional[datetime])`: The creation date of the PDF.
|
||||
* `modified (Optional[datetime])`: The last modification date of the PDF.
|
||||
* `pages (int)`: The total number of pages in the PDF. Default: `0`.
|
||||
* `encrypted (bool)`: `True` if the PDF is encrypted, `False` otherwise. Default: `False`.
|
||||
* `file_size (Optional[int])`: The size of the PDF file in bytes. Default: `None`.
|
||||
* 9.5.2. `PDFPage`:
|
||||
* Purpose: Stores content extracted from a single page of a PDF document.
|
||||
* Fields:
|
||||
* `page_number (int)`: The page number (1-indexed).
|
||||
* `raw_text (str)`: The raw text extracted from the page. Default: `""`.
|
||||
* `markdown (str)`: Markdown representation of the page content. Default: `""`.
|
||||
* `html (str)`: Basic HTML representation of the page content. Default: `""`.
|
||||
* `images (List[Dict])`: A list of dictionaries, each representing an extracted image with details like format, path/data, dimensions. Default: `[]`.
|
||||
* `links (List[str])`: A list of hyperlink URLs found on the page. Default: `[]`.
|
||||
* `layout (List[Dict])`: Information about the layout of text elements on the page (e.g., coordinates). Default: `[]`.
|
||||
* 9.5.3. `PDFProcessResult`:
|
||||
* Purpose: Encapsulates the results of processing a PDF document.
|
||||
* Fields:
|
||||
* `metadata (PDFMetadata)`: The metadata of the processed PDF.
|
||||
* `pages (List[PDFPage])`: A list of `PDFPage` objects, one for each page processed.
|
||||
* `processing_time (float)`: The time taken to process the PDF, in seconds. Default: `0.0`.
|
||||
* `version (str)`: The version of the PDF processor. Default: `"1.1"`.
|
||||
|
||||
## 10. Version Information (`crawl4ai.__version__`)
|
||||
* Source: `crawl4ai/__version__.py`
|
||||
* 10.1. `__version__ (str)`: A string representing the current installed version of the `crawl4ai` library (e.g., "0.6.3").
|
||||
|
||||
## 11. Asynchronous Configuration (`crawl4ai.async_configs`)
|
||||
* 11.1. Overview: The `crawl4ai.async_configs` module contains configuration classes used throughout the library, including those relevant for network requests like proxies (`ProxyConfig`) and general crawler/browser behavior.
|
||||
* **11.2. `ProxyConfig`**
|
||||
* Source: `crawl4ai/async_configs.py` (and `crawl4ai/proxy_strategy.py`)
|
||||
* 11.2.1. Purpose: Represents the configuration for a single proxy server, including its address, port, and optional authentication credentials.
|
||||
* 11.2.2. Initialization (`__init__`)
|
||||
* 11.2.2.1. Signature:
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
server: str,
|
||||
username: Optional[str] = None,
|
||||
password: Optional[str] = None,
|
||||
ip: Optional[str] = None,
|
||||
):
|
||||
```
|
||||
* 11.2.2.2. Parameters:
|
||||
* `server (str)`: The proxy server URL (e.g., "http://proxy.example.com:8080", "socks5://proxy.example.com:1080").
|
||||
* `username` (`Optional[str]`, default: `None`): The username for proxy authentication, if required.
|
||||
* `password` (`Optional[str]`, default: `None`): The password for proxy authentication, if required.
|
||||
* `ip` (`Optional[str]`, default: `None`): Optionally, the specific IP address of the proxy server. If not provided, it's inferred from the `server` URL.
|
||||
* 11.2.3. Key Static Methods:
|
||||
* `from_string(proxy_str: str) -> ProxyConfig`:
|
||||
* Description: Creates a `ProxyConfig` instance from a string representation. Expected format is "ip:port:username:password" or "ip:port".
|
||||
* Returns: `(ProxyConfig)`
|
||||
* `from_dict(proxy_dict: Dict) -> ProxyConfig`:
|
||||
* Description: Creates a `ProxyConfig` instance from a dictionary.
|
||||
* Returns: `(ProxyConfig)`
|
||||
* `from_env(env_var: str = "PROXIES") -> List[ProxyConfig]`:
|
||||
* Description: Loads a list of proxy configurations from a comma-separated string in an environment variable.
|
||||
* Returns: `(List[ProxyConfig])`
|
||||
* 11.2.4. Key Methods:
|
||||
* `to_dict(self) -> Dict`: Converts the `ProxyConfig` instance to a dictionary.
|
||||
* `clone(self, **kwargs) -> ProxyConfig`: Creates a copy of the instance, optionally updating attributes with `kwargs`.
|
||||
|
||||
* **11.3. `ProxyRotationStrategy` (ABC)**
|
||||
* Source: `crawl4ai/proxy_strategy.py`
|
||||
* 11.3.1. Purpose: Abstract base class defining the interface for proxy rotation strategies.
|
||||
* 11.3.2. Key Abstract Methods:
|
||||
* `async def get_next_proxy(self) -> Optional[ProxyConfig]`: Asynchronously gets the next `ProxyConfig` from the strategy.
|
||||
* `def add_proxies(self, proxies: List[ProxyConfig])`: Adds a list of `ProxyConfig` objects to the strategy's pool.
|
||||
* **11.4. `RoundRobinProxyStrategy`**
|
||||
* Source: `crawl4ai/proxy_strategy.py`
|
||||
* 11.4.1. Purpose: A simple proxy rotation strategy that cycles through a list of proxies in a round-robin fashion.
|
||||
* 11.4.2. Inheritance: `ProxyRotationStrategy`
|
||||
* 11.4.3. Initialization (`__init__`)
|
||||
* 11.4.3.1. Signature: `def __init__(self, proxies: List[ProxyConfig] = None):`
|
||||
* 11.4.3.2. Parameters:
|
||||
* `proxies` (`List[ProxyConfig]`, default: `None`): An optional initial list of `ProxyConfig` objects.
|
||||
* 11.4.4. Key Implemented Methods:
|
||||
* `add_proxies(self, proxies: List[ProxyConfig])`: Adds new proxies to the internal list and reinitializes the cycle.
|
||||
* `async def get_next_proxy(self) -> Optional[ProxyConfig]`: Returns the next proxy from the cycle. Returns `None` if no proxies are available.
|
||||
|
||||
## 12. HTML to Markdown Conversion (`crawl4ai.markdown_generation_strategy`)
|
||||
* 12.1. `MarkdownGenerationStrategy` (ABC)
|
||||
* Source: `crawl4ai/markdown_generation_strategy.py`
|
||||
* 12.1.1. Purpose: Abstract base class defining the interface for strategies that convert HTML content to Markdown.
|
||||
* 12.1.2. Key Abstract Methods:
|
||||
* `generate_markdown(self, input_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult`:
|
||||
* Description: Abstract method to convert the given `input_html` string into a `MarkdownGenerationResult` object.
|
||||
* Parameters:
|
||||
* `input_html (str)`: The HTML content to convert.
|
||||
* `base_url` (`str`, default: `""`): The base URL used for resolving relative links within the HTML.
|
||||
* `html2text_options` (`Optional[Dict[str, Any]]`, default: `None`): Options to pass to the underlying HTML-to-text conversion library.
|
||||
* `content_filter` (`Optional[RelevantContentFilter]`, default: `None`): An optional filter to apply to the HTML before Markdown conversion, potentially to extract only relevant parts.
|
||||
* `citations` (`bool`, default: `True`): If `True`, attempts to convert hyperlinks into Markdown citations with a reference list.
|
||||
* `**kwargs`: Additional keyword arguments.
|
||||
* Returns: `(MarkdownGenerationResult)`
|
||||
* 12.2. `DefaultMarkdownGenerator`
|
||||
* Source: `crawl4ai/markdown_generation_strategy.py`
|
||||
* 12.2.1. Purpose: The default implementation of `MarkdownGenerationStrategy`. It uses the `CustomHTML2Text` class (an enhanced `html2text.HTML2Text`) for the primary conversion and can optionally apply a `RelevantContentFilter`.
|
||||
* 12.2.2. Inheritance: `MarkdownGenerationStrategy`
|
||||
* 12.2.3. Initialization (`__init__`)
|
||||
* 12.2.3.1. Signature:
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
content_filter: Optional[RelevantContentFilter] = None,
|
||||
options: Optional[Dict[str, Any]] = None,
|
||||
content_source: str = "cleaned_html", # "raw_html", "fit_html"
|
||||
):
|
||||
```
|
||||
* 12.2.3.2. Parameters:
|
||||
* `content_filter` (`Optional[RelevantContentFilter]`, default: `None`): An instance of a content filter strategy (e.g., `BM25ContentFilter`, `PruningContentFilter`) to be applied to the `input_html` before Markdown conversion. If `None`, no pre-filtering is done.
|
||||
* `options` (`Optional[Dict[str, Any]]`, default: `None`): A dictionary of options to configure the `CustomHTML2Text` converter (e.g., `{"body_width": 0, "ignore_links": False}`).
|
||||
* `content_source` (`str`, default: `"cleaned_html"`): Specifies which HTML source to use for Markdown generation if multiple are available (e.g., from `CrawlResult`). Options: `"cleaned_html"` (default), `"raw_html"`, `"fit_html"`. This parameter is primarily used when the generator is part of a larger crawling pipeline.
|
||||
* 12.2.4. Key Methods:
|
||||
* `generate_markdown(self, input_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult`:
|
||||
* Description: Converts HTML to Markdown. If a `content_filter` is provided (either at init or as an argument), it's applied first to get "fit_html". Then, `CustomHTML2Text` converts the chosen HTML (input_html or fit_html) to raw Markdown. If `citations` is True, links in the raw Markdown are converted to citation format.
|
||||
* Returns: `(MarkdownGenerationResult)`
|
||||
* `convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]`:
|
||||
* Description: Parses Markdown text, identifies links, replaces them with citation markers (e.g., `[text]^(1)`), and generates a corresponding list of references.
|
||||
* Returns: `(Tuple[str, str])` - A tuple containing the Markdown with citations and the Markdown string of references.
|
||||
|
||||
## 13. Content Filtering (`crawl4ai.content_filter_strategy`)
|
||||
* 13.1. `RelevantContentFilter` (ABC)
|
||||
* Source: `crawl4ai/content_filter_strategy.py`
|
||||
* 13.1.1. Purpose: Abstract base class for strategies that filter HTML content to extract only the most relevant parts, typically before Markdown conversion or further processing.
|
||||
* 13.1.2. Key Abstract Methods:
|
||||
* `filter_content(self, html: str) -> List[str]`:
|
||||
* Description: Abstract method that takes an HTML string and returns a list of strings, where each string is a chunk of HTML deemed relevant.
|
||||
* 13.2. `BM25ContentFilter`
|
||||
* Source: `crawl4ai/content_filter_strategy.py`
|
||||
* 13.2.1. Purpose: Filters HTML content by extracting text chunks and scoring their relevance to a user query (or an inferred page query) using the BM25 algorithm.
|
||||
* 13.2.2. Inheritance: `RelevantContentFilter`
|
||||
* 13.2.3. Initialization (`__init__`)
|
||||
* 13.2.3.1. Signature:
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
user_query: Optional[str] = None,
|
||||
bm25_threshold: float = 1.0,
|
||||
language: str = "english",
|
||||
):
|
||||
```
|
||||
* 13.2.3.2. Parameters:
|
||||
* `user_query` (`Optional[str]`, default: `None`): The query to compare content against. If `None`, the filter attempts to extract a query from the page's metadata.
|
||||
* `bm25_threshold` (`float`, default: `1.0`): The minimum BM25 score for a text chunk to be considered relevant.
|
||||
* `language` (`str`, default: `"english"`): The language used for stemming tokens.
|
||||
* 13.2.4. Key Implemented Methods:
|
||||
* `filter_content(self, html: str, min_word_threshold: int = None) -> List[str]`: Parses HTML, extracts text chunks (paragraphs, list items, etc.), scores them with BM25 against the query, and returns the HTML of chunks exceeding the threshold.
|
||||
* 13.3. `PruningContentFilter`
|
||||
* Source: `crawl4ai/content_filter_strategy.py`
|
||||
* 13.3.1. Purpose: Filters HTML content by recursively pruning less relevant parts of the DOM tree based on a composite score (text density, link density, tag weights, etc.).
|
||||
* 13.3.2. Inheritance: `RelevantContentFilter`
|
||||
* 13.3.3. Initialization (`__init__`)
|
||||
* 13.3.3.1. Signature:
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
user_query: Optional[str] = None,
|
||||
min_word_threshold: Optional[int] = None,
|
||||
threshold_type: str = "fixed", # or "dynamic"
|
||||
threshold: float = 0.48,
|
||||
):
|
||||
```
|
||||
* 13.3.3.2. Parameters:
|
||||
* `user_query` (`Optional[str]`, default: `None`): Not directly used by the pruning logic, but inherited from the base class.
|
||||
* `min_word_threshold` (`Optional[int]`, default: `None`): Minimum word count for an element to be considered for scoring initially (default behavior might be more nuanced).
|
||||
* `threshold_type` (`str`, default: `"fixed"`): Specifies how the `threshold` is applied. "fixed" uses the direct value. "dynamic" adjusts the threshold based on content characteristics.
|
||||
* `threshold` (`float`, default: `0.48`): The score threshold for pruning. Elements below this score are removed.
|
||||
* 13.3.4. Key Implemented Methods:
|
||||
* `filter_content(self, html: str, min_word_threshold: int = None) -> List[str]`: Parses HTML, applies the pruning algorithm to the body, and returns the remaining significant HTML blocks as a list of strings.
|
||||
* 13.4. `LLMContentFilter`
|
||||
* Source: `crawl4ai/content_filter_strategy.py`
|
||||
* 13.4.1. Purpose: Uses a Large Language Model (LLM) to determine the relevance of HTML content chunks based on a given instruction.
|
||||
* 13.4.2. Inheritance: `RelevantContentFilter`
|
||||
* 13.4.3. Initialization (`__init__`)
|
||||
* 13.4.3.1. Signature:
|
||||
```python
|
||||
def __init__(
|
||||
self,
|
||||
llm_config: Optional[LLMConfig] = None,
|
||||
instruction: Optional[str] = None,
|
||||
chunk_token_threshold: int = CHUNK_TOKEN_THRESHOLD, # Default from config
|
||||
overlap_rate: float = OVERLAP_RATE, # Default from config
|
||||
word_token_rate: float = WORD_TOKEN_RATE, # Default from config
|
||||
verbose: bool = False,
|
||||
logger: Optional[AsyncLogger] = None,
|
||||
ignore_cache: bool = True
|
||||
):
|
||||
```
|
||||
* 13.4.3.2. Parameters:
|
||||
* `llm_config (Optional[LLMConfig])`: Configuration for the LLM (provider, API key, model, etc.).
|
||||
* `instruction (Optional[str])`: The instruction given to the LLM to guide content filtering (e.g., "Extract only the main article content, excluding headers, footers, and ads.").
|
||||
* `chunk_token_threshold (int)`: Maximum number of tokens per chunk sent to the LLM.
|
||||
* `overlap_rate (float)`: Percentage of overlap between consecutive chunks.
|
||||
* `word_token_rate (float)`: Estimated ratio of words to tokens, used for chunking.
|
||||
* `verbose` (`bool`, default: `False`): Enables verbose logging for LLM operations.
|
||||
* `logger` (`Optional[AsyncLogger]`, default: `None`): Custom logger instance.
|
||||
* `ignore_cache` (`bool`, default: `True`): If `True`, bypasses any LLM response caching for this operation.
|
||||
* 13.4.4. Key Implemented Methods:
|
||||
* `filter_content(self, html: str, ignore_cache: bool = True) -> List[str]`:
|
||||
* Description: Chunks the input HTML. For each chunk, it sends a request to the configured LLM with the chunk and the `instruction`. The LLM is expected to return the relevant part of the chunk. These relevant parts are then collected and returned.
|
||||
```
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,537 +0,0 @@
|
||||
```markdown
|
||||
# Detailed Outline for crawl4ai - deployment Component
|
||||
|
||||
**Target Document Type:** memory
|
||||
**Target Output Filename Suggestion:** `llm_memory_deployment.md`
|
||||
**Library Version Context:** 0.6.0 (as per Dockerfile ARG `C4AI_VER` from provided `Dockerfile` content)
|
||||
**Outline Generation Date:** 2025-05-24
|
||||
---
|
||||
|
||||
## 1. Introduction to Deployment
|
||||
* 1.1. Purpose: This document provides a factual reference for installing the `crawl4ai` library and deploying its server component using Docker. It covers basic and advanced library installation, various Docker deployment methods, server configuration, and an overview of the API for interaction.
|
||||
* 1.2. Scope:
|
||||
* Installation of the `crawl4ai` Python library.
|
||||
* Setup and diagnostic commands for the library.
|
||||
* Deployment of the `crawl4ai` server using Docker, including pre-built images, Docker Compose, and manual builds.
|
||||
* Explanation of Dockerfile parameters and server configuration via `config.yml`.
|
||||
* Details of API interaction, including the Playground UI, Python SDK, and direct REST API calls.
|
||||
* Overview of additional server API endpoints and Model Context Protocol (MCP) support.
|
||||
* High-level understanding of the server's internal logic relevant to users.
|
||||
* The library's version numbering scheme.
|
||||
|
||||
## 2. Library Installation
|
||||
|
||||
* 2.1. **Basic Library Installation**
|
||||
* 2.1.1. Standard Installation
|
||||
* Command: `pip install crawl4ai`
|
||||
* Purpose: Installs the core `crawl4ai` library and its essential dependencies for performing web crawling and scraping tasks. This provides the fundamental `AsyncWebCrawler` and related configuration objects.
|
||||
* 2.1.2. Post-Installation Setup
|
||||
* Command: `crawl4ai-setup`
|
||||
* Purpose:
|
||||
* Initializes the user's home directory structure for Crawl4ai (e.g., `~/.crawl4ai/cache`).
|
||||
* Installs or updates necessary Playwright browsers (Chromium is installed by default) required for browser-based crawling. The `crawl4ai-setup` script internally calls `playwright install --with-deps chromium`.
|
||||
* Performs OS-level checks for common missing libraries that Playwright might depend on, providing guidance if issues are found.
|
||||
* Creates a default `global.yml` configuration file if one doesn't exist.
|
||||
* 2.1.3. Diagnostic Check
|
||||
* Command: `crawl4ai-doctor`
|
||||
* Purpose:
|
||||
* Verifies Python version compatibility.
|
||||
* Confirms Playwright installation and browser integrity by attempting a simple crawl of `https://crawl4ai.com`.
|
||||
* Inspects essential environment variables and potential library conflicts that might affect Crawl4ai's operation.
|
||||
* Provides diagnostic messages indicating success or failure of these checks, with suggestions for resolving common issues.
|
||||
* 2.1.4. Verification Process
|
||||
* Purpose: To confirm that the basic installation and setup were successful and Crawl4ai can perform a simple crawl.
|
||||
* Script Example (as inferred from `crawl4ai-doctor` logic and typical usage):
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
|
||||
async def main():
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
browser_type="chromium",
|
||||
ignore_https_errors=True,
|
||||
light_mode=True,
|
||||
viewport_width=1280,
|
||||
viewport_height=720,
|
||||
)
|
||||
run_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
screenshot=True,
|
||||
)
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
print("Testing crawling capabilities...")
|
||||
result = await crawler.arun(url="https://crawl4ai.com", config=run_config)
|
||||
if result and result.markdown:
|
||||
print("✅ Crawling test passed!")
|
||||
return True
|
||||
else:
|
||||
print("❌ Test failed: Failed to get content")
|
||||
return False
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
* Expected Outcome: The script should print "✅ Crawling test passed!" and successfully output Markdown content from the crawled page.
|
||||
|
||||
* 2.2. **Advanced Library Installation (Optional Features)**
|
||||
* 2.2.1. Installation of Optional Extras
|
||||
* Purpose: To install additional dependencies required for specific advanced features of Crawl4ai, such as those involving machine learning models.
|
||||
* Options (as defined in `pyproject.toml`):
|
||||
* `pip install crawl4ai[pdf]`:
|
||||
* Purpose: Installs `PyPDF2` for PDF processing capabilities.
|
||||
* `pip install crawl4ai[torch]`:
|
||||
* Purpose: Installs `torch`, `nltk`, and `scikit-learn`. Enables features relying on PyTorch models, such as some advanced text clustering or semantic analysis within extraction strategies.
|
||||
* `pip install crawl4ai[transformer]`:
|
||||
* Purpose: Installs `transformers` and `tokenizers`. Enables the use of Hugging Face Transformers models for tasks like summarization, question answering, or other advanced NLP features within Crawl4ai.
|
||||
* `pip install crawl4ai[cosine]`:
|
||||
* Purpose: Installs `torch`, `transformers`, and `nltk`. Specifically for features utilizing cosine similarity with embeddings (implies model usage).
|
||||
* `pip install crawl4ai[sync]`:
|
||||
* Purpose: Installs `selenium` for synchronous crawling capabilities (less common, as Crawl4ai primarily focuses on async).
|
||||
* `pip install crawl4ai[all]`:
|
||||
* Purpose: Installs all optional dependencies listed above (`PyPDF2`, `torch`, `nltk`, `scikit-learn`, `transformers`, `tokenizers`, `selenium`), providing the complete suite of Crawl4ai capabilities.
|
||||
* 2.2.2. Model Pre-fetching
|
||||
* Command: `crawl4ai-download-models` (maps to `crawl4ai.model_loader:main`)
|
||||
* Purpose: Downloads and caches machine learning models (e.g., specific sentence transformers or classification models from Hugging Face) that are used by certain optional features, particularly those installed via `crawl4ai[transformer]` or `crawl4ai[cosine]`. This avoids runtime downloads and ensures models are available offline.
|
||||
|
||||
## 3. Docker Deployment (Server Mode)
|
||||
|
||||
* 3.1. **Prerequisites**
|
||||
* 3.1.1. Docker: A working Docker installation. (Link: `https://docs.docker.com/get-docker/`)
|
||||
* 3.1.2. Git: Required for cloning the `crawl4ai` repository if building locally or using Docker Compose from the repository. (Link: `https://git-scm.com/book/en/v2/Getting-Started-Installing-Git`)
|
||||
* 3.1.3. RAM Requirements:
|
||||
* Minimum: 2GB for the basic server without intensive LLM tasks. The `Dockerfile` HEALTHCHECK indicates a warning if less than 2GB RAM is available.
|
||||
* Recommended for LLM support: 4GB+ (as specified in `docker-compose.yml` limits).
|
||||
* Shared Memory (`/dev/shm`): Recommended size is 1GB (`--shm-size=1g`) for optimal Chromium browser performance, as specified in `docker-compose.yml` and run commands.
|
||||
* 3.2. **Installation Options**
|
||||
* 3.2.1. **Using Pre-built Images from Docker Hub**
|
||||
* 3.2.1.1. Image Source: `unclecode/crawl4ai:<tag>`
|
||||
* Explanation of `<tag>`:
|
||||
* `latest`: Points to the most recent stable release of Crawl4ai.
|
||||
* Specific version tags (e.g., `0.6.0`, `0.5.1`): Correspond to specific library releases.
|
||||
* Pre-release tags (e.g., `0.6.0-rc1`, `0.7.0-devN`): Development or release candidate versions for testing.
|
||||
* 3.2.1.2. Pulling the Image
|
||||
* Command: `docker pull unclecode/crawl4ai:<tag>` (e.g., `docker pull unclecode/crawl4ai:latest`)
|
||||
* 3.2.1.3. Environment Setup (`.llm.env`)
|
||||
* File Name: `.llm.env` (to be created by the user in the directory where `docker run` or `docker-compose` commands are executed).
|
||||
* Purpose: To securely provide API keys for various LLM providers used by Crawl4ai for features like LLM-based extraction or Q&A.
|
||||
* Example Content (based on `docker-compose.yml`):
|
||||
```env
|
||||
OPENAI_API_KEY=your_openai_api_key
|
||||
DEEPSEEK_API_KEY=your_deepseek_api_key
|
||||
ANTHROPIC_API_KEY=your_anthropic_api_key
|
||||
GROQ_API_KEY=your_groq_api_key
|
||||
TOGETHER_API_KEY=your_together_api_key
|
||||
MISTRAL_API_KEY=your_mistral_api_key
|
||||
GEMINI_API_TOKEN=your_gemini_api_token
|
||||
```
|
||||
* Creation: Users should create this file and populate it with their API keys. An example (`.llm.env.example`) might be provided in the repository.
|
||||
* 3.2.1.4. Running the Container
|
||||
* Basic Run (without LLM support):
|
||||
* Command: `docker run -d -p 11235:11235 --shm-size=1g --name crawl4ai-server unclecode/crawl4ai:<tag>`
|
||||
* Port Mapping: `-p 11235:11235` maps port 11235 on the host to port 11235 in the container (default server port).
|
||||
* Shared Memory: `--shm-size=1g` allocates 1GB of shared memory for the browser.
|
||||
* Run with LLM Support (mounting `.llm.env`):
|
||||
* Command: `docker run -d -p 11235:11235 --env-file .llm.env --shm-size=1g --name crawl4ai-server unclecode/crawl4ai:<tag>`
|
||||
* 3.2.1.5. Stopping the Container
|
||||
* Command: `docker stop crawl4ai-server`
|
||||
* Command (to remove): `docker rm crawl4ai-server`
|
||||
* 3.2.1.6. Docker Hub Versioning:
|
||||
* Docker image tags on Docker Hub (e.g., `unclecode/crawl4ai:0.6.0`) directly correspond to `crawl4ai` library releases. The `latest` tag usually points to the most recent stable release. Pre-release tags include suffixes like `-devN`, `-aN`, `-bN`, or `-rcN`.
|
||||
|
||||
* 3.2.2. **Using Docker Compose (`docker-compose.yml`)**
|
||||
* 3.2.2.1. Cloning the Repository
|
||||
* Command: `git clone https://github.com/unclecode/crawl4ai.git`
|
||||
* Command: `cd crawl4ai`
|
||||
* 3.2.2.2. Environment Setup (`.llm.env`)
|
||||
* File Name: `.llm.env` (should be created in the root of the cloned `crawl4ai` repository).
|
||||
* Purpose: Same as above, to provide LLM API keys.
|
||||
* 3.2.2.3. Running Pre-built Images
|
||||
* Command: `docker-compose up -d`
|
||||
* Behavior: Uses the image specified in `docker-compose.yml` (e.g., `${IMAGE:-unclecode/crawl4ai}:${TAG:-latest}`).
|
||||
* Overriding image tag: `TAG=0.6.0 docker-compose up -d` or `IMAGE=mycustom/crawl4ai TAG=mytag docker-compose up -d`.
|
||||
* 3.2.2.4. Building Locally with Docker Compose
|
||||
* Command: `docker-compose up -d --build`
|
||||
* Build Arguments (passed from environment variables to `docker-compose.yml` which then passes to `Dockerfile`):
|
||||
* `INSTALL_TYPE`: (e.g., `default`, `torch`, `all`)
|
||||
* Purpose: To include optional Python dependencies during the Docker image build process.
|
||||
* Example: `INSTALL_TYPE=all docker-compose up -d --build`
|
||||
* `ENABLE_GPU`: (e.g., `true`, `false`)
|
||||
* Purpose: To include GPU support (e.g., CUDA toolkits) in the Docker image if the build hardware and target runtime support it.
|
||||
* Example: `ENABLE_GPU=true docker-compose up -d --build`
|
||||
* 3.2.2.5. Stopping Docker Compose Services
|
||||
* Command: `docker-compose down`
|
||||
|
||||
* 3.2.3. **Manual Local Build & Run**
|
||||
* 3.2.3.1. Cloning the Repository: (As above)
|
||||
* 3.2.3.2. Environment Setup (`.llm.env`): (As above)
|
||||
* 3.2.3.3. Building with `docker buildx`
|
||||
* Command Example:
|
||||
```bash
|
||||
docker buildx build --platform linux/amd64,linux/arm64 \
|
||||
--build-arg C4AI_VER=0.6.0 \
|
||||
--build-arg INSTALL_TYPE=all \
|
||||
--build-arg ENABLE_GPU=false \
|
||||
--build-arg USE_LOCAL=true \
|
||||
-t my-crawl4ai-image:custom .
|
||||
```
|
||||
* Purpose of `docker buildx`: A Docker CLI plugin that extends the `docker build` command with full support for BuildKit builder capabilities, including multi-architecture builds.
|
||||
* Explanation of `--platform`: Specifies the target platform(s) for the build (e.g., `linux/amd64`, `linux/arm64`).
|
||||
* Explanation of `--build-arg`: Passes build-time variables defined in the `Dockerfile` (see section 3.3).
|
||||
* 3.2.3.4. Running the Custom-Built Container
|
||||
* Basic Run: `docker run -d -p 11235:11235 --shm-size=1g --name my-crawl4ai-server my-crawl4ai-image:custom`
|
||||
* Run with LLM Support: `docker run -d -p 11235:11235 --env-file .llm.env --shm-size=1g --name my-crawl4ai-server my-crawl4ai-image:custom`
|
||||
* 3.2.3.5. Stopping the Container: (As above)
|
||||
|
||||
* 3.3. **Dockerfile Parameters (`ARG` values)**
|
||||
* 3.3.1. `C4AI_VER`: (Default: `0.6.0`)
|
||||
* Role: Specifies the version of the `crawl4ai` library. Used for labeling the image and potentially for version-specific logic.
|
||||
* 3.3.2. `APP_HOME`: (Default: `/app`)
|
||||
* Role: Defines the working directory inside the Docker container where the application code and related files are stored and executed.
|
||||
* 3.3.3. `GITHUB_REPO`: (Default: `https://github.com/unclecode/crawl4ai.git`)
|
||||
* Role: The URL of the GitHub repository to clone if `USE_LOCAL` is set to `false`.
|
||||
* 3.3.4. `GITHUB_BRANCH`: (Default: `main`)
|
||||
* Role: The specific branch of the GitHub repository to clone if `USE_LOCAL` is `false`.
|
||||
* 3.3.5. `USE_LOCAL`: (Default: `true`)
|
||||
* Role: A boolean flag. If `true`, the `Dockerfile` installs `crawl4ai` from the local source code copied into `/tmp/project/` during the build context. If `false`, it clones the repository specified by `GITHUB_REPO` and `GITHUB_BRANCH`.
|
||||
* 3.3.6. `PYTHON_VERSION`: (Default: `3.12`)
|
||||
* Role: Specifies the Python version for the base image (e.g., `python:3.12-slim-bookworm`).
|
||||
* 3.3.7. `INSTALL_TYPE`: (Default: `default`)
|
||||
* Role: Controls which optional dependencies of `crawl4ai` are installed. Possible values: `default` (core), `pdf`, `torch`, `transformer`, `cosine`, `sync`, `all`.
|
||||
* 3.3.8. `ENABLE_GPU`: (Default: `false`)
|
||||
* Role: A boolean flag. If `true` and `TARGETARCH` is `amd64`, the `Dockerfile` attempts to install the NVIDIA CUDA toolkit for GPU acceleration.
|
||||
* 3.3.9. `TARGETARCH`:
|
||||
* Role: An automatic build argument provided by Docker, indicating the target architecture of the build (e.g., `amd64`, `arm64`). Used for conditional logic in the `Dockerfile`, such as installing platform-specific optimized libraries or CUDA for `amd64`.
|
||||
|
||||
* 3.4. **Server Configuration (`config.yml`)**
|
||||
* 3.4.1. Location: The server loads its configuration from `/app/config.yml` inside the container by default. This path is relative to `APP_HOME`.
|
||||
* 3.4.2. Structure Overview (based on `deploy/docker/config.yml`):
|
||||
* `app`: General application settings.
|
||||
* `title (str)`: API title (e.g., "Crawl4AI API").
|
||||
* `version (str)`: API version (e.g., "1.0.0").
|
||||
* `host (str)`: Host address for the server to bind to (e.g., "0.0.0.0").
|
||||
* `port (int)`: Port for the server to listen on (e.g., 11234, though Docker usually maps to 11235).
|
||||
* `reload (bool)`: Enable/disable auto-reload for development (default: `false`).
|
||||
* `workers (int)`: Number of worker processes (default: 1).
|
||||
* `timeout_keep_alive (int)`: Keep-alive timeout in seconds (default: 300).
|
||||
* `llm`: Default LLM configuration.
|
||||
* `provider (str)`: Default LLM provider string (e.g., "openai/gpt-4o-mini").
|
||||
* `api_key_env (str)`: Environment variable name to read the API key from (e.g., "OPENAI_API_KEY").
|
||||
* `api_key (Optional[str])`: Directly pass API key (overrides `api_key_env`).
|
||||
* `redis`: Redis connection details.
|
||||
* `host (str)`: Redis host (e.g., "localhost").
|
||||
* `port (int)`: Redis port (e.g., 6379).
|
||||
* `db (int)`: Redis database number (e.g., 0).
|
||||
* `password (str)`: Redis password (default: "").
|
||||
* `ssl (bool)`: Enable SSL for Redis connection (default: `false`).
|
||||
* `ssl_cert_reqs (Optional[str])`: SSL certificate requirements (e.g., "none", "optional", "required").
|
||||
* `ssl_ca_certs (Optional[str])`: Path to CA certificate file.
|
||||
* `ssl_certfile (Optional[str])`: Path to SSL certificate file.
|
||||
* `ssl_keyfile (Optional[str])`: Path to SSL key file.
|
||||
* `rate_limiting`: Configuration for API rate limits.
|
||||
* `enabled (bool)`: Enable/disable rate limiting (default: `true`).
|
||||
* `default_limit (str)`: Default rate limit (e.g., "1000/minute").
|
||||
* `trusted_proxies (List[str])`: List of trusted proxy IP addresses.
|
||||
* `storage_uri (str)`: Storage URI for rate limit counters (e.g., "memory://", "redis://localhost:6379").
|
||||
* `security`: Security-related settings.
|
||||
* `enabled (bool)`: Master switch for security features (default: `false`).
|
||||
* `jwt_enabled (bool)`: Enable/disable JWT authentication (default: `false`).
|
||||
* `https_redirect (bool)`: Enable/disable HTTPS redirection (default: `false`).
|
||||
* `trusted_hosts (List[str])`: List of allowed host headers (e.g., `["*"]` or specific domains).
|
||||
* `headers (Dict[str, str])`: Default security headers to add to responses (e.g., `X-Content-Type-Options`, `Content-Security-Policy`).
|
||||
* `crawler`: Default crawler behavior.
|
||||
* `base_config (Dict[str, Any])`: Base parameters for `CrawlerRunConfig`.
|
||||
* `simulate_user (bool)`: (default: `true`).
|
||||
* `memory_threshold_percent (float)`: Memory usage threshold for adaptive dispatcher (default: `95.0`).
|
||||
* `rate_limiter (Dict[str, Any])`: Configuration for the internal rate limiter for crawling.
|
||||
* `enabled (bool)`: (default: `true`).
|
||||
* `base_delay (List[float])`: Two-element list giving the min/max delay range in seconds (e.g., `[1.0, 2.0]`).
|
||||
* `timeouts (Dict[str, float])`: Timeouts for different crawler operations.
|
||||
* `stream_init (float)`: Timeout for stream initialization (default: `30.0`).
|
||||
* `batch_process (float)`: Timeout for batch processing (default: `300.0`).
|
||||
* `pool (Dict[str, Any])`: Browser pool settings.
|
||||
* `max_pages (int)`: Max concurrent browser pages (default: `40`).
|
||||
* `idle_ttl_sec (int)`: Time-to-live for idle crawlers in seconds (default: `1800`).
|
||||
* `browser (Dict[str, Any])`: Default `BrowserConfig` parameters.
|
||||
* `kwargs (Dict[str, Any])`: Keyword arguments for `BrowserConfig`.
|
||||
* `headless (bool)`: (default: `true`).
|
||||
* `text_mode (bool)`: (default: `true`).
|
||||
* `extra_args (List[str])`: List of additional browser launch arguments (e.g., `"--no-sandbox"`).
|
||||
* `logging`: Logging configuration.
|
||||
* `level (str)`: Logging level (e.g., "INFO", "DEBUG").
|
||||
* `format (str)`: Log message format string.
|
||||
* `observability`: Observability settings.
|
||||
* `prometheus (Dict[str, Any])`: Prometheus metrics configuration.
|
||||
* `enabled (bool)`: (default: `true`).
|
||||
* `endpoint (str)`: Metrics endpoint path (e.g., "/metrics").
|
||||
* `health_check (Dict[str, str])`: Health check endpoint configuration.
|
||||
* `endpoint (str)`: Health check endpoint path (e.g., "/health").
|
||||
* 3.4.3. JWT Authentication
|
||||
* Enabling: Set `security.enabled: true` and `security.jwt_enabled: true` in `config.yml`.
|
||||
* Secret Key: Configured via `security.jwt_secret_key`. This value can be overridden by the environment variable `JWT_SECRET_KEY`.
|
||||
* Algorithm: Configured via `security.jwt_algorithm` (default: `HS256`).
|
||||
* Token Expiry: Configured via `security.jwt_expire_minutes` (default: `30`).
|
||||
* Usage:
|
||||
* 1. Client obtains a token by sending a POST request to the `/token` endpoint with an email in the request body (e.g., `{"email": "user@example.com"}`). The email domain might be validated if configured.
|
||||
* 2. Client includes the received token in the `Authorization` header of subsequent requests to protected API endpoints: `Authorization: Bearer <your_jwt_token>`.
|
||||
* 3.4.4. Customizing `config.yml`
|
||||
* 3.4.4.1. Modifying Before Build:
|
||||
* Method: Edit the `deploy/docker/config.yml` file within the cloned `crawl4ai` repository before building the Docker image. This new configuration will be baked into the image.
|
||||
* 3.4.4.2. Runtime Mount:
|
||||
* Method: Mount a custom `config.yml` file from the host machine to `/app/config.yml` (or the path specified by `APP_HOME`) inside the running Docker container.
|
||||
* Example Command: `docker run -d -p 11235:11235 -v /path/on/host/my-config.yml:/app/config.yml --name crawl4ai-server unclecode/crawl4ai:latest`
|
||||
* 3.4.5. Key Configuration Recommendations
|
||||
* Security:
|
||||
* Enable JWT (`security.jwt_enabled: true`) if the server is exposed to untrusted networks.
|
||||
* Use a strong, unique `jwt_secret_key`.
|
||||
* Configure `security.trusted_hosts` to a specific list of allowed hostnames instead of `["*"]` for production.
|
||||
* If using a reverse proxy for SSL termination, ensure `https_redirect` is appropriately configured or disabled if the proxy handles it.
|
||||
* Resource Management:
|
||||
* Adjust `crawler.pool.max_pages` based on server resources to prevent overwhelming the system.
|
||||
* Tune `crawler.pool.idle_ttl_sec` to balance resource usage and responsiveness for pooled browser instances.
|
||||
* Monitoring:
|
||||
* Keep `observability.prometheus.enabled: true` for production monitoring via the `/metrics` endpoint.
|
||||
* Ensure the `/health` endpoint is accessible to health checking systems.
|
||||
* Performance:
|
||||
* Review and customize `crawler.browser.extra_args` for headless browser optimization (e.g., disabling GPU, sandbox if appropriate for your environment).
|
||||
* Set reasonable `crawler.timeouts` to prevent long-stalled crawls.
|
||||
|
||||
* 3.5. **API Usage (Interacting with the Dockerized Server)**
|
||||
* 3.5.1. **Playground Interface**
|
||||
* Access URL: `http://localhost:11235/playground` (assuming default port mapping).
|
||||
* Purpose: An interactive web UI (Swagger UI/OpenAPI) allowing users to explore API endpoints, view schemas, construct requests, and test API calls directly from their browser.
|
||||
* 3.5.2. **Python SDK (`Crawl4aiDockerClient`)**
|
||||
* Class Name: `Crawl4aiDockerClient`
|
||||
* Location: (Typically imported as `from crawl4ai.docker_client import Crawl4aiDockerClient`) - Actual import might vary based on final library structure; refer to `docs/examples/docker_example.py` or `docs/examples/docker_python_sdk.py`.
|
||||
* Initialization:
|
||||
* Signature: `Crawl4aiDockerClient(base_url: str = "http://localhost:11235", api_token: Optional[str] = None, timeout: int = 300)`
|
||||
* Parameters:
|
||||
* `base_url (str)`: The base URL of the Crawl4ai server. Default: `"http://localhost:11235"`.
|
||||
* `api_token (Optional[str])`: JWT token for authentication if enabled on the server. Default: `None`.
|
||||
* `timeout (int)`: Default timeout in seconds for HTTP requests to the server. Default: `300`.
|
||||
* Authentication (JWT):
|
||||
* Method: Pass the `api_token` during client initialization. The token can be obtained from the server's `/token` endpoint or other authentication mechanisms.
|
||||
* `crawl()` Method:
|
||||
* Signature (Conceptual, based on typical SDK patterns and server capabilities): `async def crawl(self, urls: Union[str, List[str]], browser_config: Optional[Dict] = None, crawler_config: Optional[Dict] = None, stream: bool = False) -> Union[List[Dict], AsyncGenerator[Dict, None]]`
|
||||
*Note: SDK might take `BrowserConfig` and `CrawlerRunConfig` objects directly, which it then serializes.*
|
||||
* Key Parameters:
|
||||
* `urls (Union[str, List[str]])`: A single URL string or a list of URL strings to crawl.
|
||||
* `browser_config (Optional[Dict])`: A dictionary representing the `BrowserConfig` object, or a `BrowserConfig` instance itself.
|
||||
* `crawler_config (Optional[Dict])`: A dictionary representing the `CrawlerRunConfig` object, or a `CrawlerRunConfig` instance itself.
|
||||
* `stream (bool)`: If `True`, the method returns an async generator yielding individual `CrawlResult` dictionaries as they are processed by the server. If `False` (default), it returns a list containing all `CrawlResult` dictionaries after all URLs are processed.
|
||||
* Return Type: `List[Dict]` (for `stream=False`) or `AsyncGenerator[Dict, None]` (for `stream=True`), where each `Dict` represents a `CrawlResult`.
|
||||
* Streaming Behavior:
|
||||
* `stream=True`: Allows processing of results incrementally, suitable for long crawl jobs or real-time data feeds.
|
||||
* `stream=False`: Collects all results before returning, simpler for smaller batches.
|
||||
* `get_schema()` Method:
|
||||
* Signature: `async def get_schema(self) -> dict`
|
||||
* Return Type: `dict`.
|
||||
* Purpose: Fetches the JSON schemas for `BrowserConfig` and `CrawlerRunConfig` from the server's `/schema` endpoint. This helps in constructing valid configuration payloads.
|
||||
* 3.5.3. **JSON Request Schema for Configurations**
|
||||
* Structure: `{"type": "ClassName", "params": {...}}`
|
||||
* Purpose: This structure is used by the server (and expected by the Python SDK internally) to deserialize JSON payloads back into Pydantic configuration objects like `BrowserConfig`, `CrawlerRunConfig`, and their nested strategy objects (e.g., `LLMExtractionStrategy`, `PruningContentFilter`). The `type` field specifies the Python class name, and `params` holds the keyword arguments for its constructor.
|
||||
* Example (`BrowserConfig`):
|
||||
```json
|
||||
{
|
||||
"type": "BrowserConfig",
|
||||
"params": {
|
||||
"headless": true,
|
||||
"browser_type": "chromium",
|
||||
"viewport_width": 1920,
|
||||
"viewport_height": 1080
|
||||
}
|
||||
}
|
||||
```
|
||||
* Example (`CrawlerRunConfig` with a nested `LLMExtractionStrategy`):
|
||||
```json
|
||||
{
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {
|
||||
"cache_mode": {"type": "CacheMode", "params": "BYPASS"},
|
||||
"screenshot": false,
|
||||
"extraction_strategy": {
|
||||
"type": "LLMExtractionStrategy",
|
||||
"params": {
|
||||
"llm_config": {
|
||||
"type": "LLMConfig",
|
||||
"params": {"provider": "openai/gpt-4o-mini"}
|
||||
},
|
||||
"instruction": "Extract the main title and summary."
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
```
|
||||
* 3.5.4. **REST API Examples**
|
||||
* `/crawl` Endpoint:
|
||||
* URL: `http://localhost:11235/crawl`
|
||||
* HTTP Method: `POST`
|
||||
* Payload Structure (`CrawlRequest` model from `deploy/docker/schemas.py`):
|
||||
```json
|
||||
{
|
||||
"urls": ["https://example.com"],
|
||||
"browser_config": { // JSON representation of BrowserConfig
|
||||
"type": "BrowserConfig",
|
||||
"params": {"headless": true}
|
||||
},
|
||||
"crawler_config": { // JSON representation of CrawlerRunConfig
|
||||
"type": "CrawlerRunConfig",
|
||||
"params": {"screenshot": true}
|
||||
}
|
||||
}
|
||||
```
|
||||
* Response Structure: A JSON object, typically `{"success": true, "results": [CrawlResult, ...], "server_processing_time_s": float, ...}`.
|
||||
* `/crawl/stream` Endpoint:
|
||||
* URL: `http://localhost:11235/crawl/stream`
|
||||
* HTTP Method: `POST`
|
||||
* Payload Structure: Same as `/crawl` (`CrawlRequest` model).
|
||||
* Response Structure: Newline Delimited JSON (NDJSON, `application/x-ndjson`). Each line is a JSON string representing a `CrawlResult` object.
|
||||
* Headers: Includes `Content-Type: application/x-ndjson` and `X-Stream-Status: active` while streaming, and a final JSON object `{"status": "completed"}`.
|
||||
|
||||
* 3.6. **Additional API Endpoints (from `server.py`)**
|
||||
* 3.6.1. `/html`
|
||||
* Endpoint URL: `/html`
|
||||
* HTTP Method: `POST`
|
||||
* Purpose: Crawls the given URL, preprocesses its raw HTML content specifically for schema extraction purposes (e.g., by sanitizing and simplifying the structure), and returns the processed HTML.
|
||||
* Request Body (`HTMLRequest` from `deploy/docker/schemas.py`):
|
||||
* `url (str)`: The URL to fetch and process.
|
||||
* Response Structure (JSON):
|
||||
* `html (str)`: The preprocessed HTML string.
|
||||
* `url (str)`: The original URL requested.
|
||||
* `success (bool)`: Indicates if the operation was successful.
|
||||
* 3.6.2. `/screenshot`
|
||||
* Endpoint URL: `/screenshot`
|
||||
* HTTP Method: `POST`
|
||||
* Purpose: Captures a full-page PNG screenshot of the specified URL. Allows an optional delay before capture and an option to save the file server-side.
|
||||
* Request Body (`ScreenshotRequest` from `deploy/docker/schemas.py`):
|
||||
* `url (str)`: The URL to take a screenshot of.
|
||||
* `screenshot_wait_for (Optional[float])`: Seconds to wait before taking the screenshot. Default: `2.0`.
|
||||
* `output_path (Optional[str])`: If provided, the screenshot is saved to this path on the server, and the path is returned. Otherwise, the base64 encoded image is returned. Default: `None`.
|
||||
* Response Structure (JSON):
|
||||
* `success (bool)`: Indicates if the screenshot was successfully taken.
|
||||
* `screenshot (Optional[str])`: Base64 encoded PNG image data, if `output_path` was not provided.
|
||||
* `path (Optional[str])`: The absolute server-side path to the saved screenshot, if `output_path` was provided.
|
||||
* 3.6.3. `/pdf`
|
||||
* Endpoint URL: `/pdf`
|
||||
* HTTP Method: `POST`
|
||||
* Purpose: Generates a PDF document of the rendered content of the specified URL.
|
||||
* Request Body (`PDFRequest` from `deploy/docker/schemas.py`):
|
||||
* `url (str)`: The URL to convert to PDF.
|
||||
* `output_path (Optional[str])`: If provided, the PDF is saved to this path on the server, and the path is returned. Otherwise, the base64 encoded PDF data is returned. Default: `None`.
|
||||
* Response Structure (JSON):
|
||||
* `success (bool)`: Indicates if the PDF generation was successful.
|
||||
* `pdf (Optional[str])`: Base64 encoded PDF data, if `output_path` was not provided.
|
||||
* `path (Optional[str])`: The absolute server-side path to the saved PDF, if `output_path` was provided.
|
||||
* 3.6.4. `/execute_js`
|
||||
* Endpoint URL: `/execute_js`
|
||||
* HTTP Method: `POST`
|
||||
* Purpose: Executes a list of JavaScript snippets on the specified URL in the browser context and returns the full `CrawlResult` object, including any modifications or data retrieved by the scripts.
|
||||
* Request Body (`JSEndpointRequest` from `deploy/docker/schemas.py`):
|
||||
* `url (str)`: The URL on which to execute the JavaScript.
|
||||
* `scripts (List[str])`: A list of JavaScript code snippets to execute sequentially. Each script should be an expression that returns a value.
|
||||
* Response Structure (JSON): A `CrawlResult` object (serialized to a dictionary) containing the state of the page after JS execution, including `js_execution_result`.
|
||||
* 3.6.5. `/ask` (Endpoint defined as `/ask` in `server.py`)
|
||||
* Endpoint URL: `/ask`
|
||||
* HTTP Method: `GET`
|
||||
* Purpose: Retrieves context about the Crawl4ai library itself, either code snippets or documentation sections, filtered by a query. This is designed for AI assistants or RAG systems needing information about Crawl4ai.
|
||||
* Parameters (Query):
|
||||
* `context_type (str, default="all", enum=["code", "doc", "all"])`: Specifies whether to return "code", "doc", or "all" (both).
|
||||
* `query (Optional[str])`: A search query string used to filter relevant chunks using BM25 ranking. If `None`, returns all context of the specified type(s).
|
||||
* `score_ratio (float, default=0.5, ge=0.0, le=1.0)`: The minimum score (as a fraction of the maximum possible score for the query) for a chunk to be included in the results.
|
||||
* `max_results (int, default=20, ge=1)`: The maximum number of result chunks to return.
|
||||
* Response Structure (JSON):
|
||||
* If `query` is provided:
|
||||
* `code_results (Optional[List[Dict[str, Union[str, float]]]])`: A list of dictionaries, where each dictionary contains `{"text": "code_chunk...", "score": bm25_score}`. Present if `context_type` is "code" or "all".
|
||||
* `doc_results (Optional[List[Dict[str, Union[str, float]]]])`: A list of dictionaries, where each dictionary contains `{"text": "doc_chunk...", "score": bm25_score}`. Present if `context_type` is "doc" or "all".
|
||||
* If `query` is not provided:
|
||||
* `code_context (Optional[str])`: The full concatenated code context as a single string. Present if `context_type` is "code" or "all".
|
||||
* `doc_context (Optional[str])`: The full concatenated documentation context as a single string. Present if `context_type` is "doc" or "all".
|
||||
|
||||
* 3.7. **MCP (Model Context Protocol) Support**
|
||||
* 3.7.1. Explanation of MCP:
|
||||
* Purpose: The Model Context Protocol (MCP) is a standardized way for AI models (like Anthropic's Claude with Code Interpreter capabilities) to discover and interact with external tools and data sources. Crawl4ai's MCP server exposes its functionalities as tools that an MCP-compatible AI can use.
|
||||
* 3.7.2. Connection Endpoints (defined in `mcp_bridge.py` and attached to FastAPI app):
|
||||
* `/mcp/sse`: Server-Sent Events (SSE) endpoint for MCP communication.
|
||||
* `/mcp/ws`: WebSocket endpoint for MCP communication.
|
||||
* `/mcp/messages`: Endpoint for clients to POST messages in the SSE transport.
|
||||
* 3.7.3. Usage with Claude Code Example:
|
||||
* Command: `claude mcp add -t sse c4ai-sse http://localhost:11235/mcp/sse`
|
||||
* Purpose: This command (specific to the Claude Code CLI) registers the Crawl4ai MCP server as a tool provider named `c4ai-sse` using the SSE transport. The AI can then discover and invoke tools from this source.
|
||||
* 3.7.4. List of Available MCP Tools (defined by `@mcp_tool` decorators in `server.py`):
|
||||
* `md`: Fetches Markdown for a URL.
|
||||
* Parameters (derived from `get_markdown` function signature): `url (str)`, `filter_type (FilterType)`, `query (Optional[str])`, `cache (Optional[str])`.
|
||||
* `html`: Generates preprocessed HTML for a URL.
|
||||
* Parameters (derived from `generate_html` function signature): `url (str)`.
|
||||
* `screenshot`: Generates a screenshot of a URL.
|
||||
* Parameters (derived from `generate_screenshot` function signature): `url (str)`, `screenshot_wait_for (Optional[float])`, `output_path (Optional[str])`.
|
||||
* `pdf`: Generates a PDF of a URL.
|
||||
* Parameters (derived from `generate_pdf` function signature): `url (str)`, `output_path (Optional[str])`.
|
||||
* `execute_js`: Executes JavaScript on a URL.
|
||||
* Parameters (derived from `execute_js` function signature): `url (str)`, `scripts (List[str])`.
|
||||
* `crawl`: Performs a full crawl operation.
|
||||
* Parameters (derived from `crawl` function signature): `urls (List[str])`, `browser_config (Optional[Dict])`, `crawler_config (Optional[Dict])`.
|
||||
* `ask`: Retrieves library context.
|
||||
* Parameters (derived from `get_context` function signature): `context_type (str)`, `query (Optional[str])`, `score_ratio (float)`, `max_results (int)`.
|
||||
* 3.7.5. Testing MCP Connections:
|
||||
* Method: Use an MCP client tool (e.g., `claude mcp call c4ai-sse.md url=https://example.com`) to invoke a tool and verify the response.
|
||||
* 3.7.6. Accessing MCP Schemas:
|
||||
* Endpoint URL: `/mcp/schema`
|
||||
* Purpose: Returns a JSON response detailing all registered MCP tools, including their names, descriptions, and input schemas, enabling clients to understand how to use them.
|
||||
|
||||
* 3.8. **Metrics & Monitoring Endpoints**
|
||||
* 3.8.1. `/health`
|
||||
* Purpose: Provides a basic health check for the server, indicating if it's running and responsive.
|
||||
* Response Structure (JSON from `server.py`): `{"status": "ok", "timestamp": float, "version": str}` (where version is `__version__` from `server.py`).
|
||||
* Configuration: Path configurable via `observability.health_check.endpoint` in `config.yml`.
|
||||
* 3.8.2. `/metrics`
|
||||
* Purpose: Exposes application metrics in a format compatible with Prometheus for monitoring and alerting.
|
||||
* Response Format: Prometheus text format.
|
||||
* Configuration: Enabled via `observability.prometheus.enabled: true` and endpoint path via `observability.prometheus.endpoint` in `config.yml`.
|
||||
|
||||
* 3.9. **Underlying Server Logic (`server.py` - High-Level Understanding)**
|
||||
* 3.9.1. FastAPI Application:
|
||||
* Framework: The server is built using the FastAPI Python web framework for creating APIs.
|
||||
* 3.9.2. `crawler_pool` (`CrawlerPool` from `deploy.docker.crawler_pool`):
|
||||
* Role: Manages a pool of `AsyncWebCrawler` instances to reuse browser resources efficiently.
|
||||
* `get_crawler(BrowserConfig)`: Fetches an existing idle crawler compatible with the `BrowserConfig` or creates a new one if none are available or compatible.
|
||||
* `close_all()`: Iterates through all pooled crawlers and closes them.
|
||||
* `janitor()`: An `asyncio.Task` that runs periodically to close and remove crawler instances that have been idle for longer than `crawler.pool.idle_ttl_sec` (configured in `config.yml`).
|
||||
* 3.9.3. Global Page Semaphore (`GLOBAL_SEM`):
|
||||
* Type: `asyncio.Semaphore`.
|
||||
* Purpose: A global semaphore that limits the total number of concurrently open browser pages across all `AsyncWebCrawler` instances managed by the server. This acts as a hard cap to prevent excessive resource consumption.
|
||||
* Configuration: The maximum number of concurrent pages is set by `crawler.pool.max_pages` in `config.yml` (default: `30` in `server.py`, but `40` in `config.yml`). The `AsyncWebCrawler.arun` method acquires this semaphore.
|
||||
* 3.9.4. Job Router (`init_job_router` from `deploy.docker.job`):
|
||||
* Role: Manages asynchronous, long-running tasks, particularly for the `/crawl` (non-streaming batch) endpoint.
|
||||
* Mechanism: Uses Redis (configured in `config.yml`) as a backend for task queuing (storing task metadata like status, creation time, URL, result, error) and status tracking.
|
||||
* User Interaction: When a job is submitted to an endpoint using this router (e.g., `/crawl/job`), a `task_id` is returned. The client then polls an endpoint like `/task/{task_id}` to get the status and eventual result or error.
|
||||
* 3.9.5. Rate Limiting Middleware:
|
||||
* Implementation: Uses the `slowapi` library, integrated with FastAPI.
|
||||
* Purpose: To protect the server from abuse by limiting the number of requests an IP address can make within a specified time window.
|
||||
* Configuration: Settings like `enabled`, `default_limit`, `storage_uri` (e.g., `memory://` or `redis://...`) are managed in the `rate_limiting` section of `config.yml`.
|
||||
* 3.9.6. Security Middleware:
|
||||
* Implementations: `HTTPSRedirectMiddleware` and `TrustedHostMiddleware` from FastAPI, plus custom logic for adding security headers.
|
||||
* Purpose:
|
||||
* `HTTPSRedirectMiddleware`: Redirects HTTP requests to HTTPS if `security.https_redirect` is true.
|
||||
* `TrustedHostMiddleware`: Ensures requests are only served if their `Host` header matches an entry in `security.trusted_hosts`.
|
||||
* Custom header logic: Adds HTTP security headers like `X-Content-Type-Options`, `X-Frame-Options`, `Content-Security-Policy`, `Strict-Transport-Security` to all responses if `security.enabled` is true. These are defined in `security.headers` in `config.yml`.
|
||||
* 3.9.7. API Request Mapping:
|
||||
* Request Models: Pydantic models defined in `deploy/docker/schemas.py` (e.g., `CrawlRequest`, `MarkdownRequest`, `HTMLRequest`, `ScreenshotRequest`, `PDFRequest`, `JSEndpointRequest`, `TokenRequest`, `RawCode`) define the expected JSON structure for incoming API request bodies.
|
||||
* Endpoint Logic: Functions decorated with `@app.post(...)`, `@app.get(...)`, etc., in `server.py` handle incoming HTTP requests. These functions use FastAPI's dependency injection to parse and validate request bodies against the Pydantic models.
|
||||
* `AsyncWebCrawler` Interaction:
|
||||
* The parameters from the parsed request models (e.g., `CrawlRequest.urls`, `CrawlRequest.browser_config`, `CrawlRequest.crawler_config`) are used.
|
||||
* `BrowserConfig` and `CrawlerRunConfig` objects are created by calling their respective `.load()` class methods with the dictionary payloads received in the request (e.g., `BrowserConfig.load(crawl_request.browser_config)`).
|
||||
* These configuration objects are then passed to an `AsyncWebCrawler` instance obtained from the `crawler_pool`, typically to its `arun()` (for single URL or when JS execution context is critical) or `arun_many()` (for batch processing of multiple URLs) methods.
|
||||
* Result Serialization: The `CrawlResult` objects (or lists/generators of them) returned by the `AsyncWebCrawler` are usually serialized to JSON using their `.model_dump()` method before being included in the HTTP response. For streaming endpoints, each `CrawlResult` is serialized and sent as a separate NDJSON line.
|
||||
|
||||
## 4. Version Numbering Scheme
|
||||
|
||||
* 4.1. **Standard Versioning (`MAJOR.MINOR.PATCH`)**
|
||||
* `MAJOR`: Incremented when incompatible API changes are made.
|
||||
* `MINOR`: Incremented when functionality is added in a backward-compatible manner.
|
||||
* `PATCH`: Incremented for backward-compatible bug fixes.
|
||||
* 4.2. **Pre-release Suffixes**
|
||||
* `devN`: (e.g., `0.6.0.dev1`) Development release. These are typically unstable and used for internal testing or early feedback on new, unrefined features.
|
||||
* `aN`: (e.g., `0.6.0a1`) Alpha release. Indicates an early preview of a new version, potentially unstable, and APIs might still change.
|
||||
* `bN`: (e.g., `0.6.0b1`) Beta release. Generally feature-complete for the targeted minor or major version but may still contain bugs. APIs are mostly stable at this point.
|
||||
* `rcN`: (e.g., `0.6.0rc1`) Release Candidate. A version that is potentially the final release, undergoing final testing to catch critical bugs before official release.
|
||||
```
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,494 +0,0 @@
|
||||
The following is the "Foundational Memory" document for the `extraction` component of `crawl4ai`, generated from the outline provided and information drawn from the codebase and existing documentation.
|
||||
|
||||
```markdown
|
||||
# Detailed Outline for crawl4ai - extraction Component
|
||||
|
||||
**Target Document Type:** memory
|
||||
**Target Output Filename Suggestion:** `llm_memory_extraction.md`
|
||||
**Library Version Context:** 0.6.3
|
||||
**Outline Generation Date:** 2024-05-24
|
||||
---
|
||||
|
||||
## 1. Overview of Data Extraction in Crawl4ai
|
||||
|
||||
* 1.1. Purpose of the Extraction Component: The extraction component in Crawl4ai is responsible for extracting structured data from web content (HTML, text, Markdown) or PDF documents. It allows users to define how data should be identified and extracted, using various strategies ranging from rule-based (CSS, XPath, Regex) to LLM-powered approaches. Its goal is to transform raw crawled content into usable, structured information.
|
||||
* 1.2. Core Concepts:
|
||||
* 1.2.1. `ExtractionStrategy`: This is an abstract base class (interface) that defines the contract for all specific extraction methods. Each strategy implements how data is extracted from the provided content.
|
||||
* 1.2.2. `ChunkingStrategy`: This is an abstract base class (interface) for strategies that preprocess content by splitting it into smaller, manageable chunks. This is particularly relevant for LLM-based extraction strategies that have token limits for their input.
|
||||
* 1.2.3. Schemas: Schemas define the structure of the data to be extracted. For non-LLM strategies like `JsonCssExtractionStrategy` or `JsonXPathExtractionStrategy`, schemas are typically dictionary-based, specifying selectors and field types. For `LLMExtractionStrategy`, schemas can be Pydantic models or JSON schema dictionaries that guide the LLM in structuring its output.
|
||||
* 1.2.4. `CrawlerRunConfig`: The `CrawlerRunConfig` object allows users to specify which `extraction_strategy` and `chunking_strategy` (if applicable) should be used for a particular crawl operation via its `arun()` method.
|
||||
|
||||
## 2. `ExtractionStrategy` Interface
|
||||
|
||||
* 2.1. Purpose: The `ExtractionStrategy` class, found in `crawl4ai.extraction_strategy`, serves as an abstract base class (ABC) defining the standard interface for all data extraction strategies within the Crawl4ai library. Implementations of this class provide specific methods for extracting structured data from content.
|
||||
* 2.2. Key Abstract Methods:
|
||||
* `extract(self, url: str, content: str, *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||
* Description: Abstract method intended to extract meaningful blocks or chunks from the given content. Subclasses must implement this.
|
||||
* Parameters:
|
||||
* `url (str)`: The URL of the webpage.
|
||||
* `content (str)`: The HTML, Markdown, or text content of the webpage.
|
||||
* `*q`: Variable positional arguments.
|
||||
* `**kwargs`: Variable keyword arguments.
|
||||
* Returns: `List[Dict[str, Any]]` - A list of extracted blocks or chunks, typically as dictionaries.
|
||||
* `run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||
* Description: Abstract method to process sections of text, often in parallel by default implementations in subclasses. Subclasses must implement this.
|
||||
* Parameters:
|
||||
* `url (str)`: The URL of the webpage.
|
||||
* `sections (List[str])`: List of sections (strings) to process.
|
||||
* `*q`: Variable positional arguments.
|
||||
* `**kwargs`: Variable keyword arguments.
|
||||
* Returns: `List[Dict[str, Any]]` - A list of processed JSON blocks.
|
||||
* 2.3. Input Format Property:
|
||||
* `input_format (str)`: [Read-only] - An attribute indicating the expected input format for the content to be processed by the strategy (e.g., "markdown", "html", "fit_html", "text"). Default is "markdown".
|
||||
|
||||
## 3. Non-LLM Based Extraction Strategies
|
||||
|
||||
* ### 3.1. Class `NoExtractionStrategy`
|
||||
* 3.1.1. Purpose: A baseline `ExtractionStrategy` that performs no actual data extraction. It returns the input content as is, typically useful for scenarios where only raw or cleaned HTML/Markdown is needed without further structuring.
|
||||
* 3.1.2. Inheritance: `ExtractionStrategy`
|
||||
* 3.1.3. Initialization (`__init__`):
|
||||
* 3.1.3.1. Signature: `NoExtractionStrategy(**kwargs)`
|
||||
* 3.1.3.2. Parameters:
|
||||
* `**kwargs`: Passed to the base `ExtractionStrategy` initializer.
|
||||
* 3.1.4. Key Public Methods:
|
||||
* `extract(self, url: str, html: str, *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||
* Description: Returns the provided `html` content wrapped in a list containing a single dictionary: `[{"index": 0, "content": html}]`.
|
||||
* `run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||
* Description: Returns a list where each input section is wrapped in a dictionary: `[{"index": i, "tags": [], "content": section} for i, section in enumerate(sections)]`.
|
||||
|
||||
* ### 3.2. Class `JsonCssExtractionStrategy`
|
||||
* 3.2.1. Purpose: Extracts structured data from HTML content using a JSON schema that defines CSS selectors to locate and extract data for specified fields. It uses BeautifulSoup4 for parsing and selection.
|
||||
* 3.2.2. Inheritance: `JsonElementExtractionStrategy` (which inherits from `ExtractionStrategy`)
|
||||
* 3.2.3. Initialization (`__init__`):
|
||||
* 3.2.3.1. Signature: `JsonCssExtractionStrategy(schema: Dict[str, Any], **kwargs)`
|
||||
* 3.2.3.2. Parameters:
|
||||
* `schema (Dict[str, Any])`: The JSON schema defining extraction rules.
|
||||
* `**kwargs`: Passed to the base class initializer. Includes `input_format` (default: "html").
|
||||
* 3.2.4. Schema Definition for `JsonCssExtractionStrategy`:
|
||||
* 3.2.4.1. `name (str)`: A descriptive name for the schema (e.g., "ProductDetails").
|
||||
* 3.2.4.2. `baseSelector (str)`: The primary CSS selector that identifies each root element representing an item to be extracted (e.g., "div.product-item").
|
||||
* 3.2.4.3. `fields (List[Dict[str, Any]])`: A list of dictionaries, each defining a field to be extracted from within each `baseSelector` element.
|
||||
* Each field dictionary:
|
||||
* `name (str)`: The key for this field in the output JSON object.
|
||||
* `selector (str)`: The CSS selector for this field, relative to its parent element (either the `baseSelector` or a parent "nested" field).
|
||||
* `type (str)`: Specifies how to extract the data. Common values:
|
||||
* `"text"`: Extracts the text content of the selected element.
|
||||
* `"attribute"`: Extracts the value of a specified HTML attribute.
|
||||
* `"html"`: Extracts the raw inner HTML of the selected element.
|
||||
* `"list"`: Extracts a list of items. The `fields` sub-key then defines the structure of each item in the list (if objects) or the `selector` directly targets list elements for primitive values.
|
||||
* `"nested"`: Extracts a nested JSON object. The `fields` sub-key defines the structure of this nested object.
|
||||
* `attribute (str, Optional)`: Required if `type` is "attribute". Specifies the name of the HTML attribute to extract (e.g., "href", "src").
|
||||
* `fields (List[Dict[str, Any]], Optional)`: Required if `type` is "list" (for a list of objects) or "nested". Defines the structure of the nested object or list items.
|
||||
* `transform (str, Optional)`: A string indicating a transformation to apply to the extracted value (e.g., "lowercase", "uppercase", "strip").
|
||||
* `default (Any, Optional)`: A default value to use if the selector does not find an element or the attribute is missing.
|
||||
* 3.2.5. Key Public Methods:
|
||||
* `extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||
* Description: Parses the `html_content` and applies the defined schema to extract structured data using CSS selectors.
|
||||
* 3.2.6. Features:
|
||||
* 3.2.6.1. Nested Extraction: Supports extracting complex, nested JSON objects by defining "nested" type fields within the schema.
|
||||
* 3.2.6.2. List Handling: Supports extracting lists of primitive values (e.g., list of strings from multiple `<li>` tags) or lists of structured objects (e.g., a list of product details, each with its own fields).
|
||||
|
||||
* ### 3.3. Class `JsonXPathExtractionStrategy`
|
||||
* 3.3.1. Purpose: Extracts structured data from HTML/XML content using a JSON schema that defines XPath expressions to locate and extract data. It uses `lxml` for parsing and XPath evaluation.
|
||||
* 3.3.2. Inheritance: `JsonElementExtractionStrategy` (which inherits from `ExtractionStrategy`)
|
||||
* 3.3.3. Initialization (`__init__`):
|
||||
* 3.3.3.1. Signature: `JsonXPathExtractionStrategy(schema: Dict[str, Any], **kwargs)`
|
||||
* 3.3.3.2. Parameters:
|
||||
* `schema (Dict[str, Any])`: The JSON schema defining extraction rules, where selectors are XPath expressions.
|
||||
* `**kwargs`: Passed to the base class initializer. Includes `input_format` (default: "html").
|
||||
* 3.3.4. Schema Definition: The schema structure is identical to `JsonCssExtractionStrategy` (see 3.2.4), but the `baseSelector` and field `selector` values must be valid XPath expressions.
|
||||
* 3.3.5. Key Public Methods:
|
||||
* `extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||
* Description: Parses the `html_content` using `lxml` and applies the defined schema to extract structured data using XPath expressions.
|
||||
|
||||
* ### 3.4. Class `JsonLxmlExtractionStrategy`
|
||||
* 3.4.1. Purpose: Provides an alternative CSS selector-based extraction strategy leveraging the `lxml` library for parsing and selection, which can offer performance benefits over BeautifulSoup4 in some cases.
|
||||
* 3.4.2. Inheritance: `JsonCssExtractionStrategy` (and thus `JsonElementExtractionStrategy`, `ExtractionStrategy`)
|
||||
* 3.4.3. Initialization (`__init__`):
|
||||
* 3.4.3.1. Signature: `JsonLxmlExtractionStrategy(schema: Dict[str, Any], **kwargs)`
|
||||
* 3.4.3.2. Parameters:
|
||||
* `schema (Dict[str, Any])`: The JSON schema defining extraction rules, using CSS selectors.
|
||||
* `**kwargs`: Passed to the base class initializer. Includes `input_format` (default: "html").
|
||||
* 3.4.4. Schema Definition: Identical to `JsonCssExtractionStrategy` (see 3.2.4).
|
||||
* 3.4.5. Key Public Methods:
|
||||
* `extract(self, url: str, html_content: str, *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||
* Description: Parses the `html_content` using `lxml` and applies the defined schema to extract structured data using lxml's CSS selector capabilities (which are often translated to XPath internally).
|
||||
|
||||
* ### 3.5. Class `RegexExtractionStrategy`
|
||||
* 3.5.1. Purpose: Extracts data from text content (HTML, Markdown, or plain text) using a collection of regular expression patterns. Each match is returned as a structured dictionary.
|
||||
* 3.5.2. Inheritance: `ExtractionStrategy`
|
||||
* 3.5.3. Initialization (`__init__`):
|
||||
* 3.5.3.1. Signature: `RegexExtractionStrategy(patterns: Union[Dict[str, str], List[Tuple[str, str]], "RegexExtractionStrategy._B"] = _B.NOTHING, input_format: str = "fit_html", **kwargs)`
|
||||
* 3.5.3.2. Parameters:
|
||||
* `patterns (Union[Dict[str, str], List[Tuple[str, str]], "_B"], default: _B.NOTHING)`:
|
||||
* Description: Defines the regex patterns to use.
|
||||
* Can be a dictionary mapping labels to regex strings (e.g., `{"email": r"..."}`).
|
||||
* Can be a list of (label, regex_string) tuples.
|
||||
* Can be a bitwise OR combination of `RegexExtractionStrategy._B` enum members for using built-in patterns (e.g., `RegexExtractionStrategy.Email | RegexExtractionStrategy.Url`).
|
||||
* `input_format (str, default: "fit_html")`: Specifies the input format for the content. Options: "html" (raw HTML), "markdown" (Markdown from HTML), "text" (plain text from HTML), "fit_html" (content filtered for relevance before regex application).
|
||||
* `**kwargs`: Passed to the base `ExtractionStrategy`.
|
||||
* 3.5.4. Built-in Patterns (`RegexExtractionStrategy._B` Enum - an `IntFlag`):
|
||||
* `EMAIL (auto())`: Matches email addresses. Example pattern: `r"[\\w.+-]+@[\\w-]+\\.[\\w.-]+"`
|
||||
* `PHONE_INTL (auto())`: Matches international phone numbers. Example pattern: `r"\\+?\\d[\\d .()-]{7,}\\d"`
|
||||
* `PHONE_US (auto())`: Matches US phone numbers. Example pattern: `r"\\(?\\d{3}\\)?[-. ]?\\d{3}[-. ]?\\d{4}"`
|
||||
* `URL (auto())`: Matches URLs. Example pattern: `r"https?://[^\\s\\'\"<>]+"`
|
||||
* `IPV4 (auto())`: Matches IPv4 addresses. Example pattern: `r"(?:\\d{1,3}\\.){3}\\d{1,3}"`
|
||||
* `IPV6 (auto())`: Matches IPv6 addresses. Example pattern: `r"[A-F0-9]{1,4}(?::[A-F0-9]{1,4}){7}"`
|
||||
* `UUID (auto())`: Matches UUIDs. Example pattern: `r"[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}"`
|
||||
* `CURRENCY (auto())`: Matches currency amounts. Example pattern: `r"(?:USD|EUR|RM|\\$|€|¥|£)\\s?\\d+(?:[.,]\\d{2})?"`
|
||||
* `PERCENTAGE (auto())`: Matches percentages. Example pattern: `r"\\d+(?:\\.\\d+)?%"`
|
||||
* `NUMBER (auto())`: Matches numbers (integers, decimals). Example pattern: `r"\\b\\d{1,3}(?:[,.]?\\d{3})*(?:\\.\\d+)?\\b"`
|
||||
* `DATE_ISO (auto())`: Matches ISO 8601 dates (YYYY-MM-DD). Example pattern: `r"\\d{4}-\\d{2}-\\d{2}"`
|
||||
* `DATE_US (auto())`: Matches US-style dates (MM/DD/YYYY or MM/DD/YY). Example pattern: `r"\\d{1,2}/\\d{1,2}/\\d{2,4}"`
|
||||
* `TIME_24H (auto())`: Matches 24-hour time formats (HH:MM or HH:MM:SS). Example pattern: `r"\\b(?:[01]?\\d|2[0-3]):[0-5]\\d(?:[:.][0-5]\\d)?\\b"`
|
||||
* `POSTAL_US (auto())`: Matches US postal codes (ZIP codes). Example pattern: `r"\\b\\d{5}(?:-\\d{4})?\\b"`
|
||||
* `POSTAL_UK (auto())`: Matches UK postal codes. Example pattern: `r"\\b[A-Z]{1,2}\\d[A-Z\\d]? ?\\d[A-Z]{2}\\b"`
|
||||
* `HTML_COLOR_HEX (auto())`: Matches HTML hex color codes. Example pattern: `r"#[0-9A-Fa-f]{6}\\b"`
|
||||
* `TWITTER_HANDLE (auto())`: Matches Twitter handles. Example pattern: `r"@[\\w]{1,15}"`
|
||||
* `HASHTAG (auto())`: Matches hashtags. Example pattern: `r"#[\\w-]+"`
|
||||
* `MAC_ADDR (auto())`: Matches MAC addresses. Example pattern: `r"(?:[0-9A-Fa-f]{2}[:-]){5}[0-9A-Fa-f]{2}"`
|
||||
* `IBAN (auto())`: Matches IBANs. Example pattern: `r"[A-Z]{2}\\d{2}[A-Z0-9]{11,30}"`
|
||||
* `CREDIT_CARD (auto())`: Matches common credit card numbers. Example pattern: `r"\\b(?:4\\d{12}(?:\\d{3})?|5[1-5]\\d{14}|3[47]\\d{13}|6(?:011|5\\d{2})\\d{12})\\b"`
|
||||
* `ALL (_B(-1).value & ~_B.NOTHING.value)`: Includes all built-in patterns except `NOTHING`.
|
||||
* `NOTHING (_B(0).value)`: Includes no built-in patterns.
|
||||
* 3.5.5. Key Public Methods:
|
||||
* `extract(self, url: str, content: str, **kwargs) -> List[Dict[str, Any]]`:
|
||||
* Description: Applies all configured regex patterns (built-in and custom) to the input `content`.
|
||||
* Returns: `List[Dict[str, Any]]` - A list of dictionaries, where each dictionary represents a match and contains:
|
||||
* `"url" (str)`: The source URL.
|
||||
* `"label" (str)`: The label of the matching regex pattern.
|
||||
* `"value" (str)`: The actual matched string.
|
||||
* `"span" (Tuple[int, int])`: The start and end indices of the match within the content.
|
||||
* 3.5.6. Static Method: `generate_pattern`
|
||||
* 3.5.6.1. Signature: `staticmethod generate_pattern(label: str, html: str, query: Optional[str] = None, examples: Optional[List[str]] = None, llm_config: Optional[LLMConfig] = None, **kwargs) -> Dict[str, str]`
|
||||
* 3.5.6.2. Purpose: Uses an LLM to automatically generate a Python-compatible regular expression pattern for a given label, based on sample HTML content, an optional natural language query describing the target, and/or examples of desired matches.
|
||||
* 3.5.6.3. Parameters:
|
||||
* `label (str)`: A descriptive label for the pattern to be generated (e.g., "product_price", "article_date").
|
||||
* `html (str)`: The HTML content from which the pattern should be inferred.
|
||||
* `query (Optional[str], default: None)`: A natural language description of what kind of data the regex should capture (e.g., "Extract the publication date", "Find all ISBN numbers").
|
||||
* `examples (Optional[List[str]], default: None)`: A list of example strings that the generated regex should successfully match from the provided HTML.
|
||||
* `llm_config (Optional[LLMConfig], default: None)`: Configuration for the LLM to be used. If `None`, uses default `LLMConfig`.
|
||||
* `**kwargs`: Additional arguments passed to the LLM completion request (e.g., `temperature`, `max_tokens`).
|
||||
* 3.5.6.4. Returns: `Dict[str, str]` - A dictionary containing the generated pattern, in the format `{label: "regex_pattern_string"}`.
|
||||
|
||||
## 4. LLM-Based Extraction Strategies
|
||||
|
||||
* ### 4.1. Class `LLMExtractionStrategy`
|
||||
* 4.1.1. Purpose: Employs Large Language Models (LLMs) to extract either structured data according to a schema or relevant blocks of text based on natural language instructions from various content formats (HTML, Markdown, text).
|
||||
* 4.1.2. Inheritance: `ExtractionStrategy`
|
||||
* 4.1.3. Initialization (`__init__`):
|
||||
* 4.1.3.1. Signature: `LLMExtractionStrategy(llm_config: Optional[LLMConfig] = None, instruction: Optional[str] = None, schema: Optional[Union[Dict[str, Any], "BaseModel"]] = None, extraction_type: str = "block", chunk_token_threshold: int = CHUNK_TOKEN_THRESHOLD, overlap_rate: float = OVERLAP_RATE, word_token_rate: float = WORD_TOKEN_RATE, apply_chunking: bool = True, force_json_response: bool = False, **kwargs)`
|
||||
* 4.1.3.2. Parameters:
|
||||
* `llm_config (Optional[LLMConfig], default: None)`: Configuration for the LLM. If `None`, a default `LLMConfig` is created.
|
||||
* `instruction (Optional[str], default: None)`: Natural language instructions to guide the LLM's extraction process (e.g., "Extract the main article content", "Summarize the key points").
|
||||
* `schema (Optional[Union[Dict[str, Any], "BaseModel"]], default: None)`: A Pydantic model class or a dictionary representing a JSON schema. Used when `extraction_type` is "schema" to define the desired output structure.
|
||||
* `extraction_type (str, default: "block")`: Determines the extraction mode.
|
||||
* `"block"`: LLM identifies and extracts relevant blocks/chunks of text based on the `instruction`.
|
||||
* `"schema"`: LLM attempts to populate the fields defined in `schema` from the content.
|
||||
* `chunk_token_threshold (int, default: CHUNK_TOKEN_THRESHOLD)`: The target maximum number of tokens for each chunk of content sent to the LLM. `CHUNK_TOKEN_THRESHOLD` is defined in `crawl4ai.config` (default value: 10000).
|
||||
* `overlap_rate (float, default: OVERLAP_RATE)`: The percentage of overlap between consecutive chunks to ensure context continuity. `OVERLAP_RATE` is defined in `crawl4ai.config` (default value: 0.1, i.e., 10%).
|
||||
* `word_token_rate (float, default: WORD_TOKEN_RATE)`: An estimated ratio of words to tokens (e.g., 0.75 words per token). Used for approximating chunk boundaries. `WORD_TOKEN_RATE` is defined in `crawl4ai.config` (default value: 0.75).
|
||||
* `apply_chunking (bool, default: True)`: If `True`, the input content is chunked before being sent to the LLM. If `False`, the entire content is sent (which might exceed token limits for large inputs).
|
||||
* `force_json_response (bool, default: False)`: If `True` and `extraction_type` is "schema", instructs the LLM to strictly adhere to JSON output format.
|
||||
* `**kwargs`: Passed to `ExtractionStrategy` and potentially to the underlying LLM API calls (e.g., `temperature`, `max_tokens` if not set in `llm_config`).
|
||||
* 4.1.4. Key Public Methods:
|
||||
* `extract(self, url: str, content: str, *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||
* Description: Processes the input `content`. If `apply_chunking` is `True`, it first chunks the content using the specified `chunking_strategy` (or a default one if `LLMExtractionStrategy` manages it internally). Then, for each chunk (or the whole content if not chunked), it constructs a prompt based on `instruction` and/or `schema` and sends it to the configured LLM.
|
||||
* Returns: `List[Dict[str, Any]]` - A list of dictionaries.
|
||||
* If `extraction_type` is "block", each dictionary typically contains `{"index": int, "content": str, "tags": List[str]}`.
|
||||
* If `extraction_type` is "schema", each dictionary is an instance of the extracted structured data, ideally conforming to the provided `schema`. If the LLM returns multiple JSON objects in a list, they are parsed and returned.
|
||||
* `run(self, url: str, sections: List[str], *q, **kwargs) -> List[Dict[str, Any]]`:
|
||||
* Description: Processes a list of content `sections` in parallel (using `ThreadPoolExecutor`). Each section is passed to the `extract` method logic.
|
||||
* Returns: `List[Dict[str, Any]]` - Aggregated list of results from processing all sections.
|
||||
* 4.1.5. `TokenUsage` Tracking:
|
||||
* `total_usage (TokenUsage)`: [Read-only Public Attribute] - An instance of `TokenUsage` that accumulates the token counts (prompt, completion, total) from all LLM API calls made by this `LLMExtractionStrategy` instance.
|
||||
* `usages (List[TokenUsage])`: [Read-only Public Attribute] - A list containing individual `TokenUsage` objects for each separate LLM API call made during the extraction process. This allows for detailed tracking of token consumption per call.
|
||||
|
||||
## 5. `ChunkingStrategy` Interface and Implementations
|
||||
|
||||
* ### 5.1. Interface `ChunkingStrategy`
|
||||
* 5.1.1. Purpose: The `ChunkingStrategy` class, found in `crawl4ai.chunking_strategy`, is an abstract base class (ABC) that defines the interface for different content chunking algorithms. Chunking is used to break down large pieces of text or HTML into smaller, manageable segments, often before feeding them to an LLM or other processing steps.
|
||||
* 5.1.2. Key Abstract Methods:
|
||||
* `chunk(self, content: str) -> List[str]`:
|
||||
* Description: Abstract method that must be implemented by subclasses to split the input `content` string into a list of string chunks.
|
||||
* Parameters:
|
||||
* `content (str)`: The content to be chunked.
|
||||
* Returns: `List[str]` - A list of content chunks.
|
||||
|
||||
* ### 5.2. Class `RegexChunking`
|
||||
* 5.2.1. Purpose: Implements `ChunkingStrategy` by splitting content based on a list of regular expression patterns. It can also attempt to merge smaller chunks to meet a target `chunk_size`.
|
||||
* 5.2.2. Inheritance: `ChunkingStrategy`
|
||||
* 5.2.3. Initialization (`__init__`):
|
||||
* 5.2.3.1. Signature: `RegexChunking(patterns: Optional[List[str]] = None, chunk_size: Optional[int] = None, overlap: Optional[int] = None, word_token_ratio: Optional[float] = WORD_TOKEN_RATE, **kwargs)`
|
||||
* 5.2.3.2. Parameters:
|
||||
* `patterns (Optional[List[str]], default: None)`: A list of regex patterns used to split the text. If `None`, defaults to paragraph-based splitting (`["\\n\\n+"]`).
|
||||
* `chunk_size (Optional[int], default: None)`: The target token size for each chunk. If specified, the strategy will try to merge smaller chunks created by regex splitting to approximate this size.
|
||||
* `overlap (Optional[int], default: None)`: The target token overlap between consecutive chunks when `chunk_size` is active.
|
||||
* `word_token_ratio (Optional[float], default: WORD_TOKEN_RATE)`: The estimated ratio of words to tokens, used if `chunk_size` or `overlap` are specified. `WORD_TOKEN_RATE` is defined in `crawl4ai.config` (default value: 0.75).
|
||||
* `**kwargs`: Additional keyword arguments.
|
||||
* 5.2.4. Key Public Methods:
|
||||
* `chunk(self, content: str) -> List[str]`:
|
||||
* Description: Splits the input `content` using the configured regex patterns. If `chunk_size` is set, it then merges these initial chunks to meet the target size with the specified overlap.
|
||||
|
||||
* ### 5.3. Class `IdentityChunking`
|
||||
* 5.3.1. Purpose: A `ChunkingStrategy` that does not perform any actual chunking. It returns the input content as a single chunk in a list.
|
||||
* 5.3.2. Inheritance: `ChunkingStrategy`
|
||||
* 5.3.3. Initialization (`__init__`):
|
||||
* 5.3.3.1. Signature: `IdentityChunking(**kwargs)`
|
||||
* 5.3.3.2. Parameters:
|
||||
* `**kwargs`: Additional keyword arguments.
|
||||
* 5.3.4. Key Public Methods:
|
||||
* `chunk(self, content: str) -> List[str]`:
|
||||
* Description: Returns the input `content` as a single-element list: `[content]`.
|
||||
|
||||
## 6. Defining Schemas for Extraction
|
||||
|
||||
* 6.1. Purpose: Schemas provide a structured way to define what data needs to be extracted from content and how it should be organized. This allows for consistent and predictable output from the extraction process.
|
||||
* 6.2. Schemas for CSS/XPath/LXML-based Extraction (`JsonCssExtractionStrategy`, etc.):
|
||||
* 6.2.1. Format: These strategies use a dictionary-based JSON-like schema.
|
||||
* 6.2.2. Key elements: As detailed in section 3.2.4 for `JsonCssExtractionStrategy`:
|
||||
* `name (str)`: Name of the schema.
|
||||
* `baseSelector (str)`: CSS selector (for CSS strategies) or XPath expression (for XPath strategy) identifying the repeating parent elements.
|
||||
* `fields (List[Dict[str, Any]])`: A list defining each field to extract. Each field definition includes:
|
||||
* `name (str)`: Output key for the field.
|
||||
* `selector (str)`: CSS/XPath selector relative to the `baseSelector` or parent "nested" element.
|
||||
* `type (str)`: "text", "attribute", "html", "list", "nested".
|
||||
* `attribute (str, Optional)`: Name of HTML attribute (if type is "attribute").
|
||||
* `fields (List[Dict], Optional)`: For "list" (of objects) or "nested" types.
|
||||
* `transform (str, Optional)`: e.g., "lowercase".
|
||||
* `default (Any, Optional)`: Default value if not found.
|
||||
* 6.3. Schemas for LLM-based Extraction (`LLMExtractionStrategy`):
|
||||
* 6.3.1. Format: `LLMExtractionStrategy` accepts schemas in two main formats when `extraction_type="schema"`:
|
||||
* Pydantic models: The Pydantic model class itself.
|
||||
* Dictionary: A Python dictionary representing a valid JSON schema.
|
||||
* 6.3.2. Pydantic Models:
|
||||
* Definition: Users can define a Pydantic `BaseModel` where each field represents a piece of data to be extracted. Field types and descriptions are automatically inferred.
|
||||
* Conversion: `LLMExtractionStrategy` internally converts the Pydantic model to its JSON schema representation (`model_json_schema()`) to guide the LLM.
|
||||
* 6.3.3. Dictionary-based JSON Schema:
|
||||
* Structure: Users can provide a dictionary that conforms to the JSON Schema specification. This typically includes a `type: "object"` at the root and a `properties` dictionary defining each field, its type (e.g., "string", "number", "array", "object"), and optionally a `description`.
|
||||
* Usage: This schema is passed to the LLM to instruct it on the desired output format.
|
||||
|
||||
## 7. Configuration with `CrawlerRunConfig`
|
||||
|
||||
* 7.1. Purpose: The `CrawlerRunConfig` class (from `crawl4ai.async_configs`) is used to configure the behavior of a specific `arun()` or `arun_many()` call on an `AsyncWebCrawler` instance. It allows specifying various runtime parameters, including the extraction and chunking strategies.
|
||||
* 7.2. Key Attributes:
|
||||
* `extraction_strategy (Optional[ExtractionStrategy], default: None)`:
|
||||
* Purpose: Specifies the `ExtractionStrategy` instance to be used for processing the content obtained during the crawl. If `None`, no structured extraction beyond basic Markdown generation occurs (unless a default is applied by the crawler).
|
||||
* Type: An instance of a class inheriting from `ExtractionStrategy`.
|
||||
* `chunking_strategy (Optional[ChunkingStrategy], default: RegexChunking())`:
|
||||
* Purpose: Specifies the `ChunkingStrategy` instance to be used for breaking down content into smaller pieces before it's passed to an `ExtractionStrategy` (particularly `LLMExtractionStrategy`).
|
||||
* Type: An instance of a class inheriting from `ChunkingStrategy`.
|
||||
* Default: An instance of `RegexChunking()` with its default parameters (paragraph-based splitting).
|
||||
|
||||
## 8. LLM-Specific Configuration and Models
|
||||
|
||||
* ### 8.1. Class `LLMConfig`
|
||||
* 8.1.1. Purpose: The `LLMConfig` class (from `crawl4ai.async_configs`) centralizes configuration parameters for interacting with Large Language Models (LLMs) through various providers.
|
||||
* 8.1.2. Initialization (`__init__`):
|
||||
* 8.1.2.1. Signature:
|
||||
```python
|
||||
class LLMConfig:
|
||||
def __init__(
|
||||
self,
|
||||
provider: str = DEFAULT_PROVIDER,
|
||||
api_token: Optional[str] = None,
|
||||
base_url: Optional[str] = None,
|
||||
temperature: Optional[float] = None,
|
||||
max_tokens: Optional[int] = None,
|
||||
top_p: Optional[float] = None,
|
||||
frequency_penalty: Optional[float] = None,
|
||||
presence_penalty: Optional[float] = None,
|
||||
stop: Optional[List[str]] = None,
|
||||
n: Optional[int] = None,
|
||||
): ...
|
||||
```
|
||||
* 8.1.2.2. Parameters:
|
||||
* `provider (str, default: DEFAULT_PROVIDER)`: Specifies the LLM provider and model, e.g., "openai/gpt-4o-mini", "ollama/llama3.3". `DEFAULT_PROVIDER` is "openai/gpt-4o-mini".
|
||||
* `api_token (Optional[str], default: None)`: API token for the LLM provider. If `None`, the system attempts to read it from environment variables (e.g., `OPENAI_API_KEY`, `GEMINI_API_KEY`, `GROQ_API_KEY` based on provider). Can also be prefixed with "env:" (e.g., "env:MY_CUSTOM_LLM_KEY").
|
||||
* `base_url (Optional[str], default: None)`: Custom base URL for the LLM API endpoint, for self-hosted or alternative provider endpoints.
|
||||
* `temperature (Optional[float], default: None)`: Controls randomness in LLM generation. Higher values (e.g., 0.8) make output more random, lower (e.g., 0.2) more deterministic.
|
||||
* `max_tokens (Optional[int], default: None)`: Maximum number of tokens the LLM should generate in its response.
|
||||
* `top_p (Optional[float], default: None)`: Nucleus sampling parameter. An alternative to temperature; controls the cumulative probability mass of tokens considered for generation.
|
||||
* `frequency_penalty (Optional[float], default: None)`: Penalizes new tokens based on their existing frequency in the text so far, decreasing repetition.
|
||||
* `presence_penalty (Optional[float], default: None)`: Penalizes new tokens based on whether they have appeared in the text so far, encouraging new topics.
|
||||
* `stop (Optional[List[str]], default: None)`: A list of sequences where the API will stop generating further tokens.
|
||||
* `n (Optional[int], default: None)`: Number of completions to generate for each prompt.
|
||||
* 8.1.3. Helper Methods:
|
||||
* `from_kwargs(kwargs: dict) -> LLMConfig`:
|
||||
* Description: [Static method] Creates an `LLMConfig` instance from a dictionary of keyword arguments.
|
||||
* `to_dict() -> dict`:
|
||||
* Description: Converts the `LLMConfig` instance into a dictionary representation.
|
||||
* `clone(**kwargs) -> LLMConfig`:
|
||||
* Description: Creates a new `LLMConfig` instance as a copy of the current one, allowing specific attributes to be overridden with `kwargs`.
|
||||
|
||||
* ### 8.2. Dataclass `TokenUsage`
|
||||
* 8.2.1. Purpose: The `TokenUsage` dataclass (from `crawl4ai.models`) is used to store information about the number of tokens consumed during an LLM API call.
|
||||
* 8.2.2. Fields:
|
||||
* `completion_tokens (int, default: 0)`: The number of tokens generated by the LLM in the completion.
|
||||
* `prompt_tokens (int, default: 0)`: The number of tokens in the prompt sent to the LLM.
|
||||
* `total_tokens (int, default: 0)`: The sum of `completion_tokens` and `prompt_tokens`.
|
||||
* `completion_tokens_details (Optional[dict], default: None)`: Provider-specific detailed breakdown of completion tokens, if available.
|
||||
* `prompt_tokens_details (Optional[dict], default: None)`: Provider-specific detailed breakdown of prompt tokens, if available.
|
||||
|
||||
## 9. PDF Processing and Extraction
|
||||
|
||||
* ### 9.1. Overview of PDF Processing
|
||||
* 9.1.1. Purpose: Crawl4ai provides specialized strategies to handle PDF documents, enabling the fetching of PDF content and subsequent extraction of text, images, and metadata. This allows PDFs to be treated as a primary content source similar to HTML web pages.
|
||||
* 9.1.2. Key Components:
|
||||
* `PDFCrawlerStrategy`: For fetching/identifying PDF content.
|
||||
* `PDFContentScrapingStrategy`: For processing PDF content using an underlying PDF processor.
|
||||
* `NaivePDFProcessorStrategy`: The default logic for parsing PDF files.
|
||||
|
||||
* ### 9.2. Class `PDFCrawlerStrategy`
|
||||
* 9.2.1. Purpose: An implementation of `AsyncCrawlerStrategy` specifically for handling PDF documents. It doesn't perform typical browser interactions but focuses on fetching PDF content and setting the appropriate response headers to indicate a PDF document, which then allows `PDFContentScrapingStrategy` to process it.
|
||||
* 9.2.2. Inheritance: `AsyncCrawlerStrategy` (from `crawl4ai.async_crawler_strategy`)
|
||||
* 9.2.3. Initialization (`__init__`):
|
||||
* 9.2.3.1. Signature: `PDFCrawlerStrategy(logger: Optional[AsyncLogger] = None)`
|
||||
* 9.2.3.2. Parameters:
|
||||
* `logger (Optional[AsyncLogger], default: None)`: An optional logger instance for logging messages.
|
||||
* 9.2.4. Key Public Methods:
|
||||
* `crawl(self, url: str, **kwargs) -> AsyncCrawlResponse`:
|
||||
* Description: Fetches the content from the given `url`. If the content is identified as a PDF (either by URL extension or `Content-Type` header for remote URLs), it sets `response_headers={"Content-Type": "application/pdf"}` in the returned `AsyncCrawlResponse`. The `html` field of the response will contain a placeholder message as the actual PDF processing happens in the scraping strategy.
|
||||
* `close(self) -> None`:
|
||||
* Description: Placeholder for cleanup, typically does nothing in this strategy.
|
||||
* `__aenter__(self) -> "PDFCrawlerStrategy"`:
|
||||
* Description: Async context manager entry point.
|
||||
* `__aexit__(self, exc_type, exc_val, exc_tb) -> None`:
|
||||
* Description: Async context manager exit point, calls `close()`.
|
||||
|
||||
* ### 9.3. Class `PDFContentScrapingStrategy`
|
||||
* 9.3.1. Purpose: An implementation of `ContentScrapingStrategy` designed to process PDF documents. It uses an underlying `PDFProcessorStrategy` (by default, `NaivePDFProcessorStrategy`) to extract text, images, and metadata from the PDF, then formats this information into a `ScrapingResult`.
|
||||
* 9.3.2. Inheritance: `ContentScrapingStrategy` (from `crawl4ai.content_scraping_strategy`)
|
||||
* 9.3.3. Initialization (`__init__`):
|
||||
* 9.3.3.1. Signature: `PDFContentScrapingStrategy(save_images_locally: bool = False, extract_images: bool = False, image_save_dir: Optional[str] = None, batch_size: int = 4, logger: Optional[AsyncLogger] = None)`
|
||||
* 9.3.3.2. Parameters:
|
||||
* `save_images_locally (bool, default: False)`: If `True`, extracted images will be saved to the local filesystem.
|
||||
* `extract_images (bool, default: False)`: If `True`, the strategy will attempt to extract images from the PDF.
|
||||
* `image_save_dir (Optional[str], default: None)`: The directory where extracted images will be saved if `save_images_locally` is `True`. If `None`, a default or temporary directory might be used.
|
||||
* `batch_size (int, default: 4)`: The number of PDF pages to process in parallel by the underlying `NaivePDFProcessorStrategy`.
|
||||
* `logger (Optional[AsyncLogger], default: None)`: An optional logger instance.
|
||||
* 9.3.4. Key Attributes:
|
||||
* `pdf_processor (NaivePDFProcessorStrategy)`: An instance of `NaivePDFProcessorStrategy` configured with the provided image and batch settings, used to do the actual PDF parsing.
|
||||
* 9.3.5. Key Public Methods:
|
||||
* `scrape(self, url: str, html: str, **params) -> ScrapingResult`:
|
||||
* Description: Takes a `url` (which can be a local file path or a remote HTTP/HTTPS URL pointing to a PDF) and processes it. The `html` parameter is typically a placeholder like "Scraper will handle the real work" as the content comes from the PDF file itself. It downloads remote PDFs to a temporary local file before processing.
|
||||
* Returns: `ScrapingResult` containing the extracted PDF data, including `cleaned_html` (concatenated HTML of pages), `media` (extracted images), `links`, and `metadata`.
|
||||
* `ascrape(self, url: str, html: str, **kwargs) -> ScrapingResult`:
|
||||
* Description: Asynchronous version of `scrape`. Internally calls `scrape` using `asyncio.to_thread`.
|
||||
* 9.3.6. Internal Methods (Conceptual):
|
||||
* `_get_pdf_path(self, url: str) -> str`:
|
||||
* Description: If `url` is an HTTP/HTTPS URL, downloads the PDF to a temporary file and returns its path. If `url` starts with "file://", it strips the prefix and returns the local path. Otherwise, assumes `url` is already a local path. Handles download timeouts and errors.
|
||||
|
||||
* ### 9.4. Class `NaivePDFProcessorStrategy`
|
||||
* 9.4.1. Purpose: The default implementation of `PDFProcessorStrategy` in Crawl4ai. It uses the PyPDF2 library (and Pillow for image processing) to parse PDF files, extract text content page by page, attempt to extract embedded images, and gather document metadata.
|
||||
* 9.4.2. Inheritance: `PDFProcessorStrategy` (from `crawl4ai.processors.pdf.processor`)
|
||||
* 9.4.3. Dependencies: Requires `PyPDF2` and `Pillow`. These are installed with the `crawl4ai[pdf]` extra.
|
||||
* 9.4.4. Initialization (`__init__`):
|
||||
* 9.4.4.1. Signature: `NaivePDFProcessorStrategy(image_dpi: int = 144, image_quality: int = 85, extract_images: bool = True, save_images_locally: bool = False, image_save_dir: Optional[Path] = None, batch_size: int = 4)`
|
||||
* 9.4.4.2. Parameters:
|
||||
* `image_dpi (int, default: 144)`: DPI used when rendering PDF pages to images (if direct image extraction is not possible or disabled).
|
||||
* `image_quality (int, default: 85)`: Quality setting (1-100) for images saved in lossy formats like JPEG.
|
||||
* `extract_images (bool, default: True)`: If `True`, attempts to extract embedded images directly from the PDF's XObjects.
|
||||
* `save_images_locally (bool, default: False)`: If `True`, extracted images are saved to disk. Otherwise, they are base64 encoded and returned in the `PDFPage.images` data.
|
||||
* `image_save_dir (Optional[Path], default: None)`: If `save_images_locally` is True, this specifies the directory to save images. If `None`, a temporary directory (prefixed `pdf_images_`) is created and used.
|
||||
* `batch_size (int, default: 4)`: The number of pages to process in parallel when using the `process_batch` method.
|
||||
* 9.4.5. Key Public Methods:
|
||||
* `process(self, pdf_path: Path) -> PDFProcessResult`:
|
||||
* Description: Processes the PDF specified by `pdf_path` page by page sequentially.
|
||||
* Returns: `PDFProcessResult` containing metadata and a list of `PDFPage` objects.
|
||||
* `process_batch(self, pdf_path: Path) -> PDFProcessResult`:
|
||||
* Description: Processes the PDF specified by `pdf_path` by handling pages in parallel batches using a `ThreadPoolExecutor` with `max_workers` set to `batch_size`.
|
||||
* Returns: `PDFProcessResult` containing metadata and a list of `PDFPage` objects, assembled in the correct page order.
|
||||
* 9.4.6. Internal Methods (Conceptual High-Level):
|
||||
* `_process_page(self, page: PyPDF2PageObject, image_dir: Optional[Path]) -> PDFPage`: Extracts text, images (if `extract_images` is True), and links from a single PyPDF2 page object.
|
||||
* `_extract_images(self, page: PyPDF2PageObject, image_dir: Optional[Path]) -> List[Dict]`: Iterates through XObjects on a page, identifies images, decodes them (handling FlateDecode, DCTDecode, CCITTFaxDecode, JPXDecode), and either saves them locally or base64 encodes them.
|
||||
* `_extract_links(self, page: PyPDF2PageObject) -> List[str]`: Extracts URI actions from page annotations to get hyperlinks.
|
||||
* `_extract_metadata(self, pdf_path: Path, reader: PyPDF2PdfReader) -> PDFMetadata`: Reads metadata from the PDF document information dictionary (e.g., /Title, /Author, /CreationDate).
|
||||
|
||||
* ### 9.5. Data Models for PDF Processing
|
||||
* 9.5.1. Dataclass `PDFMetadata` (from `crawl4ai.processors.pdf.processor`)
|
||||
* Fields:
|
||||
* `title (Optional[str], default: None)`
|
||||
* `author (Optional[str], default: None)`
|
||||
* `producer (Optional[str], default: None)`
|
||||
* `created (Optional[datetime], default: None)`
|
||||
* `modified (Optional[datetime], default: None)`
|
||||
* `pages (int, default: 0)`
|
||||
* `encrypted (bool, default: False)`
|
||||
* `file_size (Optional[int], default: None)`
|
||||
* 9.5.2. Dataclass `PDFPage` (from `crawl4ai.processors.pdf.processor`)
|
||||
* Fields:
|
||||
* `page_number (int)`
|
||||
* `raw_text (str, default: "")`
|
||||
* `markdown (str, default: "")`: Markdown representation of the page's text content, processed by `clean_pdf_text`.
|
||||
* `html (str, default: "")`: HTML representation of the page's text content, processed by `clean_pdf_text_to_html`.
|
||||
* `images (List[Dict], default_factory: list)`: List of image dictionaries. Each dictionary contains:
|
||||
* `format (str)`: e.g., "png", "jpeg", "tiff", "jp2", "bin".
|
||||
* `width (int)`
|
||||
* `height (int)`
|
||||
* `color_space (str)`: e.g., "/DeviceRGB", "/DeviceGray".
|
||||
* `bits_per_component (int)`
|
||||
* `path (str, Optional)`: If `save_images_locally` was True, path to the saved image file.
|
||||
* `data (str, Optional)`: If `save_images_locally` was False, base64 encoded image data.
|
||||
* `page (int)`: The page number this image was extracted from.
|
||||
* `links (List[str], default_factory: list)`: List of hyperlink URLs found on the page.
|
||||
* `layout (List[Dict], default_factory: list)`: List of dictionaries representing text layout elements, primarily: `{"type": "text", "text": str, "x": float, "y": float}`.
|
||||
* 9.5.3. Dataclass `PDFProcessResult` (from `crawl4ai.processors.pdf.processor`)
|
||||
* Fields:
|
||||
* `metadata (PDFMetadata)`
|
||||
* `pages (List[PDFPage])`
|
||||
* `processing_time (float, default: 0.0)`: Time in seconds taken to process the PDF.
|
||||
* `version (str, default: "1.1")`: Version of the PDF processor strategy (e.g., "1.1" for current `NaivePDFProcessorStrategy`).
|
||||
|
||||
* ### 9.6. Using PDF Strategies with `AsyncWebCrawler`
|
||||
* 9.6.1. Workflow:
|
||||
1. Instantiate `AsyncWebCrawler`. The `crawler_strategy` parameter of `AsyncWebCrawler` should be set to an instance of `PDFCrawlerStrategy` if you intend to primarily crawl PDF URLs or local PDF files directly. If crawling mixed content where PDFs are discovered via links on HTML pages, the default `AsyncPlaywrightCrawlerStrategy` might be used initially, and then a PDF-specific scraping strategy would be applied when a PDF content type is detected.
|
||||
2. In `CrawlerRunConfig`, set the `scraping_strategy` attribute to an instance of `PDFContentScrapingStrategy`. Configure this strategy with desired options like `extract_images`, `save_images_locally`, etc.
|
||||
3. When `crawler.arun(url="path/to/document.pdf", config=run_config)` is called for a PDF URL or local file path:
|
||||
* `PDFCrawlerStrategy` (if used) or the default crawler strategy fetches the file.
|
||||
* `PDFContentScrapingStrategy.scrape()` is invoked. It uses its internal `NaivePDFProcessorStrategy` instance to parse the PDF.
|
||||
* The extracted text, image data, and metadata are populated into the `CrawlResult` object (e.g., `result.markdown`, `result.media["images"]`, `result.metadata`).
|
||||
* 9.6.2. Example Snippet:
|
||||
```python
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, PDFCrawlerStrategy
|
||||
from crawl4ai.processors.pdf import PDFContentScrapingStrategy
|
||||
|
||||
async def main():
|
||||
# Setup for PDF processing
|
||||
pdf_crawler_strategy = PDFCrawlerStrategy() # Use if directly targeting PDF URLs
|
||||
pdf_scraping_strategy = PDFContentScrapingStrategy(
|
||||
extract_images=True,
|
||||
save_images_locally=True,
|
||||
image_save_dir="./pdf_images_output" # Ensure this directory exists
|
||||
)
|
||||
Path("./pdf_images_output").mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# If crawling a website that links to PDFs, you might use the default crawler strategy
|
||||
# and rely on content-type detection to switch to PDFContentScrapingStrategy if needed.
|
||||
# For direct PDF URL:
|
||||
async with AsyncWebCrawler(crawler_strategy=pdf_crawler_strategy) as crawler:
|
||||
run_config = CrawlerRunConfig(scraping_strategy=pdf_scraping_strategy)
|
||||
# Example PDF URL (replace with a real one for testing)
|
||||
pdf_url = "https://www.w3.org/WAI/ER/tests/xhtml/testfiles/resources/pdf/dummy.pdf"
|
||||
result = await crawler.arun(url=pdf_url, config=run_config)
|
||||
|
||||
if result.success:
|
||||
print(f"Successfully processed PDF: {result.url}")
|
||||
if result.markdown:
|
||||
print(f"Markdown content (first 500 chars): {result.markdown.raw_markdown[:500]}")
|
||||
if result.media and result.media.images:
|
||||
print(f"Extracted {len(result.media.images)} images.")
|
||||
for img in result.media.images:
|
||||
print(f" - Image source/path: {img.src or img.path}, Page: {img.page}")
|
||||
if result.metadata:
|
||||
print(f"PDF Metadata: {result.metadata}")
|
||||
else:
|
||||
print(f"Failed to process PDF: {result.url}, Error: {result.error_message}")
|
||||
|
||||
# if __name__ == "__main__":
|
||||
# asyncio.run(main())
|
||||
```
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,818 +0,0 @@
|
||||
# Examples Outline for crawl4ai - markdown Component
|
||||
|
||||
**Target Document Type:** Examples Collection
|
||||
**Target Output Filename Suggestion:** `llm_examples_markdown.md`
|
||||
**Library Version Context:** 0.6.3
|
||||
**Outline Generation Date:** 2025-05-24
|
||||
---
|
||||
|
||||
This document provides practical, runnable code examples for the `markdown` component of the `crawl4ai` library, focusing on the `DefaultMarkdownGenerator` and its various configurations.
|
||||
|
||||
## 1. Basic Markdown Generation with `DefaultMarkdownGenerator`
|
||||
|
||||
### 1.1. Example: Generating Markdown with default `DefaultMarkdownGenerator` settings via `AsyncWebCrawler`.
|
||||
This example demonstrates the most basic usage of `DefaultMarkdownGenerator` within an `AsyncWebCrawler` run.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||
|
||||
async def basic_markdown_generation_via_crawler():
|
||||
# DefaultMarkdownGenerator will be used by default if markdown_generator is not specified,
|
||||
# but we explicitly set it here for clarity.
|
||||
md_generator = DefaultMarkdownGenerator()
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=md_generator,
|
||||
cache_mode=CacheMode.BYPASS # Use BYPASS for fresh content in examples
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
if result.success and result.markdown:
|
||||
print("--- Raw Markdown (First 300 chars) ---")
|
||||
print(result.markdown.raw_markdown[:300])
|
||||
print("\n--- Markdown with Citations (First 300 chars) ---")
|
||||
print(result.markdown.markdown_with_citations[:300])
|
||||
print("\n--- References Markdown ---")
|
||||
print(result.markdown.references_markdown) # example.com has no outbound links usually
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(basic_markdown_generation_via_crawler())
|
||||
```
|
||||
---
|
||||
|
||||
### 1.2. Example: Direct instantiation and use of `DefaultMarkdownGenerator`.
|
||||
You can use `DefaultMarkdownGenerator` directly if you already have HTML content.
|
||||
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
|
||||
def direct_markdown_generation():
|
||||
generator = DefaultMarkdownGenerator()
|
||||
html_content = """
|
||||
<html>
|
||||
<head><title>Test Page</title></head>
|
||||
<body>
|
||||
<h1>Welcome to Example</h1>
|
||||
<p>This is a paragraph with a <a href="https://example.org/another-page">link</a>.</p>
|
||||
<p>Another paragraph follows.</p>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
# base_url is important for resolving relative links if any, and for citation context
|
||||
result_md = generator.generate_markdown(input_html=html_content, base_url="https://example.com")
|
||||
|
||||
print("--- Raw Markdown (Direct Generation) ---")
|
||||
print(result_md.raw_markdown)
|
||||
print("\n--- Markdown with Citations (Direct Generation) ---")
|
||||
print(result_md.markdown_with_citations)
|
||||
print("\n--- References Markdown (Direct Generation) ---")
|
||||
print(result_md.references_markdown)
|
||||
|
||||
if __name__ == "__main__":
|
||||
direct_markdown_generation()
|
||||
```
|
||||
---
|
||||
|
||||
## 2. Citation Management in Markdown
|
||||
|
||||
### 2.1. Example: Default citation behavior (citations enabled).
|
||||
By default, `DefaultMarkdownGenerator` generates citations for links.
|
||||
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
|
||||
def default_citation_behavior():
|
||||
generator = DefaultMarkdownGenerator()
|
||||
html_content = """
|
||||
<html><body>
|
||||
<p>Check out <a href="https://crawl4ai.com" title="Crawl4ai Homepage">Crawl4ai</a> and
|
||||
<a href="/docs">our documentation</a>.</p>
|
||||
</body></html>
|
||||
"""
|
||||
result_md = generator.generate_markdown(input_html=html_content, base_url="https://example.com")
|
||||
|
||||
print("--- Raw Markdown ---")
|
||||
print(result_md.raw_markdown)
|
||||
print("\n--- Markdown with Citations ---")
|
||||
print(result_md.markdown_with_citations)
|
||||
print("\n--- References Markdown ---")
|
||||
print(result_md.references_markdown)
|
||||
|
||||
if __name__ == "__main__":
|
||||
default_citation_behavior()
|
||||
```
|
||||
---
|
||||
|
||||
### 2.2. Example: Disabling citations in `DefaultMarkdownGenerator`.
|
||||
You can disable citation generation by setting `citations=False` in the `generate_markdown` method.
|
||||
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
|
||||
def disabling_citations():
|
||||
generator = DefaultMarkdownGenerator()
|
||||
html_content = """
|
||||
<html><body>
|
||||
<p>A link to <a href="https://anothersite.com">another site</a> will not be cited.</p>
|
||||
</body></html>
|
||||
"""
|
||||
# Disable citations for this specific call
|
||||
result_md_no_citations = generator.generate_markdown(
|
||||
input_html=html_content,
|
||||
base_url="https://example.com",
|
||||
citations=False
|
||||
)
|
||||
|
||||
print("--- Raw Markdown (Citations Disabled) ---")
|
||||
print(result_md_no_citations.raw_markdown)
|
||||
print("\n--- Markdown with Citations (Citations Disabled) ---")
|
||||
# This should be the same as raw_markdown when citations=False
|
||||
print(result_md_no_citations.markdown_with_citations)
|
||||
print("\n--- References Markdown (Citations Disabled) ---")
|
||||
# This should be empty or minimal
|
||||
print(result_md_no_citations.references_markdown)
|
||||
|
||||
# For comparison, with citations enabled (default)
|
||||
result_md_with_citations = generator.generate_markdown(
|
||||
input_html=html_content,
|
||||
base_url="https://example.com",
|
||||
citations=True # Default
|
||||
)
|
||||
print("\n--- For Comparison: Markdown with Citations (Enabled) ---")
|
||||
print(result_md_with_citations.markdown_with_citations)
|
||||
print("\n--- For Comparison: References Markdown (Enabled) ---")
|
||||
print(result_md_with_citations.references_markdown)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
disabling_citations()
|
||||
```
|
||||
---
|
||||
|
||||
### 2.3. Example: Impact of `base_url` on citation links for relative URLs.
|
||||
The `base_url` parameter is crucial for correctly resolving relative URLs in your HTML content into absolute URLs in the references.
|
||||
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
|
||||
def base_url_impact_on_citations():
|
||||
generator = DefaultMarkdownGenerator()
|
||||
html_content = """
|
||||
<html><body>
|
||||
<p>Links: <a href="/features">Features</a>, <a href="pricing.html">Pricing</a>,
|
||||
and an absolute link to <a href="https://external.com/resource">External Resource</a>.</p>
|
||||
</body></html>
|
||||
"""
|
||||
|
||||
print("--- Case 1: With base_url='https://example.com/products/' ---")
|
||||
result_md_case1 = generator.generate_markdown(
|
||||
input_html=html_content,
|
||||
base_url="https://example.com/products/"
|
||||
)
|
||||
print(result_md_case1.references_markdown)
|
||||
|
||||
print("\n--- Case 2: With base_url='https://another-domain.net/' ---")
|
||||
result_md_case2 = generator.generate_markdown(
|
||||
input_html=html_content,
|
||||
base_url="https://another-domain.net/"
|
||||
)
|
||||
print(result_md_case2.references_markdown)
|
||||
|
||||
print("\n--- Case 3: Without base_url (relative links might be incomplete) ---")
|
||||
result_md_case3 = generator.generate_markdown(input_html=html_content)
|
||||
print(result_md_case3.references_markdown)
|
||||
|
||||
if __name__ == "__main__":
|
||||
base_url_impact_on_citations()
|
||||
```
|
||||
---
|
||||
|
||||
### 2.4. Example: Handling HTML with no links (empty `references_markdown`).
|
||||
If the input HTML contains no hyperlinks, the `references_markdown` will be empty.
|
||||
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
|
||||
def no_links_in_html():
|
||||
generator = DefaultMarkdownGenerator()
|
||||
html_content = "<html><body><p>This is a paragraph with no links at all.</p><b>Just some bold text.</b></body></html>"
|
||||
result_md = generator.generate_markdown(input_html=html_content, base_url="https://example.com")
|
||||
|
||||
print("--- Raw Markdown ---")
|
||||
print(result_md.raw_markdown)
|
||||
print("\n--- Markdown with Citations ---")
|
||||
print(result_md.markdown_with_citations) # Should be same as raw_markdown
|
||||
print("\n--- References Markdown ---")
|
||||
print(f"'{result_md.references_markdown}'") # Should be empty or contain minimal boilerplate
|
||||
|
||||
if __name__ == "__main__":
|
||||
no_links_in_html()
|
||||
```
|
||||
---
|
||||
|
||||
## 3. Controlling `html2text` Conversion Options
|
||||
The `DefaultMarkdownGenerator` uses the `html2text` library internally. You can pass options to `html2text` either during generator initialization (`options` parameter) or during the `generate_markdown` call (`html2text_options` parameter).
|
||||
|
||||
### 3.1. Example: Initializing `DefaultMarkdownGenerator` with `options` to ignore links.
|
||||
This will prevent links from appearing in the Markdown output altogether (different from `citations=False` which keeps link text but omits citation markers).
|
||||
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
|
||||
def ignore_links_option():
|
||||
# Initialize with html2text option to ignore links
|
||||
generator = DefaultMarkdownGenerator(options={"ignore_links": True})
|
||||
html_content = "<html><body><p>A link to <a href='https://example.com'>Example Site</a> and some text.</p></body></html>"
|
||||
result_md = generator.generate_markdown(input_html=html_content)
|
||||
|
||||
print("--- Markdown (ignore_links=True) ---")
|
||||
print(result_md.raw_markdown) # Link text might be present or absent based on html2text behavior
|
||||
print("--- Markdown with Citations (ignore_links=True) ---")
|
||||
print(result_md.markdown_with_citations) # No citations as links are ignored
|
||||
print("--- References (ignore_links=True) ---")
|
||||
print(f"'{result_md.references_markdown}'") # Should be empty
|
||||
|
||||
if __name__ == "__main__":
|
||||
ignore_links_option()
|
||||
```
|
||||
---
|
||||
|
||||
### 3.2. Example: Initializing `DefaultMarkdownGenerator` with `options` to ignore images.
|
||||
This will prevent image references (like `![My Test Image](image.png)`) from appearing in the Markdown.
|
||||
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
|
||||
def ignore_images_option():
|
||||
generator = DefaultMarkdownGenerator(options={"ignore_images": True})
|
||||
html_content = "<html><body><p>An image: <img src='image.png' alt='My Test Image'></p></body></html>"
|
||||
result_md = generator.generate_markdown(input_html=html_content)
|
||||
|
||||
print("--- Markdown (ignore_images=True) ---")
|
||||
print(result_md.raw_markdown) # Image markdown should be absent
|
||||
|
||||
if __name__ == "__main__":
|
||||
ignore_images_option()
|
||||
```
|
||||
---
|
||||
|
||||
### 3.3. Example: Initializing `DefaultMarkdownGenerator` with `options` for `body_width=0` (no line wrapping).
|
||||
`body_width=0` tells `html2text` not to wrap lines.
|
||||
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
|
||||
def no_line_wrapping_option():
|
||||
generator = DefaultMarkdownGenerator(options={"body_width": 0})
|
||||
long_text = "This is a very long line of text that would normally be wrapped by html2text. " * 5
|
||||
html_content = f"<html><body><p>{long_text}</p></body></html>"
|
||||
result_md = generator.generate_markdown(input_html=html_content)
|
||||
|
||||
print("--- Markdown (body_width=0) ---")
|
||||
print(result_md.raw_markdown) # Observe the long line without soft wraps
|
||||
|
||||
if __name__ == "__main__":
|
||||
no_line_wrapping_option()
|
||||
```
|
||||
---
|
||||
|
||||
### 3.4. Example: Initializing `DefaultMarkdownGenerator` to disable emphasis.
|
||||
This will remove formatting for `<em>` and `<strong>` tags.
|
||||
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
|
||||
def ignore_emphasis_option():
|
||||
generator = DefaultMarkdownGenerator(options={"ignore_emphasis": True})
|
||||
html_content = "<html><body><p>Normal, <em>emphasized</em>, and <strong>strongly emphasized</strong> text.</p></body></html>"
|
||||
result_md = generator.generate_markdown(input_html=html_content)
|
||||
|
||||
print("--- Markdown (ignore_emphasis=True) ---")
|
||||
print(result_md.raw_markdown) # Emphasis should be gone
|
||||
|
||||
if __name__ == "__main__":
|
||||
ignore_emphasis_option()
|
||||
```
|
||||
---
|
||||
|
||||
### 3.5. Example: Overriding `html2text_options` at `generate_markdown` call time.
|
||||
Options passed to `generate_markdown` via `html2text_options` take precedence.
|
||||
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
|
||||
def override_html2text_options():
|
||||
# Initial generator might have some defaults
|
||||
generator = DefaultMarkdownGenerator(options={"ignore_links": False})
|
||||
html_content = "<html><body><p>Link: <a href='https://example.com'>Example</a>.</p></body></html>"
|
||||
|
||||
# Override at call time to protect links
|
||||
result_md = generator.generate_markdown(
|
||||
input_html=html_content,
|
||||
html2text_options={"protect_links": True} # Links will be <URL>
|
||||
)
|
||||
|
||||
print("--- Markdown (protect_links=True via call-time override) ---")
|
||||
print(result_md.raw_markdown)
|
||||
|
||||
if __name__ == "__main__":
|
||||
override_html2text_options()
|
||||
```
|
||||
---
|
||||
|
||||
### 3.6. Example: Combining multiple `html2text` options.
|
||||
Multiple options can be combined for fine-grained control over the Markdown output.
|
||||
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
|
||||
def combined_html2text_options():
|
||||
generator = DefaultMarkdownGenerator(options={
|
||||
"ignore_links": True,
|
||||
"ignore_images": True,
|
||||
"body_width": 60 # Wrap at 60 characters
|
||||
})
|
||||
html_content = """
|
||||
<html><body>
|
||||
<p>This is a paragraph with a <a href='https://example.com'>link to ignore</a> and an
|
||||
<img src='image.png' alt='image to ignore'>. It also has some long text to demonstrate wrapping.
|
||||
Lorem ipsum dolor sit amet, consectetur adipiscing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua.
|
||||
</p>
|
||||
</body></html>
|
||||
"""
|
||||
result_md = generator.generate_markdown(input_html=html_content)
|
||||
|
||||
print("--- Markdown (Combined Options: ignore_links, ignore_images, body_width=60) ---")
|
||||
print(result_md.raw_markdown)
|
||||
|
||||
if __name__ == "__main__":
|
||||
combined_html2text_options()
|
||||
```
|
||||
---
|
||||
|
||||
## 4. Selecting the HTML Content Source for Markdown Generation
|
||||
The `DefaultMarkdownGenerator` can generate Markdown from different HTML sources within the `CrawlResult`.
|
||||
|
||||
### 4.1. Example: Markdown from `cleaned_html` (default `content_source`).
|
||||
This is the default behavior. `cleaned_html` is the HTML after `WebScrapingStrategy` (e.g., `LXMLWebScrapingStrategy`) has processed it.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||
|
||||
async def markdown_from_cleaned_html():
|
||||
# Default content_source is "cleaned_html"
|
||||
md_generator = DefaultMarkdownGenerator()
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=md_generator,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Using a more complex page to see the effect of cleaning
|
||||
result = await crawler.arun(url="https://news.ycombinator.com", config=config)
|
||||
if result.success and result.markdown:
|
||||
print("--- Markdown from Cleaned HTML (Default - First 300 chars) ---")
|
||||
print(result.markdown.raw_markdown[:300])
|
||||
# For comparison, show a snippet of cleaned_html
|
||||
print("\n--- Cleaned HTML (Source - First 300 chars) ---")
|
||||
print(result.cleaned_html[:300])
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(markdown_from_cleaned_html())
|
||||
```
|
||||
---
|
||||
|
||||
### 4.2. Example: Markdown from `raw_html`.
|
||||
This example uses the original, unprocessed HTML fetched from the URL as the source for Markdown generation.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||
|
||||
async def markdown_from_raw_html():
|
||||
md_generator = DefaultMarkdownGenerator(content_source="raw_html")
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=md_generator,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
if result.success and result.markdown:
|
||||
print("--- Markdown from Raw HTML (First 300 chars) ---")
|
||||
print(result.markdown.raw_markdown[:300])
|
||||
print("\n--- Raw Page HTML (Source - First 300 chars for comparison) ---")
|
||||
print(result.html[:300]) # result.html contains the raw HTML
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(markdown_from_raw_html())
|
||||
```
|
||||
---
|
||||
|
||||
### 4.3. Example: Markdown from `fit_html` (requires a `ContentFilterStrategy`).
|
||||
`fit_html` is the HTML content after a `ContentFilterStrategy` (like `PruningContentFilter`) has processed it.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
|
||||
async def markdown_from_fit_html():
|
||||
# A content filter must run to produce fit_html
|
||||
pruning_filter = PruningContentFilter()
|
||||
md_generator = DefaultMarkdownGenerator(
|
||||
content_filter=pruning_filter,
|
||||
content_source="fit_html" # Explicitly use the output of the filter
|
||||
)
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=md_generator,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Using a news site which PruningContentFilter can work on
|
||||
result = await crawler.arun(url="https://news.ycombinator.com", config=config)
|
||||
if result.success and result.markdown:
|
||||
print("--- Markdown from Fit HTML (Output of PruningFilter - First 300 chars) ---")
|
||||
# When content_source="fit_html", result.markdown.raw_markdown IS from fit_html
|
||||
print(result.markdown.raw_markdown[:300])
|
||||
print("\n--- Fit HTML itself (Source - First 300 chars for comparison) ---")
|
||||
print(result.markdown.fit_html[:300])
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(markdown_from_fit_html())
|
||||
```
|
||||
---
|
||||
|
||||
## 5. Integration with Content Filters
|
||||
`DefaultMarkdownGenerator` can work in conjunction with `ContentFilterStrategy` instances. If a filter is provided, it will produce `fit_html` and `fit_markdown`.
|
||||
|
||||
### 5.1. Example: `DefaultMarkdownGenerator` with `PruningContentFilter`.
|
||||
The `PruningContentFilter` attempts to remove boilerplate and keep main content.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
|
||||
async def md_with_pruning_filter():
|
||||
pruning_filter = PruningContentFilter()
|
||||
# By default, raw_markdown is from cleaned_html, fit_markdown is from fit_html
|
||||
md_generator = DefaultMarkdownGenerator(content_filter=pruning_filter)
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=md_generator,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://news.ycombinator.com", config=config)
|
||||
if result.success and result.markdown:
|
||||
print("--- Raw Markdown (from cleaned_html - First 200 chars) ---")
|
||||
print(result.markdown.raw_markdown[:200])
|
||||
print("\n--- Fit Markdown (from PruningFilter's fit_html - First 200 chars) ---")
|
||||
print(result.markdown.fit_markdown[:200])
|
||||
print("\n--- Fit HTML (Source for Fit Markdown - First 200 chars) ---")
|
||||
print(result.markdown.fit_html[:200])
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(md_with_pruning_filter())
|
||||
```
|
||||
---
|
||||
|
||||
### 5.2. Example: `DefaultMarkdownGenerator` with `BM25ContentFilter`.
|
||||
`BM25ContentFilter` filters content based on relevance to a user query.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||
from crawl4ai.content_filter_strategy import BM25ContentFilter
|
||||
|
||||
async def md_with_bm25_filter():
|
||||
bm25_filter = BM25ContentFilter(user_query="Python programming language features")
|
||||
md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=md_generator,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Using a relevant page for the query
|
||||
result = await crawler.arun(url="https://docs.python.org/3/tutorial/classes.html", config=config)
|
||||
if result.success and result.markdown:
|
||||
print("--- Fit Markdown (from BM25Filter - First 300 chars) ---")
|
||||
print(result.markdown.fit_markdown[:300])
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(md_with_bm25_filter())
|
||||
```
|
||||
---
|
||||
|
||||
### 5.3. Example: `DefaultMarkdownGenerator` with `LLMContentFilter`.
|
||||
`LLMContentFilter` uses an LLM to intelligently filter or summarize content based on instructions. (Requires API Key)
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
import os
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, LLMConfig, CacheMode
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def md_with_llm_filter():
|
||||
openai_api_key = os.getenv("OPENAI_API_KEY")
|
||||
if not openai_api_key:
|
||||
print("OPENAI_API_KEY not found. Skipping LLMContentFilter example.")
|
||||
return
|
||||
|
||||
llm_config = LLMConfig(api_token=openai_api_key, provider="openai/gpt-3.5-turbo")
|
||||
llm_filter = LLMContentFilter(
|
||||
llm_config=llm_config,
|
||||
instruction="Summarize the main arguments presented in this Hacker News discussion thread."
|
||||
)
|
||||
md_generator = DefaultMarkdownGenerator(content_filter=llm_filter)
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=md_generator,
|
||||
cache_mode=CacheMode.BYPASS # Fresh run for LLM
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Example Hacker News discussion
|
||||
result = await crawler.arun(url="https://news.ycombinator.com/item?id=39000000", config=config) # A past popular item
|
||||
if result.success and result.markdown:
|
||||
print("--- Fit Markdown (from LLMContentFilter - First 500 chars) ---")
|
||||
print(result.markdown.fit_markdown[:500])
|
||||
llm_filter.show_usage() # Show token usage
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(md_with_llm_filter())
|
||||
```
|
||||
---
|
||||
|
||||
### 5.4. Example: Forcing Markdown generation from `fit_html` when a filter is active.
|
||||
This example shows how to ensure the `raw_markdown` itself is generated from the `fit_html` (output of the filter) rather than `cleaned_html`.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
|
||||
async def md_forced_from_fit_html():
|
||||
pruning_filter = PruningContentFilter()
|
||||
# Explicitly set content_source to "fit_html"
|
||||
md_generator = DefaultMarkdownGenerator(
|
||||
content_filter=pruning_filter,
|
||||
content_source="fit_html"
|
||||
)
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=md_generator,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://news.ycombinator.com", config=config)
|
||||
if result.success and result.markdown:
|
||||
print("--- Raw Markdown (forced from fit_html - First 300 chars) ---")
|
||||
# This raw_markdown is now generated from the output of PruningFilter
|
||||
print(result.markdown.raw_markdown[:300])
|
||||
print("\n--- Fit HTML (Source for Raw Markdown - First 300 chars) ---")
|
||||
print(result.markdown.fit_html[:300])
|
||||
print("\n--- Fit Markdown (should be same as Raw Markdown here - First 300 chars) ---")
|
||||
print(result.markdown.fit_markdown[:300])
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(md_forced_from_fit_html())
|
||||
```
|
||||
---
|
||||
|
||||
### 5.5. Example: Markdown generation when no filter is active.
|
||||
If no `content_filter` is provided to `DefaultMarkdownGenerator`, `fit_markdown` and `fit_html` will be empty or None.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||
|
||||
async def md_no_filter():
|
||||
md_generator = DefaultMarkdownGenerator() # No filter provided
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=md_generator,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
if result.success and result.markdown:
|
||||
print("--- Raw Markdown (First 300 chars) ---")
|
||||
print(result.markdown.raw_markdown[:300])
|
||||
print("\n--- Fit Markdown (Expected: None or empty) ---")
|
||||
print(result.markdown.fit_markdown)
|
||||
print("\n--- Fit HTML (Expected: None or empty) ---")
|
||||
print(result.markdown.fit_html)
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(md_no_filter())
|
||||
```
|
||||
---
|
||||
|
||||
## 6. Understanding `MarkdownGenerationResult` Output Fields
|
||||
|
||||
### 6.1. Example: Accessing all fields of `MarkdownGenerationResult`.
|
||||
This example demonstrates how to access all the different Markdown and HTML outputs available in the `MarkdownGenerationResult` object.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter # Using a filter to populate fit_html/fit_markdown
|
||||
|
||||
async def access_all_markdown_fields():
|
||||
# Setup with a filter to ensure fit_html and fit_markdown are generated
|
||||
content_filter = PruningContentFilter()
|
||||
md_generator = DefaultMarkdownGenerator(
|
||||
content_filter=content_filter,
|
||||
content_source="cleaned_html" # raw_markdown will be from cleaned_html
|
||||
)
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=md_generator,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Using a content-rich page
|
||||
result = await crawler.arun(url="https://en.wikipedia.org/wiki/Python_(programming_language)", config=config)
|
||||
if result.success and result.markdown:
|
||||
md_result = result.markdown
|
||||
|
||||
print("--- Accessing MarkdownGenerationResult Fields ---")
|
||||
|
||||
print(f"\n1. Raw Markdown (from '{md_generator.content_source}' - snippet):")
|
||||
print(md_result.raw_markdown[:300] + "...")
|
||||
|
||||
print(f"\n2. Markdown with Citations (snippet):")
|
||||
print(md_result.markdown_with_citations[:300] + "...")
|
||||
|
||||
print(f"\n3. References Markdown (snippet):")
|
||||
print(md_result.references_markdown[:200] + "...")
|
||||
|
||||
print(f"\n4. Fit HTML (from ContentFilter - snippet):")
|
||||
if md_result.fit_html:
|
||||
print(md_result.fit_html[:300] + "...")
|
||||
else:
|
||||
print("None (No filter or filter produced no output)")
|
||||
|
||||
print(f"\n5. Fit Markdown (from fit_html - snippet):")
|
||||
if md_result.fit_markdown:
|
||||
print(md_result.fit_markdown[:300] + "...")
|
||||
else:
|
||||
print("None (No filter or filter produced no output)")
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(access_all_markdown_fields())
|
||||
```
|
||||
---
|
||||
|
||||
## 7. Advanced and Specific Scenarios
|
||||
|
||||
### 7.1. Example: Handling HTML with complex table structures.
|
||||
`DefaultMarkdownGenerator` (via `html2text`) attempts to render HTML tables into Markdown tables.
|
||||
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
|
||||
def markdown_for_tables():
|
||||
generator = DefaultMarkdownGenerator()
|
||||
html_content = """
|
||||
<html><body>
|
||||
<h3>Product Comparison</h3>
|
||||
<table>
|
||||
<thead>
|
||||
<tr><th>Feature</th><th>Product A</th><th>Product B</th></tr>
|
||||
</thead>
|
||||
<tbody>
|
||||
<tr><td>Price</td><td>$100</td><td>$120</td></tr>
|
||||
<tr><td>Rating</td><td>4.5 stars</td><td>4.2 stars</td></tr>
|
||||
<tr><td>Multi-row<br/>Feature</td><td colspan="2">Supported by Both</td></tr>
|
||||
</tbody>
|
||||
</table>
|
||||
</body></html>
|
||||
"""
|
||||
result_md = generator.generate_markdown(input_html=html_content)
|
||||
|
||||
print("--- Markdown for Table ---")
|
||||
print(result_md.raw_markdown)
|
||||
|
||||
if __name__ == "__main__":
|
||||
markdown_for_tables()
|
||||
```
|
||||
---
|
||||
|
||||
### 7.2. Example: Handling HTML with code blocks.
|
||||
Code blocks are generally preserved in Markdown format.
|
||||
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
|
||||
def markdown_for_code_blocks():
|
||||
generator = DefaultMarkdownGenerator()
|
||||
html_content = """
|
||||
<html><body>
|
||||
<p>Here is some Python code:</p>
|
||||
<pre><code class="language-python">
|
||||
def greet(name):
|
||||
print(f"Hello, {name}!")
|
||||
|
||||
greet("World")
|
||||
</code></pre>
|
||||
<p>And an inline <code>example_function()</code>.</p>
|
||||
</body></html>
|
||||
"""
|
||||
result_md = generator.generate_markdown(input_html=html_content)
|
||||
|
||||
print("--- Markdown for Code Blocks ---")
|
||||
print(result_md.raw_markdown)
|
||||
|
||||
if __name__ == "__main__":
|
||||
markdown_for_code_blocks()
|
||||
```
|
||||
---
|
||||
|
||||
### 7.3. Example: Using a custom `MarkdownGenerationStrategy` (conceptual).
|
||||
You can create your own Markdown generation logic by subclassing `MarkdownGenerationStrategy`.
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler, CrawlerRunConfig, CacheMode,
|
||||
MarkdownGenerationStrategy, MarkdownGenerationResult
|
||||
)
|
||||
|
||||
# Define a minimal custom Markdown generator
|
||||
class CustomMarkdownGenerator(MarkdownGenerationStrategy):
|
||||
def __init__(self, prefix="CUSTOM MD: ", **kwargs):
|
||||
super().__init__(**kwargs) # Pass along any other options
|
||||
self.prefix = prefix
|
||||
|
||||
def generate_markdown(
|
||||
self,
|
||||
input_html: str,
|
||||
base_url: str = "",
|
||||
html2text_options: dict = None, # Can be used by html2text
|
||||
citations: bool = True, # Standard param
|
||||
**kwargs # For other potential strategy-specific params
|
||||
) -> MarkdownGenerationResult:
|
||||
# Simplified custom logic: just prefix and take a snippet
|
||||
# A real custom generator would do more sophisticated parsing/conversion
|
||||
custom_raw_md = self.prefix + input_html[:100].strip() + "..."
|
||||
|
||||
# For simplicity, we'll just return the custom raw markdown for all fields
|
||||
return MarkdownGenerationResult(
|
||||
raw_markdown=custom_raw_md,
|
||||
markdown_with_citations=custom_raw_md, # No real citation logic here
|
||||
references_markdown="",
|
||||
fit_markdown=None, # Not implementing filtering here
|
||||
fit_html=None
|
||||
)
|
||||
|
||||
async def use_custom_markdown_generator():
|
||||
custom_generator = CustomMarkdownGenerator(prefix="[MyGenerator Says]: ")
|
||||
config = CrawlerRunConfig(
|
||||
markdown_generator=custom_generator,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://example.com", config=config)
|
||||
if result.success and result.markdown:
|
||||
print("--- Output from CustomMarkdownGenerator ---")
|
||||
print(result.markdown.raw_markdown)
|
||||
# Since our custom generator doesn't really do citations or filtering:
|
||||
print(f"Citations: '{result.markdown.markdown_with_citations}'")
|
||||
print(f"References: '{result.markdown.references_markdown}'")
|
||||
print(f"Fit Markdown: '{result.markdown.fit_markdown}'")
|
||||
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(use_custom_markdown_generator())
|
||||
```
|
||||
---
|
||||
**End of Examples Document**
|
||||
```
|
||||
---
```markdown
|
||||
# Detailed Outline for crawl4ai - markdown Component
|
||||
|
||||
**Target Document Type:** memory
|
||||
**Target Output Filename Suggestion:** `llm_memory_markdown.md`
|
||||
**Library Version Context:** 0.6.3
|
||||
**Outline Generation Date:** 2025-05-24
|
||||
---
|
||||
|
||||
## 1. Introduction to Markdown Generation in Crawl4ai
|
||||
|
||||
* 1.1. Purpose: This section outlines the `markdown` component of the `crawl4ai` library. Its primary role is to convert HTML content, obtained during web crawling, into various Markdown formats. These formats are designed to be suitable for consumption by Large Language Models (LLMs), as well as for other applications requiring structured text from web pages.
|
||||
* 1.2. Key Abstractions:
|
||||
* `MarkdownGenerationStrategy`: An abstract base class that defines the interface for different markdown generation algorithms and approaches. This allows for customizable Markdown conversion processes.
|
||||
* `DefaultMarkdownGenerator`: The standard, out-of-the-box implementation of `MarkdownGenerationStrategy`. It handles the conversion of HTML to Markdown, including features like link-to-citation conversion and integration with content filtering.
|
||||
* `MarkdownGenerationResult`: A Pydantic data model that encapsulates the various outputs of the markdown generation process, such as raw markdown, markdown with citations, and markdown derived from filtered content.
|
||||
* `CrawlerRunConfig.markdown_generator`: An attribute within the `CrawlerRunConfig` class that allows users to specify which instance of a `MarkdownGenerationStrategy` should be used for a particular crawl operation.
|
||||
* 1.3. Relationship with Content Filtering: The markdown generation process can be integrated with `RelevantContentFilter` strategies. When a content filter is applied, it first refines the input HTML, and then this filtered HTML is used to produce a `fit_markdown` output, providing a more focused version of the content.
|
||||
|
||||
## 2. Core Interface: `MarkdownGenerationStrategy`
|
||||
|
||||
* 2.1. Purpose: The `MarkdownGenerationStrategy` class is an abstract base class (ABC) that defines the contract for all markdown generation strategies within `crawl4ai`. It ensures that any custom markdown generator will adhere to a common interface, making them pluggable into the crawling process.
|
||||
* 2.2. Source File: `crawl4ai/markdown_generation_strategy.py`
|
||||
* 2.3. Initialization (`__init__`)
|
||||
* 2.3.1. Signature:
|
||||
```python
|
||||
class MarkdownGenerationStrategy(ABC):
|
||||
def __init__(
|
||||
self,
|
||||
content_filter: Optional[RelevantContentFilter] = None,
|
||||
options: Optional[Dict[str, Any]] = None,
|
||||
verbose: bool = False,
|
||||
content_source: str = "cleaned_html",
|
||||
):
|
||||
# ...
|
||||
```
|
||||
* 2.3.2. Parameters:
|
||||
* `content_filter` (`Optional[RelevantContentFilter]`, default: `None`): An optional `RelevantContentFilter` instance. If provided, this filter will be used to process the HTML before generating the `fit_markdown` and `fit_html` outputs in the `MarkdownGenerationResult`.
* `options` (`Optional[Dict[str, Any]]`, default: `None`): A dictionary for strategy-specific custom options. This allows subclasses to receive additional configuration parameters. Defaults to an empty dictionary if `None`.
* `verbose` (`bool`, default: `False`): If `True`, enables verbose logging for the markdown generation process.
* `content_source` (`str`, default: `"cleaned_html"`): A string indicating the source of HTML to use for Markdown generation. Common values include `"raw_html"` (original HTML from the page), `"cleaned_html"` (HTML after initial cleaning by the scraping strategy), and `"fit_html"` (HTML after being processed by `content_filter`). The actual available sources depend on the `ScrapingResult` provided to the markdown generator.
|
||||
* 2.4. Abstract Methods:
|
||||
* 2.4.1. `generate_markdown(self, input_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult`
|
||||
* Purpose: This abstract method must be implemented by concrete subclasses. It is responsible for taking an HTML string and converting it into various Markdown representations, encapsulated within a `MarkdownGenerationResult` object.
|
||||
* Parameters:
|
||||
* `input_html (str)`: The HTML string content to be converted to Markdown.
|
||||
* `base_url` (`str`, default: `""`): The base URL of the crawled page. This is crucial for resolving relative URLs, especially when converting links to citations.
* `html2text_options` (`Optional[Dict[str, Any]]`, default: `None`): A dictionary of options to be passed to the underlying HTML-to-text conversion engine (e.g., `CustomHTML2Text`).
* `content_filter` (`Optional[RelevantContentFilter]`, default: `None`): An optional `RelevantContentFilter` instance. If provided, this filter is used to generate `fit_markdown` and `fit_html`. This parameter overrides any filter set during the strategy's initialization for this specific call.
* `citations` (`bool`, default: `True`): A boolean flag indicating whether to convert Markdown links into a citation format (e.g., `[text]^[1]^`) with a corresponding reference list.
|
||||
* `**kwargs`: Additional keyword arguments to allow for future extensions or strategy-specific parameters.
|
||||
* Returns: (`MarkdownGenerationResult`) An object containing the results of the Markdown generation, including `raw_markdown`, `markdown_with_citations`, `references_markdown`, and potentially `fit_markdown` and `fit_html`.
|
||||
|
||||
## 3. Default Implementation: `DefaultMarkdownGenerator`
|
||||
|
||||
* 3.1. Purpose: `DefaultMarkdownGenerator` is the standard concrete implementation of `MarkdownGenerationStrategy`. It provides a robust mechanism for converting HTML to Markdown, featuring link-to-citation conversion and the ability to integrate with `RelevantContentFilter` strategies for focused content output.
|
||||
* 3.2. Source File: `crawl4ai/markdown_generation_strategy.py`
|
||||
* 3.3. Inheritance: Inherits from `MarkdownGenerationStrategy`.
|
||||
* 3.4. Initialization (`__init__`)
|
||||
* 3.4.1. Signature:
|
||||
```python
|
||||
class DefaultMarkdownGenerator(MarkdownGenerationStrategy):
    def __init__(
        self,
        content_filter: Optional[RelevantContentFilter] = None,
        options: Optional[Dict[str, Any]] = None,
        **kwargs,  # accepts content_source and verbose, forwarded to the parent
    ):
        super().__init__(content_filter, options, content_source=kwargs.get("content_source", "cleaned_html"), verbose=kwargs.get("verbose", False))
```
|
||||
*(Note: `verbose` and `content_source` are not listed explicitly as named parameters; in the library code they are accepted via `**kwargs` and forwarded to `super().__init__`, so the effective signature matches the parent's.)*
|
||||
* 3.4.2. Parameters:
|
||||
* `content_filter` (`Optional[RelevantContentFilter]`, default: `None`): As defined in `MarkdownGenerationStrategy`.
* `options` (`Optional[Dict[str, Any]]`, default: `None`): As defined in `MarkdownGenerationStrategy`.
* `verbose` (`bool`, default: `False`): (Passed via `kwargs` to the parent) As defined in `MarkdownGenerationStrategy`.
* `content_source` (`str`, default: `"cleaned_html"`): (Passed via `kwargs` to the parent) As defined in `MarkdownGenerationStrategy`.
|
||||
* 3.5. Key Class Attributes:
|
||||
* 3.5.1. `LINK_PATTERN (re.Pattern)`: A compiled regular expression pattern used to find Markdown links. The pattern is `r'!\[(.[^\]]*)\]\(([^)]*?)(?:\s*\"(.*)\")?\)'`.
|
||||
* 3.6. Key Public Methods:
|
||||
* 3.6.1. `generate_markdown(self, input_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult`
|
||||
* Purpose: Implements the conversion of HTML to Markdown. It uses `CustomHTML2Text` for the base conversion, handles link-to-citation transformation, and integrates with an optional `RelevantContentFilter` to produce `fit_markdown`.
|
||||
* Parameters:
|
||||
* `input_html` (`str`): The HTML content to convert.
* `base_url` (`str`, default: `""`): Base URL for resolving relative links.
* `html2text_options` (`Optional[Dict[str, Any]]`, default: `None`): Options for the `CustomHTML2Text` converter. If not provided, it uses `self.options`.
* `content_filter` (`Optional[RelevantContentFilter]`, default: `None`): Overrides the instance's `content_filter` for this call.
* `citations` (`bool`, default: `True`): Whether to convert links to citations.
|
||||
* `**kwargs`: Additional arguments (not currently used by this specific implementation beyond parent class).
|
||||
* Core Logic:
|
||||
1. Instantiates `CustomHTML2Text` using `base_url` and the resolved `html2text_options` (merged from method arg, `self.options`, and defaults).
|
||||
2. Converts `input_html` to `raw_markdown` using the `CustomHTML2Text` instance.
|
||||
3. If `citations` is `True`, calls `self.convert_links_to_citations(raw_markdown, base_url)` to get `markdown_with_citations` and `references_markdown`.
|
||||
4. If `citations` is `False`, `markdown_with_citations` is set to `raw_markdown`, and `references_markdown` is an empty string.
|
||||
5. Determines the active `content_filter` (parameter or instance's `self.content_filter`).
|
||||
6. If an active `content_filter` exists:
|
||||
* Calls `active_filter.filter_content(input_html)` to get a list of filtered HTML strings.
|
||||
* Joins these strings with `\n` and wraps them in `<div>` tags to form `fit_html`.
|
||||
* Uses a new `CustomHTML2Text` instance to convert `fit_html` into `fit_markdown`.
|
||||
7. Otherwise, `fit_html` and `fit_markdown` are set to `None` (or empty strings based on implementation details).
|
||||
8. Constructs and returns a `MarkdownGenerationResult` object with all generated Markdown variants.
|
||||
* 3.6.2. `convert_links_to_citations(self, markdown: str, base_url: str = "") -> Tuple[str, str]`
|
||||
* Purpose: Transforms standard Markdown links within the input `markdown` string into a citation format (e.g., `[Link Text]^[1]^`) and generates a corresponding numbered list of references.
|
||||
* Parameters:
|
||||
* `markdown (str)`: The input Markdown string.
|
||||
* `base_url (str`, default: `""`)`: The base URL used to resolve relative link URLs before they are added to the reference list.
|
||||
* Returns: (`Tuple[str, str]`) A tuple where the first element is the Markdown string with links converted to citations, and the second element is a string containing the formatted list of references.
|
||||
* Internal Logic:
|
||||
* Uses the `LINK_PATTERN` regex to find all Markdown links.
|
||||
* For each link, it resolves the URL using `fast_urljoin(base, url)` if `base_url` is provided and the link is relative.
|
||||
* Assigns a unique citation number to each unique URL.
|
||||
* Replaces the original link markup with the citation format (e.g., `[Text]^[Number]^`).
|
||||
* Constructs a Markdown formatted reference list string.
|
||||
* 3.7. Role of `CustomHTML2Text`:
|
||||
* `CustomHTML2Text` is a customized version of an HTML-to-Markdown converter, likely based on the `html2text` library.
|
||||
* It's instantiated by `DefaultMarkdownGenerator` to perform the core HTML to plain Markdown conversion.
|
||||
* Its behavior is controlled by options passed via `html2text_options` in `generate_markdown` or `self.options` of the `DefaultMarkdownGenerator`. These options can include `body_width`, `ignore_links`, `ignore_images`, etc., influencing the final Markdown output. (Refer to `crawl4ai/html2text.py` for specific options).
|
||||
|
||||
## 4. Output Data Model: `MarkdownGenerationResult`
|
||||
|
||||
* 4.1. Purpose: `MarkdownGenerationResult` is a Pydantic `BaseModel` designed to structure and encapsulate the various Markdown outputs generated by any `MarkdownGenerationStrategy`. It provides a consistent way to access different versions of the converted content.
|
||||
* 4.2. Source File: `crawl4ai/models.py`
|
||||
* 4.3. Fields:
|
||||
* 4.3.1. `raw_markdown (str)`: The direct result of converting the input HTML to Markdown, before any citation processing or specific content filtering (by the generator itself) is applied. This represents the most basic Markdown version of the content.
|
||||
* 4.3.2. `markdown_with_citations (str)`: Markdown content where hyperlinks have been converted into a citation style (e.g., `[Link Text]^[1]^`). This is typically derived from `raw_markdown`.
|
||||
* 4.3.3. `references_markdown (str)`: A string containing a formatted list of references (e.g., numbered list of URLs) corresponding to the citations found in `markdown_with_citations`.
|
||||
* 4.3.4. `fit_markdown` (`Optional[str]`, default: `None`): Markdown content generated from HTML that has been processed by a `RelevantContentFilter`. This version is intended to be more concise or focused on relevant parts of the original content. It is `None` if no content filter was applied or if the filter resulted in no content.
|
||||
* 4.3.5. `fit_html` (`Optional[str]`, default: `None`): The HTML content that remains after being processed by a `RelevantContentFilter`. `fit_markdown` is generated from this `fit_html`. It is `None` if no content filter was applied or if the filter resulted in no content.
|
||||
* 4.4. Methods:
|
||||
* 4.4.1. `__str__(self) -> str`:
|
||||
* Purpose: Defines the string representation of a `MarkdownGenerationResult` object.
|
||||
* Signature: `__str__(self) -> str`
|
||||
* Returns: (`str`) The content of the `raw_markdown` field.
|
||||
|
||||
## 5. Integration with Content Filtering (`RelevantContentFilter`)
|
||||
|
||||
* 5.1. Purpose of Integration: `DefaultMarkdownGenerator` allows integration with `RelevantContentFilter` strategies to produce a `fit_markdown` output. This enables generating Markdown from a version of the HTML that has been refined or focused based on relevance criteria defined by the filter (e.g., keywords, semantic similarity, or LLM-based assessment).
|
||||
* 5.2. Mechanism:
|
||||
* A `RelevantContentFilter` instance can be passed to `DefaultMarkdownGenerator` either during its initialization (via the `content_filter` parameter) or directly to its `generate_markdown` method. The filter passed to `generate_markdown` takes precedence if both are provided.
|
||||
* When an active filter is present, `DefaultMarkdownGenerator.generate_markdown` calls the filter's `filter_content(input_html)` method. This method is expected to return a list of HTML string chunks deemed relevant.
|
||||
* These chunks are then joined (typically with `\n` and wrapped in `<div>` tags) to form the `fit_html` string.
|
||||
* This `fit_html` is then converted to Markdown using `CustomHTML2Text`, and the result is stored as `fit_markdown`.
|
||||
* 5.3. Impact on `MarkdownGenerationResult`:
|
||||
* If a `RelevantContentFilter` is successfully used:
|
||||
* `MarkdownGenerationResult.fit_markdown` will contain the Markdown derived from the filtered HTML.
|
||||
* `MarkdownGenerationResult.fit_html` will contain the actual filtered HTML string.
|
||||
* If no filter is used, or if the filter returns an empty list of chunks (indicating no content passed the filter), `fit_markdown` and `fit_html` will be `None` (or potentially empty strings, depending on the exact implementation details of joining an empty list).
|
||||
* 5.4. Supported Filter Types (High-Level Mention):
|
||||
* `PruningContentFilter`: A filter that likely removes irrelevant HTML sections based on predefined rules or structural analysis (e.g., removing common boilerplate like headers, footers, navbars).
|
||||
* `BM25ContentFilter`: A filter that uses the BM25 ranking algorithm to score and select HTML chunks based on their relevance to a user-provided query.
|
||||
* `LLMContentFilter`: A filter that leverages a Large Language Model to assess the relevance of HTML chunks, potentially based on a user query or a general understanding of content importance.
|
||||
* *Note: Detailed descriptions and usage of each filter strategy are covered in their respective documentation sections.*
|
||||
|
||||
## 6. Configuration via `CrawlerRunConfig`
|
||||
|
||||
* 6.1. `CrawlerRunConfig.markdown_generator`
|
||||
* Purpose: This attribute of the `CrawlerRunConfig` class allows a user to specify a custom `MarkdownGenerationStrategy` instance to be used for the markdown conversion phase of a crawl. This provides flexibility in how HTML content is transformed into Markdown.
|
||||
* Type: `MarkdownGenerationStrategy` (accepts any concrete implementation of this ABC).
|
||||
* Default Value: If not specified, an instance of `DefaultMarkdownGenerator()` is used by default within the `AsyncWebCrawler`'s `aprocess_html` method when `config.markdown_generator` is `None`.
|
||||
* Usage Example:
|
||||
```python
|
||||
from crawl4ai import CrawlerRunConfig, DefaultMarkdownGenerator, AsyncWebCrawler
|
||||
from crawl4ai.content_filter_strategy import BM25ContentFilter
|
||||
import asyncio
|
||||
|
||||
# Example: Configure a markdown generator with a BM25 filter
|
||||
bm25_filter = BM25ContentFilter(user_query="Python programming language")
|
||||
custom_md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
|
||||
|
||||
run_config_with_custom_md = CrawlerRunConfig(
|
||||
markdown_generator=custom_md_generator,
|
||||
# Other run configurations...
|
||||
)
|
||||
|
||||
async def example_crawl():
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://en.wikipedia.org/wiki/Python_(programming_language)",
|
||||
config=run_config_with_custom_md
|
||||
)
|
||||
if result.success and result.markdown:
|
||||
print("Raw Markdown (snippet):", result.markdown.raw_markdown[:200])
|
||||
if result.markdown.fit_markdown:
|
||||
print("Fit Markdown (snippet):", result.markdown.fit_markdown[:200])
|
||||
|
||||
# asyncio.run(example_crawl())
|
||||
```
|
||||
|
||||
## 7. Influencing Markdown Output for LLM Consumption
|
||||
|
||||
* 7.1. Role of `DefaultMarkdownGenerator.options` and `html2text_options`:
|
||||
* The `options` parameter in `DefaultMarkdownGenerator.__init__` and the `html2text_options` parameter in its `generate_markdown` method are used to pass configuration settings directly to the underlying `CustomHTML2Text` instance.
|
||||
* `html2text_options` provided to `generate_markdown` will take precedence over `self.options` set during initialization.
|
||||
* These options control various aspects of the HTML-to-Markdown conversion, such as line wrapping, handling of links, images, and emphasis, which can be crucial for preparing text for LLMs.
|
||||
* 7.2. Key `CustomHTML2Text` Options (via `html2text_options` or `DefaultMarkdownGenerator.options`):
|
||||
* `bodywidth` (`int`, default: `0` when `DefaultMarkdownGenerator` calls `CustomHTML2Text` for `raw_markdown` and `fit_markdown` if not otherwise specified): Determines the width for wrapping lines. A value of `0` disables line wrapping, which is often preferred for LLM processing as it preserves sentence structure across lines.
|
||||
* `ignore_links` (`bool`, default: `False` in `CustomHTML2Text`): If `True`, all hyperlinks (`<a>` tags) are removed from the output, leaving only their anchor text.
|
||||
* `ignore_images` (`bool`, default: `False` in `CustomHTML2Text`): If `True`, all image tags (`<img>`) are removed from the output.
|
||||
* `ignore_emphasis` (`bool`, default: `False` in `CustomHTML2Text`): If `True`, emphasized text (e.g., `<em>`, `<strong>`) is rendered as plain text without Markdown emphasis characters (like `*` or `_`).
|
||||
* `bypass_tables` (`bool`, default: `False` in `CustomHTML2Text`): If `True`, tables are not formatted as Markdown tables but are rendered as a series of paragraphs, which might be easier for some LLMs to process.
|
||||
* `default_image_alt` (`str`, default: `""` in `CustomHTML2Text`): Specifies a default alt text for images that do not have an `alt` attribute.
|
||||
* `protect_links` (`bool`, default: `False` in `CustomHTML2Text`): If `True`, URLs in links are not processed or modified.
|
||||
* `single_line_break` (`bool`, default: `True` in `CustomHTML2Text`): If `True`, single newlines in HTML are converted to Markdown line breaks (two spaces then a newline). This can help preserve some formatting.
|
||||
* `mark_code` (`bool`, default: `True` in `CustomHTML2Text`): If `True`, `<code>` and `<pre>` blocks are appropriately marked in Markdown.
|
||||
* `escape_snob` (`bool`, default: `False` in `CustomHTML2Text`): If `True`, more aggressive escaping of special Markdown characters is performed.
|
||||
* *Note: This list is based on common `html2text` options; refer to `crawl4ai/html2text.py` for the exact implementation and default behaviors within `CustomHTML2Text`.*
|
||||
* 7.3. Impact of `citations (bool)` in `generate_markdown`:
|
||||
* When `citations=True` (default in `DefaultMarkdownGenerator.generate_markdown`):
|
||||
* Standard Markdown links `[text](url)` are converted to `[text]^[citation_number]^`.
|
||||
* A `references_markdown` string is generated, listing all unique URLs with their corresponding citation numbers. This helps LLMs trace information back to its source and can reduce token count if URLs are long or repetitive.
|
||||
* When `citations=False`:
|
||||
* Links remain in their original Markdown format `[text](url)`.
|
||||
* `references_markdown` will be an empty string.
|
||||
* This might be preferred if the LLM needs to directly process the URLs or if the citation format is not desired.
|
||||
* 7.4. Role of `content_source` in `MarkdownGenerationStrategy`:
|
||||
* This parameter (defaulting to `"cleaned_html"` in `DefaultMarkdownGenerator`) specifies which HTML version is used as the primary input for the `generate_markdown` method.
|
||||
* `"cleaned_html"`: Typically refers to HTML that has undergone initial processing by the `ContentScrapingStrategy` (e.g., removal of scripts, styles, and potentially some boilerplate based on the scraping strategy's rules). This is usually the recommended source for general Markdown conversion.
|
||||
* `"raw_html"`: The original, unmodified HTML content fetched from the web page. Using this source would bypass any initial cleaning done by the scraping strategy.
|
||||
* `"fit_html"`: This source is relevant when a `RelevantContentFilter` is used. `fit_html` is the HTML output *after* the `RelevantContentFilter` has processed the `input_html` (which itself is determined by `content_source`). If `content_source` is, for example, `"cleaned_html"`, then `fit_html` is the result of filtering that cleaned HTML. `fit_markdown` is then generated from this `fit_html`.
|
||||
* 7.5. `fit_markdown` vs. `raw_markdown`/`markdown_with_citations`:
|
||||
* `raw_markdown` (or `markdown_with_citations` if `citations=True`) is generated from the HTML specified by `content_source` (e.g., `"cleaned_html"`). It represents a general conversion of that source.
|
||||
* `fit_markdown` is generated *only if* a `RelevantContentFilter` is active (either set in `DefaultMarkdownGenerator` or passed to `generate_markdown`). It is derived from the `fit_html` (the output of the content filter).
|
||||
* **Choosing which to use for LLMs:**
|
||||
* Use `fit_markdown` when you need a concise, highly relevant subset of the page's content tailored to a specific query or set of criteria defined by the filter. This can reduce noise and token count for the LLM.
|
||||
* Use `raw_markdown` or `markdown_with_citations` when you need a more comprehensive representation of the page's textual content, or when no specific filtering criteria are applied.
|
||||
```
|
||||
@@ -1,760 +0,0 @@
|
||||
```markdown
|
||||
# Detailed Outline for crawl4ai - markdown Component
|
||||
|
||||
**Target Document Type:** reasoning
|
||||
**Target Output Filename Suggestion:** `llm_reasoning_markdown_generation.md`
|
||||
**Library Version Context:** 0.6.3
|
||||
**Outline Generation Date:** 2025-05-24
|
||||
---
|
||||
|
||||
## 1. Introduction to Markdown Generation in Crawl4AI
|
||||
|
||||
* 1.1. **Why Markdown Generation Matters for LLMs**
|
||||
* 1.1.1. The role of clean, structured text for Large Language Model consumption.
|
||||
* **Explanation:** LLMs perform significantly better when input data is well-structured and free of irrelevant noise (like HTML tags, scripts, or complex layouts not meant for textual understanding). Markdown, with its simple syntax, provides a human-readable and machine-parseable format that captures essential semantic structure (headings, lists, paragraphs, code blocks, tables) without the clutter of full HTML. This makes it easier for LLMs to understand the content's hierarchy, identify key information, and perform tasks like summarization, question-answering, or RAG (Retrieval Augmented Generation) more accurately and efficiently.
|
||||
* 1.1.2. Benefits of Markdown: readability, structure preservation, common format.
|
||||
* **Explanation:**
|
||||
* **Readability:** Markdown is designed to be easily readable in its raw form, making it simple for developers and users to inspect and understand the crawled content.
|
||||
* **Structure Preservation:** It effectively preserves the semantic structure of the original HTML (headings, lists, emphasis, etc.), which is crucial context for LLMs.
|
||||
* **Common Format:** Markdown is a widely adopted standard, ensuring compatibility with a vast ecosystem of tools, editors, and LLM input pipelines.
|
||||
* 1.1.3. How Crawl4AI's Markdown generation facilitates RAG and other LLM applications.
|
||||
* **Explanation:** For RAG, Crawl4AI's Markdown output, especially when combined with content filtering, provides clean, relevant text chunks that can be easily embedded and indexed. This improves the quality of retrieved context for LLM prompts. For fine-tuning or direct prompting, the structured Markdown helps the LLM focus on the core content, leading to better quality responses and reducing token consumption by eliminating HTML overhead.
|
||||
|
||||
* 1.2. **Overview of Crawl4AI's Markdown Generation Pipeline**
|
||||
* 1.2.1. High-level flow: HTML -> (Optional Filtering) -> Markdown Conversion -> (Optional Citation Handling).
|
||||
* **Explanation:**
|
||||
1. **Input HTML:** The process starts with either raw HTML from the crawled page or a cleaned/selected HTML segment.
|
||||
2. **Optional Content Filtering:** Before Markdown conversion, a `RelevantContentFilter` can be applied to the HTML. This step aims to remove boilerplate, ads, or irrelevant sections, resulting in `fit_html`. This is crucial for generating `fit_markdown`.
|
||||
3. **Markdown Conversion:** The selected HTML (either the original, cleaned, or filtered `fit_html`) is converted into Markdown using an underlying `html2text` library, specifically `CustomHTML2Text` in Crawl4AI for enhanced control.
|
||||
4. **Optional Citation Handling:** If enabled, inline links in the generated Markdown are converted to a citation format (e.g., `text [^1^]`), and a separate list of references is created.
|
||||
* 1.2.2. Key components involved: `MarkdownGenerationStrategy`, `DefaultMarkdownGenerator`, `CustomHTML2Text`, `RelevantContentFilter`.
|
||||
* **Explanation:**
|
||||
* **`MarkdownGenerationStrategy`:** An interface defining how Markdown should be generated. Allows for custom implementations.
|
||||
* **`DefaultMarkdownGenerator`:** The standard implementation of `MarkdownGenerationStrategy`, using `CustomHTML2Text`. It orchestrates filtering (if provided) and citation handling.
|
||||
* **`CustomHTML2Text`:** An enhanced version of the `html2text` library, providing fine-grained control over the HTML-to-Markdown conversion.
|
||||
* **`RelevantContentFilter`:** An interface for strategies that filter HTML content before it's converted to Markdown, producing `fit_html` and consequently `fit_markdown`.
|
||||
* 1.2.3. How `CrawlerRunConfig` ties these components together.
|
||||
* **Explanation:** The `CrawlerRunConfig` object allows you to specify which `MarkdownGenerationStrategy` (and by extension, which filters and `CustomHTML2Text` options) should be used for a particular crawl run via its `markdown_generator` parameter. This provides run-specific control over the Markdown output.
|
||||
|
||||
* 1.3. **Goals of this Guide**
|
||||
* 1.3.1. Understanding how to configure and customize Markdown output.
|
||||
* **Explanation:** This guide will walk you through the various configuration options available, from choosing HTML sources and content filters to fine-tuning the `html2text` conversion itself.
|
||||
* 1.3.2. Best practices for generating LLM-friendly Markdown.
|
||||
* **Explanation:** We'll discuss tips and techniques to produce Markdown that is optimally structured and cleaned for consumption by Large Language Models.
|
||||
* 1.3.3. Troubleshooting common Markdown generation issues.
|
||||
* **Explanation:** We'll cover common problems encountered during Markdown generation (e.g., noisy output, missing content) and provide strategies for diagnosing and resolving them.
|
||||
|
||||
## 2. Core Concepts in Markdown Generation
|
||||
|
||||
* 2.1. **The `MarkdownGenerationStrategy` Interface**
|
||||
* 2.1.1. **Purpose and Design Rationale:**
|
||||
* Why use a strategy pattern for Markdown generation? (Flexibility, extensibility).
|
||||
* **Explanation:** The strategy pattern allows Crawl4AI to define a common interface for Markdown generation while enabling different concrete implementations. This means users can easily swap out the default Markdown generator for a custom one without altering the core crawler logic. It promotes flexibility and makes the system extensible for future Markdown conversion needs or integration with other libraries.
|
||||
* Core problem it solves: Decoupling Markdown generation logic from the crawler.
|
||||
* **Explanation:** By abstracting Markdown generation into a strategy, the `AsyncWebCrawler` itself doesn't need to know the specifics of *how* Markdown is created. It simply delegates the task to the configured strategy. This separation of concerns makes the codebase cleaner and easier to maintain.
|
||||
* 2.1.2. **When to Implement a Custom `MarkdownGenerationStrategy`:**
|
||||
* Scenarios requiring completely different Markdown conversion logic.
|
||||
* **Example:** If you need to convert HTML to a very specific dialect of Markdown not supported by `html2text`, or if you want to use a different underlying conversion library entirely.
|
||||
* Integrating third-party Markdown conversion libraries.
|
||||
* **Example:** If you prefer to use a library like `turndown` or `mistune` for its specific features or output style.
|
||||
* Advanced pre/post-processing of Markdown.
|
||||
* **Example:** If you need to perform complex transformations on the Markdown *after* initial generation, such as custom table formatting, complex footnote handling beyond standard citations, or domain-specific semantic tagging within the Markdown.
|
||||
* 2.1.3. **How to Implement a Custom `MarkdownGenerationStrategy`:**
|
||||
* Key methods to override (`generate_markdown`).
|
||||
* **Explanation:** The primary method to implement is `generate_markdown(self, input_html: str, base_url: str = "", html2text_options: Optional[Dict[str, Any]] = None, content_filter: Optional[RelevantContentFilter] = None, citations: bool = True, **kwargs) -> MarkdownGenerationResult`. This method will receive the HTML (based on `content_source`), and it's responsible for returning a `MarkdownGenerationResult` object.
|
||||
* Input parameters and expected output (`MarkdownGenerationResult`).
|
||||
* **Explanation:** Your custom strategy will receive the `input_html`, the `base_url` (for resolving relative links if needed), `html2text_options` (which you can choose to use or ignore), an optional `content_filter`, and a `citations` flag. It must return an instance of `MarkdownGenerationResult` populated with the relevant Markdown strings.
|
||||
* *Code Example:*
|
||||
```python
|
||||
from crawl4ai import MarkdownGenerationStrategy, MarkdownGenerationResult, RelevantContentFilter
|
||||
from typing import Optional, Dict, Any
|
||||
|
||||
class MyCustomMarkdownStrategy(MarkdownGenerationStrategy):
|
||||
def __init__(self, content_source: str = "cleaned_html", **kwargs):
|
||||
super().__init__(content_source=content_source, **kwargs)
|
||||
# Initialize any custom resources if needed
|
||||
|
||||
def generate_markdown(
|
||||
self,
|
||||
input_html: str,
|
||||
base_url: str = "",
|
||||
html2text_options: Optional[Dict[str, Any]] = None, # You can use or ignore these
|
||||
content_filter: Optional[RelevantContentFilter] = None,
|
||||
citations: bool = True, # You can decide how to handle this
|
||||
**kwargs
|
||||
) -> MarkdownGenerationResult:
|
||||
|
||||
# 1. Apply content filter if provided and desired
|
||||
fit_html_output = ""
|
||||
if content_filter:
|
||||
# Assuming content_filter.filter_content returns a list of HTML strings
|
||||
filtered_html_blocks = content_filter.filter_content(input_html)
|
||||
fit_html_output = "\n".join(filtered_html_blocks)
|
||||
|
||||
# 2. Your custom HTML to Markdown conversion logic
|
||||
# This is where you'd use your preferred library or custom logic
|
||||
raw_markdown_text = f"# Custom Markdown for {base_url}\n\n{input_html[:200]}..." # Placeholder
|
||||
|
||||
markdown_with_citations_text = raw_markdown_text # Placeholder for citation logic
|
||||
references_markdown_text = "" # Placeholder for references
|
||||
|
||||
# If you used a filter, also generate fit_markdown
|
||||
fit_markdown_text = ""
|
||||
if fit_html_output:
|
||||
fit_markdown_text = f"# Custom Filtered Markdown\n\n{fit_html_output[:200]}..." # Placeholder
|
||||
|
||||
return MarkdownGenerationResult(
|
||||
raw_markdown=raw_markdown_text,
|
||||
markdown_with_citations=markdown_with_citations_text,
|
||||
references_markdown=references_markdown_text,
|
||||
fit_markdown=fit_markdown_text,
|
||||
fit_html=fit_html_output
|
||||
)
|
||||
|
||||
# Usage:
|
||||
# custom_md_generator = MyCustomMarkdownStrategy()
|
||||
# run_config = CrawlerRunConfig(markdown_generator=custom_md_generator)
|
||||
```
|
||||
* Common pitfalls when creating custom strategies.
|
||||
* **Explanation:**
|
||||
* Forgetting to handle all fields in `MarkdownGenerationResult` (even if some are empty strings).
|
||||
* Incorrectly managing `base_url` for relative links if your custom converter doesn't handle it.
|
||||
* Performance bottlenecks if your custom logic is inefficient.
|
||||
* Not properly integrating with the `content_filter` if one is provided.
|
||||
* 2.1.4. **Understanding `content_source` in `MarkdownGenerationStrategy`**
|
||||
* 2.1.4.1. Purpose: What HTML source should be used for Markdown generation?
|
||||
* **Explanation:** The `content_source` attribute of a `MarkdownGenerationStrategy` (including `DefaultMarkdownGenerator`) tells the strategy which version of the HTML to use as the primary input for generating `raw_markdown` and `markdown_with_citations`.
|
||||
* 2.1.4.2. Available options: `"cleaned_html"`, `"raw_html"`, `"fit_html"`.
|
||||
* **`"cleaned_html"` (Default):** This is the HTML after Crawl4AI's internal `ContentScrapingStrategy` (e.g., `WebScrapingStrategy` or `LXMLWebScrapingStrategy`) has processed it. This usually involves removing scripts, styles, and applying structural cleaning or selection based on `target_elements` or `css_selector` in `CrawlerRunConfig`.
|
||||
* **`"raw_html"`:** The original, unmodified HTML fetched from the page. This is useful if you want to apply your own complete cleaning and Markdown conversion pipeline.
|
||||
* **`"fit_html"`:** The HTML *after* a `RelevantContentFilter` (if provided to the `MarkdownGenerationStrategy`) has processed the input HTML (which would be `cleaned_html` or `raw_html` depending on the initial source). This option is powerful when you want Markdown generated *only* from the most relevant parts of the page.
|
||||
* 2.1.4.3. **Decision Guide: Choosing the Right `content_source`**:
|
||||
* **When to use `"cleaned_html"`:** This is the recommended default for most LLM use cases. It provides a good balance of structured content without excessive noise, as common boilerplate is often removed by the scraping strategy.
|
||||
* **When to use `"raw_html"`:** Choose this if you need absolute control over the HTML input for your Markdown converter, or if Crawl4AI's default cleaning removes elements you wish to keep. Be aware that this might result in noisier Markdown.
|
||||
* **When to use `"fit_html"`:** Opt for this when you are using a `RelevantContentFilter` with your `MarkdownGenerationStrategy` and you want the `raw_markdown` and `markdown_with_citations` to be based *only* on the filtered content. This is distinct from just using the `fit_markdown` field in the result, as it makes the filtered content the *primary* source for all main Markdown outputs.
|
||||
* **Impact on performance and output quality:**
|
||||
* `"raw_html"` might be slightly faster if Crawl4AI's cleaning is complex, but could lead to lower quality Markdown due to more noise.
|
||||
* `"cleaned_html"` offers a good trade-off.
|
||||
* `"fit_html"` depends on the performance of the `RelevantContentFilter` itself.
|
||||
* 2.1.4.4. *Example Scenarios:*
|
||||
* **General Summarization:** `"cleaned_html"` is usually best.
|
||||
* **Highly Specific Q&A on a Section:** Use a `RelevantContentFilter` to produce `fit_html`, then set `content_source="fit_html"` (or just use the `fit_markdown` from the result if `raw_markdown` from `"cleaned_html"` is also desired).
|
||||
* **Archiving Raw Structure:** `"raw_html"` might be chosen if the goal is to convert the entire, unmodified page structure to Markdown, perhaps for later, more nuanced processing.
|
||||
|
||||
* 2.2. **The `MarkdownGenerationResult` Model**
|
||||
* 2.2.1. **Understanding its Purpose:** Why a structured result object?
|
||||
* **Explanation:** A structured object like `MarkdownGenerationResult` is used instead of a single Markdown string to provide different views or versions of the generated Markdown, catering to various use cases. This allows users to pick the representation that best suits their needs (e.g., with or without citations, raw vs. filtered) without re-processing. It also clearly separates the main content from metadata like references or the intermediate `fit_html`.
|
||||
* 2.2.2. **Deep Dive into `MarkdownGenerationResult` Fields:**
|
||||
* `raw_markdown`:
|
||||
* **What it is:** This is the direct, primary Markdown output generated from the `content_source` (e.g., `cleaned_html`) defined in the `MarkdownGenerationStrategy`. It does *not* have inline links converted to citation format.
|
||||
* **How to use it:** Use this when you need the most "vanilla" Markdown, perhaps for LLMs that are sensitive to citation formats or if you plan to implement your own link/reference handling.
|
||||
* **When it's useful:** For direct input to LLMs that don't require source attribution within the text, or as a base for further custom Markdown processing.
|
||||
* `markdown_with_citations`:
|
||||
* **What it is:** This takes the `raw_markdown` and converts its inline links (e.g., `[link text](http://example.com)`) into a citation format (e.g., `link text [^1^]`).
|
||||
* **How it's generated:** The `DefaultMarkdownGenerator` (via `CustomHTML2Text`) scans `raw_markdown` for links, assigns unique numerical IDs to each unique URL, replaces the inline link with the text and citation marker, and populates `references_markdown`.
|
||||
* **How to use it:** This is often the most useful Markdown for LLM tasks requiring RAG or for generating human-readable documents where sources are important. Combine it with `references_markdown`.
|
||||
* *Example:*
|
||||
```html
|
||||
<!-- Input HTML fragment -->
|
||||
<p>Crawl4AI is an <a href="https://github.com/unclecode/crawl4ai">open-source</a> library.</p>
|
||||
```
|
||||
```markdown
|
||||
// Resulting markdown_with_citations (simplified)
|
||||
Crawl4AI is an open-source [^1^] library.
|
||||
```
|
||||
* `references_markdown`:
|
||||
* **What it is:** A separate Markdown string that lists all unique URLs found and converted to citations, formatted typically as a numbered list.
|
||||
* **How to use it:** Append this string to the end of `markdown_with_citations` to create a complete document with a bibliography or reference section.
|
||||
* **Why it's separate:** This provides flexibility. You can choose to display references at the end, in a sidebar, or not at all.
|
||||
* *Example:*
|
||||
```markdown
|
||||
## References
|
||||
|
||||
[^1^]: https://github.com/unclecode/crawl4ai
|
||||
```
|
||||
* `fit_markdown`:
|
||||
* **What it is:** This is Markdown generated *exclusively* from the `fit_html`. `fit_html` itself is the output of a `RelevantContentFilter` if one was provided to the `MarkdownGenerationStrategy`. If no filter was used, `fit_markdown` will likely be empty or reflect the `raw_markdown`.
|
||||
* **How to use it:** When your primary goal is to feed an LLM with the most relevant, filtered content. This is excellent for tasks like generating concise summaries or providing highly focused context for RAG.
|
||||
* **Relationship with `raw_markdown`:** If a filter is active, `fit_markdown` is based on a *subset* or *transformed version* of the HTML that `raw_markdown` was based on (assuming `content_source` wasn't `"fit_html"`). If `content_source` *was* `"fit_html"`, then `raw_markdown` and `fit_markdown` would be derived from the same filtered HTML, but `fit_markdown` might still undergo different processing if the strategy handles it distinctly.
|
||||
* *Example:* Imagine a news article page. `raw_markdown` might contain the article, comments, ads, and navigation. If a `BM25ContentFilter` is used with a query about "stock market impact", `fit_markdown` would ideally only contain paragraphs related to that topic, stripped of other page elements.
|
||||
* `fit_html`:
|
||||
* **What it is:** The actual HTML string *after* a `RelevantContentFilter` (like `PruningContentFilter` or `LLMContentFilter`) has processed the input HTML. If no filter is applied, this field will be empty.
|
||||
* **How to use it:** Primarily for debugging your content filters. You can inspect `fit_html` to see exactly what HTML content was deemed "relevant" by your filter before it was converted to `fit_markdown`. It can also be useful if you need this filtered HTML for purposes other than Markdown generation.
|
||||
* **Why it's included:** It provides transparency into the filtering process and allows advanced users to work with the intermediate filtered HTML directly.
|
||||
|
||||
## 3. The `DefaultMarkdownGenerator` - Your Go-To Solution
|
||||
|
||||
* 3.1. **Understanding the `DefaultMarkdownGenerator`**
|
||||
* 3.1.1. **Purpose and Design:** The `DefaultMarkdownGenerator` is Crawl4AI's standard, out-of-the-box mechanism for converting HTML content into various Markdown representations. It's designed to be a robust and generally applicable solution for most common use cases, especially when targeting LLM consumption.
|
||||
* 3.1.2. Core Functionality: Its primary task is to orchestrate the HTML-to-Markdown conversion. It internally uses an instance of `CustomHTML2Text` (Crawl4AI's enhanced `html2text` wrapper) to perform the actual conversion.
|
||||
* 3.1.3. How it handles citations and references by default.
|
||||
* **Explanation:** If the `citations` parameter in its `generate_markdown` method is `True` (which it is by default), `DefaultMarkdownGenerator` will post-process the initially generated Markdown to convert inline links into citation markers (e.g., `[^1^]`) and generate a corresponding `references_markdown` block. This is done by its internal `CustomHTML2Text` instance.
|
||||
|
||||
* 3.2. **Configuring `DefaultMarkdownGenerator`**
|
||||
* 3.2.1. **Initialization Options:**
|
||||
* `content_filter (Optional[RelevantContentFilter])`:
|
||||
* **Why use it:** To refine the HTML *before* it's converted to Markdown. This is essential if you want `fit_markdown` (and consequently `fit_html`) to contain only the most relevant parts of the page, leading to a more focused Markdown output.
|
||||
* **How it integrates:** When `generate_markdown` is called, if a `content_filter` is present, `DefaultMarkdownGenerator` first passes the `input_html` (determined by `content_source`) to this filter. The filter returns a list of HTML strings (or a single string if merged). This filtered HTML becomes the `fit_html`. Then, `fit_markdown` is generated from this `fit_html`. The `raw_markdown` and `markdown_with_citations` are still generated from the original `content_source` unless `content_source` itself is set to `"fit_html"`.
|
||||
* *Impact:* Directly influences `fit_markdown` and `fit_html` fields in `MarkdownGenerationResult`. Can significantly reduce the noise and improve the relevance of the final Markdown for LLMs.
|
||||
* *Code Example:*
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator, CrawlerRunConfig
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
|
||||
# Initialize a filter
|
||||
pruning_filter = PruningContentFilter(threshold_type="fixed", threshold=0.5)
|
||||
|
||||
# Initialize DefaultMarkdownGenerator with the filter
|
||||
md_generator_with_filter = DefaultMarkdownGenerator(content_filter=pruning_filter)
|
||||
|
||||
# This generator will now produce 'fit_markdown' based on pruning.
|
||||
# run_config = CrawlerRunConfig(markdown_generator=md_generator_with_filter)
|
||||
# result = await crawler.arun(url="...", config=run_config)
|
||||
# print(result.markdown.fit_markdown)
|
||||
```
|
||||
* `options (Optional[Dict[str, Any]])`:
|
||||
* **What it is:** This dictionary allows you to pass configuration options directly to the underlying `CustomHTML2Text` instance. These options control the specifics of the HTML-to-Markdown conversion process.
|
||||
* **How to use it:** Provide a dictionary where keys are `html2text` option names (e.g., `body_width`, `ignore_links`) and values are their desired settings.
|
||||
* *See Section 6: Mastering `CustomHTML2Text` for detailed options.*
|
||||
* `content_source (str)`:
|
||||
* **Reiteration:** As discussed in section 2.1.4, this determines the primary HTML input for `raw_markdown` and `markdown_with_citations`.
|
||||
* **How it interacts with `content_filter`:**
|
||||
* If `content_source` is, for example, `"cleaned_html"` and a `content_filter` is also provided, the `content_filter` will process this `"cleaned_html"` to produce `fit_html`. The `fit_markdown` field in `MarkdownGenerationResult` will be based on this `fit_html`.
|
||||
* However, `raw_markdown` and `markdown_with_citations` will still be based on the original `"cleaned_html"` (unless `content_source` was explicitly set to `"fit_html"`). This allows you to have both a "fuller" Markdown and a "filtered" Markdown from a single generation step.
|
||||
|
||||
* 3.3. **Common Workflows with `DefaultMarkdownGenerator`**
|
||||
* 3.3.1. **Workflow: Generating Basic Markdown with Citations**
|
||||
* Steps: Instantiate `DefaultMarkdownGenerator` (or use the crawler's default). The crawler calls its `generate_markdown` method. Access `result.markdown.markdown_with_citations` and `result.markdown.references_markdown`.
|
||||
* *Code Example:*
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
|
||||
|
||||
async def basic_markdown_workflow():
|
||||
# DefaultMarkdownGenerator is used implicitly if none is specified in CrawlerRunConfig
|
||||
# Or explicitly:
|
||||
md_generator = DefaultMarkdownGenerator()
|
||||
run_config = CrawlerRunConfig(markdown_generator=md_generator)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://example.com", config=run_config)
|
||||
if result.success:
|
||||
print("--- Markdown with Citations ---")
|
||||
print(result.markdown.markdown_with_citations[:500]) # Show first 500 chars
|
||||
print("\n--- References ---")
|
||||
print(result.markdown.references_markdown)
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
```
|
||||
* 3.3.2. **Workflow: Generating Focused Markdown using a Content Filter**
|
||||
* Steps:
|
||||
1. Choose and instantiate a `RelevantContentFilter` (e.g., `BM25ContentFilter`).
|
||||
2. Instantiate `DefaultMarkdownGenerator`, passing the filter to its `content_filter` parameter.
|
||||
3. Set this `DefaultMarkdownGenerator` instance in `CrawlerRunConfig.markdown_generator`.
|
||||
4. After crawling, access `result.markdown.fit_markdown`.
|
||||
* Key configuration considerations for the filter and generator:
|
||||
* For `BM25ContentFilter`, ensure you provide a relevant `user_query`.
|
||||
* Adjust filter thresholds (e.g., `bm25_threshold`) as needed.
|
||||
* The `content_source` for `DefaultMarkdownGenerator` will be the input to the filter.
|
||||
* *Code Example:*
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator, CacheMode
|
||||
from crawl4ai.content_filter_strategy import BM25ContentFilter
|
||||
|
||||
async def filtered_markdown_workflow():
|
||||
user_query = "information about Crawl4AI library"
|
||||
bm25_filter = BM25ContentFilter(user_query=user_query, bm25_threshold=0.1)
|
||||
|
||||
md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
|
||||
|
||||
run_config = CrawlerRunConfig(
|
||||
markdown_generator=md_generator,
|
||||
cache_mode=CacheMode.BYPASS # For consistent demo results
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
# Using a page that hopefully has content related to the query
|
||||
result = await crawler.arun(url="https://github.com/unclecode/crawl4ai", config=run_config)
|
||||
if result.success:
|
||||
print("--- Fit Markdown (BM25 Filtered) ---")
|
||||
print(result.markdown.fit_markdown) # This is the key output
|
||||
# You can also inspect fit_html to see what the filter selected
|
||||
# print("\n--- Fit HTML ---")
|
||||
# print(result.markdown.fit_html[:500])
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
```
|
||||
* 3.3.3. **Workflow: Customizing Markdown Style via `html2text_options`**
|
||||
* Steps: Instantiate `DefaultMarkdownGenerator` passing a dictionary of `html2text` options to its `options` parameter.
|
||||
* *Code Example:*
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
|
||||
|
||||
async def custom_style_markdown_workflow():
|
||||
# Example: Disable line wrapping and ignore images
|
||||
html2text_opts = {
|
||||
"body_width": 0, # Disable line wrapping
|
||||
"ignore_images": True # Don't include image markdown 
|
||||
}
|
||||
md_generator = DefaultMarkdownGenerator(options=html2text_opts)
|
||||
|
||||
run_config = CrawlerRunConfig(markdown_generator=md_generator)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://example.com", config=run_config)
|
||||
if result.success:
|
||||
print("--- Custom Styled Markdown (No Wrap, No Images) ---")
|
||||
print(result.markdown.raw_markdown[:500]) # raw_markdown will reflect these options
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
```
|
||||
* 3.4. **Best Practices for `DefaultMarkdownGenerator`**
|
||||
* **When to use `DefaultMarkdownGenerator` vs. a custom strategy:**
|
||||
* Use `DefaultMarkdownGenerator` for most cases. It's robust and highly configurable through `content_filter` and `html2text_options`.
|
||||
* Opt for a custom strategy only if you need fundamentally different conversion logic or integration with external Markdown libraries that `CustomHTML2Text` doesn't cover.
|
||||
* **Tips for choosing the right `content_source` and `content_filter`:**
|
||||
* Start with `content_source="cleaned_html"` (default) and no filter.
|
||||
* If the output is too noisy, introduce a `RelevantContentFilter`. `PruningContentFilter` is a good first step for general boilerplate. Use `BM25ContentFilter` or `LLMContentFilter` for more targeted filtering based on semantic relevance.
|
||||
* If your filter is very effective and you *only* want Markdown from the filtered content, consider setting `content_source="fit_html"` in your `DefaultMarkdownGenerator` instance.
|
||||
* **How to leverage `MarkdownGenerationResult` effectively:**
|
||||
* For LLM input where source attribution is important, use `markdown_with_citations` + `references_markdown`.
|
||||
* For tasks needing maximum conciseness based on relevance, use `fit_markdown` (after configuring a `content_filter`).
|
||||
* Use `raw_markdown` if you need the "purest" Markdown conversion without citation processing.
|
||||
* Inspect `fit_html` to debug your content filters.
|
||||
|
||||
## 4. Integrating Content Filters for Smarter Markdown (`fit_markdown`)
|
||||
|
||||
* 4.1. **The "Why": Purpose of Content Filtering Before Markdown Generation**
|
||||
* 4.1.1. Reducing noise and improving relevance for LLMs.
|
||||
* **Explanation:** Web pages often contain much more than just the main article content (e.g., navigation, ads, footers, related articles). These can be detrimental to LLM performance, increasing token count, processing time, and potentially confusing the model. Content filters aim to isolate the core, relevant information.
|
||||
* 4.1.2. Generating more concise and focused Markdown (`fit_markdown`).
|
||||
* **Explanation:** By filtering the HTML *before* converting it to Markdown, the resulting `fit_markdown` is inherently more concise and focused on what the filter deemed important. This is ideal for tasks where brevity and relevance are key.
|
||||
* 4.1.3. How `fit_html` is generated and its role.
|
||||
* **Explanation:** When a `RelevantContentFilter` is used with a `MarkdownGenerationStrategy`, the strategy first passes the input HTML (e.g., `cleaned_html`) to the filter's `filter_content` method. This method returns a list of HTML strings (or a single merged string). This output is stored as `fit_html` in the `MarkdownGenerationResult`. `fit_markdown` is then generated by converting this `fit_html` to Markdown.
|
||||
|
||||
* 4.2. **Overview of `RelevantContentFilter` Strategies**
|
||||
* 4.2.1. **`PruningContentFilter`**:
|
||||
* **How it works:** Applies heuristic rules to remove common boilerplate. For example, it might remove elements with very short text content, elements with a high link-to-text ratio, or elements matching common boilerplate CSS classes/IDs (like "footer", "nav", "sidebar").
|
||||
* **When to use it:** A good first-pass filter for general-purpose cleaning. It's fast and doesn't require LLM calls or complex configuration.
|
||||
* **Impact on `fit_markdown`:** Typically good at removing obvious non-content sections, resulting in a cleaner, more article-focused Markdown.
|
||||
* 4.2.2. **`BM25ContentFilter`**:
|
||||
* **How it works:** This filter uses the BM25 algorithm, a classical information retrieval technique. It tokenizes the HTML content into chunks and scores each chunk's relevance against a `user_query`. Chunks exceeding a `bm25_threshold` are kept.
|
||||
* **When to use it:** When you want to extract content specifically related to a user's query from a larger page. Excellent for targeted information retrieval.
|
||||
* **Impact on `fit_markdown`:** The output will be highly tailored to the query. If the query is "Tell me about Crawl4AI's caching", `fit_markdown` should primarily contain sections discussing caching.
|
||||
* 4.2.3. **`LLMContentFilter`**:
|
||||
* **How it works:** This is the most powerful and flexible filter. It chunks the input HTML and sends each chunk (or a summary) to an LLM with specific `instructions` (e.g., "Extract only the paragraphs discussing financial results"). The LLM decides which chunks are relevant.
|
||||
* **When to use it:** For complex filtering criteria that are hard to express with rules or keywords, or when nuanced understanding of content is required.
|
||||
* **Impact on `fit_markdown`:** Can produce very precise and contextually relevant Markdown. However, it's generally slower and can be more expensive due to LLM API calls.
|
||||
* 4.3. **Decision Guide: Choosing the Right `RelevantContentFilter`**
|
||||
* *Table:*
|
||||
| Filter | Speed | Cost (LLM API) | Accuracy/Nuance | Use Case Examples | Configuration Complexity |
|
||||
|-----------------------|------------|----------------|-----------------|----------------------------------------------------|--------------------------|
|
||||
| `PruningContentFilter`| Very Fast | None | Low-Medium | General boilerplate removal, quick cleaning. | Low |
|
||||
| `BM25ContentFilter` | Fast | None | Medium | Query-focused extraction, finding relevant sections. | Medium (query, threshold)|
|
||||
| `LLMContentFilter` | Slow | Potentially High| High | Complex criteria, nuanced extraction, summarization. | High (prompt engineering) |
|
||||
* Factors to consider:
|
||||
* **Desired Output Quality:** For the highest semantic relevance, `LLMContentFilter` is often best, but at a cost.
|
||||
* **Performance Constraints:** If speed is critical, `PruningContentFilter` or `BM25ContentFilter` are preferred.
|
||||
* **Nature of the HTML Content:** For well-structured articles, `PruningContentFilter` might be sufficient. For diverse content or Q&A, `BM25ContentFilter` or `LLMContentFilter` might be better.
|
||||
* **Specificity of Task:** If you have a clear query, `BM25ContentFilter` excels. If you have complex instructions, `LLMContentFilter` is suitable.
|
||||
* 4.4. **Code Examples: Combining Filters with `DefaultMarkdownGenerator`**
|
||||
* 4.4.1. *Example:* [Using `PruningContentFilter` to generate `fit_markdown`].
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator, CrawlerRunConfig, AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
|
||||
async def pruning_filter_example():
|
||||
pruning_filter = PruningContentFilter(threshold=0.4, threshold_type="fixed") # Adjust threshold as needed
|
||||
md_generator = DefaultMarkdownGenerator(content_filter=pruning_filter)
|
||||
run_config = CrawlerRunConfig(markdown_generator=md_generator, cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://en.wikipedia.org/wiki/Python_(programming_language)", config=run_config)
|
||||
if result.success:
|
||||
print("--- Fit Markdown (Pruned) ---")
|
||||
print(result.markdown.fit_markdown[:1000]) # Show first 1000 chars
|
||||
# print("\n--- Original Raw Markdown (for comparison) ---")
|
||||
# print(result.markdown.raw_markdown[:1000])
|
||||
```
|
||||
* 4.4.2. *Example:* [Using `BM25ContentFilter` with a query to generate query-focused `fit_markdown`].
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator, CrawlerRunConfig, AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.content_filter_strategy import BM25ContentFilter
|
||||
|
||||
async def bm25_filter_example():
|
||||
user_query = "Python syntax and semantics"
|
||||
bm25_filter = BM25ContentFilter(user_query=user_query, bm25_threshold=0.1)
|
||||
md_generator = DefaultMarkdownGenerator(content_filter=bm25_filter)
|
||||
run_config = CrawlerRunConfig(markdown_generator=md_generator, cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://en.wikipedia.org/wiki/Python_(programming_language)", config=run_config)
|
||||
if result.success:
|
||||
print(f"--- Fit Markdown (BM25 Filtered for query: '{user_query}') ---")
|
||||
print(result.markdown.fit_markdown)
|
||||
```
|
||||
* 4.4.3. *Example:* [Using `LLMContentFilter` for nuanced content selection before Markdown generation].
|
||||
```python
|
||||
from crawl4ai import DefaultMarkdownGenerator, CrawlerRunConfig, AsyncWebCrawler, LLMConfig, CacheMode
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
import os
|
||||
|
||||
async def llm_filter_example():
|
||||
# Ensure OPENAI_API_KEY is set in your environment
|
||||
if not os.getenv("OPENAI_API_KEY"):
|
||||
print("OPENAI_API_KEY not set. Skipping LLMContentFilter example.")
|
||||
return
|
||||
|
||||
llm_config_obj = LLMConfig(provider="openai/gpt-4o-mini", api_key=os.getenv("OPENAI_API_KEY"))
|
||||
|
||||
instruction = "Extract only the sections that discuss Python's history and its creator."
|
||||
llm_filter = LLMContentFilter(
|
||||
llm_config=llm_config_obj,
|
||||
instruction=instruction,
|
||||
# chunk_token_threshold=1000 # Adjust as needed
|
||||
)
|
||||
|
||||
md_generator = DefaultMarkdownGenerator(content_filter=llm_filter, content_source="cleaned_html")
|
||||
|
||||
run_config = CrawlerRunConfig(markdown_generator=md_generator, cache_mode=CacheMode.BYPASS)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://en.wikipedia.org/wiki/Python_(programming_language)", config=run_config)
|
||||
if result.success:
|
||||
print(f"--- Fit Markdown (LLM Filtered with instruction: '{instruction}') ---")
|
||||
print(result.markdown.fit_markdown)
|
||||
llm_filter.show_usage() # Display token usage
|
||||
else:
|
||||
print(f"Crawl failed: {result.error_message}")
|
||||
```
|
||||
* 4.5. **Best Practices for Content Filtering for Markdown**
|
||||
* **Start Simple:** Begin with `PruningContentFilter` for general cleanup. It's fast and often effective for removing common boilerplate.
|
||||
* **Query-Specific Tasks:** If your goal is to extract information relevant to a specific query, `BM25ContentFilter` is a great, cost-effective choice.
|
||||
* **Nuanced Selection:** Reserve `LLMContentFilter` for tasks requiring deeper semantic understanding or complex filtering logic that rules-based or keyword-based approaches can't handle. Be mindful of its cost and latency.
|
||||
* **Iterate and Test:** Content filtering is often an iterative process. Test your filter configurations on various pages to ensure they behave as expected. Inspect `fit_html` to understand what the filter is selecting/discarding.
|
||||
* **Combine with `content_source`:** Remember that `fit_markdown` is derived from the output of the filter. If you also need Markdown from the pre-filtered content, ensure your `MarkdownGenerationStrategy`'s `content_source` is set appropriately (e.g., `"cleaned_html"`) so that `raw_markdown` reflects that, while `fit_markdown` reflects the filtered version.
|
||||
|
||||
## 5. Customizing Markdown Output via `CrawlerRunConfig`
|
||||
|
||||
* 5.1. **The Role of `CrawlerRunConfig.markdown_generator`**
|
||||
* 5.1.1. How it allows specifying a custom Markdown generation strategy for a crawl run.
|
||||
* **Explanation:** The `markdown_generator` parameter within the `CrawlerRunConfig` object is the primary way to control how Markdown is generated for a specific crawl operation (i.e., a call to `crawler.arun()` or tasks within `crawler.arun_many()`). You can assign an instance of any class that adheres to the `MarkdownGenerationStrategy` interface to it.
|
||||
* 5.1.2. Overriding the default Markdown generation behavior.
|
||||
* **Explanation:** If `CrawlerRunConfig.markdown_generator` is not set (i.e., it's `None`), Crawl4AI will use a default instance of `DefaultMarkdownGenerator` with its standard settings. By providing your own `MarkdownGenerationStrategy` instance (be it a configured `DefaultMarkdownGenerator` or a custom class), you override this default behavior for that particular run.
|
||||
|
||||
* 5.2. **Scenarios for Using `CrawlerRunConfig.markdown_generator`**
|
||||
* 5.2.1. Applying a pre-configured `DefaultMarkdownGenerator` with specific filters or options.
|
||||
* **Why:** You might want different filtering logic or `html2text` options for different URLs or types of content you're crawling, even within the same `AsyncWebCrawler` instance.
|
||||
* 5.2.2. Plugging in a completely custom `MarkdownGenerationStrategy`.
|
||||
* **Why:** As discussed in section 2.1.2, if you have unique Markdown requirements or want to use a different conversion library.
|
||||
* 5.2.3. Disabling Markdown generation entirely by setting it to `None` (if applicable, or by using a "NoOp" strategy).
|
||||
* **Why:** If, for a specific crawl, you only need the HTML or extracted structured data and don't require Markdown output, you can pass `markdown_generator=None` (or a strategy that does nothing) to save processing time.
|
||||
* *Note:* To truly disable Markdown generation and its associated `CustomHTML2Text` processing, you might need a "NoOpMarkdownGenerator". If `markdown_generator` is `None`, the crawler might still fall back to a default. A NoOp strategy would explicitly do nothing.
|
||||
```python
|
||||
# class NoOpMarkdownGenerator(MarkdownGenerationStrategy):
|
||||
# def generate_markdown(self, input_html: str, **kwargs) -> MarkdownGenerationResult:
|
||||
# return MarkdownGenerationResult(raw_markdown="", markdown_with_citations="", references_markdown="")
|
||||
# run_config = CrawlerRunConfig(markdown_generator=NoOpMarkdownGenerator())
|
||||
```
|
||||
|
||||
* 5.3. **Code Examples:**
|
||||
* 5.3.1. *Example:* [Setting a `DefaultMarkdownGenerator` with a `PruningContentFilter` in `CrawlerRunConfig`].
|
||||
```python
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
DefaultMarkdownGenerator,
|
||||
CacheMode
|
||||
)
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
|
||||
async def run_with_specific_md_generator():
|
||||
# Configure a specific markdown generator
|
||||
pruning_filter = PruningContentFilter(threshold=0.6)
|
||||
specific_md_generator = DefaultMarkdownGenerator(
|
||||
content_filter=pruning_filter,
|
||||
options={"body_width": 0, "ignore_links": True}
|
||||
)
|
||||
|
||||
# Configure the crawl run to use this generator
|
||||
run_config = CrawlerRunConfig(
|
||||
markdown_generator=specific_md_generator,
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url="https://example.com/article1", config=run_config)
|
||||
if result.success:
|
||||
print("--- Markdown from Article 1 (Pruned, No Links, No Wrap) ---")
|
||||
print(result.markdown.fit_markdown[:500])
|
||||
# raw_markdown would also reflect no-wrap and no-links from html2text_options
|
||||
|
||||
# For another URL, you could use a different (or default) generator
|
||||
# default_run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||
# result2 = await crawler.arun(url="https://example.com/article2", config=default_run_config)
|
||||
|
||||
# asyncio.run(run_with_specific_md_generator())
|
||||
```
|
||||
* 5.3.2. *Example:* [Setting a custom `MyCustomMarkdownStrategy` in `CrawlerRunConfig` (assuming `MyCustomMarkdownStrategy` from 2.1.3)].
|
||||
```python
|
||||
# Assuming MyCustomMarkdownStrategy is defined as in section 2.1.3
|
||||
# from crawl4ai import CrawlerRunConfig, AsyncWebCrawler, CacheMode
|
||||
# from your_module import MyCustomMarkdownStrategy # If it's in another file
|
||||
|
||||
# async def run_with_custom_md_strategy():
|
||||
# custom_strategy = MyCustomMarkdownStrategy(content_source="raw_html")
|
||||
# run_config_custom = CrawlerRunConfig(
|
||||
# markdown_generator=custom_strategy,
|
||||
# cache_mode=CacheMode.BYPASS
|
||||
# )
|
||||
|
||||
# async with AsyncWebCrawler() as crawler:
|
||||
# result = await crawler.arun(url="https://example.com", config=run_config_custom)
|
||||
# if result.success:
|
||||
# print("--- Markdown from Custom Strategy ---")
|
||||
# print(result.markdown.raw_markdown) # Or other fields your strategy populates
|
||||
|
||||
# asyncio.run(run_with_custom_md_strategy())
|
||||
```
|
||||
* 5.4. **Interaction with Global vs. Run-Specific Configurations**
|
||||
* **Explanation:** `AsyncWebCrawler` itself does not have a global `markdown_generator` setting during its initialization. Markdown generation is configured *per run* via `CrawlerRunConfig`. This design choice provides maximum flexibility, allowing different Markdown strategies for different URLs or tasks within the same crawler instance lifecycle. If `CrawlerRunConfig.markdown_generator` is not provided, a default `DefaultMarkdownGenerator` instance is used for that specific run.
|
||||
|
||||
## 6. Mastering `CustomHTML2Text` for Fine-Grained Control
|
||||
|
||||
* 6.1. **Understanding `CustomHTML2Text`**
|
||||
* 6.1.1. **Purpose:** Why Crawl4AI includes its own `html2text` extension.
|
||||
* **Enhanced control:** `CustomHTML2Text` is a subclass of the standard `html2text.HTML2Text` library. Crawl4AI uses this custom version to gain more precise control over the HTML-to-Markdown conversion process, particularly to make the output more suitable for LLMs.
|
||||
* **Specific adaptations:** It includes logic for handling Crawl4AI's citation and reference generation (`convert_links_to_citations`), and potentially other tweaks that improve the quality and utility of the Markdown output for AI applications.
|
||||
* 6.1.2. **How it's used by `DefaultMarkdownGenerator`**.
|
||||
* **Explanation:** `DefaultMarkdownGenerator` instantiates `CustomHTML2Text` internally. When you pass `options` to `DefaultMarkdownGenerator`, these are ultimately used to configure this `CustomHTML2Text` instance. The `handle()` method of `CustomHTML2Text` is what performs the core HTML to Markdown conversion.
|
||||
|
||||
* 6.2. **Key `html2text_options` and Their Impact**
|
||||
* (These options are passed via `DefaultMarkdownGenerator(options=...)`)
|
||||
* 6.2.1. `body_width`:
|
||||
* **What it does:** Controls the maximum width of lines in the generated Markdown before wrapping.
|
||||
* **Why configure it:** For LLM consumption, it's often best to disable automatic line wrapping to allow the LLM to process text based on natural paragraph breaks. Setting `body_width=0` achieves this.
|
||||
* *Example:*
|
||||
* `body_width=80` (default-ish for some tools):
|
||||
```markdown
|
||||
This is a longer sentence that will be wrapped by html2text if the body_width is
|
||||
set to a value like 80 characters.
|
||||
```
|
||||
* `body_width=0`:
|
||||
```markdown
|
||||
This is a longer sentence that will not be wrapped by html2text if body_width is 0, allowing the LLM to handle line breaks.
|
||||
```
|
||||
* 6.2.2. `ignore_links`:
|
||||
* **What it does:** If `True`, all hyperlink information (`[text](url)`) is removed, leaving only the link text.
|
||||
* **Why configure it:** Set to `True` if links are considered noise for your LLM task and you don't need source attribution. If `False` (default for Crawl4AI's `CustomHTML2Text` unless overridden), links are preserved and can then be converted to citations by `DefaultMarkdownGenerator`.
|
||||
* *Example:*
|
||||
* `ignore_links=False` (then processed for citations): `Visit [Crawl4AI](https://crawl4ai.com)` -> `Visit Crawl4AI [^1^]`
|
||||
* `ignore_links=True`: `Visit [Crawl4AI](https://crawl4ai.com)` -> `Visit Crawl4AI`
|
||||
* 6.2.3. `ignore_images`:
|
||||
* **What it does:** If `True`, image tags (`<img>`) are completely ignored, and no Markdown image syntax (`![alt](src)`) is generated.
|
||||
* **Why configure it:** Useful if image information is irrelevant to your LLM task and you want cleaner, more text-focused Markdown.
|
||||
* *Example:*
|
||||
* HTML: `<img src="logo.png" alt="My Logo">`
|
||||
* `ignore_images=False`: `![My Logo](logo.png)`
|
||||
* `ignore_images=True`: (nothing is output for the image)
|
||||
* 6.2.4. `protect_links`:
|
||||
* **What it does:** If `True`, surrounds link URLs with `<` and `>`. E.g., `[text](<url>)`.
|
||||
* **Why configure it:** This can sometimes help Markdown parsers that might misinterpret URLs containing special characters. However, with Crawl4AI's citation handling, this is generally not needed, as the raw URLs are moved to the reference section.
|
||||
* 6.2.5. `mark_code`:
|
||||
* **What it does:** Controls how `<pre>` and `<code>` tags are handled. If `True`, it attempts to use Markdown code block syntax (backticks).
|
||||
* **Why configure it:** Essential for preserving code snippets correctly. Usually, you'd want this to be `True`.
|
||||
* 6.2.6. `default_image_alt`:
|
||||
* **What it does:** Provides a default alt text string if an `<img>` tag is missing an `alt` attribute.
|
||||
* **Why configure it:** Can make Markdown more consistent if you choose to include images.
|
||||
* 6.2.7. `bypass_tables`:
|
||||
* **What it does:** If `True`, `<table>` elements are not converted into Markdown table syntax. Their content might be rendered as plain text or omitted, depending on other settings.
|
||||
* **Why configure it:** Standard Markdown table syntax is limited and may not handle complex tables (with `colspan`, `rowspan`, nested tables) well. If you encounter mangled tables, setting this to `True` and processing the table HTML separately (e.g., by extracting the `<table>` HTML and using a specialized table-to-text or table-to-JSON library) might be a better approach.
|
||||
* 6.2.8. `pad_tables`:
|
||||
* **What it does:** If `True`, adds padding spaces around cell content in Markdown tables for better visual alignment in raw Markdown.
|
||||
* **Why configure it:** Mostly an aesthetic choice for human readability of the raw Markdown; LLMs typically don't care about this padding.
|
||||
* *Other relevant options identified from `CustomHTML2Text` (or base `html2text`) source:*
|
||||
* `escape_snob`: If `True`, escapes `>` and `&` characters. Default is `False`.
|
||||
* `skip_internal_links`: If `True`, ignores links that start with `#`. Default is `False`.
|
||||
* `links_each_paragraph`: If `True`, puts a link list after each paragraph. Default is `False`. Crawl4AI's citation system provides a better alternative.
|
||||
* `unicode_snob`: If `True`, uses Unicode characters instead of ASCII approximations. Default is `False` in base `html2text`, but `CustomHTML2Text` might behave differently or Crawl4AI ensures UTF-8 handling.
|
||||
* 6.3. **Best Practices for Configuring `CustomHTML2Text`**
|
||||
* 6.3.1. **General recommendations for LLM-friendly output:**
|
||||
* Set `body_width=0` to disable line wrapping and let paragraphs flow naturally.
|
||||
* Consider `ignore_images=True` if images are not relevant to the LLM's task.
|
||||
* Usually, keep `ignore_links=False` (Crawl4AI default) to allow `DefaultMarkdownGenerator` to handle citations properly.
|
||||
* 6.3.2. **How to balance information preservation with conciseness:**
|
||||
* Be selective with `ignore_*` options. Removing too much might discard useful context.
|
||||
* Use content filters (Section 4) for semantic reduction rather than relying solely on `html2text` options to remove large irrelevant sections.
|
||||
* 6.3.3. **Experimenting with options to achieve desired Markdown style:**
|
||||
* Create a small test HTML snippet.
|
||||
* Instantiate `DefaultMarkdownGenerator` with different `options` dictionaries.
|
||||
* Call its `generate_markdown` method directly (or `_html_to_markdown` on its internal `CustomHTML2Text` instance if you want to bypass citation logic for testing) and observe the output.
|
||||
* 6.4. **Handling Citations and References (`convert_links_to_citations` method in `CustomHTML2Text`)**
|
||||
* 6.4.1. **How it works:**
|
||||
* The `convert_links_to_citations` method (called by `DefaultMarkdownGenerator` if citations are enabled) iterates through the Markdown produced by `html2text.handle()`.
|
||||
* It uses a regular expression (`LINK_PATTERN`) to find all Markdown links (`[text](url "optional title")`).
|
||||
* For each unique URL, it assigns an incremental citation number.
|
||||
* It replaces the original Markdown link with `text [^N^]` (or `![text][^N^]` for images if not ignored).
|
||||
* It builds up a list of reference strings like `[^N^]: url "optional title - text if different from title"`.
|
||||
* 6.4.2. **When it's called:** This method is invoked by `DefaultMarkdownGenerator.generate_markdown()` *after* the initial HTML-to-Markdown conversion by `CustomHTML2Text.handle()` if the `citations` flag is `True`.
|
||||
* 6.4.3. **Impact on `MarkdownGenerationResult` fields:**
|
||||
* The modified Markdown (with `[^N^]` markers) is stored in `markdown_with_citations`.
|
||||
* The collected reference list is stored in `references_markdown`.
|
||||
* `raw_markdown` remains the version *before* citation processing.
|
||||
* 6.4.4. **Customizing Citation Behavior (if possible through options or by subclassing)**.
|
||||
* **Explanation:** Direct customization of the citation format (e.g., changing `[^N^]` to `(N)`) via options is not explicitly provided in `CustomHTML2Text`.
|
||||
* To change this, you would need to:
|
||||
1. Create your own class inheriting from `DefaultMarkdownGenerator`.
|
||||
2. Override the `generate_markdown` method.
|
||||
3. In your override, you could either:
|
||||
* Call the parent's `generate_markdown`, get the `MarkdownGenerationResult`, and then post-process `markdown_with_citations` and `references_markdown` to your desired format.
|
||||
* Or, more invasively, replicate the logic but modify the citation generation part. This might involve creating a custom version of `CustomHTML2Text` or its `convert_links_to_citations` method.
|
||||
* For most users, the default citation format is standard and widely accepted.
|
||||
|
||||
## 7. Advanced Markdown Generation Techniques & Best Practices
|
||||
|
||||
* 7.1. **Achieving LLM-Friendly Markdown Output**
|
||||
* 7.1.1. Prioritizing semantic structure (headings, lists, paragraphs).
|
||||
* **Why:** LLMs leverage structural cues to understand context and hierarchy. Ensure your `html2text_options` (e.g., for headings, list indentation) preserve this structure faithfully.
|
||||
* **How:** Rely on `CustomHTML2Text`'s default handling of semantic HTML tags. If specific tags are problematic, consider pre-processing the HTML.
|
||||
* 7.1.2. Handling complex HTML structures (nested tables, complex layouts).
|
||||
* **Strategies for simplifying or selectively extracting from them:**
|
||||
* **Tables:** For very complex tables, consider `html2text_options={'bypass_tables': True}`. Then, extract the table HTML separately (e.g., using `CrawlResult.html` and a CSS selector for the table) and process it with a specialized table parsing library or even an LLM call focused just on table interpretation.
|
||||
* **Layouts:** Aggressive `RelevantContentFilter` strategies can help. If parts of a complex layout are consistently noise, use `CrawlerRunConfig.excluded_selector` to remove them before they even reach the Markdown generator.
|
||||
* 7.1.3. When to prefer `fit_markdown` over `raw_markdown` (or `markdown_with_citations`).
|
||||
* **Reasoning:**
|
||||
* **`fit_markdown`:** Best for tasks requiring high relevance and conciseness (e.g., RAG context, focused summarization). It reflects the output of your content filtering.
|
||||
* **`raw_markdown` / `markdown_with_citations`:** Better when you need a broader representation of the page's textual content, or when the filtering might be too aggressive and discard potentially useful context. Also, if your `content_source` is already very clean (e.g., from a targeted CSS selector), the difference might be minimal.
|
||||
* 7.1.4. Balancing detail vs. conciseness for different LLM tasks (e.g., summarization vs. Q&A).
|
||||
* **Summarization:** `fit_markdown` from a well-configured `LLMContentFilter` or `BM25ContentFilter` is often ideal. You might also use more aggressive `html2text_options` to remove minor elements.
|
||||
* **Q&A / RAG:** You might prefer a slightly less aggressive filter or even `raw_markdown` (if `content_source` is clean) to ensure all potentially relevant details are available. Citations (`markdown_with_citations` and `references_markdown`) are crucial here for source tracking.
|
||||
|
||||
* 7.2. **Pre-processing HTML for Better Markdown**
|
||||
* 7.2.1. Using `CrawlerRunConfig.excluded_tags` or `excluded_selector` to remove noise before Markdown generation.
|
||||
* **How:** These parameters in `CrawlerRunConfig` are applied by the `ContentScrapingStrategy` *before* the HTML even reaches the `MarkdownGenerationStrategy`.
|
||||
* **Why:** This is the most efficient way to remove large, consistently irrelevant sections (like global headers, footers, sidebars, ad blocks) across all outputs (HTML, Markdown, etc.).
|
||||
* *Code Example:*
|
||||
```python
|
||||
# In CrawlerRunConfig
|
||||
# config = CrawlerRunConfig(
|
||||
# excluded_tags=["nav", "footer", "script", "style"],
|
||||
# excluded_selector=".ads, #social-share-buttons"
|
||||
# )
|
||||
```
|
||||
* 7.2.2. The role of `ContentScrapingStrategy` (e.g., `LXMLWebScrapingStrategy` or the default `WebScrapingStrategy` using BeautifulSoup) in preparing the HTML that `DefaultMarkdownGenerator` receives.
|
||||
* **Explanation:** The `ContentScrapingStrategy` is responsible for the initial cleaning of the HTML. Its output (what becomes `cleaned_html`) is the direct input to `DefaultMarkdownGenerator` if `content_source` is `"cleaned_html"`. Understanding how your chosen scraping strategy cleans HTML is key to predicting the input for Markdown generation. `LXMLWebScrapingStrategy` is generally faster and can be more robust for heavily malformed HTML.
|
||||
|
||||
* 7.3. **Post-processing Generated Markdown**
|
||||
* 7.3.1. When and why you might need to further process Markdown from `MarkdownGenerationResult`.
|
||||
* **Scenarios:**
|
||||
* Custom formatting not achievable with `html2text` options (e.g., specific table styles, unique list markers).
|
||||
* Domain-specific transformations (e.g., converting certain patterns to custom shortcodes).
|
||||
* Further cleaning or condensing based on rules `html2text` or content filters don't cover.
|
||||
* 7.3.2. *Example:* [Python snippet for custom regex replacements or structural adjustments on `raw_markdown`].
|
||||
```python
|
||||
import re
|
||||
|
||||
def custom_post_process_markdown(markdown_text):
|
||||
# Example: Replace all occurrences of "Crawl4AI" with "**Crawl4AI**"
|
||||
markdown_text = re.sub(r"Crawl4AI", r"**Crawl4AI**", markdown_text)
|
||||
|
||||
# Example: Add a horizontal rule after every H2 heading
|
||||
markdown_text = re.sub(r"(^## .*)", r"\1\n\n---", markdown_text, flags=re.MULTILINE)
|
||||
return markdown_text
|
||||
|
||||
# result = await crawler.arun(...)
|
||||
# if result.success:
|
||||
# final_markdown = custom_post_process_markdown(result.markdown.raw_markdown)
|
||||
# print(final_markdown)
|
||||
```
|
||||
|
||||
* 7.4. **Combining Different Strategies for Optimal Results**
|
||||
* 7.4.1. *Scenario:* Using a `RelevantContentFilter` to get `fit_html`, then passing `fit_html` to a custom Markdown generator that expects highly focused input.
|
||||
* **How:**
|
||||
1. Instantiate your filter (e.g., `LLMContentFilter`).
|
||||
2. Instantiate your custom Markdown generator (`MyCustomMarkdownStrategy`).
|
||||
3. In `CrawlerRunConfig`, set `markdown_generator` to your custom generator.
|
||||
4. Crucially, within your custom generator's `generate_markdown` method, ensure you *first* apply the `content_filter` (passed as an argument) to the `input_html` to get the `fit_html`, and then process this `fit_html` with your custom logic. Or, configure your custom generator's `content_source="fit_html"` and pass the filter during its initialization.
|
||||
* 7.4.2. *Scenario:* Using one set of `html2text_options` for `raw_markdown` and another for generating an alternative Markdown representation (perhaps for a different LLM or purpose).
|
||||
* **How:** This would typically require two separate calls to `crawler.arun()` with different `CrawlerRunConfig` objects, each specifying a `DefaultMarkdownGenerator` with different `options`. Alternatively, a custom `MarkdownGenerationStrategy` could internally generate multiple Markdown versions with different settings and include them in custom fields within `MarkdownGenerationResult` (though this would require modifying or extending `MarkdownGenerationResult`).
|
||||
|
||||
## 8. Troubleshooting Common Markdown Generation Issues
|
||||
|
||||
* 8.1. **Problem: Markdown is too noisy / includes boilerplate**
|
||||
* 8.1.1. **Solutions:**
|
||||
* **Use a `RelevantContentFilter`**:
|
||||
* Start with `PruningContentFilter`. It's fast and good for common boilerplate.
|
||||
```python
|
||||
# from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
# from crawl4ai import DefaultMarkdownGenerator
|
||||
# md_generator = DefaultMarkdownGenerator(content_filter=PruningContentFilter(threshold=0.5))
|
||||
```
|
||||
* If more precision is needed, try `BM25ContentFilter` with a relevant query or `LLMContentFilter` with clear instructions.
|
||||
* **Refine `excluded_tags` or `excluded_selector` in `CrawlerRunConfig`**: This removes elements *before* any Markdown strategy sees them.
|
||||
```python
|
||||
# run_config = CrawlerRunConfig(
|
||||
# excluded_tags=["nav", "footer", "aside", "script"],
|
||||
# excluded_selector=".ad-banner, #social-links"
|
||||
# )
|
||||
```
|
||||
* **Adjust `html2text_options`**: Options like `ignore_links`, `ignore_images`, `skip_internal_links` can reduce clutter.
|
||||
```python
|
||||
# from crawl4ai import DefaultMarkdownGenerator
|
||||
# md_generator = DefaultMarkdownGenerator(options={"ignore_images": True, "ignore_links": True})
|
||||
```
|
||||
|
||||
* 8.2. **Problem: Important content is missing from Markdown**
|
||||
* 8.2.1. **Solutions:**
|
||||
* **Check if `content_filter` is too aggressive**: If using a filter, try lowering its threshold (e.g., `bm25_threshold` for `BM25ContentFilter`) or simplifying instructions for `LLMContentFilter`. Temporarily disable the filter to see if the content appears in `raw_markdown`.
|
||||
* **Ensure `word_count_threshold` in `CrawlerRunConfig` (or scraping strategy) is not too high**: The default `WebScrapingStrategy` might have its own cleaning. If `CrawlerRunConfig.word_count_threshold` is too high, it might remove short but important paragraphs.
|
||||
* **Verify `html2text_options` are not inadvertently removing desired content**: For example, if `ignore_links=True` is set, the link text itself may remain, but the link URL will be removed.
|
||||
* **Examine `cleaned_html` or `fit_html`**: Inspect `result.markdown.fit_html` (if a filter was used) or `result.cleaned_html` (if no filter and `content_source` was `cleaned_html`). If the content is missing here, the issue is with HTML cleaning or filtering, not the Markdown conversion itself. If it's present in these HTML versions but not in the final Markdown, the issue is likely with `html2text_options` or the conversion process.
|
||||
|
||||
* 8.3. **Problem: Tables are mangled or poorly formatted**
|
||||
* 8.3.1. **Solutions:**
|
||||
* **Try `html2text_options={'bypass_tables': True}`**: This tells `html2text` to skip converting tables.
|
||||
```python
|
||||
# from crawl4ai import DefaultMarkdownGenerator
|
||||
# md_generator = DefaultMarkdownGenerator(options={"bypass_tables": True})
|
||||
# run_config = CrawlerRunConfig(markdown_generator=md_generator)
|
||||
# result = await crawler.arun(...)
|
||||
# # Now result.markdown.raw_markdown will not have Markdown tables.
|
||||
# # You'd need to parse tables from result.cleaned_html or result.markdown.fit_html
|
||||
```
|
||||
You can then extract the table HTML directly from `result.cleaned_html` (or `result.markdown.fit_html`) using BeautifulSoup or lxml and parse it with a library better suited for complex tables (e.g., pandas `read_html`, or a custom parser).
|
||||
* **Experiment with other `html2text` table formatting options**: Options like `pad_tables` might slightly improve appearance, but won't fix fundamentally complex table structures.
|
||||
* **Consider if the table is truly a data table or a layout table**: Layout tables are often problematic for Markdown conversion and should ideally be filtered out by `PruningContentFilter` or more aggressive cleaning.
|
||||
|
||||
* 8.4. **Problem: Citations or references are incorrect/missing**
|
||||
* 8.4.1. **Solutions:**
|
||||
* **Ensure links are present in the HTML input to `DefaultMarkdownGenerator`**: If the links were removed during an earlier HTML cleaning stage (e.g., by an aggressive `ContentScrapingStrategy` or `excluded_tags`), they can't be converted to citations.
|
||||
* **Verify `ignore_links` is not `True` in `html2text_options`**: `DefaultMarkdownGenerator` relies on `CustomHTML2Text` to see the links to convert them. If `ignore_links=True`, the links are stripped before citation processing can occur.
|
||||
* **Check for unusual link structures in the HTML**: Very non-standard link formats (e.g., heavily JavaScript-driven links without `href` attributes) might not be picked up. `CustomHTML2Text` primarily looks for standard `<a href="...">` tags.
|
||||
|
||||
* 8.5. **Problem: Markdown formatting is not ideal for a specific LLM**
|
||||
* 8.5.1. **Solutions:**
|
||||
* **Fine-tune `html2text_options` extensively**: This is the first line of defense. Experiment with all available options (see Section 6.2) to control aspects like heading styles, list formatting, code block rendering, etc.
|
||||
* **Consider a custom `MarkdownGenerationStrategy`**: If `html2text` options are insufficient, you might need to build your own strategy, possibly using a different Markdown conversion library or implementing custom transformation logic (see Section 2.1.3).
|
||||
* **Implement post-processing steps**: After getting the Markdown from `MarkdownGenerationResult`, apply your own Python scripts (e.g., using regex) to further refine the formatting (see Section 7.3.2).
|
||||
|
||||
* 8.6. **Debugging Workflow**
|
||||
* 8.6.1. **Start with `raw_html` from `CrawlResult`**: `print(result.html)` This is the very first HTML fetched, before any processing. Is your target content even here?
|
||||
* 8.6.2. **Examine `cleaned_html` (or `fit_html`)**:
|
||||
* If no content filter is used in `MarkdownGenerationStrategy`, inspect `result.cleaned_html`. This is what `DefaultMarkdownGenerator` (with `content_source="cleaned_html"`) will use.
|
||||
* If a content filter *is* used, inspect `result.markdown.fit_html`. This is what `DefaultMarkdownGenerator` will use to produce `fit_markdown`.
|
||||
* Is your target content present in these intermediate HTML stages?
|
||||
* 8.6.3. **Isolate the issue**:
|
||||
* **HTML Cleaning/Scraping:** If content is missing from `cleaned_html` (but present in `raw_html`), the issue lies with the `ContentScrapingStrategy` or `CrawlerRunConfig` parameters like `excluded_tags`, `css_selector`, `target_elements`.
|
||||
* **Content Filtering:** If content is in `cleaned_html` but missing from `fit_html`, the issue is with your `RelevantContentFilter` configuration.
|
||||
* **Markdown Conversion:** If content is in `cleaned_html`/`fit_html` but malformed or missing in the final Markdown fields (`raw_markdown`, `fit_markdown`), the issue is likely with `html2text_options` or the `CustomHTML2Text` conversion process.
|
||||
* 8.6.4. **Use `verbose=True` in relevant configs**: Set `verbose=True` in `BrowserConfig` and `CrawlerRunConfig` for more detailed logging output from Crawl4AI, which can provide clues.
|
||||
|
||||
## 9. Conclusion and Next Steps
|
||||
|
||||
* 9.1. Recap of key strategies for effective Markdown generation.
|
||||
* **Summary:** Crawl4AI provides a flexible Markdown generation pipeline. Start with `DefaultMarkdownGenerator`. Use `html2text_options` for stylistic control. Employ `RelevantContentFilter` strategies (`PruningContentFilter`, `BM25ContentFilter`, `LLMContentFilter`) to create focused `fit_markdown` for LLMs. Choose the appropriate `content_source` based on your needs. For highly custom requirements, implement your own `MarkdownGenerationStrategy`.
|
||||
* 9.2. Pointers to other relevant documentation sections (e.g., `RelevantContentFilter` deep dive, `CustomHTML2Text` options in API reference).
|
||||
* **Suggestion:** For a detailed breakdown of each `RelevantContentFilter`, see the "Content Filtering Strategies" guide. For an exhaustive list of `html2text` options, refer to the `CustomHTML2Text` API documentation or the original `html2text` library's documentation.
|
||||
* 9.3. Encouragement for experimentation and community contributions.
|
||||
* **Call to Action:** The best way to master Markdown generation is to experiment with different configurations and content types. If you develop useful custom strategies or identify improvements, consider contributing them back to the Crawl4AI community!
|
||||
|
||||
---
|
||||
```
|
||||
File diff suppressed because it is too large
Load Diff
File diff suppressed because it is too large
Load Diff
@@ -1,188 +0,0 @@
|
||||
Okay, I have read the "vibe" description for `crawl4ai`. Based on this, and adhering to the "memory" document type requirements, here is the detailed Markdown outline:
|
||||
|
||||
```markdown
|
||||
# Detailed Outline for crawl4ai - vibe Component
|
||||
|
||||
**Target Document Type:** memory
|
||||
**Target Output Filename Suggestion:** `llm_memory_vibe_coding.md`
|
||||
**Library Version Context:** 0.6.3
|
||||
**Outline Generation Date:** 2025-05-24
|
||||
---
|
||||
|
||||
## 1. Vibe Coding with Crawl4AI: Core Concept
|
||||
|
||||
* 1.1. Purpose:
|
||||
* Provides a conceptual framework for interacting with the `crawl4ai` library, particularly when using AI coding assistants.
|
||||
* Aims to simplify the process of building web data applications by focusing on high-level capabilities and key building blocks, enabling users to guide AI assistants effectively even with limited direct `crawl4ai` API knowledge.
|
||||
* 1.2. Principle:
|
||||
* Describes how users can communicate their web scraping and data extraction goals to an AI assistant, which then translates these "vibes" or high-level intentions into `crawl4ai` Python code by leveraging knowledge of the library's core components and configurations.
|
||||
|
||||
## 2. `crawl4ai` High-Level Capabilities (for Vibe Prompts)
|
||||
|
||||
* 2.1. Fetching Webpages
|
||||
* 2.1.1. Description: The library can retrieve content from specified web URLs.
|
||||
* 2.2. Converting Web Content to Clean Markdown
|
||||
* 2.2.1. Description: The library can process raw HTML content and convert it into a cleaned, structured Markdown format.
|
||||
* 2.2.2. Applications: Suitable for content summarization, input for Question & Answering systems, and as a pre-processing step for other LLMs.
|
||||
* 2.3. Extracting Specific Information (JSON)
|
||||
* 2.3.1. Description: The library can extract targeted data elements from webpages and organize them into a JSON structure.
|
||||
* 2.3.2. Examples: Can be used to extract product names, prices from e-commerce sites, article headlines, author names, etc.
|
||||
* 2.4. Crawling Multiple Pages
|
||||
* 2.4.1. Description: The library supports concurrent fetching and processing of a list of URLs.
|
||||
* 2.5. Taking Screenshots and Generating PDFs
|
||||
* 2.5.1. Description: The library can capture visual representations of webpages as PNG screenshots or generate PDF documents.
|
||||
* 2.6. Handling Simple Page Interactions
|
||||
* 2.6.1. Description: The library can execute JavaScript to simulate basic user interactions on a webpage, such as clicking buttons (e.g., "load more") or scrolling.
|
||||
|
||||
## 3. Key `crawl4ai` Building Blocks (API Reference for Vibe Coding Context)
|
||||
|
||||
* 3.1. Class `AsyncWebCrawler`
|
||||
* 3.1.1. Purpose: The primary entry point and main tool within `crawl4ai` for orchestrating web crawling and data extraction tasks.
|
||||
* 3.1.2. Initialization (`__init__`):
|
||||
* Signature: `AsyncWebCrawler(self, crawler_strategy: Optional[AsyncCrawlerStrategy] = None, config: Optional[BrowserConfig] = None, base_directory: str = ..., thread_safe: bool = False, logger: Optional[AsyncLoggerBase] = None, **kwargs)`
|
||||
* Parameters:
|
||||
* `crawler_strategy (Optional[AsyncCrawlerStrategy])`: The underlying strategy for web crawling (e.g., `AsyncPlaywrightCrawlerStrategy`). Defaults to `AsyncPlaywrightCrawlerStrategy`.
|
||||
* `config (Optional[BrowserConfig])`: Configuration for the browser instance. See section 3.5 for details.
|
||||
* Other parameters are generally handled by defaults for vibe coding.
|
||||
* 3.2. Method `AsyncWebCrawler.arun()`
|
||||
* 3.2.1. Purpose: Executes a crawl operation on a single URL or resource.
|
||||
* 3.2.2. Signature: `async def arun(self, url: str, config: Optional[CrawlerRunConfig] = None, **kwargs) -> RunManyReturn`
|
||||
* 3.2.3. Parameters:
|
||||
* `url (str)`: The target resource.
|
||||
* Description: Can be a standard web URL (e.g., "https://example.com"), a local file path (e.g., "file:///path/to/file.html"), or raw HTML content (e.g., "raw:<html>...</html>").
|
||||
* `config (Optional[CrawlerRunConfig])`: An instance of `CrawlerRunConfig` specifying how this particular crawl run should be executed. See section 3.4 for details.
|
||||
* 3.3. Method `AsyncWebCrawler.arun_many()`
|
||||
* 3.3.1. Purpose: Executes crawl operations on a list of URLs or resources, often concurrently.
|
||||
* 3.3.2. Signature: `async def arun_many(self, urls: List[str], config: Optional[CrawlerRunConfig] = None, dispatcher: Optional[BaseDispatcher] = None, **kwargs) -> RunManyReturn`
|
||||
* 3.3.3. Parameters:
|
||||
* `urls (List[str])`: A list of target resources (URLs, file paths, raw HTML strings).
|
||||
* `config (Optional[CrawlerRunConfig])`: An instance of `CrawlerRunConfig` applied to all URLs in the list. See section 3.4 for details.
|
||||
* 3.4. Class `CrawlerRunConfig`
|
||||
* 3.4.1. Purpose: Configuration object for individual crawl runs, controlling aspects like content extraction, page interaction, and output formats.
|
||||
* 3.4.2. Key Parameters for Vibe Coding Context:
|
||||
* `markdown_generator (Optional[MarkdownGenerationStrategy])`:
|
||||
* Description: Specifies the strategy for generating Markdown.
|
||||
* Default: An instance of `DefaultMarkdownGenerator`.
|
||||
* Note for Vibe Coding: Can be `DefaultMarkdownGenerator(content_filter=PruningContentFilter())` for cleaner output.
|
||||
* `extraction_strategy (Optional[ExtractionStrategy])`:
|
||||
* Description: Specifies the strategy for extracting structured data.
|
||||
* Supported Strategies (for Vibe Coding):
|
||||
* `JsonCssExtractionStrategy`: For extracting data based on CSS selectors from structured HTML. Requires a `schema` dictionary.
|
||||
* `LLMExtractionStrategy`: For extracting data using an LLM, often for complex or unstructured HTML. Requires an `LLMConfig` and an `instruction` or Pydantic model defining the desired output.
|
||||
* `js_code (Optional[Union[str, List[str]]])`:
|
||||
* Description: JavaScript code (or a list of code snippets) to be executed on the page after it loads.
|
||||
* `wait_for (Optional[str])`:
|
||||
* Description: A CSS selector or JavaScript expression. The crawler will wait for this condition to be met after `js_code` execution before proceeding.
|
||||
* `session_id (Optional[str])`:
|
||||
* Description: An identifier used to maintain the state of a browser page across multiple `arun` calls. Essential for multi-step interactions on the same page.
|
||||
* `js_only (bool)`:
|
||||
* Description: If `True` (and `session_id` is used), only executes `js_code` on the existing page without a full navigation/reload. Default is `False`.
|
||||
* `screenshot (bool)`:
|
||||
* Description: If `True`, captures a screenshot of the page. Result in `CrawlResult.screenshot`. Default is `False`.
|
||||
* `pdf (bool)`:
|
||||
* Description: If `True`, generates a PDF of the page. Result in `CrawlResult.pdf`. Default is `False`.
|
||||
* `cache_mode (Optional[CacheMode])`:
|
||||
* Description: Controls caching behavior.
|
||||
* Type: `crawl4ai.cache_context.CacheMode` (Enum).
|
||||
* Common Values: `CacheMode.ENABLED`, `CacheMode.BYPASS`.
|
||||
* 3.5. Class `BrowserConfig`
|
||||
* 3.5.1. Purpose: Configures persistent browser-level settings for an `AsyncWebCrawler` instance.
|
||||
* 3.5.2. Key Parameters for Vibe Coding Context:
|
||||
* `headless (bool)`:
|
||||
* Description: If `True`, the browser runs without a visible UI. If `False`, the browser UI is shown.
|
||||
* Default: `True`.
|
||||
* `proxy_config (Optional[Union[ProxyConfig, Dict[str, str]]])`:
|
||||
* Description: Configuration for using a proxy server.
|
||||
* Structure (if dict): `{"server": "http://<host>:<port>", "username": "<user>", "password": "<pass>"}`.
|
||||
* `user_agent (Optional[str])`:
|
||||
* Description: Custom User-Agent string to be used by the browser.
|
||||
* 3.6. Class `LLMConfig`
|
||||
* 3.6.1. Purpose: Configures settings for interacting with Large Language Models, used by `LLMExtractionStrategy`.
|
||||
* 3.6.2. Key Parameters:
|
||||
* `provider (str)`:
|
||||
* Description: Specifies the LLM provider and model identifier.
|
||||
* Examples: "openai/gpt-4o-mini", "ollama/llama3", "anthropic/claude-3-opus-20240229".
|
||||
* `api_token (Optional[str])`:
|
||||
* Description: API key for the LLM provider. Can be the actual key or an environment variable reference (e.g., "env:OPENAI_API_KEY").
|
||||
* 3.7. Class `CrawlResult`
|
||||
* 3.7.1. Purpose: The data object returned by `crawl4ai` operations, containing the results and metadata of a crawl.
|
||||
* 3.7.2. Key Attributes:
|
||||
* `success (bool)`: `True` if the crawl was successful, `False` otherwise.
|
||||
* `markdown (MarkdownGenerationResult)`: Object containing Markdown representations.
|
||||
* `markdown.raw_markdown (str)`: Markdown generated directly from the cleaned HTML.
|
||||
* `markdown.fit_markdown (str)`: Markdown potentially further processed by content filters.
|
||||
* `extracted_content (Optional[str])`: JSON string of structured data if an `ExtractionStrategy` was used and successful.
|
||||
* `links (Links)`: Object containing `internal` and `external` lists of `Link` objects. Each `Link` object has `href`, `text`, `title`.
|
||||
* `media (Media)`: Object containing lists of `MediaItem` for `images`, `videos`, `audios`, and `tables`. Each `MediaItem` has `src`, `alt`, `score`, etc.
|
||||
* `screenshot (Optional[str])`: Base64 encoded string of the PNG screenshot, if `screenshot=True`.
|
||||
* `pdf (Optional[bytes])`: Raw bytes of the PDF document, if `pdf=True`.
|
||||
* `error_message (Optional[str])`: Description of the error if `success` is `False`.
|
||||
|
||||
## 4. Common `crawl4ai` Usage Patterns (Vibe Recipes Mapped to Components)
|
||||
|
||||
* 4.1. Task: Get Clean Markdown from a Page
|
||||
* 4.1.1. Description: Fetch a single webpage and convert its main content into clean Markdown.
|
||||
* 4.1.2. Key `crawl4ai` elements:
|
||||
* `AsyncWebCrawler`
|
||||
* `arun()` method.
|
||||
* `CrawlerRunConfig`:
|
||||
* `markdown_generator`: Typically `DefaultMarkdownGenerator()`. For very clean output, `DefaultMarkdownGenerator(content_filter=PruningContentFilter())`.
|
||||
* 4.2. Task: Extract All Product Names and Prices from an E-commerce Category Page
|
||||
* 4.2.1. Description: Scrape structured data (e.g., product names, prices) from a page with repeating elements.
|
||||
* 4.2.2. Key `crawl4ai` elements:
|
||||
* `AsyncWebCrawler`
|
||||
* `arun()` method.
|
||||
* `CrawlerRunConfig`:
|
||||
* `extraction_strategy`: `JsonCssExtractionStrategy(schema={"name_field": "h2.product-title", "price_field": "span.price"})`. The schema's CSS selectors identify where to find the data.
|
||||
* 4.3. Task: Extract Key Information from an Article using an LLM
|
||||
* 4.3.1. Description: Use an LLM to parse an article and extract specific fields like author, date, and a summary into a JSON format.
|
||||
* 4.3.2. Key `crawl4ai` elements:
|
||||
* `AsyncWebCrawler`
|
||||
* `arun()` method.
|
||||
* `CrawlerRunConfig`:
|
||||
* `extraction_strategy`: `LLMExtractionStrategy(llm_config=..., instruction=..., schema=...)`.
|
||||
* `LLMConfig`: Instance specifying `provider` (e.g., "openai/gpt-4o-mini") and `api_token`.
|
||||
* Schema for `LLMExtractionStrategy`: Can be a Pydantic model definition or a dictionary describing the target JSON structure.
|
||||
* 4.4. Task: Crawl Multiple Pages of a Blog (Clicking "Next Page")
|
||||
* 4.4.1. Description: Navigate through paginated content by simulating clicks on "Next Page" or similar links, collecting data from each page.
|
||||
* 4.4.2. Key `crawl4ai` elements:
|
||||
* `AsyncWebCrawler`
|
||||
* Multiple sequential calls to `arun()` (typically in a loop).
|
||||
* `CrawlerRunConfig` (reused or cloned for each step):
|
||||
* `session_id`: A consistent identifier (e.g., "blog_pagination_session") to maintain the browser state across `arun` calls.
|
||||
* `js_code`: JavaScript to trigger the "Next Page" action (e.g., `document.querySelector('a.next-page-link').click();`).
|
||||
* `wait_for`: A CSS selector or JavaScript condition to ensure the new page content has loaded before proceeding.
|
||||
* `js_only=True`: For subsequent `arun` calls after the initial page load to indicate only JS interaction without full navigation.
|
||||
* 4.5. Task: Get Screenshots of a List of URLs
|
||||
* 4.5.1. Description: Capture screenshots for a batch of URLs.
|
||||
* 4.5.2. Key `crawl4ai` elements:
|
||||
* `AsyncWebCrawler`
|
||||
* `arun_many()` method.
|
||||
* `CrawlerRunConfig`:
|
||||
* `screenshot=True`.
|
||||
|
||||
## 5. Key Input Considerations for `crawl4ai` Operations (Inferred from Vibe Prompting Tips)
|
||||
|
||||
* 5.1. Clear Objective: `crawl4ai` operations are guided by the configuration. The configuration should reflect the user's goal (e.g., Markdown generation, specific data extraction, media capture).
|
||||
* 5.2. URL Input: The `arun` method requires a single `url` string. `arun_many` requires a `List[str]` of URLs.
|
||||
* 5.3. Structured Data Extraction Guidance:
|
||||
* For `JsonCssExtractionStrategy`, the `schema` parameter (a dictionary mapping desired field names to CSS selectors) is essential.
|
||||
* For `LLMExtractionStrategy`, the `instruction` parameter (natural language description of desired data) and/or a `schema` (Pydantic model or dictionary) are crucial, along with a configured `LLMConfig`.
|
||||
* 5.4. LLM Configuration: When `LLMExtractionStrategy` is used, an `LLMConfig` instance specifying `provider` and `api_token` (if applicable) must be provided.
|
||||
* 5.5. Dynamic Page Handling: For pages requiring interaction, `CrawlerRunConfig` parameters like `js_code`, `wait_for`, `session_id`, and `js_only` are used.
|
||||
|
||||
## 6. Expected Output Data from `crawl4ai` Operations (Accessing `CrawlResult`)
|
||||
|
||||
* 6.1. Generated Python Code: When using an AI assistant with `crawl4ai` context, the AI is expected to generate Python code that utilizes `crawl4ai` classes and methods.
|
||||
* 6.2. `CrawlResult` Object: The primary output of `arun()` and `arun_many()` calls.
|
||||
* `result.success (bool)`: Indicates if the individual crawl operation was successful.
|
||||
* `result.markdown.raw_markdown (str)` / `result.markdown.fit_markdown (str)`: Contains the generated Markdown content.
|
||||
* `result.extracted_content (Optional[str])`: Contains the JSON string of structured data if an extraction strategy was successful.
|
||||
* `result.links (Links)`: Provides access to lists of internal and external links.
|
||||
* `result.media (Media)`: Provides access to lists of images, videos, audio files, and tables.
|
||||
* `result.screenshot (Optional[str])`: Base64 encoded screenshot data.
|
||||
* `result.pdf (Optional[bytes])`: Raw PDF data.
|
||||
* `result.error_message (Optional[str])`: Error details if `success` is `False`.
|
||||
* 6.3. Files on Disk: Operations like screenshot capture or PDF generation, or custom code within an AI-generated script, may save files to the local disk (e.g., PNGs, PDFs, JSON files). The paths depend on the configuration or the custom code.
|
||||
|
||||
```
|
||||
@@ -1,643 +0,0 @@
|
||||
```markdown
|
||||
# Detailed Outline for crawl4ai - vibe Component
|
||||
|
||||
**Target Document Type:** reasoning
|
||||
**Target Output Filename Suggestion:** `llm_reasoning_vibe.md`
|
||||
**Library Version Context:** 0.6.3
|
||||
**Outline Generation Date:** 2025-05-24
|
||||
---
|
||||
|
||||
# Vibe Coding with Crawl4AI: A Guide for AI Coding Assistants
|
||||
|
||||
## 1. Introduction: What is "Vibe Coding" with Crawl4AI?
|
||||
|
||||
* **1.1. Explaining the Core Idea:**
|
||||
"Vibe Coding" with Crawl4AI is about empowering users, even those with limited direct coding experience, to build powerful web data applications. Instead of meticulously writing every line of Python, you focus on clearly describing your data extraction or web interaction goals to an AI coding assistant. This guide teaches you how to provide that AI assistant with the *right context* about Crawl4AI, so it can generate effective and efficient code for you. The goal is to translate your "vibe" – your high-level intent – into working Crawl4AI solutions quickly.
|
||||
|
||||
* **1.2. Who is this Guide For?**
|
||||
This guide is designed for:
|
||||
* **Users new to web scraping or `crawl4ai`:** If you prefer to articulate your needs in natural language and have an AI assistant handle the code generation, this guide is for you.
|
||||
* **Data analysts, researchers, and product managers:** Anyone who needs web data but doesn't want to get bogged down in the intricacies of web scraping libraries.
|
||||
* **Developers looking for rapid prototyping:** Even experienced developers can use "vibe coding" to quickly generate boilerplate or test ideas with `crawl4ai` before refining the code.
|
||||
* **AI Coding Assistant Users:** This guide helps you understand what information to feed your AI to get the best `crawl4ai` code.
|
||||
|
||||
* **1.3. How this Guide Helps You (and Your AI Assistant):**
|
||||
By understanding the concepts in this guide, you (and by extension, your AI assistant) will:
|
||||
* Grasp the high-level capabilities of `crawl4ai` that are most relevant for prompting an AI.
|
||||
* Learn the key terminology and building blocks of `crawl4ai` to include in your prompts for precise code generation.
|
||||
* Discover common "vibe recipes" – typical data extraction tasks and how to prompt an AI to solve them using `crawl4ai`.
|
||||
* Pick up effective prompting patterns to maximize the quality of AI-generated `crawl4ai` code.
|
||||
|
||||
## 2. High-Level Capabilities of Crawl4AI (What to Tell Your AI Assistant Crawl4AI Can Do)
|
||||
|
||||
When you're "vibe coding" with your AI assistant, you don't need to explain every nuance of `crawl4ai`. Instead, focus on what it *can do* for you. Here's a high-level overview of capabilities you can confidently tell your AI assistant about:
|
||||
|
||||
* **2.1. Fetching Any Webpage:**
|
||||
* **How to tell your AI:** "Crawl4AI can fetch the content of any webpage, whether it's a simple static page or a complex JavaScript-heavy application."
|
||||
* **Why it's important:** This establishes the fundamental capability – getting the raw HTML from a target URL.
|
||||
|
||||
* **2.2. Converting Web Content into Clean Markdown:**
|
||||
* **How to tell your AI:** "Crawl4AI is great at turning messy web pages into clean, readable Markdown. This is perfect if I need to summarize an article, feed content into another LLM for Q&A, or just get the main text."
|
||||
* **Why it's important:** Markdown is often the desired end-format for LLM-based tasks, and `crawl4ai` simplifies this conversion.
|
||||
|
||||
* **2.3. Extracting Specific Pieces of Information (Structured Data/JSON):**
|
||||
* **How to tell your AI:** "If I need specific data from a page, like all the product names and prices from an e-commerce site, or all the headlines from a news page, Crawl4AI can extract that and give it to me as structured JSON."
|
||||
* **Why it's important:** This highlights `crawl4ai`'s ability to go beyond simple text extraction and pull out specific, targeted information.
|
||||
|
||||
* **2.4. Crawling Multiple Pages at Once:**
|
||||
* **How to tell your AI:** "If I have a list of URLs, Crawl4AI can process them all efficiently, often in parallel, instead of me having to do them one by one."
|
||||
* **Why it's important:** This points to the library's capability for batch processing, saving significant time for larger tasks.
|
||||
|
||||
* **2.5. Taking Screenshots or Making PDFs of Pages:**
|
||||
* **How to tell your AI:** "Crawl4AI can take a screenshot of a webpage or save it as a PDF document. This is useful for archiving, visual checks, or sharing."
|
||||
* **Why it's important:** Visual capture and PDF generation are common requirements that `crawl4ai` handles easily.
|
||||
|
||||
* **2.6. Handling Simple Page Interactions:**
|
||||
* **How to tell your AI:** "For pages that need a bit of interaction, like clicking a 'load more' button to reveal more content or scrolling down, Crawl4AI can execute JavaScript to do that."
|
||||
* **Why it's important:** This shows that `crawl4ai` isn't limited to static content and can handle basic dynamic page elements to access more data.
|
||||
|
||||
## 3. Key Building Blocks: Communicating Crawl4AI Concepts to Your AI Assistant
|
||||
|
||||
To get the best results from your AI assistant, you'll want to mention the key components of `crawl4ai` it should use. Think of these as the essential vocabulary for your "vibe" prompts.
|
||||
|
||||
* **3.1. `AsyncWebCrawler`**: The Main Tool
|
||||
* **3.1.1. Explaining its Role:**
|
||||
* **What to tell your AI:** "The main tool we'll use in `crawl4ai` is called `AsyncWebCrawler`. We always need to create an instance of this first. It's best to use it with an `async with` statement to make sure resources are managed correctly."
|
||||
* **Why it's important for the AI:** This directs the AI to the central class for all crawling operations and hints at the asynchronous nature and proper resource management.
|
||||
* **3.1.2. Basic Usage Pattern:**
|
||||
* **Vibe Prompt:** "Let's start a `crawl4ai` script. We'll need an `AsyncWebCrawler`."
|
||||
* **Expected AI action:** The AI should generate the basic setup:
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig # Common imports
|
||||
|
||||
async def main():
|
||||
# browser_config = BrowserConfig(...) # Optional, if needed
|
||||
async with AsyncWebCrawler(# config=browser_config
|
||||
) as crawler:
|
||||
# ... your crawl logic will go here ...
|
||||
pass
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
* **3.2. `arun(url, config)`**: Crawling a Single Page
|
||||
* **3.2.1. Explaining its Purpose:**
|
||||
* **What to tell your AI:** "To crawl a single page, we'll use the `arun` method of the `AsyncWebCrawler` instance. It takes the URL and an optional configuration object."
|
||||
* **Why it's important for the AI:** This points to the primary method for individual page crawls.
|
||||
* **3.2.2. Guiding the AI on `url` Specification:**
|
||||
* **What to tell your AI:** "The `url` for `arun` can be a regular web address (like 'https://example.com'), a local file path (if it starts with `file://`), or even raw HTML content (if it starts with `raw:`)."
|
||||
* **Vibe Prompts & AI Interpretation:**
|
||||
* "Crawl this URL: https://my-target-site.com" -> AI uses `url="https://my-target-site.com"`
|
||||
* "Process this local HTML file: /path/to/my/page.html" -> AI uses `url="file:///path/to/my/page.html"`
|
||||
* "I have some raw HTML I want to parse: `<html><body><h1>Test</h1></body></html>`" -> AI uses `url="raw:<html><body><h1>Test</h1></body></html>"`
|
||||
* **3.2.3. Emphasizing `config` (`CrawlerRunConfig`):**
|
||||
* **What to tell your AI:** "The most important part for customizing a crawl is the `config` parameter of `arun`. This is where we'll use `CrawlerRunConfig` to tell `crawl4ai` *how* to crawl, what to extract, and what to do with the page."
|
||||
* **Vibe Prompt:** "...and when you call `arun`, pass in a `CrawlerRunConfig` object with these settings: [describe settings]."
|
||||
|
||||
* **3.3. `arun_many(urls, config)`**: Crawling Multiple Pages
|
||||
* **3.3.1. Explaining its Purpose:**
|
||||
* **What to tell your AI:** "If I have a list of URLs, we should use `arun_many` instead of calling `arun` in a loop. It's more efficient for batch crawling."
|
||||
* **Why it's important for the AI:** Guides the AI to use the optimized method for multiple URLs, which handles concurrency internally.
|
||||
* **3.3.2. Vibe Prompt:**
|
||||
* "Crawl all these pages: https://example.com/page1, https://example.com/page2, https://example.com/page3."
|
||||
* **Expected AI action:**
|
||||
```python
|
||||
# ... inside async def main() and async with AsyncWebCrawler ...
|
||||
list_of_urls = [
|
||||
"https://example.com/page1",
|
||||
"https://example.com/page2",
|
||||
"https://example.com/page3"
|
||||
]
|
||||
# run_config = CrawlerRunConfig(...) # Define config for all URLs
|
||||
results = await crawler.arun_many(urls=list_of_urls, config=run_config)
|
||||
for result in results:
|
||||
# process each result
|
||||
pass
|
||||
```
|
||||
|
||||
* **3.4. `CrawlerRunConfig`**: Tailoring the Crawl (Simplified for Vibe Coding)
|
||||
* **3.4.1. Getting Markdown:**
|
||||
* **Basic Request Vibe:** "Get the Markdown for this page [URL]."
|
||||
* **AI's understanding:** The AI should know that `crawl4ai` produces Markdown by default, so a very simple `CrawlerRunConfig()` or even `None` might suffice if no other settings are needed.
|
||||
* **For Cleaner Markdown Vibe:** "I need very clean Markdown from [URL], suitable for feeding into another LLM. Remove boilerplate like headers, footers, and navigation."
|
||||
* **AI's understanding:** "Ah, the user wants cleaner output. I should suggest or use `DefaultMarkdownGenerator` combined with `PruningContentFilter`. The `PruningContentFilter` heuristically tries to remove common non-content sections."
|
||||
* **Code Snippet AI might generate:**
|
||||
```python
|
||||
from crawl4ai import CrawlerRunConfig, DefaultMarkdownGenerator
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
|
||||
markdown_generator = DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter()
|
||||
)
|
||||
run_config = CrawlerRunConfig(
|
||||
markdown_generator=markdown_generator
|
||||
)
|
||||
# result = await crawler.arun(url="...", config=run_config)
|
||||
# print(result.markdown.fit_markdown) # Access the filtered markdown
|
||||
```
|
||||
* **Reasoning to convey (implicitly or explicitly):** `PruningContentFilter` is a good first step for general boilerplate removal. `DefaultMarkdownGenerator` then converts the filtered HTML to Markdown. Accessing `result.markdown.fit_markdown` gives this cleaner version.
|
||||
* **3.4.2. Extracting Specific Data (JSON):**
|
||||
* **3.4.2.1. Using `JsonCssExtractionStrategy` for Structured Pages:**
|
||||
* **When to suggest it (Vibe):** "The page I want to crawl ([URL]) has a list of items, like products on an e-commerce category page, and each item has a similar layout. I want to extract these repeating items into a JSON list."
|
||||
* **Information to provide the AI (Vibe):** "For each item, I want to get the 'product_name', which is usually in an `<h2>` tag, and the 'price', which seems to be in a `<span>` tag with a class like 'price-tag' or 'current-price'."
|
||||
* **AI's Role & Reasoning:** The AI should recognize this pattern and suggest `JsonCssExtractionStrategy`. It understands that the user is describing a schema. The AI's job is to translate "name from h2" into `{"name": "product_name", "selector": "h2", "type": "text"}` within the `fields` list of a schema dictionary, and the overall repeating item selector into `baseSelector`. (Note: the `extraction_type="schema"` setting belongs to `LLMExtractionStrategy`; with `JsonCssExtractionStrategy` the schema is applied directly via CSS selectors, so no such flag is needed here.)
|
||||
* **Code Snippet AI might generate:**
|
||||
```python
|
||||
from crawl4ai import CrawlerRunConfig
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
# AI would help construct this schema based on user's description
|
||||
schema = {
|
||||
"name": "ProductList",
|
||||
"baseSelector": "div.product-item", # Example selector for each product block
|
||||
"fields": [
|
||||
{"name": "product_name", "selector": "h2.product-title", "type": "text"},
|
||||
{"name": "price", "selector": "span.price-tag", "type": "text"}
|
||||
]
|
||||
}
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema=schema)
|
||||
run_config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
|
||||
# result = await crawler.arun(url="...", config=run_config)
|
||||
# if result.success and result.extracted_content:
|
||||
# products = json.loads(result.extracted_content)
|
||||
# for product in products:
|
||||
# print(f"Name: {product.get('product_name')}, Price: {product.get('price')}")
|
||||
```
|
||||
* **3.4.2.2. Using `LLMExtractionStrategy` for Complex/Unclear Structures:**
|
||||
* **When to suggest it (Vibe):** "The page ([URL]) has the information I want, but it's not in a clear, repeating list, or it's mixed in with a lot of text. I need the AI to understand the content to pull out specific details." Or, "I want to extract information that requires some interpretation, like summarizing a paragraph."
|
||||
* **Information to provide the AI (Vibe):**
|
||||
* "Use `LLMExtractionStrategy` for this."
|
||||
* "The LLM I want to use is [LLM provider/model, e.g., 'openai/gpt-4o-mini'] and my API key is [YOUR_API_KEY_OR_ENV_VAR_NAME] (or tell it to look for an env var)."
|
||||
* **Option A (Describing fields):** "I need a JSON object with the following fields: 'author_name', 'article_publish_date', and a 'short_summary' (about 2 sentences)."
|
||||
* **Option B (Example JSON):** "The JSON output should look something like this: `{\"author\": \"Jane Doe\", \"published_on\": \"2024-05-23\", \"summary\": \"This article discusses...\"}`."
|
||||
* **Option C (Pydantic Model - more advanced but best for AI):** "Here's a Pydantic model that defines the structure I want: [Pydantic Class Code Snippet]. Use this for the schema."
|
||||
* **AI's Role & Reasoning:** The AI needs to construct an `LLMConfig` and an `LLMExtractionStrategy`. If the user provides field descriptions or an example JSON, the AI can generate a simple schema dictionary. If a Pydantic model is provided, the AI should use `MyPydanticModel.model_json_schema()` to create the schema for `LLMExtractionStrategy`. This strategy is powerful because it leverages the LLM's understanding.
|
||||
* **Code Snippet AI might generate (with Pydantic example):**
|
||||
```python
|
||||
from crawl4ai import CrawlerRunConfig, LLMConfig
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
from pydantic import BaseModel, Field # Assuming user might provide this
|
||||
|
||||
# User might provide this, or AI generates it from description
|
||||
class ArticleInfo(BaseModel):
|
||||
author_name: str = Field(description="The main author of the article")
|
||||
publication_date: str = Field(description="The date the article was published, e.g., YYYY-MM-DD")
|
||||
short_summary: str = Field(description="A concise 2-3 sentence summary of the article")
|
||||
|
||||
llm_config = LLMConfig(
|
||||
provider="openai/gpt-4o-mini", # Or user's choice
|
||||
api_token="env:OPENAI_API_KEY" # Or direct key if user insists and understands risk
|
||||
)
|
||||
extraction_strategy = LLMExtractionStrategy(
|
||||
llm_config=llm_config,
|
||||
schema=ArticleInfo.model_json_schema(),
|
||||
# instruction="Extract author, publication date, and a summary." # Could also be used
|
||||
extraction_type="schema" # Important for Pydantic/JSON schema
|
||||
)
|
||||
run_config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
|
||||
# result = await crawler.arun(url="...", config=run_config)
|
||||
# if result.success and result.extracted_content:
|
||||
# article_data = json.loads(result.extracted_content) # Or ArticleInfo.model_validate_json(result.extracted_content)
|
||||
# print(article_data)
|
||||
```
|
||||
* **3.4.3. Interacting with Pages (Dynamic Content):**
|
||||
* **How to tell your AI (Vibe):** "This page ([URL]) loads more content when you scroll down, or when you click a 'Show More' button. `crawl4ai` needs to perform this interaction."
|
||||
* **For clicking (Vibe):** "To get all the data, we need to click the button with text 'Load All Comments'."
|
||||
* **AI's understanding:** This requires `js_code` to find and click the button. The AI should be guided that finding elements by text might involve more complex JS like `Array.from(document.querySelectorAll('button')).find(btn => btn.textContent.includes('Load All Comments')).click();`.
|
||||
* **For scrolling (Vibe):** "Scroll to the bottom of the page to make sure everything loads."
|
||||
* **AI's understanding:** `js_code` like `window.scrollTo(0, document.body.scrollHeight);`
|
||||
* **Ensuring actions complete (Vibe):** "After clicking 'Load More', wait for the new items to appear. They usually show up in a `div` with class `comment-list` and we expect more than 10 comments."
|
||||
* **AI's understanding:** Use `wait_for`. This can be a CSS selector (e.g., `wait_for="css:.comment-list .comment-item:nth-child(11)"`) or a JS condition (e.g., `wait_for="js:() => document.querySelectorAll('.comment-item').length > 10"`).
|
||||
* **For multi-step interactions on the same page (Vibe):** "I need to first click 'Agree to Cookies', then click 'Show Details', then extract the text. Make sure these happen on the same page view."
|
||||
* **AI's understanding:** "This requires a `session_id` to persist the page state across multiple `arun` calls. Each `arun` call will perform one step of the interaction."
|
||||
* **If only JS interaction is needed (Vibe):** "After the first page load, the next actions (like clicking 'Next Page') only update part of the page with JavaScript, they don't reload everything."
|
||||
* **AI's understanding:** "For these subsequent `arun` calls within the same session, set `js_only=True` in `CrawlerRunConfig` to prevent unnecessary full page navigations, making it faster."
|
||||
* **3.4.4. Taking Screenshots or PDFs:**
|
||||
* **Simple request (Vibe):** "Get me a screenshot of [URL]."
|
||||
* **AI's understanding:** Set `screenshot=True` in `CrawlerRunConfig`.
|
||||
* **Or (Vibe):** "I need a PDF version of [URL]."
|
||||
* **AI's understanding:** Set `pdf=True` in `CrawlerRunConfig`.
|
||||
* **3.4.5. Caching for Speed and Freshness:**
|
||||
* **For faster testing/development (Vibe):** "I'm testing my script for [URL]. Can you make it use the cache so it's faster after the first run?"
|
||||
* **AI's understanding:** Use `cache_mode=CacheMode.ENABLED`.
|
||||
* **Reasoning:** This saves time during development by re-using previously fetched content.
|
||||
* **To get the latest data (Vibe):** "I need the absolute latest version of [URL], don't use any cached data."
|
||||
* **AI's understanding:** Use `cache_mode=CacheMode.BYPASS`.
|
||||
* **Reasoning:** Ensures `crawl4ai` fetches a fresh copy from the web, ignoring any local cache. This is crucial for production or when data changes frequently.
|
||||
|
||||
* **3.5. `BrowserConfig`**: Browser-Level Settings (Simplified for Vibe Coding)
|
||||
* **3.5.1. When it's usually not needed:**
|
||||
* **What to tell your AI (or what it should infer):** "For most simple crawls where you just need to fetch a page and get its content, you probably don't need to worry about `BrowserConfig`. `crawl4ai` uses sensible defaults."
|
||||
* **3.5.2. Headless Mode:**
|
||||
* **Default behavior (Vibe):** "By default, `crawl4ai` runs 'headless', which means you won't see a browser window pop up. This is usually what we want for automated scripts."
|
||||
* **To see the browser (Vibe):** "I'm having trouble with [URL], and I want to see what the browser is doing. Can you make it run in non-headless mode?"
|
||||
* **AI's understanding:** The user wants to visually debug. Set `headless=False` in `BrowserConfig`.
|
||||
* **3.5.3. Using a Proxy:**
|
||||
* **How to ask (Vibe):** "I need to make `crawl4ai` use a proxy server for this crawl."
|
||||
* **Information to provide (Vibe):** "The proxy server address is `http://myproxy.server.com:8080`. If it needs a username and password, they are `myuser` and `mypass`. Make sure to use the `proxy_config` setting in `BrowserConfig`."
|
||||
* **AI's understanding:** The AI should construct a `ProxyConfig` object (or dictionary that `BrowserConfig` can handle) and pass it to `BrowserConfig`.
|
||||
* **3.5.4. Changing User Agent:**
|
||||
* **How to ask (Vibe):** "The website [URL] might be blocking default user agents. Can we make `crawl4ai` look like it's Firefox on a Mac?"
|
||||
* **Information to provide (Vibe):** "You can set a custom `user_agent` string in `BrowserConfig`. For example, 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) Gecko/20100101 Firefox/115.0'."
|
||||
* **AI's understanding:** The AI should pass the provided string to the `user_agent` parameter of `BrowserConfig`.
|
||||
|
||||
* **3.6. `LLMConfig`**: Configuring Language Models (Simplified for Vibe Coding)
|
||||
* **3.6.1. When it's needed:**
|
||||
* **What to tell your AI:** "If we're using `LLMExtractionStrategy` to extract structured data or `LLMContentFilter` to clean up content, we need to tell `crawl4ai` which language model to use. This is done with an `LLMConfig` object."
|
||||
* **3.6.2. Information to provide the AI (Vibe):**
|
||||
* **Model choice:** "For this task, let's use the `provider` called 'openai/gpt-4o-mini'." (Other examples: 'ollama/llama3', 'anthropic/claude-3-opus-20240229').
|
||||
* **API Key:** "My `api_token` for this provider is [YOUR_API_KEY_PLACEHOLDER]. (Best practice is to tell the AI to get it from an environment variable, e.g., 'env:OPENAI_API_KEY')."
|
||||
* **AI's understanding:** The AI will create an `LLMConfig(provider="...", api_token="...")` and pass it to the relevant strategy.
|
||||
* **Code Snippet AI might generate:**
|
||||
```python
|
||||
from crawl4ai import LLMConfig
|
||||
# For OpenAI
|
||||
llm_conf = LLMConfig(provider="openai/gpt-4o-mini", api_token="env:OPENAI_API_KEY")
|
||||
# For Ollama (locally running Llama3)
|
||||
# llm_conf = LLMConfig(provider="ollama/llama3") # api_token often not needed for local Ollama
|
||||
```
|
||||
|
||||
* **3.7. The `CrawlResult`**: Understanding What You Get Back
|
||||
* **3.7.1. Checking for Success:**
|
||||
* **What to tell your AI (Crucial Vibe):** "When `crawl4ai` finishes an `arun` or `arun_many` call, the most important first step is to check if it was successful. Tell the AI to always generate code that checks `result.success`. This will be `True` or `False`."
|
||||
* **If `False` (Vibe):** "If `result.success` is `False`, the AI should print or log `result.error_message` to tell us what went wrong."
|
||||
* **3.7.2. Accessing Markdown Content:**
|
||||
* **Raw Markdown (Vibe):** "The main text content of the page, converted to Markdown, is usually in `result.markdown.raw_markdown`."
|
||||
* **Filtered Markdown (Vibe):** "If we used a content filter (like `PruningContentFilter`), the cleaner, more focused Markdown will be in `result.markdown.fit_markdown`."
|
||||
* **3.7.3. Accessing Extracted Structured Data (JSON):**
|
||||
* **Where to find it (Vibe):** "If we asked `crawl4ai` to extract specific structured data (using `JsonCssExtractionStrategy` or `LLMExtractionStrategy`), that data will be in `result.extracted_content`."
|
||||
* **How to use it (Vibe):** "The `result.extracted_content` is a JSON string. To use it in Python, tell the AI to parse it using `json.loads()`."
|
||||
* **3.7.4. Accessing Links:**
|
||||
* **What it contains (Vibe):** "`result.links` is a dictionary. It usually has two keys: 'internal' and 'external'. Each of these is a list of links found on the page, with details like the link text and URL."
|
||||
* **3.7.5. Accessing Media Information:**
|
||||
* **What it contains (Vibe):** "`result.media` is another dictionary. It can contain lists of 'images', 'videos', or 'audios' found on the page, along with their URLs and sometimes alt text or descriptions."
|
||||
* **3.7.6. Accessing Screenshots and PDFs:**
|
||||
* **Screenshots (Vibe):** "If we set `screenshot=True` in the config, the screenshot image data (which is a base64 encoded string) will be in `result.screenshot`."
|
||||
* **PDFs (Vibe):** "If we set `pdf=True`, the PDF data (as raw bytes) will be in `result.pdf`."
|
||||
|
||||
## 4. Common "Vibe" Recipes & Prompting Patterns for AI Assistants
|
||||
|
||||
Here are some common scenarios and how you can prompt your AI assistant to use `crawl4ai` to solve them.
|
||||
|
||||
* **4.1. Recipe 1: Get Clean Markdown from a Page**
|
||||
* **4.1.1. User Vibe Prompt to AI:**
|
||||
* *Example 1 (Basic):* "Hey AI, can you use `crawl4ai` to get me the clean Markdown content of `https://blog.example.com/my-latest-post`?"
|
||||
* *Example 2 (Requesting more cleaning):* "AI, I need *very* clean Markdown from `https://some.news.site/article123`. It's for an LLM, so please try to remove ads, navigation, and footers."
|
||||
* **4.1.2. Key `crawl4ai` elements the AI should use (and why):**
|
||||
* **`AsyncWebCrawler`:** The starting point for any crawl.
|
||||
* **`arun`:** Since it's a single URL.
|
||||
* **`CrawlerRunConfig`:**
|
||||
* For Example 1, the default `DefaultMarkdownGenerator` might be sufficient.
|
||||
* For Example 2 ("very clean"), the AI should be guided to use `DefaultMarkdownGenerator(content_filter=PruningContentFilter())`.
|
||||
* **Reasoning:** `PruningContentFilter` is designed to heuristically remove common boilerplate like headers, footers, and navigation bars before Markdown conversion, leading to cleaner text suitable for LLMs.
|
||||
* **4.1.3. Expected Output from AI-generated code:**
|
||||
* A Python script that initializes `AsyncWebCrawler`, calls `arun` with the appropriate URL and config.
|
||||
* The script should then access and print (or save) `result.markdown.raw_markdown` (for basic) or `result.markdown.fit_markdown` (if `PruningContentFilter` was used).
|
||||
* **Code Example (for "very clean"):**
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
|
||||
async def get_clean_markdown(url_to_crawl):
|
||||
markdown_generator = DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter()
|
||||
)
|
||||
run_config = CrawlerRunConfig(
|
||||
markdown_generator=markdown_generator,
|
||||
cache_mode="BYPASS" # Ensure fresh crawl for demo
|
||||
)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url=url_to_crawl, config=run_config)
|
||||
if result.success:
|
||||
print(f"--- Fit Markdown for {url_to_crawl} ---")
|
||||
print(result.markdown.fit_markdown)
|
||||
# You might also want to see raw_markdown to compare
|
||||
# print(f"--- Raw Markdown for {url_to_crawl} ---")
|
||||
# print(result.markdown.raw_markdown)
|
||||
else:
|
||||
print(f"Failed to crawl {url_to_crawl}: {result.error_message}")
|
||||
|
||||
# asyncio.run(get_clean_markdown("https://en.wikipedia.org/wiki/Python_(programming_language)"))
|
||||
```
|
||||
|
||||
* **4.2. Recipe 2: Extract All Product Names and Prices from an E-commerce Category Page**
|
||||
* **4.2.1. User Vibe Prompt to AI:**
|
||||
* *Example:* "AI, I need to use `crawl4ai` to get all product names and their prices from `https://www.example-store.com/laptops`. On that page, product names look like they are in `<h3>` tags with a class `product-title`, and prices are in `<span>` elements with the class `final-price`."
|
||||
* **4.2.2. Key `crawl4ai` elements AI should use (and why):**
|
||||
* **`AsyncWebCrawler`**, **`arun`**.
|
||||
* **`CrawlerRunConfig`** with **`JsonCssExtractionStrategy`**.
|
||||
* **Reasoning:** The user described a page with repeating structured items. `JsonCssExtractionStrategy` is ideal for this as it uses CSS selectors to pinpoint the data. The AI's task is to translate the user's description of element locations into a valid schema for the strategy.
|
||||
* The AI needs to understand that `baseSelector` in the schema should target the container for each product, and `fields` will target individual pieces of data within that container.
|
||||
* **4.2.3. Expected Output from AI-generated code:**
|
||||
* A Python script that defines the schema dictionary.
|
||||
* Initializes `JsonCssExtractionStrategy` with this schema.
|
||||
* Passes the strategy to `CrawlerRunConfig`.
|
||||
* After `arun`, it parses `result.extracted_content` using `json.loads()` and likely iterates through the list of extracted product dictionaries.
|
||||
* **Code Example:**
|
||||
```python
|
||||
import asyncio
|
||||
import json
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
async def extract_products(url_to_crawl):
|
||||
# AI helps create this schema based on user's description
|
||||
product_schema = {
|
||||
"name": "LaptopList",
|
||||
"baseSelector": "div.product-listing-item", # Hypothetical selector for each product's container
|
||||
"fields": [
|
||||
{"name": "product_name", "selector": "h3.product-title", "type": "text"},
|
||||
{"name": "price", "selector": "span.final-price", "type": "text"}
|
||||
]
|
||||
}
|
||||
extraction_strategy = JsonCssExtractionStrategy(schema=product_schema)
|
||||
run_config = CrawlerRunConfig(
|
||||
extraction_strategy=extraction_strategy,
|
||||
cache_mode="BYPASS"
|
||||
)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url=url_to_crawl, config=run_config)
|
||||
if result.success and result.extracted_content:
|
||||
products = json.loads(result.extracted_content)
|
||||
print(f"Found {len(products)} products:")
|
||||
for i, product in enumerate(products[:3]): # Print first 3
|
||||
print(f" Product {i+1}: Name='{product.get('product_name')}', Price='{product.get('price')}'")
|
||||
else:
|
||||
print(f"Failed to extract products from {url_to_crawl}: {result.error_message}")
|
||||
|
||||
# asyncio.run(extract_products("https://www.example-store.com/laptops")) # Replace with a real URL for testing
|
||||
```
|
||||
|
||||
* **4.3. Recipe 3: Extract Key Information from an Article using an LLM**
|
||||
* **4.3.1. User Vibe Prompt to AI:**
|
||||
* *Example:* "AI, I want `crawl4ai` to read this article: `https://example.com/news/ai-breakthrough`. Use `openai/gpt-4o-mini` to extract the author's name, the publication date, and a short (2-3 sentence) summary. The output should be JSON. My OpenAI API key is in the `OPENAI_API_KEY` environment variable."
|
||||
* **4.3.2. Key `crawl4ai` elements AI should use (and why):**
|
||||
* **`AsyncWebCrawler`**, **`arun`**.
|
||||
* **`CrawlerRunConfig`** with **`LLMExtractionStrategy`**.
|
||||
* **`LLMConfig`**: To specify the `provider` ("openai/gpt-4o-mini") and `api_token` ("env:OPENAI_API_KEY").
|
||||
* **Reasoning:** The task requires understanding and summarization, making `LLMExtractionStrategy` suitable. The AI needs to construct a schema (either a simple dictionary or a Pydantic model `model_json_schema()`) that tells the LLM what fields to populate. The instruction to the LLM will be implicitly derived from the schema field descriptions or can be explicitly provided.
|
||||
* **4.3.3. Expected Output from AI-generated code:**
|
||||
* Python script that defines a Pydantic model (or a dictionary schema).
|
||||
* Initializes `LLMConfig` and `LLMExtractionStrategy`.
|
||||
* Parses `result.extracted_content`.
|
||||
* **Code Example (using Pydantic):**
|
||||
```python
|
||||
import asyncio
|
||||
import json
|
||||
import os
|
||||
from pydantic import BaseModel, Field
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
|
||||
class ArticleDetails(BaseModel):
|
||||
author_name: str = Field(..., description="The main author of the article.")
|
||||
publication_date: str = Field(..., description="The date the article was published (e.g., YYYY-MM-DD).")
|
||||
summary: str = Field(..., description="A concise 2-3 sentence summary of the article.")
|
||||
|
||||
async def extract_article_info_llm(url_to_crawl):
|
||||
if not os.getenv("OPENAI_API_KEY"): # Or your specific key variable
|
||||
print("API key environment variable not set. Skipping LLM extraction.")
|
||||
return
|
||||
|
||||
llm_config = LLMConfig(
|
||||
provider="openai/gpt-4o-mini", # Use a cost-effective model for demos
|
||||
api_token="env:OPENAI_API_KEY"
|
||||
)
|
||||
extraction_strategy = LLMExtractionStrategy(
|
||||
llm_config=llm_config,
|
||||
schema=ArticleDetails.model_json_schema(),
|
||||
extraction_type="schema" # Crucial for Pydantic/JSON schema
|
||||
)
|
||||
run_config = CrawlerRunConfig(
|
||||
extraction_strategy=extraction_strategy,
|
||||
cache_mode="BYPASS"
|
||||
)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(url=url_to_crawl, config=run_config)
|
||||
if result.success and result.extracted_content:
|
||||
try:
|
||||
article_data = ArticleDetails.model_validate_json(result.extracted_content)
|
||||
print(f"Extracted Article Info for {url_to_crawl}:")
|
||||
print(json.dumps(article_data.model_dump(), indent=2))
|
||||
except Exception as e:
|
||||
print(f"Error parsing LLM output: {e}")
|
||||
print(f"Raw LLM output: {result.extracted_content}")
|
||||
else:
|
||||
print(f"Failed to extract article info from {url_to_crawl}: {result.error_message}")
|
||||
|
||||
# asyncio.run(extract_article_info_llm("https://www.example.com/news/ai-breakthrough")) # Replace with real article
|
||||
```
|
||||
|
||||
* **4.4. Recipe 4: Crawl the first 3 pages of a blog (clicking "Next Page")**
|
||||
* **4.4.1. User Vibe Prompt to AI:**
|
||||
* *Example:* "AI, can you use `crawl4ai` to get the Markdown from the first 3 pages of `https://myblog.example.com/archive`? To get to the next page, I think you need to click a link that says 'Older Posts'."
|
||||
* **4.4.2. Key `crawl4ai` elements AI should use (and why):**
|
||||
* **`AsyncWebCrawler`**.
|
||||
* **Multiple `arun` calls** in a loop (3 iterations).
|
||||
* **`CrawlerRunConfig`** with:
|
||||
* `session_id="blog_session"`: **Crucial** for maintaining the browser state (cookies, current page) across the multiple clicks.
|
||||
* `js_code`: JavaScript to find and click the "Older Posts" link. The AI might need to generate robust JS like:
|
||||
`Array.from(document.querySelectorAll('a')).find(a => a.textContent.trim() === 'Older Posts')?.click();`
|
||||
* `wait_for`: After clicking, wait for a condition that indicates the next page has loaded (e.g., a specific element on the new page, or a change in an existing element). This can be tricky and might require some iteration. A simple `wait_for` for a few seconds could also be a starting point, like `wait_for=3000` (milliseconds).
|
||||
* `js_only=True`: For the second and third `arun` calls, after the initial page load. This tells `crawl4ai` to only execute the JS and not perform a full new navigation to the original URL.
|
||||
* **4.4.3. Expected Output from AI-generated code:**
|
||||
* A Python script with a loop that calls `arun` three times.
|
||||
* The script should collect and potentially print or save the Markdown from each page.
|
||||
* **Code Example:**
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||
|
||||
async def crawl_blog_pages(start_url, num_pages=3):
|
||||
session_id = "my_blog_crawl_session"
|
||||
all_markdowns = []
|
||||
|
||||
# JavaScript to find and click "Older Posts" (example)
|
||||
js_click_older_posts = """
|
||||
(() => {
|
||||
const links = Array.from(document.querySelectorAll('a'));
|
||||
const olderPostsLink = links.find(a => a.textContent.trim().toLowerCase() === 'older posts');
|
||||
if (olderPostsLink) {
|
||||
olderPostsLink.click();
|
||||
return true; // Indicate click was attempted
|
||||
}
|
||||
return false; // Indicate link not found
|
||||
})();
|
||||
"""
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
current_url = start_url
|
||||
for i in range(num_pages):
|
||||
print(f"Crawling page {i+1}...")
|
||||
run_config_dict = {
|
||||
"session_id": session_id,
|
||||
"cache_mode": CacheMode.BYPASS,
|
||||
"wait_for": 2000 # Wait 2s for content to potentially load after click
|
||||
}
|
||||
if i > 0: # For subsequent pages, click and don't re-navigate
|
||||
run_config_dict["js_code"] = js_click_older_posts
|
||||
run_config_dict["js_only"] = True
|
||||
|
||||
run_config = CrawlerRunConfig(**run_config_dict)
|
||||
|
||||
result = await crawler.arun(url=current_url, config=run_config) # URL is mainly for context in js_only
|
||||
|
||||
if result.success:
|
||||
print(f" Page {i+1} ({result.url}) - Markdown length: {len(result.markdown.raw_markdown)}")
|
||||
all_markdowns.append({"url": result.url, "markdown": result.markdown.raw_markdown})
|
||||
if i < num_pages - 1 and i > 0 and not run_config_dict.get("js_code_executed_successfully", True): # Hypothetical flag
|
||||
print(f" 'Older Posts' link might not have been found or clicked on page {i+1}. Stopping.")
|
||||
break
|
||||
else:
|
||||
print(f" Failed to crawl page {i+1}: {result.error_message}")
|
||||
break
|
||||
|
||||
# Important: Clean up the session
|
||||
await crawler.crawler_strategy.kill_session(session_id)
|
||||
|
||||
print(f"\nCollected markdown for {len(all_markdowns)} pages.")
|
||||
# For demo, print first 100 chars of each
|
||||
# for i, md_data in enumerate(all_markdowns):
|
||||
# print(f"\n--- Page {i+1} URL: {md_data['url']} ---")
|
||||
# print(md_data['markdown'][:100] + "...")
|
||||
|
||||
# asyncio.run(crawl_blog_pages("YOUR_BLOG_START_URL_HERE"))
|
||||
```
|
||||
|
||||
* **4.5. Recipe 5: Get Screenshots of a List of URLs**
|
||||
* **4.5.1. User Vibe Prompt to AI:**
|
||||
* *Example:* "AI, use `crawl4ai` to take a screenshot of each of these pages: `https://example.com`, `https://crawl4ai.com`, `https://github.com`. Save them as `example_com.png`, `crawl4ai_com.png`, and `github_com.png`."
|
||||
* **4.5.2. Key `crawl4ai` elements AI should use (and why):**
|
||||
* **`AsyncWebCrawler`**.
|
||||
* **`arun_many`**: Efficient for processing a list of URLs.
|
||||
* **`CrawlerRunConfig`** with `screenshot=True`.
|
||||
* **Reasoning:** `arun_many` will process each URL with the same config. The AI needs to add logic to iterate through the results and save each `result.screenshot` (which is base64 data) to a uniquely named file.
|
||||
* **4.5.3. Expected Output from AI-generated code:**
|
||||
* Python script.
|
||||
* PNG files saved to the current directory or a specified output directory.
|
||||
* **Code Example:**
|
||||
```python
|
||||
import asyncio
|
||||
import base64
|
||||
import os
|
||||
from urllib.parse import urlparse
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||
|
||||
async def take_screenshots(urls_to_screenshot):
|
||||
run_config = CrawlerRunConfig(
|
||||
screenshot=True,
|
||||
cache_mode=CacheMode.BYPASS # Get fresh screenshots
|
||||
)
|
||||
output_dir = "screenshots_output"
|
||||
os.makedirs(output_dir, exist_ok=True)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results = await crawler.arun_many(urls=urls_to_screenshot, config=run_config)
|
||||
|
||||
for result in results:
|
||||
if result.success and result.screenshot:
|
||||
# Create a filename from the URL
|
||||
parsed_url = urlparse(result.url)
|
||||
filename = "".join(c if c.isalnum() else '_' for c in parsed_url.netloc + parsed_url.path)
|
||||
if not filename or filename == "_": # Handle root path or empty paths
|
||||
filename = "homepage"
|
||||
                filepath = os.path.join(output_dir, f"{filename}.png")
|
||||
|
||||
try:
|
||||
screenshot_data = base64.b64decode(result.screenshot)
|
||||
with open(filepath, "wb") as f:
|
||||
f.write(screenshot_data)
|
||||
print(f"Screenshot saved to {filepath}")
|
||||
except Exception as e:
|
||||
print(f"Error saving screenshot for {result.url}: {e}")
|
||||
elif not result.success:
|
||||
print(f"Failed to crawl {result.url}: {result.error_message}")
|
||||
elif not result.screenshot:
|
||||
print(f"Crawled {result.url} but no screenshot data was returned.")
|
||||
|
||||
# urls = ["https://example.com", "https://crawl4ai.com", "https://github.com"]
|
||||
# asyncio.run(take_screenshots(urls))
|
||||
```
|
||||
|
||||
## 5. Tips for Effective Prompting Your AI Assistant for Crawl4AI Tasks
|
||||
|
||||
To get the best code from your AI assistant when working with `crawl4ai`, consider these prompting tips:
|
||||
|
||||
* **5.1. Be Clear About Your Goal:**
|
||||
* Start with a high-level objective. Instead of just "Crawl a page," say "I need to extract all article titles from the homepage of this news site," or "Get the main content of this blog post as clean Markdown," or "Take full-page screenshots of these product pages." This helps the AI choose the right strategies and configurations.
|
||||
|
||||
* **5.2. Always Provide the URL(s):**
|
||||
* This seems obvious, but be precise. If it's a list, provide the list.
|
||||
* Remember to use the `file:///` prefix for local files (e.g., `file:///Users/me/Documents/mypage.html`) and `raw:` for inline HTML (e.g., `raw:<html><body>...</body></html>`). The AI might not always infer this correctly without a hint.
|
||||
|
||||
* **5.3. Describe Data for Extraction (Especially for `JsonCssExtractionStrategy` or `LLMExtractionStrategy`):**
|
||||
* **What you want:** List the specific pieces of information you need (e.g., "product name," "price," "author," "publication_date," "article summary").
|
||||
* **Where to find it (for CSS/XPath):** If you have an idea of the HTML structure, share it. "Product names seem to be in `<h2>` tags with class `item-title`." "The price is always in a `<span>` element right after a `<strong>` tag that says 'Price:'." This helps the AI generate accurate CSS selectors or XPath expressions for `JsonCssExtractionStrategy`.
|
||||
* **Desired structure (for LLM):** For `LLMExtractionStrategy`, tell the AI the desired JSON structure. "I want a list of objects, where each object has a 'title' and a 'link'." Or even better, "Can you define a Pydantic model for me that has 'title' as a string and 'link' as a string, and then use that for extraction?"
|
||||
|
||||
* **5.4. Specify LLM Details for LLM Extraction or Filtering:**
|
||||
* **Model/Provider:** "Use `openai/gpt-4o-mini` for this extraction." or "I want to use my local Ollama model, `ollama/llama3`."
|
||||
* **API Key:** Clearly state where the API key should come from. "My API key is in the environment variable `OPENAI_API_KEY`." (This is safer than putting the key directly in the prompt). If you must provide it directly, be aware of the security implications.
|
||||
|
||||
* **5.5. Mention Page Dynamics and Interactions:**
|
||||
* "This page loads more items when you scroll down."
|
||||
* "You need to click the 'View All Reviews' button to see all the reviews."
|
||||
* "The data I want only appears after selecting 'Category X' from a dropdown."
|
||||
* This signals to the AI that `js_code`, `wait_for`, and possibly `session_id` will be necessary. You might need to guide it on *how* to identify the elements to interact with (e.g., "The 'Load More' button has the ID `load-more-btn`").
|
||||
|
||||
* **5.6. Iterative Refinement is Key:**
|
||||
* Your first prompt might not yield perfect code. That's okay!
|
||||
* Treat it as a conversation. If the AI-generated code misses something or makes a mistake:
|
||||
* "That was close, but it missed extracting the product ratings. Ratings seem to be in a `div` with class `star-rating` inside each product item."
|
||||
* "The script timed out. Can we increase the `page_timeout` in `CrawlerRunConfig` to 90 seconds?"
|
||||
* "It didn't click the 'Next' button correctly. The button actually has the text '>>' instead of 'Next Page'."
|
||||
* Provide the error messages or incorrect output back to the AI for context.
|
||||
|
||||
## 6. What to Expect as Output (From AI-Generated Code)
|
||||
|
||||
When you use "Vibe Coding" with an AI assistant for `crawl4ai`, you should generally expect the following:
|
||||
|
||||
* **6.1. Python Code:**
|
||||
* The primary output will be a Python script that uses the `crawl4ai` library.
|
||||
* It should include necessary imports like `asyncio`, `AsyncWebCrawler`, `CrawlerRunConfig`, etc.
|
||||
* It will typically define an `async def main():` function and run it with `asyncio.run(main())`.
|
||||
|
||||
* **6.2. Accessing the `CrawlResult`:**
|
||||
* The core of the script will involve one or more calls to `crawler.arun(...)` or `crawler.arun_many(...)`.
|
||||
* These calls return `CrawlResult` objects (or a list of them for `arun_many`).
|
||||
* The AI-generated code should then show you how to access the specific data you asked for from these `CrawlResult` objects. For example:
|
||||
* `print(result.markdown.raw_markdown)` or `print(result.markdown.fit_markdown)`
|
||||
* `data = json.loads(result.extracted_content)`
|
||||
* `screenshot_data = base64.b64decode(result.screenshot)`
|
||||
* `if not result.success: print(result.error_message)`
|
||||
|
||||
* **6.3. Files Saved to Disk (if requested):**
|
||||
* If your vibe prompt included saving data (e.g., "save the screenshots as PNG files," "write the extracted JSON to `output.json`"), the AI-generated code should include the Python logic to perform these file operations.
|
||||
* **Example for saving a screenshot:**
|
||||
```python
|
||||
import base64
|
||||
# ... inside your async function, after getting 'result' ...
|
||||
if result.success and result.screenshot:
|
||||
with open("myscreenshot.png", "wb") as f:
|
||||
f.write(base64.b64decode(result.screenshot))
|
||||
print("Screenshot saved to myscreenshot.png")
|
||||
```
|
||||
|
||||
## 7. Conclusion: Vibe Your Way to Web Data!
|
||||
|
||||
* **7.1. Recap of "Vibe Coding" Benefits with `crawl4ai`:**
|
||||
"Vibe Coding" empowers you to leverage the full capabilities of `crawl4ai` without needing to memorize every API detail. By understanding the high-level concepts and key building blocks outlined in this guide, you can effectively communicate your data extraction and web interaction needs to an AI coding assistant. This leads to faster prototyping, easier access to web data for non-programmers, and a more intuitive way to build data-driven applications.
|
||||
|
||||
* **7.2. Encouragement to experiment with different prompts and `crawl4ai` features:**
|
||||
The key to successful "Vibe Coding" is experimentation. Try different ways of describing your goals to your AI assistant. If the first attempt doesn't yield the perfect `crawl4ai` code, refine your prompt with more specific details or hints. Don't be afraid to mention `crawl4ai` specific terms like `CrawlerRunConfig`, `js_code`, or `LLMExtractionStrategy` – this guide has equipped you with the essential vocabulary. The more context you provide, the better the AI can assist you.
|
||||
|
||||
* **7.3. Pointers to more detailed `crawl4ai` documentation for users who want to learn direct coding or advanced configurations:**
|
||||
While "Vibe Coding" is a great way to get started and be productive quickly, you might eventually want to dive deeper into `crawl4ai`'s capabilities or fine-tune the generated code yourself. For that, refer to:
|
||||
* **The Official Crawl4AI API Reference:** (Assuming this exists or will exist - replace with actual link if available, e.g., `https://docs.crawl4ai.com/api/`) For detailed information on all classes, methods, and parameters.
|
||||
* **Specific "Reasoning & Problem-Solving" Guides:** Check the `crawl4ai` documentation for other guides that delve into specific components like advanced `CrawlerRunConfig` options, deep crawling strategies, or custom extraction techniques.
|
||||
|
||||
Happy Vibe Coding, and may your web data adventures be fruitful!
|
||||
```
|
||||
144
docs/md_v2/assets/test/toc.js
Normal file
144
docs/md_v2/assets/test/toc.js
Normal file
@@ -0,0 +1,144 @@
|
||||
// ==== File: assets/toc.js ====
//
// Builds an "On this page" table of contents from the h2/h3/h4 headings of
// the main content area, and highlights the ToC entry for the heading
// currently nearest the top of the viewport (scroll spy via
// IntersectionObserver). Finally re-parents the footer (and its preceding
// <hr>) to the end of the main content.

document.addEventListener('DOMContentLoaded', () => {
  const mainContent = document.getElementById('terminal-mkdocs-main-content');
  const tocContainer = document.getElementById('toc-sidebar');
  const mainGrid = document.querySelector('.terminal-mkdocs-main-grid'); // Get the flex container

  if (!mainContent) {
    console.warn("TOC Generator: Main content area '#terminal-mkdocs-main-content' not found.");
    return;
  }

  // --- Create ToC container if it doesn't exist ---
  let tocElement = tocContainer;
  if (!tocElement) {
    if (!mainGrid) {
      console.warn("TOC Generator: Flex container '.terminal-mkdocs-main-grid' not found to append ToC.");
      return;
    }
    tocElement = document.createElement('aside');
    tocElement.id = 'toc-sidebar';
    tocElement.style.display = 'none'; // Keep hidden initially
    // Append it as the last child of the flex grid
    mainGrid.appendChild(tocElement);
    console.info("TOC Generator: Created '#toc-sidebar' element.");
  }

  // --- Find Headings (h2, h3, h4 are common for ToC) ---
  const headings = mainContent.querySelectorAll('h2, h3, h4');
  if (headings.length === 0) {
    console.info("TOC Generator: No headings found on this page. ToC not generated.");
    tocElement.style.display = 'none'; // Ensure it's hidden
    return;
  }

  // --- Generate ToC List ---
  const tocList = document.createElement('ul');
  const observerTargets = []; // Headings handed to the IntersectionObserver below

  headings.forEach((heading, index) => {
    // Ensure heading has an ID for linking; generate a simple slug-like one if missing
    if (!heading.id) {
      heading.id = `toc-heading-${index}-${heading.textContent.toLowerCase().replace(/\s+/g, '-').replace(/[^a-z0-9-]/g, '')}`;
    }

    const listItem = document.createElement('li');
    const link = document.createElement('a');
    link.href = `#${heading.id}`;
    link.textContent = heading.textContent;

    // Class for styling based on heading level (2, 3, or 4)
    const level = parseInt(heading.tagName.substring(1), 10);
    listItem.classList.add(`toc-level-${level}`);

    listItem.appendChild(link);
    tocList.appendChild(listItem);
    observerTargets.push(heading);
  });

  // --- Populate and Show ToC ---
  const tocTitle = document.createElement('h4');
  tocTitle.textContent = 'On this page'; // Customize title if needed

  tocElement.innerHTML = ''; // Clear previous content if any
  tocElement.appendChild(tocTitle);
  tocElement.appendChild(tocList);
  tocElement.style.display = ''; // Show the ToC container

  console.info(`TOC Generator: Generated ToC with ${headings.length} items.`);

  // --- Scroll Spy using Intersection Observer ---
  let activeLink = null; // The currently highlighted ToC link

  // FIX: getPropertyValue returns '' when --header-height is not defined,
  // which would yield the invalid rootMargin "- 0px -60% 0px" and make the
  // IntersectionObserver constructor throw. Fall back to 0px.
  const headerHeight =
    getComputedStyle(document.documentElement).getPropertyValue('--header-height').trim() || '0px';

  const observerOptions = {
    // Negative top margin pushes the intersection trigger point below the
    // fixed header; negative bottom margin lets elements low on the screen
    // trigger before they exit.
    rootMargin: `-${headerHeight} 0px -60% 0px`,
    threshold: 0, // Trigger as soon as any part enters/exits the boundary
  };

  const observerCallback = (entries) => {
    // FIX: the original stored entry.target (an Element) and later read
    // .boundingClientRect from it — a property that only exists on
    // IntersectionObserverEntry — throwing a TypeError as soon as two
    // headings were visible in one batch. Track the entry itself instead.
    let topmostVisibleEntry = null;

    entries.forEach((entry) => {
      const link = tocElement.querySelector(`a[href="#${entry.target.id}"]`);
      if (!link) return;

      // Among visible headings, keep the one closest to the top edge
      // (within the rootMargin)
      if (entry.isIntersecting) {
        if (!topmostVisibleEntry || entry.boundingClientRect.top < topmostVisibleEntry.boundingClientRect.top) {
          topmostVisibleEntry = entry;
        }
      }
    });

    // If we found a topmost visible heading, activate its link
    if (topmostVisibleEntry) {
      const newActiveLink = tocElement.querySelector(`a[href="#${topmostVisibleEntry.target.id}"]`);
      if (newActiveLink && newActiveLink !== activeLink) {
        // Remove active class from previous link
        if (activeLink) {
          activeLink.classList.remove('active');
          activeLink.parentElement.classList.remove('active-parent'); // Optional parent styling
        }
        // Add active class to the new link
        newActiveLink.classList.add('active');
        newActiveLink.parentElement.classList.add('active-parent'); // Optional parent styling
        activeLink = newActiveLink;

        // Optional: Scroll the ToC sidebar to keep the active link visible
        // newActiveLink.scrollIntoView({ behavior: 'smooth', block: 'nearest' });
      }
    }
    // If no headings are intersecting (scrolled past the last one?) the last
    // activated link deliberately stays active.
  };

  const observer = new IntersectionObserver(observerCallback, observerOptions);

  // Observe all target headings
  observerTargets.forEach((heading) => observer.observe(heading));

  // Initial check in case a heading is already in view on load
  // (slight delay for accurate layout calculation)
  setTimeout(() => {
    observerCallback(observer.takeRecords()); // Process initial state
  }, 100);

  // Move the footer (and the <hr> before it) to the end of the main content.
  // FIX: guard against a missing <footer>, which previously threw a TypeError.
  const footer = document.querySelector('footer');
  if (footer) {
    const hr = footer.previousElementSibling;
    if (hr && hr.tagName === 'HR') {
      mainContent.appendChild(hr);
    }
    mainContent.appendChild(footer);
    console.info('TOC Generator: Footer moved to the end of the main content.');
  }
});
|
||||
Reference in New Issue
Block a user