Enhance crawler capabilities and documentation
- Added llm.txt generator.
- Added SSL certificate extraction in AsyncWebCrawler.
- Introduced new content filters and chunking strategies for more robust data extraction.
- Updated documentation.
This commit is contained in:
128
docs/examples/browser_optimization_example.py
Normal file
128
docs/examples/browser_optimization_example.py
Normal file
@@ -0,0 +1,128 @@
|
||||
"""
|
||||
This example demonstrates optimal browser usage patterns in Crawl4AI:
|
||||
1. Sequential crawling with session reuse
|
||||
2. Parallel crawling with browser instance reuse
|
||||
3. Performance optimization settings
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from typing import List
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
|
||||
async def crawl_sequential(urls: List[str]):
    """
    Sequential crawling using session reuse - most efficient for moderate workloads

    A single ``AsyncWebCrawler`` is started, every URL is fetched one after the
    other with the same ``session_id``, and the crawler is closed when the loop
    finishes or raises.

    Args:
        urls: Pages to crawl, in order. An empty list returns immediately
            without launching a browser.
    """
    print("\n=== Sequential Crawling with Session Reuse ===")

    if not urls:
        # Nothing to crawl, so there is no reason to pay for a browser launch.
        print("No URLs to crawl")
        return

    # Configure browser with optimized settings
    browser_config = BrowserConfig(
        headless=True,
        browser_args=[
            "--disable-gpu",  # Disable GPU acceleration
            "--disable-dev-shm-usage",  # Disable /dev/shm usage
            "--no-sandbox",  # Required for Docker
        ],
        viewport={
            "width": 800,
            "height": 600,
        },  # Smaller viewport for better performance
    )

    # Configure crawl settings
    crawl_config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(
            # Pass content_filter=PruningContentFilter() here if you need fit_markdown
        ),
    )

    # Create single crawler instance
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()

    try:
        session_id = "session1"  # Use same session for all URLs
        for url in urls:
            result = await crawler.arun(
                url=url,
                config=crawl_config,
                session_id=session_id,  # Reuse same browser tab
            )
            if result.success:
                print(f"Successfully crawled {url}")
                print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
            else:
                # Report unsuccessful crawls instead of dropping them silently.
                print(f"Failed to crawl {url}")
    finally:
        # Always release the browser, even if a crawl raised.
        await crawler.close()
|
||||
|
||||
|
||||
async def crawl_parallel(urls: List[str], max_concurrent: int = 3):
    """
    Parallel crawling while reusing browser instance - best for large workloads

    URLs are processed in consecutive batches of at most ``max_concurrent``.
    Each batch is awaited with ``asyncio.gather`` before the next one starts,
    and every position inside a batch uses its own ``session_id``.

    Args:
        urls: Pages to crawl. An empty list returns immediately without
            launching a browser.
        max_concurrent: Maximum number of crawls awaited together; must be
            at least 1.

    Raises:
        ValueError: If ``max_concurrent`` is less than 1.
    """
    if max_concurrent < 1:
        # Fail before launching a browser: a step of 0 would make range()
        # raise later, and a negative step would silently crawl nothing.
        raise ValueError(f"max_concurrent must be at least 1, got {max_concurrent}")

    print("\n=== Parallel Crawling with Browser Reuse ===")

    if not urls:
        # Nothing to crawl, so there is no reason to pay for a browser launch.
        print("No URLs to crawl")
        return

    browser_config = BrowserConfig(
        headless=True,
        browser_args=["--disable-gpu", "--disable-dev-shm-usage", "--no-sandbox"],
        viewport={"width": 800, "height": 600},
    )

    crawl_config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(
            # Pass content_filter=PruningContentFilter() here if you need fit_markdown
        ),
    )

    # Create single crawler instance for all parallel tasks
    crawler = AsyncWebCrawler(config=browser_config)
    await crawler.start()

    try:
        # Create tasks in batches to control concurrency
        for start in range(0, len(urls), max_concurrent):
            batch = urls[start : start + max_concurrent]

            # Different session per concurrent task
            tasks = [
                crawler.arun(
                    url=url,
                    config=crawl_config,
                    session_id=f"parallel_session_{slot}",
                )
                for slot, url in enumerate(batch)
            ]

            # Wait for batch to complete
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Process results
            for url, result in zip(batch, results):
                # gather(return_exceptions=True) can also hand back
                # BaseException instances such as CancelledError.
                if isinstance(result, BaseException):
                    print(f"Error crawling {url}: {str(result)}")
                elif result.success:
                    print(f"Successfully crawled {url}")
                    print(f"Content length: {len(result.markdown_v2.raw_markdown)}")
                else:
                    # Report unsuccessful crawls instead of dropping them silently.
                    print(f"Failed to crawl {url}")
    finally:
        # Always release the browser, even if a batch raised.
        await crawler.close()
|
||||
|
||||
|
||||
async def main():
    """Run the sequential demo and then the parallel demo over the same URLs."""
    # Example URLs
    demo_urls = [
        "https://example.com/page1",
        "https://example.com/page2",
        "https://example.com/page3",
        "https://example.com/page4",
    ]

    # Demo sequential crawling
    await crawl_sequential(demo_urls)

    # Demo parallel crawling
    await crawl_parallel(demo_urls, max_concurrent=2)


if __name__ == "__main__":
    # Script entry point: drive the async demos to completion.
    asyncio.run(main())
|
||||
115
docs/examples/extraction_strategies_example.py
Normal file
115
docs/examples/extraction_strategies_example.py
Normal file
@@ -0,0 +1,115 @@
|
||||
"""
|
||||
Example demonstrating different extraction strategies with various input formats.
|
||||
This example shows how to:
|
||||
1. Use different input formats (markdown, HTML, fit_markdown)
|
||||
2. Work with JSON-based extractors (CSS and XPath)
|
||||
3. Use LLM-based extraction with different input formats
|
||||
4. Configure browser and crawler settings properly
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from typing import Dict, Any
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.extraction_strategy import (
|
||||
LLMExtractionStrategy,
|
||||
JsonCssExtractionStrategy,
|
||||
JsonXPathExtractionStrategy
|
||||
)
|
||||
from crawl4ai.chunking_strategy import RegexChunking, IdentityChunking
|
||||
from crawl4ai.content_filter_strategy import PruningContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
|
||||
async def run_extraction(crawler: AsyncWebCrawler, url: str, strategy, name: str):
    """Crawl ``url`` once with ``strategy`` and print a summary labelled ``name``.

    The run bypasses the cache and attaches a pruning content filter to the
    markdown generator. Any exception raised while configuring or running the
    crawl is caught and printed rather than propagated, so one failing
    strategy does not stop the caller.

    Args:
        crawler: Crawler whose ``arun`` is awaited for the page.
        url: Page to crawl.
        strategy: Extraction strategy handed to ``CrawlerRunConfig``.
        name: Label used in the printed output.
    """
    try:
        # Build the per-run settings
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=strategy,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter()  # For fit_markdown support
            ),
        )

        # Execute the crawl
        outcome = await crawler.arun(url=url, config=run_config)

        if not outcome.success:
            print(f"Error in {name}: Crawl failed")
            return

        print(f"\n=== {name} Results ===")
        print(f"Extracted Content: {outcome.extracted_content}")
        markdown = outcome.markdown_v2
        print(f"Raw Markdown Length: {len(markdown.raw_markdown)}")
        print(f"Citations Markdown Length: {len(markdown.markdown_with_citations)}")

    except Exception as exc:
        # Best-effort demo helper: report the problem and carry on.
        print(f"Error in {name}: {str(exc)}")
|
||||
|
||||
async def main():
    """Run each extraction strategy in turn against one example product page.

    Builds three LLM strategies (default input, ``html``, ``fit_markdown``),
    one CSS-schema strategy and one XPath-schema strategy, then passes each to
    ``run_extraction`` using a single shared crawler.
    """
    # Example URL (replace with actual URL)
    target_url = "https://example.com/product-page"

    # Configure browser settings
    browser_config = BrowserConfig(headless=True, verbose=True)

    # Initialize extraction strategies

    # 1. LLM Extraction with different input formats
    llm_options = {
        "provider": "openai/gpt-4o-mini",
        "api_token": os.getenv("OPENAI_API_KEY"),
    }

    markdown_strategy = LLMExtractionStrategy(
        instruction="Extract product information including name, price, and description",
        **llm_options,
    )

    html_strategy = LLMExtractionStrategy(
        input_format="html",
        instruction="Extract product information from HTML including structured data",
        **llm_options,
    )

    fit_markdown_strategy = LLMExtractionStrategy(
        input_format="fit_markdown",
        instruction="Extract product information from cleaned markdown",
        **llm_options,
    )

    # 2. JSON CSS Extraction (automatically uses HTML input)
    css_fields = [
        {"name": "title", "selector": "h1.product-title", "type": "text"},
        {"name": "price", "selector": ".price", "type": "text"},
        {"name": "description", "selector": ".description", "type": "text"},
    ]
    css_strategy = JsonCssExtractionStrategy(
        schema={"baseSelector": ".product", "fields": css_fields}
    )

    # 3. JSON XPath Extraction (automatically uses HTML input)
    # NOTE(review): these selectors end in /text() while the field type is
    # "text" - confirm the strategy handles text-node results as intended.
    xpath_fields = [
        {"name": "title", "selector": ".//h1[@class='product-title']/text()", "type": "text"},
        {"name": "price", "selector": ".//span[@class='price']/text()", "type": "text"},
        {"name": "description", "selector": ".//div[@class='description']/text()", "type": "text"},
    ]
    xpath_strategy = JsonXPathExtractionStrategy(
        schema={"baseSelector": "//div[@class='product']", "fields": xpath_fields}
    )

    # Strategies paired with the label printed for each run, in run order
    labelled_strategies = [
        (markdown_strategy, "Markdown LLM"),
        (html_strategy, "HTML LLM"),
        (fit_markdown_strategy, "Fit Markdown LLM"),
        (css_strategy, "CSS Extraction"),
        (xpath_strategy, "XPath Extraction"),
    ]

    # Use context manager for proper resource handling
    async with AsyncWebCrawler(config=browser_config) as crawler:
        # Run all strategies
        for strategy, label in labelled_strategies:
            await run_extraction(crawler, target_url, strategy, label)


if __name__ == "__main__":
    # Script entry point: drive the async demo to completion.
    asyncio.run(main())
|
||||
@@ -39,8 +39,8 @@ async def main():
|
||||
f.write(b64decode(result.screenshot))
|
||||
|
||||
# Save PDF
|
||||
if result.pdf_data:
|
||||
pdf_bytes = b64decode(result.pdf_data)
|
||||
if result.pdf:
|
||||
pdf_bytes = b64decode(result.pdf)
|
||||
with open(os.path.join(__location__, "page.pdf"), "wb") as f:
|
||||
f.write(pdf_bytes)
|
||||
|
||||
|
||||
Reference in New Issue
Block a user