refactor(models): rename final_url to redirected_url for consistency
Renames the final_url field to redirected_url across all components to maintain consistent terminology throughout the codebase. This change affects: - AsyncCrawlResponse model - AsyncPlaywrightCrawlerStrategy - Documentation and examples No functional changes, purely naming consistency improvement.
This commit is contained in:
@@ -29,7 +29,7 @@ This release introduces several powerful new features, including robots.txt comp
|
|||||||
|
|
||||||
- **URL Redirection Tracking:**
|
- **URL Redirection Tracking:**
|
||||||
- Added URL redirection tracking to capture the final URL after any redirects.
|
- Added URL redirection tracking to capture the final URL after any redirects.
|
||||||
- The final URL is now available in the `final_url` field of the `AsyncCrawlResponse` object.
|
- The final URL is now available in the `redirected_url` field of the `AsyncCrawlResponse` object.
|
||||||
|
|
||||||
- **Enhanced Streamlined Documentation:**
|
- **Enhanced Streamlined Documentation:**
|
||||||
- Refactored and improved the documentation structure for clarity and ease of use.
|
- Refactored and improved the documentation structure for clarity and ease of use.
|
||||||
|
|||||||
@@ -492,7 +492,7 @@ async def test_news_crawl():
|
|||||||
- **🏎️ Faster Scraping Option**: New `LXMLWebScrapingStrategy` offers **10-20x speedup** for large, complex pages (experimental).
|
- **🏎️ Faster Scraping Option**: New `LXMLWebScrapingStrategy` offers **10-20x speedup** for large, complex pages (experimental).
|
||||||
- **🤖 robots.txt Compliance**: Respect website rules with `check_robots_txt=True` and efficient local caching.
|
- **🤖 robots.txt Compliance**: Respect website rules with `check_robots_txt=True` and efficient local caching.
|
||||||
- **🔄 Proxy Rotation**: Built-in support for dynamic proxy switching and IP verification, with support for authenticated proxies and session persistence.
|
- **🔄 Proxy Rotation**: Built-in support for dynamic proxy switching and IP verification, with support for authenticated proxies and session persistence.
|
||||||
- **➡️ URL Redirection Tracking**: The `final_url` field now captures the final destination after any redirects.
|
- **➡️ URL Redirection Tracking**: The `redirected_url` field now captures the final destination after any redirects.
|
||||||
- **🪞 Improved Mirroring**: The `LXMLWebScrapingStrategy` now has much greater fidelity, allowing for almost pixel-perfect mirroring of websites.
|
- **🪞 Improved Mirroring**: The `LXMLWebScrapingStrategy` now has much greater fidelity, allowing for almost pixel-perfect mirroring of websites.
|
||||||
- **📈 Enhanced Monitoring**: Track memory, CPU, and individual crawler status with `CrawlerMonitor`.
|
- **📈 Enhanced Monitoring**: Track memory, CPU, and individual crawler status with `CrawlerMonitor`.
|
||||||
- **📝 Improved Documentation**: More examples, clearer explanations, and updated tutorials.
|
- **📝 Improved Documentation**: More examples, clearer explanations, and updated tutorials.
|
||||||
|
|||||||
@@ -1254,7 +1254,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
config.url = url
|
config.url = url
|
||||||
response_headers = {}
|
response_headers = {}
|
||||||
status_code = None
|
status_code = None
|
||||||
final_url = url
|
redirected_url = url
|
||||||
|
|
||||||
# Reset downloaded files list for new crawl
|
# Reset downloaded files list for new crawl
|
||||||
self._downloaded_files = []
|
self._downloaded_files = []
|
||||||
@@ -1336,7 +1336,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
response = await page.goto(
|
response = await page.goto(
|
||||||
url, wait_until=config.wait_until, timeout=config.page_timeout
|
url, wait_until=config.wait_until, timeout=config.page_timeout
|
||||||
)
|
)
|
||||||
final_url = page.url
|
redirected_url = page.url
|
||||||
except Error as e:
|
except Error as e:
|
||||||
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
|
raise RuntimeError(f"Failed on navigating ACS-GOTO:\n{str(e)}")
|
||||||
|
|
||||||
@@ -1616,7 +1616,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
|||||||
downloaded_files=(
|
downloaded_files=(
|
||||||
self._downloaded_files if self._downloaded_files else None
|
self._downloaded_files if self._downloaded_files else None
|
||||||
),
|
),
|
||||||
final_url=final_url,
|
redirected_url=redirected_url,
|
||||||
)
|
)
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
|||||||
@@ -462,7 +462,7 @@ class AsyncWebCrawler:
|
|||||||
)
|
)
|
||||||
|
|
||||||
crawl_result.status_code = async_response.status_code
|
crawl_result.status_code = async_response.status_code
|
||||||
crawl_result.redirected_url = async_response.final_url or url
|
crawl_result.redirected_url = async_response.redirected_url or url
|
||||||
crawl_result.response_headers = async_response.response_headers
|
crawl_result.response_headers = async_response.response_headers
|
||||||
crawl_result.downloaded_files = async_response.downloaded_files
|
crawl_result.downloaded_files = async_response.downloaded_files
|
||||||
crawl_result.ssl_certificate = (
|
crawl_result.ssl_certificate = (
|
||||||
|
|||||||
@@ -132,7 +132,7 @@ class AsyncCrawlResponse(BaseModel):
|
|||||||
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
|
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
|
||||||
downloaded_files: Optional[List[str]] = None
|
downloaded_files: Optional[List[str]] = None
|
||||||
ssl_certificate: Optional[SSLCertificate] = None
|
ssl_certificate: Optional[SSLCertificate] = None
|
||||||
final_url: Optional[str] = None
|
redirected_url: Optional[str] = None
|
||||||
|
|
||||||
class Config:
|
class Config:
|
||||||
arbitrary_types_allowed = True
|
arbitrary_types_allowed = True
|
||||||
|
|||||||
@@ -2,54 +2,96 @@
|
|||||||
Crawl4ai v0.4.3 Features Demo
|
Crawl4ai v0.4.3 Features Demo
|
||||||
============================
|
============================
|
||||||
|
|
||||||
This example demonstrates the major new features introduced in Crawl4ai v0.4.3.
|
This demonstration showcases three major categories of new features in Crawl4ai v0.4.3:
|
||||||
Each section showcases a specific feature with practical examples and explanations.
|
|
||||||
|
1. Efficiency & Speed:
|
||||||
|
- Memory-efficient dispatcher strategies
|
||||||
|
- New scraping algorithm
|
||||||
|
- Streaming support for batch crawling
|
||||||
|
|
||||||
|
2. LLM Integration:
|
||||||
|
- Automatic schema generation
|
||||||
|
- LLM-powered content filtering
|
||||||
|
- Smart markdown generation
|
||||||
|
|
||||||
|
3. Core Improvements:
|
||||||
|
- Robots.txt compliance
|
||||||
|
- Proxy rotation
|
||||||
|
- Enhanced URL handling
|
||||||
|
|
||||||
|
Each demo function can be run independently or as part of the full suite.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import asyncio
|
import asyncio
|
||||||
import os
|
import os
|
||||||
from crawl4ai import *
|
import json
|
||||||
|
import re
|
||||||
|
import random
|
||||||
|
from typing import Optional, Dict
|
||||||
|
from dotenv import load_dotenv
|
||||||
|
|
||||||
|
load_dotenv()
|
||||||
|
|
||||||
|
from crawl4ai import (
|
||||||
|
AsyncWebCrawler,
|
||||||
|
BrowserConfig,
|
||||||
|
CrawlerRunConfig,
|
||||||
|
CacheMode,
|
||||||
|
DisplayMode,
|
||||||
|
MemoryAdaptiveDispatcher,
|
||||||
|
CrawlerMonitor,
|
||||||
|
DefaultMarkdownGenerator,
|
||||||
|
LXMLWebScrapingStrategy,
|
||||||
|
JsonCssExtractionStrategy,
|
||||||
|
LLMContentFilter
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
async def demo_memory_dispatcher():
|
async def demo_memory_dispatcher():
|
||||||
|
"""Demonstrates the new memory-efficient dispatcher system.
|
||||||
|
|
||||||
|
Key Features:
|
||||||
|
- Adaptive memory management
|
||||||
|
- Real-time performance monitoring
|
||||||
|
- Concurrent session control
|
||||||
"""
|
"""
|
||||||
1. Memory Dispatcher System Demo
|
print("\n=== Memory Dispatcher Demo ===")
|
||||||
===============================
|
|
||||||
Shows how to use the new memory dispatcher with monitoring
|
|
||||||
"""
|
|
||||||
print("\n=== 1. Memory Dispatcher System Demo ===")
|
|
||||||
|
|
||||||
# Configure crawler
|
try:
|
||||||
browser_config = BrowserConfig(headless=True, verbose=True)
|
# Configuration
|
||||||
crawler_config = CrawlerRunConfig(
|
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||||
cache_mode=CacheMode.BYPASS, markdown_generator=DefaultMarkdownGenerator()
|
crawler_config = CrawlerRunConfig(
|
||||||
)
|
cache_mode=CacheMode.BYPASS,
|
||||||
|
markdown_generator=DefaultMarkdownGenerator()
|
||||||
# Test URLs
|
|
||||||
urls = ["http://example.com", "http://example.org", "http://example.net"] * 3
|
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
|
||||||
# Initialize dispatcher with monitoring
|
|
||||||
monitor = CrawlerMonitor(
|
|
||||||
max_visible_rows=10,
|
|
||||||
display_mode=DisplayMode.DETAILED, # Can be DETAILED or AGGREGATED
|
|
||||||
)
|
)
|
||||||
|
|
||||||
dispatcher = MemoryAdaptiveDispatcher(
|
# Test URLs
|
||||||
memory_threshold_percent=80.0, # Memory usage threshold
|
urls = ["http://example.com", "http://example.org", "http://example.net"] * 3
|
||||||
check_interval=0.5, # How often to check memory
|
|
||||||
max_session_permit=5, # Max concurrent crawls
|
|
||||||
monitor=monitor, # Pass the monitor
|
|
||||||
)
|
|
||||||
|
|
||||||
# Run with memory monitoring
|
print("\n📈 Initializing crawler with memory monitoring...")
|
||||||
print("Starting batch crawl with memory monitoring...")
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
results = await dispatcher.run_urls(
|
monitor = CrawlerMonitor(
|
||||||
urls=urls,
|
max_visible_rows=10,
|
||||||
crawler=crawler,
|
display_mode=DisplayMode.DETAILED
|
||||||
config=crawler_config,
|
)
|
||||||
)
|
|
||||||
print(f"Completed {len(results)} URLs")
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
|
memory_threshold_percent=80.0,
|
||||||
|
check_interval=0.5,
|
||||||
|
max_session_permit=5,
|
||||||
|
monitor=monitor
|
||||||
|
)
|
||||||
|
|
||||||
|
print("\n🚀 Starting batch crawl...")
|
||||||
|
results = await dispatcher.run_urls(
|
||||||
|
urls=urls,
|
||||||
|
crawler=crawler,
|
||||||
|
config=crawler_config,
|
||||||
|
)
|
||||||
|
print(f"\n✅ Completed {len(results)} URLs successfully")
|
||||||
|
|
||||||
|
except Exception as e:
|
||||||
|
print(f"\n❌ Error in memory dispatcher demo: {str(e)}")
|
||||||
|
|
||||||
|
|
||||||
async def demo_streaming_support():
|
async def demo_streaming_support():
|
||||||
@@ -60,7 +102,7 @@ async def demo_streaming_support():
|
|||||||
"""
|
"""
|
||||||
print("\n=== 2. Streaming Support Demo ===")
|
print("\n=== 2. Streaming Support Demo ===")
|
||||||
|
|
||||||
browser_config = BrowserConfig(headless=True, verbose=True)
|
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||||
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True)
|
crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True)
|
||||||
|
|
||||||
# Test URLs
|
# Test URLs
|
||||||
@@ -179,7 +221,7 @@ async def demo_robots_compliance():
|
|||||||
|
|
||||||
|
|
||||||
|
|
||||||
async def demo_llm_schema_generation():
|
async def demo_json_schema_generation():
|
||||||
"""
|
"""
|
||||||
7. LLM-Powered Schema Generation Demo
|
7. LLM-Powered Schema Generation Demo
|
||||||
=================================
|
=================================
|
||||||
@@ -233,25 +275,6 @@ async def demo_llm_schema_generation():
|
|||||||
print("Successfully used generated schema for crawling")
|
print("Successfully used generated schema for crawling")
|
||||||
|
|
||||||
|
|
||||||
async def get_next_proxy(proxy_file: str = f"proxies.txt") -> Optional[Dict]:
|
|
||||||
"""Get next proxy from local file"""
|
|
||||||
try:
|
|
||||||
with open(proxy_file) as f:
|
|
||||||
proxies = f.read().splitlines()
|
|
||||||
if not proxies:
|
|
||||||
return None
|
|
||||||
|
|
||||||
ip, port, username, password = random.choice(proxies).split(":")
|
|
||||||
return {
|
|
||||||
"server": f"http://{ip}:{port}",
|
|
||||||
"username": username,
|
|
||||||
"password": password,
|
|
||||||
"ip": ip # Store original IP for verification
|
|
||||||
}
|
|
||||||
except Exception as e:
|
|
||||||
print(f"Error loading proxy: {e}")
|
|
||||||
return None
|
|
||||||
|
|
||||||
async def demo_proxy_rotation():
|
async def demo_proxy_rotation():
|
||||||
"""
|
"""
|
||||||
8. Proxy Rotation Demo
|
8. Proxy Rotation Demo
|
||||||
@@ -260,11 +283,27 @@ async def demo_proxy_rotation():
|
|||||||
"""
|
"""
|
||||||
print("\n=== 8. Proxy Rotation Demo ===")
|
print("\n=== 8. Proxy Rotation Demo ===")
|
||||||
|
|
||||||
|
async def get_next_proxy(proxy_file: str = f"proxies.txt") -> Optional[Dict]:
|
||||||
|
"""Get next proxy from local file"""
|
||||||
|
try:
|
||||||
|
proxies = os.getenv("PROXIES", "").split(",")
|
||||||
|
|
||||||
|
ip, port, username, password = random.choice(proxies).split(":")
|
||||||
|
return {
|
||||||
|
"server": f"http://{ip}:{port}",
|
||||||
|
"username": username,
|
||||||
|
"password": password,
|
||||||
|
"ip": ip # Store original IP for verification
|
||||||
|
}
|
||||||
|
except Exception as e:
|
||||||
|
print(f"Error loading proxy: {e}")
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
# Create 10 test requests to httpbin
|
# Create 10 test requests to httpbin
|
||||||
urls = ["https://httpbin.org/ip"] * 3
|
urls = ["https://httpbin.org/ip"] * 2
|
||||||
|
|
||||||
browser_config = BrowserConfig(headless=True)
|
browser_config = BrowserConfig(headless=True, verbose=False)
|
||||||
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
|
||||||
|
|
||||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||||
@@ -289,24 +328,25 @@ async def demo_proxy_rotation():
|
|||||||
else:
|
else:
|
||||||
print(f"Failed with proxy {proxy['ip']}")
|
print(f"Failed with proxy {proxy['ip']}")
|
||||||
|
|
||||||
if __name__ == "__main__":
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
"""Run all feature demonstrations."""
|
"""Run all feature demonstrations."""
|
||||||
demo_memory_dispatcher(),
|
print("\n📊 Running Crawl4ai v0.4.3 Feature Demos\n")
|
||||||
print("\n" + "=" * 50 + "\n")
|
|
||||||
demo_streaming_support(),
|
# Efficiency & Speed Demos
|
||||||
print("\n" + "=" * 50 + "\n")
|
# print("\n🚀 EFFICIENCY & SPEED DEMOS")
|
||||||
demo_content_scraping(),
|
# await demo_memory_dispatcher()
|
||||||
print("\n" + "=" * 50 + "\n")
|
# await demo_streaming_support()
|
||||||
demo_llm_schema_generation(),
|
# await demo_content_scraping()
|
||||||
print("\n" + "=" * 50 + "\n")
|
|
||||||
demo_llm_markdown(),
|
# # LLM Integration Demos
|
||||||
print("\n" + "=" * 50 + "\n")
|
# print("\n🤖 LLM INTEGRATION DEMOS")
|
||||||
demo_robots_compliance(),
|
# await demo_json_schema_generation()
|
||||||
print("\n" + "=" * 50 + "\n")
|
# await demo_llm_markdown()
|
||||||
demo_proxy_rotation()
|
|
||||||
print("\n" + "=" * 50 + "\n")
|
# # Core Improvements
|
||||||
|
# print("\n🔧 CORE IMPROVEMENT DEMOS")
|
||||||
|
# await demo_robots_compliance()
|
||||||
|
await demo_proxy_rotation()
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
asyncio.run(main())
|
asyncio.run(main())
|
||||||
|
|||||||
@@ -1,266 +1,138 @@
|
|||||||
# Crawl4AI 0.4.3b1 is Here: Faster, Smarter, and Ready for Real-World Crawling!
|
# Crawl4AI 0.4.3: Major Performance Boost & LLM Integration
|
||||||
|
|
||||||
Hey, Crawl4AI enthusiasts! We're thrilled to announce the release of **Crawl4AI 0.4.3b1**, packed with powerful new features and enhancements that take web crawling to a whole new level of efficiency and intelligence. This release is all about giving you more control, better performance, and deeper insights into your crawled data.
|
We're excited to announce Crawl4AI 0.4.3, focusing on three key areas: Speed & Efficiency, LLM Integration, and Core Platform Improvements. This release significantly improves crawling performance while adding powerful new LLM-powered features.
|
||||||
|
|
||||||
Let's dive into what's new!
|
## ⚡ Speed & Efficiency Improvements
|
||||||
|
|
||||||
## 🚀 Major Feature Highlights
|
### 1. Memory-Adaptive Dispatcher System
|
||||||
|
The new dispatcher system provides intelligent resource management and real-time monitoring:
|
||||||
### 1. LLM-Powered Schema Generation: Zero to Structured Data in Seconds!
|
|
||||||
|
|
||||||
Tired of manually crafting CSS or XPath selectors? We've got you covered! Crawl4AI now features a revolutionary **schema generator** that uses the power of Large Language Models (LLMs) to automatically create extraction schemas for you.
|
|
||||||
|
|
||||||
**How it Works:**
|
|
||||||
|
|
||||||
1. **Provide HTML**: Feed in a sample HTML snippet that contains the type of data you want to extract (e.g., product listings, article sections).
|
|
||||||
2. **Describe Your Needs (Optional)**: You can provide a natural language query like "extract all product names and prices" to guide the schema creation.
|
|
||||||
3. **Choose Your LLM**: Use either **OpenAI** (GPT-4o recommended) for top-tier accuracy or **Ollama** for a local, open-source option.
|
|
||||||
4. **Get Your Schema**: The tool outputs a ready-to-use JSON schema that works seamlessly with `JsonCssExtractionStrategy` or `JsonXPathExtractionStrategy`.
|
|
||||||
|
|
||||||
**Why You'll Love It:**
|
|
||||||
|
|
||||||
- **No More Tedious Selector Writing**: Let the LLM analyze the HTML and create the selectors for you!
|
|
||||||
- **One-Time Cost**: Schema generation uses LLM, but once you have your schema, subsequent extractions are fast and LLM-free.
|
|
||||||
- **Handles Complex Structures**: The LLM can understand nested elements, lists, and variations in layout—far beyond what simple CSS selectors can achieve.
|
|
||||||
- **Learn by Example**: The generated schemas are a fantastic way to learn best practices for writing your own schemas.
|
|
||||||
|
|
||||||
**Example:**
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DisplayMode
|
||||||
|
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, CrawlerMonitor
|
||||||
# Sample HTML snippet (imagine this is part of a product listing page)
|
|
||||||
html = """
|
|
||||||
<div class="product">
|
|
||||||
<h2 class="name">Awesome Gadget</h2>
|
|
||||||
<span class="price">$99.99</span>
|
|
||||||
</div>
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Generate schema using OpenAI
|
|
||||||
schema = JsonCssExtractionStrategy.generate_schema(
|
|
||||||
html,
|
|
||||||
llm_provider="openai/gpt-4o",
|
|
||||||
api_token="YOUR_API_TOKEN"
|
|
||||||
)
|
|
||||||
|
|
||||||
# Or use Ollama for a local, open-source option
|
|
||||||
# schema = JsonCssExtractionStrategy.generate_schema(
|
|
||||||
# html,
|
|
||||||
# llm_provider="ollama/llama3"
|
|
||||||
# )
|
|
||||||
|
|
||||||
print(json.dumps(schema, indent=2))
|
|
||||||
```
|
|
||||||
|
|
||||||
**Output (Schema):**
|
|
||||||
|
|
||||||
```json
|
|
||||||
{
|
|
||||||
"name": null,
|
|
||||||
"baseSelector": "div.product",
|
|
||||||
"fields": [
|
|
||||||
{
|
|
||||||
"name": "name",
|
|
||||||
"selector": "h2.name",
|
|
||||||
"type": "text"
|
|
||||||
},
|
|
||||||
{
|
|
||||||
"name": "price",
|
|
||||||
"selector": "span.price",
|
|
||||||
"type": "text"
|
|
||||||
}
|
|
||||||
]
|
|
||||||
}
|
|
||||||
```
|
|
||||||
|
|
||||||
You can now **save** this schema and use it for all your extractions on pages with the same structure. No more LLM costs, just **fast, reliable** data extraction!
|
|
||||||
|
|
||||||
### 2. Robots.txt Compliance: Crawl Responsibly
|
|
||||||
|
|
||||||
Crawl4AI now respects website rules! With the new `check_robots_txt=True` option in `CrawlerRunConfig`, the crawler automatically fetches, parses, and obeys each site's `robots.txt` file.
|
|
||||||
|
|
||||||
**Key Features**:
|
|
||||||
|
|
||||||
- **Efficient Caching**: Stores parsed `robots.txt` files locally for 7 days to avoid re-fetching.
|
|
||||||
- **Automatic Integration**: Works seamlessly with both `arun()` and `arun_many()`.
|
|
||||||
- **Clear Status Codes**: Returns a 403 status code if a URL is disallowed.
|
|
||||||
- **Customizable**: Adjust the cache directory and TTL if needed.
|
|
||||||
|
|
||||||
**Example**:
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
|
||||||
|
|
||||||
async def main():
|
async def main():
|
||||||
config = CrawlerRunConfig(
|
urls = ["https://example1.com", "https://example2.com"] * 50
|
||||||
cache_mode=CacheMode.ENABLED,
|
|
||||||
check_robots_txt=True
|
|
||||||
)
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
# Configure memory-aware dispatch
|
||||||
result = await crawler.arun("https://example.com/private-page", config=config)
|
dispatcher = MemoryAdaptiveDispatcher(
|
||||||
if result.status_code == 403:
|
memory_threshold_percent=80.0, # Auto-throttle at 80% memory
|
||||||
print("Access denied by robots.txt")
|
check_interval=0.5, # Check every 0.5 seconds
|
||||||
|
max_session_permit=20, # Max concurrent sessions
|
||||||
if __name__ == "__main__":
|
monitor=CrawlerMonitor( # Real-time monitoring
|
||||||
asyncio.run(main())
|
display_mode=DisplayMode.DETAILED
|
||||||
```
|
|
||||||
|
|
||||||
### 3. Proxy Support in `CrawlerRunConfig`
|
|
||||||
|
|
||||||
Need more control over your proxy settings? Now you can configure proxies directly within `CrawlerRunConfig` for each crawl:
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
config = CrawlerRunConfig(
|
|
||||||
proxy_config={
|
|
||||||
"server": "http://your-proxy.com:8080",
|
|
||||||
"username": "your_username", # Optional
|
|
||||||
"password": "your_password" # Optional
|
|
||||||
}
|
|
||||||
)
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
result = await crawler.arun("https://example.com", config=config)
|
|
||||||
```
|
|
||||||
|
|
||||||
This allows for dynamic proxy assignment per URL or even per request.
|
|
||||||
|
|
||||||
### 4. LLM-Powered Markdown Filtering (Beta)
|
|
||||||
|
|
||||||
We're introducing an experimental **`LLMContentFilter`**! This filter, when used with the `DefaultMarkdownGenerator`, can produce highly focused markdown output by using an LLM to analyze content relevance.
|
|
||||||
|
|
||||||
**How it Works:**
|
|
||||||
|
|
||||||
1. You provide an **instruction** (e.g., "extract only the key technical details").
|
|
||||||
2. The LLM analyzes each section of the page based on your instruction.
|
|
||||||
3. Only the most relevant content is included in the final `fit_markdown`.
|
|
||||||
|
|
||||||
**Example**:
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
|
||||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
|
||||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
|
||||||
|
|
||||||
async def main():
|
|
||||||
llm_filter = LLMContentFilter(
|
|
||||||
provider="openai/gpt-4o",
|
|
||||||
api_token="YOUR_API_TOKEN", # Or use "ollama/llama3" with no token
|
|
||||||
instruction="Extract the core educational content about Python classes."
|
|
||||||
)
|
|
||||||
|
|
||||||
config = CrawlerRunConfig(
|
|
||||||
markdown_generator=DefaultMarkdownGenerator(content_filter=llm_filter)
|
|
||||||
)
|
|
||||||
|
|
||||||
async with AsyncWebCrawler() as crawler:
|
|
||||||
result = await crawler.arun(
|
|
||||||
"https://docs.python.org/3/tutorial/classes.html",
|
|
||||||
config=config
|
|
||||||
)
|
)
|
||||||
print(result.markdown_v2.fit_markdown)
|
)
|
||||||
|
|
||||||
if __name__ == "__main__":
|
async with AsyncWebCrawler() as crawler:
|
||||||
asyncio.run(main())
|
results = await dispatcher.run_urls(
|
||||||
|
urls=urls,
|
||||||
|
crawler=crawler,
|
||||||
|
config=CrawlerRunConfig()
|
||||||
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
**Note**: This is a beta feature. We're actively working on improving its accuracy and performance.
|
### 2. Streaming Support
|
||||||
|
Process crawled URLs in real-time instead of waiting for all results:
|
||||||
### 5. Streamlined `arun_many()` with Dispatchers
|
|
||||||
|
|
||||||
We've simplified concurrent crawling! `arun_many()` now intelligently handles multiple URLs, either returning a **list** of results or an **async generator** for streaming.
|
|
||||||
|
|
||||||
**Basic Usage (Batch)**:
|
|
||||||
|
|
||||||
```python
|
```python
|
||||||
results = await crawler.arun_many(
|
config = CrawlerRunConfig(stream=True)
|
||||||
urls=["https://site1.com", "https://site2.com"],
|
|
||||||
config=CrawlerRunConfig()
|
|
||||||
)
|
|
||||||
|
|
||||||
for res in results:
|
async with AsyncWebCrawler() as crawler:
|
||||||
print(res.url, "crawled successfully:", res.success)
|
async for result in await crawler.arun_many(urls, config=config):
|
||||||
|
print(f"Got result for {result.url}")
|
||||||
|
# Process each result immediately
|
||||||
```
|
```
|
||||||
|
|
||||||
**Streaming Mode**:
|
### 3. LXML-Based Scraping
|
||||||
|
New LXML scraping strategy offering up to 20x faster parsing:
|
||||||
|
|
||||||
```python
|
```python
|
||||||
async for result in await crawler.arun_many(
|
|
||||||
urls=["https://site1.com", "https://site2.com"],
|
|
||||||
config=CrawlerRunConfig(stream=True)
|
|
||||||
):
|
|
||||||
print("Just finished:", result.url)
|
|
||||||
# Process each result immediately
|
|
||||||
```
|
|
||||||
|
|
||||||
**Advanced:** You can now customize how `arun_many` handles concurrency by passing a **dispatcher**. See [Advanced Multi-URL Crawling](../advanced/multi-url-crawling.md) for details.
|
|
||||||
|
|
||||||
### 6. Enhanced Browser Context Management
|
|
||||||
|
|
||||||
We've improved how Crawl4AI manages browser contexts for better resource utilization and session handling.
|
|
||||||
|
|
||||||
- **`shared_data` in `CrawlerRunConfig`**: Pass data between hooks using the `shared_data` dictionary.
|
|
||||||
- **Context Reuse**: The crawler now intelligently reuses browser contexts based on configuration, reducing overhead.
|
|
||||||
|
|
||||||
### 7. Faster Scraping with `LXMLWebScrapingStrategy`
|
|
||||||
|
|
||||||
Introducing a new, optional **`LXMLWebScrapingStrategy`** that can be **10-20x faster** than the default BeautifulSoup approach for large, complex pages.
|
|
||||||
|
|
||||||
**How to Use**:
|
|
||||||
|
|
||||||
```python
|
|
||||||
from crawl4ai import LXMLWebScrapingStrategy
|
|
||||||
|
|
||||||
config = CrawlerRunConfig(
|
config = CrawlerRunConfig(
|
||||||
scraping_strategy=LXMLWebScrapingStrategy() # Add this line
|
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||||
|
cache_mode=CacheMode.ENABLED
|
||||||
)
|
)
|
||||||
```
|
```
|
||||||
|
|
||||||
**When to Use**:
|
## 🤖 LLM Integration
|
||||||
- If profiling shows a bottleneck in `WebScrapingStrategy`.
|
|
||||||
- For very large HTML documents where parsing speed matters.
|
|
||||||
|
|
||||||
**Caveats**:
|
### 1. LLM-Powered Markdown Generation
|
||||||
- It might not handle malformed HTML as gracefully as BeautifulSoup.
|
Smart content filtering and organization using LLMs:
|
||||||
- We're still gathering data, so report any issues!
|
|
||||||
|
|
||||||
---
|
```python
|
||||||
|
config = CrawlerRunConfig(
|
||||||
|
markdown_generator=DefaultMarkdownGenerator(
|
||||||
|
content_filter=LLMContentFilter(
|
||||||
|
provider="openai/gpt-4o",
|
||||||
|
instruction="Extract technical documentation and code examples"
|
||||||
|
)
|
||||||
|
)
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
## Try the Feature Demo Script!
|
### 2. Automatic Schema Generation
|
||||||
|
Generate extraction schemas instantly using LLMs instead of manual CSS/XPath writing:
|
||||||
|
|
||||||
We've prepared a Python script demonstrating these new features. You can find it at:
|
```python
|
||||||
|
schema = JsonCssExtractionStrategy.generate_schema(
|
||||||
|
html_content,
|
||||||
|
schema_type="CSS",
|
||||||
|
query="Extract product name, price, and description"
|
||||||
|
)
|
||||||
|
```
|
||||||
|
|
||||||
[**`features_demo.py`**](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/0_4_3b1_feature_demo.py)
|
## 🔧 Core Improvements
|
||||||
|
|
||||||
**To run the demo:**
|
### 1. Proxy Support & Rotation
|
||||||
|
Integrated proxy support with automatic rotation and verification:
|
||||||
|
|
||||||
1. Make sure you have Crawl4AI installed (`pip install crawl4ai`).
|
```python
|
||||||
2. Copy the `features_demo.py` script to your local environment.
|
config = CrawlerRunConfig(
|
||||||
3. Set your OpenAI API key as an environment variable (if using OpenAI models):
|
proxy_config={
|
||||||
```bash
|
"server": "http://proxy:8080",
|
||||||
export OPENAI_API_KEY="your_api_key"
|
"username": "user",
|
||||||
```
|
"password": "pass"
|
||||||
4. Run the script:
|
}
|
||||||
```bash
|
)
|
||||||
python features_demo.py
|
```
|
||||||
```
|
|
||||||
|
|
||||||
The script will execute various crawl scenarios, showcasing the new features and printing results to your console.
|
### 2. Robots.txt Compliance
|
||||||
|
Built-in robots.txt support with SQLite caching:
|
||||||
|
|
||||||
## Conclusion
|
```python
|
||||||
|
config = CrawlerRunConfig(check_robots_txt=True)
|
||||||
|
result = await crawler.arun(url, config=config)
|
||||||
|
if result.status_code == 403:
|
||||||
|
print("Access blocked by robots.txt")
|
||||||
|
```
|
||||||
|
|
||||||
Crawl4AI version 0.4.3b1 is a major step forward in flexibility, performance, and ease of use. With automatic schema generation, robots.txt handling, advanced content filtering, and streamlined multi-URL crawling, you can build powerful, efficient, and responsible web scrapers.
|
### 3. URL Redirection Tracking
|
||||||
|
Track final URLs after redirects:
|
||||||
|
|
||||||
We encourage you to try out these new capabilities, explore the updated documentation, and share your feedback! Your input is invaluable as we continue to improve Crawl4AI.
|
```python
|
||||||
|
result = await crawler.arun(url)
|
||||||
|
print(f"Initial URL: {url}")
|
||||||
|
print(f"Final URL: {result.redirected_url}")
|
||||||
|
```
|
||||||
|
|
||||||
**Stay Connected:**
|
## Performance Impact
|
||||||
|
|
||||||
- **Star** us on [GitHub](https://github.com/unclecode/crawl4ai) to show your support!
|
- Memory usage reduced by up to 40% with adaptive dispatcher
|
||||||
- **Follow** [@unclecode](https://twitter.com/unclecode) on Twitter for updates and tips.
|
- Parsing speed increased up to 20x with LXML strategy
|
||||||
- **Join** our community on Discord (link coming soon) to discuss your projects and get help.
|
- Streaming reduces memory footprint for large crawls by ~60%
|
||||||
|
|
||||||
Happy crawling!
|
## Getting Started
|
||||||
|
|
||||||
|
```bash
|
||||||
|
pip install -U crawl4ai
|
||||||
|
```
|
||||||
|
|
||||||
|
For complete examples, check our [demo repository](https://github.com/unclecode/crawl4ai/examples).
|
||||||
|
|
||||||
|
## Stay Connected
|
||||||
|
|
||||||
|
- Star us on [GitHub](https://github.com/unclecode/crawl4ai)
|
||||||
|
- Follow [@unclecode](https://twitter.com/unclecode)
|
||||||
|
- Join our [Discord](https://discord.gg/crawl4ai)
|
||||||
|
|
||||||
|
Happy crawling! 🕷️
|
||||||
Reference in New Issue
Block a user