feat(proxy): add proxy rotation strategy
Implements a new proxy rotation system with the following changes:

- Add ProxyRotationStrategy abstract base class
- Add RoundRobinProxyStrategy concrete implementation
- Integrate proxy rotation with AsyncWebCrawler
- Add proxy_rotation_strategy parameter to CrawlerRunConfig
- Add example script demonstrating proxy rotation usage
- Remove deprecated synchronous WebCrawler code
- Clean up rate limiting documentation

BREAKING CHANGE: Removed synchronous WebCrawler support and related rate limiting configurations
This commit is contained in:
161
docs/examples/proxy_rotation_demo.py
Normal file
161
docs/examples/proxy_rotation_demo.py
Normal file
@@ -0,0 +1,161 @@
|
||||
import os
|
||||
import re
|
||||
from typing import List, Dict
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
CacheMode,
|
||||
RoundRobinProxyStrategy
|
||||
)
|
||||
|
||||
def load_proxies_from_env() -> List[Dict]:
    """Load proxy definitions from the ``PROXIES`` environment variable.

    ``PROXIES`` is a comma-separated list of ``ip:port:username:password``
    entries. Whitespace around an entry is ignored and empty entries are
    skipped. A malformed entry is reported and skipped on its own, so it
    does not prevent the remaining entries from being loaded.

    Returns:
        One dict per valid entry with the keys ``server``
        (``http://<ip>:<port>``), ``username``, ``password`` and ``ip``
        (the bare address, kept so responses can be checked against it).
        The list is empty when ``PROXIES`` is unset or empty.
    """
    proxies = []
    for position, raw_entry in enumerate(os.getenv("PROXIES", "").split(","), start=1):
        entry = raw_entry.strip()
        if not entry:
            continue

        # maxsplit=3 keeps any ":" that is part of the password intact.
        parts = entry.split(":", 3)
        if len(parts) != 4:
            # The entry itself is not echoed: it may contain credentials.
            print(
                "Error loading proxies from environment: "
                f"entry #{position} is not in ip:port:username:password format"
            )
            continue

        ip, port, username, password = parts
        proxies.append({
            "server": f"http://{ip}:{port}",
            "username": username,
            "password": password,
            "ip": ip,  # Store original IP for verification
        })
    return proxies
|
||||
|
||||
async def demo_proxy_rotation():
    """
    Proxy Rotation Demo using RoundRobinProxyStrategy
    ===============================================
    Loads proxies from the environment, attaches a RoundRobinProxyStrategy
    to the run config and requests https://httpbin.org/ip once per proxy,
    one request at a time. After each successful request the first IPv4
    address found in the response is compared with the ``ip`` of the proxy
    currently stored on ``run_config.proxy_config``.
    """
    print("\n=== Proxy Rotation Demo (Round Robin) ===")

    proxies = load_proxies_from_env()
    if not proxies:
        print("No proxies found in environment. Set PROXIES env variable!")
        return

    rotation = RoundRobinProxyStrategy(proxies)
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        proxy_rotation_strategy=rotation,
    )

    # One request per configured proxy.
    targets = ["https://httpbin.org/ip"] * len(proxies)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        for target in targets:
            result = await crawler.arun(url=target, config=run_config)

            if not result.success:
                print(f"Request failed: {result.error_message}")
                continue

            # First dotted-quad in the page is taken as the IP the server saw.
            found = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
            # NOTE(review): presumably the crawler stores the proxy it picked
            # for this request on the run config -- confirm against the library.
            active_proxy = run_config.proxy_config or None
            if not active_proxy:
                continue

            response_ip = found.group(0) if found else 'Not found'
            print(f"Proxy {active_proxy['server']} -> Response IP: {response_ip}")

            if found and found.group(0) == active_proxy['ip']:
                print(f"✅ Proxy working! IP matches: {active_proxy['ip']}")
            else:
                print("❌ Proxy failed or IP mismatch!")
|
||||
|
||||
async def demo_proxy_rotation_batch():
    """
    Proxy Rotation Demo with Batch Processing
    =======================================
    Demonstrates proxy rotation using arun_many with a memory dispatcher.

    Requests https://httpbin.org/ip twice per configured proxy in a single
    ``arun_many`` call, then checks the IPv4 address found in each response
    against the IPs of the configured proxies and prints a summary.

    Any exception raised while running the demo is caught and printed.
    """
    # Imported here so the function also works when this module is imported
    # rather than executed as a script.
    from crawl4ai import DefaultMarkdownGenerator, MemoryAdaptiveDispatcher

    print("\n=== Proxy Rotation Batch Demo ===")

    try:
        # Load proxies and create rotation strategy
        proxies = load_proxies_from_env()
        if not proxies:
            print("No proxies found in environment. Set PROXIES env variable!")
            return

        proxy_strategy = RoundRobinProxyStrategy(proxies)

        # The run config is shared by the whole batch, so after arun_many
        # returns it can only describe one proxy, not the proxy used for each
        # individual result. Match each response against the full set of
        # configured proxy IPs instead.
        proxies_by_ip = {proxy["ip"]: proxy for proxy in proxies}
        ip_pattern = re.compile(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}')

        # Configurations
        browser_config = BrowserConfig(headless=True, verbose=False)
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            proxy_rotation_strategy=proxy_strategy,
            markdown_generator=DefaultMarkdownGenerator(),
        )

        # Test URLs - multiple requests to test rotation
        urls = ["https://httpbin.org/ip"] * (len(proxies) * 2)  # Test each proxy twice

        print("\n📈 Initializing crawler with proxy rotation...")
        async with AsyncWebCrawler(config=browser_config) as crawler:
            dispatcher = MemoryAdaptiveDispatcher(
                memory_threshold_percent=80.0,
                check_interval=0.5,
                max_session_permit=1,  # one session at a time
            )

            print("\n🚀 Starting batch crawl with proxy rotation...")
            results = await crawler.arun_many(
                urls=urls,
                config=run_config,
                dispatcher=dispatcher,
            )

            # Verify results
            success_count = 0
            for result in results:
                if not result.success:
                    print(f"Request failed for {result.url}: {result.error_message}")
                    print("---")
                    continue

                ip_match = ip_pattern.search(result.html)
                if not ip_match:
                    continue

                response_ip = ip_match.group(0)
                matched_proxy = proxies_by_ip.get(response_ip)

                print(f"URL {result.url}")
                if matched_proxy:
                    print(f"Proxy {matched_proxy['server']} -> Response IP: {response_ip}")
                    print(f"✅ Proxy working! IP matches: {matched_proxy['ip']}")
                    success_count += 1
                else:
                    print(f"Response IP: {response_ip}")
                    print("❌ Proxy failed or IP mismatch!")
                print("---")

            print(f"\n✅ Completed {len(results)} requests with {success_count} successful proxy verifications")

    except Exception as e:
        # Top-level boundary of the demo: report and return instead of crashing.
        print(f"\n❌ Error in proxy rotation batch demo: {str(e)}")
|
||||
|
||||
if __name__ == "__main__":
    import asyncio

    # demo_proxy_rotation_batch looks these names up as module globals, so
    # they must be bound before the demos run.
    from crawl4ai import CrawlerMonitor, DisplayMode, MemoryAdaptiveDispatcher, DefaultMarkdownGenerator

    async def run_demos():
        """Run the batch demo; the single-request demo is left disabled."""
        # await demo_proxy_rotation()  # Original single-request demo
        await demo_proxy_rotation_batch()  # New batch processing demo

    asyncio.run(run_demos())
|
||||
@@ -1,276 +0,0 @@
|
||||
import os, sys
|
||||
|
||||
# append the parent directory to the sys.path
|
||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.append(parent_dir)
|
||||
parent_parent_dir = os.path.dirname(parent_dir)
|
||||
sys.path.append(parent_parent_dir)
|
||||
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))
|
||||
__data__ = os.path.join(__location__, "__data")
|
||||
import asyncio
|
||||
from pathlib import Path
|
||||
import aiohttp
|
||||
import json
|
||||
from crawl4ai import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.content_filter_strategy import BM25ContentFilter
|
||||
|
||||
|
||||
# 1. File Download Processing Example
|
||||
async def download_example():
    """Crawl python.org's downloads page and click the first ``.exe`` link.

    Downloads are saved under ``~/.crawl4ai/downloads``. The path and size of
    every file reported by the crawl result are printed afterwards.
    """
    target_dir = os.path.join(Path.home(), ".crawl4ai", "downloads")
    os.makedirs(target_dir, exist_ok=True)

    print(f"Downloads will be saved to: {target_dir}")

    click_installer_js = """
    // Find and click the first Windows installer link
    const downloadLink = document.querySelector('a[href$=".exe"]');
    if (downloadLink) {
    console.log('Found download link:', downloadLink.href);
    downloadLink.click();
    } else {
    console.log('No .exe download link found');
    }
    """

    async with AsyncWebCrawler(
        accept_downloads=True, downloads_path=target_dir, verbose=True
    ) as crawler:
        result = await crawler.arun(
            url="https://www.python.org/downloads/",
            js_code=click_installer_js,
            delay_before_return_html=1,  # Wait 1 second so the download can start
            cache_mode=CacheMode.BYPASS,
        )

        if not result.downloaded_files:
            print("\nNo files were downloaded")
        else:
            print("\nDownload successful!")
            print("Downloaded files:")
            for file_path in result.downloaded_files:
                size_bytes = os.path.getsize(file_path)
                print(f"- {file_path}")
                print(f" File size: {size_bytes / (1024*1024):.2f} MB")
|
||||
|
||||
|
||||
# 2. Local File and Raw HTML Processing Example
|
||||
async def local_and_raw_html_example():
    """Crawl a local HTML file (``file://``) and an inline HTML string (``raw:``).

    A small sample page is written to ``__data__/sample.html``, crawled through
    a ``file://`` URL, and removed again. A second HTML snippet is passed
    directly with the ``raw:`` prefix. The markdown of both results is printed.
    """
    # The data directory is not guaranteed to exist on a fresh checkout.
    os.makedirs(__data__, exist_ok=True)

    # Create a sample HTML file
    sample_file = os.path.join(__data__, "sample.html")
    with open(sample_file, "w", encoding="utf-8") as f:
        f.write(
            """
            <html><body>
            <h1>Test Content</h1>
            <p>This is a test paragraph.</p>
            </body></html>
            """
        )

    try:
        async with AsyncWebCrawler(verbose=True) as crawler:
            # Process local file
            local_result = await crawler.arun(url=f"file://{os.path.abspath(sample_file)}")

            # Process raw HTML
            raw_html = """
            <html><body>
            <h1>Raw HTML Test</h1>
            <p>This is a test of raw HTML processing.</p>
            </body></html>
            """
            raw_result = await crawler.arun(url=f"raw:{raw_html}")
    finally:
        # Clean up the temporary file even when a crawl raises.
        os.remove(sample_file)

    print("Local file content:", local_result.markdown)
    print("\nRaw HTML content:", raw_result.markdown)
|
||||
|
||||
|
||||
# 3. Enhanced Markdown Generation Example
|
||||
async def markdown_generation_example():
    """Crawl a Wikipedia article with a BM25 content filter and show the markdown variants.

    Prints the lengths of the markdown versions exposed on
    ``result.markdown_v2``, writes each version to
    ``__data__/markdown_examples`` and prints a short sample of the citations
    and references output.
    """
    async with AsyncWebCrawler(verbose=True) as crawler:
        # BM25ContentFilter comes from the module-level import. Re-importing it
        # inside this function would make the name local to the whole function
        # and break any use that precedes the import statement.
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            css_selector="main div#bodyContent",
            content_filter=BM25ContentFilter(),
        )
        markdown = result.markdown_v2
        print(markdown.fit_markdown)

        print("\nMarkdown Generation Results:")
        print(f"1. Original markdown length: {len(result.markdown)}")
        print("2. New markdown versions (markdown_v2):")
        print(f" - Raw markdown length: {len(markdown.raw_markdown)}")
        print(
            f" - Citations markdown length: {len(markdown.markdown_with_citations)}"
        )
        print(
            f" - References section length: {len(markdown.references_markdown)}"
        )
        if markdown.fit_markdown:
            print(
                f" - Filtered markdown length: {len(markdown.fit_markdown)}"
            )

        # Save examples to files
        output_dir = os.path.join(__data__, "markdown_examples")
        os.makedirs(output_dir, exist_ok=True)

        # Save different versions; the filtered one only when it is non-empty.
        outputs = {
            "1_raw_markdown.md": markdown.raw_markdown,
            "2_citations_markdown.md": markdown.markdown_with_citations,
            "3_references.md": markdown.references_markdown,
        }
        if markdown.fit_markdown:
            outputs["4_filtered_markdown.md"] = markdown.fit_markdown

        for filename, text in outputs.items():
            # Explicit UTF-8: the page text is not limited to the platform's
            # default encoding.
            with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as f:
                f.write(text)

        print(f"\nMarkdown examples saved to: {output_dir}")

        # Show a sample of citations and references
        print("\nSample of markdown with citations:")
        print(markdown.markdown_with_citations[:500] + "...\n")
        print("Sample of references:")
        print(
            "\n".join(markdown.references_markdown.split("\n")[:10]) + "..."
        )
|
||||
|
||||
|
||||
# 4. Browser Management Example
|
||||
async def browser_management_example():
    """Crawl two pages with a managed, non-headless browser and a persistent profile.

    The browser profile lives in ``~/.crawl4ai/browser_profile``. Both pages
    are crawled with the cache bypassed; only the result of the last crawl
    (GitHub trending) is reported.
    """
    profile_dir = os.path.join(Path.home(), ".crawl4ai", "browser_profile")
    os.makedirs(profile_dir, exist_ok=True)

    print(f"Browser profile will be saved to: {profile_dir}")

    crawler_options = dict(
        use_managed_browser=True,
        user_data_dir=profile_dir,
        headless=False,
        verbose=True,
    )
    # GitHub comes last so that it is the page whose result gets reported.
    pages = ("https://crawl4ai.com", "https://github.com/trending")

    async with AsyncWebCrawler(**crawler_options) as crawler:
        for page_url in pages:
            result = await crawler.arun(
                url=page_url,
                # session_id="persistent_session_1",
                cache_mode=CacheMode.BYPASS,
            )

        print("\nBrowser session result:", result.success)
        if result.success:
            print("Page title:", result.metadata.get("title", "No title found"))
|
||||
|
||||
|
||||
# 5. API Usage Example
|
||||
async def api_example():
    """Submit a crawl job to a local Crawl4AI server and poll until it finishes.

    Posts a JSON-CSS extraction job for Hacker News to
    ``http://localhost:11235/crawl``, then polls ``/task/<task_id>`` once per
    second. The first four extracted items are printed when the task
    completes. Polling also stops when the task reports failure or when the
    poll budget is used up, so the example cannot spin forever.

    The bearer token is read from ``CRAWL4AI_API_TOKEN`` and falls back to
    ``"test_api_code"``.
    """
    api_token = os.getenv("CRAWL4AI_API_TOKEN") or "test_api_code"
    headers = {"Authorization": f"Bearer {api_token}"}

    # Upper bound on status checks (one per second).
    max_polls = 120

    crawl_request = {
        "urls": ["https://news.ycombinator.com"],  # Hacker News as an example
        "extraction_config": {
            "type": "json_css",
            "params": {
                "schema": {
                    "name": "Hacker News Articles",
                    "baseSelector": ".athing",
                    "fields": [
                        {"name": "title", "selector": ".title a", "type": "text"},
                        {"name": "score", "selector": ".score", "type": "text"},
                        {
                            "name": "url",
                            "selector": ".title a",
                            "type": "attribute",
                            "attribute": "href",
                        },
                    ],
                }
            },
        },
        "crawler_params": {
            "headless": True,
            # "use_managed_browser": True
        },
        "cache_mode": "bypass",
        # "screenshot": True,
        # "magic": True
    }

    async with aiohttp.ClientSession() as session:
        # Submit crawl job
        async with session.post(
            "http://localhost:11235/crawl", json=crawl_request, headers=headers
        ) as response:
            task_data = await response.json()
            task_id = task_data["task_id"]

        # Check task status
        for _ in range(max_polls):
            async with session.get(
                f"http://localhost:11235/task/{task_id}", headers=headers
            ) as status_response:
                result = await status_response.json()

            status = result["status"]
            print(f"Task status: {status}")

            if status == "completed":
                print("Task completed!")
                print("Results:")
                news = json.loads(result["results"][0]["extracted_content"])
                print(json.dumps(news[:4], indent=2))
                break

            # NOTE(review): "failed" is assumed to be the server's terminal
            # error status -- confirm against the API documentation.
            if status == "failed":
                print(f"Task failed: {result.get('error', 'no error details returned')}")
                break

            await asyncio.sleep(1)
        else:
            print(f"Task {task_id} did not finish after {max_polls} status checks; giving up")
|
||||
|
||||
|
||||
# Main execution
|
||||
async def main():
    """Run the enabled examples in order.

    Only the browser-management and API examples are active. The download,
    markdown-generation and local/raw-HTML examples are defined in this file
    but are not invoked here.
    """
    enabled_examples = (browser_management_example, api_example)
    for example in enabled_examples:
        await example()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -1,464 +0,0 @@
|
||||
"""
|
||||
Crawl4AI v0.4.24 Feature Walkthrough
|
||||
===================================
|
||||
|
||||
This script demonstrates the new features introduced in Crawl4AI v0.4.24.
|
||||
Each section includes detailed examples and explanations of the new capabilities.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
from typing import List
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
CacheMode,
|
||||
LLMExtractionStrategy,
|
||||
JsonCssExtractionStrategy,
|
||||
)
|
||||
from crawl4ai.content_filter_strategy import RelevantContentFilter
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
from bs4 import BeautifulSoup
|
||||
|
||||
# Sample HTML for demonstrations
|
||||
SAMPLE_HTML = """
|
||||
<div class="article-list">
|
||||
<article class="post" data-category="tech" data-author="john">
|
||||
<h2 class="title"><a href="/post-1">First Post</a></h2>
|
||||
<div class="meta">
|
||||
<a href="/author/john" class="author">John Doe</a>
|
||||
<span class="date">2023-12-31</span>
|
||||
</div>
|
||||
<div class="content">
|
||||
<p>First post content...</p>
|
||||
<a href="/read-more-1" class="read-more">Read More</a>
|
||||
</div>
|
||||
</article>
|
||||
<article class="post" data-category="science" data-author="jane">
|
||||
<h2 class="title"><a href="/post-2">Second Post</a></h2>
|
||||
<div class="meta">
|
||||
<a href="/author/jane" class="author">Jane Smith</a>
|
||||
<span class="date">2023-12-30</span>
|
||||
</div>
|
||||
<div class="content">
|
||||
<p>Second post content...</p>
|
||||
<a href="/read-more-2" class="read-more">Read More</a>
|
||||
</div>
|
||||
</article>
|
||||
</div>
|
||||
"""
|
||||
|
||||
|
||||
async def demo_ssl_features():
    """
    Enhanced SSL & Security Features Demo
    -----------------------------------

    Crawls https://example.com with ``fetch_ssl_certificate=True`` and, when
    the crawl succeeds and a certificate was returned, exports it to
    ``ssl_certificate.json`` in the current working directory.

    On failure the crawl's error message is printed and nothing is written.
    """
    print("\n1. Enhanced SSL & Security Demo")
    print("--------------------------------")

    browser_config = BrowserConfig()
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        fetch_ssl_certificate=True,  # Enable SSL certificate fetching
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url="https://example.com", config=run_config)
        print(f"SSL Crawl Success: {result.success}")

        # Check the outcome before touching the certificate: a failed crawl
        # may not carry one.
        if not result.success:
            print(f"SSL Error: {result.error_message}")
            return

        if result.ssl_certificate is None:
            print("SSL Error: crawl succeeded but no certificate was returned")
            return

        result.ssl_certificate.to_json(
            os.path.join(os.getcwd(), "ssl_certificate.json")
        )
|
||||
|
||||
|
||||
async def demo_content_filtering():
    """
    Smart Content Filtering Demo
    ----------------------

    Defines a RelevantContentFilter subclass tuned for news pages, plugs it
    into DefaultMarkdownGenerator and crawls Hacker News with it, printing the
    first 500 characters of the resulting markdown.
    """
    print("\n2. Smart Content Filtering Demo")
    print("--------------------------------")

    class CustomNewsFilter(RelevantContentFilter):
        """Keep headers and text blocks that are long enough and not link-heavy."""

        def __init__(self):
            super().__init__()
            # Class/id fragments that mark page chrome rather than article content.
            self.negative_patterns = re.compile(
                r"nav|footer|header|sidebar|ads|comment|share|related|recommended|popular|trending",
                re.I,
            )
            self.min_word_count = 30  # Higher threshold for news content

        def filter_content(
            self, html: str, min_word_threshold: int = None
        ) -> List[str]:
            """
            Select the content blocks of ``html`` worth keeping.

            Args:
                html (str): HTML content to be filtered
                min_word_threshold (int, optional): Minimum word count for a
                    block; a falsy value falls back to ``self.min_word_count``

            Returns:
                List[str]: Cleaned elements, in chunk order; empty when
                ``html`` is empty or not a string
            """
            if not (html and isinstance(html, str)):
                return []

            word_floor = min_word_threshold or self.min_word_count

            soup = BeautifulSoup(html, "lxml")
            if not soup.body:
                # Fragment without <body>: wrap it so there is one to search.
                soup = BeautifulSoup(f"<body>{html}</body>", "lxml")
            body = soup.find("body")

            # extract_text_chunks / is_excluded / clean_element are not defined
            # in this class (presumably inherited from RelevantContentFilter).
            kept = []
            for _, _chunk_text, tag_type, element in self.extract_text_chunks(body, word_floor):
                # Skip if element has negative class/id
                if self.is_excluded(element):
                    continue

                # Headers are always kept; other blocks must pass both checks.
                if tag_type != "header":
                    block_text = element.get_text(strip=True)
                    if len(block_text.split()) < word_floor:
                        continue

                    anchor_text = " ".join(
                        a.get_text(strip=True) for a in element.find_all("a")
                    )
                    link_density = len(anchor_text) / len(block_text) if block_text else 1
                    if link_density >= 0.5:
                        continue

                kept.append(self.clean_element(element))

            return kept

    # Create markdown generator with custom filter
    news_markdown = DefaultMarkdownGenerator(content_filter=CustomNewsFilter())
    run_config = CrawlerRunConfig(
        markdown_generator=news_markdown, cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://news.ycombinator.com", config=run_config
        )
        print("Filtered Content Sample:")
        print(result.markdown[:500])  # Show first 500 chars
|
||||
|
||||
|
||||
async def demo_json_extraction():
    """
    Improved JSON Extraction Demo
    ---------------------------

    Builds a JsonCssExtractionStrategy schema that combines ``baseFields``
    (attributes read from the selected element itself) with nested ``fields``,
    runs it against SAMPLE_HTML through the ``raw:`` URL prefix and prints the
    extracted content.
    """
    print("\n3. Improved JSON Extraction Demo")
    print("--------------------------------")

    def attribute_field(name, attribute):
        """Build a schema entry that reads one attribute."""
        return {"name": name, "type": "attribute", "attribute": attribute}

    # Fields extracted for every <article class="post">.
    post_fields = [
        {
            "name": "title",
            "selector": "h2.title a",
            "type": "text",
            "baseFields": [attribute_field("url", "href")],
        },
        {
            "name": "author",
            "selector": "div.meta a.author",
            "type": "text",
            "baseFields": [attribute_field("profile_url", "href")],
        },
        {"name": "date", "selector": "span.date", "type": "text"},
        {
            "name": "read_more",
            "selector": "a.read-more",
            "type": "nested",
            "fields": [
                {"name": "text", "type": "text"},
                attribute_field("url", "href"),
            ],
        },
    ]

    posts_field = {
        "name": "posts",
        "selector": "article.post",
        "type": "nested_list",
        "baseFields": [
            attribute_field("post_id", "data-post-id"),
            attribute_field("author_id", "data-author"),
        ],
        "fields": post_fields,
    }

    # Define the extraction schema with base element attributes
    schema = {
        "name": "Blog Posts",
        "baseSelector": "div.article-list",
        "baseFields": [
            attribute_field("list_id", "data-list-id"),
            attribute_field("category", "data-category"),
        ],
        "fields": [posts_field],
    }
    json_strategy = JsonCssExtractionStrategy(schema=schema)

    # Demonstrate extraction from raw HTML
    run_config = CrawlerRunConfig(
        extraction_strategy=json_strategy, cache_mode=CacheMode.BYPASS
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="raw:" + SAMPLE_HTML,  # Use raw: prefix for raw HTML
            config=run_config,
        )
        print("Extracted Content:")
        print(result.extracted_content)
|
||||
|
||||
|
||||
async def demo_input_formats():
    """
    Input Format Handling Demo
    ----------------------

    Runs the same LLM extraction twice over one inline job posting:
    1. with ``input_format="markdown"`` (the default)
    2. with ``input_format="html"``, where the instruction asks the model to
       use CSS classes and attributes as well as the visible text

    Both runs read the API key from ``OPENAI_API_KEY``. Each result is printed
    as indented JSON; if an extraction fails, the crawl's error message is
    printed instead.
    """
    from pydantic import BaseModel, Field
    from typing import List, Optional

    print("\n4. Input Format Handling Demo")
    print("---------------------------")

    # Create a dummy HTML with rich structure
    dummy_html = """
    <div class="job-posting" data-post-id="12345">
    <header class="job-header">
    <h1 class="job-title">Senior AI/ML Engineer</h1>
    <div class="job-meta">
    <span class="department">AI Research Division</span>
    <span class="location" data-remote="hybrid">San Francisco (Hybrid)</span>
    </div>
    <div class="salary-info" data-currency="USD">
    <span class="range">$150,000 - $220,000</span>
    <span class="period">per year</span>
    </div>
    </header>

    <section class="requirements">
    <div class="technical-skills">
    <h3>Technical Requirements</h3>
    <ul class="required-skills">
    <li class="skill required" data-priority="must-have">
    5+ years experience in Machine Learning
    </li>
    <li class="skill required" data-priority="must-have">
    Proficiency in Python and PyTorch/TensorFlow
    </li>
    <li class="skill preferred" data-priority="nice-to-have">
    Experience with distributed training systems
    </li>
    </ul>
    </div>

    <div class="soft-skills">
    <h3>Professional Skills</h3>
    <ul class="required-skills">
    <li class="skill required" data-priority="must-have">
    Strong problem-solving abilities
    </li>
    <li class="skill preferred" data-priority="nice-to-have">
    Experience leading technical teams
    </li>
    </ul>
    </div>
    </section>

    <section class="timeline">
    <time class="deadline" datetime="2024-02-28">
    Application Deadline: February 28, 2024
    </time>
    </section>

    <footer class="contact-section">
    <div class="hiring-manager">
    <h4>Hiring Manager</h4>
    <div class="contact-info">
    <span class="name">Dr. Sarah Chen</span>
    <span class="title">Director of AI Research</span>
    <span class="email">ai.hiring@example.com</span>
    </div>
    </div>
    <div class="team-info">
    <p>Join our team of 50+ researchers working on cutting-edge AI applications</p>
    </div>
    </footer>
    </div>
    """

    # Use raw:// prefix to pass HTML content directly
    url = f"raw://{dummy_html}"

    # Define our schema using Pydantic
    class JobRequirement(BaseModel):
        category: str = Field(
            description="Category of the requirement (e.g., Technical, Soft Skills)"
        )
        items: List[str] = Field(
            description="List of specific requirements in this category"
        )
        priority: str = Field(
            description="Priority level (Required/Preferred) based on the HTML class or context"
        )

    class JobPosting(BaseModel):
        title: str = Field(description="Job title")
        department: str = Field(description="Department or team")
        location: str = Field(description="Job location, including remote options")
        # Optional[...] alone does not make a field optional in Pydantic v2; an
        # explicit default is needed to keep it out of the schema's "required"
        # list.
        salary_range: Optional[str] = Field(
            default=None, description="Salary range if specified"
        )
        requirements: List[JobRequirement] = Field(
            description="Categorized job requirements"
        )
        application_deadline: Optional[str] = Field(
            default=None, description="Application deadline if specified"
        )
        contact_info: Optional[dict] = Field(
            default=None,
            description="Contact information from footer or contact section",
        )

    def print_extraction(heading, result):
        """Print the extracted JSON under ``heading``, or the error if there is none."""
        print(heading)
        if not result.success or not result.extracted_content:
            print(f"Extraction failed: {result.error_message}")
            return
        print(json.dumps(json.loads(result.extracted_content), indent=2))

    # First try with markdown (default)
    markdown_strategy = LLMExtractionStrategy(
        provider="openai/gpt-4o",
        api_token=os.getenv("OPENAI_API_KEY"),
        schema=JobPosting.model_json_schema(),
        extraction_type="schema",
        instruction="""
        Extract job posting details into structured data. Focus on the visible text content
        and organize requirements into categories.
        """,
        input_format="markdown",  # default
    )

    # Then with HTML for better structure understanding
    html_strategy = LLMExtractionStrategy(
        provider="openai/gpt-4",
        api_token=os.getenv("OPENAI_API_KEY"),
        schema=JobPosting.model_json_schema(),
        extraction_type="schema",
        instruction="""
        Extract job posting details, using HTML structure to:
        1. Identify requirement priorities from CSS classes (e.g., 'required' vs 'preferred')
        2. Extract contact info from the page footer or dedicated contact section
        3. Parse salary information from specially formatted elements
        4. Determine application deadline from timestamp or date elements

        Use HTML attributes and classes to enhance extraction accuracy.
        """,
        input_format="html",  # explicitly use HTML
    )

    async with AsyncWebCrawler() as crawler:
        # Try with markdown first
        markdown_config = CrawlerRunConfig(extraction_strategy=markdown_strategy)
        markdown_result = await crawler.arun(url=url, config=markdown_config)
        print_extraction("\nMarkdown-based Extraction Result:", markdown_result)

        # Then with HTML for better structure understanding
        html_config = CrawlerRunConfig(extraction_strategy=html_strategy)
        html_result = await crawler.arun(url=url, config=html_config)
        print_extraction("\nHTML-based Extraction Result:", html_result)
|
||||
|
||||
|
||||
# Main execution
|
||||
async def main():
    """Print the walkthrough banner and run the enabled demos in order.

    demo_input_formats is defined in this file but not run here.
    """
    print("Crawl4AI v0.4.24 Feature Walkthrough")
    print("====================================")

    # Run all demos
    enabled_demos = (demo_ssl_features, demo_content_filtering, demo_json_extraction)
    for demo in enabled_demos:
        await demo()
    # await demo_input_formats()


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -1,351 +0,0 @@
|
||||
"""
|
||||
Crawl4ai v0.4.3b2 Features Demo
|
||||
============================
|
||||
|
||||
This demonstration showcases three major categories of new features in Crawl4ai v0.4.3:
|
||||
|
||||
1. Efficiency & Speed:
|
||||
- Memory-efficient dispatcher strategies
|
||||
- New scraping algorithm
|
||||
- Streaming support for batch crawling
|
||||
|
||||
2. LLM Integration:
|
||||
- Automatic schema generation
|
||||
- LLM-powered content filtering
|
||||
- Smart markdown generation
|
||||
|
||||
3. Core Improvements:
|
||||
- Robots.txt compliance
|
||||
- Proxy rotation
|
||||
- Enhanced URL handling
|
||||
- Shared data among hooks
|
||||
- Add page routes
|
||||
|
||||
Each demo function can be run independently or as part of the full suite.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import json
|
||||
import re
|
||||
import random
|
||||
from typing import Optional, Dict
|
||||
from dotenv import load_dotenv
|
||||
|
||||
load_dotenv()
|
||||
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
BrowserConfig,
|
||||
CrawlerRunConfig,
|
||||
CacheMode,
|
||||
DisplayMode,
|
||||
MemoryAdaptiveDispatcher,
|
||||
CrawlerMonitor,
|
||||
DefaultMarkdownGenerator,
|
||||
LXMLWebScrapingStrategy,
|
||||
JsonCssExtractionStrategy,
|
||||
LLMContentFilter
|
||||
)
|
||||
|
||||
|
||||
async def demo_memory_dispatcher():
    """Run a small batch crawl through the memory-adaptive dispatcher.

    Builds a headless browser config and a cache-bypassing run config, then
    crawls a short list of example URLs with ``arun_many`` using a
    ``MemoryAdaptiveDispatcher`` that is wired to a ``CrawlerMonitor``.
    Any exception raised along the way is caught and reported on stdout
    instead of being propagated.
    """
    print("\n=== Memory Dispatcher Demo ===")

    try:
        # Browser-level and per-run settings
        browser_settings = BrowserConfig(headless=True, verbose=False)
        run_settings = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(),
        )

        # The three-host list repeated three times
        targets = ["http://example.com", "http://example.org", "http://example.net"] * 3

        print("\n📈 Initializing crawler with memory monitoring...")
        async with AsyncWebCrawler(config=browser_settings) as crawler:
            live_monitor = CrawlerMonitor(
                max_visible_rows=10,
                display_mode=DisplayMode.DETAILED,
            )
            adaptive_dispatcher = MemoryAdaptiveDispatcher(
                memory_threshold_percent=80.0,
                check_interval=0.5,
                max_session_permit=5,
                monitor=live_monitor,
            )

            print("\n🚀 Starting batch crawl...")
            outcomes = await crawler.arun_many(
                urls=targets,
                config=run_settings,
                dispatcher=adaptive_dispatcher,
            )
            # The count covers every returned result, not only successes.
            finished = len(outcomes)
            print(f"\n✅ Completed {finished} URLs successfully")

    except Exception as exc:
        print(f"\n❌ Error in memory dispatcher demo: {exc}")
|
||||
|
||||
async def demo_streaming_support():
    """
    2. Streaming Support Demo
    ======================
    Crawls a few URLs with ``stream=True`` and reports each result as the
    async iterator returned by ``arun_many`` produces it.
    """
    print("\n=== 2. Streaming Support Demo ===")

    browser_settings = BrowserConfig(headless=True, verbose=False)
    run_settings = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, stream=True)

    # The three-host list repeated twice
    targets = ["http://example.com", "http://example.org", "http://example.net"] * 2

    async with AsyncWebCrawler(config=browser_settings) as crawler:
        # Dispatcher used for the streaming run
        stream_dispatcher = MemoryAdaptiveDispatcher(
            max_session_permit=3,
            check_interval=0.5,
        )

        print("Starting streaming crawl...")
        result_stream = await crawler.arun_many(
            urls=targets,
            config=run_settings,
            dispatcher=stream_dispatcher,
        )
        async for item in result_stream:
            # Report every result; the content length only for successes.
            print(f"Received result for {item.url} - Success: {item.success}")
            if not item.success:
                continue
            print(f"Content length: {len(item.markdown)}")
|
||||
|
||||
async def demo_content_scraping():
    """
    3. Content Scraping Strategy Demo
    ==============================
    Crawls a single page with ``LXMLWebScrapingStrategy`` set as the
    scraping strategy and prints a confirmation when the crawl succeeds.
    """
    print("\n=== 3. Content Scraping Strategy Demo ===")

    crawler = AsyncWebCrawler()
    target = "https://example.com/article"

    # Run config that selects the LXML scraping strategy
    lxml_config = CrawlerRunConfig(
        scraping_strategy=LXMLWebScrapingStrategy(),
        verbose=True,
    )

    print("Scraping content with LXML strategy...")
    async with crawler:
        outcome = await crawler.arun(target, config=lxml_config)
        if not outcome.success:
            return
        print("Successfully scraped content using LXML strategy")
|
||||
|
||||
async def demo_llm_markdown():
    """
    4. LLM-Powered Markdown Generation Demo
    ===================================
    Crawls the Python classes tutorial with an ``LLMContentFilter`` attached
    to the markdown generator and prints the first 500 characters of the
    filtered markdown when the crawl succeeds.

    The filter reads its API token from the ``OPENAI_API_KEY`` environment
    variable.
    """
    print("\n=== 4. LLM-Powered Markdown Generation Demo ===")

    crawler = AsyncWebCrawler()
    url = "https://docs.python.org/3/tutorial/classes.html"

    content_filter = LLMContentFilter(
        provider="openai/gpt-4o",
        api_token=os.getenv("OPENAI_API_KEY"),
        instruction="""
        Focus on extracting the core educational content about Python classes.
        Include:
        - Key concepts and their explanations
        - Important code examples
        - Essential technical details
        Exclude:
        - Navigation elements
        - Sidebars
        - Footer content
        - Version information
        - Any non-essential UI elements

        Format the output as clean markdown with proper code blocks and headers.
        """,
        verbose=True,
    )

    # Configure LLM-powered markdown generation
    config = CrawlerRunConfig(
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=content_filter
        ),
        cache_mode=CacheMode.BYPASS,
        verbose=True,
    )

    print("Generating focused markdown with LLM...")
    async with crawler:
        result = await crawler.arun(url, config=config)
        if result.success and result.markdown_v2:
            # Success is reported once; the original printed this line twice.
            print("Successfully generated LLM-filtered markdown")
            print("First 500 chars of filtered content:")
            print(result.markdown_v2.fit_markdown[:500])
|
||||
|
||||
async def demo_robots_compliance():
    """
    5. Robots.txt Compliance Demo
    ==========================
    Crawls several sites with ``check_robots_txt=True`` and reports, per
    result, whether it came back with status 403 (reported as blocked by
    robots.txt) or succeeded. Results matching neither case are not printed.
    """
    print("\n=== 5. Robots.txt Compliance Demo ===")

    crawler = AsyncWebCrawler()
    sites = ["https://example.com", "https://facebook.com", "https://twitter.com"]

    # Run config with robots.txt checking switched on
    robots_config = CrawlerRunConfig(check_robots_txt=True, verbose=True)

    print("Crawling with robots.txt compliance...")
    async with crawler:
        outcomes = await crawler.arun_many(sites, config=robots_config)
        for outcome in outcomes:
            if outcome.status_code == 403:
                report = f"Access blocked by robots.txt: {outcome.url}"
            elif outcome.success:
                report = f"Successfully crawled: {outcome.url}"
            else:
                continue
            print(report)
|
||||
|
||||
async def demo_json_schema_generation():
    """
    7. LLM-Powered Schema Generation Demo
    =================================
    Generates a CSS extraction schema from a sample job-listing HTML snippet
    via ``JsonCssExtractionStrategy.generate_schema`` and then crawls a page
    using a ``JsonCssExtractionStrategy`` built from that schema.
    """
    print("\n=== 7. LLM-Powered Schema Generation Demo ===")

    # Example HTML content for a job listing
    html_content = """
    <div class="job-listing">
        <h1 class="job-title">Senior Software Engineer</h1>
        <div class="job-details">
            <span class="location">San Francisco, CA</span>
            <span class="salary">$150,000 - $200,000</span>
            <div class="requirements">
                <h2>Requirements</h2>
                <ul>
                    <li>5+ years Python experience</li>
                    <li>Strong background in web crawling</li>
                </ul>
            </div>
        </div>
    </div>
    """

    print("Generating CSS selectors schema...")
    # Generate CSS selectors with a specific query
    css_schema = JsonCssExtractionStrategy.generate_schema(
        html_content,
        schema_type="CSS",
        query="Extract job title, location, and salary information",
        provider="openai/gpt-4o",  # or use other providers like "ollama"
    )
    print("\nGenerated CSS Schema:")
    print(css_schema)

    # Example of using the generated schema with crawler
    crawler = AsyncWebCrawler()
    url = "https://example.com/job-listing"

    # Create an extraction strategy with the generated schema
    extraction_strategy = JsonCssExtractionStrategy(schema=css_schema)

    config = CrawlerRunConfig(extraction_strategy=extraction_strategy, verbose=True)

    print("\nTesting generated schema with crawler...")
    async with crawler:
        result = await crawler.arun(url, config=config)
        if result.success:
            extracted = result.extracted_content
            # extracted_content is parsed with json.loads elsewhere in these
            # examples, so decode it before pretty-printing; dumping the raw
            # value would emit one escaped, quoted string instead of JSON.
            print(json.dumps(json.loads(extracted), indent=2) if extracted else None)
            print("Successfully used generated schema for crawling")
|
||||
|
||||
async def demo_proxy_rotation():
    """
    8. Proxy Rotation Demo
    ===================
    Demonstrates how to rotate proxies for each request using Crawl4ai.

    For every test URL a proxy is picked at random from the ``PROXIES``
    environment variable (comma-separated ``ip:port:username:password``
    entries), the crawl is run through it, and the first IPv4-looking string
    in the response HTML is compared with the proxy's IP.
    """
    print("\n=== 8. Proxy Rotation Demo ===")

    async def get_next_proxy() -> Optional[Dict]:
        """Pick a random proxy from the PROXIES environment variable.

        Returns the proxy as a dict with ``server``, ``username``,
        ``password`` and ``ip`` keys, or ``None`` (after printing the error)
        when the variable is unset or the chosen entry is malformed.
        """
        try:
            proxies = os.getenv("PROXIES", "").split(",")

            ip, port, username, password = random.choice(proxies).split(":")
            return {
                "server": f"http://{ip}:{port}",
                "username": username,
                "password": password,
                "ip": ip,  # Store original IP for verification
            }
        except Exception as e:
            print(f"Error loading proxy: {e}")
            return None

    # Create 2 test requests to httpbin
    urls = ["https://httpbin.org/ip"] * 2

    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)

    async with AsyncWebCrawler(config=browser_config) as crawler:
        for url in urls:
            proxy = await get_next_proxy()
            if not proxy:
                print("No proxy available, skipping...")
                continue

            # Create new config with proxy
            current_config = run_config.clone(proxy_config=proxy, user_agent="")
            result = await crawler.arun(url=url, config=current_config)

            if result.success:
                ip_match = re.search(r'(?:[0-9]{1,3}\.){3}[0-9]{1,3}', result.html)
                response_ip = ip_match.group(0) if ip_match else None
                print(f"Proxy {proxy['ip']} -> Response IP: {response_ip if response_ip else 'Not found'}")
                # A response without any IP counts as a mismatch instead of
                # raising AttributeError on a None match object.
                verified = response_ip == proxy['ip']
                if verified:
                    print(f"✅ Proxy working! IP matches: {proxy['ip']}")
                else:
                    print("❌ Proxy failed or IP mismatch!")
            else:
                print(f"Failed with proxy {proxy['ip']}")
|
||||
|
||||
async def main():
    """Run every feature demo, grouped under its section heading."""
    print("\n📊 Running Crawl4ai v0.4.3 Feature Demos\n")

    # Each entry pairs a section heading with the demos awaited under it,
    # in the order they should run.
    demo_groups = (
        (
            "\n🚀 EFFICIENCY & SPEED DEMOS",
            (demo_memory_dispatcher, demo_streaming_support, demo_content_scraping),
        ),
        (
            "\n🤖 LLM INTEGRATION DEMOS",
            (demo_json_schema_generation, demo_llm_markdown),
        ),
        (
            "\n🔧 CORE IMPROVEMENT DEMOS",
            (demo_robots_compliance, demo_proxy_rotation),
        ),
    )
    for heading, demos in demo_groups:
        print(heading)
        for run_demo in demos:
            await run_demo()
|
||||
|
||||
# Script entry point: run the full demo suite only when executed directly.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user