Merged next branch

This commit is contained in:
Aravind Karnam
2025-04-12 10:47:02 +05:30
62 changed files with 3225 additions and 7085 deletions

View File

@@ -0,0 +1,471 @@
import asyncio
import json
import os
import base64
from pathlib import Path
from typing import List, Dict, Any
from datetime import datetime
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
from crawl4ai import BrowserConfig
# Directory containing this example script; demo artefacts are written below it.
__cur_dir__ = Path(__file__).parent

# Make sure the scratch directory used by the demos exists before any of them runs.
(__cur_dir__ / "tmp").mkdir(parents=True, exist_ok=True)
async def demo_basic_network_capture():
    """Crawl example.com with network capture enabled and print a summary.

    Prints the number of captured network events, a tally per ``event_type``,
    and the URL, method/status and header names of the first ``request`` and
    first ``response`` event. Only the banner is printed when the crawl fails
    or no events were recorded.
    """
    print("\n=== 1. Basic Network Request Capturing ===")

    async with AsyncWebCrawler() as crawler:
        run_config = CrawlerRunConfig(
            capture_network_requests=True,
            wait_until="networkidle",  # Wait for network to be idle
        )
        result = await crawler.arun(url="https://example.com/", config=run_config)

        if not (result.success and result.network_requests):
            return

        events = result.network_requests
        print(f"Captured {len(events)} network events")

        # Tally events per type; dict insertion order keeps first-seen order.
        tally = {}
        for event in events:
            kind = event.get("event_type", "unknown")
            tally[kind] = tally.get(kind, 0) + 1

        print("Event types:")
        for kind, total in tally.items():
            print(f" - {kind}: {total}")

        # Keep the first request and the first response as samples.
        sample_request = None
        sample_response = None
        for event in events:
            kind = event.get("event_type")
            if sample_request is None and kind == "request":
                sample_request = event
            if sample_response is None and kind == "response":
                sample_response = event

        if sample_request:
            header_names = list(sample_request.get("headers", {}).keys())
            print("\nSample request:")
            print(f" URL: {sample_request.get('url')}")
            print(f" Method: {sample_request.get('method')}")
            print(f" Headers: {header_names}")

        if sample_response:
            header_names = list(sample_response.get("headers", {}).keys())
            print("\nSample response:")
            print(f" URL: {sample_response.get('url')}")
            print(f" Status: {sample_response.get('status')} {sample_response.get('status_text', '')}")
            print(f" Headers: {header_names}")
async def demo_basic_console_capture():
    """Capture browser console output from a locally generated test page.

    Writes ``tmp/console_test.html`` (a page that logs at several console
    levels and catches one deliberate error), crawls it through a ``file://``
    URL with console capture enabled, then prints a tally per message type
    followed by every captured message.
    """
    print("\n=== 2. Basic Console Message Capturing ===")

    # Create a simple HTML file with console messages
    page_source = """
<!DOCTYPE html>
<html>
<head>
<title>Console Test</title>
</head>
<body>
<h1>Console Message Test</h1>
<script>
console.log("This is a basic log message");
console.info("This is an info message");
console.warn("This is a warning message");
console.error("This is an error message");
// Generate an error
try {
nonExistentFunction();
} catch (e) {
console.error("Caught error:", e);
}
</script>
</body>
</html>
"""
    html_file = os.path.join(__cur_dir__, "tmp", "console_test.html")
    with open(html_file, "w") as handle:
        handle.write(page_source)

    async with AsyncWebCrawler() as crawler:
        run_config = CrawlerRunConfig(
            capture_console_messages=True,
            wait_until="networkidle",  # Wait to make sure all scripts execute
        )
        result = await crawler.arun(url=f"file://{html_file}", config=run_config)

        if not (result.success and result.console_messages):
            return

        messages = result.console_messages
        print(f"Captured {len(messages)} console messages")

        # Tally messages per console level.
        per_level = {}
        for message in messages:
            level = message.get("type", "unknown")
            per_level[level] = per_level.get(level, 0) + 1

        print("Message types:")
        for level, total in per_level.items():
            print(f" - {level}: {total}")

        # Show all messages
        print("\nAll console messages:")
        for position, message in enumerate(messages, 1):
            level = message.get("type", "unknown")
            text = message.get("text", "")
            print(f" {position}. [{level}] {text}")
async def demo_combined_capture():
    """Capture network events and console messages in a single crawl.

    Crawls https://httpbin.org/html with both capture switches on, prints the
    two counts and dumps the raw captures to ``tmp/capture_data.json``.
    Nothing is written when the crawl is unsuccessful.
    """
    print("\n=== 3. Combined Network and Console Capture ===")

    async with AsyncWebCrawler() as crawler:
        run_config = CrawlerRunConfig(
            capture_network_requests=True,
            capture_console_messages=True,
            wait_until="networkidle",
        )
        result = await crawler.arun(url="https://httpbin.org/html", config=run_config)

        if not result.success:
            return

        # Either capture list may be missing/empty; count that as zero.
        network_count = len(result.network_requests or [])
        console_count = len(result.console_messages or [])
        print(f"Captured {network_count} network events and {console_count} console messages")

        # Save the captured data to a JSON file for analysis
        output_file = os.path.join(__cur_dir__, "tmp", "capture_data.json")
        with open(output_file, "w") as handle:
            snapshot = {
                "url": result.url,
                "timestamp": datetime.now().isoformat(),
                "network_requests": result.network_requests,
                "console_messages": result.console_messages,
            }
            json.dump(snapshot, handle, indent=2)

        print(f"Full capture data saved to {output_file}")
async def analyze_spa_network_traffic():
    """Crawl weather.com and break down the captured network traffic.

    Reports counts of request/response/failure events, requests per resource
    type, URLs containing "api", response status codes, failed requests and
    console errors, then stores the statistics together with the raw captures
    in ``tmp/weather_network_analysis.json``.
    """
    print("\n=== 4. Analyzing SPA Network Traffic ===")

    browser_config = BrowserConfig(
        headless=True,
        viewport_width=1280,
        viewport_height=800,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        run_config = CrawlerRunConfig(
            capture_network_requests=True,
            capture_console_messages=True,
            # Wait longer to ensure all resources are loaded
            wait_until="networkidle",
            page_timeout=60000,  # 60 seconds
        )
        result = await crawler.arun(url="https://weather.com", config=run_config)

        if not (result.success and result.network_requests):
            return

        events = result.network_requests

        # Split the event stream by kind; other event types are ignored.
        requests = [e for e in events if e.get("event_type") == "request"]
        responses = [e for e in events if e.get("event_type") == "response"]
        failures = [e for e in events if e.get("event_type") == "request_failed"]
        print(f"Captured {len(requests)} requests, {len(responses)} responses, and {len(failures)} failures")

        # Requests per resource type, most frequent first.
        resource_types = {}
        for req in requests:
            kind = req.get("resource_type", "unknown")
            resource_types[kind] = resource_types.get(kind, 0) + 1

        print("\nResource types:")
        by_frequency = sorted(resource_types.items(), key=lambda pair: pair[1], reverse=True)
        for kind, total in by_frequency:
            print(f" - {kind}: {total}")

        # Heuristic: any request whose URL mentions "api" counts as an API call.
        api_calls = [req for req in requests if "api" in req.get("url", "").lower()]
        if api_calls:
            print(f"\nDetected {len(api_calls)} API calls:")
            for rank, call in enumerate(api_calls[:5], 1):  # Show first 5
                print(f" {rank}. {call.get('method')} {call.get('url')}")
            hidden = len(api_calls) - 5
            if hidden > 0:
                print(f" ... and {hidden} more")

        # Responses per HTTP status code.
        status_codes = {}
        for resp in responses:
            code = resp.get("status", 0)
            status_codes[code] = status_codes.get(code, 0) + 1

        print("\nResponse status codes:")
        for code, total in sorted(status_codes.items()):
            print(f" - {code}: {total}")

        # Failed requests, first five only.
        if failures:
            print("\nFailed requests:")
            for rank, failure in enumerate(failures[:5], 1):  # Show first 5
                print(f" {rank}. {failure.get('url')} - {failure.get('failure_text')}")
            hidden = len(failures) - 5
            if hidden > 0:
                print(f" ... and {hidden} more")

        # Console messages of type "error", first three only.
        console_errors = []
        if result.console_messages:
            console_errors = [m for m in result.console_messages if m.get("type") == "error"]
            if console_errors:
                print(f"\nDetected {len(console_errors)} console errors:")
                for rank, error in enumerate(console_errors[:3], 1):  # Show first 3
                    print(f" {rank}. {error.get('text', '')[:100]}...")
                hidden = len(console_errors) - 3
                if hidden > 0:
                    print(f" ... and {hidden} more")

        # Save analysis to file
        output_file = os.path.join(__cur_dir__, "tmp", "weather_network_analysis.json")
        with open(output_file, "w") as handle:
            report = {
                "url": result.url,
                "timestamp": datetime.now().isoformat(),
                "statistics": {
                    "request_count": len(requests),
                    "response_count": len(responses),
                    "failure_count": len(failures),
                    "resource_types": resource_types,
                    # JSON object keys must be strings.
                    "status_codes": {str(code): total for code, total in status_codes.items()},
                    "api_call_count": len(api_calls),
                    "console_error_count": len(console_errors),
                },
                "network_requests": result.network_requests,
                "console_messages": result.console_messages,
            }
            json.dump(report, handle, indent=2)

        print(f"\nFull analysis saved to {output_file}")
def _is_first_party(domain, main_domain):
    """Return True when *domain* is the crawled site itself or one of its subdomains.

    A leading ``www.`` on *main_domain* is ignored so sibling hosts such as
    ``static.example.com`` count as first-party for ``www.example.com``.
    Matching is done on whole DNS labels (exact match or ``.site`` suffix), so
    ``www.example.com.evil.test`` is NOT treated as first-party.
    """
    site = main_domain.lower()
    if site.startswith("www."):
        site = site[len("www."):]
    if not site:
        # Without a known site nothing can be attributed to it.
        return False
    host = domain.lower()
    return host == site or host.endswith("." + site)


async def demo_security_analysis():
    """Use captured network traffic for a simple third-party/security review.

    Crawls https://www.nytimes.com/, collects the network location of every
    request event, splits the hosts into first-party and third-party, flags
    third-party hosts whose name contains a tracking-related keyword, lists
    plain-HTTP requests, and writes the findings to
    ``tmp/security_analysis.json``.
    """
    # Imported once, up front: the name must be bound even when the capture
    # contains no request events, because it is used again after the loop.
    from urllib.parse import urlparse

    print("\n=== 5. Security Analysis with Network Capture ===")

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            capture_network_requests=True,
            capture_console_messages=True,
            wait_until="networkidle",
        )
        # A site that makes multiple third-party requests
        result = await crawler.arun(url="https://www.nytimes.com/", config=config)

        if not (result.success and result.network_requests):
            return

        print(f"Captured {len(result.network_requests)} network events")

        request_urls = [
            event.get("url", "")
            for event in result.network_requests
            if event.get("event_type") == "request"
        ]

        # Extract all domains
        domains = set()
        for url in request_urls:
            try:
                domain = urlparse(url).netloc
            except ValueError:
                # urlparse rejects some malformed URLs; skip those only.
                continue
            if domain:
                domains.add(domain)

        print(f"\nDetected requests to {len(domains)} unique domains:")
        main_domain = urlparse(result.url).netloc

        # Separate first-party vs third-party domains (sorted for stable output).
        first_party = sorted(d for d in domains if _is_first_party(d, main_domain))
        third_party = sorted(d for d in domains if not _is_first_party(d, main_domain))
        print(f" - First-party domains: {len(first_party)}")
        print(f" - Third-party domains: {len(third_party)}")

        # Look for potential trackers/analytics (keyword heuristic on the host name).
        tracking_keywords = ["analytics", "tracker", "pixel", "tag", "stats", "metric", "collect", "beacon"]
        potential_trackers = [
            domain
            for domain in third_party
            if any(keyword in domain.lower() for keyword in tracking_keywords)
        ]
        if potential_trackers:
            print(f"\nPotential tracking/analytics domains ({len(potential_trackers)}):")
            for i, domain in enumerate(sorted(potential_trackers)[:10], 1):
                print(f" {i}. {domain}")
            if len(potential_trackers) > 10:
                print(f" ... and {len(potential_trackers) - 10} more")

        # Check for insecure (HTTP) requests
        insecure_requests = [url for url in request_urls if url.startswith("http://")]
        if insecure_requests:
            print(f"\nWarning: Found {len(insecure_requests)} insecure (HTTP) requests:")
            for i, url in enumerate(insecure_requests[:5], 1):
                print(f" {i}. {url}")
            if len(insecure_requests) > 5:
                print(f" ... and {len(insecure_requests) - 5} more")

        # Save security analysis to file
        output_file = os.path.join(__cur_dir__, "tmp", "security_analysis.json")
        with open(output_file, "w") as f:
            json.dump({
                "url": result.url,
                "main_domain": main_domain,
                "timestamp": datetime.now().isoformat(),
                "analysis": {
                    "total_requests": len(request_urls),
                    "unique_domains": len(domains),
                    "first_party_domains": first_party,
                    "third_party_domains": third_party,
                    "potential_trackers": potential_trackers,
                    "insecure_requests": insecure_requests,
                },
            }, f, indent=2)

        print(f"\nFull security analysis saved to {output_file}")
# File extension (lower-case, without the dot) -> coarse resource category.
_RESOURCE_TYPE_BY_EXT = {
    "jpg": "image", "jpeg": "image", "png": "image", "gif": "image",
    "webp": "image", "svg": "image", "ico": "image",
    "js": "javascript",
    "css": "css",
    "woff": "font", "woff2": "font", "ttf": "font", "otf": "font", "eot": "font",
}


def _classify_resource(url):
    """Map a URL to "image", "javascript", "css", "font" or "other".

    The extension is taken from the URL *path* only, so a query string or
    fragment (``app.js?v=1.0``) does not hide or distort the real extension.
    """
    from urllib.parse import urlparse

    try:
        path = urlparse(url).path
    except ValueError:
        # Malformed URL: no reliable extension to look at.
        return "other"
    ext = os.path.splitext(path)[1].lstrip(".").lower()
    return _RESOURCE_TYPE_BY_EXT.get(ext, "other")


async def demo_performance_analysis():
    """Use captured response timing for a rough performance breakdown.

    Crawls https://www.cnn.com/, keeps response events that carry
    ``request_timing``, groups them by resource category, prints
    count/average/max per category plus the five slowest resources, and
    writes the results to ``tmp/performance_analysis.json``. Nothing is
    analysed or written when no response has timing information.
    """
    print("\n=== 6. Performance Analysis with Network Capture ===")

    async with AsyncWebCrawler() as crawler:
        config = CrawlerRunConfig(
            capture_network_requests=True,
            wait_until="networkidle",
            page_timeout=60000,  # 60 seconds
        )
        result = await crawler.arun(url="https://www.cnn.com/", config=config)

        if not (result.success and result.network_requests):
            return

        # Filter only response events with timing information
        responses_with_timing = [
            r for r in result.network_requests
            if r.get("event_type") == "response" and r.get("request_timing")
        ]
        if not responses_with_timing:
            return

        print(f"Analyzing timing for {len(responses_with_timing)} network responses")

        # Group by resource type
        resource_timings = {}
        for resp in responses_with_timing:
            url = resp.get("url", "")
            timing = resp.get("request_timing", {})
            bucket = resource_timings.setdefault(_classify_resource(url), [])

            # Calculate request duration if timing information is available
            if isinstance(timing, dict) and "requestTime" in timing and "receiveHeadersEnd" in timing:
                # Convert to milliseconds
                duration = (timing["receiveHeadersEnd"] - timing["requestTime"]) * 1000
                bucket.append({"url": url, "duration_ms": duration})

        # Calculate statistics for each resource type
        print("\nPerformance by resource type:")
        for resource_type, timings in resource_timings.items():
            if not timings:
                continue
            durations = [t["duration_ms"] for t in timings]
            avg_duration = sum(durations) / len(durations)
            # max() returns the first entry with the largest duration.
            slowest = max(timings, key=lambda t: t["duration_ms"])
            print(f" {resource_type.upper()}:")
            print(f" - Count: {len(timings)}")
            print(f" - Avg time: {avg_duration:.2f} ms")
            print(f" - Max time: {slowest['duration_ms']:.2f} ms")
            print(f" - Slowest: {slowest['url']}")

        # Identify the slowest resources overall
        all_timings = []
        for resource_type, timings in resource_timings.items():
            for timing in timings:
                # Tags the shared dict, so the entries inside resource_timings
                # (and therefore the saved JSON) carry "type" as well.
                timing["type"] = resource_type
                all_timings.append(timing)
        all_timings.sort(key=lambda x: x["duration_ms"], reverse=True)

        print("\nTop 5 slowest resources:")
        for i, timing in enumerate(all_timings[:5], 1):
            print(f" {i}. [{timing['type']}] {timing['url']} - {timing['duration_ms']:.2f} ms")

        # Save performance analysis to file
        output_file = os.path.join(__cur_dir__, "tmp", "performance_analysis.json")
        with open(output_file, "w") as f:
            json.dump({
                "url": result.url,
                "timestamp": datetime.now().isoformat(),
                "resource_timings": resource_timings,
                "slowest_resources": all_timings[:10],  # Save top 10
            }, f, indent=2)

        print(f"\nFull performance analysis saved to {output_file}")
async def main():
    """Run every demo in this file, one after another, in a fixed order."""
    print("=== Network and Console Capture Examples ===")

    # Make sure tmp directory exists
    tmp_dir = os.path.join(__cur_dir__, "tmp")
    os.makedirs(tmp_dir, exist_ok=True)

    # Basic examples first, then the advanced analyses.
    demos = (
        demo_basic_network_capture,
        demo_basic_console_capture,
        demo_combined_capture,
        analyze_spa_network_traffic,
        demo_security_analysis,
        demo_performance_analysis,
    )
    for demo in demos:
        await demo()

    print("\n=== Examples Complete ===")
    print(f"Check the tmp directory for output files: {os.path.join(__cur_dir__, 'tmp')}")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -1,675 +0,0 @@
import os, sys
from crawl4ai import LLMConfig
# Make the directory three levels above this file importable (presumably the
# repository root, so the example runs from a source checkout -- confirm).
sys.path.append(
    os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)
# SECURITY: a real-looking Firecrawl key used to be hard-coded here. Secrets
# must never be committed; the previously committed key should be treated as
# leaked and revoked. Export FIRECRAWL_API_KEY in your shell instead. The
# placeholder is only a fallback and never overrides a value you have set.
os.environ.setdefault("FIRECRAWL_API_KEY", "YOUR-FIRECRAWL-API-KEY")
import asyncio
# import nest_asyncio
# nest_asyncio.apply()
import time
import json
import os
import re
from typing import Dict, List
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import (
JsonCssExtractionStrategy,
LLMExtractionStrategy,
)
# Resolved directory of this script; output files are written relative to it.
_script_dir = os.path.dirname(__file__)
__location__ = os.path.realpath(os.path.join(os.getcwd(), _script_dir))

# Banner shown once when the module is loaded.
print(
    "Crawl4AI: Advanced Web Crawling and Data Extraction",
    "GitHub Repository: https://github.com/unclecode/crawl4ai",
    "Twitter: @unclecode",
    "Website: https://crawl4ai.com",
    sep="\n",
)
async def simple_crawl():
    """Crawl the NBC News business page with the cache bypassed and print a preview."""
    print("\n--- Basic Usage ---")
    target_url = "https://www.nbcnews.com/business"
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(url=target_url, cache_mode=CacheMode.BYPASS)
        preview = result.markdown[:500]  # first 500 characters only
        print(preview)
async def simple_example_with_running_js_code():
    """Run a "Load More" click script on the page before extracting markdown.

    A ``wait_for`` predicate is prepared for illustration but is not passed to
    the crawler; only ``js_code`` is sent.
    """
    print("\n--- Executing JavaScript and Using CSS Selectors ---")

    # Example wait condition (currently unused). A plain CSS selector such as
    # "article.tease-card:nth-child(10)" would work as well.
    wait_for = """() => {
return Array.from(document.querySelectorAll('article.tease-card')).length > 10;
}"""

    # Click the first button whose label contains "Load More", if there is one.
    click_load_more = "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"

    async with AsyncWebCrawler(verbose=True) as crawler:
        js_code = [click_load_more]
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            js_code=js_code,
            # wait_for=wait_for,
            cache_mode=CacheMode.BYPASS,
        )
        preview = result.markdown[:500]  # first 500 characters only
        print(preview)
async def simple_example_with_css_selector():
    """Crawl the page with a ``css_selector`` argument and print a markdown preview."""
    print("\n--- Using CSS Selectors ---")
    crawl_options = {
        "url": "https://www.nbcnews.com/business",
        "css_selector": ".wide-tease-item__description",
        "cache_mode": CacheMode.BYPASS,
    }
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(**crawl_options)
        preview = result.markdown[:500]  # first 500 characters only
        print(preview)
async def use_proxy():
    """Crawl through a proxy; the proxy URL below is a placeholder to replace."""
    print("\n--- Using a Proxy ---")
    print(
        "Note: Replace 'http://your-proxy-url:port' with a working proxy to run this example."
    )
    # Placeholder address: substitute a reachable proxy before running.
    proxy_url = "http://your-proxy-url:port"
    async with AsyncWebCrawler(verbose=True, proxy=proxy_url) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            cache_mode=CacheMode.BYPASS,
        )
        if not result.success:
            return
        preview = result.markdown[:500]  # first 500 characters only
        print(preview)
async def capture_and_save_screenshot(url: str, output_path: str):
    """Crawl *url* with screenshot capture and write the image to *output_path*.

    The screenshot arrives base64-encoded on the result; it is decoded and
    written as raw bytes. A failure message is printed when the crawl fails or
    returns no screenshot.
    """
    async with AsyncWebCrawler(verbose=True) as crawler:
        result = await crawler.arun(
            url=url,
            screenshot=True,
            cache_mode=CacheMode.BYPASS,
        )

        if not (result.success and result.screenshot):
            print("Failed to capture screenshot")
            return

        import base64

        # Decode the base64 payload and store the resulting bytes unchanged.
        image_bytes = base64.b64decode(result.screenshot)
        with open(output_path, "wb") as handle:
            handle.write(image_bytes)

        print(f"Screenshot saved successfully to {output_path}")
# Schema for one pricing entry extracted by the LLM example below.
# (Documented with comments on purpose: a class docstring would be emitted as
# the "description" of the generated JSON schema.)
class OpenAIModelFee(BaseModel):
    # All three fields are required strings (default is Ellipsis).
    model_name: str = Field(default=..., description="Name of the OpenAI model.")
    input_fee: str = Field(
        default=...,
        description="Fee for input token for the OpenAI model.",
    )
    output_fee: str = Field(
        default=...,
        description="Fee for output token for the OpenAI model.",
    )
async def extract_structured_data_using_llm(
    provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
):
    """Extract model pricing from the OpenAI pricing page with an LLM strategy.

    Args:
        provider: Provider/model identifier handed to ``LLMConfig``.
        api_token: Credential for the provider. Required unless *provider* is
            exactly ``"ollama"``; without it the example is skipped.
        extra_headers: Optional headers forwarded under
            ``extra_args["extra_headers"]``.
    """
    print(f"\n--- Extracting Structured Data with {provider} ---")

    token_required = provider != "ollama"
    if token_required and api_token is None:
        print(f"API token is required for {provider}. Skipping this example.")
        return

    # Generation parameters passed through to the strategy.
    extra_args = {
        "temperature": 0,
        "top_p": 0.9,
        "max_tokens": 2000,
        # any other supported parameters for litellm
    }
    if extra_headers:
        extra_args["extra_headers"] = extra_headers

    async with AsyncWebCrawler(verbose=True) as crawler:
        strategy = LLMExtractionStrategy(
            llm_config=LLMConfig(provider=provider, api_token=api_token),
            schema=OpenAIModelFee.model_json_schema(),
            extraction_type="schema",
            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
Do not miss any models in the entire content. One extracted model JSON format should look like this:
{"model_name": "GPT-4", "input_fee": "US$10.00 / 1M tokens", "output_fee": "US$30.00 / 1M tokens"}.""",
            extra_args=extra_args,
        )
        result = await crawler.arun(
            url="https://openai.com/api/pricing/",
            word_count_threshold=1,
            extraction_strategy=strategy,
            cache_mode=CacheMode.BYPASS,
        )
        print(result.extracted_content)
async def extract_structured_data_using_css_extractor():
    """Extract course data from the KidoCode page with a CSS-selector schema.

    A script first clicks through every tab so the tab contents are present in
    the DOM, then ``JsonCssExtractionStrategy`` pulls the fields defined in
    the schema. The item count is always printed; the first item is printed
    only when at least one item was extracted.
    """
    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")

    # (field name, selector) pairs that are all plain text extractions.
    text_fields = [
        ("section_title", "h3.heading-50"),
        ("section_description", ".charge-content"),
        ("course_name", ".text-block-93"),
        ("course_description", ".course-content-text"),
    ]
    fields = [
        {"name": name, "selector": selector, "type": "text"}
        for name, selector in text_fields
    ]
    fields.append(
        {
            "name": "course_icon",
            "selector": ".image-92",
            "type": "attribute",
            "attribute": "src",
        }
    )
    schema = {
        "name": "KidoCode Courses",
        "baseSelector": "section.charge-methodology .w-tab-content > div",
        "fields": fields,
    }

    async with AsyncWebCrawler(headless=True, verbose=True) as crawler:
        # Create the JavaScript that handles clicking multiple times
        js_click_tabs = """
(async () => {
const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
for(let tab of tabs) {
// scroll to the tab
tab.scrollIntoView();
tab.click();
// Wait for content to load and animations to complete
await new Promise(r => setTimeout(r, 500));
}
})();
"""
        result = await crawler.arun(
            url="https://www.kidocode.com/degrees/technology",
            extraction_strategy=JsonCssExtractionStrategy(schema, verbose=True),
            js_code=[js_click_tabs],
            cache_mode=CacheMode.BYPASS,
        )

        # A failed crawl may leave extracted_content empty; treat that as "no
        # items" instead of crashing inside json.loads.
        companies = json.loads(result.extracted_content or "[]")
        print(f"Successfully extracted {len(companies)} companies")
        if companies:
            # Only index the first item when something was actually extracted.
            print(json.dumps(companies[0], indent=2))
# Advanced Session-Based Crawling with Dynamic Content 🔄
async def crawl_dynamic_content_pages_method_1():
    """Page through GitHub commits in one browser session, using a hook to wait.

    The first page is loaded normally; pages two and three are reached by
    clicking the "next" pagination link inside the same session
    (``js_only=True``). An ``on_execution_started`` hook waits until the first
    commit title differs from the one seen on the previous page before the
    content is read.
    """
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
    first_commit = ""

    async def on_execution_started(page):
        """Block until the first commit title on *page* changes."""
        nonlocal first_commit
        try:
            while True:
                await page.wait_for_selector("li.Box-sc-g0xbh4-0 h4")
                commit = await page.query_selector("li.Box-sc-g0xbh4-0 h4")
                commit = await commit.evaluate("(element) => element.textContent")
                # Drop all whitespace so formatting differences do not matter.
                commit = re.sub(r"\s+", "", commit)
                if commit and commit != first_commit:
                    first_commit = commit
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            # Best effort: report and let the crawl continue with what is there.
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")

    async with AsyncWebCrawler(verbose=True) as crawler:
        crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)

        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
(() => {
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
})();
"""

        for page in range(3):  # Crawl 3 pages
            result = await crawler.arun(
                url=url,
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                # Fixed: this was passed as `js=`, unlike every other call in
                # this file, which uses `js_code=` for scripts to execute.
                js_code=js_next_page if page > 0 else None,
                cache_mode=CacheMode.BYPASS,
                js_only=page > 0,
                headless=False,
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            soup = BeautifulSoup(result.cleaned_html, "html.parser")
            commits = soup.select("li")
            all_commits.extend(commits)

            print(f"Page {page + 1}: Found {len(commits)} commits")

        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
async def crawl_dynamic_content_pages_method_2():
    """Page through GitHub commits; the injected script itself waits for new content.

    Same three-page walk as method 1, but the JavaScript clicks "next" and then
    polls every 100 ms until the first commit title changes. Commit titles are
    extracted with a ``JsonCssExtractionStrategy`` schema.
    """
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")

    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []
        last_commit = ""  # never read below; kept from the original example

        js_next_page_and_wait = """
(async () => {
const getCurrentCommit = () => {
const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
return commits.length > 0 ? commits[0].textContent.trim() : null;
};
const initialCommit = getCurrentCommit();
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
// Poll for changes
while (true) {
await new Promise(resolve => setTimeout(resolve, 100)); // Wait 100ms
const newCommit = getCurrentCommit();
if (newCommit && newCommit !== initialCommit) {
break;
}
}
})();
"""

        # One item per commit row, carrying only its stripped title.
        title_field = {
            "name": "title",
            "selector": "h4.markdown-title",
            "type": "text",
            "transform": "strip",
        }
        schema = {
            "name": "Commit Extractor",
            "baseSelector": "li.Box-sc-g0xbh4-0",
            "fields": [title_field],
        }
        extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

        for page in range(3):  # Crawl 3 pages
            is_followup = page > 0  # pages after the first reuse the open tab
            result = await crawler.arun(
                url=url,
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                extraction_strategy=extraction_strategy,
                js_code=js_next_page_and_wait if is_followup else None,
                js_only=is_followup,
                cache_mode=CacheMode.BYPASS,
                headless=False,
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            page_commits = json.loads(result.extracted_content)
            all_commits.extend(page_commits)
            print(f"Page {page + 1}: Found {len(page_commits)} commits")

        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
async def crawl_dynamic_content_pages_method_3():
    """Page through GitHub commits, delegating the wait to a ``wait_for`` predicate.

    The click script stores the current first commit title on ``window`` and
    clicks "next"; the ``wait_for`` function returns true once the first
    commit title differs from the stored one.
    """
    print(
        "\n--- Advanced Multi-Page Crawling with JavaScript Execution using `wait_for` ---"
    )

    async with AsyncWebCrawler(verbose=True) as crawler:
        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []

        js_next_page = """
const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
if (commits.length > 0) {
window.firstCommit = commits[0].textContent.trim();
}
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
"""

        wait_for = """() => {
const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
if (commits.length === 0) return false;
const firstCommit = commits[0].textContent.trim();
return firstCommit !== window.firstCommit;
}"""

        # One item per commit row, carrying only its stripped title.
        title_field = {
            "name": "title",
            "selector": "h4.markdown-title",
            "type": "text",
            "transform": "strip",
        }
        schema = {
            "name": "Commit Extractor",
            "baseSelector": "li.Box-sc-g0xbh4-0",
            "fields": [title_field],
        }
        extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)

        for page in range(3):  # Crawl 3 pages
            is_followup = page > 0  # pages after the first reuse the open tab
            result = await crawler.arun(
                url=url,
                session_id=session_id,
                css_selector="li.Box-sc-g0xbh4-0",
                extraction_strategy=extraction_strategy,
                js_code=js_next_page if is_followup else None,
                wait_for=wait_for if is_followup else None,
                js_only=is_followup,
                cache_mode=CacheMode.BYPASS,
                headless=False,
            )

            assert result.success, f"Failed to crawl page {page + 1}"

            page_commits = json.loads(result.extracted_content)
            all_commits.extend(page_commits)
            print(f"Page {page + 1}: Found {len(page_commits)} commits")

        await crawler.crawler_strategy.kill_session(session_id)
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
async def crawl_custom_browser_type():
    """Crawl example.com once per browser engine and print preview plus timing.

    Runs Firefox, then WebKit, then the crawler's default engine (no
    ``browser_type`` passed). The timer starts before the crawler is created,
    so the reported time includes browser start-up.
    """
    engine_options = (
        {"browser_type": "firefox"},  # Use Firefox
        {"browser_type": "webkit"},  # Use WebKit
        {},  # Use Chromium (default)
    )
    for options in engine_options:
        start = time.time()
        async with AsyncWebCrawler(**options, verbose=True, headless=True) as crawler:
            result = await crawler.arun(
                url="https://www.example.com",
                cache_mode=CacheMode.BYPASS,
            )
            print(result.markdown[:500])
            print("Time taken: ", time.time() - start)
async def crawl_with_user_simultion():
    """Crawl a placeholder URL with ``magic=True`` and print the full markdown.

    ``"YOUR-URL-HERE"`` must be replaced with a real address before running.
    """
    target_url = "YOUR-URL-HERE"
    crawl_options = {
        "url": target_url,
        "cache_mode": CacheMode.BYPASS,
        # Automatically detects and removes overlays, popups, and other
        # elements that block content
        "magic": True,
        # Further switches mentioned by the example but left disabled:
        #   simulate_user=True      -> random mouse movements and clicks
        #   override_navigator=True -> make the navigator object look like a real user
    }
    async with AsyncWebCrawler(verbose=True, headless=True) as crawler:
        result = await crawler.arun(**crawl_options)
        print(result.markdown)
async def speed_comparison():
    """Time one Firecrawl scrape against three Crawl4AI crawls of the same page.

    Prints, for each run, the elapsed time, the markdown length and the number
    of occurrences of the image-host marker string. Requires the ``firecrawl``
    package and the ``FIRECRAWL_API_KEY`` environment variable.
    """
    from firecrawl import FirecrawlApp

    target_url = "https://www.nbcnews.com/business"
    image_marker = "cldnry.s-nbcnews.com"

    def print_header(title, started, finished):
        """Print the run title and its elapsed wall-clock time."""
        print(title)
        print(f"Time taken: {finished - started:.2f} seconds")

    def pruning_generator():
        """Build a fresh markdown generator with the pruning content filter."""
        return DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48, threshold_type="fixed", min_word_threshold=0
            )
            # alternative: BM25ContentFilter(user_query=None, bm25_threshold=1.0)
        )

    # --- Firecrawl ---
    app = FirecrawlApp(api_key=os.environ["FIRECRAWL_API_KEY"])
    start = time.time()
    scrape_status = app.scrape_url(
        target_url, params={"formats": ["markdown", "html"]}
    )
    end = time.time()
    print_header("Firecrawl:", start, end)
    print(f"Content length: {len(scrape_status['markdown'])} characters")
    print(f"Images found: {scrape_status['markdown'].count(image_marker)}")
    print()

    async with AsyncWebCrawler() as crawler:
        # Crawl4AI simple crawl
        start = time.time()
        result = await crawler.arun(
            url=target_url,
            word_count_threshold=0,
            cache_mode=CacheMode.BYPASS,
            verbose=False,
        )
        end = time.time()
        print_header("Crawl4AI (simple crawl):", start, end)
        print(f"Content length: {len(result.markdown)} characters")
        print(f"Images found: {result.markdown.count(image_marker)}")
        print()

        # Crawl4AI with advanced content filtering
        start = time.time()
        result = await crawler.arun(
            url=target_url,
            word_count_threshold=0,
            markdown_generator=pruning_generator(),
            cache_mode=CacheMode.BYPASS,
            verbose=False,
        )
        end = time.time()
        print_header("Crawl4AI (Markdown Plus):", start, end)
        print(f"Content length: {len(result.markdown.raw_markdown)} characters")
        print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
        print(f"Images found: {result.markdown.raw_markdown.count(image_marker)}")
        print()

        # Crawl4AI with JavaScript execution
        click_load_more = "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
        start = time.time()
        result = await crawler.arun(
            url=target_url,
            js_code=[click_load_more],
            word_count_threshold=0,
            cache_mode=CacheMode.BYPASS,
            markdown_generator=pruning_generator(),
            verbose=False,
        )
        end = time.time()
        print_header("Crawl4AI (with JavaScript execution):", start, end)
        print(f"Content length: {len(result.markdown.raw_markdown)} characters")
        print(f"Fit Markdown: {len(result.markdown.fit_markdown)} characters")
        print(f"Images found: {result.markdown.raw_markdown.count(image_marker)}")

    # Closing caveats about how to read the numbers above.
    closing_notes = (
        "\nNote on Speed Comparison:",
        "The speed test conducted here may not reflect optimal conditions.",
        "When we call Firecrawl's API, we're seeing its best performance,",
        "while Crawl4AI's performance is limited by the local network speed.",
        "For a more accurate comparison, it's recommended to run these tests",
        "on servers with a stable and fast internet connection.",
        "Despite these limitations, Crawl4AI still demonstrates faster performance.",
        "If you run these tests in an environment with better network conditions,",
        "you may observe an even more significant speed advantage for Crawl4AI.",
    )
    for note in closing_notes:
        print(note)
async def generate_knowledge_graph():
    """Extract entities and relationships from an essay and save them as JSON.

    Crawls https://paulgraham.com/love.html with an LLM extraction strategy
    whose schema is the ``KnowledgeGraph`` model below, then writes the raw
    extracted content to ``kb.json`` next to this script.
    """

    # NOTE: the models are documented with comments rather than docstrings so
    # that nothing extra ends up in the generated JSON schema.

    # A named thing mentioned in the text.
    class Entity(BaseModel):
        name: str
        description: str

    # A typed, described link between two entities.
    class Relationship(BaseModel):
        entity1: Entity
        entity2: Entity
        description: str
        relation_type: str

    # Root object the LLM is asked to produce.
    class KnowledgeGraph(BaseModel):
        entities: List[Entity]
        relationships: List[Relationship]

    # In case of Ollama just pass "no-token" as the api_token.
    llm_config = LLMConfig(
        provider="openai/gpt-4o-mini",
        api_token=os.getenv("OPENAI_API_KEY"),
    )
    extraction_strategy = LLMExtractionStrategy(
        llm_config=llm_config,
        schema=KnowledgeGraph.model_json_schema(),
        extraction_type="schema",
        instruction="""Extract entities and relationships from the given text.""",
    )

    async with AsyncWebCrawler() as crawler:
        url = "https://paulgraham.com/love.html"
        result = await crawler.arun(
            url=url,
            cache_mode=CacheMode.BYPASS,
            extraction_strategy=extraction_strategy,
        )

        kb_path = os.path.join(__location__, "kb.json")
        with open(kb_path, "w") as handle:
            handle.write(result.extracted_content)
async def fit_markdown_remove_overlay():
    """Crawl with a pruning content filter and save the HTML/markdown variants.

    On success, prints the lengths of the raw, citation and fit markdown and
    writes the cleaned HTML plus the three markdown variants into the
    ``output`` directory next to this script. The directory is created when it
    does not exist yet, and files are written as UTF-8.
    """
    async with AsyncWebCrawler(
        headless=True,  # Set to False to see what is happening
        verbose=True,
        user_agent_mode="random",
        user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
    ) as crawler:
        result = await crawler.arun(
            url="https://www.kidocode.com/degrees/technology",
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter(
                    threshold=0.48, threshold_type="fixed", min_word_threshold=0
                ),
                options={"ignore_links": True},
            ),
        )

        if result.success:
            print(len(result.markdown.raw_markdown))
            print(len(result.markdown.markdown_with_citations))
            print(len(result.markdown.fit_markdown))

            # Fixed: open(..., "w") fails with FileNotFoundError when the
            # target directory is missing, so create it first.
            output_dir = os.path.join(__location__, "output")
            os.makedirs(output_dir, exist_ok=True)

            outputs = (
                ("cleaned_html.html", result.cleaned_html),
                ("output_raw_markdown.md", result.markdown.raw_markdown),
                ("output_markdown_with_citations.md", result.markdown.markdown_with_citations),
                ("output_fit_markdown.md", result.markdown.fit_markdown),
            )
            for filename, content in outputs:
                # Explicit UTF-8: page text regularly contains characters the
                # platform default encoding cannot represent.
                with open(os.path.join(output_dir, filename), "w", encoding="utf-8") as f:
                    f.write(content)

    print("Done")
async def main():
    """Run the currently selected example.

    Only ``crawl_dynamic_content_pages_method_3`` is active; the remaining
    calls are kept as comments so individual examples can be switched on.
    """
    # await extract_structured_data_using_llm("openai/gpt-4o", os.getenv("OPENAI_API_KEY"))
    # await simple_crawl()
    # await simple_example_with_running_js_code()
    # await simple_example_with_css_selector()
    # # await use_proxy()
    # await capture_and_save_screenshot("https://www.example.com", os.path.join(__location__, "tmp/example_screenshot.jpg"))
    # await extract_structured_data_using_css_extractor()
    # LLM extraction examples
    # await extract_structured_data_using_llm()
    # await extract_structured_data_using_llm("huggingface/meta-llama/Meta-Llama-3.1-8B-Instruct", os.getenv("HUGGINGFACE_API_KEY"))
    # await extract_structured_data_using_llm("ollama/llama3.2")
    # You can always pass custom headers to the extraction strategy
    # custom_headers = {
    #     "Authorization": "Bearer your-custom-token",
    #     "X-Custom-Header": "Some-Value"
    # }
    # await extract_structured_data_using_llm(extra_headers=custom_headers)
    # await crawl_dynamic_content_pages_method_1()
    # await crawl_dynamic_content_pages_method_2()
    await crawl_dynamic_content_pages_method_3()
    # await crawl_custom_browser_type()
    # await speed_comparison()


# Run the selected example only when executed as a script.
if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,412 @@
import asyncio
import os
import json
import base64
from pathlib import Path
from typing import List
from crawl4ai.proxy_strategy import ProxyConfig
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode, CrawlResult
from crawl4ai import RoundRobinProxyStrategy
from crawl4ai import JsonCssExtractionStrategy, LLMExtractionStrategy
from crawl4ai import LLMConfig
from crawl4ai import PruningContentFilter, BM25ContentFilter
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai import BFSDeepCrawlStrategy, DomainFilter, FilterChain
from crawl4ai import BrowserConfig
# Directory containing this script; the demos build their tmp/ output paths from it.
__cur_dir__ = Path(__file__).parent
async def demo_basic_crawl():
    """Crawl Hacker News once and report the size of the generated markdown."""
    print("\n=== 1. Basic Web Crawling ===")
    browser_settings = BrowserConfig(
        viewport_height=800,
        viewport_width=1200,
        headless=True,
        verbose=True,
    )
    async with AsyncWebCrawler(config=browser_settings) as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url="https://news.ycombinator.com/"
        )
        for position, item in enumerate(results, start=1):
            print(f"Result {position}:")
            print(f"Success: {item.success}")
            if not item.success:
                print("Failed to crawl the URL")
                continue
            raw_md = item.markdown.raw_markdown
            print(f"Markdown length: {len(raw_md)} chars")
            print(f"First 100 chars: {raw_md[:100]}...")
async def demo_parallel_crawl():
    """Fetch several URLs with one ``arun_many`` call and list each outcome."""
    print("\n=== 2. Parallel Crawling ===")
    targets = [
        "https://news.ycombinator.com/",
        "https://example.com/",
        "https://httpbin.org/html",
    ]
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun_many(urls=targets)
        print(f"Crawled {len(results)} URLs in parallel:")
        for number, outcome in enumerate(results, start=1):
            status = "Success" if outcome.success else "Failed"
            print(f" {number}. {outcome.url} - {status}")
async def demo_fit_markdown():
    """Generate focused ("fit") markdown with a pruning content filter.

    Crawls the Wikipedia article on Python with ``PruningContentFilter``
    attached to the markdown generator, then prints the character counts of
    the raw and the filtered markdown for comparison.
    """
    # No LLM is involved here: the filter in use is PruningContentFilter.
    print("\n=== 3. Fit Markdown with Pruning Content Filter ===")
    async with AsyncWebCrawler() as crawler:
        result: CrawlResult = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Python_(programming_language)",
            config=CrawlerRunConfig(
                markdown_generator=DefaultMarkdownGenerator(
                    content_filter=PruningContentFilter()
                )
            ),
        )
        # Print size statistics for both markdown variants
        print(f"Raw: {len(result.markdown.raw_markdown)} chars")
        print(f"Fit: {len(result.markdown.fit_markdown)} chars")
async def demo_llm_structured_extraction_no_schema():
    """Extract structured news items from Hacker News with an LLM.

    The "schema" is a loose textual description rather than a JSON schema;
    the extracted content of every successful result is printed as JSON.
    """
    print("\n=== 4. LLM-Based Structured Extraction ===")
    # Create a simple LLM extraction strategy (no formal schema required)
    extraction_strategy = LLMExtractionStrategy(
        llm_config=LLMConfig(
            provider="groq/qwen-2.5-32b",
            api_token="env:GROQ_API_KEY",
        ),
        instruction="This is news.ycombinator.com, extract all news, and for each, I want title, source url, number of comments.",
        # The keyword is `extraction_type` (as used by the other examples);
        # the previous `extract_type` spelling did not match it.
        extraction_type="schema",
        schema="{title: string, url: string, comments: int}",
        extra_args={
            "temperature": 0.0,
            "max_tokens": 4096,
        },
        verbose=True,
    )
    config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            "https://news.ycombinator.com/", config=config
        )
        for result in results:
            print(f"URL: {result.url}")
            print(f"Success: {result.success}")
            if result.success:
                data = json.loads(result.extracted_content)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")
async def demo_css_structured_extraction_no_schema():
    """Extract structured data using CSS selectors.

    A CSS extraction schema is loaded from ``tmp/schema.json`` when present;
    otherwise it is generated once by an LLM from a sample HTML snippet and
    cached in that file. The schema is then used for LLM-free extraction of
    thehackernews.com, and each successful result is printed as JSON.
    """
    print("\n=== 5. CSS-Based Structured Extraction ===")
    # Sample HTML for schema generation (one-time cost)
    sample_html = """
<div class="body-post clear">
<a class="story-link" href="https://thehackernews.com/2025/04/malicious-python-packages-on-pypi.html">
<div class="clear home-post-box cf">
<div class="home-img clear">
<div class="img-ratio">
<img alt="..." src="...">
</div>
</div>
<div class="clear home-right">
<h2 class="home-title">Malicious Python Packages on PyPI Downloaded 39,000+ Times, Steal Sensitive Data</h2>
<div class="item-label">
<span class="h-datetime"><i class="icon-font icon-calendar"></i>Apr 05, 2025</span>
<span class="h-tags">Malware / Supply Chain Attack</span>
</div>
<div class="home-desc"> Cybersecurity researchers have...</div>
</div>
</div>
</a>
</div>
"""
    # Check if schema file exists
    schema_file_path = f"{__cur_dir__}/tmp/schema.json"
    if os.path.exists(schema_file_path):
        with open(schema_file_path, "r") as f:
            schema = json.load(f)
    else:
        # Generate schema using LLM (one-time setup)
        schema = JsonCssExtractionStrategy.generate_schema(
            html=sample_html,
            llm_config=LLMConfig(
                provider="groq/qwen-2.5-32b",
                api_token="env:GROQ_API_KEY",
            ),
            query="From https://thehackernews.com/, I have shared a sample of one news div with a title, date, and description. Please generate a schema for this news div.",
        )
        print(f"Generated schema: {json.dumps(schema, indent=2)}")
        # Cache the schema so later runs skip the LLM call entirely. The tmp
        # directory may not exist yet, so create it before writing.
        os.makedirs(os.path.dirname(schema_file_path), exist_ok=True)
        with open(schema_file_path, "w") as f:
            json.dump(schema, f, indent=2)

    # Create no-LLM extraction strategy with the generated schema
    extraction_strategy = JsonCssExtractionStrategy(schema)
    config = CrawlerRunConfig(extraction_strategy=extraction_strategy)
    # Use the fast CSS extraction (no LLM calls during extraction)
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            "https://thehackernews.com", config=config
        )
        for result in results:
            print(f"URL: {result.url}")
            print(f"Success: {result.success}")
            if result.success:
                data = json.loads(result.extracted_content)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")
async def demo_deep_crawl():
    """Run a BFS deep crawl limited to one domain and list the visited pages."""
    print("\n=== 6. Deep Crawling ===")
    domain_only = FilterChain([DomainFilter(allowed_domains=["crawl4ai.com"])])
    bfs_strategy = BFSDeepCrawlStrategy(
        max_depth=1, max_pages=5, filter_chain=domain_only
    )
    run_config = CrawlerRunConfig(deep_crawl_strategy=bfs_strategy)
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            url="https://docs.crawl4ai.com",
            config=run_config,
        )
        print(f"Deep crawl returned {len(results)} pages:")
        for number, page in enumerate(results, start=1):
            depth = page.metadata.get("depth", "unknown")
            print(f" {number}. {page.url} (Depth: {depth})")
async def demo_js_interaction():
    """Execute JavaScript to load more content.

    Crawls the Hacker News front page, then clicks the "More" link inside the
    same browser session and extracts the items of the following page. The
    titles of both pages are accumulated and the running totals printed.
    """
    print("\n=== 7. JavaScript Interaction ===")
    news_schema = {
        "name": "news",
        "baseSelector": "tr.athing",
        "fields": [
            {
                "name": "title",
                "selector": "span.titleline",
                "type": "text",
            }
        ],
    }
    news = []

    def collect(crawl_results):
        # Append the extracted items of every successful result to `news`.
        for item in crawl_results:
            if item.success:
                data = json.loads(item.extracted_content)
                news.extend(data)
                print(json.dumps(data, indent=2))
            else:
                print("Failed to extract structured data")

    # A simple page that needs JS to reveal content
    async with AsyncWebCrawler(config=BrowserConfig(headless=False)) as crawler:
        # Initial load
        results: List[CrawlResult] = await crawler.arun(
            url="https://news.ycombinator.com",
            config=CrawlerRunConfig(
                session_id="hn_session",  # Keep session
                extraction_strategy=JsonCssExtractionStrategy(schema=news_schema),
            ),
        )
        collect(results)
        print(f"Initial items: {len(news)}")

        # Click "More" link
        more_config = CrawlerRunConfig(
            js_code="document.querySelector('a.morelink').click();",
            js_only=True,  # Continue in same page
            session_id="hn_session",  # Keep session
            extraction_strategy=JsonCssExtractionStrategy(
                schema=news_schema,
            ),
        )
        more_results: List[CrawlResult] = await crawler.arun(
            url="https://news.ycombinator.com", config=more_config
        )
        # Extract new items. The original code iterated the first `results`
        # again here, which double-counted page one and ignored page two.
        collect(more_results)
        print(f"Total items: {len(news)}")
async def demo_media_and_links():
    """Extract media and links from a page.

    Prints how many images and internal/external links were found on the
    Wikipedia main page, shows up to three of each, and saves the full lists
    to ``tmp/images.json`` and ``tmp/links.json`` next to this script.
    """
    print("\n=== 8. Media and Links Extraction ===")
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            "https://en.wikipedia.org/wiki/Main_Page"
        )
        # A distinct loop variable avoids rebinding the name of the list
        # that is being iterated.
        for result in results:
            # Extract all images
            images = result.media.get("images", [])
            print(f"Found {len(images)} images")
            # Extract all links (internal and external)
            internal_links = result.links.get("internal", [])
            external_links = result.links.get("external", [])
            print(f"Found {len(internal_links)} internal links")
            print(f"Found {len(external_links)} external links")
            # Print some of the images and links
            for image in images[:3]:
                print(f"Image: {image['src']}")
            for link in internal_links[:3]:
                print(f"Internal link: {link['href']}")
            for link in external_links[:3]:
                print(f"External link: {link['href']}")
            # Save everything to files; make sure the target directory exists.
            os.makedirs(f"{__cur_dir__}/tmp", exist_ok=True)
            with open(f"{__cur_dir__}/tmp/images.json", "w") as f:
                json.dump(images, f, indent=2)
            with open(f"{__cur_dir__}/tmp/links.json", "w") as f:
                json.dump(
                    {"internal": internal_links, "external": external_links},
                    f,
                    indent=2,
                )
async def demo_screenshot_and_pdf():
    """Capture screenshot and PDF of a page.

    The screenshot is base64-decoded before being written to
    ``tmp/example_screenshot.png``; the PDF is written as returned to
    ``tmp/example.pdf``. Both files live next to this script.
    """
    print("\n=== 9. Screenshot and PDF Capture ===")
    async with AsyncWebCrawler() as crawler:
        results: List[CrawlResult] = await crawler.arun(
            # url="https://example.com",
            url="https://en.wikipedia.org/wiki/Giant_anteater",
            config=CrawlerRunConfig(screenshot=True, pdf=True),
        )
        # Make sure the output directory exists before any file is opened.
        os.makedirs(f"{__cur_dir__}/tmp", exist_ok=True)
        # A distinct loop variable avoids rebinding the name of the list
        # that is being iterated.
        for result in results:
            if result.screenshot:
                # Save screenshot
                screenshot_path = f"{__cur_dir__}/tmp/example_screenshot.png"
                with open(screenshot_path, "wb") as f:
                    f.write(base64.b64decode(result.screenshot))
                print(f"Screenshot saved to {screenshot_path}")
            if result.pdf:
                # Save PDF
                pdf_path = f"{__cur_dir__}/tmp/example.pdf"
                with open(pdf_path, "wb") as f:
                    f.write(result.pdf)
                print(f"PDF saved to {pdf_path}")
async def demo_proxy_rotation():
    """Show how a round-robin proxy strategy is wired into a run config.

    No request is issued: the proxies are placeholders, so the function only
    builds the configuration and prints explanatory notes.
    """
    print("\n=== 10. Proxy Rotation ===")
    # Example proxies (replace with real ones)
    placeholder_servers = (
        "http://proxy1.example.com:8080",
        "http://proxy2.example.com:8080",
    )
    proxies = [ProxyConfig(server=address) for address in placeholder_servers]
    rotation = RoundRobinProxyStrategy(proxies)
    print(f"Using {len(proxies)} proxies in rotation")
    print(
        "Note: This example uses placeholder proxies - replace with real ones to test"
    )
    async with AsyncWebCrawler() as crawler:
        # Built for illustration only; it is not passed to any crawl here.
        config = CrawlerRunConfig(proxy_rotation_strategy=rotation)
        # In a real scenario, these would be run and the proxies would rotate
        print("In a real scenario, requests would rotate through the available proxies")
async def demo_raw_html_and_file():
    """Process raw HTML and local files.

    The same HTML snippet is crawled twice: once passed inline through a
    ``raw:`` URL and once from a temporary file through a ``file://`` URL.
    The temporary file is stored in ``tmp/`` next to this script, so the demo
    works from any working directory, and is removed even if a crawl raises.
    """
    print("\n=== 11. Raw HTML and Local Files ===")
    raw_html = """
<html><body>
<h1>Sample Article</h1>
<p>This is sample content for testing Crawl4AI's raw HTML processing.</p>
</body></html>
"""
    # Save to file (anchored at the script directory, not the CWD)
    file_path = (__cur_dir__ / "tmp" / "sample.html").absolute()
    file_path.parent.mkdir(parents=True, exist_ok=True)
    with open(file_path, "w") as f:
        f.write(raw_html)
    try:
        async with AsyncWebCrawler() as crawler:
            # Crawl raw HTML
            raw_result = await crawler.arun(
                url="raw:" + raw_html,
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
            )
            print("Raw HTML processing:")
            print(f" Markdown: {raw_result.markdown.raw_markdown[:50]}...")
            # Crawl local file
            file_result = await crawler.arun(
                url=f"file://{file_path}",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
            )
            print("\nLocal file processing:")
            print(f" Markdown: {file_result.markdown.raw_markdown[:50]}...")
    finally:
        # Clean up
        os.remove(file_path)
    print(f"Processed both raw HTML and local file ({file_path})")
async def main():
    """Run all enabled demo functions one after another."""
    print("=== Comprehensive Crawl4AI Demo ===")
    print("Note: Some examples require API keys or other configurations")
    enabled_demos = (
        demo_basic_crawl,
        demo_parallel_crawl,
        demo_fit_markdown,
        demo_llm_structured_extraction_no_schema,
        demo_css_structured_extraction_no_schema,
        demo_deep_crawl,
        demo_js_interaction,
        demo_media_and_links,
        demo_screenshot_and_pdf,
        # demo_proxy_rotation,  # currently disabled
        demo_raw_html_and_file,
    )
    # Demos are awaited strictly in sequence.
    for demo in enabled_demos:
        await demo()
    print("\n=== Demo Complete ===")
    print("Check for any generated files (screenshots, PDFs) in the current directory")


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -0,0 +1,562 @@
import os, sys
from crawl4ai.types import LLMConfig
sys.path.append(
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
)
import asyncio
import time
import json
import re
from typing import Dict
from bs4 import BeautifulSoup
from pydantic import BaseModel, Field
from crawl4ai import AsyncWebCrawler, CacheMode, BrowserConfig, CrawlerRunConfig
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
from crawl4ai.content_filter_strategy import PruningContentFilter
from crawl4ai.extraction_strategy import (
JsonCssExtractionStrategy,
LLMExtractionStrategy,
)
# Resolved directory of this script; examples build their output paths from it.
__location__ = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__)))

# Banner printed as soon as the module is loaded.
print("Crawl4AI: Advanced Web Crawling and Data Extraction")
print("GitHub Repository: https://github.com/unclecode/crawl4ai")
print("Twitter: @unclecode")
print("Website: https://crawl4ai.com")
# Basic Example - Simple Crawl
async def simple_crawl():
    """Crawl one page headlessly and print the first 500 markdown characters."""
    print("\n--- Basic Usage ---")
    headless_browser = BrowserConfig(headless=True)
    no_cache = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    async with AsyncWebCrawler(config=headless_browser) as crawler:
        page = await crawler.arun(
            url="https://www.nbcnews.com/business", config=no_cache
        )
        preview = page.markdown[:500]
        print(preview)
async def clean_content():
    """Compare raw and pruned markdown lengths for a Wikipedia article.

    The run excludes nav/footer/aside tags, removes overlay elements and
    applies a fixed-threshold pruning filter to the markdown generator.
    """
    pruning = PruningContentFilter(
        threshold=0.48, threshold_type="fixed", min_word_threshold=0
    )
    generator = DefaultMarkdownGenerator(
        content_filter=pruning,
        options={"ignore_links": True},
    )
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        excluded_tags=["nav", "footer", "aside"],
        remove_overlay_elements=True,
        markdown_generator=generator,
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://en.wikipedia.org/wiki/Apple",
            config=run_config,
        )
        markdown = result.markdown
        print(f"Full Markdown Length: {len(markdown.raw_markdown)}")
        print(f"Fit Markdown Length: {len(markdown.fit_markdown)}")
async def link_analysis():
    """Count internal/external links of a page and print the first five internal ones."""
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.ENABLED,
        exclude_external_links=True,
        exclude_social_media_links=True,
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=run_config,
        )
        internal = result.links["internal"]
        print(f"Found {len(internal)} internal links")
        print(f"Found {len(result.links['external'])} external links")
        for entry in internal[:5]:
            print(f"Href: {entry['href']}\nText: {entry['text']}\n")
# JavaScript Execution Example
async def simple_example_with_running_js_code():
    """Click a "Load More" button via injected JavaScript, then print a markdown preview."""
    print("\n--- Executing JavaScript and Using CSS Selectors ---")
    # Finds the first button whose text contains "Load More" and clicks it if present.
    click_load_more = "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();"
    js_browser = BrowserConfig(headless=True, java_script_enabled=True)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        js_code=click_load_more,
        # wait_for="() => { return Array.from(document.querySelectorAll('article.tease-card')).length > 10; }"
    )
    async with AsyncWebCrawler(config=js_browser) as crawler:
        page = await crawler.arun(
            url="https://www.nbcnews.com/business", config=run_config
        )
        print(page.markdown[:500])
# CSS Selector Example
async def simple_example_with_css_selector():
    """Restrict the crawl to one CSS selector and print a markdown preview."""
    print("\n--- Using CSS Selectors ---")
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        css_selector=".wide-tease-item__description",
    )
    async with AsyncWebCrawler(config=BrowserConfig(headless=True)) as crawler:
        page = await crawler.arun(
            url="https://www.nbcnews.com/business",
            config=run_config,
        )
        print(page.markdown[:500])
async def media_handling():
    """Print URL, alt text and score of the first five images found on a page."""
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        exclude_external_images=True,
        screenshot=True,
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business", config=run_config
        )
        first_images = result.media["images"][:5]
        for picture in first_images:
            print(
                f"Image URL: {picture['src']}, Alt: {picture['alt']}, Score: {picture['score']}"
            )
async def custom_hook_workflow(verbose=True):
    """Register a ``before_goto`` hook, crawl once and print a markdown preview.

    ``verbose`` is accepted but not used by the body.
    """

    def announce_navigation(page, context):
        # Hook body: only logs; returns None exactly like the print call.
        return print("[Hook] Preparing to navigate...")

    async with AsyncWebCrawler() as crawler:
        # Set a 'before_goto' hook to run custom code just before navigation
        crawler.crawler_strategy.set_hook("before_goto", announce_navigation)
        # Perform the crawl operation
        result = await crawler.arun(url="https://crawl4ai.com")
        preview = result.markdown.raw_markdown[:500]
        print(preview.replace("\n", " -- "))
# Proxy Example
async def use_proxy():
    """Crawl through an authenticated proxy; print a markdown preview on success."""
    print("\n--- Using a Proxy ---")
    proxy_settings = {
        "server": "http://proxy.example.com:8080",
        "username": "username",
        "password": "password",
    }
    proxied_browser = BrowserConfig(headless=True, proxy_config=proxy_settings)
    run_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS)
    async with AsyncWebCrawler(config=proxied_browser) as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business", config=run_config
        )
        if not result.success:
            return
        print(result.markdown[:500])
# Screenshot Example
async def capture_and_save_screenshot(url: str, output_path: str):
    """Capture a screenshot of ``url`` and write it to ``output_path``.

    Args:
        url: Page to crawl.
        output_path: Destination file. Its parent directory is created when
            missing, so paths such as ``tmp/example_screenshot.jpg`` work on
            a fresh checkout.

    Prints a success or failure message; nothing is returned.
    """
    import base64

    browser_config = BrowserConfig(headless=True)
    crawler_config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, screenshot=True)
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(url=url, config=crawler_config)
        if result.success and result.screenshot:
            # The screenshot is base64 text; decode it to raw image bytes.
            screenshot_data = base64.b64decode(result.screenshot)
            parent_dir = os.path.dirname(output_path)
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)
            with open(output_path, "wb") as f:
                f.write(screenshot_data)
            print(f"Screenshot saved successfully to {output_path}")
        else:
            print("Failed to capture screenshot")
# LLM Extraction Example
# Schema for one pricing entry; its JSON schema is passed to the LLM extraction
# strategy. Documented with comments on purpose: a class docstring would be
# picked up by the generated JSON schema and change what is sent to the model.
class OpenAIModelFee(BaseModel):
    # Every field is required (`...`) and kept as free-form text.
    model_name: str = Field(..., description="Name of the OpenAI model.")
    input_fee: str = Field(..., description="Fee for input token for the OpenAI model.")
    output_fee: str = Field(
        ..., description="Fee for output token for the OpenAI model."
    )
async def extract_structured_data_using_llm(
    provider: str, api_token: str = None, extra_headers: Dict[str, str] = None
):
    """Extract OpenAI pricing data with an LLM and print the raw result.

    Args:
        provider: Provider/model string such as ``"openai/gpt-4o"`` or
            ``"ollama/llama3.2"``.
        api_token: API token for the provider. May be omitted only for
            Ollama providers; otherwise the example is skipped.
        extra_headers: Optional HTTP headers forwarded to the LLM call via
            ``extra_args["extra_headers"]``.
    """
    print(f"\n--- Extracting Structured Data with {provider} ---")
    # Providers are written as "<backend>/<model>", so compare the backend
    # part; a plain `provider != "ollama"` never matched "ollama/llama3.2".
    is_ollama = provider.split("/", 1)[0] == "ollama"
    if api_token is None and not is_ollama:
        print(f"API token is required for {provider}. Skipping this example.")
        return
    browser_config = BrowserConfig(headless=True)
    extra_args = {"temperature": 0, "top_p": 0.9, "max_tokens": 2000}
    if extra_headers:
        extra_args["extra_headers"] = extra_headers
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        word_count_threshold=1,
        page_timeout=80000,
        extraction_strategy=LLMExtractionStrategy(
            llm_config=LLMConfig(provider=provider, api_token=api_token),
            schema=OpenAIModelFee.model_json_schema(),
            extraction_type="schema",
            instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
Do not miss any models in the entire content.""",
            extra_args=extra_args,
        ),
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://openai.com/api/pricing/", config=crawler_config
        )
        print(result.extracted_content)
# CSS Extraction Example
async def extract_structured_data_using_css_extractor():
    """Extract course data with ``JsonCssExtractionStrategy`` and print a sample.

    JavaScript first clicks through every tab of the target section, then the
    CSS schema is applied. The number of extracted entries and the first
    entry are printed; a failed or empty extraction is reported instead of
    raising.
    """
    print("\n--- Using JsonCssExtractionStrategy for Fast Structured Output ---")
    schema = {
        "name": "KidoCode Courses",
        "baseSelector": "section.charge-methodology .framework-collection-item.w-dyn-item",
        "fields": [
            {
                "name": "section_title",
                "selector": "h3.heading-50",
                "type": "text",
            },
            {
                "name": "section_description",
                "selector": ".charge-content",
                "type": "text",
            },
            {
                "name": "course_name",
                "selector": ".text-block-93",
                "type": "text",
            },
            {
                "name": "course_description",
                "selector": ".course-content-text",
                "type": "text",
            },
            {
                "name": "course_icon",
                "selector": ".image-92",
                "type": "attribute",
                "attribute": "src",
            },
        ],
    }
    browser_config = BrowserConfig(headless=True, java_script_enabled=True)
    # Scrolls to and clicks each tab, pausing 500 ms between clicks.
    js_click_tabs = """
(async () => {
const tabs = document.querySelectorAll("section.charge-methodology .tabs-menu-3 > div");
for(let tab of tabs) {
tab.scrollIntoView();
tab.click();
await new Promise(r => setTimeout(r, 500));
}
})();
"""
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=JsonCssExtractionStrategy(schema),
        js_code=[js_click_tabs],
        delay_before_return_html=1,
    )
    async with AsyncWebCrawler(config=browser_config) as crawler:
        result = await crawler.arun(
            url="https://www.kidocode.com/degrees/technology", config=crawler_config
        )
        # json.loads(None) and companies[0] on an empty list would both raise,
        # so bail out early when nothing usable came back.
        if not (result.success and result.extracted_content):
            print("Failed to extract structured data")
            return
        companies = json.loads(result.extracted_content)
        print(f"Successfully extracted {len(companies)} companies")
        if companies:
            print(json.dumps(companies[0], indent=2))
# Dynamic Content Examples - Method 1
async def crawl_dynamic_content_pages_method_1():
    """Page through GitHub commits in one session, synchronising via a hook.

    A hook registered under ``on_execution_started`` polls the first commit
    heading until its whitespace-stripped text differs from the previously
    remembered one. Three pages are crawled in the same session; pages after
    the first click the pagination button instead of reloading.
    """
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
    first_commit = ""

    async def on_execution_started(page, **kwargs):
        nonlocal first_commit
        heading_selector = "li.Box-sc-g0xbh4-0 h4"
        try:
            while True:
                await page.wait_for_selector(heading_selector)
                heading = await page.query_selector(heading_selector)
                text = await heading.evaluate("(element) => element.textContent")
                text = re.sub(r"\s+", "", text)
                if text and text != first_commit:
                    # New content detected: remember it and stop polling.
                    first_commit = text
                    break
                await asyncio.sleep(0.5)
        except Exception as e:
            print(f"Warning: New content didn't appear after JavaScript execution: {e}")

    js_browser = BrowserConfig(headless=False, java_script_enabled=True)
    async with AsyncWebCrawler(config=js_browser) as crawler:
        crawler.crawler_strategy.set_hook("on_execution_started", on_execution_started)
        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []
        js_next_page = """
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
"""
        for page_index in range(3):
            is_followup = page_index > 0
            page_number = page_index + 1
            run_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                css_selector="li.Box-sc-g0xbh4-0",
                js_code=js_next_page if is_followup else None,
                js_only=is_followup,
                session_id=session_id,
            )
            result = await crawler.arun(url=url, config=run_config)
            assert result.success, f"Failed to crawl page {page_number}"
            parsed = BeautifulSoup(result.cleaned_html, "html.parser")
            commits = parsed.select("li")
            all_commits.extend(commits)
            print(f"Page {page_number}: Found {len(commits)} commits")
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
# Dynamic Content Examples - Method 2
async def crawl_dynamic_content_pages_method_2():
    """Page through GitHub commits, waiting for new content inside the injected JS.

    The script clicks the pagination button and then polls every 100 ms until
    the first commit heading differs from the one seen before the click.
    Commit titles are extracted with a CSS schema across three pages of one
    session.
    """
    print("\n--- Advanced Multi-Page Crawling with JavaScript Execution ---")
    js_browser = BrowserConfig(headless=False, java_script_enabled=True)
    js_next_page_and_wait = """
(async () => {
const getCurrentCommit = () => {
const commits = document.querySelectorAll('li.Box-sc-g0xbh4-0 h4');
return commits.length > 0 ? commits[0].textContent.trim() : null;
};
const initialCommit = getCurrentCommit();
const button = document.querySelector('a[data-testid="pagination-next-button"]');
if (button) button.click();
while (true) {
await new Promise(resolve => setTimeout(resolve, 100));
const newCommit = getCurrentCommit();
if (newCommit && newCommit !== initialCommit) {
break;
}
}
})();
"""
    title_field = {
        "name": "title",
        "selector": "h4.markdown-title",
        "type": "text",
        "transform": "strip",
    }
    schema = {
        "name": "Commit Extractor",
        "baseSelector": "li.Box-sc-g0xbh4-0",
        "fields": [title_field],
    }
    async with AsyncWebCrawler(config=js_browser) as crawler:
        url = "https://github.com/microsoft/TypeScript/commits/main"
        session_id = "typescript_commits_session"
        all_commits = []
        extraction_strategy = JsonCssExtractionStrategy(schema)
        for page_index in range(3):
            is_followup = page_index > 0
            page_number = page_index + 1
            run_config = CrawlerRunConfig(
                cache_mode=CacheMode.BYPASS,
                css_selector="li.Box-sc-g0xbh4-0",
                extraction_strategy=extraction_strategy,
                js_code=js_next_page_and_wait if is_followup else None,
                js_only=is_followup,
                session_id=session_id,
            )
            result = await crawler.arun(url=url, config=run_config)
            assert result.success, f"Failed to crawl page {page_number}"
            commits = json.loads(result.extracted_content)
            all_commits.extend(commits)
            print(f"Page {page_number}: Found {len(commits)} commits")
        print(f"Successfully crawled {len(all_commits)} commits across 3 pages")
async def cosine_similarity_extraction():
    """Extract content with ``CosineStrategy`` and print the first five items."""
    from crawl4ai.extraction_strategy import CosineStrategy

    strategy = CosineStrategy(
        word_count_threshold=10,
        max_dist=0.2,  # Maximum distance between two words
        linkage_method="ward",  # Linkage method for hierarchical clustering (ward, complete, average, single)
        top_k=3,  # Number of top keywords to extract
        sim_threshold=0.3,  # Similarity threshold for clustering
        semantic_filter="McDonald's economic impact, American consumer trends",  # Keywords to filter the content semantically using embeddings
        verbose=True,
    )
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        extraction_strategy=strategy,
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url="https://www.nbcnews.com/business/consumer/how-mcdonalds-e-coli-crisis-inflation-politics-reflect-american-story-rcna177156",
            config=run_config,
        )
        extracted = json.loads(result.extracted_content)
        print(extracted[:5])
# Browser Comparison
async def crawl_custom_browser_type():
    """Time the same crawl on Firefox, WebKit and Chromium, in that order.

    For each engine the elapsed wall-clock time (including browser start-up)
    and the first 500 markdown characters are printed.
    """
    print("\n--- Browser Comparison ---")
    engines = (
        ("Firefox:", "firefox"),
        ("WebKit:", "webkit"),
        ("Chromium:", "chromium"),  # Chromium is the default engine
    )
    for label, engine in engines:
        engine_config = BrowserConfig(browser_type=engine, headless=True)
        started = time.time()
        async with AsyncWebCrawler(config=engine_config) as crawler:
            result = await crawler.arun(
                url="https://www.example.com",
                config=CrawlerRunConfig(cache_mode=CacheMode.BYPASS),
            )
            print(label, time.time() - started)
            print(result.markdown[:500])
# Anti-Bot and User Simulation
async def crawl_with_user_simulation():
    """Crawl with a random mobile user agent and the user-simulation flags enabled.

    The URL is a placeholder and must be replaced before running.
    """
    mobile_agent = {"device_type": "mobile", "os_type": "android"}
    stealth_browser = BrowserConfig(
        headless=True,
        user_agent_mode="random",
        user_agent_generator_config=mobile_agent,
    )
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        magic=True,
        simulate_user=True,
        override_navigator=True,
    )
    async with AsyncWebCrawler(config=stealth_browser) as crawler:
        result = await crawler.arun(url="YOUR-URL-HERE", config=run_config)
        print(result.markdown)
async def ssl_certification():
    """Fetch a site's SSL certificate, print key fields and export it.

    The certificate is exported as JSON, PEM and DER into the ``tmp``
    directory under ``__location__``; nothing happens when the crawl fails or
    returns no certificate.
    """
    # Bypass the cache so a fresh certificate is fetched every time.
    run_config = CrawlerRunConfig(
        fetch_ssl_certificate=True,
        cache_mode=CacheMode.BYPASS,
    )
    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(url="https://example.com", config=run_config)
        if not (result.success and result.ssl_certificate):
            return
        cert = result.ssl_certificate
        tmp_dir = os.path.join(__location__, "tmp")
        os.makedirs(tmp_dir, exist_ok=True)

        # 1. Access certificate properties directly
        print("\nCertificate Information:")
        print(f"Issuer: {cert.issuer.get('CN', '')}")
        print(f"Valid until: {cert.valid_until}")
        print(f"Fingerprint: {cert.fingerprint}")

        # 2. Export certificate in different formats
        json_path = os.path.join(tmp_dir, "certificate.json")
        pem_path = os.path.join(tmp_dir, "certificate.pem")
        der_path = os.path.join(tmp_dir, "certificate.der")

        cert.to_json(json_path)  # For analysis
        print("\nCertificate exported to:")
        print(f"- JSON: {json_path}")
        cert.to_pem(pem_path)  # For web servers
        print(f"- PEM: {pem_path}")
        cert.to_der(der_path)  # For Java apps
        print(f"- DER: {der_path}")
# Main execution
async def main():
    """Run the quickstart examples sequentially, from basic to advanced."""
    # Basic examples
    for basic_example in (
        simple_crawl,
        simple_example_with_running_js_code,
        simple_example_with_css_selector,
    ):
        await basic_example()

    # Advanced examples
    await extract_structured_data_using_css_extractor()
    openai_token = os.getenv("OPENAI_API_KEY")
    await extract_structured_data_using_llm("openai/gpt-4o", openai_token)
    await crawl_dynamic_content_pages_method_1()
    await crawl_dynamic_content_pages_method_2()

    # Browser comparisons
    await crawl_custom_browser_type()

    # Screenshot example
    screenshot_target = os.path.join(__location__, "tmp/example_screenshot.jpg")
    await capture_and_save_screenshot("https://www.example.com", screenshot_target)


if __name__ == "__main__":
    asyncio.run(main())

View File

@@ -1,405 +0,0 @@
import os
import time
from crawl4ai import LLMConfig
from crawl4ai.web_crawler import WebCrawler
from crawl4ai.chunking_strategy import *
from crawl4ai.extraction_strategy import *
from crawl4ai.crawler_strategy import *
from rich import print
from rich.console import Console
from functools import lru_cache
# Shared rich console used by the printing helpers in this script.
console = Console()
@lru_cache()
def create_crawler():
    """Create a verbose ``WebCrawler``, warm it up and return it.

    Wrapped in ``lru_cache``, so repeated calls return the same instance.
    """
    instance = WebCrawler(verbose=True)
    instance.warmup()
    return instance
def print_result(result):
    """Print a short preview of every non-empty string field of ``result``.

    Each such field is shown on its own line, truncated to its first 20
    characters and followed by three dots. When extracted content is present,
    the number of extracted blocks is printed as well.
    """
    console.print("\t[bold]Result:[/bold]")
    dumped = result.model_dump()
    for key in dumped:
        value = dumped[key]
        if not (isinstance(value, str) and value):
            continue
        console.print(f"\t{key}: [green]{value[:20]}...[/green]")
    if not result.extracted_content:
        return
    items = json.loads(result.extracted_content)
    print(f"\t[bold]{len(items)} blocks is extracted![/bold]")
def cprint(message, press_any_key=False):
    """Print ``message`` on the shared console, optionally pausing for input.

    Args:
        message: Text (rich markup allowed) passed to ``console.print``.
        press_any_key: When true, print a prompt and block on ``input()``.
    """
    console.print(message)
    if not press_any_key:
        return
    console.print("Press any key to continue...", style="")
    input()
def basic_usage(crawler):
    """Crawl the NBC business page with ``only_text=True`` and preview the result."""
    banner = "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
    cprint(banner)
    crawl_result = crawler.run(url="https://www.nbcnews.com/business", only_text=True)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(crawl_result)
def basic_usage_some_params(crawler):
    """Same crawl as the basic demo, additionally passing ``word_count_threshold=1``."""
    banner = "🛠️ [bold cyan]Basic Usage: Simply provide a URL and let Crawl4ai do the magic![/bold cyan]"
    cprint(banner)
    run_options = {
        "url": "https://www.nbcnews.com/business",
        "word_count_threshold": 1,
        "only_text": True,
    }
    crawl_result = crawler.run(**run_options)
    cprint("[LOG] 📦 [bold yellow]Basic crawl result:[/bold yellow]")
    print_result(crawl_result)
def screenshot_usage(crawler):
    """Crawl with ``screenshot=True`` and save the image to ``screenshot.png``.

    ``result.screenshot`` is treated as base64 text and decoded before the
    output file is opened, so a decoding failure does not leave an empty
    ``screenshot.png`` behind. When no screenshot data came back, saving is
    skipped and only the result preview is printed.

    Args:
        crawler: Crawler object exposing ``run(url=..., screenshot=True)``.
    """
    # Imported explicitly: the file's visible import block has no
    # ``import base64``, so the name could otherwise only arrive as a side
    # effect of one of the wildcard imports.
    import base64

    cprint("\n📸 [bold cyan]Let's take a screenshot of the page![/bold cyan]")
    result = crawler.run(url="https://www.nbcnews.com/business", screenshot=True)
    cprint("[LOG] 📦 [bold yellow]Screenshot result:[/bold yellow]")
    if result.screenshot:
        # Decode first; only touch the filesystem once the bytes are ready.
        image_bytes = base64.b64decode(result.screenshot)
        with open("screenshot.png", "wb") as f:
            f.write(image_bytes)
        cprint("Screenshot saved to 'screenshot.png'!")
    else:
        cprint("No screenshot data was returned; nothing was saved.")
    print_result(result)
def understanding_parameters(crawler):
    """Time two crawls of the same URL: a default run, then one with ``bypass_cache=True``."""
    nbc_url = "https://www.nbcnews.com/business"
    cprint(
        "\n🧠 [bold cyan]Understanding 'bypass_cache' and 'include_raw_html' parameters:[/bold cyan]"
    )
    cprint(
        "By default, Crawl4ai caches the results of your crawls. This means that subsequent crawls of the same URL will be much faster! Let's see this in action."
    )

    # First crawl: plain run with no cache-related arguments.
    cprint("1⃣ First crawl (caches the result):", True)
    began = time.time()
    result = crawler.run(url=nbc_url)
    elapsed = time.time() - began
    cprint(
        f"[LOG] 📦 [bold yellow]First crawl took {elapsed} seconds and result (from cache):[/bold yellow]"
    )
    print_result(result)

    # Second crawl: explicitly ask the crawler to bypass its cache.
    cprint("2⃣ Second crawl (Force to crawl again):", True)
    began = time.time()
    result = crawler.run(url=nbc_url, bypass_cache=True)
    elapsed = time.time() - began
    cprint(
        f"[LOG] 📦 [bold yellow]Second crawl took {elapsed} seconds and result (forced to crawl):[/bold yellow]"
    )
    print_result(result)
def add_chunking_strategy(crawler):
    """Crawl the same page twice: with ``RegexChunking``, then ``NlpSentenceChunking``."""
    nbc_url = "https://www.nbcnews.com/business"

    # Pass 1: RegexChunking with a blank-line pattern.
    cprint(
        "\n🧩 [bold cyan]Let's add a chunking strategy: RegexChunking![/bold cyan]",
        True,
    )
    cprint(
        "RegexChunking is a simple chunking strategy that splits the text based on a given regex pattern. Let's see it in action!"
    )
    regex_chunker = RegexChunking(patterns=["\n\n"])
    regex_result = crawler.run(url=nbc_url, chunking_strategy=regex_chunker)
    cprint("[LOG] 📦 [bold yellow]RegexChunking result:[/bold yellow]")
    print_result(regex_result)

    # Pass 2: NlpSentenceChunking with its default construction.
    cprint(
        "\n🔍 [bold cyan]Time to explore another chunking strategy: NlpSentenceChunking![/bold cyan]",
        True,
    )
    cprint(
        "NlpSentenceChunking uses NLP techniques to split the text into sentences. Let's see how it performs!"
    )
    sentence_chunker = NlpSentenceChunking()
    sentence_result = crawler.run(url=nbc_url, chunking_strategy=sentence_chunker)
    cprint("[LOG] 📦 [bold yellow]NlpSentenceChunking result:[/bold yellow]")
    print_result(sentence_result)
def add_extraction_strategy(crawler):
    """Run two ``CosineStrategy`` extractions: explicit parameters, then a semantic filter."""
    nbc_url = "https://www.nbcnews.com/business"

    # Pass 1: CosineStrategy with explicit clustering parameters.
    cprint(
        "\n🧠 [bold cyan]Let's get smarter with an extraction strategy: CosineStrategy![/bold cyan]",
        True,
    )
    cprint(
        "CosineStrategy uses cosine similarity to extract semantically similar blocks of text. Let's see it in action!"
    )
    tuned_strategy = CosineStrategy(
        word_count_threshold=10,
        max_dist=0.2,
        linkage_method="ward",
        top_k=3,
        sim_threshold=0.3,
        verbose=True,
    )
    tuned_result = crawler.run(url=nbc_url, extraction_strategy=tuned_strategy)
    cprint("[LOG] 📦 [bold yellow]CosineStrategy result:[/bold yellow]")
    print_result(tuned_result)

    # Pass 2: CosineStrategy given only a semantic_filter.
    cprint(
        "You can pass other parameters like 'semantic_filter' to the CosineStrategy to extract semantically similar blocks of text. Let's see it in action!"
    )
    filtered_strategy = CosineStrategy(semantic_filter="inflation rent prices")
    filtered_result = crawler.run(url=nbc_url, extraction_strategy=filtered_strategy)
    cprint(
        "[LOG] 📦 [bold yellow]CosineStrategy result with semantic filter:[/bold yellow]"
    )
    print_result(filtered_result)
def add_llm_extraction_strategy(crawler):
    """Run three ``LLMExtractionStrategy`` crawls: no instruction, finance, technology."""
    nbc_url = "https://www.nbcnews.com/business"

    def openai_config():
        # A fresh LLMConfig per crawl; the API token is read from the
        # environment each time one is built.
        return LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))

    # Crawl 1: LLM extraction without any instruction.
    cprint(
        "\n🤖 [bold cyan]Time to bring in the big guns: LLMExtractionStrategy without instructions![/bold cyan]",
        True,
    )
    cprint(
        "LLMExtractionStrategy uses a large language model to extract relevant information from the web page. Let's see it in action!"
    )
    plain_strategy = LLMExtractionStrategy(llm_config=openai_config())
    result = crawler.run(url=nbc_url, extraction_strategy=plain_strategy)
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (no instructions) result:[/bold yellow]"
    )
    print_result(result)

    # Crawl 2: LLM extraction steered towards financial news.
    cprint(
        "\n📜 [bold cyan]Let's make it even more interesting: LLMExtractionStrategy with instructions![/bold cyan]",
        True,
    )
    cprint(
        "Let's say we are only interested in financial news. Let's see how LLMExtractionStrategy performs with instructions!"
    )
    finance_strategy = LLMExtractionStrategy(
        llm_config=openai_config(),
        instruction="I am interested in only financial news",
    )
    result = crawler.run(url=nbc_url, extraction_strategy=finance_strategy)
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with instructions) result:[/bold yellow]"
    )
    print_result(result)

    # Crawl 3: LLM extraction steered towards technology content.
    technology_strategy = LLMExtractionStrategy(
        llm_config=openai_config(),
        instruction="Extract only content related to technology",
    )
    result = crawler.run(url=nbc_url, extraction_strategy=technology_strategy)
    cprint(
        "[LOG] 📦 [bold yellow]LLMExtractionStrategy (with technology instruction) result:[/bold yellow]"
    )
    print_result(result)
def targeted_extraction(crawler):
    """Crawl the NBC business page with ``css_selector="h2"`` and preview the result."""
    announcement = "\n🎯 [bold cyan]Targeted extraction: Let's use a CSS selector to extract only H2 tags![/bold cyan]"
    cprint(announcement, True)
    h2_result = crawler.run(url="https://www.nbcnews.com/business", css_selector="h2")
    cprint("[LOG] 📦 [bold yellow]CSS Selector (H2 tags) result:[/bold yellow]")
    print_result(h2_result)
def interactive_extraction(crawler):
    """Crawl while passing one JavaScript snippet that clicks a 'Load More' button."""
    cprint(
        "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]",
        True,
    )
    cprint(
        "In this example we try to click the 'Load More' button on the page using JavaScript code."
    )
    # The snippet finds the first <button> whose text contains 'Load More'
    # and clicks it only if one was found.
    js_code = (
        "\n"
        "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));\n"
        "loadMoreButton && loadMoreButton.click();\n"
    )
    clicked_result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
    cprint(
        "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
    )
    print_result(clicked_result)
def multiple_scrip(crawler):
    """Crawl while passing a list holding the same 'Load More' snippet twice."""
    cprint(
        "\n🖱️ [bold cyan]Let's get interactive: Passing JavaScript code to click 'Load More' button![/bold cyan]",
        True,
    )
    cprint(
        "In this example we try to click the 'Load More' button on the page using JavaScript code."
    )
    load_more_snippet = (
        "\n"
        "const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More'));\n"
        "loadMoreButton && loadMoreButton.click();\n"
    )
    # Two entries, both the same snippet.
    js_code = [load_more_snippet] * 2
    clicked_result = crawler.run(url="https://www.nbcnews.com/business", js=js_code)
    cprint(
        "[LOG] 📦 [bold yellow]JavaScript Code (Load More button) result:[/bold yellow]"
    )
    print_result(clicked_result)
def using_crawler_hooks(crawler):
    """Register four hooks on a Selenium strategy and crawl https://example.com.

    The incoming ``crawler`` argument is not used: the name is rebound to a
    new ``WebCrawler`` built around the hooked strategy.
    """

    def on_driver_created(driver):
        """Maximize the window, log in to a hypothetical site, add a cookie."""
        print("[HOOK] on_driver_created")
        driver.maximize_window()
        driver.get("https://example.com/login")

        from selenium.webdriver.support.ui import WebDriverWait
        from selenium.webdriver.common.by import By
        from selenium.webdriver.support import expected_conditions as EC

        def type_into(field_name, text):
            # Locate a form field by its NAME attribute and type into it.
            driver.find_element(By.NAME, field_name).send_keys(text)

        # Wait until the login form is present before filling it in.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.NAME, "username"))
        )
        type_into("username", "testuser")
        type_into("password", "password123")
        driver.find_element(By.NAME, "login").click()
        # Wait for the element with id "welcome" to appear after submitting.
        WebDriverWait(driver, 10).until(
            EC.presence_of_element_located((By.ID, "welcome"))
        )
        driver.add_cookie({"name": "test_cookie", "value": "cookie_value"})
        return driver

    def before_get_url(driver):
        """Enable the CDP Network domain and attach a custom request header."""
        print("[HOOK] before_get_url")
        driver.execute_cdp_cmd("Network.enable", {})
        extra_headers = {"headers": {"X-Test-Header": "test"}}
        driver.execute_cdp_cmd("Network.setExtraHTTPHeaders", extra_headers)
        return driver

    def after_get_url(driver):
        """Print the URL the driver currently points at."""
        print("[HOOK] after_get_url")
        print(driver.current_url)
        return driver

    def before_return_html(driver, html):
        """Print the length of the HTML about to be returned."""
        print("[HOOK] before_return_html")
        print(len(html))
        return driver

    cprint(
        "\n🔗 [bold cyan]Using Crawler Hooks: Let's see how we can customize the crawler using hooks![/bold cyan]",
        True,
    )

    crawler_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
    # Registration order matches the order the hooks are defined above.
    hooks_by_name = {
        "on_driver_created": on_driver_created,
        "before_get_url": before_get_url,
        "after_get_url": after_get_url,
        "before_return_html": before_return_html,
    }
    for hook_name, hook in hooks_by_name.items():
        crawler_strategy.set_hook(hook_name, hook)

    crawler = WebCrawler(verbose=True, crawler_strategy=crawler_strategy)
    crawler.warmup()
    result = crawler.run(url="https://example.com")
    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result=result)
def using_crawler_hooks_dleay_example(crawler):
    """Crawl https://google.com with an ``after_get_url`` hook that sleeps 5 seconds.

    The incoming ``crawler`` argument is not used: the name is rebound to a
    new ``WebCrawler`` built around the hooked strategy.
    """

    def delay(driver):
        # Blocks for five seconds; returns nothing.
        print("Delaying for 5 seconds...")
        time.sleep(5)
        print("Resuming...")

    cprint(
        "\n🔗 [bold cyan]Using Crawler Hooks: Let's add a delay after fetching the url to make sure entire page is fetched.[/bold cyan]"
    )

    # Build and warm up a dedicated crawler whose strategy carries the hook.
    delayed_strategy = LocalSeleniumCrawlerStrategy(verbose=True)
    delayed_strategy.set_hook("after_get_url", delay)
    crawler = WebCrawler(verbose=True, crawler_strategy=delayed_strategy)
    crawler.warmup()

    result = crawler.run(url="https://google.com", bypass_cache=True)
    cprint("[LOG] 📦 [bold yellow]Crawler Hooks result:[/bold yellow]")
    print_result(result)
def main():
    """Walk through the synchronous quickstart demos with one shared crawler."""
    intro_lines = (
        "🌟 [bold green]Welcome to the Crawl4ai Quickstart Guide! Let's dive into some web crawling fun! 🌐[/bold green]",
        "⛳️ [bold cyan]First Step: Create an instance of WebCrawler and call the `warmup()` function.[/bold cyan]",
        "If this is the first time you're running Crawl4ai, this might take a few seconds to load required model files.",
    )
    for intro_line in intro_lines:
        cprint(intro_line)

    crawler = create_crawler()

    crawler.always_by_pass_cache = True
    basic_usage(crawler)
    # basic_usage_some_params is deliberately not part of the run.
    understanding_parameters(crawler)

    # The flag is assigned again before the remaining demos.
    crawler.always_by_pass_cache = True
    remaining_demos = (
        screenshot_usage,
        add_chunking_strategy,
        add_extraction_strategy,
        add_llm_extraction_strategy,
        targeted_extraction,
        interactive_extraction,
        multiple_scrip,
    )
    for run_demo in remaining_demos:
        run_demo(crawler)

    cprint(
        "\n🎉 [bold green]Congratulations! You've made it through the Crawl4ai Quickstart Guide! Now go forth and crawl the web like a pro! 🕸️[/bold green]"
    )


if __name__ == "__main__":
    main()

View File

@@ -1,735 +0,0 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "6yLvrXn7yZQI"
},
"source": [
"# Crawl4AI: Advanced Web Crawling and Data Extraction\n",
"\n",
"Welcome to this interactive notebook showcasing Crawl4AI, an advanced asynchronous web crawling and data extraction library.\n",
"\n",
"- GitHub Repository: [https://github.com/unclecode/crawl4ai](https://github.com/unclecode/crawl4ai)\n",
"- Twitter: [@unclecode](https://twitter.com/unclecode)\n",
"- Website: [https://crawl4ai.com](https://crawl4ai.com)\n",
"\n",
"Let's explore the powerful features of Crawl4AI!"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "KIn_9nxFyZQK"
},
"source": [
"## Installation\n",
"\n",
"First, let's install Crawl4AI from GitHub:"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "mSnaxLf3zMog"
},
"outputs": [],
"source": [
"!sudo apt-get update && sudo apt-get install -y libwoff1 libopus0 libwebp6 libwebpdemux2 libenchant1c2a libgudev-1.0-0 libsecret-1-0 libhyphen0 libgdk-pixbuf2.0-0 libegl1 libnotify4 libxslt1.1 libevent-2.1-7 libgles2 libvpx6 libxcomposite1 libatk1.0-0 libatk-bridge2.0-0 libepoxy0 libgtk-3-0 libharfbuzz-icu0"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "xlXqaRtayZQK"
},
"outputs": [],
"source": [
"!pip install crawl4ai\n",
"!pip install nest-asyncio\n",
"!playwright install"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "qKCE7TI7yZQL"
},
"source": [
"Now, let's import the necessary libraries:"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"id": "I67tr7aAyZQL"
},
"outputs": [],
"source": [
"import asyncio\n",
"import nest_asyncio\n",
"from crawl4ai import AsyncWebCrawler\n",
"from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, LLMExtractionStrategy\n",
"import json\n",
"import time\n",
"from pydantic import BaseModel, Field\n",
"\n",
"nest_asyncio.apply()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "h7yR_Rt_yZQM"
},
"source": [
"## Basic Usage\n",
"\n",
"Let's start with a simple crawl example:"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "yBh6hf4WyZQM",
"outputId": "0f83af5c-abba-4175-ed95-70b7512e6bcc"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
"[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
"[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.05 seconds\n",
"[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.05 seconds.\n",
"18102\n"
]
}
],
"source": [
"async def simple_crawl():\n",
" async with AsyncWebCrawler(verbose=True) as crawler:\n",
" result = await crawler.arun(url=\"https://www.nbcnews.com/business\")\n",
" print(len(result.markdown))\n",
"await simple_crawl()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "9rtkgHI28uI4"
},
"source": [
"💡 By default, **Crawl4AI** caches the result of every URL, so the next time you call it, you'll get an instant result. But if you want to bypass the cache, just set `bypass_cache=True`."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "MzZ0zlJ9yZQM"
},
"source": [
"## Advanced Features\n",
"\n",
"### Executing JavaScript and Using CSS Selectors"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "gHStF86xyZQM",
"outputId": "34d0fb6d-4dec-4677-f76e-85a1f082829b"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
"[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
"[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n",
"[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n",
"[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 6.06 seconds\n",
"[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.10 seconds\n",
"[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n",
"[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.11 seconds.\n",
"41135\n"
]
}
],
"source": [
"async def js_and_css():\n",
" async with AsyncWebCrawler(verbose=True) as crawler:\n",
" js_code = [\"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"]\n",
" result = await crawler.arun(\n",
" url=\"https://www.nbcnews.com/business\",\n",
" js_code=js_code,\n",
" # css_selector=\"YOUR_CSS_SELECTOR_HERE\",\n",
" bypass_cache=True\n",
" )\n",
" print(len(result.markdown))\n",
"\n",
"await js_and_css()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "cqE_W4coyZQM"
},
"source": [
"### Using a Proxy\n",
"\n",
"Note: You'll need to replace the proxy URL with a working proxy for this example to run successfully."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "QjAyiAGqyZQM"
},
"outputs": [],
"source": [
"async def use_proxy():\n",
" async with AsyncWebCrawler(verbose=True, proxy=\"http://your-proxy-url:port\") as crawler:\n",
" result = await crawler.arun(\n",
" url=\"https://www.nbcnews.com/business\",\n",
" bypass_cache=True\n",
" )\n",
" print(result.markdown[:500]) # Print first 500 characters\n",
"\n",
"# Uncomment the following line to run the proxy example\n",
"# await use_proxy()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "XTZ88lbayZQN"
},
"source": [
"### Extracting Structured Data with OpenAI\n",
"\n",
"Note: You'll need to set your OpenAI API key as an environment variable for this example to work."
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "fIOlDayYyZQN",
"outputId": "cb8359cc-dee0-4762-9698-5dfdcee055b8"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
"[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
"[LOG] 🕸️ Crawling https://openai.com/api/pricing/ using AsyncPlaywrightCrawlerStrategy...\n",
"[LOG] ✅ Crawled https://openai.com/api/pricing/ successfully!\n",
"[LOG] 🚀 Crawling done for https://openai.com/api/pricing/, success: True, time taken: 3.77 seconds\n",
"[LOG] 🚀 Content extracted for https://openai.com/api/pricing/, success: True, time taken: 0.21 seconds\n",
"[LOG] 🔥 Extracting semantic blocks for https://openai.com/api/pricing/, Strategy: AsyncWebCrawler\n",
"[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 0\n",
"[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 1\n",
"[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 2\n",
"[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 3\n",
"[LOG] Extracted 4 blocks from URL: https://openai.com/api/pricing/ block index: 3\n",
"[LOG] Call LLM for https://openai.com/api/pricing/ - block index: 4\n",
"[LOG] Extracted 5 blocks from URL: https://openai.com/api/pricing/ block index: 0\n",
"[LOG] Extracted 1 blocks from URL: https://openai.com/api/pricing/ block index: 4\n",
"[LOG] Extracted 8 blocks from URL: https://openai.com/api/pricing/ block index: 1\n",
"[LOG] Extracted 12 blocks from URL: https://openai.com/api/pricing/ block index: 2\n",
"[LOG] 🚀 Extraction done for https://openai.com/api/pricing/, time taken: 8.55 seconds.\n",
"5029\n"
]
}
],
"source": [
"import os\n",
"from google.colab import userdata\n",
"os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n",
"\n",
"class OpenAIModelFee(BaseModel):\n",
" model_name: str = Field(..., description=\"Name of the OpenAI model.\")\n",
" input_fee: str = Field(..., description=\"Fee for input token for the OpenAI model.\")\n",
" output_fee: str = Field(..., description=\"Fee for output token for the OpenAI model.\")\n",
"\n",
"async def extract_openai_fees():\n",
" async with AsyncWebCrawler(verbose=True) as crawler:\n",
" result = await crawler.arun(\n",
" url='https://openai.com/api/pricing/',\n",
" word_count_threshold=1,\n",
" extraction_strategy=LLMExtractionStrategy(\n",
" provider=\"openai/gpt-4o\", api_token=os.getenv('OPENAI_API_KEY'),\n",
" schema=OpenAIModelFee.schema(),\n",
" extraction_type=\"schema\",\n",
" instruction=\"\"\"From the crawled content, extract all mentioned model names along with their fees for input and output tokens.\n",
" Do not miss any models in the entire content. One extracted model JSON format should look like this:\n",
" {\"model_name\": \"GPT-4\", \"input_fee\": \"US$10.00 / 1M tokens\", \"output_fee\": \"US$30.00 / 1M tokens\"}.\"\"\"\n",
" ),\n",
" bypass_cache=True,\n",
" )\n",
" print(len(result.extracted_content))\n",
"\n",
"# Run the OpenAI extraction example (requires the OPENAI_API_KEY set above)\n",
"await extract_openai_fees()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "BypA5YxEyZQN"
},
"source": [
"### Advanced Multi-Page Crawling with JavaScript Execution"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "tfkcVQ0b7mw-"
},
"source": [
"## Advanced Multi-Page Crawling with JavaScript Execution\n",
"\n",
"This example demonstrates Crawl4AI's ability to handle complex crawling scenarios, specifically extracting commits from multiple pages of a GitHub repository. The challenge here is that clicking the \"Next\" button doesn't load a new page, but instead uses asynchronous JavaScript to update the content. This is a common hurdle in modern web crawling.\n",
"\n",
"To overcome this, we use Crawl4AI's custom JavaScript execution to simulate clicking the \"Next\" button, and implement a custom hook to detect when new data has loaded. Our strategy involves comparing the first commit's text before and after \"clicking\" Next, waiting until it changes to confirm new data has rendered. This showcases Crawl4AI's flexibility in handling dynamic content and its ability to implement custom logic for even the most challenging crawling tasks."
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "qUBKGpn3yZQN",
"outputId": "3e555b6a-ed33-42f4-cce9-499a923fbe17"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
"[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
"[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n",
"[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n",
"[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 5.16 seconds\n",
"[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.28 seconds\n",
"[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n",
"[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.28 seconds.\n",
"Page 1: Found 35 commits\n",
"[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n",
"[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n",
"[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.78 seconds\n",
"[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.90 seconds\n",
"[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n",
"[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.90 seconds.\n",
"Page 2: Found 35 commits\n",
"[LOG] 🕸️ Crawling https://github.com/microsoft/TypeScript/commits/main using AsyncPlaywrightCrawlerStrategy...\n",
"[LOG] ✅ Crawled https://github.com/microsoft/TypeScript/commits/main successfully!\n",
"[LOG] 🚀 Crawling done for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 2.00 seconds\n",
"[LOG] 🚀 Content extracted for https://github.com/microsoft/TypeScript/commits/main, success: True, time taken: 0.74 seconds\n",
"[LOG] 🔥 Extracting semantic blocks for https://github.com/microsoft/TypeScript/commits/main, Strategy: AsyncWebCrawler\n",
"[LOG] 🚀 Extraction done for https://github.com/microsoft/TypeScript/commits/main, time taken: 0.75 seconds.\n",
"Page 3: Found 35 commits\n",
"Successfully crawled 105 commits across 3 pages\n"
]
}
],
"source": [
"import re\n",
"from bs4 import BeautifulSoup\n",
"\n",
"async def crawl_typescript_commits():\n",
" first_commit = \"\"\n",
" async def on_execution_started(page):\n",
" nonlocal first_commit\n",
" try:\n",
" while True:\n",
" await page.wait_for_selector('li.Box-sc-g0xbh4-0 h4')\n",
" commit = await page.query_selector('li.Box-sc-g0xbh4-0 h4')\n",
" commit = await commit.evaluate('(element) => element.textContent')\n",
" commit = re.sub(r'\\s+', '', commit)\n",
" if commit and commit != first_commit:\n",
" first_commit = commit\n",
" break\n",
" await asyncio.sleep(0.5)\n",
" except Exception as e:\n",
" print(f\"Warning: New content didn't appear after JavaScript execution: {e}\")\n",
"\n",
" async with AsyncWebCrawler(verbose=True) as crawler:\n",
" crawler.crawler_strategy.set_hook('on_execution_started', on_execution_started)\n",
"\n",
" url = \"https://github.com/microsoft/TypeScript/commits/main\"\n",
" session_id = \"typescript_commits_session\"\n",
" all_commits = []\n",
"\n",
" js_next_page = \"\"\"\n",
" const button = document.querySelector('a[data-testid=\"pagination-next-button\"]');\n",
" if (button) button.click();\n",
" \"\"\"\n",
"\n",
" for page in range(3): # Crawl 3 pages\n",
" result = await crawler.arun(\n",
" url=url,\n",
" session_id=session_id,\n",
" css_selector=\"li.Box-sc-g0xbh4-0\",\n",
" js=js_next_page if page > 0 else None,\n",
" bypass_cache=True,\n",
" js_only=page > 0\n",
" )\n",
"\n",
" assert result.success, f\"Failed to crawl page {page + 1}\"\n",
"\n",
" soup = BeautifulSoup(result.cleaned_html, 'html.parser')\n",
" commits = soup.select(\"li\")\n",
" all_commits.extend(commits)\n",
"\n",
" print(f\"Page {page + 1}: Found {len(commits)} commits\")\n",
"\n",
" await crawler.crawler_strategy.kill_session(session_id)\n",
" print(f\"Successfully crawled {len(all_commits)} commits across 3 pages\")\n",
"\n",
"await crawl_typescript_commits()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "EJRnYsp6yZQN"
},
"source": [
"### Using JsonCssExtractionStrategy for Fast Structured Output"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "1ZMqIzB_8SYp"
},
"source": [
"The JsonCssExtractionStrategy is a powerful feature of Crawl4AI that allows for precise, structured data extraction from web pages. Here's how it works:\n",
"\n",
"1. You define a schema that describes the pattern of data you're interested in extracting.\n",
"2. The schema includes a base selector that identifies repeating elements on the page.\n",
"3. Within the schema, you define fields, each with its own selector and type.\n",
"4. These field selectors are applied within the context of each base selector element.\n",
"5. The strategy supports nested structures, lists within lists, and various data types.\n",
"6. You can even include computed fields for more complex data manipulation.\n",
"\n",
"This approach allows for highly flexible and precise data extraction, transforming semi-structured web content into clean, structured JSON data. It's particularly useful for extracting consistent data patterns from pages like product listings, news articles, or search results.\n",
"\n",
"For more details and advanced usage, check out the full documentation on the Crawl4AI website."
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "trCMR2T9yZQN",
"outputId": "718d36f4-cccf-40f4-8d8c-c3ba73524d16"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[LOG] 🌤️ Warming up the AsyncWebCrawler\n",
"[LOG] 🌞 AsyncWebCrawler is ready to crawl\n",
"[LOG] 🕸️ Crawling https://www.nbcnews.com/business using AsyncPlaywrightCrawlerStrategy...\n",
"[LOG] ✅ Crawled https://www.nbcnews.com/business successfully!\n",
"[LOG] 🚀 Crawling done for https://www.nbcnews.com/business, success: True, time taken: 7.00 seconds\n",
"[LOG] 🚀 Content extracted for https://www.nbcnews.com/business, success: True, time taken: 0.32 seconds\n",
"[LOG] 🔥 Extracting semantic blocks for https://www.nbcnews.com/business, Strategy: AsyncWebCrawler\n",
"[LOG] 🚀 Extraction done for https://www.nbcnews.com/business, time taken: 0.48 seconds.\n",
"Successfully extracted 11 news teasers\n",
"{\n",
" \"category\": \"Business News\",\n",
" \"headline\": \"NBC ripped up its Olympics playbook for 2024 \\u2014 so far, the new strategy paid off\",\n",
" \"summary\": \"The Olympics have long been key to NBCUniversal. Paris marked the 18th Olympic Games broadcast by NBC in the U.S.\",\n",
" \"time\": \"13h ago\",\n",
" \"image\": {\n",
" \"src\": \"https://media-cldnry.s-nbcnews.com/image/upload/t_focal-200x100,f_auto,q_auto:best/rockcms/2024-09/240903-nbc-olympics-ch-1344-c7a486.jpg\",\n",
" \"alt\": \"Mike Tirico.\"\n",
" },\n",
" \"link\": \"https://www.nbcnews.com/business\"\n",
"}\n"
]
}
],
"source": [
"async def extract_news_teasers():\n",
" schema = {\n",
" \"name\": \"News Teaser Extractor\",\n",
" \"baseSelector\": \".wide-tease-item__wrapper\",\n",
" \"fields\": [\n",
" {\n",
" \"name\": \"category\",\n",
" \"selector\": \".unibrow span[data-testid='unibrow-text']\",\n",
" \"type\": \"text\",\n",
" },\n",
" {\n",
" \"name\": \"headline\",\n",
" \"selector\": \".wide-tease-item__headline\",\n",
" \"type\": \"text\",\n",
" },\n",
" {\n",
" \"name\": \"summary\",\n",
" \"selector\": \".wide-tease-item__description\",\n",
" \"type\": \"text\",\n",
" },\n",
" {\n",
" \"name\": \"time\",\n",
" \"selector\": \"[data-testid='wide-tease-date']\",\n",
" \"type\": \"text\",\n",
" },\n",
" {\n",
" \"name\": \"image\",\n",
" \"type\": \"nested\",\n",
" \"selector\": \"picture.teasePicture img\",\n",
" \"fields\": [\n",
" {\"name\": \"src\", \"type\": \"attribute\", \"attribute\": \"src\"},\n",
" {\"name\": \"alt\", \"type\": \"attribute\", \"attribute\": \"alt\"},\n",
" ],\n",
" },\n",
" {\n",
" \"name\": \"link\",\n",
" \"selector\": \"a[href]\",\n",
" \"type\": \"attribute\",\n",
" \"attribute\": \"href\",\n",
" },\n",
" ],\n",
" }\n",
"\n",
" extraction_strategy = JsonCssExtractionStrategy(schema, verbose=True)\n",
"\n",
" async with AsyncWebCrawler(verbose=True) as crawler:\n",
" result = await crawler.arun(\n",
" url=\"https://www.nbcnews.com/business\",\n",
" extraction_strategy=extraction_strategy,\n",
" bypass_cache=True,\n",
" )\n",
"\n",
" assert result.success, \"Failed to crawl the page\"\n",
"\n",
" news_teasers = json.loads(result.extracted_content)\n",
" print(f\"Successfully extracted {len(news_teasers)} news teasers\")\n",
" print(json.dumps(news_teasers[0], indent=2))\n",
"\n",
"await extract_news_teasers()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "FnyVhJaByZQN"
},
"source": [
"## Speed Comparison\n",
"\n",
"Let's compare the speed of Crawl4AI with Firecrawl, a paid service. Note that we can't run Firecrawl in this Colab environment, so we'll simulate its performance based on previously recorded data."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "agDD186f3wig"
},
"source": [
"💡 **Note on Speed Comparison:**\n",
"\n",
"The speed test conducted here is running on Google Colab, where the internet speed and performance can vary and may not reflect optimal conditions. When we call Firecrawl's API, we're seeing its best performance, while Crawl4AI's performance is limited by Colab's network speed.\n",
"\n",
"For a more accurate comparison, it's recommended to run these tests on your own servers or computers with a stable and fast internet connection. Despite these limitations, Crawl4AI still demonstrates faster performance in this environment.\n",
"\n",
"If you run these tests locally, you may observe an even more significant speed advantage for Crawl4AI compared to other services."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"id": "F7KwHv8G1LbY"
},
"outputs": [],
"source": [
"!pip install firecrawl"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "91813zILyZQN",
"outputId": "663223db-ab89-4976-b233-05ceca62b19b"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Firecrawl (simulated):\n",
"Time taken: 4.38 seconds\n",
"Content length: 41967 characters\n",
"Images found: 49\n",
"\n",
"Crawl4AI (simple crawl):\n",
"Time taken: 4.22 seconds\n",
"Content length: 18221 characters\n",
"Images found: 49\n",
"\n",
"Crawl4AI (with JavaScript execution):\n",
"Time taken: 9.13 seconds\n",
"Content length: 34243 characters\n",
"Images found: 89\n"
]
}
],
"source": [
"import os\n",
"from google.colab import userdata\n",
"os.environ['FIRECRAWL_API_KEY'] = userdata.get('FIRECRAWL_API_KEY')\n",
"import time\n",
"from firecrawl import FirecrawlApp\n",
"\n",
"async def speed_comparison():\n",
" # Simulated Firecrawl performance\n",
" app = FirecrawlApp(api_key=os.environ['FIRECRAWL_API_KEY'])\n",
" start = time.time()\n",
" scrape_status = app.scrape_url(\n",
" 'https://www.nbcnews.com/business',\n",
" params={'formats': ['markdown', 'html']}\n",
" )\n",
" end = time.time()\n",
" print(\"Firecrawl (simulated):\")\n",
" print(f\"Time taken: {end - start:.2f} seconds\")\n",
" print(f\"Content length: {len(scrape_status['markdown'])} characters\")\n",
" print(f\"Images found: {scrape_status['markdown'].count('cldnry.s-nbcnews.com')}\")\n",
" print()\n",
"\n",
" async with AsyncWebCrawler() as crawler:\n",
" # Crawl4AI simple crawl\n",
" start = time.time()\n",
" result = await crawler.arun(\n",
" url=\"https://www.nbcnews.com/business\",\n",
" word_count_threshold=0,\n",
" bypass_cache=True,\n",
" verbose=False\n",
" )\n",
" end = time.time()\n",
" print(\"Crawl4AI (simple crawl):\")\n",
" print(f\"Time taken: {end - start:.2f} seconds\")\n",
" print(f\"Content length: {len(result.markdown)} characters\")\n",
" print(f\"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}\")\n",
" print()\n",
"\n",
" # Crawl4AI with JavaScript execution\n",
" start = time.time()\n",
" result = await crawler.arun(\n",
" url=\"https://www.nbcnews.com/business\",\n",
" js_code=[\"const loadMoreButton = Array.from(document.querySelectorAll('button')).find(button => button.textContent.includes('Load More')); loadMoreButton && loadMoreButton.click();\"],\n",
" word_count_threshold=0,\n",
" bypass_cache=True,\n",
" verbose=False\n",
" )\n",
" end = time.time()\n",
" print(\"Crawl4AI (with JavaScript execution):\")\n",
" print(f\"Time taken: {end - start:.2f} seconds\")\n",
" print(f\"Content length: {len(result.markdown)} characters\")\n",
" print(f\"Images found: {result.markdown.count('cldnry.s-nbcnews.com')}\")\n",
"\n",
"await speed_comparison()"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "OBFFYVJIyZQN"
},
"source": [
"If you run on a local machine with a proper internet speed:\n",
"- Simple crawl: Crawl4AI is typically over 3-4 times faster than Firecrawl.\n",
"- With JavaScript execution: Even when executing JavaScript to load more content (potentially doubling the number of images found), Crawl4AI is still faster than Firecrawl's simple crawl.\n",
"\n",
"Please note that actual performance may vary depending on network conditions and the specific content being crawled."
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "A6_1RK1_yZQO"
},
"source": [
"## Conclusion\n",
"\n",
"In this notebook, we've explored the powerful features of Crawl4AI, including:\n",
"\n",
"1. Basic crawling\n",
"2. JavaScript execution and CSS selector usage\n",
"3. Proxy support\n",
"4. Structured data extraction with OpenAI\n",
"5. Advanced multi-page crawling with JavaScript execution\n",
"6. Fast structured output using JsonCssExtractionStrategy\n",
"7. Speed comparison with other services\n",
"\n",
"Crawl4AI offers a fast, flexible, and powerful solution for web crawling and data extraction tasks. Its asynchronous architecture and advanced features make it suitable for a wide range of applications, from simple web scraping to complex, multi-page data extraction scenarios.\n",
"\n",
"For more information and advanced usage, please visit the [Crawl4AI documentation](https://docs.crawl4ai.com/).\n",
"\n",
"Happy crawling!"
]
}
],
"metadata": {
"colab": {
"provenance": []
},
"kernelspec": {
"display_name": "venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.13"
}
},
"nbformat": 4,
"nbformat_minor": 0
}

View File

@@ -0,0 +1,205 @@
# Network Requests & Console Message Capturing
Crawl4AI can capture all network requests and browser console messages during a crawl, which is invaluable for debugging, security analysis, or understanding page behavior.
## Configuration
To enable network and console capturing, use these configuration options:
```python
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
# Enable both network request capture and console message capture
config = CrawlerRunConfig(
capture_network_requests=True, # Capture all network requests and responses
capture_console_messages=True # Capture all browser console output
)
```
## Example Usage
```python
import asyncio
import json
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
async def main():
# Enable both network request capture and console message capture
config = CrawlerRunConfig(
capture_network_requests=True,
capture_console_messages=True
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun(
url="https://example.com",
config=config
)
if result.success:
# Analyze network requests
if result.network_requests:
print(f"Captured {len(result.network_requests)} network events")
# Count request types
request_count = len([r for r in result.network_requests if r.get("event_type") == "request"])
response_count = len([r for r in result.network_requests if r.get("event_type") == "response"])
failed_count = len([r for r in result.network_requests if r.get("event_type") == "request_failed"])
print(f"Requests: {request_count}, Responses: {response_count}, Failed: {failed_count}")
# Find API calls
api_calls = [r for r in result.network_requests
if r.get("event_type") == "request" and "api" in r.get("url", "")]
if api_calls:
print(f"Detected {len(api_calls)} API calls:")
for call in api_calls[:3]: # Show first 3
print(f" - {call.get('method')} {call.get('url')}")
# Analyze console messages
if result.console_messages:
print(f"Captured {len(result.console_messages)} console messages")
# Group by type
message_types = {}
for msg in result.console_messages:
msg_type = msg.get("type", "unknown")
message_types[msg_type] = message_types.get(msg_type, 0) + 1
print("Message types:", message_types)
# Show errors (often the most important)
errors = [msg for msg in result.console_messages if msg.get("type") == "error"]
if errors:
print(f"Found {len(errors)} console errors:")
for err in errors[:2]: # Show first 2
print(f" - {err.get('text', '')[:100]}")
# Export all captured data to a file for detailed analysis
with open("network_capture.json", "w") as f:
json.dump({
"url": result.url,
"network_requests": result.network_requests or [],
"console_messages": result.console_messages or []
}, f, indent=2)
print("Exported detailed capture data to network_capture.json")
if __name__ == "__main__":
asyncio.run(main())
```
## Captured Data Structure
### Network Requests
The `result.network_requests` contains a list of dictionaries, each representing a network event with these common fields:
| Field | Description |
|-------|-------------|
| `event_type` | Type of event: `"request"`, `"response"`, or `"request_failed"` |
| `url` | The URL of the request |
| `timestamp` | Unix timestamp when the event was captured |
#### Request Event Fields
```json
{
"event_type": "request",
"url": "https://example.com/api/data.json",
"method": "GET",
"headers": {"User-Agent": "...", "Accept": "..."},
"post_data": "key=value&otherkey=value",
"resource_type": "fetch",
"is_navigation_request": false,
"timestamp": 1633456789.123
}
```
#### Response Event Fields
```json
{
"event_type": "response",
"url": "https://example.com/api/data.json",
"status": 200,
"status_text": "OK",
"headers": {"Content-Type": "application/json", "Cache-Control": "..."},
"from_service_worker": false,
"request_timing": {"requestTime": 1234.56, "receiveHeadersEnd": 1234.78},
"timestamp": 1633456789.456
}
```
#### Failed Request Event Fields
```json
{
"event_type": "request_failed",
"url": "https://example.com/missing.png",
"method": "GET",
"resource_type": "image",
"failure_text": "net::ERR_ABORTED 404",
"timestamp": 1633456789.789
}
```
### Console Messages
The `result.console_messages` contains a list of dictionaries, each representing a console message with these common fields:
| Field | Description |
|-------|-------------|
| `type` | Message type: `"log"`, `"error"`, `"warning"`, `"info"`, etc. |
| `text` | The message text |
| `timestamp` | Unix timestamp when the message was captured |
#### Console Message Example
```json
{
"type": "error",
"text": "Uncaught TypeError: Cannot read property 'length' of undefined",
"location": "https://example.com/script.js:123:45",
"timestamp": 1633456790.123
}
```
## Key Benefits
- **Full Request Visibility**: Capture all network activity including:
- Requests (URLs, methods, headers, post data)
- Responses (status codes, headers, timing)
- Failed requests (with error messages)
- **Console Message Access**: View all JavaScript console output:
- Log messages
- Warnings
- Errors with stack traces
- Developer debugging information
- **Debugging Power**: Identify issues such as:
- Failed API calls or resource loading
- JavaScript errors affecting page functionality
- CORS or other security issues
- Hidden API endpoints and data flows
- **Security Analysis**: Detect:
- Unexpected third-party requests
- Data leakage in request payloads
- Suspicious script behavior
- **Performance Insights**: Analyze:
- Request timing data
- Resource loading patterns
- Potential bottlenecks
## Use Cases
1. **API Discovery**: Identify hidden endpoints and data flows in single-page applications
2. **Debugging**: Track down JavaScript errors affecting page functionality
3. **Security Auditing**: Detect unwanted third-party requests or data leakage
4. **Performance Analysis**: Identify slow-loading resources
5. **Ad/Tracker Analysis**: Detect and catalog advertising or tracking calls
This capability is especially valuable for complex sites with heavy JavaScript, single-page applications, or when you need to understand the exact communication happening between a browser and servers.

View File

@@ -15,6 +15,7 @@ class CrawlResult(BaseModel):
downloaded_files: Optional[List[str]] = None
screenshot: Optional[str] = None
pdf : Optional[bytes] = None
mhtml: Optional[str] = None
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
extracted_content: Optional[str] = None
metadata: Optional[dict] = None
@@ -236,7 +237,16 @@ if result.pdf:
f.write(result.pdf)
```
### 5.5 **`metadata`** *(Optional[dict])*
### 5.5 **`mhtml`** *(Optional[str])*
**What**: MHTML snapshot of the page if `capture_mhtml=True` in `CrawlerRunConfig`. MHTML (MIME HTML) format preserves the entire web page with all its resources (CSS, images, scripts, etc.) in a single file.
**Usage**:
```python
if result.mhtml:
with open("page.mhtml", "w", encoding="utf-8") as f:
f.write(result.mhtml)
```
### 5.6 **`metadata`** *(Optional[dict])*
**What**: Page-level metadata if discovered (title, description, OG data, etc.).
**Usage**:
```python
@@ -271,7 +281,69 @@ for result in results:
---
## 7. Example: Accessing Everything
## 7. Network Requests & Console Messages
When you enable network and console message capturing in `CrawlerRunConfig` using `capture_network_requests=True` and `capture_console_messages=True`, the `CrawlResult` will include these fields:
### 7.1 **`network_requests`** *(Optional[List[Dict[str, Any]]])*
**What**: A list of dictionaries containing information about all network requests, responses, and failures captured during the crawl.
**Structure**:
- Each item has an `event_type` field that can be `"request"`, `"response"`, or `"request_failed"`.
- Request events include `url`, `method`, `headers`, `post_data`, `resource_type`, and `is_navigation_request`.
- Response events include `url`, `status`, `status_text`, `headers`, `from_service_worker`, and `request_timing`.
- Failed request events include `url`, `method`, `resource_type`, and `failure_text`.
- All events include a `timestamp` field.
**Usage**:
```python
if result.network_requests:
# Count different types of events
requests = [r for r in result.network_requests if r.get("event_type") == "request"]
responses = [r for r in result.network_requests if r.get("event_type") == "response"]
failures = [r for r in result.network_requests if r.get("event_type") == "request_failed"]
print(f"Captured {len(requests)} requests, {len(responses)} responses, and {len(failures)} failures")
# Analyze API calls
api_calls = [r for r in requests if "api" in r.get("url", "")]
# Identify failed resources
for failure in failures:
print(f"Failed to load: {failure.get('url')} - {failure.get('failure_text')}")
```
### 7.2 **`console_messages`** *(Optional[List[Dict[str, Any]]])*
**What**: A list of dictionaries containing all browser console messages captured during the crawl.
**Structure**:
- Each item has a `type` field indicating the message type (e.g., `"log"`, `"error"`, `"warning"`, etc.).
- The `text` field contains the actual message text.
- Some messages include `location` information (URL, line, column).
- All messages include a `timestamp` field.
**Usage**:
```python
if result.console_messages:
# Count messages by type
message_types = {}
for msg in result.console_messages:
msg_type = msg.get("type", "unknown")
message_types[msg_type] = message_types.get(msg_type, 0) + 1
print(f"Message type counts: {message_types}")
# Display errors (which are usually most important)
for msg in result.console_messages:
if msg.get("type") == "error":
print(f"Error: {msg.get('text')}")
```
These fields provide deep visibility into the page's network activity and browser console, which is invaluable for debugging, security analysis, and understanding complex web applications.
For more details on network and console capturing, see the [Network & Console Capture documentation](../advanced/network-console-capture.md).
---
## 8. Example: Accessing Everything
```python
async def handle_result(result: CrawlResult):
@@ -304,16 +376,36 @@ async def handle_result(result: CrawlResult):
if result.extracted_content:
print("Structured data:", result.extracted_content)
# Screenshot/PDF
# Screenshot/PDF/MHTML
if result.screenshot:
print("Screenshot length:", len(result.screenshot))
if result.pdf:
print("PDF bytes length:", len(result.pdf))
if result.mhtml:
print("MHTML length:", len(result.mhtml))
# Network and console capturing
if result.network_requests:
print(f"Network requests captured: {len(result.network_requests)}")
# Analyze request types
req_types = {}
for req in result.network_requests:
if "resource_type" in req:
req_types[req["resource_type"]] = req_types.get(req["resource_type"], 0) + 1
print(f"Resource types: {req_types}")
if result.console_messages:
print(f"Console messages captured: {len(result.console_messages)}")
# Count by message type
msg_types = {}
for msg in result.console_messages:
msg_types[msg.get("type", "unknown")] = msg_types.get(msg.get("type", "unknown"), 0) + 1
print(f"Message types: {msg_types}")
```
---
## 8. Key Points & Future
## 9. Key Points & Future
1. **Deprecated legacy properties of CrawlResult**
- `markdown_v2` - Deprecated in v0.5. Just use `markdown`. It holds the `MarkdownGenerationResult` now!

View File

@@ -141,6 +141,7 @@ If your page is a single-page app with repeated JS updates, set `js_only=True` i
| **`screenshot_wait_for`** | `float or None` | Extra wait time before the screenshot. |
| **`screenshot_height_threshold`** | `int` (~20000) | If the page is taller than this, alternate screenshot strategies are used. |
| **`pdf`** | `bool` (False) | If `True`, returns a PDF in `result.pdf`. |
| **`capture_mhtml`** | `bool` (False) | If `True`, captures an MHTML snapshot of the page in `result.mhtml`. MHTML includes all page resources (CSS, images, etc.) in a single file. |
| **`image_description_min_word_threshold`** | `int` (~50) | Minimum words for an image's alt text or description to be considered valid. |
| **`image_score_threshold`** | `int` (~3) | Filter out low-scoring images. The crawler scores images by relevance (size, context, etc.). |
| **`exclude_external_images`** | `bool` (False) | Exclude images from other domains. |

View File

@@ -136,6 +136,12 @@ class CrawlerRunConfig:
wait_for=None,
screenshot=False,
pdf=False,
capture_mhtml=False,
enable_rate_limiting=False,
rate_limit_config=None,
memory_threshold_percent=70.0,
check_interval=1.0,
max_session_permit=20,
display_mode=None,
verbose=True,
stream=False, # Enable streaming for arun_many()
@@ -170,10 +176,9 @@ class CrawlerRunConfig:
- A CSS or JS expression to wait for before extracting content.
- Common usage: `wait_for="css:.main-loaded"` or `wait_for="js:() => window.loaded === true"`.
7. **`screenshot`** & **`pdf`**:
- If `True`, captures a screenshot or PDF after the page is fully loaded.
- The results go to `result.screenshot` (base64) or `result.pdf` (bytes).
7. **`screenshot`**, **`pdf`**, & **`capture_mhtml`**:
- If `True`, captures a screenshot, PDF, or MHTML snapshot after the page is fully loaded.
- The results go to `result.screenshot` (base64), `result.pdf` (bytes), or `result.mhtml` (string).
8. **`verbose`**:
- Logs additional runtime details.
- Overlaps with the browser's verbosity if also set to `True` in `BrowserConfig`.

View File

@@ -26,6 +26,7 @@ class CrawlResult(BaseModel):
downloaded_files: Optional[List[str]] = None
screenshot: Optional[str] = None
pdf : Optional[bytes] = None
mhtml: Optional[str] = None
markdown: Optional[Union[str, MarkdownGenerationResult]] = None
extracted_content: Optional[str] = None
metadata: Optional[dict] = None
@@ -51,6 +52,7 @@ class CrawlResult(BaseModel):
| **downloaded_files (`Optional[List[str]]`)** | If `accept_downloads=True` in `BrowserConfig`, this lists the filepaths of saved downloads. |
| **screenshot (`Optional[str]`)** | Screenshot of the page (base64-encoded) if `screenshot=True`. |
| **pdf (`Optional[bytes]`)** | PDF of the page if `pdf=True`. |
| **mhtml (`Optional[str]`)** | MHTML snapshot of the page if `capture_mhtml=True`. Contains the full page with all resources. |
| **markdown (`Optional[str or MarkdownGenerationResult]`)** | It holds a `MarkdownGenerationResult`. Over time, this will be consolidated into `markdown`. The generator can provide raw markdown, citations, references, and optionally `fit_markdown`. |
| **extracted_content (`Optional[str]`)** | The output of a structured extraction (CSS/LLM-based) stored as JSON string or other text. |
| **metadata (`Optional[dict]`)** | Additional info about the crawl or extracted data. |
@@ -190,18 +192,27 @@ for img in images:
print("Image URL:", img["src"], "Alt:", img.get("alt"))
```
### 5.3 `screenshot` and `pdf`
### 5.3 `screenshot`, `pdf`, and `mhtml`
If you set `screenshot=True` or `pdf=True` in **`CrawlerRunConfig`**, then:
If you set `screenshot=True`, `pdf=True`, or `capture_mhtml=True` in **`CrawlerRunConfig`**, then:
- `result.screenshot` contains a base64-encoded PNG string.
- `result.screenshot` contains a base64-encoded PNG string.
- `result.pdf` contains raw PDF bytes (you can write them to a file).
- `result.mhtml` contains the MHTML snapshot of the page as a string (you can write it to a .mhtml file).
```python
# Save the PDF
with open("page.pdf", "wb") as f:
f.write(result.pdf)
# Save the MHTML
if result.mhtml:
with open("page.mhtml", "w", encoding="utf-8") as f:
f.write(result.mhtml)
```
The MHTML (MIME HTML) format is particularly useful as it captures the entire web page including all of its resources (CSS, images, scripts, etc.) in a single file, making it perfect for archiving or offline viewing.
### 5.4 `ssl_certificate`
If `fetch_ssl_certificate=True`, `result.ssl_certificate` holds details about the site's SSL cert, such as issuer, validity dates, etc.

View File

@@ -4,7 +4,35 @@ In this tutorial, you'll learn how to:
1. Extract links (internal, external) from crawled pages
2. Filter or exclude specific domains (e.g., social media or custom domains)
3. Access and manage media data (especially images) in the crawl result
3. Access and manage media data (especially images) in the crawl result
### 3.2 Excluding Images
#### Excluding External Images
If you're dealing with heavy pages or want to skip third-party images (advertisements, for example), you can turn on:
```python
crawler_cfg = CrawlerRunConfig(
exclude_external_images=True
)
```
This setting attempts to discard images from outside the primary domain, keeping only those from the site you're crawling.
#### Excluding All Images
If you want to completely remove all images from the page to maximize performance and reduce memory usage, use:
```python
crawler_cfg = CrawlerRunConfig(
exclude_all_images=True
)
```
This setting removes all images very early in the processing pipeline, which significantly improves memory efficiency and processing speed. This is particularly useful when:
- You don't need image data in your results
- You're crawling image-heavy pages that cause memory issues
- You want to focus only on text content
- You need to maximize crawling speed
4. Configure your crawler to exclude or prioritize certain images
> **Prerequisites**
@@ -271,8 +299,41 @@ Each extracted table contains:
- **`screenshot`**: Set to `True` if you want a full-page screenshot stored as `base64` in `result.screenshot`.
- **`pdf`**: Set to `True` if you want a PDF version of the page in `result.pdf`.
- **`capture_mhtml`**: Set to `True` if you want an MHTML snapshot of the page in `result.mhtml`. This format preserves the entire web page with all its resources (CSS, images, scripts) in a single file, making it perfect for archiving or offline viewing.
- **`wait_for_images`**: If `True`, attempts to wait until images are fully loaded before final extraction.
#### Example: Capturing Page as MHTML
```python
import asyncio
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
async def main():
crawler_cfg = CrawlerRunConfig(
capture_mhtml=True # Enable MHTML capture
)
async with AsyncWebCrawler() as crawler:
result = await crawler.arun("https://example.com", config=crawler_cfg)
if result.success and result.mhtml:
# Save the MHTML snapshot to a file
with open("example.mhtml", "w", encoding="utf-8") as f:
f.write(result.mhtml)
print("MHTML snapshot saved to example.mhtml")
else:
print("Failed to capture MHTML:", result.error_message)
if __name__ == "__main__":
asyncio.run(main())
```
The MHTML format is particularly useful because:
- It captures the complete page state including all resources
- It can be opened in most modern browsers for offline viewing
- It preserves the page exactly as it appeared during crawling
- It's a single file, making it easy to store and transfer
---
## 4. Putting It All Together: Link & Media Filtering