crawl4ai/tests/pipeline/demo_browser_hub_pipeline.py

# demo_browser_hub_pipeline.py
import asyncio
from typing import List
from crawl4ai.browser.browser_hub import BrowserHub
from pipeline import create_pipeline
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.models import CrawlResultContainer
from crawl4ai.cache_context import CacheMode
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai import PruningContentFilter


async def create_prewarmed_browser_hub(urls_to_crawl: List[str]):
    """Create a browser hub pre-warmed with `num_browsers` browsers and `pages_per_browser` pages per browser."""
    # Set up logging
    logger = AsyncLogger(verbose=True)
    logger.info("Setting up pre-warmed browser hub", tag="DEMO")

    # Create browser configuration
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True,  # Set to False to see the browsers in action
        viewport_width=1280,
        viewport_height=800,
        light_mode=True,  # Optimize for performance
        java_script_enabled=True
    )

    # Create crawler configurations for pre-warming with different user agents
    # This allows pages to be ready for different scenarios
    crawler_configs = [
        CrawlerRunConfig(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            wait_until="networkidle"
        ),
        # CrawlerRunConfig(
        #     user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15",
        #     wait_until="networkidle"
        # ),
        # CrawlerRunConfig(
        #     user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
        #     wait_until="networkidle"
        # )
    ]

    # Number of browsers and pages per browser
    num_browsers = 1
    pages_per_browser = 1

    # Distribute the total pages (num_browsers x pages_per_browser) across the configurations
    page_configs = []
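    # Each page_configs entry is a (browser_config, crawler_run_config, page_count) tuple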
    total_pages = num_browsers * pages_per_browser
    pages_per_config = total_pages // len(crawler_configs)

    for i, config in enumerate(crawler_configs):
        # For the last config, add any remaining pages
        if i == len(crawler_configs) - 1:
            remaining = total_pages - (pages_per_config * (len(crawler_configs) - 1))
            page_configs.append((browser_config, config, remaining))
        else:
            page_configs.append((browser_config, config, pages_per_config))

    # Create browser hub with pre-warmed pages
    start_time = asyncio.get_event_loop().time()
    logger.info("Initializing browser hub with pre-warmed pages...", tag="DEMO")
    hub = await BrowserHub.get_browser_manager(
        config=browser_config,
        hub_id="demo_hub",
        logger=logger,
        max_browsers_per_config=num_browsers,
        max_pages_per_browser=pages_per_browser,
        initial_pool_size=num_browsers,
        page_configs=page_configs
    )
    end_time = asyncio.get_event_loop().time()

    logger.success(
        message="Browser hub initialized with {total_pages} pre-warmed pages in {duration:.2f} seconds",
        tag="DEMO",
        params={
            "total_pages": total_pages,
            "duration": end_time - start_time
        }
    )

    # Get and display pool status
    status = await hub.get_pool_status()
    logger.info(
        message="Browser pool status: {status}",
        tag="DEMO",
        params={"status": status}
    )

    return hub


async def crawl_urls_with_hub(hub, urls: List[str]) -> List[CrawlResultContainer]:
    """Crawl a list of URLs using a pre-warmed browser hub."""
    logger = AsyncLogger(verbose=True)

    # Create crawler configuration
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
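        # Prune low-value content blocks (fixed-threshold filter) before markdown generation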
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=0
            )
        ),
        wait_until="networkidle",
        screenshot=True
    )

    # Create pipeline with the browser hub
    pipeline = await create_pipeline(
        browser_hub=hub,
        logger=logger
    )

    # Crawl all URLs in parallel
    async def crawl_url(url):
        logger.info(f"Crawling {url}...", tag="CRAWL")
        result = await pipeline.crawl(url=url, config=crawler_config)
        logger.success(f"Completed crawl of {url}", tag="CRAWL")
        return result

    # Create tasks for all URLs
    tasks = [crawl_url(url) for url in urls]

    # Execute all tasks in parallel and collect results
    results = await asyncio.gather(*tasks)
    return results


async def main():
    """Main demo function."""
    # List of URLs to crawl
    urls_to_crawl = [
        "https://example.com",
        # "https://www.python.org",
        # "https://httpbin.org/html",
        # "https://news.ycombinator.com",
        # "https://github.com",
        # "https://pypi.org",
        # "https://docs.python.org/3/",
        # "https://opensource.org",
        # "https://whatismyipaddress.com",
        # "https://en.wikipedia.org/wiki/Web_scraping"
    ]

    # Set up logging
    logger = AsyncLogger(verbose=True)
    logger.info("Starting browser hub demo", tag="DEMO")

    try:
        # Create pre-warmed browser hub
        hub = await create_prewarmed_browser_hub(urls_to_crawl)

        # Use hub to crawl URLs
        logger.info("Crawling URLs in parallel...", tag="DEMO")
        start_time = asyncio.get_event_loop().time()
        results = await crawl_urls_with_hub(hub, urls_to_crawl)
        end_time = asyncio.get_event_loop().time()

        # Display results
        logger.success(
            message="Crawled {count} URLs in {duration:.2f} seconds (average: {avg:.2f} seconds per URL)",
            tag="DEMO",
            params={
                "count": len(results),
                "duration": end_time - start_time,
                "avg": (end_time - start_time) / len(results)
            }
        )

        # Print summary of results
        logger.info("Crawl results summary:", tag="DEMO")
        for i, result in enumerate(results):
            logger.info(
                message="{idx}. {url}: Success={success}, Content length={length}",
                tag="RESULT",
                params={
                    "idx": i + 1,
                    "url": result.url,
                    "success": result.success,
                    "length": len(result.html) if result.html else 0
                }
            )
            if result.success and result.markdown and result.markdown.raw_markdown:
                # Print a snippet of the markdown
                markdown_snippet = result.markdown.raw_markdown[:150] + "..."
                logger.info(
                    message=" Markdown: {snippet}",
                    tag="RESULT",
                    params={"snippet": markdown_snippet}
                )

        # Display final browser pool status
        status = await hub.get_pool_status()
        logger.info(
            message="Final browser pool status: {status}",
            tag="DEMO",
            params={"status": status}
        )
    finally:
        # Clean up
        logger.info("Shutting down browser hub...", tag="DEMO")
        await BrowserHub.shutdown_all()
        logger.success("Demo completed", tag="DEMO")


if __name__ == "__main__":
    asyncio.run(main())