# demo_browser_hub.py

import asyncio
from typing import List

from crawl4ai.browser.browser_hub import BrowserHub
from pipeline import create_pipeline
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.models import CrawlResultContainer
from crawl4ai.cache_context import CacheMode
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai import PruningContentFilter
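# NOTE: `pipeline` is a local helper module assumed to sit alongside this demo
# script; it provides create_pipeline() and is not part of the crawl4ai package.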


async def create_prewarmed_browser_hub(urls_to_crawl: List[str]):
    """Create a pre-warmed browser hub.

    The pool size is controlled by num_browsers and pages_per_browser below;
    urls_to_crawl is accepted by the demo but not used during pre-warming.
    """
    # Set up logging
    logger = AsyncLogger(verbose=True)
    logger.info("Setting up pre-warmed browser hub", tag="DEMO")

    # Create browser configuration
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True,  # Set to False to see the browsers in action
        viewport_width=1280,
        viewport_height=800,
        light_mode=True,  # Optimize for performance
        java_script_enabled=True
    )

    # Create crawler configurations for pre-warming with different user agents,
    # so pages are ready for different scenarios. Additional user-agent
    # profiles are left commented out as examples.
    crawler_configs = [
        CrawlerRunConfig(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            wait_until="networkidle"
        ),
        # CrawlerRunConfig(
        #     user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15",
        #     wait_until="networkidle"
        # ),
        # CrawlerRunConfig(
        #     user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
        #     wait_until="networkidle"
        # ),
    ]

    # Number of browsers and pages per browser. Scale these up for real
    # workloads (e.g. 10 browsers x 5 pages = 50 pre-warmed pages).
    num_browsers = 1
    pages_per_browser = 1

    # Distribute the total page budget evenly across the crawler configurations
    page_configs = []
    total_pages = num_browsers * pages_per_browser
    pages_per_config = total_pages // len(crawler_configs)

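    # Each page_configs entry is a (BrowserConfig, CrawlerRunConfig, page_count)
    # tuple: how many pages to pre-warm for that browser/crawler pairing.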
    for i, config in enumerate(crawler_configs):
        # The last config absorbs any pages left over by the integer division
        if i == len(crawler_configs) - 1:
            remaining = total_pages - (pages_per_config * (len(crawler_configs) - 1))
            page_configs.append((browser_config, config, remaining))
        else:
            page_configs.append((browser_config, config, pages_per_config))

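    # NOTE: the keyword arguments below follow this demo's assumed BrowserHub
    # API: get_browser_manager() is assumed to return a shared hub (looked up
    # by hub_id) after pre-warming the requested browsers and pages.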
    # Create browser hub with pre-warmed pages
    start_time = asyncio.get_running_loop().time()
    logger.info("Initializing browser hub with pre-warmed pages...", tag="DEMO")

    hub = await BrowserHub.get_browser_manager(
        config=browser_config,
        hub_id="demo_hub",
        logger=logger,
        max_browsers_per_config=num_browsers,
        max_pages_per_browser=pages_per_browser,
        initial_pool_size=num_browsers,
        page_configs=page_configs
    )

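    # AsyncLogger interpolates {placeholders} in `message` from `params`,
    # including format specs such as {duration:.2f}.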
    end_time = asyncio.get_running_loop().time()
    logger.success(
        message="Browser hub initialized with {total_pages} pre-warmed pages in {duration:.2f} seconds",
        tag="DEMO",
        params={
            "total_pages": total_pages,
            "duration": end_time - start_time
        }
    )

    # Get and display pool status
    status = await hub.get_pool_status()
    logger.info(
        message="Browser pool status: {status}",
        tag="DEMO",
        params={"status": status}
    )

    return hub


async def crawl_urls_with_hub(hub, urls: List[str]) -> List[CrawlResultContainer]:
    """Crawl a list of URLs using a pre-warmed browser hub."""
    logger = AsyncLogger(verbose=True)

    # Create crawler configuration
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
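        # PruningContentFilter scores page blocks and prunes low-value content
        # before markdown generation; threshold_type="fixed" applies the 0.48
        # cutoff uniformly rather than adapting it per page.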
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=0
            )
        ),
        wait_until="networkidle",
        screenshot=True
    )

    # Create the crawl pipeline backed by the browser hub
    pipeline = await create_pipeline(
        browser_hub=hub,
        logger=logger
    )

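    # The pipeline object is assumed to expose an async crawl(url=..., config=...)
    # method that resolves to a CrawlResultContainer.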
    # Crawl all URLs in parallel
    async def crawl_url(url):
        logger.info(f"Crawling {url}...", tag="CRAWL")
        result = await pipeline.crawl(url=url, config=crawler_config)
        logger.success(f"Completed crawl of {url}", tag="CRAWL")
        return result

    # Launch one task per URL and gather results in input order
    tasks = [crawl_url(url) for url in urls]
    results = await asyncio.gather(*tasks)

    return results


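# A sketch (not part of the original demo): for long URL lists you may want to
# cap concurrency instead of launching every crawl at once, e.g. with a
# semaphore inside crawl_urls_with_hub:
#
#     sem = asyncio.Semaphore(10)  # at most 10 crawls in flight
#
#     async def crawl_url(url):
#         async with sem:
#             return await pipeline.crawl(url=url, config=crawler_config)
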
async def main():
    """Main demo function."""
    # List of URLs to crawl (uncomment more to exercise the pool)
    urls_to_crawl = [
        "https://example.com",
        # "https://www.python.org",
        # "https://httpbin.org/html",
        # "https://news.ycombinator.com",
        # "https://github.com",
        # "https://pypi.org",
        # "https://docs.python.org/3/",
        # "https://opensource.org",
        # "https://whatismyipaddress.com",
        # "https://en.wikipedia.org/wiki/Web_scraping"
    ]

    # Set up logging
    logger = AsyncLogger(verbose=True)
    logger.info("Starting browser hub demo", tag="DEMO")

    try:
        # Create pre-warmed browser hub
        hub = await create_prewarmed_browser_hub(urls_to_crawl)

        # Use the hub to crawl all URLs in parallel
        logger.info("Crawling URLs in parallel...", tag="DEMO")
        start_time = asyncio.get_running_loop().time()
        results = await crawl_urls_with_hub(hub, urls_to_crawl)
        end_time = asyncio.get_running_loop().time()

        # Display timing results
        logger.success(
            message="Crawled {count} URLs in {duration:.2f} seconds (average: {avg:.2f} seconds per URL)",
            tag="DEMO",
            params={
                "count": len(results),
                "duration": end_time - start_time,
                "avg": (end_time - start_time) / len(results)
            }
        )

        # Print a per-URL summary of the results
        logger.info("Crawl results summary:", tag="DEMO")
        for i, result in enumerate(results):
            logger.info(
                message="{idx}. {url}: Success={success}, Content length={length}",
                tag="RESULT",
                params={
                    "idx": i + 1,
                    "url": result.url,
                    "success": result.success,
                    "length": len(result.html) if result.html else 0
                }
            )

            if result.success and result.markdown and result.markdown.raw_markdown:
                # Print a snippet of the generated markdown
                markdown_snippet = result.markdown.raw_markdown[:150] + "..."
                logger.info(
                    message="   Markdown: {snippet}",
                    tag="RESULT",
                    params={"snippet": markdown_snippet}
                )

        # Display final browser pool status
        status = await hub.get_pool_status()
        logger.info(
            message="Final browser pool status: {status}",
            tag="DEMO",
            params={"status": status}
        )

    finally:
        # Clean up all browser hubs
        logger.info("Shutting down browser hub...", tag="DEMO")
        await BrowserHub.shutdown_all()
        logger.success("Demo completed", tag="DEMO")


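# To try the demo, run this file directly (assumes crawl4ai is installed and
# the local `pipeline` module is importable): python demo_browser_hub.py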
if __name__ == "__main__":
    asyncio.run(main())