Some refactoring: move the pipeline submodule folder into the main.
222
tests/pipeline/demo_browser_hub_pipeline.py
Normal file
@@ -0,0 +1,222 @@
# demo_browser_hub_pipeline.py

import asyncio
from typing import List

from crawl4ai.browser.browser_hub import BrowserHub
from pipeline import create_pipeline
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.models import CrawlResultContainer
from crawl4ai.cache_context import CacheMode
from crawl4ai import DefaultMarkdownGenerator
from crawl4ai import PruningContentFilter


async def create_prewarmed_browser_hub(urls_to_crawl: List[str]):
    """Create a pre-warmed browser hub with the configured number of browsers and pages per browser."""
    # Set up logging
    logger = AsyncLogger(verbose=True)
    logger.info("Setting up pre-warmed browser hub", tag="DEMO")

    # Create browser configuration
    browser_config = BrowserConfig(
        browser_type="chromium",
        headless=True,  # Set to False to see the browsers in action
        viewport_width=1280,
        viewport_height=800,
        light_mode=True,  # Optimize for performance
        java_script_enabled=True
    )

    # Create crawler configurations for pre-warming with different user agents
    # This allows pages to be ready for different scenarios
    crawler_configs = [
        CrawlerRunConfig(
            user_agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
            wait_until="networkidle"
        ),
        # CrawlerRunConfig(
        #     user_agent="Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Safari/605.1.15",
        #     wait_until="networkidle"
        # ),
        # CrawlerRunConfig(
        #     user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/92.0.4515.107 Safari/537.36",
        #     wait_until="networkidle"
        # )
    ]

    # Number of browsers and pages per browser
    num_browsers = 1
    pages_per_browser = 1

    # Distribute pages across configurations
    # We'll create a total of num_browsers * pages_per_browser pages
    page_configs = []
    total_pages = num_browsers * pages_per_browser
    pages_per_config = total_pages // len(crawler_configs)

    for i, config in enumerate(crawler_configs):
        # For the last config, add any remaining pages
        if i == len(crawler_configs) - 1:
            remaining = total_pages - (pages_per_config * (len(crawler_configs) - 1))
            page_configs.append((browser_config, config, remaining))
        else:
            page_configs.append((browser_config, config, pages_per_config))

    # Create browser hub with pre-warmed pages
    start_time = asyncio.get_event_loop().time()
    logger.info("Initializing browser hub with pre-warmed pages...", tag="DEMO")

    hub = await BrowserHub.get_browser_manager(
        config=browser_config,
        hub_id="demo_hub",
        logger=logger,
        max_browsers_per_config=num_browsers,
        max_pages_per_browser=pages_per_browser,
        initial_pool_size=num_browsers,
        page_configs=page_configs
    )

    end_time = asyncio.get_event_loop().time()
    logger.success(
        message="Browser hub initialized with {total_pages} pre-warmed pages in {duration:.2f} seconds",
        tag="DEMO",
        params={
            "total_pages": total_pages,
            "duration": end_time - start_time
        }
    )

    # Get and display pool status
    status = await hub.get_pool_status()
    logger.info(
        message="Browser pool status: {status}",
        tag="DEMO",
        params={"status": status}
    )

    return hub


async def crawl_urls_with_hub(hub, urls: List[str]) -> List[CrawlResultContainer]:
    """Crawl a list of URLs using a pre-warmed browser hub."""
    logger = AsyncLogger(verbose=True)

    # Create crawler configuration
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=0
            )
        ),
        wait_until="networkidle",
        screenshot=True
    )

    # Create pipeline with the browser hub
    pipeline = await create_pipeline(
        browser_hub=hub,
        logger=logger
    )

    results = []

    # Crawl all URLs in parallel
    async def crawl_url(url):
        logger.info(f"Crawling {url}...", tag="CRAWL")
        result = await pipeline.crawl(url=url, config=crawler_config)
        logger.success(f"Completed crawl of {url}", tag="CRAWL")
        return result

    # Create tasks for all URLs
    tasks = [crawl_url(url) for url in urls]

    # Execute all tasks in parallel and collect results
    results = await asyncio.gather(*tasks)

    return results


async def main():
    """Main demo function."""
    # List of URLs to crawl
    urls_to_crawl = [
        "https://example.com",
        # "https://www.python.org",
        # "https://httpbin.org/html",
        # "https://news.ycombinator.com",
        # "https://github.com",
        # "https://pypi.org",
        # "https://docs.python.org/3/",
        # "https://opensource.org",
        # "https://whatismyipaddress.com",
        # "https://en.wikipedia.org/wiki/Web_scraping"
    ]

    # Set up logging
    logger = AsyncLogger(verbose=True)
    logger.info("Starting browser hub demo", tag="DEMO")

    try:
        # Create pre-warmed browser hub
        hub = await create_prewarmed_browser_hub(urls_to_crawl)

        # Use hub to crawl URLs
        logger.info("Crawling URLs in parallel...", tag="DEMO")
        start_time = asyncio.get_event_loop().time()

        results = await crawl_urls_with_hub(hub, urls_to_crawl)

        end_time = asyncio.get_event_loop().time()

        # Display results
        logger.success(
            message="Crawled {count} URLs in {duration:.2f} seconds (average: {avg:.2f} seconds per URL)",
            tag="DEMO",
            params={
                "count": len(results),
                "duration": end_time - start_time,
                "avg": (end_time - start_time) / len(results)
            }
        )

        # Print summary of results
        logger.info("Crawl results summary:", tag="DEMO")
        for i, result in enumerate(results):
            logger.info(
                message="{idx}. {url}: Success={success}, Content length={length}",
                tag="RESULT",
                params={
                    "idx": i + 1,
                    "url": result.url,
                    "success": result.success,
                    "length": len(result.html) if result.html else 0
                }
            )

            if result.success and result.markdown and result.markdown.raw_markdown:
                # Print a snippet of the markdown
                markdown_snippet = result.markdown.raw_markdown[:150] + "..."
                logger.info(
                    message=" Markdown: {snippet}",
                    tag="RESULT",
                    params={"snippet": markdown_snippet}
                )

        # Display final browser pool status
        status = await hub.get_pool_status()
        logger.info(
            message="Final browser pool status: {status}",
            tag="DEMO",
            params={"status": status}
        )

    finally:
        # Clean up
        logger.info("Shutting down browser hub...", tag="DEMO")
        await BrowserHub.shutdown_all()
        logger.success("Demo completed", tag="DEMO")


if __name__ == "__main__":
    asyncio.run(main())
505
tests/pipeline/extended_browser_hub_tests.py
Normal file
@@ -0,0 +1,505 @@
# extended_browser_hub_tests.py

import asyncio

from crawl4ai.browser.browser_hub import BrowserHub
from pipeline import create_pipeline
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger
from crawl4ai.cache_context import CacheMode

# Common test URLs
TEST_URLS = [
    "https://example.com",
    "https://example.com/page1",
    "https://httpbin.org/html",
    "https://httpbin.org/headers",
    "https://httpbin.org/ip",
    "https://httpstat.us/200"
]


class TestResults:
    """Simple container for test results"""
    def __init__(self, name: str):
        self.name = name
        self.results = []
        self.start_time = None
        self.end_time = None
        self.errors = []

    @property
    def duration(self) -> float:
        if self.start_time and self.end_time:
            return self.end_time - self.start_time
        return 0

    @property
    def success_rate(self) -> float:
        if not self.results:
            return 0
        return sum(1 for r in self.results if r.success) / len(self.results) * 100

    def log_summary(self, logger: AsyncLogger):
        logger.info(f"=== Test: {self.name} ===", tag="SUMMARY")
        logger.info(
            message="Duration: {duration:.2f}s, Success rate: {success_rate:.1f}%, Results: {count}",
            tag="SUMMARY",
            params={
                "duration": self.duration,
                "success_rate": self.success_rate,
                "count": len(self.results)
            }
        )

        if self.errors:
            logger.error(
                message="Errors ({count}): {errors}",
                tag="SUMMARY",
                params={
                    "count": len(self.errors),
                    "errors": "; ".join(str(e) for e in self.errors)
                }
            )


# ======== TEST SCENARIO 1: Simple default configuration ========
async def test_default_configuration():
    """
    Test Scenario 1: Simple default configuration

    This tests the basic case where the user does not provide any specific
    browser configuration, relying on default auto-setup.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Default Configuration")

    try:
        # Create pipeline with no browser config
        pipeline = await create_pipeline(logger=logger)

        # Start timing
        results.start_time = asyncio.get_event_loop().time()

        # Create basic crawler config
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_until="domcontentloaded"
        )

        # Process each URL sequentially
        for url in TEST_URLS:
            try:
                logger.info(f"Crawling {url} with default configuration", tag="TEST")
                result = await pipeline.crawl(url=url, config=crawler_config)
                results.results.append(result)

                logger.success(
                    message="Result: url={url}, success={success}, content_length={length}",
                    tag="TEST",
                    params={
                        "url": url,
                        "success": result.success,
                        "length": len(result.html) if result.html else 0
                    }
                )
            except Exception as e:
                logger.error(f"Error crawling {url}: {str(e)}", tag="TEST")
                results.errors.append(e)

        # End timing
        results.end_time = asyncio.get_event_loop().time()

    except Exception as e:
        logger.error(f"Test failed with error: {str(e)}", tag="TEST")
        results.errors.append(e)

    # Log summary
    results.log_summary(logger)

    return results


# ======== TEST SCENARIO 2: Detailed custom configuration ========
async def test_custom_configuration():
    """
    Test Scenario 2: Detailed custom configuration

    This tests the case where the user provides detailed browser configuration
    to customize the browser behavior.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Custom Configuration")

    try:
        # Create custom browser config
        browser_config = BrowserConfig(
            browser_type="chromium",
            headless=True,
            viewport_width=1920,
            viewport_height=1080,
            user_agent="Mozilla/5.0 (X11; Ubuntu; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/90.0.4430.212 Safari/537.36",
            light_mode=True,
            ignore_https_errors=True,
            extra_args=["--disable-extensions"]
        )

        # Create custom crawler config
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_until="networkidle",
            page_timeout=30000,
            screenshot=True,
            pdf=False,
            screenshot_wait_for=0.5,
            wait_for_images=True,
            scan_full_page=True,
            scroll_delay=0.2,
            process_iframes=True,
            remove_overlay_elements=True
        )

        # Create pipeline with custom configuration
        pipeline = await create_pipeline(
            browser_config=browser_config,
            logger=logger
        )

        # Start timing
        results.start_time = asyncio.get_event_loop().time()

        # Process each URL sequentially
        for url in TEST_URLS:
            try:
                logger.info(f"Crawling {url} with custom configuration", tag="TEST")
                result = await pipeline.crawl(url=url, config=crawler_config)
                results.results.append(result)

                has_screenshot = result.screenshot is not None

                logger.success(
                    message="Result: url={url}, success={success}, screenshot={screenshot}, content_length={length}",
                    tag="TEST",
                    params={
                        "url": url,
                        "success": result.success,
                        "screenshot": has_screenshot,
                        "length": len(result.html) if result.html else 0
                    }
                )
            except Exception as e:
                logger.error(f"Error crawling {url}: {str(e)}", tag="TEST")
                results.errors.append(e)

        # End timing
        results.end_time = asyncio.get_event_loop().time()

        # Get browser hub status from context
        try:
            # Run a dummy crawl to get the context with browser hub
            context = await pipeline.process({"url": "about:blank", "config": crawler_config})
            browser_hub = context.get("browser_hub")
            if browser_hub:
                status = await browser_hub.get_pool_status()
                logger.info(
                    message="Browser hub status: {status}",
                    tag="TEST",
                    params={"status": status}
                )
        except Exception as e:
            logger.error(f"Failed to get browser hub status: {str(e)}", tag="TEST")

    except Exception as e:
        logger.error(f"Test failed with error: {str(e)}", tag="TEST")
        results.errors.append(e)

    # Log summary
    results.log_summary(logger)

    return results


# ======== TEST SCENARIO 3: Using pre-initialized browser hub ========
async def test_preinitalized_browser_hub():
    """
    Test Scenario 3: Using pre-initialized browser hub

    This tests the case where a browser hub is initialized separately
    and then passed to the pipeline.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Pre-initialized Browser Hub")

    browser_hub = None
    try:
        # Create and initialize browser hub separately
        logger.info("Initializing browser hub separately", tag="TEST")

        browser_config = BrowserConfig(
            browser_type="chromium",
            headless=True,
            verbose=True
        )

        browser_hub = await BrowserHub.get_browser_manager(
            config=browser_config,
            hub_id="test_preinitalized",
            logger=logger,
            max_browsers_per_config=2,
            max_pages_per_browser=3,
            initial_pool_size=2
        )

        # Display initial status
        status = await browser_hub.get_pool_status()
        logger.info(
            message="Initial browser hub status: {status}",
            tag="TEST",
            params={"status": status}
        )

        # Create pipeline with pre-initialized browser hub
        pipeline = await create_pipeline(
            browser_hub=browser_hub,
            logger=logger
        )

        # Create crawler config
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_until="networkidle",
            screenshot=True
        )

        # Start timing
        results.start_time = asyncio.get_event_loop().time()

        # Process URLs in parallel
        async def crawl_url(url):
            try:
                logger.info(f"Crawling {url} with pre-initialized hub", tag="TEST")
                result = await pipeline.crawl(url=url, config=crawler_config)
                logger.success(f"Completed crawl of {url}", tag="TEST")
                return result
            except Exception as e:
                logger.error(f"Error crawling {url}: {str(e)}", tag="TEST")
                results.errors.append(e)
                return None

        # Create tasks for all URLs
        tasks = [crawl_url(url) for url in TEST_URLS]

        # Execute all tasks in parallel and collect results
        all_results = await asyncio.gather(*tasks)
        results.results = [r for r in all_results if r is not None]

        # End timing
        results.end_time = asyncio.get_event_loop().time()

        # Display final status
        status = await browser_hub.get_pool_status()
        logger.info(
            message="Final browser hub status: {status}",
            tag="TEST",
            params={"status": status}
        )

    except Exception as e:
        logger.error(f"Test failed with error: {str(e)}", tag="TEST")
        results.errors.append(e)

    # Log summary
    results.log_summary(logger)

    return results, browser_hub


# ======== TEST SCENARIO 4: Parallel pipelines sharing browser hub ========
async def test_parallel_pipelines():
    """
    Test Scenario 4: Multiple parallel pipelines sharing browser hub

    This tests the case where multiple pipelines share the same browser hub,
    demonstrating resource sharing and parallel operation.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Parallel Pipelines")

    # We'll reuse the browser hub from the previous test
    _, browser_hub = await test_preinitalized_browser_hub()

    try:
        # Create 3 pipelines that all share the same browser hub
        pipelines = []
        for i in range(3):
            pipeline = await create_pipeline(
                browser_hub=browser_hub,
                logger=logger
            )
            pipelines.append(pipeline)

        logger.info(f"Created {len(pipelines)} pipelines sharing the same browser hub", tag="TEST")

        # Create crawler configs with different settings
        configs = [
            CrawlerRunConfig(wait_until="domcontentloaded", screenshot=False),
            CrawlerRunConfig(wait_until="networkidle", screenshot=True),
            CrawlerRunConfig(wait_until="load", scan_full_page=True)
        ]

        # Start timing
        results.start_time = asyncio.get_event_loop().time()

        # Function to process URLs with a specific pipeline
        async def process_with_pipeline(pipeline_idx, urls):
            pipeline_results = []
            for url in urls:
                try:
                    logger.info(f"Pipeline {pipeline_idx} crawling {url}", tag="TEST")
                    result = await pipelines[pipeline_idx].crawl(
                        url=url,
                        config=configs[pipeline_idx]
                    )
                    pipeline_results.append(result)
                    logger.success(
                        message="Pipeline {idx} completed: url={url}, success={success}",
                        tag="TEST",
                        params={
                            "idx": pipeline_idx,
                            "url": url,
                            "success": result.success
                        }
                    )
                except Exception as e:
                    logger.error(
                        message="Pipeline {idx} error: {error}",
                        tag="TEST",
                        params={
                            "idx": pipeline_idx,
                            "error": str(e)
                        }
                    )
                    results.errors.append(e)
            return pipeline_results

        # Distribute URLs among pipelines
        pipeline_urls = [
            TEST_URLS[:2],
            TEST_URLS[2:4],
            TEST_URLS[4:5] * 2  # Duplicate the last URL to have 2 for pipeline 3
        ]

        # Execute all pipelines in parallel
        tasks = [
            process_with_pipeline(i, urls)
            for i, urls in enumerate(pipeline_urls)
        ]

        pipeline_results = await asyncio.gather(*tasks)

        # Flatten results
        for res_list in pipeline_results:
            results.results.extend(res_list)

        # End timing
        results.end_time = asyncio.get_event_loop().time()

        # Display browser hub status
        status = await browser_hub.get_pool_status()
        logger.info(
            message="Browser hub status after parallel pipelines: {status}",
            tag="TEST",
            params={"status": status}
        )

    except Exception as e:
        logger.error(f"Test failed with error: {str(e)}", tag="TEST")
        results.errors.append(e)

    # Log summary
    results.log_summary(logger)

    return results


# ======== TEST SCENARIO 5: Browser hub with connection string ========
async def test_connection_string():
    """
    Test Scenario 5: Browser hub with connection string

    This tests the case where a browser hub is initialized from a connection string,
    simulating connecting to a running browser hub service.
    """
    logger = AsyncLogger(verbose=True)
    results = TestResults("Connection String")

    try:
        # Create pipeline with connection string
        # Note: In a real implementation, this would connect to an existing service
        # For this test, we're using a simulated connection
        connection_string = "localhost:9222"  # Simulated connection string

        pipeline = await create_pipeline(
            browser_hub_connection=connection_string,
            logger=logger
        )

        # Create crawler config
        crawler_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            wait_until="networkidle"
        )

        # Start timing
        results.start_time = asyncio.get_event_loop().time()

        # Test with a single URL
        url = TEST_URLS[0]
        try:
            logger.info(f"Crawling {url} with connection string hub", tag="TEST")
            result = await pipeline.crawl(url=url, config=crawler_config)
            results.results.append(result)

            logger.success(
                message="Result: url={url}, success={success}, content_length={length}",
                tag="TEST",
                params={
                    "url": url,
                    "success": result.success,
                    "length": len(result.html) if result.html else 0
                }
            )
        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}", tag="TEST")
            results.errors.append(e)

        # End timing
        results.end_time = asyncio.get_event_loop().time()

    except Exception as e:
        logger.error(f"Test failed with error: {str(e)}", tag="TEST")
        results.errors.append(e)

    # Log summary
    results.log_summary(logger)

    return results


# ======== RUN ALL TESTS ========
async def run_all_tests():
    """Run all test scenarios"""
    logger = AsyncLogger(verbose=True)
    logger.info("=== STARTING BROWSER HUB TESTS ===", tag="MAIN")

    try:
        # Run each test scenario
        await test_default_configuration()
        # await test_custom_configuration()
        # await test_preinitalized_browser_hub()
        # await test_parallel_pipelines()
        # await test_connection_string()

    except Exception as e:
        logger.error(f"Test suite failed: {str(e)}", tag="MAIN")
    finally:
        # Clean up all browser hubs
        logger.info("Shutting down all browser hubs...", tag="MAIN")
        await BrowserHub.shutdown_all()
        logger.success("All tests completed", tag="MAIN")


if __name__ == "__main__":
    asyncio.run(run_all_tests())
109
tests/pipeline/test_pipeline.py
Normal file
@@ -0,0 +1,109 @@
import asyncio
from crawl4ai import (
    BrowserConfig,
    CrawlerRunConfig,
    CacheMode,
    DefaultMarkdownGenerator,
    PruningContentFilter
)
from pipeline import Pipeline


async def main():
    # Create configuration objects
    browser_config = BrowserConfig(headless=True, verbose=True)
    crawler_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        markdown_generator=DefaultMarkdownGenerator(
            content_filter=PruningContentFilter(
                threshold=0.48,
                threshold_type="fixed",
                min_word_threshold=0
            )
        ),
    )

    # Create and use pipeline with context manager
    async with Pipeline(browser_config=browser_config) as pipeline:
        result = await pipeline.crawl(
            url="https://www.example.com",
            config=crawler_config
        )

        # Print the result
        print(f"URL: {result.url}")
        print(f"Success: {result.success}")

        if result.success:
            print("\nMarkdown excerpt:")
            print(result.markdown.raw_markdown[:500] + "...")
        else:
            print(f"Error: {result.error_message}")


if __name__ == "__main__":
    asyncio.run(main())


class CrawlTarget:
    def __init__(self, urls, config=None):
        self.urls = urls
        self.config = config

    def __repr__(self):
        return f"CrawlTarget(urls={self.urls}, config={self.config})"


# async def main():
#     # Create configuration objects
#     browser_config = BrowserConfig(headless=True, verbose=True)

#     # Define different configurations
#     config1 = CrawlerRunConfig(
#         cache_mode=CacheMode.BYPASS,
#         markdown_generator=DefaultMarkdownGenerator(
#             content_filter=PruningContentFilter(threshold=0.48)
#         ),
#     )

#     config2 = CrawlerRunConfig(
#         cache_mode=CacheMode.ENABLED,
#         screenshot=True,
#         pdf=True
#     )

#     # Create crawl targets
#     targets = [
#         CrawlTarget(
#             urls=["https://www.example.com", "https://www.wikipedia.org"],
#             config=config1
#         ),
#         CrawlTarget(
#             urls="https://news.ycombinator.com",
#             config=config2
#         ),
#         CrawlTarget(
#             urls=["https://github.com", "https://stackoverflow.com", "https://python.org"],
#             config=None
#         )
#     ]

#     # Create and use pipeline with context manager
#     async with Pipeline(browser_config=browser_config) as pipeline:
#         all_results = await pipeline.crawl_batch(targets)

#         for target_key, results in all_results.items():
#             print(f"\n===== Results for {target_key} =====")
#             print(f"Number of URLs crawled: {len(results)}")

#             for i, result in enumerate(results):
#                 print(f"\nURL {i+1}: {result.url}")
#                 print(f"Success: {result.success}")

#                 if result.success:
#                     print(f"Content length: {len(result.markdown.raw_markdown)} chars")
#                 else:
#                     print(f"Error: {result.error_message}")

# if __name__ == "__main__":
#     asyncio.run(main())