Compare commits
32 Commits
unclecode-
...
run-many-d
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4dfd270161 | ||
|
|
8c08521301 | ||
|
|
462d5765e2 | ||
|
|
6eeb2e4076 | ||
|
|
0094cac675 | ||
|
|
4ab0893ffb | ||
|
|
6432ff1257 | ||
|
|
5358ac0fc2 | ||
|
|
a24799918c | ||
|
|
a31d7b86be | ||
|
|
7884a98be7 | ||
|
|
6e3c048328 | ||
|
|
b750542e6d | ||
|
|
dc36997a08 | ||
|
|
1630fbdafe | ||
|
|
9547bada3a | ||
|
|
9d69fce834 | ||
|
|
c6a605ccce | ||
|
|
4aeb7ef9ad | ||
|
|
a68cbb232b | ||
|
|
f78c46446b | ||
|
|
1b72880007 | ||
|
|
29f7915b79 | ||
|
|
2327db6fdc | ||
|
|
3a234ec950 | ||
|
|
9e89d27fcd | ||
|
|
b3ec7ce960 | ||
|
|
baee4949d3 | ||
|
|
9c58e4ce2e | ||
|
|
df6a6d5f4f | ||
|
|
e896c08f9c | ||
|
|
56bc3c6e45 |
3
.gitignore
vendored
3
.gitignore
vendored
@@ -255,3 +255,6 @@ continue_config.json
|
||||
|
||||
.llm.env
|
||||
.private/
|
||||
|
||||
CLAUDE_MONITOR.md
|
||||
CLAUDE.md
|
||||
33
CHANGELOG.md
33
CHANGELOG.md
@@ -5,6 +5,39 @@ All notable changes to Crawl4AI will be documented in this file.
|
||||
The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
|
||||
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
|
||||
|
||||
## Version 0.5.0.post5 (2025-03-14)
|
||||
|
||||
### Added
|
||||
|
||||
- *(crawler)* Add experimental parameters dictionary to CrawlerRunConfig to support beta features
|
||||
- *(tables)* Add comprehensive table detection and extraction functionality with scoring system
|
||||
- *(monitor)* Add real-time crawler monitoring system with memory management
|
||||
- *(content)* Add target_elements parameter for selective content extraction
|
||||
- *(browser)* Add standalone CDP browser launch capability
|
||||
- *(schema)* Add preprocess_html_for_schema utility for better HTML cleaning
|
||||
- *(api)* Add special handling for single URL requests in Docker API
|
||||
|
||||
### Changed
|
||||
|
||||
- *(filters)* Add reverse option to URLPatternFilter for inverting filter logic
|
||||
- *(browser)* Make CSP nonce headers optional via experimental config
|
||||
- *(browser)* Remove default cookie injection from page initialization
|
||||
- *(crawler)* Optimize response handling for single-URL processing
|
||||
- *(api)* Refactor crawl request handling to streamline processing
|
||||
- *(config)* Update default provider to gpt-4o
|
||||
- *(cache)* Change default cache_mode from aggressive to bypass in examples
|
||||
|
||||
### Fixed
|
||||
|
||||
- *(browser)* Clean up browser context creation code
|
||||
- *(api)* Improve code formatting in API handler
|
||||
|
||||
### Breaking Changes
|
||||
|
||||
- WebScrapingStrategy no longer returns 'scraped_html' in its output dictionary
|
||||
- Table extraction logic has been modified to better handle thead/tbody structures
|
||||
- Default cookie injection has been removed from page initialization
|
||||
|
||||
## Version 0.5.0 (2025-03-02)
|
||||
|
||||
### Added
|
||||
|
||||
@@ -420,7 +420,7 @@ if __name__ == "__main__":
|
||||
```python
|
||||
import os
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
@@ -436,7 +436,7 @@ async def main():
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
# Here you can use any provider that Litellm library supports, for instance: ollama/qwen2
|
||||
# provider="ollama/qwen2", api_token="no-token",
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
|
||||
schema=OpenAIModelFee.schema(),
|
||||
extraction_type="schema",
|
||||
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
||||
|
||||
@@ -2,7 +2,8 @@
|
||||
import warnings
|
||||
|
||||
from .async_webcrawler import AsyncWebCrawler, CacheMode
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig, LLMConfig
|
||||
|
||||
from .content_scraping_strategy import (
|
||||
ContentScrapingStrategy,
|
||||
WebScrapingStrategy,
|
||||
@@ -22,6 +23,7 @@ from .extraction_strategy import (
|
||||
CosineStrategy,
|
||||
JsonCssExtractionStrategy,
|
||||
JsonXPathExtractionStrategy,
|
||||
JsonLxmlExtractionStrategy
|
||||
)
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
from .markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
@@ -31,13 +33,12 @@ from .content_filter_strategy import (
|
||||
LLMContentFilter,
|
||||
RelevantContentFilter,
|
||||
)
|
||||
from .models import CrawlResult, MarkdownGenerationResult
|
||||
from .models import CrawlResult, MarkdownGenerationResult, DisplayMode
|
||||
from .components.crawler_monitor import CrawlerMonitor
|
||||
from .async_dispatcher import (
|
||||
MemoryAdaptiveDispatcher,
|
||||
SemaphoreDispatcher,
|
||||
RateLimiter,
|
||||
CrawlerMonitor,
|
||||
DisplayMode,
|
||||
BaseDispatcher,
|
||||
)
|
||||
from .docker_client import Crawl4aiDockerClient
|
||||
@@ -47,8 +48,9 @@ from .deep_crawling import (
|
||||
DeepCrawlStrategy,
|
||||
BFSDeepCrawlStrategy,
|
||||
FilterChain,
|
||||
ContentTypeFilter,
|
||||
URLPatternFilter,
|
||||
DomainFilter,
|
||||
ContentTypeFilter,
|
||||
URLFilter,
|
||||
FilterStats,
|
||||
SEOFilter,
|
||||
@@ -68,11 +70,13 @@ __all__ = [
|
||||
"AsyncLogger",
|
||||
"AsyncWebCrawler",
|
||||
"BrowserProfiler",
|
||||
"LLMConfig",
|
||||
"DeepCrawlStrategy",
|
||||
"BFSDeepCrawlStrategy",
|
||||
"BestFirstCrawlingStrategy",
|
||||
"DFSDeepCrawlStrategy",
|
||||
"FilterChain",
|
||||
"URLPatternFilter",
|
||||
"ContentTypeFilter",
|
||||
"DomainFilter",
|
||||
"FilterStats",
|
||||
@@ -99,6 +103,7 @@ __all__ = [
|
||||
"CosineStrategy",
|
||||
"JsonCssExtractionStrategy",
|
||||
"JsonXPathExtractionStrategy",
|
||||
"JsonLxmlExtractionStrategy",
|
||||
"ChunkingStrategy",
|
||||
"RegexChunking",
|
||||
"DefaultMarkdownGenerator",
|
||||
|
||||
@@ -1,2 +1,2 @@
|
||||
# crawl4ai/_version.py
|
||||
__version__ = "0.5.0"
|
||||
__version__ = "0.5.0.post4"
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import os
|
||||
from .config import (
|
||||
DEFAULT_PROVIDER,
|
||||
DEFAULT_PROVIDER_API_KEY,
|
||||
MIN_WORD_THRESHOLD,
|
||||
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
||||
PROVIDER_MODELS,
|
||||
@@ -11,19 +12,27 @@ from .config import (
|
||||
)
|
||||
|
||||
from .user_agent_generator import UAGen, ValidUAGenerator # , OnlineUAGenerator
|
||||
from .extraction_strategy import ExtractionStrategy
|
||||
from .extraction_strategy import ExtractionStrategy, LLMExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy, RegexChunking
|
||||
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
|
||||
from .deep_crawling import DeepCrawlStrategy
|
||||
from typing import Union, List
|
||||
|
||||
from .cache_context import CacheMode
|
||||
from .proxy_strategy import ProxyRotationStrategy
|
||||
|
||||
from typing import Union, List
|
||||
import inspect
|
||||
from typing import Any, Dict, Optional
|
||||
from enum import Enum
|
||||
|
||||
from .proxy_strategy import ProxyConfig
|
||||
try:
|
||||
from .browser.docker_config import DockerConfig
|
||||
except ImportError:
|
||||
DockerConfig = None
|
||||
|
||||
|
||||
def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
|
||||
"""
|
||||
@@ -164,6 +173,12 @@ class BrowserConfig:
|
||||
Default: "chromium".
|
||||
headless (bool): Whether to run the browser in headless mode (no visible GUI).
|
||||
Default: True.
|
||||
browser_mode (str): Determines how the browser should be initialized:
|
||||
"builtin" - use the builtin CDP browser running in background
|
||||
"dedicated" - create a new dedicated browser instance each time
|
||||
"custom" - use explicit CDP settings provided in cdp_url
|
||||
"docker" - run browser in Docker container with isolation
|
||||
Default: "dedicated"
|
||||
use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
|
||||
advanced manipulation. Default: False.
|
||||
cdp_url (str): URL for the Chrome DevTools Protocol (CDP) endpoint. Default: "ws://localhost:9222/devtools/browser/".
|
||||
@@ -178,8 +193,10 @@ class BrowserConfig:
|
||||
is "chromium". Default: "chromium".
|
||||
proxy (Optional[str]): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used.
|
||||
Default: None.
|
||||
proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||
If None, no additional proxy config. Default: None.
|
||||
docker_config (DockerConfig or dict or None): Configuration for Docker-based browser automation.
|
||||
Contains settings for Docker container operation. Default: None.
|
||||
viewport_width (int): Default viewport width for pages. Default: 1080.
|
||||
viewport_height (int): Default viewport height for pages. Default: 600.
|
||||
viewport (dict): Default viewport dimensions for pages. If set, overrides viewport_width and viewport_height.
|
||||
@@ -190,7 +207,7 @@ class BrowserConfig:
|
||||
Default: False.
|
||||
downloads_path (str or None): Directory to store downloaded files. If None and accept_downloads is True,
|
||||
a default path will be created. Default: None.
|
||||
storage_state (str or dict or None): Path or object describing storage state (cookies, localStorage).
|
||||
storage_state (str or dict or None): An in-memory storage state (cookies, localStorage).
|
||||
Default: None.
|
||||
ignore_https_errors (bool): Ignore HTTPS certificate errors. Default: True.
|
||||
java_script_enabled (bool): Enable JavaScript execution in pages. Default: True.
|
||||
@@ -216,6 +233,7 @@ class BrowserConfig:
|
||||
self,
|
||||
browser_type: str = "chromium",
|
||||
headless: bool = True,
|
||||
browser_mode: str = "dedicated",
|
||||
use_managed_browser: bool = False,
|
||||
cdp_url: str = None,
|
||||
use_persistent_context: bool = False,
|
||||
@@ -223,7 +241,8 @@ class BrowserConfig:
|
||||
chrome_channel: str = "chromium",
|
||||
channel: str = "chromium",
|
||||
proxy: str = None,
|
||||
proxy_config: dict = None,
|
||||
proxy_config: Union[ProxyConfig, dict, None] = None,
|
||||
docker_config: Union["DockerConfig", dict, None] = None,
|
||||
viewport_width: int = 1080,
|
||||
viewport_height: int = 600,
|
||||
viewport: dict = None,
|
||||
@@ -252,6 +271,7 @@ class BrowserConfig:
|
||||
):
|
||||
self.browser_type = browser_type
|
||||
self.headless = headless
|
||||
self.browser_mode = browser_mode
|
||||
self.use_managed_browser = use_managed_browser
|
||||
self.cdp_url = cdp_url
|
||||
self.use_persistent_context = use_persistent_context
|
||||
@@ -263,6 +283,12 @@ class BrowserConfig:
|
||||
self.chrome_channel = ""
|
||||
self.proxy = proxy
|
||||
self.proxy_config = proxy_config
|
||||
|
||||
# Handle docker configuration
|
||||
if isinstance(docker_config, dict) and DockerConfig is not None:
|
||||
self.docker_config = DockerConfig.from_kwargs(docker_config)
|
||||
else:
|
||||
self.docker_config = docker_config
|
||||
self.viewport_width = viewport_width
|
||||
self.viewport_height = viewport_height
|
||||
self.viewport = viewport
|
||||
@@ -285,6 +311,7 @@ class BrowserConfig:
|
||||
self.sleep_on_close = sleep_on_close
|
||||
self.verbose = verbose
|
||||
self.debugging_port = debugging_port
|
||||
self.host = host
|
||||
|
||||
fa_user_agenr_generator = ValidUAGenerator()
|
||||
if self.user_agent_mode == "random":
|
||||
@@ -297,6 +324,22 @@ class BrowserConfig:
|
||||
self.browser_hint = UAGen.generate_client_hints(self.user_agent)
|
||||
self.headers.setdefault("sec-ch-ua", self.browser_hint)
|
||||
|
||||
# Set appropriate browser management flags based on browser_mode
|
||||
if self.browser_mode == "builtin":
|
||||
# Builtin mode uses managed browser connecting to builtin CDP endpoint
|
||||
self.use_managed_browser = True
|
||||
# cdp_url will be set later by browser_manager
|
||||
elif self.browser_mode == "docker":
|
||||
# Docker mode uses managed browser with CDP to connect to browser in container
|
||||
self.use_managed_browser = True
|
||||
# cdp_url will be set later by docker browser strategy
|
||||
elif self.browser_mode == "custom" and self.cdp_url:
|
||||
# Custom mode with explicit CDP URL
|
||||
self.use_managed_browser = True
|
||||
elif self.browser_mode == "dedicated":
|
||||
# Dedicated mode uses a new browser instance each time
|
||||
pass
|
||||
|
||||
# If persistent context is requested, ensure managed browser is enabled
|
||||
if self.use_persistent_context:
|
||||
self.use_managed_browser = True
|
||||
@@ -306,6 +349,7 @@ class BrowserConfig:
|
||||
return BrowserConfig(
|
||||
browser_type=kwargs.get("browser_type", "chromium"),
|
||||
headless=kwargs.get("headless", True),
|
||||
browser_mode=kwargs.get("browser_mode", "dedicated"),
|
||||
use_managed_browser=kwargs.get("use_managed_browser", False),
|
||||
cdp_url=kwargs.get("cdp_url"),
|
||||
use_persistent_context=kwargs.get("use_persistent_context", False),
|
||||
@@ -313,7 +357,8 @@ class BrowserConfig:
|
||||
chrome_channel=kwargs.get("chrome_channel", "chromium"),
|
||||
channel=kwargs.get("channel", "chromium"),
|
||||
proxy=kwargs.get("proxy"),
|
||||
proxy_config=kwargs.get("proxy_config"),
|
||||
proxy_config=kwargs.get("proxy_config", None),
|
||||
docker_config=kwargs.get("docker_config", None),
|
||||
viewport_width=kwargs.get("viewport_width", 1080),
|
||||
viewport_height=kwargs.get("viewport_height", 600),
|
||||
accept_downloads=kwargs.get("accept_downloads", False),
|
||||
@@ -333,12 +378,15 @@ class BrowserConfig:
|
||||
text_mode=kwargs.get("text_mode", False),
|
||||
light_mode=kwargs.get("light_mode", False),
|
||||
extra_args=kwargs.get("extra_args", []),
|
||||
debugging_port=kwargs.get("debugging_port", 9222),
|
||||
host=kwargs.get("host", "localhost"),
|
||||
)
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
result = {
|
||||
"browser_type": self.browser_type,
|
||||
"headless": self.headless,
|
||||
"browser_mode": self.browser_mode,
|
||||
"use_managed_browser": self.use_managed_browser,
|
||||
"cdp_url": self.cdp_url,
|
||||
"use_persistent_context": self.use_persistent_context,
|
||||
@@ -365,7 +413,17 @@ class BrowserConfig:
|
||||
"sleep_on_close": self.sleep_on_close,
|
||||
"verbose": self.verbose,
|
||||
"debugging_port": self.debugging_port,
|
||||
"host": self.host,
|
||||
}
|
||||
|
||||
# Include docker_config if it exists
|
||||
if hasattr(self, "docker_config") and self.docker_config is not None:
|
||||
if hasattr(self.docker_config, "to_dict"):
|
||||
result["docker_config"] = self.docker_config.to_dict()
|
||||
else:
|
||||
result["docker_config"] = self.docker_config
|
||||
|
||||
return result
|
||||
|
||||
def clone(self, **kwargs):
|
||||
"""Create a copy of this configuration with updated values.
|
||||
@@ -497,6 +555,15 @@ class CrawlerRunConfig():
|
||||
Default: False.
|
||||
css_selector (str or None): CSS selector to extract a specific portion of the page.
|
||||
Default: None.
|
||||
|
||||
target_elements (list of str or None): List of CSS selectors for specific elements for Markdown generation
|
||||
and structured data extraction. When you set this, only the contents
|
||||
of these elements are processed for extraction and Markdown generation.
|
||||
If you do not set any value, the entire page is processed.
|
||||
The difference between this and css_selector is that this will shrink
|
||||
the initial raw HTML to the selected element, while this will only affect
|
||||
the extraction and Markdown generation.
|
||||
Default: None
|
||||
excluded_tags (list of str or None): List of HTML tags to exclude from processing.
|
||||
Default: None.
|
||||
excluded_selector (str or None): CSS selector to exclude from processing.
|
||||
@@ -513,7 +580,7 @@ class CrawlerRunConfig():
|
||||
Default: "lxml".
|
||||
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
|
||||
Default: WebScrapingStrategy.
|
||||
proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||
proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
|
||||
If None, no additional proxy config. Default: None.
|
||||
|
||||
# SSL Parameters
|
||||
@@ -593,6 +660,8 @@ class CrawlerRunConfig():
|
||||
Default: IMAGE_SCORE_THRESHOLD (e.g., 3).
|
||||
exclude_external_images (bool): If True, exclude all external images from processing.
|
||||
Default: False.
|
||||
table_score_threshold (int): Minimum score threshold for processing a table.
|
||||
Default: 7.
|
||||
|
||||
# Link and Domain Handling Parameters
|
||||
exclude_social_media_domains (list of str): List of domains to exclude for social media links.
|
||||
@@ -634,6 +703,12 @@ class CrawlerRunConfig():
|
||||
user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
|
||||
Default: None.
|
||||
|
||||
# Experimental Parameters
|
||||
experimental (dict): Dictionary containing experimental parameters that are in beta phase.
|
||||
This allows passing temporary features that are not yet fully integrated
|
||||
into the main parameter set.
|
||||
Default: None.
|
||||
|
||||
url: str = None # This is not a compulsory parameter
|
||||
"""
|
||||
|
||||
@@ -646,6 +721,7 @@ class CrawlerRunConfig():
|
||||
markdown_generator: MarkdownGenerationStrategy = None,
|
||||
only_text: bool = False,
|
||||
css_selector: str = None,
|
||||
target_elements: List[str] = None,
|
||||
excluded_tags: list = None,
|
||||
excluded_selector: str = None,
|
||||
keep_data_attributes: bool = False,
|
||||
@@ -654,7 +730,7 @@ class CrawlerRunConfig():
|
||||
prettiify: bool = False,
|
||||
parser_type: str = "lxml",
|
||||
scraping_strategy: ContentScrapingStrategy = None,
|
||||
proxy_config: dict = None,
|
||||
proxy_config: Union[ProxyConfig, dict, None] = None,
|
||||
proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
|
||||
# SSL Parameters
|
||||
fetch_ssl_certificate: bool = False,
|
||||
@@ -694,6 +770,7 @@ class CrawlerRunConfig():
|
||||
pdf: bool = False,
|
||||
image_description_min_word_threshold: int = IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
|
||||
image_score_threshold: int = IMAGE_SCORE_THRESHOLD,
|
||||
table_score_threshold: int = 7,
|
||||
exclude_external_images: bool = False,
|
||||
# Link and Domain Handling Parameters
|
||||
exclude_social_media_domains: list = None,
|
||||
@@ -714,6 +791,8 @@ class CrawlerRunConfig():
|
||||
user_agent_generator_config: dict = {},
|
||||
# Deep Crawl Parameters
|
||||
deep_crawl_strategy: Optional[DeepCrawlStrategy] = None,
|
||||
# Experimental Parameters
|
||||
experimental: Dict[str, Any] = None,
|
||||
):
|
||||
# TODO: Planning to set properties dynamically based on the __init__ signature
|
||||
self.url = url
|
||||
@@ -725,6 +804,7 @@ class CrawlerRunConfig():
|
||||
self.markdown_generator = markdown_generator
|
||||
self.only_text = only_text
|
||||
self.css_selector = css_selector
|
||||
self.target_elements = target_elements or []
|
||||
self.excluded_tags = excluded_tags or []
|
||||
self.excluded_selector = excluded_selector or ""
|
||||
self.keep_data_attributes = keep_data_attributes
|
||||
@@ -779,6 +859,7 @@ class CrawlerRunConfig():
|
||||
self.image_description_min_word_threshold = image_description_min_word_threshold
|
||||
self.image_score_threshold = image_score_threshold
|
||||
self.exclude_external_images = exclude_external_images
|
||||
self.table_score_threshold = table_score_threshold
|
||||
|
||||
# Link and Domain Handling Parameters
|
||||
self.exclude_social_media_domains = (
|
||||
@@ -825,6 +906,9 @@ class CrawlerRunConfig():
|
||||
|
||||
# Deep Crawl Parameters
|
||||
self.deep_crawl_strategy = deep_crawl_strategy
|
||||
|
||||
# Experimental Parameters
|
||||
self.experimental = experimental or {}
|
||||
|
||||
|
||||
def __getattr__(self, name):
|
||||
@@ -854,6 +938,7 @@ class CrawlerRunConfig():
|
||||
markdown_generator=kwargs.get("markdown_generator"),
|
||||
only_text=kwargs.get("only_text", False),
|
||||
css_selector=kwargs.get("css_selector"),
|
||||
target_elements=kwargs.get("target_elements", []),
|
||||
excluded_tags=kwargs.get("excluded_tags", []),
|
||||
excluded_selector=kwargs.get("excluded_selector", ""),
|
||||
keep_data_attributes=kwargs.get("keep_data_attributes", False),
|
||||
@@ -909,6 +994,7 @@ class CrawlerRunConfig():
|
||||
image_score_threshold=kwargs.get(
|
||||
"image_score_threshold", IMAGE_SCORE_THRESHOLD
|
||||
),
|
||||
table_score_threshold=kwargs.get("table_score_threshold", 7),
|
||||
exclude_external_images=kwargs.get("exclude_external_images", False),
|
||||
# Link and Domain Handling Parameters
|
||||
exclude_social_media_domains=kwargs.get(
|
||||
@@ -931,6 +1017,8 @@ class CrawlerRunConfig():
|
||||
# Deep Crawl Parameters
|
||||
deep_crawl_strategy=kwargs.get("deep_crawl_strategy"),
|
||||
url=kwargs.get("url"),
|
||||
# Experimental Parameters
|
||||
experimental=kwargs.get("experimental"),
|
||||
)
|
||||
|
||||
# Create a funciton returns dict of the object
|
||||
@@ -954,6 +1042,7 @@ class CrawlerRunConfig():
|
||||
"markdown_generator": self.markdown_generator,
|
||||
"only_text": self.only_text,
|
||||
"css_selector": self.css_selector,
|
||||
"target_elements": self.target_elements,
|
||||
"excluded_tags": self.excluded_tags,
|
||||
"excluded_selector": self.excluded_selector,
|
||||
"keep_data_attributes": self.keep_data_attributes,
|
||||
@@ -997,6 +1086,7 @@ class CrawlerRunConfig():
|
||||
"pdf": self.pdf,
|
||||
"image_description_min_word_threshold": self.image_description_min_word_threshold,
|
||||
"image_score_threshold": self.image_score_threshold,
|
||||
"table_score_threshold": self.table_score_threshold,
|
||||
"exclude_external_images": self.exclude_external_images,
|
||||
"exclude_social_media_domains": self.exclude_social_media_domains,
|
||||
"exclude_external_links": self.exclude_external_links,
|
||||
@@ -1013,6 +1103,7 @@ class CrawlerRunConfig():
|
||||
"user_agent_generator_config": self.user_agent_generator_config,
|
||||
"deep_crawl_strategy": self.deep_crawl_strategy,
|
||||
"url": self.url,
|
||||
"experimental": self.experimental,
|
||||
}
|
||||
|
||||
def clone(self, **kwargs):
|
||||
@@ -1042,12 +1133,19 @@ class CrawlerRunConfig():
|
||||
return CrawlerRunConfig.from_kwargs(config_dict)
|
||||
|
||||
|
||||
class LlmConfig:
|
||||
class LLMConfig:
|
||||
def __init__(
|
||||
self,
|
||||
provider: str = DEFAULT_PROVIDER,
|
||||
api_token: Optional[str] = None,
|
||||
base_url: Optional[str] = None,
|
||||
temprature: Optional[float] = None,
|
||||
max_tokens: Optional[int] = None,
|
||||
top_p: Optional[float] = None,
|
||||
frequency_penalty: Optional[float] = None,
|
||||
presence_penalty: Optional[float] = None,
|
||||
stop: Optional[List[str]] = None,
|
||||
n: Optional[int] = None,
|
||||
):
|
||||
"""Configuaration class for LLM provider and API token."""
|
||||
self.provider = provider
|
||||
@@ -1057,24 +1155,44 @@ class LlmConfig:
|
||||
self.api_token = os.getenv(api_token[4:])
|
||||
else:
|
||||
self.api_token = PROVIDER_MODELS.get(provider, "no-token") or os.getenv(
|
||||
"OPENAI_API_KEY"
|
||||
DEFAULT_PROVIDER_API_KEY
|
||||
)
|
||||
self.base_url = base_url
|
||||
|
||||
self.temprature = temprature
|
||||
self.max_tokens = max_tokens
|
||||
self.top_p = top_p
|
||||
self.frequency_penalty = frequency_penalty
|
||||
self.presence_penalty = presence_penalty
|
||||
self.stop = stop
|
||||
self.n = n
|
||||
|
||||
@staticmethod
|
||||
def from_kwargs(kwargs: dict) -> "LlmConfig":
|
||||
return LlmConfig(
|
||||
def from_kwargs(kwargs: dict) -> "LLMConfig":
|
||||
return LLMConfig(
|
||||
provider=kwargs.get("provider", DEFAULT_PROVIDER),
|
||||
api_token=kwargs.get("api_token"),
|
||||
base_url=kwargs.get("base_url"),
|
||||
temprature=kwargs.get("temprature"),
|
||||
max_tokens=kwargs.get("max_tokens"),
|
||||
top_p=kwargs.get("top_p"),
|
||||
frequency_penalty=kwargs.get("frequency_penalty"),
|
||||
presence_penalty=kwargs.get("presence_penalty"),
|
||||
stop=kwargs.get("stop"),
|
||||
n=kwargs.get("n")
|
||||
)
|
||||
|
||||
def to_dict(self):
|
||||
return {
|
||||
"provider": self.provider,
|
||||
"api_token": self.api_token,
|
||||
"base_url": self.base_url
|
||||
"base_url": self.base_url,
|
||||
"temprature": self.temprature,
|
||||
"max_tokens": self.max_tokens,
|
||||
"top_p": self.top_p,
|
||||
"frequency_penalty": self.frequency_penalty,
|
||||
"presence_penalty": self.presence_penalty,
|
||||
"stop": self.stop,
|
||||
"n": self.n
|
||||
}
|
||||
|
||||
def clone(self, **kwargs):
|
||||
@@ -1084,8 +1202,10 @@ class LlmConfig:
|
||||
**kwargs: Key-value pairs of configuration options to update
|
||||
|
||||
Returns:
|
||||
LLMConfig: A new instance with the specified updates
|
||||
llm_config: A new instance with the specified updates
|
||||
"""
|
||||
config_dict = self.to_dict()
|
||||
config_dict.update(kwargs)
|
||||
return LlmConfig.from_kwargs(config_dict)
|
||||
return LLMConfig.from_kwargs(config_dict)
|
||||
|
||||
|
||||
|
||||
@@ -507,10 +507,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
# Get page for session
|
||||
page, context = await self.browser_manager.get_page(crawlerRunConfig=config)
|
||||
|
||||
# await page.goto(URL)
|
||||
|
||||
# Add default cookie
|
||||
await context.add_cookies(
|
||||
[{"name": "cookiesEnabled", "value": "true", "url": url}]
|
||||
)
|
||||
# await context.add_cookies(
|
||||
# [{"name": "cookiesEnabled", "value": "true", "url": url}]
|
||||
# )
|
||||
|
||||
# Handle navigator overrides
|
||||
if config.override_navigator or config.simulate_user or config.magic:
|
||||
@@ -562,14 +564,15 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
|
||||
try:
|
||||
# Generate a unique nonce for this request
|
||||
nonce = hashlib.sha256(os.urandom(32)).hexdigest()
|
||||
if config.experimental.get("use_csp_nonce", False):
|
||||
nonce = hashlib.sha256(os.urandom(32)).hexdigest()
|
||||
|
||||
# Add CSP headers to the request
|
||||
await page.set_extra_http_headers(
|
||||
{
|
||||
"Content-Security-Policy": f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'"
|
||||
}
|
||||
)
|
||||
# Add CSP headers to the request
|
||||
await page.set_extra_http_headers(
|
||||
{
|
||||
"Content-Security-Policy": f"default-src 'self'; script-src 'self' 'nonce-{nonce}' 'strict-dynamic'"
|
||||
}
|
||||
)
|
||||
|
||||
response = await page.goto(
|
||||
url, wait_until=config.wait_until, timeout=config.page_timeout
|
||||
@@ -767,6 +770,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
# Handle wait_for condition
|
||||
# Todo: Decide how to handle this
|
||||
if not config.wait_for and config.css_selector and False:
|
||||
# if not config.wait_for and config.css_selector:
|
||||
config.wait_for = f"css:{config.css_selector}"
|
||||
|
||||
if config.wait_for:
|
||||
@@ -806,8 +810,28 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if config.remove_overlay_elements:
|
||||
await self.remove_overlay_elements(page)
|
||||
|
||||
# Get final HTML content
|
||||
html = await page.content()
|
||||
if config.css_selector:
|
||||
try:
|
||||
# Handle comma-separated selectors by splitting them
|
||||
selectors = [s.strip() for s in config.css_selector.split(',')]
|
||||
html_parts = []
|
||||
|
||||
for selector in selectors:
|
||||
try:
|
||||
content = await page.evaluate(f"document.querySelector('{selector}')?.outerHTML || ''")
|
||||
html_parts.append(content)
|
||||
except Error as e:
|
||||
print(f"Warning: Could not get content for selector '{selector}': {str(e)}")
|
||||
|
||||
# Wrap in a div to create a valid HTML structure
|
||||
html = f"<div class='crawl4ai-result'>\n" + "\n".join(html_parts) + "\n</div>"
|
||||
except Error as e:
|
||||
raise RuntimeError(f"Failed to extract HTML content: {str(e)}")
|
||||
else:
|
||||
html = await page.content()
|
||||
|
||||
# # Get final HTML content
|
||||
# html = await page.content()
|
||||
await self.execute_hook(
|
||||
"before_return_html", page=page, html=html, context=context, config=config
|
||||
)
|
||||
|
||||
@@ -4,19 +4,14 @@ import aiosqlite
|
||||
import asyncio
|
||||
from typing import Optional, Dict
|
||||
from contextlib import asynccontextmanager
|
||||
import logging
|
||||
import json # Added for serialization/deserialization
|
||||
from .utils import ensure_content_dirs, generate_content_hash
|
||||
import json
|
||||
from .models import CrawlResult, MarkdownGenerationResult, StringCompatibleMarkdown
|
||||
import aiofiles
|
||||
from .utils import VersionManager
|
||||
from .async_logger import AsyncLogger
|
||||
from .utils import get_error_context, create_box_message
|
||||
|
||||
# Set up logging
|
||||
# logging.basicConfig(level=logging.INFO)
|
||||
# logger = logging.getLogger(__name__)
|
||||
# logger.setLevel(logging.INFO)
|
||||
from .utils import ensure_content_dirs, generate_content_hash
|
||||
from .utils import VersionManager
|
||||
from .utils import get_error_context, create_box_message
|
||||
|
||||
base_directory = DB_PATH = os.path.join(
|
||||
os.getenv("CRAWL4_AI_BASE_DIRECTORY", Path.home()), ".crawl4ai"
|
||||
|
||||
@@ -1,20 +1,18 @@
|
||||
from typing import Dict, Optional, List, Tuple
|
||||
from typing import Dict, Optional, List, Tuple, Union
|
||||
from .async_configs import CrawlerRunConfig
|
||||
from .models import (
|
||||
CrawlResult,
|
||||
CrawlerTaskResult,
|
||||
CrawlStatus,
|
||||
DisplayMode,
|
||||
CrawlStats,
|
||||
DomainState,
|
||||
)
|
||||
|
||||
from rich.live import Live
|
||||
from rich.table import Table
|
||||
from rich.console import Console
|
||||
from rich import box
|
||||
from datetime import timedelta
|
||||
from .components.crawler_monitor import CrawlerMonitor
|
||||
|
||||
from .types import AsyncWebCrawler
|
||||
|
||||
from collections.abc import AsyncGenerator
|
||||
|
||||
import time
|
||||
import psutil
|
||||
import asyncio
|
||||
@@ -24,8 +22,6 @@ from urllib.parse import urlparse
|
||||
import random
|
||||
from abc import ABC, abstractmethod
|
||||
|
||||
from math import inf as infinity
|
||||
|
||||
|
||||
class RateLimiter:
|
||||
def __init__(
|
||||
@@ -87,201 +83,6 @@ class RateLimiter:
|
||||
return True
|
||||
|
||||
|
||||
class CrawlerMonitor:
    """Rich-based live terminal dashboard for crawl progress.

    Tracks per-task ``CrawlStats`` keyed by task id and renders them with a
    ``rich.live.Live`` display, either one row per task (``DisplayMode.DETAILED``)
    or a single summary table (``DisplayMode.AGGREGATED``).

    NOTE(review): this module also imports ``CrawlerMonitor`` from
    ``.components.crawler_monitor``; confirm which definition is meant to win.
    """

    def __init__(
        self,
        max_visible_rows: int = 15,
        display_mode: DisplayMode = DisplayMode.DETAILED,
    ):
        """Create the monitor.

        Args:
            max_visible_rows: Maximum number of per-task rows shown in the
                detailed view; remaining tasks are simply not rendered.
            display_mode: DETAILED (row per task) or AGGREGATED (summary only).
        """
        self.console = Console()
        self.max_visible_rows = max_visible_rows
        self.display_mode = display_mode
        # task_id -> CrawlStats for every task ever registered
        self.stats: Dict[str, CrawlStats] = {}
        # Handle to the current process, used to sample RSS for the display
        self.process = psutil.Process()
        self.start_time = time.time()
        # Live display refreshed twice per second
        self.live = Live(self._create_table(), refresh_per_second=2)

    def start(self):
        """Start rendering the live display."""
        self.live.start()

    def stop(self):
        """Stop rendering the live display."""
        self.live.stop()

    def add_task(self, task_id: str, url: str):
        """Register a new task in QUEUED state and refresh the display."""
        self.stats[task_id] = CrawlStats(
            task_id=task_id, url=url, status=CrawlStatus.QUEUED
        )
        self.live.update(self._create_table())

    def update_task(self, task_id: str, **kwargs):
        """Set arbitrary fields on a task's stats and refresh the display.

        Unknown task ids are ignored silently.
        """
        if task_id in self.stats:
            for key, value in kwargs.items():
                setattr(self.stats[task_id], key, value)
            self.live.update(self._create_table())

    def _create_aggregated_table(self) -> Table:
        """Creates a compact table showing only aggregated statistics"""
        table = Table(
            box=box.ROUNDED,
            title="Crawler Status Overview",
            title_style="bold magenta",
            header_style="bold blue",
            show_lines=True,
        )

        # Calculate statistics
        total_tasks = len(self.stats)
        queued = sum(
            1 for stat in self.stats.values() if stat.status == CrawlStatus.QUEUED
        )
        in_progress = sum(
            1 for stat in self.stats.values() if stat.status == CrawlStatus.IN_PROGRESS
        )
        completed = sum(
            1 for stat in self.stats.values() if stat.status == CrawlStatus.COMPLETED
        )
        failed = sum(
            1 for stat in self.stats.values() if stat.status == CrawlStatus.FAILED
        )

        # Memory statistics (RSS of this process, in MB)
        current_memory = self.process.memory_info().rss / (1024 * 1024)
        total_task_memory = sum(stat.memory_usage for stat in self.stats.values())
        peak_memory = max(
            (stat.peak_memory for stat in self.stats.values()), default=0.0
        )

        # Duration since the monitor was created
        duration = time.time() - self.start_time

        # Create status row
        table.add_column("Status", style="bold cyan")
        table.add_column("Count", justify="right")
        table.add_column("Percentage", justify="right")

        table.add_row("Total Tasks", str(total_tasks), "100%")
        table.add_row(
            "[yellow]In Queue[/yellow]",
            str(queued),
            f"{(queued / total_tasks * 100):.1f}%" if total_tasks > 0 else "0%",
        )
        table.add_row(
            "[blue]In Progress[/blue]",
            str(in_progress),
            f"{(in_progress / total_tasks * 100):.1f}%" if total_tasks > 0 else "0%",
        )
        table.add_row(
            "[green]Completed[/green]",
            str(completed),
            f"{(completed / total_tasks * 100):.1f}%" if total_tasks > 0 else "0%",
        )
        table.add_row(
            "[red]Failed[/red]",
            str(failed),
            f"{(failed / total_tasks * 100):.1f}%" if total_tasks > 0 else "0%",
        )

        # Add memory information
        table.add_section()
        table.add_row(
            "[magenta]Current Memory[/magenta]", f"{current_memory:.1f} MB", ""
        )
        table.add_row(
            "[magenta]Total Task Memory[/magenta]", f"{total_task_memory:.1f} MB", ""
        )
        table.add_row(
            "[magenta]Peak Task Memory[/magenta]", f"{peak_memory:.1f} MB", ""
        )
        table.add_row(
            "[yellow]Runtime[/yellow]",
            str(timedelta(seconds=int(duration))),
            "",
        )

        return table

    def _create_detailed_table(self) -> Table:
        """Build the per-task table: one summary row plus up to
        ``max_visible_rows`` task rows (in-progress first, then queued,
        then finished tasks ordered by end time)."""
        table = Table(
            box=box.ROUNDED,
            title="Crawler Performance Monitor",
            title_style="bold magenta",
            header_style="bold blue",
        )

        # Add columns
        table.add_column("Task ID", style="cyan", no_wrap=True)
        table.add_column("URL", style="cyan", no_wrap=True)
        table.add_column("Status", style="bold")
        table.add_column("Memory (MB)", justify="right")
        table.add_column("Peak (MB)", justify="right")
        table.add_column("Duration", justify="right")
        table.add_column("Info", style="italic")

        # Add summary row
        total_memory = sum(stat.memory_usage for stat in self.stats.values())
        active_count = sum(
            1 for stat in self.stats.values() if stat.status == CrawlStatus.IN_PROGRESS
        )
        completed_count = sum(
            1 for stat in self.stats.values() if stat.status == CrawlStatus.COMPLETED
        )
        failed_count = sum(
            1 for stat in self.stats.values() if stat.status == CrawlStatus.FAILED
        )

        table.add_row(
            "[bold yellow]SUMMARY",
            f"Total: {len(self.stats)}",
            f"Active: {active_count}",
            f"{total_memory:.1f}",
            f"{self.process.memory_info().rss / (1024 * 1024):.1f}",
            str(
                timedelta(
                    seconds=int(time.time() - self.start_time)
                )
            ),
            f"✓{completed_count} ✗{failed_count}",
            style="bold",
        )

        table.add_section()

        # Add rows for each task; sort puts IN_PROGRESS first, QUEUED second,
        # then finished tasks (missing end_time sorts last via infinity)
        visible_stats = sorted(
            self.stats.values(),
            key=lambda x: (
                x.status != CrawlStatus.IN_PROGRESS,
                x.status != CrawlStatus.QUEUED,
                x.end_time or infinity,
            ),
        )[: self.max_visible_rows]

        for stat in visible_stats:
            status_style = {
                CrawlStatus.QUEUED: "white",
                CrawlStatus.IN_PROGRESS: "yellow",
                CrawlStatus.COMPLETED: "green",
                CrawlStatus.FAILED: "red",
            }[stat.status]

            table.add_row(
                stat.task_id[:8],  # Show first 8 chars of task ID
                stat.url[:40] + "..." if len(stat.url) > 40 else stat.url,
                f"[{status_style}]{stat.status.value}[/{status_style}]",
                f"{stat.memory_usage:.1f}",
                f"{stat.peak_memory:.1f}",
                stat.duration,
                stat.error_message[:40] if stat.error_message else "",
            )

        return table

    def _create_table(self) -> Table:
        """Creates the appropriate table based on display mode"""
        if self.display_mode == DisplayMode.AGGREGATED:
            return self._create_aggregated_table()
        return self._create_detailed_table()
|
||||
|
||||
|
||||
class BaseDispatcher(ABC):
|
||||
def __init__(
|
||||
@@ -309,7 +110,7 @@ class BaseDispatcher(ABC):
|
||||
async def run_urls(
|
||||
self,
|
||||
urls: List[str],
|
||||
crawler: "AsyncWebCrawler", # noqa: F821
|
||||
crawler: AsyncWebCrawler, # noqa: F821
|
||||
config: CrawlerRunConfig,
|
||||
monitor: Optional[CrawlerMonitor] = None,
|
||||
) -> List[CrawlerTaskResult]:
|
||||
@@ -320,71 +121,189 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
def __init__(
    self,
    memory_threshold_percent: float = 90.0,
    critical_threshold_percent: float = 95.0,  # New critical threshold
    recovery_threshold_percent: float = 85.0,  # New recovery threshold
    check_interval: float = 1.0,
    max_session_permit: int = 20,
    memory_wait_timeout: float = 300.0,  # 5 minutes default timeout
    fairness_timeout: float = 600.0,  # 10 minutes before prioritizing long-waiting URLs
    rate_limiter: Optional[RateLimiter] = None,
    monitor: Optional[CrawlerMonitor] = None,
):
    """Configure the memory-adaptive dispatcher.

    Args:
        memory_threshold_percent: System memory % at which the dispatcher
            stops launching new tasks (memory pressure mode engages).
        critical_threshold_percent: Memory % at which in-flight work may be
            requeued instead of executed.
        recovery_threshold_percent: Memory % below which pressure mode is
            lifted (hysteresis band between this and the pressure threshold).
        check_interval: Seconds between memory samples.
        max_session_permit: Maximum number of concurrent crawl tasks.
        memory_wait_timeout: Seconds to wait under memory pressure before
            giving up.
        fairness_timeout: Wait time after which a queued URL is prioritized
            ahead of everything else to prevent starvation.
        rate_limiter: Optional per-domain rate limiter.
        monitor: Optional CrawlerMonitor for live progress display.
    """
    super().__init__(rate_limiter, monitor)
    self.memory_threshold_percent = memory_threshold_percent
    self.critical_threshold_percent = critical_threshold_percent
    self.recovery_threshold_percent = recovery_threshold_percent
    self.check_interval = check_interval
    self.max_session_permit = max_session_permit
    self.memory_wait_timeout = memory_wait_timeout
    self.fairness_timeout = fairness_timeout
    # FIX: result_queue was previously assigned twice, creating and
    # immediately discarding one asyncio.Queue; a single assignment suffices.
    self.result_queue = asyncio.Queue()
    self.task_queue = asyncio.PriorityQueue()  # Priority queue for better management
    self.memory_pressure_mode = False  # Flag to indicate when we're in memory pressure mode
    self.current_memory_percent = 0.0  # Track current memory usage
|
||||
|
||||
async def _memory_monitor_task(self):
    """Background task to continuously monitor memory usage and update state.

    Runs forever (the caller cancels it when dispatching finishes), sampling
    system memory every ``check_interval`` seconds. Pressure mode toggles
    with hysteresis: it engages at ``memory_threshold_percent`` and only
    disengages once usage drops to ``recovery_threshold_percent``, so the
    dispatcher does not flap around a single threshold.
    """
    while True:
        # System-wide memory usage in percent (not just this process)
        self.current_memory_percent = psutil.virtual_memory().percent

        # Enter memory pressure mode if we cross the threshold
        if not self.memory_pressure_mode and self.current_memory_percent >= self.memory_threshold_percent:
            self.memory_pressure_mode = True
            if self.monitor:
                self.monitor.update_memory_status("PRESSURE")

        # Exit memory pressure mode if we go below recovery threshold
        elif self.memory_pressure_mode and self.current_memory_percent <= self.recovery_threshold_percent:
            self.memory_pressure_mode = False
            if self.monitor:
                self.monitor.update_memory_status("NORMAL")

        # Critical state is reported independently of pressure mode
        # (it may override the PRESSURE status set above in the same pass)
        if self.current_memory_percent >= self.critical_threshold_percent:
            if self.monitor:
                self.monitor.update_memory_status("CRITICAL")
            # We could implement additional memory-saving measures here

        await asyncio.sleep(self.check_interval)
|
||||
|
||||
def _get_priority_score(self, wait_time: float, retry_count: int) -> float:
|
||||
"""Calculate priority score (lower is higher priority)
|
||||
- URLs waiting longer than fairness_timeout get higher priority
|
||||
- More retry attempts decreases priority
|
||||
"""
|
||||
if wait_time > self.fairness_timeout:
|
||||
# High priority for long-waiting URLs
|
||||
return -wait_time
|
||||
# Standard priority based on retries
|
||||
return retry_count
|
||||
|
||||
async def crawl_url(
|
||||
self,
|
||||
url: str,
|
||||
config: CrawlerRunConfig,
|
||||
task_id: str,
|
||||
) -> CrawlerTaskResult:
|
||||
retry_count: int = 0,
|
||||
) -> Union[CrawlerTaskResult, List[CrawlerTaskResult]]:
|
||||
start_time = time.time()
|
||||
error_message = ""
|
||||
memory_usage = peak_memory = 0.0
|
||||
|
||||
|
||||
# Get starting memory for accurate measurement
|
||||
process = psutil.Process()
|
||||
start_memory = process.memory_info().rss / (1024 * 1024)
|
||||
|
||||
try:
|
||||
if self.monitor:
|
||||
self.monitor.update_task(
|
||||
task_id, status=CrawlStatus.IN_PROGRESS, start_time=start_time
|
||||
task_id,
|
||||
status=CrawlStatus.IN_PROGRESS,
|
||||
start_time=start_time,
|
||||
retry_count=retry_count
|
||||
)
|
||||
|
||||
self.concurrent_sessions += 1
|
||||
|
||||
|
||||
if self.rate_limiter:
|
||||
await self.rate_limiter.wait_if_needed(url)
|
||||
|
||||
process = psutil.Process()
|
||||
start_memory = process.memory_info().rss / (1024 * 1024)
|
||||
|
||||
# Check if we're in critical memory state
|
||||
if self.current_memory_percent >= self.critical_threshold_percent:
|
||||
# Requeue this task with increased priority and retry count
|
||||
enqueue_time = time.time()
|
||||
priority = self._get_priority_score(enqueue_time - start_time, retry_count + 1)
|
||||
await self.task_queue.put((priority, (url, task_id, retry_count + 1, enqueue_time)))
|
||||
|
||||
# Update monitoring
|
||||
if self.monitor:
|
||||
self.monitor.update_task(
|
||||
task_id,
|
||||
status=CrawlStatus.QUEUED,
|
||||
error_message="Requeued due to critical memory pressure"
|
||||
)
|
||||
|
||||
# Return placeholder result with requeued status
|
||||
return CrawlerTaskResult(
|
||||
task_id=task_id,
|
||||
url=url,
|
||||
result=CrawlResult(
|
||||
url=url, html="", metadata={"status": "requeued"},
|
||||
success=False, error_message="Requeued due to critical memory pressure"
|
||||
),
|
||||
memory_usage=0,
|
||||
peak_memory=0,
|
||||
start_time=start_time,
|
||||
end_time=time.time(),
|
||||
error_message="Requeued due to critical memory pressure",
|
||||
retry_count=retry_count + 1
|
||||
)
|
||||
|
||||
# Execute the crawl
|
||||
result = await self.crawler.arun(url, config=config, session_id=task_id)
|
||||
|
||||
# Measure memory usage
|
||||
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||
|
||||
memory_usage = peak_memory = end_memory - start_memory
|
||||
|
||||
if self.rate_limiter and result.status_code:
|
||||
|
||||
# Check if we have a container with multiple results (deep crawl result)
|
||||
if isinstance(result, list) or (hasattr(result, '_results') and len(result._results) > 1):
|
||||
# Handle deep crawling results - create a list of task results
|
||||
task_results = []
|
||||
result_list = result if isinstance(result, list) else result._results
|
||||
|
||||
for idx, single_result in enumerate(result_list):
|
||||
# Create individual task result for each crawled page
|
||||
sub_task_id = f"{task_id}_{idx}"
|
||||
single_memory = memory_usage / len(result_list) # Distribute memory usage
|
||||
|
||||
# Only update rate limiter for first result which corresponds to the original URL
|
||||
if idx == 0 and self.rate_limiter and hasattr(single_result, 'status_code') and single_result.status_code:
|
||||
if not self.rate_limiter.update_delay(url, single_result.status_code):
|
||||
error_msg = f"Rate limit retry count exceeded for domain {urlparse(url).netloc}"
|
||||
if self.monitor:
|
||||
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||
|
||||
task_result = CrawlerTaskResult(
|
||||
task_id=sub_task_id,
|
||||
url=single_result.url,
|
||||
result=single_result,
|
||||
memory_usage=single_memory,
|
||||
peak_memory=single_memory,
|
||||
start_time=start_time,
|
||||
end_time=time.time(),
|
||||
error_message=single_result.error_message if not single_result.success else "",
|
||||
retry_count=retry_count
|
||||
)
|
||||
task_results.append(task_result)
|
||||
|
||||
# Update monitor with completion status based on the first/primary result
|
||||
if self.monitor:
|
||||
primary_result = result_list[0]
|
||||
if not primary_result.success:
|
||||
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||
else:
|
||||
self.monitor.update_task(
|
||||
task_id,
|
||||
status=CrawlStatus.COMPLETED,
|
||||
extra_info=f"Deep crawl: {len(result_list)} pages"
|
||||
)
|
||||
|
||||
return task_results
|
||||
|
||||
# Handle single result (original behavior)
|
||||
if self.rate_limiter and hasattr(result, 'status_code') and result.status_code:
|
||||
if not self.rate_limiter.update_delay(url, result.status_code):
|
||||
error_message = f"Rate limit retry count exceeded for domain {urlparse(url).netloc}"
|
||||
if self.monitor:
|
||||
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||
result = CrawlerTaskResult(
|
||||
task_id=task_id,
|
||||
url=url,
|
||||
result=result,
|
||||
memory_usage=memory_usage,
|
||||
peak_memory=peak_memory,
|
||||
start_time=start_time,
|
||||
end_time=time.time(),
|
||||
error_message=error_message,
|
||||
)
|
||||
await self.result_queue.put(result)
|
||||
return result
|
||||
|
||||
|
||||
# Update status based on result
|
||||
if not result.success:
|
||||
error_message = result.error_message
|
||||
if self.monitor:
|
||||
self.monitor.update_task(task_id, status=CrawlStatus.FAILED)
|
||||
elif self.monitor:
|
||||
self.monitor.update_task(task_id, status=CrawlStatus.COMPLETED)
|
||||
|
||||
|
||||
except Exception as e:
|
||||
error_message = str(e)
|
||||
if self.monitor:
|
||||
@@ -392,7 +311,7 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
result = CrawlResult(
|
||||
url=url, html="", metadata={}, success=False, error_message=str(e)
|
||||
)
|
||||
|
||||
|
||||
finally:
|
||||
end_time = time.time()
|
||||
if self.monitor:
|
||||
@@ -402,9 +321,10 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
memory_usage=memory_usage,
|
||||
peak_memory=peak_memory,
|
||||
error_message=error_message,
|
||||
retry_count=retry_count
|
||||
)
|
||||
self.concurrent_sessions -= 1
|
||||
|
||||
|
||||
return CrawlerTaskResult(
|
||||
task_id=task_id,
|
||||
url=url,
|
||||
@@ -414,116 +334,245 @@ class MemoryAdaptiveDispatcher(BaseDispatcher):
|
||||
start_time=start_time,
|
||||
end_time=end_time,
|
||||
error_message=error_message,
|
||||
retry_count=retry_count
|
||||
)
|
||||
|
||||
|
||||
async def run_urls(
|
||||
self,
|
||||
urls: List[str],
|
||||
crawler: "AsyncWebCrawler", # noqa: F821
|
||||
crawler: AsyncWebCrawler,
|
||||
config: CrawlerRunConfig,
|
||||
) -> List[CrawlerTaskResult]:
|
||||
self.crawler = crawler
|
||||
|
||||
|
||||
# Start the memory monitor task
|
||||
memory_monitor = asyncio.create_task(self._memory_monitor_task())
|
||||
|
||||
if self.monitor:
|
||||
self.monitor.start()
|
||||
|
||||
|
||||
results = []
|
||||
|
||||
try:
|
||||
pending_tasks = []
|
||||
active_tasks = []
|
||||
task_queue = []
|
||||
|
||||
for url in urls:
|
||||
task_id = str(uuid.uuid4())
|
||||
if self.monitor:
|
||||
self.monitor.add_task(task_id, url)
|
||||
task_queue.append((url, task_id))
|
||||
|
||||
while task_queue or active_tasks:
|
||||
wait_start_time = time.time()
|
||||
while len(active_tasks) < self.max_session_permit and task_queue:
|
||||
if psutil.virtual_memory().percent >= self.memory_threshold_percent:
|
||||
# Check if we've exceeded the timeout
|
||||
if time.time() - wait_start_time > self.memory_wait_timeout:
|
||||
raise MemoryError(
|
||||
f"Memory usage above threshold ({self.memory_threshold_percent}%) for more than {self.memory_wait_timeout} seconds"
|
||||
)
|
||||
await asyncio.sleep(self.check_interval)
|
||||
continue
|
||||
|
||||
url, task_id = task_queue.pop(0)
|
||||
task = asyncio.create_task(self.crawl_url(url, config, task_id))
|
||||
active_tasks.append(task)
|
||||
|
||||
if not active_tasks:
|
||||
await asyncio.sleep(self.check_interval)
|
||||
continue
|
||||
|
||||
done, pending = await asyncio.wait(
|
||||
active_tasks, return_when=asyncio.FIRST_COMPLETED
|
||||
)
|
||||
|
||||
pending_tasks.extend(done)
|
||||
active_tasks = list(pending)
|
||||
|
||||
return await asyncio.gather(*pending_tasks)
|
||||
finally:
|
||||
if self.monitor:
|
||||
self.monitor.stop()
|
||||
|
||||
async def run_urls_stream(
|
||||
self,
|
||||
urls: List[str],
|
||||
crawler: "AsyncWebCrawler", # noqa: F821
|
||||
config: CrawlerRunConfig,
|
||||
) -> AsyncGenerator[CrawlerTaskResult, None]:
|
||||
self.crawler = crawler
|
||||
if self.monitor:
|
||||
self.monitor.start()
|
||||
|
||||
try:
|
||||
active_tasks = []
|
||||
task_queue = []
|
||||
completed_count = 0
|
||||
total_urls = len(urls)
|
||||
|
||||
# Initialize task queue
|
||||
for url in urls:
|
||||
task_id = str(uuid.uuid4())
|
||||
if self.monitor:
|
||||
self.monitor.add_task(task_id, url)
|
||||
task_queue.append((url, task_id))
|
||||
|
||||
while completed_count < total_urls:
|
||||
# Start new tasks if memory permits
|
||||
while len(active_tasks) < self.max_session_permit and task_queue:
|
||||
if psutil.virtual_memory().percent >= self.memory_threshold_percent:
|
||||
await asyncio.sleep(self.check_interval)
|
||||
continue
|
||||
|
||||
url, task_id = task_queue.pop(0)
|
||||
task = asyncio.create_task(self.crawl_url(url, config, task_id))
|
||||
active_tasks.append(task)
|
||||
|
||||
if not active_tasks and not task_queue:
|
||||
break
|
||||
|
||||
# Wait for any task to complete and yield results
|
||||
# Add to queue with initial priority 0, retry count 0, and current time
|
||||
await self.task_queue.put((0, (url, task_id, 0, time.time())))
|
||||
|
||||
active_tasks = []
|
||||
|
||||
# Process until both queues are empty
|
||||
while not self.task_queue.empty() or active_tasks:
|
||||
# If memory pressure is low, start new tasks
|
||||
if not self.memory_pressure_mode and len(active_tasks) < self.max_session_permit:
|
||||
try:
|
||||
# Try to get a task with timeout to avoid blocking indefinitely
|
||||
priority, (url, task_id, retry_count, enqueue_time) = await asyncio.wait_for(
|
||||
self.task_queue.get(), timeout=0.1
|
||||
)
|
||||
|
||||
# Create and start the task
|
||||
task = asyncio.create_task(
|
||||
self.crawl_url(url, config, task_id, retry_count)
|
||||
)
|
||||
active_tasks.append(task)
|
||||
|
||||
# Update waiting time in monitor
|
||||
if self.monitor:
|
||||
wait_time = time.time() - enqueue_time
|
||||
self.monitor.update_task(
|
||||
task_id,
|
||||
wait_time=wait_time,
|
||||
status=CrawlStatus.IN_PROGRESS
|
||||
)
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
# No tasks in queue, that's fine
|
||||
pass
|
||||
|
||||
# Wait for completion even if queue is starved
|
||||
if active_tasks:
|
||||
done, pending = await asyncio.wait(
|
||||
active_tasks, timeout=0.1, return_when=asyncio.FIRST_COMPLETED
|
||||
)
|
||||
|
||||
# Process completed tasks
|
||||
for completed_task in done:
|
||||
result = await completed_task
|
||||
completed_count += 1
|
||||
yield result
|
||||
task_result = await completed_task
|
||||
|
||||
# Handle both single results and lists of results
|
||||
if isinstance(task_result, list):
|
||||
results.extend(task_result)
|
||||
else:
|
||||
results.append(task_result)
|
||||
|
||||
# Update active tasks list
|
||||
active_tasks = list(pending)
|
||||
else:
|
||||
await asyncio.sleep(self.check_interval)
|
||||
# If no active tasks but still waiting, sleep briefly
|
||||
await asyncio.sleep(self.check_interval / 2)
|
||||
|
||||
# Update priorities for waiting tasks if needed
|
||||
await self._update_queue_priorities()
|
||||
|
||||
return results
|
||||
|
||||
except Exception as e:
|
||||
if self.monitor:
|
||||
self.monitor.update_memory_status(f"QUEUE_ERROR: {str(e)}")
|
||||
|
||||
finally:
|
||||
# Clean up
|
||||
memory_monitor.cancel()
|
||||
if self.monitor:
|
||||
self.monitor.stop()
|
||||
|
||||
async def _update_queue_priorities(self):
|
||||
"""Periodically update priorities of items in the queue to prevent starvation"""
|
||||
# Skip if queue is empty
|
||||
if self.task_queue.empty():
|
||||
return
|
||||
|
||||
# Use a drain-and-refill approach to update all priorities
|
||||
temp_items = []
|
||||
|
||||
# Drain the queue (with a safety timeout to prevent blocking)
|
||||
try:
|
||||
drain_start = time.time()
|
||||
while not self.task_queue.empty() and time.time() - drain_start < 5.0: # 5 second safety timeout
|
||||
try:
|
||||
# Get item from queue with timeout
|
||||
priority, (url, task_id, retry_count, enqueue_time) = await asyncio.wait_for(
|
||||
self.task_queue.get(), timeout=0.1
|
||||
)
|
||||
|
||||
# Calculate new priority based on current wait time
|
||||
current_time = time.time()
|
||||
wait_time = current_time - enqueue_time
|
||||
new_priority = self._get_priority_score(wait_time, retry_count)
|
||||
|
||||
# Store with updated priority
|
||||
temp_items.append((new_priority, (url, task_id, retry_count, enqueue_time)))
|
||||
|
||||
# Update monitoring stats for this task
|
||||
if self.monitor and task_id in self.monitor.stats:
|
||||
self.monitor.update_task(task_id, wait_time=wait_time)
|
||||
|
||||
except asyncio.TimeoutError:
|
||||
# Queue might be empty or very slow
|
||||
break
|
||||
except Exception as e:
|
||||
# If anything goes wrong, make sure we refill the queue with what we've got
|
||||
self.monitor.update_memory_status(f"QUEUE_ERROR: {str(e)}")
|
||||
|
||||
# Calculate queue statistics
|
||||
if temp_items and self.monitor:
|
||||
total_queued = len(temp_items)
|
||||
wait_times = [item[1][3] for item in temp_items]
|
||||
highest_wait_time = time.time() - min(wait_times) if wait_times else 0
|
||||
avg_wait_time = sum(time.time() - t for t in wait_times) / len(wait_times) if wait_times else 0
|
||||
|
||||
# Update queue statistics in monitor
|
||||
self.monitor.update_queue_statistics(
|
||||
total_queued=total_queued,
|
||||
highest_wait_time=highest_wait_time,
|
||||
avg_wait_time=avg_wait_time
|
||||
)
|
||||
|
||||
# Sort by priority (lowest number = highest priority)
|
||||
temp_items.sort(key=lambda x: x[0])
|
||||
|
||||
# Refill the queue with updated priorities
|
||||
for item in temp_items:
|
||||
await self.task_queue.put(item)
|
||||
|
||||
async def run_urls_stream(
    self,
    urls: List[str],
    crawler: AsyncWebCrawler,
    config: CrawlerRunConfig,
) -> AsyncGenerator[CrawlerTaskResult, None]:
    """Crawl ``urls`` concurrently, yielding each result as it completes.

    Tasks start from a priority queue; while ``memory_pressure_mode`` is set
    (by the background memory monitor) no new tasks are launched. A task
    requeued under critical memory pressure returns a placeholder result
    whose error message contains "requeued" — such results are neither
    counted nor yielded, so the loop keeps running until every URL has
    produced a real result.
    """
    self.crawler = crawler

    # Start the background memory monitor; cancelled in the finally block
    memory_monitor = asyncio.create_task(self._memory_monitor_task())

    if self.monitor:
        self.monitor.start()

    try:
        # Initialize task queue
        for url in urls:
            task_id = str(uuid.uuid4())
            if self.monitor:
                self.monitor.add_task(task_id, url)
            # Add to queue with initial priority 0, retry count 0, and current time
            await self.task_queue.put((0, (url, task_id, 0, time.time())))

        active_tasks = []
        completed_count = 0
        total_urls = len(urls)

        while completed_count < total_urls:
            # If memory pressure is low, start new tasks
            if not self.memory_pressure_mode and len(active_tasks) < self.max_session_permit:
                try:
                    # Try to get a task with timeout so an empty queue
                    # doesn't block the yield loop below
                    priority, (url, task_id, retry_count, enqueue_time) = await asyncio.wait_for(
                        self.task_queue.get(), timeout=0.1
                    )

                    # Create and start the task
                    task = asyncio.create_task(
                        self.crawl_url(url, config, task_id, retry_count)
                    )
                    active_tasks.append(task)

                    # Update waiting time in monitor
                    if self.monitor:
                        wait_time = time.time() - enqueue_time
                        self.monitor.update_task(
                            task_id,
                            wait_time=wait_time,
                            status=CrawlStatus.IN_PROGRESS
                        )

                except asyncio.TimeoutError:
                    # No tasks in queue, that's fine
                    pass

            # Process completed tasks and yield results
            if active_tasks:
                done, pending = await asyncio.wait(
                    active_tasks, timeout=0.1, return_when=asyncio.FIRST_COMPLETED
                )

                for completed_task in done:
                    result = await completed_task

                    # Only count as completed if it wasn't requeued.
                    # NOTE(review): crawl_url can return a *list* of results
                    # for deep crawls; result.error_message would fail on a
                    # list — confirm the deep-crawl path never reaches here.
                    if "requeued" not in result.error_message:
                        completed_count += 1
                        yield result

                # Update active tasks list
                active_tasks = list(pending)
            else:
                # If no active tasks but still waiting, sleep briefly
                await asyncio.sleep(self.check_interval / 2)

            # Update priorities for waiting tasks if needed
            await self._update_queue_priorities()

    finally:
        # Clean up
        memory_monitor.cancel()
        if self.monitor:
            self.monitor.stop()
|
||||
|
||||
|
||||
class SemaphoreDispatcher(BaseDispatcher):
|
||||
def __init__(
|
||||
@@ -620,7 +669,7 @@ class SemaphoreDispatcher(BaseDispatcher):
|
||||
|
||||
async def run_urls(
|
||||
self,
|
||||
crawler: "AsyncWebCrawler", # noqa: F821
|
||||
crawler: AsyncWebCrawler, # noqa: F821
|
||||
urls: List[str],
|
||||
config: CrawlerRunConfig,
|
||||
) -> List[CrawlerTaskResult]:
|
||||
@@ -644,4 +693,4 @@ class SemaphoreDispatcher(BaseDispatcher):
|
||||
return await asyncio.gather(*tasks, return_exceptions=True)
|
||||
finally:
|
||||
if self.monitor:
|
||||
self.monitor.stop()
|
||||
self.monitor.stop()
|
||||
@@ -4,7 +4,7 @@ import sys
|
||||
import time
|
||||
from colorama import Fore
|
||||
from pathlib import Path
|
||||
from typing import Optional, List
|
||||
from typing import Optional, List, Generic, TypeVar
|
||||
import json
|
||||
import asyncio
|
||||
|
||||
@@ -13,17 +13,16 @@ from contextlib import asynccontextmanager
|
||||
from .models import CrawlResult, MarkdownGenerationResult, DispatchResult, ScrapingResult
|
||||
from .async_database import async_db_manager
|
||||
from .chunking_strategy import * # noqa: F403
|
||||
from .chunking_strategy import RegexChunking, ChunkingStrategy, IdentityChunking
|
||||
from .chunking_strategy import IdentityChunking
|
||||
from .content_filter_strategy import * # noqa: F403
|
||||
from .content_filter_strategy import RelevantContentFilter
|
||||
from .extraction_strategy import * # noqa: F403
|
||||
from .extraction_strategy import NoExtractionStrategy, ExtractionStrategy
|
||||
from .extraction_strategy import NoExtractionStrategy
|
||||
from .async_crawler_strategy import (
|
||||
AsyncCrawlerStrategy,
|
||||
AsyncPlaywrightCrawlerStrategy,
|
||||
AsyncCrawlResponse,
|
||||
)
|
||||
from .cache_context import CacheMode, CacheContext, _legacy_to_cache_mode
|
||||
from .cache_context import CacheMode, CacheContext
|
||||
from .markdown_generation_strategy import (
|
||||
DefaultMarkdownGenerator,
|
||||
MarkdownGenerationStrategy,
|
||||
@@ -34,7 +33,6 @@ from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .async_dispatcher import * # noqa: F403
|
||||
from .async_dispatcher import BaseDispatcher, MemoryAdaptiveDispatcher, RateLimiter
|
||||
|
||||
from .config import MIN_WORD_THRESHOLD
|
||||
from .utils import (
|
||||
sanitize_input_encode,
|
||||
InvalidCSSSelectorError,
|
||||
@@ -44,17 +42,46 @@ from .utils import (
|
||||
RobotsParser,
|
||||
)
|
||||
|
||||
from typing import Union, AsyncGenerator, TypeVar
|
||||
from typing import Union, AsyncGenerator
|
||||
|
||||
CrawlResultT = TypeVar('CrawlResultT', bound=CrawlResult)
|
||||
RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||
# RunManyReturn = Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||
|
||||
DeepCrawlSingleReturn = Union[List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||
DeepCrawlManyReturn = Union[
|
||||
List[List[CrawlResultT]],
|
||||
AsyncGenerator[CrawlResultT, None],
|
||||
class CrawlResultContainer(Generic[CrawlResultT]):
    """List-like wrapper around one or more crawl results.

    Always stores results as a list internally (``_results``), while
    delegating unknown attribute lookups to the first result so that a
    single-result container can be used as if it were the result itself.
    """

    def __init__(self, results: Union[CrawlResultT, List[CrawlResultT]]):
        # Normalize to a list so every container behaves uniformly.
        self._results = results if isinstance(results, list) else [results]

    def __iter__(self):
        yield from self._results

    def __getitem__(self, index):
        return self._results[index]

    def __len__(self):
        return len(self._results)

    def __getattr__(self, attr):
        # Fall back to the first result for any attribute the container
        # itself does not define.
        if not self._results:
            raise AttributeError(f"{self.__class__.__name__} object has no attribute '{attr}'")
        return getattr(self._results[0], attr)

    def __repr__(self):
        return f"{self.__class__.__name__}({self._results!r})"
|
||||
|
||||
# Redefine the union type. Now synchronous calls always return a container,
|
||||
# while stream mode is handled with an AsyncGenerator.
|
||||
RunManyReturn = Union[
|
||||
CrawlResultContainer[CrawlResultT],
|
||||
AsyncGenerator[CrawlResultT, None]
|
||||
]
|
||||
|
||||
|
||||
|
||||
class AsyncWebCrawler:
|
||||
"""
|
||||
Asynchronous web crawler with flexible caching capabilities.
|
||||
@@ -174,13 +201,35 @@ class AsyncWebCrawler:
|
||||
This is equivalent to using 'async with' but gives more control over the lifecycle.
|
||||
|
||||
This method will:
|
||||
1. Initialize the browser and context
|
||||
2. Perform warmup sequence
|
||||
3. Return the crawler instance for method chaining
|
||||
1. Check for builtin browser if browser_mode is 'builtin'
|
||||
2. Initialize the browser and context
|
||||
3. Perform warmup sequence
|
||||
4. Return the crawler instance for method chaining
|
||||
|
||||
Returns:
|
||||
AsyncWebCrawler: The initialized crawler instance
|
||||
"""
|
||||
# Check for builtin browser if requested
|
||||
if self.browser_config.browser_mode == "builtin" and not self.browser_config.cdp_url:
|
||||
# Import here to avoid circular imports
|
||||
from .browser_profiler import BrowserProfiler
|
||||
profiler = BrowserProfiler(logger=self.logger)
|
||||
|
||||
# Get builtin browser info or launch if needed
|
||||
browser_info = profiler.get_builtin_browser_info()
|
||||
if not browser_info:
|
||||
self.logger.info("Builtin browser not found, launching new instance...", tag="BROWSER")
|
||||
cdp_url = await profiler.launch_builtin_browser()
|
||||
if not cdp_url:
|
||||
self.logger.warning("Failed to launch builtin browser, falling back to dedicated browser", tag="BROWSER")
|
||||
else:
|
||||
self.browser_config.cdp_url = cdp_url
|
||||
self.browser_config.use_managed_browser = True
|
||||
else:
|
||||
self.logger.info(f"Using existing builtin browser at {browser_info.get('cdp_url')}", tag="BROWSER")
|
||||
self.browser_config.cdp_url = browser_info.get('cdp_url')
|
||||
self.browser_config.use_managed_browser = True
|
||||
|
||||
await self.crawler_strategy.__aenter__()
|
||||
await self.awarmup()
|
||||
return self
|
||||
@@ -223,23 +272,6 @@ class AsyncWebCrawler:
|
||||
self,
|
||||
url: str,
|
||||
config: CrawlerRunConfig = None,
|
||||
# Legacy parameters maintained for backwards compatibility
|
||||
# word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
# extraction_strategy: ExtractionStrategy = None,
|
||||
# chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
# content_filter: RelevantContentFilter = None,
|
||||
# cache_mode: Optional[CacheMode] = None,
|
||||
# Deprecated cache parameters
|
||||
# bypass_cache: bool = False,
|
||||
# disable_cache: bool = False,
|
||||
# no_cache_read: bool = False,
|
||||
# no_cache_write: bool = False,
|
||||
# Other legacy parameters
|
||||
# css_selector: str = None,
|
||||
# screenshot: bool = False,
|
||||
# pdf: bool = False,
|
||||
# user_agent: str = None,
|
||||
# verbose=True,
|
||||
**kwargs,
|
||||
) -> RunManyReturn:
|
||||
"""
|
||||
@@ -270,47 +302,17 @@ class AsyncWebCrawler:
|
||||
Returns:
|
||||
CrawlResult: The result of crawling and processing
|
||||
"""
|
||||
crawler_config = config or CrawlerRunConfig()
|
||||
# Auto-start if not ready
|
||||
if not self.ready:
|
||||
await self.start()
|
||||
|
||||
config = config or CrawlerRunConfig()
|
||||
if not isinstance(url, str) or not url:
|
||||
raise ValueError("Invalid URL, make sure the URL is a non-empty string")
|
||||
|
||||
async with self._lock or self.nullcontext():
|
||||
try:
|
||||
self.logger.verbose = crawler_config.verbose
|
||||
# Handle configuration
|
||||
if crawler_config is not None:
|
||||
config = crawler_config
|
||||
else:
|
||||
# Merge all parameters into a single kwargs dict for config creation
|
||||
# config_kwargs = {
|
||||
# "word_count_threshold": word_count_threshold,
|
||||
# "extraction_strategy": extraction_strategy,
|
||||
# "chunking_strategy": chunking_strategy,
|
||||
# "content_filter": content_filter,
|
||||
# "cache_mode": cache_mode,
|
||||
# "bypass_cache": bypass_cache,
|
||||
# "disable_cache": disable_cache,
|
||||
# "no_cache_read": no_cache_read,
|
||||
# "no_cache_write": no_cache_write,
|
||||
# "css_selector": css_selector,
|
||||
# "screenshot": screenshot,
|
||||
# "pdf": pdf,
|
||||
# "verbose": verbose,
|
||||
# **kwargs,
|
||||
# }
|
||||
# config = CrawlerRunConfig.from_kwargs(config_kwargs)
|
||||
pass
|
||||
|
||||
# Handle deprecated cache parameters
|
||||
# if any([bypass_cache, disable_cache, no_cache_read, no_cache_write]):
|
||||
# # Convert legacy parameters if cache_mode not provided
|
||||
# if config.cache_mode is None:
|
||||
# config.cache_mode = _legacy_to_cache_mode(
|
||||
# disable_cache=disable_cache,
|
||||
# bypass_cache=bypass_cache,
|
||||
# no_cache_read=no_cache_read,
|
||||
# no_cache_write=no_cache_write,
|
||||
# )
|
||||
self.logger.verbose = config.verbose
|
||||
|
||||
# Default to ENABLED if no cache mode specified
|
||||
if config.cache_mode is None:
|
||||
@@ -457,7 +459,7 @@ class AsyncWebCrawler:
|
||||
if cache_context.should_write() and not bool(cached_result):
|
||||
await async_db_manager.acache_url(crawl_result)
|
||||
|
||||
return crawl_result
|
||||
return CrawlResultContainer(crawl_result)
|
||||
|
||||
else:
|
||||
self.logger.success(
|
||||
@@ -474,7 +476,7 @@ class AsyncWebCrawler:
|
||||
cached_result.success = bool(html)
|
||||
cached_result.session_id = getattr(config, "session_id", None)
|
||||
cached_result.redirected_url = cached_result.redirected_url or url
|
||||
return cached_result
|
||||
return CrawlResultContainer(cached_result)
|
||||
|
||||
except Exception as e:
|
||||
error_context = get_error_context(sys.exc_info())
|
||||
@@ -492,8 +494,10 @@ class AsyncWebCrawler:
|
||||
tag="ERROR",
|
||||
)
|
||||
|
||||
return CrawlResult(
|
||||
url=url, html="", success=False, error_message=error_message
|
||||
return CrawlResultContainer(
|
||||
CrawlResult(
|
||||
url=url, html="", success=False, error_message=error_message
|
||||
)
|
||||
)
|
||||
|
||||
async def aprocess_html(
|
||||
@@ -534,7 +538,8 @@ class AsyncWebCrawler:
|
||||
scraping_strategy.logger = self.logger
|
||||
|
||||
# Process HTML content
|
||||
params = {k: v for k, v in config.to_dict().items() if k not in ["url"]}
|
||||
params = config.__dict__.copy()
|
||||
params.pop("url", None)
|
||||
# add keys from kwargs to params that doesn't exist in params
|
||||
params.update({k: v for k, v in kwargs.items() if k not in params.keys()})
|
||||
|
||||
@@ -668,18 +673,6 @@ class AsyncWebCrawler:
|
||||
urls: List[str],
|
||||
config: Optional[CrawlerRunConfig] = None,
|
||||
dispatcher: Optional[BaseDispatcher] = None,
|
||||
# Legacy parameters maintained for backwards compatibility
|
||||
word_count_threshold=MIN_WORD_THRESHOLD,
|
||||
extraction_strategy: ExtractionStrategy = None,
|
||||
chunking_strategy: ChunkingStrategy = RegexChunking(),
|
||||
content_filter: RelevantContentFilter = None,
|
||||
cache_mode: Optional[CacheMode] = None,
|
||||
bypass_cache: bool = False,
|
||||
css_selector: str = None,
|
||||
screenshot: bool = False,
|
||||
pdf: bool = False,
|
||||
user_agent: str = None,
|
||||
verbose=True,
|
||||
**kwargs
|
||||
) -> RunManyReturn:
|
||||
"""
|
||||
@@ -712,20 +705,8 @@ class AsyncWebCrawler:
|
||||
):
|
||||
print(f"Processed {result.url}: {len(result.markdown)} chars")
|
||||
"""
|
||||
if config is None:
|
||||
config = CrawlerRunConfig(
|
||||
word_count_threshold=word_count_threshold,
|
||||
extraction_strategy=extraction_strategy,
|
||||
chunking_strategy=chunking_strategy,
|
||||
content_filter=content_filter,
|
||||
cache_mode=cache_mode,
|
||||
bypass_cache=bypass_cache,
|
||||
css_selector=css_selector,
|
||||
screenshot=screenshot,
|
||||
pdf=pdf,
|
||||
verbose=verbose,
|
||||
**kwargs,
|
||||
)
|
||||
config = config or CrawlerRunConfig()
|
||||
|
||||
|
||||
if dispatcher is None:
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
|
||||
10
crawl4ai/browser/__init__.py
Normal file
10
crawl4ai/browser/__init__.py
Normal file
@@ -0,0 +1,10 @@
|
||||
"""Browser management module for Crawl4AI.
|
||||
|
||||
This module provides browser management capabilities using different strategies
|
||||
for browser creation and interaction.
|
||||
"""
|
||||
|
||||
from .manager import BrowserManager
|
||||
from .profiles import BrowserProfileManager
|
||||
|
||||
__all__ = ['BrowserManager', 'BrowserProfileManager']
|
||||
61
crawl4ai/browser/docker/connect.Dockerfile
Normal file
61
crawl4ai/browser/docker/connect.Dockerfile
Normal file
@@ -0,0 +1,61 @@
|
||||
FROM ubuntu:22.04
|
||||
|
||||
# Install dependencies with comprehensive Chromium support
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
gnupg \
|
||||
ca-certificates \
|
||||
fonts-liberation \
|
||||
# Sound support
|
||||
libasound2 \
|
||||
# Accessibility support
|
||||
libatspi2.0-0 \
|
||||
libatk1.0-0 \
|
||||
libatk-bridge2.0-0 \
|
||||
# Graphics and rendering
|
||||
libdrm2 \
|
||||
libgbm1 \
|
||||
libgtk-3-0 \
|
||||
libxcomposite1 \
|
||||
libxdamage1 \
|
||||
libxext6 \
|
||||
libxfixes3 \
|
||||
libxrandr2 \
|
||||
# X11 and window system
|
||||
libx11-6 \
|
||||
libxcb1 \
|
||||
libxkbcommon0 \
|
||||
# Text and internationalization
|
||||
libpango-1.0-0 \
|
||||
libcairo2 \
|
||||
# Printing support
|
||||
libcups2 \
|
||||
# System libraries
|
||||
libdbus-1-3 \
|
||||
libnss3 \
|
||||
libnspr4 \
|
||||
libglib2.0-0 \
|
||||
# Utilities
|
||||
xdg-utils \
|
||||
socat \
|
||||
# Process management
|
||||
procps \
|
||||
# Clean up
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Chrome
|
||||
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
|
||||
echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list && \
|
||||
apt-get update && \
|
||||
apt-get install -y google-chrome-stable && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Create data directory for user data
|
||||
RUN mkdir -p /data && chmod 777 /data
|
||||
|
||||
# Add a startup script
|
||||
COPY start.sh /start.sh
|
||||
RUN chmod +x /start.sh
|
||||
|
||||
# Set entrypoint
|
||||
ENTRYPOINT ["/start.sh"]
|
||||
57
crawl4ai/browser/docker/launch.Dockerfile
Normal file
57
crawl4ai/browser/docker/launch.Dockerfile
Normal file
@@ -0,0 +1,57 @@
|
||||
FROM ubuntu:22.04
|
||||
|
||||
# Install dependencies with comprehensive Chromium support
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
wget \
|
||||
gnupg \
|
||||
ca-certificates \
|
||||
fonts-liberation \
|
||||
# Sound support
|
||||
libasound2 \
|
||||
# Accessibility support
|
||||
libatspi2.0-0 \
|
||||
libatk1.0-0 \
|
||||
libatk-bridge2.0-0 \
|
||||
# Graphics and rendering
|
||||
libdrm2 \
|
||||
libgbm1 \
|
||||
libgtk-3-0 \
|
||||
libxcomposite1 \
|
||||
libxdamage1 \
|
||||
libxext6 \
|
||||
libxfixes3 \
|
||||
libxrandr2 \
|
||||
# X11 and window system
|
||||
libx11-6 \
|
||||
libxcb1 \
|
||||
libxkbcommon0 \
|
||||
# Text and internationalization
|
||||
libpango-1.0-0 \
|
||||
libcairo2 \
|
||||
# Printing support
|
||||
libcups2 \
|
||||
# System libraries
|
||||
libdbus-1-3 \
|
||||
libnss3 \
|
||||
libnspr4 \
|
||||
libglib2.0-0 \
|
||||
# Utilities
|
||||
xdg-utils \
|
||||
socat \
|
||||
# Process management
|
||||
procps \
|
||||
# Clean up
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install Chrome
|
||||
RUN wget -q -O - https://dl-ssl.google.com/linux/linux_signing_key.pub | apt-key add - && \
|
||||
echo "deb [arch=amd64] http://dl.google.com/linux/chrome/deb/ stable main" >> /etc/apt/sources.list.d/google.list && \
|
||||
apt-get update && \
|
||||
apt-get install -y google-chrome-stable && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Create data directory for user data
|
||||
RUN mkdir -p /data && chmod 777 /data
|
||||
|
||||
# Keep container running without starting Chrome
|
||||
CMD ["tail", "-f", "/dev/null"]
|
||||
133
crawl4ai/browser/docker_config.py
Normal file
133
crawl4ai/browser/docker_config.py
Normal file
@@ -0,0 +1,133 @@
|
||||
"""Docker configuration module for Crawl4AI browser automation.
|
||||
|
||||
This module provides configuration classes for Docker-based browser automation,
|
||||
allowing flexible configuration of Docker containers for browsing.
|
||||
"""
|
||||
|
||||
from typing import Dict, List, Optional, Union
|
||||
|
||||
|
||||
class DockerConfig:
|
||||
"""Configuration for Docker-based browser automation.
|
||||
|
||||
This class contains Docker-specific settings to avoid cluttering BrowserConfig.
|
||||
|
||||
Attributes:
|
||||
mode (str): Docker operation mode - "connect" or "launch".
|
||||
- "connect": Uses a container with Chrome already running
|
||||
- "launch": Dynamically configures and starts Chrome in container
|
||||
image (str): Docker image to use. If None, defaults from DockerUtils are used.
|
||||
registry_file (str): Path to container registry file for persistence.
|
||||
persistent (bool): Keep container running after browser closes.
|
||||
remove_on_exit (bool): Remove container on exit when not persistent.
|
||||
network (str): Docker network to use.
|
||||
volumes (List[str]): Volume mappings (e.g., ["host_path:container_path"]).
|
||||
env_vars (Dict[str, str]): Environment variables to set in container.
|
||||
extra_args (List[str]): Additional docker run arguments.
|
||||
host_port (int): Host port to map to container's 9223 port.
|
||||
user_data_dir (str): Path to user data directory on host.
|
||||
container_user_data_dir (str): Path to user data directory in container.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
mode: str = "connect", # "connect" or "launch"
|
||||
image: Optional[str] = None, # Docker image to use
|
||||
registry_file: Optional[str] = None, # Path to registry file
|
||||
persistent: bool = False, # Keep container running after browser closes
|
||||
remove_on_exit: bool = True, # Remove container on exit when not persistent
|
||||
network: Optional[str] = None, # Docker network to use
|
||||
volumes: List[str] = None, # Volume mappings
|
||||
env_vars: Dict[str, str] = None, # Environment variables
|
||||
extra_args: List[str] = None, # Additional docker run arguments
|
||||
host_port: Optional[int] = None, # Host port to map to container's 9223
|
||||
user_data_dir: Optional[str] = None, # Path to user data directory on host
|
||||
container_user_data_dir: str = "/data", # Path to user data directory in container
|
||||
):
|
||||
"""Initialize Docker configuration.
|
||||
|
||||
Args:
|
||||
mode: Docker operation mode ("connect" or "launch")
|
||||
image: Docker image to use
|
||||
registry_file: Path to container registry file
|
||||
persistent: Whether to keep container running after browser closes
|
||||
remove_on_exit: Whether to remove container on exit when not persistent
|
||||
network: Docker network to use
|
||||
volumes: Volume mappings as list of strings
|
||||
env_vars: Environment variables as dictionary
|
||||
extra_args: Additional docker run arguments
|
||||
host_port: Host port to map to container's 9223
|
||||
user_data_dir: Path to user data directory on host
|
||||
container_user_data_dir: Path to user data directory in container
|
||||
"""
|
||||
self.mode = mode
|
||||
self.image = image # If None, defaults will be used from DockerUtils
|
||||
self.registry_file = registry_file
|
||||
self.persistent = persistent
|
||||
self.remove_on_exit = remove_on_exit
|
||||
self.network = network
|
||||
self.volumes = volumes or []
|
||||
self.env_vars = env_vars or {}
|
||||
self.extra_args = extra_args or []
|
||||
self.host_port = host_port
|
||||
self.user_data_dir = user_data_dir
|
||||
self.container_user_data_dir = container_user_data_dir
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
"""Convert this configuration to a dictionary.
|
||||
|
||||
Returns:
|
||||
Dictionary representation of this configuration
|
||||
"""
|
||||
return {
|
||||
"mode": self.mode,
|
||||
"image": self.image,
|
||||
"registry_file": self.registry_file,
|
||||
"persistent": self.persistent,
|
||||
"remove_on_exit": self.remove_on_exit,
|
||||
"network": self.network,
|
||||
"volumes": self.volumes,
|
||||
"env_vars": self.env_vars,
|
||||
"extra_args": self.extra_args,
|
||||
"host_port": self.host_port,
|
||||
"user_data_dir": self.user_data_dir,
|
||||
"container_user_data_dir": self.container_user_data_dir
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def from_kwargs(kwargs: Dict) -> "DockerConfig":
|
||||
"""Create a DockerConfig from a dictionary of keyword arguments.
|
||||
|
||||
Args:
|
||||
kwargs: Dictionary of configuration options
|
||||
|
||||
Returns:
|
||||
New DockerConfig instance
|
||||
"""
|
||||
return DockerConfig(
|
||||
mode=kwargs.get("mode", "connect"),
|
||||
image=kwargs.get("image"),
|
||||
registry_file=kwargs.get("registry_file"),
|
||||
persistent=kwargs.get("persistent", False),
|
||||
remove_on_exit=kwargs.get("remove_on_exit", True),
|
||||
network=kwargs.get("network"),
|
||||
volumes=kwargs.get("volumes"),
|
||||
env_vars=kwargs.get("env_vars"),
|
||||
extra_args=kwargs.get("extra_args"),
|
||||
host_port=kwargs.get("host_port"),
|
||||
user_data_dir=kwargs.get("user_data_dir"),
|
||||
container_user_data_dir=kwargs.get("container_user_data_dir", "/data")
|
||||
)
|
||||
|
||||
def clone(self, **kwargs) -> "DockerConfig":
|
||||
"""Create a copy of this configuration with updated values.
|
||||
|
||||
Args:
|
||||
**kwargs: Key-value pairs of configuration options to update
|
||||
|
||||
Returns:
|
||||
DockerConfig: A new instance with the specified updates
|
||||
"""
|
||||
config_dict = self.to_dict()
|
||||
config_dict.update(kwargs)
|
||||
return DockerConfig.from_kwargs(config_dict)
|
||||
174
crawl4ai/browser/docker_registry.py
Normal file
174
crawl4ai/browser/docker_registry.py
Normal file
@@ -0,0 +1,174 @@
|
||||
"""Docker registry module for Crawl4AI.
|
||||
|
||||
This module provides a registry system for tracking and reusing Docker containers
|
||||
across browser sessions, improving performance and resource utilization.
|
||||
"""
|
||||
|
||||
import os
|
||||
import json
|
||||
import time
|
||||
from typing import Dict, Optional
|
||||
|
||||
from ..utils import get_home_folder
|
||||
|
||||
|
||||
class DockerRegistry:
|
||||
"""Manages a registry of Docker containers used for browser automation.
|
||||
|
||||
This registry tracks containers by configuration hash, allowing reuse of appropriately
|
||||
configured containers instead of creating new ones for each session.
|
||||
|
||||
Attributes:
|
||||
registry_file (str): Path to the registry file
|
||||
containers (dict): Dictionary of container information
|
||||
port_map (dict): Map of host ports to container IDs
|
||||
last_port (int): Last port assigned
|
||||
"""
|
||||
|
||||
def __init__(self, registry_file: Optional[str] = None):
|
||||
"""Initialize the registry with an optional path to the registry file.
|
||||
|
||||
Args:
|
||||
registry_file: Path to the registry file. If None, uses default path.
|
||||
"""
|
||||
self.registry_file = registry_file or os.path.join(get_home_folder(), "docker_browser_registry.json")
|
||||
self.containers = {}
|
||||
self.port_map = {}
|
||||
self.last_port = 9222
|
||||
self.load()
|
||||
|
||||
def load(self):
|
||||
"""Load container registry from file."""
|
||||
if os.path.exists(self.registry_file):
|
||||
try:
|
||||
with open(self.registry_file, 'r') as f:
|
||||
registry_data = json.load(f)
|
||||
self.containers = registry_data.get("containers", {})
|
||||
self.port_map = registry_data.get("ports", {})
|
||||
self.last_port = registry_data.get("last_port", 9222)
|
||||
except Exception:
|
||||
# Reset to defaults on error
|
||||
self.containers = {}
|
||||
self.port_map = {}
|
||||
self.last_port = 9222
|
||||
else:
|
||||
# Initialize with defaults if file doesn't exist
|
||||
self.containers = {}
|
||||
self.port_map = {}
|
||||
self.last_port = 9222
|
||||
|
||||
def save(self):
|
||||
"""Save container registry to file."""
|
||||
os.makedirs(os.path.dirname(self.registry_file), exist_ok=True)
|
||||
with open(self.registry_file, 'w') as f:
|
||||
json.dump({
|
||||
"containers": self.containers,
|
||||
"ports": self.port_map,
|
||||
"last_port": self.last_port
|
||||
}, f, indent=2)
|
||||
|
||||
def register_container(self, container_id: str, host_port: int, config_hash: str):
|
||||
"""Register a container with its configuration hash and port mapping.
|
||||
|
||||
Args:
|
||||
container_id: Docker container ID
|
||||
host_port: Host port mapped to container
|
||||
config_hash: Hash of configuration used to create container
|
||||
"""
|
||||
self.containers[container_id] = {
|
||||
"host_port": host_port,
|
||||
"config_hash": config_hash,
|
||||
"created_at": time.time()
|
||||
}
|
||||
self.port_map[str(host_port)] = container_id
|
||||
self.save()
|
||||
|
||||
def unregister_container(self, container_id: str):
|
||||
"""Unregister a container.
|
||||
|
||||
Args:
|
||||
container_id: Docker container ID to unregister
|
||||
"""
|
||||
if container_id in self.containers:
|
||||
host_port = self.containers[container_id]["host_port"]
|
||||
if str(host_port) in self.port_map:
|
||||
del self.port_map[str(host_port)]
|
||||
del self.containers[container_id]
|
||||
self.save()
|
||||
|
||||
def find_container_by_config(self, config_hash: str, docker_utils) -> Optional[str]:
|
||||
"""Find a container that matches the given configuration hash.
|
||||
|
||||
Args:
|
||||
config_hash: Hash of configuration to match
|
||||
docker_utils: DockerUtils instance to check running containers
|
||||
|
||||
Returns:
|
||||
Container ID if found, None otherwise
|
||||
"""
|
||||
for container_id, data in self.containers.items():
|
||||
if data["config_hash"] == config_hash and docker_utils.is_container_running(container_id):
|
||||
return container_id
|
||||
return None
|
||||
|
||||
def get_container_host_port(self, container_id: str) -> Optional[int]:
|
||||
"""Get the host port mapped to the container.
|
||||
|
||||
Args:
|
||||
container_id: Docker container ID
|
||||
|
||||
Returns:
|
||||
Host port if container is registered, None otherwise
|
||||
"""
|
||||
if container_id in self.containers:
|
||||
return self.containers[container_id]["host_port"]
|
||||
return None
|
||||
|
||||
def get_next_available_port(self, docker_utils) -> int:
|
||||
"""Get the next available host port for Docker mapping.
|
||||
|
||||
Args:
|
||||
docker_utils: DockerUtils instance to check port availability
|
||||
|
||||
Returns:
|
||||
Available port number
|
||||
"""
|
||||
# Start from last port + 1
|
||||
port = self.last_port + 1
|
||||
|
||||
# Check if port is in use (either in our registry or system-wide)
|
||||
while port in self.port_map or docker_utils.is_port_in_use(port):
|
||||
port += 1
|
||||
|
||||
# Update last port
|
||||
self.last_port = port
|
||||
self.save()
|
||||
|
||||
return port
|
||||
|
||||
def get_container_config_hash(self, container_id: str) -> Optional[str]:
|
||||
"""Get the configuration hash for a container.
|
||||
|
||||
Args:
|
||||
container_id: Docker container ID
|
||||
|
||||
Returns:
|
||||
Configuration hash if container is registered, None otherwise
|
||||
"""
|
||||
if container_id in self.containers:
|
||||
return self.containers[container_id]["config_hash"]
|
||||
return None
|
||||
|
||||
def cleanup_stale_containers(self, docker_utils):
|
||||
"""Clean up containers that are no longer running.
|
||||
|
||||
Args:
|
||||
docker_utils: DockerUtils instance to check container status
|
||||
"""
|
||||
to_remove = []
|
||||
for container_id in self.containers:
|
||||
if not docker_utils.is_container_running(container_id):
|
||||
to_remove.append(container_id)
|
||||
|
||||
for container_id in to_remove:
|
||||
self.unregister_container(container_id)
|
||||
286
crawl4ai/browser/docker_strategy.py
Normal file
286
crawl4ai/browser/docker_strategy.py
Normal file
@@ -0,0 +1,286 @@
|
||||
"""Docker browser strategy module for Crawl4AI.
|
||||
|
||||
This module provides browser strategies for running browsers in Docker containers,
|
||||
which offers better isolation, consistency across platforms, and easy scaling.
|
||||
"""
|
||||
|
||||
import os
|
||||
import uuid
|
||||
import asyncio
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
from pathlib import Path
|
||||
|
||||
from playwright.async_api import Page, BrowserContext
|
||||
|
||||
from ..async_logger import AsyncLogger
|
||||
from ..async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .docker_config import DockerConfig
|
||||
from .docker_registry import DockerRegistry
|
||||
from .docker_utils import DockerUtils
|
||||
from .strategies import BuiltinBrowserStrategy
|
||||
|
||||
|
||||
class DockerBrowserStrategy(BuiltinBrowserStrategy):
|
||||
"""Docker-based browser strategy.
|
||||
|
||||
Extends the BuiltinBrowserStrategy to run browsers in Docker containers.
|
||||
Supports two modes:
|
||||
1. "connect" - Uses a Docker image with Chrome already running
|
||||
2. "launch" - Starts Chrome within the container with custom settings
|
||||
|
||||
Attributes:
|
||||
docker_config: Docker-specific configuration options
|
||||
container_id: ID of current Docker container
|
||||
container_name: Name assigned to the container
|
||||
registry: Registry for tracking and reusing containers
|
||||
docker_utils: Utilities for Docker operations
|
||||
chrome_process_id: Process ID of Chrome within container
|
||||
socat_process_id: Process ID of socat within container
|
||||
internal_cdp_port: Chrome's internal CDP port
|
||||
internal_mapped_port: Port that socat maps to internally
|
||||
"""
|
||||
|
||||
def __init__(self, config: BrowserConfig, logger: Optional[AsyncLogger] = None):
|
||||
"""Initialize the Docker browser strategy.
|
||||
|
||||
Args:
|
||||
config: Browser configuration including Docker-specific settings
|
||||
logger: Logger for recording events and errors
|
||||
"""
|
||||
super().__init__(config, logger)
|
||||
|
||||
# Initialize Docker-specific attributes
|
||||
self.docker_config = self.config.docker_config or DockerConfig()
|
||||
self.container_id = None
|
||||
self.container_name = f"crawl4ai-browser-{uuid.uuid4().hex[:8]}"
|
||||
self.registry = DockerRegistry(self.docker_config.registry_file)
|
||||
self.docker_utils = DockerUtils(logger)
|
||||
self.chrome_process_id = None
|
||||
self.socat_process_id = None
|
||||
self.internal_cdp_port = 9222 # Chrome's internal CDP port
|
||||
self.internal_mapped_port = 9223 # Port that socat maps to internally
|
||||
self.shutting_down = False
|
||||
|
||||
async def _generate_config_hash(self) -> str:
|
||||
"""Generate a hash of the configuration for container matching.
|
||||
|
||||
Returns:
|
||||
Hash string uniquely identifying this configuration
|
||||
"""
|
||||
# Create a dict with the relevant parts of the config
|
||||
config_dict = {
|
||||
"image": self.docker_config.image,
|
||||
"mode": self.docker_config.mode,
|
||||
"browser_type": self.config.browser_type,
|
||||
"headless": self.config.headless,
|
||||
}
|
||||
|
||||
# Add browser-specific config if in launch mode
|
||||
if self.docker_config.mode == "launch":
|
||||
config_dict.update({
|
||||
"text_mode": self.config.text_mode,
|
||||
"light_mode": self.config.light_mode,
|
||||
"viewport_width": self.config.viewport_width,
|
||||
"viewport_height": self.config.viewport_height,
|
||||
})
|
||||
|
||||
# Use the utility method to generate the hash
|
||||
return self.docker_utils.generate_config_hash(config_dict)
|
||||
|
||||
async def _get_or_create_cdp_url(self) -> str:
|
||||
"""Get CDP URL by either creating a new container or using an existing one.
|
||||
|
||||
Returns:
|
||||
CDP URL for connecting to the browser
|
||||
|
||||
Raises:
|
||||
Exception: If container creation or browser launch fails
|
||||
"""
|
||||
# If CDP URL is explicitly provided, use it
|
||||
if self.config.cdp_url:
|
||||
return self.config.cdp_url
|
||||
|
||||
# Ensure Docker image exists (will build if needed)
|
||||
image_name = await self.docker_utils.ensure_docker_image_exists(
|
||||
self.docker_config.image,
|
||||
self.docker_config.mode
|
||||
)
|
||||
|
||||
# Generate config hash for container matching
|
||||
config_hash = await self._generate_config_hash()
|
||||
|
||||
# Look for existing container with matching config
|
||||
container_id = self.registry.find_container_by_config(config_hash, self.docker_utils)
|
||||
|
||||
if container_id:
|
||||
# Use existing container
|
||||
self.container_id = container_id
|
||||
host_port = self.registry.get_container_host_port(container_id)
|
||||
if self.logger:
|
||||
self.logger.info(f"Using existing Docker container: {container_id[:12]}", tag="DOCKER")
|
||||
else:
|
||||
# Get a port for the new container
|
||||
host_port = self.docker_config.host_port or self.registry.get_next_available_port(self.docker_utils)
|
||||
|
||||
# Prepare volumes list
|
||||
volumes = list(self.docker_config.volumes)
|
||||
|
||||
# Add user data directory if specified
|
||||
if self.docker_config.user_data_dir:
|
||||
# Ensure user data directory exists
|
||||
os.makedirs(self.docker_config.user_data_dir, exist_ok=True)
|
||||
volumes.append(f"{self.docker_config.user_data_dir}:{self.docker_config.container_user_data_dir}")
|
||||
|
||||
# Update config user_data_dir to point to container path
|
||||
self.config.user_data_dir = self.docker_config.container_user_data_dir
|
||||
|
||||
# Create a new container
|
||||
container_id = await self.docker_utils.create_container(
|
||||
image_name=image_name,
|
||||
host_port=host_port,
|
||||
container_name=self.container_name,
|
||||
volumes=volumes,
|
||||
network=self.docker_config.network,
|
||||
env_vars=self.docker_config.env_vars,
|
||||
extra_args=self.docker_config.extra_args
|
||||
)
|
||||
|
||||
if not container_id:
|
||||
raise Exception("Failed to create Docker container")
|
||||
|
||||
self.container_id = container_id
|
||||
|
||||
# Register the container
|
||||
self.registry.register_container(container_id, host_port, config_hash)
|
||||
|
||||
# Wait for container to be ready
|
||||
await self.docker_utils.wait_for_container_ready(container_id)
|
||||
|
||||
# Handle specific setup based on mode
|
||||
if self.docker_config.mode == "launch":
|
||||
# In launch mode, we need to start socat and Chrome
|
||||
await self.docker_utils.start_socat_in_container(container_id)
|
||||
|
||||
# Build browser arguments
|
||||
browser_args = self._build_browser_args()
|
||||
|
||||
# Launch Chrome
|
||||
await self.docker_utils.launch_chrome_in_container(container_id, browser_args)
|
||||
|
||||
# Get PIDs for later cleanup
|
||||
self.chrome_process_id = await self.docker_utils.get_process_id_in_container(
|
||||
container_id, "chrome"
|
||||
)
|
||||
self.socat_process_id = await self.docker_utils.get_process_id_in_container(
|
||||
container_id, "socat"
|
||||
)
|
||||
|
||||
# Wait for CDP to be ready
|
||||
await self.docker_utils.wait_for_cdp_ready(host_port)
|
||||
|
||||
if self.logger:
|
||||
self.logger.success(f"Docker container ready: {container_id[:12]} on port {host_port}", tag="DOCKER")
|
||||
|
||||
# Return CDP URL
|
||||
return f"http://localhost:{host_port}"
|
||||
|
||||
def _build_browser_args(self) -> List[str]:
|
||||
"""Build Chrome command line arguments based on BrowserConfig.
|
||||
|
||||
Returns:
|
||||
List of command line arguments for Chrome
|
||||
"""
|
||||
args = [
|
||||
"--no-sandbox",
|
||||
"--disable-gpu",
|
||||
f"--remote-debugging-port={self.internal_cdp_port}",
|
||||
"--remote-debugging-address=0.0.0.0", # Allow external connections
|
||||
"--disable-dev-shm-usage",
|
||||
]
|
||||
|
||||
if self.config.headless:
|
||||
args.append("--headless=new")
|
||||
|
||||
if self.config.viewport_width and self.config.viewport_height:
|
||||
args.append(f"--window-size={self.config.viewport_width},{self.config.viewport_height}")
|
||||
|
||||
if self.config.user_agent:
|
||||
args.append(f"--user-agent={self.config.user_agent}")
|
||||
|
||||
if self.config.text_mode:
|
||||
args.extend([
|
||||
"--blink-settings=imagesEnabled=false",
|
||||
"--disable-remote-fonts",
|
||||
"--disable-images",
|
||||
"--disable-javascript",
|
||||
])
|
||||
|
||||
if self.config.light_mode:
|
||||
# Import here to avoid circular import
|
||||
from .utils import get_browser_disable_options
|
||||
args.extend(get_browser_disable_options())
|
||||
|
||||
if self.config.user_data_dir:
|
||||
args.append(f"--user-data-dir={self.config.user_data_dir}")
|
||||
|
||||
if self.config.extra_args:
|
||||
args.extend(self.config.extra_args)
|
||||
|
||||
return args
|
||||
|
||||
async def close(self):
|
||||
"""Close the browser and clean up Docker container if needed."""
|
||||
# Set shutting_down flag to prevent race conditions
|
||||
self.shutting_down = True
|
||||
|
||||
# Store state if needed before closing
|
||||
if self.browser and self.docker_config.user_data_dir and self.docker_config.persistent:
|
||||
for context in self.browser.contexts:
|
||||
try:
|
||||
storage_path = os.path.join(self.docker_config.user_data_dir, "storage_state.json")
|
||||
await context.storage_state(path=storage_path)
|
||||
if self.logger:
|
||||
self.logger.debug("Persisted storage state before closing browser", tag="DOCKER")
|
||||
except Exception as e:
|
||||
if self.logger:
|
||||
self.logger.warning(
|
||||
message="Failed to persist storage state: {error}",
|
||||
tag="DOCKER",
|
||||
params={"error": str(e)}
|
||||
)
|
||||
|
||||
# Close browser connection (but not container)
|
||||
if self.browser:
|
||||
await self.browser.close()
|
||||
self.browser = None
|
||||
|
||||
# Only clean up container if not persistent
|
||||
if self.container_id and not self.docker_config.persistent:
|
||||
# Stop Chrome process in "launch" mode
|
||||
if self.docker_config.mode == "launch" and self.chrome_process_id:
|
||||
await self.docker_utils.stop_process_in_container(
|
||||
self.container_id, self.chrome_process_id
|
||||
)
|
||||
|
||||
# Stop socat process in "launch" mode
|
||||
if self.docker_config.mode == "launch" and self.socat_process_id:
|
||||
await self.docker_utils.stop_process_in_container(
|
||||
self.container_id, self.socat_process_id
|
||||
)
|
||||
|
||||
# Remove or stop container based on configuration
|
||||
if self.docker_config.remove_on_exit:
|
||||
await self.docker_utils.remove_container(self.container_id)
|
||||
# Unregister from registry
|
||||
self.registry.unregister_container(self.container_id)
|
||||
else:
|
||||
await self.docker_utils.stop_container(self.container_id)
|
||||
|
||||
self.container_id = None
|
||||
|
||||
# Close Playwright
|
||||
if self.playwright:
|
||||
await self.playwright.stop()
|
||||
self.playwright = None
|
||||
|
||||
self.shutting_down = False
|
||||
582
crawl4ai/browser/docker_utils.py
Normal file
582
crawl4ai/browser/docker_utils.py
Normal file
@@ -0,0 +1,582 @@
|
||||
import os
|
||||
import json
|
||||
import asyncio
|
||||
import hashlib
|
||||
import tempfile
|
||||
import shutil
|
||||
import socket
|
||||
import subprocess
|
||||
from typing import Dict, List, Optional, Tuple, Union
|
||||
|
||||
class DockerUtils:
    """Utility class for Docker operations in browser automation.

    This class provides methods for managing Docker images, containers,
    and related operations needed for browser automation. It handles
    image building, container lifecycle, port management, and registry
    operations.

    Attributes:
        DOCKER_FOLDER (str): Path to folder containing Docker files
        DOCKER_CONNECT_FILE (str): Path to Dockerfile for connect mode
        DOCKER_LAUNCH_FILE (str): Path to Dockerfile for launch mode
        DOCKER_START_SCRIPT (str): Path to startup script for connect mode
        DEFAULT_CONNECT_IMAGE (str): Default image name for connect mode
        DEFAULT_LAUNCH_IMAGE (str): Default image name for launch mode
        logger: Optional logger instance
    """

    # File paths for Docker resources
    DOCKER_FOLDER = os.path.join(os.path.dirname(__file__), "docker")
    DOCKER_CONNECT_FILE = os.path.join(DOCKER_FOLDER, "connect.Dockerfile")
    DOCKER_LAUNCH_FILE = os.path.join(DOCKER_FOLDER, "launch.Dockerfile")
    DOCKER_START_SCRIPT = os.path.join(DOCKER_FOLDER, "start.sh")

    # Default image names
    DEFAULT_CONNECT_IMAGE = "crawl4ai/browser-connect:latest"
    DEFAULT_LAUNCH_IMAGE = "crawl4ai/browser-launch:latest"

    def __init__(self, logger=None):
        """Initialize Docker utilities.

        Args:
            logger: Optional logger for recording operations
        """
        self.logger = logger

    # Image Management Methods

    async def check_image_exists(self, image_name: str) -> bool:
        """Check if a Docker image exists locally.

        Args:
            image_name: Name of the Docker image to check

        Returns:
            bool: True if the image exists, False otherwise
        """
        cmd = ["docker", "image", "inspect", image_name]

        try:
            process = await asyncio.create_subprocess_exec(
                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
            )
            # Output is irrelevant: `docker image inspect` exits 0 iff the image exists.
            await process.communicate()
            return process.returncode == 0
        except Exception as e:
            if self.logger:
                self.logger.debug(f"Error checking if image exists: {str(e)}", tag="DOCKER")
            return False

    async def build_docker_image(self, image_name: str, dockerfile_path: str,
                                 files_to_copy: Optional[Dict[str, str]] = None) -> bool:
        """Build a Docker image from a Dockerfile.

        Args:
            image_name: Name to give the built image
            dockerfile_path: Path to the Dockerfile
            files_to_copy: Dict of {dest_name: source_path} for files to copy to build context

        Returns:
            bool: True if image was built successfully, False otherwise
        """
        # Build inside a throwaway directory so the build context contains
        # only the Dockerfile and the explicitly requested files.
        with tempfile.TemporaryDirectory() as temp_dir:
            # Copy the Dockerfile
            shutil.copy(dockerfile_path, os.path.join(temp_dir, "Dockerfile"))

            # Copy any additional files needed
            if files_to_copy:
                for dest_name, source_path in files_to_copy.items():
                    shutil.copy(source_path, os.path.join(temp_dir, dest_name))

            # Build the image
            cmd = [
                "docker", "build",
                "-t", image_name,
                temp_dir
            ]

            if self.logger:
                self.logger.debug(f"Building Docker image with command: {' '.join(cmd)}", tag="DOCKER")

            process = await asyncio.create_subprocess_exec(
                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
            )
            # stdout is unused; only stderr matters for diagnostics.
            _, stderr = await process.communicate()

            if process.returncode != 0:
                if self.logger:
                    self.logger.error(
                        message="Failed to build Docker image: {error}",
                        tag="DOCKER",
                        params={"error": stderr.decode()}
                    )
                return False

            if self.logger:
                self.logger.success(f"Successfully built Docker image: {image_name}", tag="DOCKER")
            return True

    async def ensure_docker_image_exists(self, image_name: str, mode: str = "connect") -> str:
        """Ensure the required Docker image exists, creating it if necessary.

        Args:
            image_name: Name of the Docker image
            mode: Either "connect" or "launch" to determine which image to build

        Returns:
            str: Name of the available Docker image

        Raises:
            Exception: If image doesn't exist and can't be built
        """
        # If image name is not specified, use default based on mode
        if not image_name:
            image_name = self.DEFAULT_CONNECT_IMAGE if mode == "connect" else self.DEFAULT_LAUNCH_IMAGE

        # Check if the image already exists
        if await self.check_image_exists(image_name):
            if self.logger:
                self.logger.debug(f"Docker image {image_name} already exists", tag="DOCKER")
            return image_name

        # A custom image cannot be rebuilt automatically -- we only know how
        # to build the two bundled defaults -- so fail loudly.
        if (image_name != self.DEFAULT_CONNECT_IMAGE and image_name != self.DEFAULT_LAUNCH_IMAGE):
            if self.logger:
                self.logger.warning(
                    f"Custom Docker image {image_name} not found and cannot be automatically created",
                    tag="DOCKER"
                )
            raise Exception(f"Docker image {image_name} not found")

        # Build the appropriate default image
        if self.logger:
            self.logger.info(f"Docker image {image_name} not found, creating it now...", tag="DOCKER")

        if mode == "connect":
            # Connect mode additionally needs the start script baked in.
            success = await self.build_docker_image(
                image_name,
                self.DOCKER_CONNECT_FILE,
                {"start.sh": self.DOCKER_START_SCRIPT}
            )
        else:
            success = await self.build_docker_image(
                image_name,
                self.DOCKER_LAUNCH_FILE
            )

        if not success:
            raise Exception(f"Failed to create Docker image {image_name}")

        return image_name

    # Container Management Methods

    async def create_container(self, image_name: str, host_port: int,
                               container_name: Optional[str] = None,
                               volumes: Optional[List[str]] = None,
                               network: Optional[str] = None,
                               env_vars: Optional[Dict[str, str]] = None,
                               extra_args: Optional[List[str]] = None) -> Optional[str]:
        """Create a new Docker container.

        Args:
            image_name: Docker image to use
            host_port: Port on host to map to container port 9223
            container_name: Optional name for the container
            volumes: List of volume mappings (e.g., ["host_path:container_path"])
            network: Optional Docker network to use
            env_vars: Dictionary of environment variables
            extra_args: Additional docker run arguments

        Returns:
            str: Container ID if successful, None otherwise
        """
        # Prepare container command
        cmd = [
            "docker", "run",
            "--detach",
        ]

        # Add container name if specified
        if container_name:
            cmd.extend(["--name", container_name])

        # Map the host port to 9223 in the container (the socat/CDP endpoint).
        cmd.extend(["-p", f"{host_port}:9223"])

        # Add volumes
        if volumes:
            for volume in volumes:
                cmd.extend(["-v", volume])

        # Add network if specified
        if network:
            cmd.extend(["--network", network])

        # Add environment variables
        if env_vars:
            for key, value in env_vars.items():
                cmd.extend(["-e", f"{key}={value}"])

        # Add extra args
        if extra_args:
            cmd.extend(extra_args)

        # Image name must come last in `docker run`.
        cmd.append(image_name)

        if self.logger:
            self.logger.debug(f"Creating Docker container with command: {' '.join(cmd)}", tag="DOCKER")

        # Run docker command
        try:
            process = await asyncio.create_subprocess_exec(
                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
            )
            stdout, stderr = await process.communicate()

            if process.returncode != 0:
                if self.logger:
                    self.logger.error(
                        message="Failed to create Docker container: {error}",
                        tag="DOCKER",
                        params={"error": stderr.decode()}
                    )
                return None

            # `docker run --detach` prints the new container ID on stdout.
            container_id = stdout.decode().strip()

            if self.logger:
                self.logger.success(f"Created Docker container: {container_id[:12]}", tag="DOCKER")

            return container_id

        except Exception as e:
            if self.logger:
                self.logger.error(
                    message="Error creating Docker container: {error}",
                    tag="DOCKER",
                    params={"error": str(e)}
                )
            return None

    async def is_container_running(self, container_id: str) -> bool:
        """Check if a container is running.

        Args:
            container_id: ID of the container to check

        Returns:
            bool: True if the container is running, False otherwise
        """
        cmd = ["docker", "inspect", "--format", "{{.State.Running}}", container_id]

        try:
            process = await asyncio.create_subprocess_exec(
                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
            )
            stdout, _ = await process.communicate()

            return process.returncode == 0 and stdout.decode().strip() == "true"
        except Exception as e:
            if self.logger:
                self.logger.debug(f"Error checking if container is running: {str(e)}", tag="DOCKER")
            return False

    async def wait_for_container_ready(self, container_id: str, timeout: int = 30) -> bool:
        """Wait for the container to be in running state.

        Polls once per second, so `timeout` is approximate.

        Args:
            container_id: ID of the container to wait for
            timeout: Maximum time to wait in seconds

        Returns:
            bool: True if container is ready, False if timeout occurred
        """
        for _ in range(timeout):
            if await self.is_container_running(container_id):
                return True
            await asyncio.sleep(1)

        if self.logger:
            self.logger.warning(f"Container {container_id[:12]} not ready after {timeout}s timeout", tag="DOCKER")
        return False

    async def stop_container(self, container_id: str) -> bool:
        """Stop a Docker container.

        Args:
            container_id: ID of the container to stop

        Returns:
            bool: True if stopped successfully, False otherwise
        """
        cmd = ["docker", "stop", container_id]

        try:
            # Capture output so docker's echo of the container ID does not
            # leak to the console (consistent with the other methods).
            process = await asyncio.create_subprocess_exec(
                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
            )
            await process.communicate()

            if self.logger:
                self.logger.debug(f"Stopped container: {container_id[:12]}", tag="DOCKER")

            return process.returncode == 0
        except Exception as e:
            if self.logger:
                self.logger.warning(
                    message="Failed to stop container: {error}",
                    tag="DOCKER",
                    params={"error": str(e)}
                )
            return False

    async def remove_container(self, container_id: str, force: bool = True) -> bool:
        """Remove a Docker container.

        Args:
            container_id: ID of the container to remove
            force: Whether to force removal

        Returns:
            bool: True if removed successfully, False otherwise
        """
        cmd = ["docker", "rm"]
        if force:
            cmd.append("-f")
        cmd.append(container_id)

        try:
            # Capture output so docker's echo of the container ID does not
            # leak to the console (consistent with the other methods).
            process = await asyncio.create_subprocess_exec(
                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
            )
            await process.communicate()

            if self.logger:
                self.logger.debug(f"Removed container: {container_id[:12]}", tag="DOCKER")

            return process.returncode == 0
        except Exception as e:
            if self.logger:
                self.logger.warning(
                    message="Failed to remove container: {error}",
                    tag="DOCKER",
                    params={"error": str(e)}
                )
            return False

    # Container Command Execution Methods

    async def exec_in_container(self, container_id: str, command: List[str],
                                detach: bool = False) -> Tuple[int, str, str]:
        """Execute a command in a running container.

        Args:
            container_id: ID of the container
            command: Command to execute as a list of strings
            detach: Whether to run the command in detached mode

        Returns:
            Tuple of (return_code, stdout, stderr). On internal failure the
            return code is -1 and stderr carries the error message.
        """
        cmd = ["docker", "exec"]
        if detach:
            cmd.append("-d")
        cmd.append(container_id)
        cmd.extend(command)

        try:
            process = await asyncio.create_subprocess_exec(
                *cmd, stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
            )
            stdout, stderr = await process.communicate()

            return process.returncode, stdout.decode(), stderr.decode()
        except Exception as e:
            if self.logger:
                self.logger.error(
                    message="Error executing command in container: {error}",
                    tag="DOCKER",
                    params={"error": str(e)}
                )
            return -1, "", str(e)

    async def start_socat_in_container(self, container_id: str) -> bool:
        """Start socat in the container to map port 9222 to 9223.

        Chrome binds its CDP endpoint to localhost:9222 inside the container;
        socat re-exposes it on 9223 so the docker port mapping can reach it.

        Args:
            container_id: ID of the container

        Returns:
            bool: True if socat started successfully, False otherwise
        """
        # Command to run socat as a background process
        cmd = ["socat", "TCP-LISTEN:9223,fork", "TCP:localhost:9222"]

        returncode, _, stderr = await self.exec_in_container(container_id, cmd, detach=True)

        if returncode != 0:
            if self.logger:
                self.logger.error(
                    message="Failed to start socat in container: {error}",
                    tag="DOCKER",
                    params={"error": stderr}
                )
            return False

        if self.logger:
            self.logger.debug(f"Started socat in container: {container_id[:12]}", tag="DOCKER")

        # Wait a moment for socat to start
        await asyncio.sleep(1)
        return True

    async def launch_chrome_in_container(self, container_id: str, browser_args: List[str]) -> bool:
        """Launch Chrome inside the container with specified arguments.

        Args:
            container_id: ID of the container
            browser_args: Chrome command line arguments

        Returns:
            bool: True if Chrome started successfully, False otherwise
        """
        # Build Chrome command
        chrome_cmd = ["google-chrome"]
        chrome_cmd.extend(browser_args)

        returncode, _, stderr = await self.exec_in_container(container_id, chrome_cmd, detach=True)

        if returncode != 0:
            if self.logger:
                self.logger.error(
                    message="Failed to launch Chrome in container: {error}",
                    tag="DOCKER",
                    params={"error": stderr}
                )
            return False

        if self.logger:
            self.logger.debug(f"Launched Chrome in container: {container_id[:12]}", tag="DOCKER")

        return True

    async def get_process_id_in_container(self, container_id: str, process_name: str) -> Optional[int]:
        """Get the process ID for a process in the container.

        Args:
            container_id: ID of the container
            process_name: Name pattern to search for (passed to `pgrep -f`)

        Returns:
            int: First matching process ID if found, None otherwise
        """
        cmd = ["pgrep", "-f", process_name]

        returncode, stdout, _ = await self.exec_in_container(container_id, cmd)

        if returncode == 0 and stdout.strip():
            # pgrep may return several PIDs; take the first line. Guard the
            # int() conversion so unexpected output cannot raise ValueError.
            try:
                return int(stdout.strip().split("\n")[0])
            except ValueError:
                if self.logger:
                    self.logger.debug(
                        f"Unexpected pgrep output for '{process_name}': {stdout.strip()!r}",
                        tag="DOCKER"
                    )
                return None

        return None

    async def stop_process_in_container(self, container_id: str, pid: int) -> bool:
        """Stop a process in the container by PID.

        Sends SIGTERM so the process can shut down gracefully.

        Args:
            container_id: ID of the container
            pid: Process ID to stop

        Returns:
            bool: True if process was stopped, False otherwise
        """
        cmd = ["kill", "-TERM", str(pid)]

        returncode, _, stderr = await self.exec_in_container(container_id, cmd)

        if returncode != 0:
            if self.logger:
                self.logger.warning(
                    message="Failed to stop process in container: {error}",
                    tag="DOCKER",
                    params={"error": stderr}
                )
            return False

        if self.logger:
            self.logger.debug(f"Stopped process {pid} in container: {container_id[:12]}", tag="DOCKER")

        return True

    # Network and Port Methods

    async def wait_for_cdp_ready(self, host_port: int, timeout: int = 30) -> bool:
        """Wait for the CDP endpoint to be ready.

        Polls Chrome's /json/version endpoint once per second.

        Args:
            host_port: Port to check for CDP endpoint
            timeout: Maximum time to wait in seconds

        Returns:
            bool: True if CDP endpoint is ready, False if timeout occurred
        """
        # Imported lazily so the module does not hard-depend on aiohttp.
        import aiohttp

        url = f"http://localhost:{host_port}/json/version"

        for _ in range(timeout):
            try:
                async with aiohttp.ClientSession() as session:
                    async with session.get(url, timeout=1) as response:
                        if response.status == 200:
                            if self.logger:
                                self.logger.debug(f"CDP endpoint ready on port {host_port}", tag="DOCKER")
                            return True
            except Exception:
                # Endpoint not up yet; keep polling until the timeout.
                pass
            await asyncio.sleep(1)

        if self.logger:
            self.logger.warning(f"CDP endpoint not ready on port {host_port} after {timeout}s timeout", tag="DOCKER")
        return False

    def is_port_in_use(self, port: int) -> bool:
        """Check if a port is already in use on the host.

        Args:
            port: Port number to check

        Returns:
            bool: True if port is in use, False otherwise
        """
        with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
            # connect_ex returns 0 when something is listening on the port.
            return s.connect_ex(('localhost', port)) == 0

    def get_next_available_port(self, start_port: int = 9223, max_tries: int = 1000) -> int:
        """Get the next available port starting from a given port.

        Args:
            start_port: Port number to start checking from
            max_tries: Maximum number of consecutive ports to probe before
                giving up (prevents an unbounded scan)

        Returns:
            int: First available port number

        Raises:
            RuntimeError: If no free port is found within max_tries ports
        """
        for port in range(start_port, start_port + max_tries):
            if not self.is_port_in_use(port):
                return port
        raise RuntimeError(
            f"No available port found in range {start_port}-{start_port + max_tries - 1}"
        )

    # Configuration Hash Methods

    def generate_config_hash(self, config_dict: Dict) -> str:
        """Generate a hash of the configuration for container matching.

        Args:
            config_dict: Dictionary of configuration parameters

        Returns:
            str: Hash string uniquely identifying this configuration
        """
        # Convert to canonical JSON string (sorted keys) so logically equal
        # configurations always hash identically, and hash it.
        config_json = json.dumps(config_dict, sort_keys=True)
        return hashlib.sha256(config_json.encode()).hexdigest()
204
crawl4ai/browser/manager.py
Normal file
204
crawl4ai/browser/manager.py
Normal file
@@ -0,0 +1,204 @@
|
||||
"""Browser manager module for Crawl4AI.
|
||||
|
||||
This module provides a central browser management class that uses the
|
||||
strategy pattern internally while maintaining the existing API.
|
||||
It also implements a page pooling mechanism for improved performance.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import time
|
||||
from typing import Optional, Tuple, List
|
||||
|
||||
from playwright.async_api import Page, BrowserContext
|
||||
|
||||
from ..async_logger import AsyncLogger
|
||||
from ..async_configs import BrowserConfig, CrawlerRunConfig
|
||||
|
||||
from .strategies import (
|
||||
BaseBrowserStrategy,
|
||||
PlaywrightBrowserStrategy,
|
||||
CDPBrowserStrategy,
|
||||
BuiltinBrowserStrategy
|
||||
)
|
||||
|
||||
# Import DockerBrowserStrategy if available
|
||||
try:
|
||||
from .docker_strategy import DockerBrowserStrategy
|
||||
except ImportError:
|
||||
DockerBrowserStrategy = None
|
||||
|
||||
class BrowserManager:
    """Main interface for browser management in Crawl4AI.

    This class maintains backward compatibility with the existing implementation
    while using the strategy pattern internally for different browser types.

    Attributes:
        config (BrowserConfig): Configuration object containing all browser settings
        logger: Logger instance for recording events and errors
        browser: The browser instance
        default_context: The default browser context
        managed_browser: The managed browser instance
        playwright: The Playwright instance
        sessions: Dictionary to store session information
        session_ttl: Session timeout in seconds
    """

    def __init__(self, browser_config: Optional[BrowserConfig] = None, logger: Optional[AsyncLogger] = None):
        """Initialize the BrowserManager with a browser configuration.

        Args:
            browser_config: Configuration object containing all browser settings
            logger: Logger instance for recording events and errors
        """
        self.config = browser_config or BrowserConfig()
        self.logger = logger

        # Create strategy based on configuration
        self._strategy = self._create_strategy()

        # Initialize state variables for compatibility with existing code
        self.browser = None
        self.default_context = None
        self.managed_browser = None
        self.playwright = None

        # For session management (from existing implementation)
        self.sessions = {}
        self.session_ttl = 1800  # 30 minutes

    def _create_strategy(self) -> BaseBrowserStrategy:
        """Create appropriate browser strategy based on configuration.

        Returns:
            BaseBrowserStrategy: The selected browser strategy
        """
        if self.config.browser_mode == "builtin":
            return BuiltinBrowserStrategy(self.config, self.logger)
        elif self.config.browser_mode == "docker":
            # DockerBrowserStrategy is an optional import; degrade gracefully
            # to the plain Playwright strategy when it is unavailable.
            if DockerBrowserStrategy is None:
                if self.logger:
                    self.logger.error(
                        "Docker browser strategy requested but not available. "
                        "Falling back to PlaywrightBrowserStrategy.",
                        tag="BROWSER"
                    )
                return PlaywrightBrowserStrategy(self.config, self.logger)
            return DockerBrowserStrategy(self.config, self.logger)
        elif self.config.cdp_url or self.config.use_managed_browser:
            return CDPBrowserStrategy(self.config, self.logger)
        else:
            return PlaywrightBrowserStrategy(self.config, self.logger)

    def _sync_sessions(self):
        """Mirror the strategy's session bookkeeping onto this manager.

        Keeps the legacy `sessions`/`session_ttl` attributes consistent for
        callers that still read them directly off the manager.
        """
        if hasattr(self._strategy, 'sessions'):
            self.sessions = self._strategy.sessions
            self.session_ttl = self._strategy.session_ttl

    async def start(self):
        """Start the browser instance and set up the default context.

        Returns:
            self: For method chaining
        """
        # Start the strategy
        await self._strategy.start()

        # Update legacy references
        self.browser = self._strategy.browser
        self.default_context = self._strategy.default_context

        # Set browser process reference (for CDP strategy)
        if hasattr(self._strategy, 'browser_process'):
            self.managed_browser = self._strategy

        # Set Playwright reference
        self.playwright = self._strategy.playwright

        # Sync sessions if needed
        self._sync_sessions()

        return self

    async def get_page(self, crawlerRunConfig: CrawlerRunConfig) -> Tuple[Page, BrowserContext]:
        """Get a page for the given configuration.

        Args:
            crawlerRunConfig: Configuration object for the crawler run

        Returns:
            Tuple of (Page, BrowserContext)
        """
        # Delegate to strategy
        page, context = await self._strategy.get_page(crawlerRunConfig)

        # Sync sessions if needed
        self._sync_sessions()

        return page, context

    async def get_pages(self, crawlerRunConfig: CrawlerRunConfig, count: int = 1) -> List[Tuple[Page, BrowserContext]]:
        """Get multiple pages with the same configuration.

        This method efficiently creates multiple browser pages using the same
        configuration, which is useful for parallel crawling of multiple URLs.

        Args:
            crawlerRunConfig: Configuration for the pages
            count: Number of pages to create

        Returns:
            List of (Page, Context) tuples
        """
        # Delegate to strategy
        pages = await self._strategy.get_pages(crawlerRunConfig, count)

        # Sync sessions if needed
        self._sync_sessions()

        return pages

    async def kill_session(self, session_id: str):
        """Kill a browser session and clean up resources.

        Args:
            session_id: The session ID to kill
        """
        # Handle kill_session via our strategy if it supports it
        if hasattr(self._strategy, '_kill_session'):
            await self._strategy._kill_session(session_id)
        elif session_id in self.sessions:
            context, page, _ = self.sessions[session_id]
            await page.close()
            # Only close the context when we own it: managed/CDP/builtin
            # browsers share a context that must outlive the session.
            if not self.config.use_managed_browser and not self.config.cdp_url and self.config.browser_mode != "builtin":
                await context.close()
            del self.sessions[session_id]

    def _cleanup_expired_sessions(self):
        """Clean up expired sessions based on TTL."""
        # Use strategy's implementation if available
        if hasattr(self._strategy, '_cleanup_expired_sessions'):
            self._strategy._cleanup_expired_sessions()
            return

        # Otherwise use our own implementation
        current_time = time.time()
        expired_sessions = [
            sid
            for sid, (_, _, last_used) in self.sessions.items()
            if current_time - last_used > self.session_ttl
        ]
        for sid in expired_sessions:
            # Fire-and-forget: kill_session is async but callers of this
            # cleanup are synchronous.
            asyncio.create_task(self.kill_session(sid))

    async def close(self):
        """Close the browser and clean up resources."""
        # Delegate to strategy
        await self._strategy.close()

        # Reset legacy references
        self.browser = None
        self.default_context = None
        self.managed_browser = None
        self.playwright = None
        self.sessions = {}
0
crawl4ai/browser/models.py
Normal file
0
crawl4ai/browser/models.py
Normal file
457
crawl4ai/browser/profiles.py
Normal file
457
crawl4ai/browser/profiles.py
Normal file
@@ -0,0 +1,457 @@
|
||||
"""Browser profile management module for Crawl4AI.
|
||||
|
||||
This module provides functionality for creating and managing browser profiles
|
||||
that can be used for authenticated browsing.
|
||||
"""
|
||||
|
||||
import os
|
||||
import asyncio
|
||||
import signal
|
||||
import sys
|
||||
import datetime
|
||||
import uuid
|
||||
import shutil
|
||||
from typing import List, Dict, Optional, Any
|
||||
from colorama import Fore, Style, init
|
||||
|
||||
from ..async_configs import BrowserConfig
|
||||
from ..async_logger import AsyncLogger, AsyncLoggerBase
|
||||
from ..utils import get_home_folder
|
||||
|
||||
class BrowserProfileManager:
|
||||
"""Manages browser profiles for Crawl4AI.
|
||||
|
||||
This class provides functionality to create and manage browser profiles
|
||||
that can be used for authenticated browsing with Crawl4AI.
|
||||
|
||||
Profiles are stored by default in ~/.crawl4ai/profiles/
|
||||
"""
|
||||
|
||||
def __init__(self, logger: Optional[AsyncLoggerBase] = None):
|
||||
"""Initialize the BrowserProfileManager.
|
||||
|
||||
Args:
|
||||
logger: Logger for outputting messages. If None, a default AsyncLogger is created.
|
||||
"""
|
||||
# Initialize colorama for colorful terminal output
|
||||
init()
|
||||
|
||||
# Create a logger if not provided
|
||||
if logger is None:
|
||||
self.logger = AsyncLogger(verbose=True)
|
||||
elif not isinstance(logger, AsyncLoggerBase):
|
||||
self.logger = AsyncLogger(verbose=True)
|
||||
else:
|
||||
self.logger = logger
|
||||
|
||||
# Ensure profiles directory exists
|
||||
self.profiles_dir = os.path.join(get_home_folder(), "profiles")
|
||||
os.makedirs(self.profiles_dir, exist_ok=True)
|
||||
|
||||
async def create_profile(self,
|
||||
profile_name: Optional[str] = None,
|
||||
browser_config: Optional[BrowserConfig] = None) -> Optional[str]:
|
||||
"""Create a browser profile interactively.
|
||||
|
||||
Args:
|
||||
profile_name: Name for the profile. If None, a name is generated.
|
||||
browser_config: Configuration for the browser. If None, a default configuration is used.
|
||||
|
||||
Returns:
|
||||
Path to the created profile directory, or None if creation failed
|
||||
"""
|
||||
# Create default browser config if none provided
|
||||
if browser_config is None:
|
||||
browser_config = BrowserConfig(
|
||||
browser_type="chromium",
|
||||
headless=False, # Must be visible for user interaction
|
||||
verbose=True
|
||||
)
|
||||
else:
|
||||
# Ensure headless is False for user interaction
|
||||
browser_config.headless = False
|
||||
|
||||
# Generate profile name if not provided
|
||||
if not profile_name:
|
||||
timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
|
||||
profile_name = f"profile_{timestamp}_{uuid.uuid4().hex[:6]}"
|
||||
|
||||
# Sanitize profile name (replace spaces and special chars)
|
||||
profile_name = "".join(c if c.isalnum() or c in "-_" else "_" for c in profile_name)
|
||||
|
||||
# Set user data directory
|
||||
profile_path = os.path.join(self.profiles_dir, profile_name)
|
||||
os.makedirs(profile_path, exist_ok=True)
|
||||
|
||||
# Print instructions for the user with colorama formatting
|
||||
border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}"
|
||||
self.logger.info(f"\n{border}", tag="PROFILE")
|
||||
self.logger.info(f"Creating browser profile: {Fore.GREEN}{profile_name}{Style.RESET_ALL}", tag="PROFILE")
|
||||
self.logger.info(f"Profile directory: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="PROFILE")
|
||||
|
||||
self.logger.info("\nInstructions:", tag="PROFILE")
|
||||
self.logger.info("1. A browser window will open for you to set up your profile.", tag="PROFILE")
|
||||
self.logger.info(f"2. {Fore.CYAN}Log in to websites{Style.RESET_ALL}, configure settings, etc. as needed.", tag="PROFILE")
|
||||
self.logger.info(f"3. When you're done, {Fore.YELLOW}press 'q' in this terminal{Style.RESET_ALL} to close the browser.", tag="PROFILE")
|
||||
self.logger.info("4. The profile will be saved and ready to use with Crawl4AI.", tag="PROFILE")
|
||||
self.logger.info(f"{border}\n", tag="PROFILE")
|
||||
|
||||
# Import the necessary classes with local imports to avoid circular references
|
||||
from .strategies import CDPBrowserStrategy
|
||||
|
||||
# Set browser config to use the profile path
|
||||
browser_config.user_data_dir = profile_path
|
||||
|
||||
# Create a CDP browser strategy for the profile creation
|
||||
browser_strategy = CDPBrowserStrategy(browser_config, self.logger)
|
||||
|
||||
# Set up signal handlers to ensure cleanup on interrupt
|
||||
original_sigint = signal.getsignal(signal.SIGINT)
|
||||
original_sigterm = signal.getsignal(signal.SIGTERM)
|
||||
|
||||
# Define cleanup handler for signals
|
||||
async def cleanup_handler(sig, frame):
|
||||
self.logger.warning("\nCleaning up browser process...", tag="PROFILE")
|
||||
await browser_strategy.close()
|
||||
# Restore original signal handlers
|
||||
signal.signal(signal.SIGINT, original_sigint)
|
||||
signal.signal(signal.SIGTERM, original_sigterm)
|
||||
if sig == signal.SIGINT:
|
||||
self.logger.error("Profile creation interrupted. Profile may be incomplete.", tag="PROFILE")
|
||||
sys.exit(1)
|
||||
|
||||
# Set signal handlers
|
||||
def sigint_handler(sig, frame):
|
||||
asyncio.create_task(cleanup_handler(sig, frame))
|
||||
|
||||
signal.signal(signal.SIGINT, sigint_handler)
|
||||
signal.signal(signal.SIGTERM, sigint_handler)
|
||||
|
||||
# Event to signal when user is done with the browser
|
||||
user_done_event = asyncio.Event()
|
||||
|
||||
# Run keyboard input loop in a separate task
|
||||
async def listen_for_quit_command():
|
||||
import termios
|
||||
import tty
|
||||
import select
|
||||
|
||||
# First output the prompt
|
||||
self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' when you've finished using the browser...{Style.RESET_ALL}", tag="PROFILE")
|
||||
|
||||
# Save original terminal settings
|
||||
fd = sys.stdin.fileno()
|
||||
old_settings = termios.tcgetattr(fd)
|
||||
|
||||
try:
|
||||
# Switch to non-canonical mode (no line buffering)
|
||||
tty.setcbreak(fd)
|
||||
|
||||
while True:
|
||||
# Check if input is available (non-blocking)
|
||||
readable, _, _ = select.select([sys.stdin], [], [], 0.5)
|
||||
if readable:
|
||||
key = sys.stdin.read(1)
|
||||
if key.lower() == 'q':
|
||||
self.logger.info(f"{Fore.GREEN}Closing browser and saving profile...{Style.RESET_ALL}", tag="PROFILE")
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
# Check if the browser process has already exited
|
||||
if browser_strategy.browser_process and browser_strategy.browser_process.poll() is not None:
|
||||
self.logger.info("Browser already closed. Ending input listener.", tag="PROFILE")
|
||||
user_done_event.set()
|
||||
return
|
||||
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
finally:
|
||||
# Restore terminal settings
|
||||
termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)
|
||||
|
||||
try:
|
||||
# Start the browser
|
||||
await browser_strategy.start()
|
||||
|
||||
# Check if browser started successfully
|
||||
if not browser_strategy.browser_process:
|
||||
self.logger.error("Failed to start browser process.", tag="PROFILE")
|
||||
return None
|
||||
|
||||
self.logger.info(f"Browser launched. {Fore.CYAN}Waiting for you to finish...{Style.RESET_ALL}", tag="PROFILE")
|
||||
|
||||
# Start listening for keyboard input
|
||||
listener_task = asyncio.create_task(listen_for_quit_command())
|
||||
|
||||
# Wait for either the user to press 'q' or for the browser process to exit naturally
|
||||
while not user_done_event.is_set() and browser_strategy.browser_process.poll() is None:
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
# Cancel the listener task if it's still running
|
||||
if not listener_task.done():
|
||||
listener_task.cancel()
|
||||
try:
|
||||
await listener_task
|
||||
except asyncio.CancelledError:
|
||||
pass
|
||||
|
||||
# If the browser is still running and the user pressed 'q', terminate it
|
||||
if browser_strategy.browser_process.poll() is None and user_done_event.is_set():
|
||||
self.logger.info("Terminating browser process...", tag="PROFILE")
|
||||
await browser_strategy.close()
|
||||
|
||||
self.logger.success(f"Browser closed. Profile saved at: {Fore.GREEN}{profile_path}{Style.RESET_ALL}", tag="PROFILE")
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error creating profile: {str(e)}", tag="PROFILE")
|
||||
await browser_strategy.close()
|
||||
return None
|
||||
finally:
|
||||
# Restore original signal handlers
|
||||
signal.signal(signal.SIGINT, original_sigint)
|
||||
signal.signal(signal.SIGTERM, original_sigterm)
|
||||
|
||||
# Make sure browser is fully cleaned up
|
||||
await browser_strategy.close()
|
||||
|
||||
# Return the profile path
|
||||
return profile_path
|
||||
|
||||
def list_profiles(self) -> List[Dict[str, Any]]:
|
||||
"""List all available browser profiles.
|
||||
|
||||
Returns:
|
||||
List of dictionaries containing profile information
|
||||
"""
|
||||
if not os.path.exists(self.profiles_dir):
|
||||
return []
|
||||
|
||||
profiles = []
|
||||
|
||||
for name in os.listdir(self.profiles_dir):
|
||||
profile_path = os.path.join(self.profiles_dir, name)
|
||||
|
||||
# Skip if not a directory
|
||||
if not os.path.isdir(profile_path):
|
||||
continue
|
||||
|
||||
# Check if this looks like a valid browser profile
|
||||
# For Chromium: Look for Preferences file
|
||||
# For Firefox: Look for prefs.js file
|
||||
is_valid = False
|
||||
|
||||
if os.path.exists(os.path.join(profile_path, "Preferences")) or \
|
||||
os.path.exists(os.path.join(profile_path, "Default", "Preferences")):
|
||||
is_valid = "chromium"
|
||||
elif os.path.exists(os.path.join(profile_path, "prefs.js")):
|
||||
is_valid = "firefox"
|
||||
|
||||
if is_valid:
|
||||
# Get creation time
|
||||
created = datetime.datetime.fromtimestamp(
|
||||
os.path.getctime(profile_path)
|
||||
)
|
||||
|
||||
profiles.append({
|
||||
"name": name,
|
||||
"path": profile_path,
|
||||
"created": created,
|
||||
"type": is_valid
|
||||
})
|
||||
|
||||
# Sort by creation time, newest first
|
||||
profiles.sort(key=lambda x: x["created"], reverse=True)
|
||||
|
||||
return profiles
|
||||
|
||||
def get_profile_path(self, profile_name: str) -> Optional[str]:
|
||||
"""Get the full path to a profile by name.
|
||||
|
||||
Args:
|
||||
profile_name: Name of the profile (not the full path)
|
||||
|
||||
Returns:
|
||||
Full path to the profile directory, or None if not found
|
||||
"""
|
||||
profile_path = os.path.join(self.profiles_dir, profile_name)
|
||||
|
||||
# Check if path exists and is a valid profile
|
||||
if not os.path.isdir(profile_path):
|
||||
# Check if profile_name itself is full path
|
||||
if os.path.isabs(profile_name):
|
||||
profile_path = profile_name
|
||||
else:
|
||||
return None
|
||||
|
||||
# Look for profile indicators
|
||||
is_profile = (
|
||||
os.path.exists(os.path.join(profile_path, "Preferences")) or
|
||||
os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or
|
||||
os.path.exists(os.path.join(profile_path, "prefs.js"))
|
||||
)
|
||||
|
||||
if not is_profile:
|
||||
return None # Not a valid browser profile
|
||||
|
||||
return profile_path
|
||||
|
||||
def delete_profile(self, profile_name_or_path: str) -> bool:
|
||||
"""Delete a browser profile by name or path.
|
||||
|
||||
Args:
|
||||
profile_name_or_path: Name of the profile or full path to profile directory
|
||||
|
||||
Returns:
|
||||
True if the profile was deleted successfully, False otherwise
|
||||
"""
|
||||
# Determine if input is a name or a path
|
||||
if os.path.isabs(profile_name_or_path):
|
||||
# Full path provided
|
||||
profile_path = profile_name_or_path
|
||||
else:
|
||||
# Just a name provided, construct path
|
||||
profile_path = os.path.join(self.profiles_dir, profile_name_or_path)
|
||||
|
||||
# Check if path exists and is a valid profile
|
||||
if not os.path.isdir(profile_path):
|
||||
return False
|
||||
|
||||
# Look for profile indicators
|
||||
is_profile = (
|
||||
os.path.exists(os.path.join(profile_path, "Preferences")) or
|
||||
os.path.exists(os.path.join(profile_path, "Default", "Preferences")) or
|
||||
os.path.exists(os.path.join(profile_path, "prefs.js"))
|
||||
)
|
||||
|
||||
if not is_profile:
|
||||
return False # Not a valid browser profile
|
||||
|
||||
# Delete the profile directory
|
||||
try:
|
||||
shutil.rmtree(profile_path)
|
||||
return True
|
||||
except Exception:
|
||||
return False
|
||||
|
||||
async def interactive_manager(self, crawl_callback=None):
|
||||
"""Launch an interactive profile management console.
|
||||
|
||||
Args:
|
||||
crawl_callback: Function to call when selecting option to use
|
||||
a profile for crawling. It will be called with (profile_path, url).
|
||||
"""
|
||||
while True:
|
||||
self.logger.info(f"\n{Fore.CYAN}Profile Management Options:{Style.RESET_ALL}", tag="MENU")
|
||||
self.logger.info(f"1. {Fore.GREEN}Create a new profile{Style.RESET_ALL}", tag="MENU")
|
||||
self.logger.info(f"2. {Fore.YELLOW}List available profiles{Style.RESET_ALL}", tag="MENU")
|
||||
self.logger.info(f"3. {Fore.RED}Delete a profile{Style.RESET_ALL}", tag="MENU")
|
||||
|
||||
# Only show crawl option if callback provided
|
||||
if crawl_callback:
|
||||
self.logger.info(f"4. {Fore.CYAN}Use a profile to crawl a website{Style.RESET_ALL}", tag="MENU")
|
||||
self.logger.info(f"5. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU")
|
||||
exit_option = "5"
|
||||
else:
|
||||
self.logger.info(f"4. {Fore.MAGENTA}Exit{Style.RESET_ALL}", tag="MENU")
|
||||
exit_option = "4"
|
||||
|
||||
choice = input(f"\n{Fore.CYAN}Enter your choice (1-{exit_option}): {Style.RESET_ALL}")
|
||||
|
||||
if choice == "1":
|
||||
# Create new profile
|
||||
name = input(f"{Fore.GREEN}Enter a name for the new profile (or press Enter for auto-generated name): {Style.RESET_ALL}")
|
||||
await self.create_profile(name or None)
|
||||
|
||||
elif choice == "2":
|
||||
# List profiles
|
||||
profiles = self.list_profiles()
|
||||
|
||||
if not profiles:
|
||||
self.logger.warning(" No profiles found. Create one first with option 1.", tag="PROFILES")
|
||||
continue
|
||||
|
||||
# Print profile information with colorama formatting
|
||||
self.logger.info("\nAvailable profiles:", tag="PROFILES")
|
||||
for i, profile in enumerate(profiles):
|
||||
self.logger.info(f"[{i+1}] {Fore.CYAN}{profile['name']}{Style.RESET_ALL}", tag="PROFILES")
|
||||
self.logger.info(f" Path: {Fore.YELLOW}{profile['path']}{Style.RESET_ALL}", tag="PROFILES")
|
||||
self.logger.info(f" Created: {profile['created'].strftime('%Y-%m-%d %H:%M:%S')}", tag="PROFILES")
|
||||
self.logger.info(f" Browser type: {profile['type']}", tag="PROFILES")
|
||||
self.logger.info("", tag="PROFILES") # Empty line for spacing
|
||||
|
||||
elif choice == "3":
|
||||
# Delete profile
|
||||
profiles = self.list_profiles()
|
||||
if not profiles:
|
||||
self.logger.warning("No profiles found to delete", tag="PROFILES")
|
||||
continue
|
||||
|
||||
# Display numbered list
|
||||
self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES")
|
||||
for i, profile in enumerate(profiles):
|
||||
self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")
|
||||
|
||||
# Get profile to delete
|
||||
profile_idx = input(f"{Fore.RED}Enter the number of the profile to delete (or 'c' to cancel): {Style.RESET_ALL}")
|
||||
if profile_idx.lower() == 'c':
|
||||
continue
|
||||
|
||||
try:
|
||||
idx = int(profile_idx) - 1
|
||||
if 0 <= idx < len(profiles):
|
||||
profile_name = profiles[idx]["name"]
|
||||
self.logger.info(f"Deleting profile: {Fore.YELLOW}{profile_name}{Style.RESET_ALL}", tag="PROFILES")
|
||||
|
||||
# Confirm deletion
|
||||
confirm = input(f"{Fore.RED}Are you sure you want to delete this profile? (y/n): {Style.RESET_ALL}")
|
||||
if confirm.lower() == 'y':
|
||||
success = self.delete_profile(profiles[idx]["path"])
|
||||
|
||||
if success:
|
||||
self.logger.success(f"Profile {Fore.GREEN}{profile_name}{Style.RESET_ALL} deleted successfully", tag="PROFILES")
|
||||
else:
|
||||
self.logger.error(f"Failed to delete profile {Fore.RED}{profile_name}{Style.RESET_ALL}", tag="PROFILES")
|
||||
else:
|
||||
self.logger.error("Invalid profile number", tag="PROFILES")
|
||||
except ValueError:
|
||||
self.logger.error("Please enter a valid number", tag="PROFILES")
|
||||
|
||||
elif choice == "4" and crawl_callback:
|
||||
# Use profile to crawl a site
|
||||
profiles = self.list_profiles()
|
||||
if not profiles:
|
||||
self.logger.warning("No profiles found. Create one first.", tag="PROFILES")
|
||||
continue
|
||||
|
||||
# Display numbered list
|
||||
self.logger.info(f"\n{Fore.YELLOW}Available profiles:{Style.RESET_ALL}", tag="PROFILES")
|
||||
for i, profile in enumerate(profiles):
|
||||
self.logger.info(f"[{i+1}] {profile['name']}", tag="PROFILES")
|
||||
|
||||
# Get profile to use
|
||||
profile_idx = input(f"{Fore.CYAN}Enter the number of the profile to use (or 'c' to cancel): {Style.RESET_ALL}")
|
||||
if profile_idx.lower() == 'c':
|
||||
continue
|
||||
|
||||
try:
|
||||
idx = int(profile_idx) - 1
|
||||
if 0 <= idx < len(profiles):
|
||||
profile_path = profiles[idx]["path"]
|
||||
url = input(f"{Fore.CYAN}Enter the URL to crawl: {Style.RESET_ALL}")
|
||||
if url:
|
||||
# Call the provided crawl callback
|
||||
await crawl_callback(profile_path, url)
|
||||
else:
|
||||
self.logger.error("No URL provided", tag="CRAWL")
|
||||
else:
|
||||
self.logger.error("Invalid profile number", tag="PROFILES")
|
||||
except ValueError:
|
||||
self.logger.error("Please enter a valid number", tag="PROFILES")
|
||||
|
||||
elif (choice == "4" and not crawl_callback) or (choice == "5" and crawl_callback):
|
||||
# Exit
|
||||
self.logger.info("Exiting profile management", tag="MENU")
|
||||
break
|
||||
|
||||
else:
|
||||
self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU")
|
||||
1256
crawl4ai/browser/strategies.py
Normal file
1256
crawl4ai/browser/strategies.py
Normal file
File diff suppressed because it is too large
Load Diff
328
crawl4ai/browser/utils.py
Normal file
328
crawl4ai/browser/utils.py
Normal file
@@ -0,0 +1,328 @@
|
||||
"""Browser utilities module for Crawl4AI.
|
||||
|
||||
This module provides utility functions for browser management,
|
||||
including process management, CDP connection utilities,
|
||||
and Playwright instance management.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import tempfile
|
||||
import subprocess
|
||||
from typing import Optional
|
||||
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
from ..utils import get_chromium_path
|
||||
from ..async_configs import BrowserConfig, CrawlerRunConfig
|
||||
|
||||
from ..async_logger import AsyncLogger
|
||||
|
||||
|
||||
_playwright_instance = None
|
||||
|
||||
async def get_playwright():
|
||||
"""Get or create the Playwright instance (singleton pattern).
|
||||
|
||||
Returns:
|
||||
Playwright: The Playwright instance
|
||||
"""
|
||||
global _playwright_instance
|
||||
if _playwright_instance is None or True:
|
||||
_playwright_instance = await async_playwright().start()
|
||||
return _playwright_instance
|
||||
|
||||
async def get_browser_executable(browser_type: str) -> str:
|
||||
"""Get the path to browser executable, with platform-specific handling.
|
||||
|
||||
Args:
|
||||
browser_type: Type of browser (chromium, firefox, webkit)
|
||||
|
||||
Returns:
|
||||
Path to browser executable
|
||||
"""
|
||||
return await get_chromium_path(browser_type)
|
||||
|
||||
def create_temp_directory(prefix="browser-profile-") -> str:
|
||||
"""Create a temporary directory for browser data.
|
||||
|
||||
Args:
|
||||
prefix: Prefix for the temporary directory name
|
||||
|
||||
Returns:
|
||||
Path to the created temporary directory
|
||||
"""
|
||||
return tempfile.mkdtemp(prefix=prefix)
|
||||
|
||||
def is_windows() -> bool:
|
||||
"""Check if the current platform is Windows.
|
||||
|
||||
Returns:
|
||||
True if Windows, False otherwise
|
||||
"""
|
||||
return sys.platform == "win32"
|
||||
|
||||
def is_macos() -> bool:
|
||||
"""Check if the current platform is macOS.
|
||||
|
||||
Returns:
|
||||
True if macOS, False otherwise
|
||||
"""
|
||||
return sys.platform == "darwin"
|
||||
|
||||
def is_linux() -> bool:
|
||||
"""Check if the current platform is Linux.
|
||||
|
||||
Returns:
|
||||
True if Linux, False otherwise
|
||||
"""
|
||||
return not (is_windows() or is_macos())
|
||||
|
||||
def is_browser_running(pid: Optional[int]) -> bool:
|
||||
"""Check if a process with the given PID is running.
|
||||
|
||||
Args:
|
||||
pid: Process ID to check
|
||||
|
||||
Returns:
|
||||
bool: True if the process is running, False otherwise
|
||||
"""
|
||||
if not pid:
|
||||
return False
|
||||
|
||||
try:
|
||||
# Check if the process exists
|
||||
if is_windows():
|
||||
process = subprocess.run(["tasklist", "/FI", f"PID eq {pid}"],
|
||||
capture_output=True, text=True)
|
||||
return str(pid) in process.stdout
|
||||
else:
|
||||
# Unix-like systems
|
||||
os.kill(pid, 0) # This doesn't actually kill the process, just checks if it exists
|
||||
return True
|
||||
except (ProcessLookupError, PermissionError, OSError):
|
||||
return False
|
||||
|
||||
def get_browser_disable_options() -> list:
|
||||
"""Get standard list of browser disable options for performance.
|
||||
|
||||
Returns:
|
||||
List of command-line options to disable various browser features
|
||||
"""
|
||||
return [
|
||||
"--disable-background-networking",
|
||||
"--disable-background-timer-throttling",
|
||||
"--disable-backgrounding-occluded-windows",
|
||||
"--disable-breakpad",
|
||||
"--disable-client-side-phishing-detection",
|
||||
"--disable-component-extensions-with-background-pages",
|
||||
"--disable-default-apps",
|
||||
"--disable-extensions",
|
||||
"--disable-features=TranslateUI",
|
||||
"--disable-hang-monitor",
|
||||
"--disable-ipc-flooding-protection",
|
||||
"--disable-popup-blocking",
|
||||
"--disable-prompt-on-repost",
|
||||
"--disable-sync",
|
||||
"--force-color-profile=srgb",
|
||||
"--metrics-recording-only",
|
||||
"--no-first-run",
|
||||
"--password-store=basic",
|
||||
"--use-mock-keychain",
|
||||
]
|
||||
|
||||
|
||||
async def find_optimal_browser_config(total_urls=50, verbose=True, rate_limit_delay=0.2):
|
||||
"""Find optimal browser configuration for crawling a specific number of URLs.
|
||||
|
||||
Args:
|
||||
total_urls: Number of URLs to crawl
|
||||
verbose: Whether to print progress
|
||||
rate_limit_delay: Delay between page loads to avoid rate limiting
|
||||
|
||||
Returns:
|
||||
dict: Contains fastest, lowest_memory, and optimal configurations
|
||||
"""
|
||||
from .manager import BrowserManager
|
||||
if verbose:
|
||||
print(f"\n=== Finding optimal configuration for crawling {total_urls} URLs ===\n")
|
||||
|
||||
# Generate test URLs with timestamp to avoid caching
|
||||
timestamp = int(time.time())
|
||||
urls = [f"https://example.com/page_{i}?t={timestamp}" for i in range(total_urls)]
|
||||
|
||||
# Limit browser configurations to test (1 browser to max 10)
|
||||
max_browsers = min(10, total_urls)
|
||||
configs_to_test = []
|
||||
|
||||
# Generate configurations (browser count, pages distribution)
|
||||
for num_browsers in range(1, max_browsers + 1):
|
||||
base_pages = total_urls // num_browsers
|
||||
remainder = total_urls % num_browsers
|
||||
|
||||
# Create distribution array like [3, 3, 2, 2] (some browsers get one more page)
|
||||
if remainder > 0:
|
||||
distribution = [base_pages + 1] * remainder + [base_pages] * (num_browsers - remainder)
|
||||
else:
|
||||
distribution = [base_pages] * num_browsers
|
||||
|
||||
configs_to_test.append((num_browsers, distribution))
|
||||
|
||||
results = []
|
||||
|
||||
# Test each configuration
|
||||
for browser_count, page_distribution in configs_to_test:
|
||||
if verbose:
|
||||
print(f"Testing {browser_count} browsers with distribution {tuple(page_distribution)}")
|
||||
|
||||
try:
|
||||
# Track memory if possible
|
||||
try:
|
||||
import psutil
|
||||
process = psutil.Process()
|
||||
start_memory = process.memory_info().rss / (1024 * 1024) # MB
|
||||
except ImportError:
|
||||
if verbose:
|
||||
print("Memory tracking not available (psutil not installed)")
|
||||
start_memory = 0
|
||||
|
||||
# Start browsers in parallel
|
||||
managers = []
|
||||
start_tasks = []
|
||||
start_time = time.time()
|
||||
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
for i in range(browser_count):
|
||||
config = BrowserConfig(headless=True)
|
||||
manager = BrowserManager(browser_config=config, logger=logger)
|
||||
start_tasks.append(manager.start())
|
||||
managers.append(manager)
|
||||
|
||||
await asyncio.gather(*start_tasks)
|
||||
|
||||
# Distribute URLs among browsers
|
||||
urls_per_manager = {}
|
||||
url_index = 0
|
||||
|
||||
for i, manager in enumerate(managers):
|
||||
pages_for_this_browser = page_distribution[i]
|
||||
end_index = url_index + pages_for_this_browser
|
||||
urls_per_manager[manager] = urls[url_index:end_index]
|
||||
url_index = end_index
|
||||
|
||||
# Create pages for each browser
|
||||
all_pages = []
|
||||
for manager, manager_urls in urls_per_manager.items():
|
||||
if not manager_urls:
|
||||
continue
|
||||
pages = await manager.get_pages(CrawlerRunConfig(), count=len(manager_urls))
|
||||
all_pages.extend(zip(pages, manager_urls))
|
||||
|
||||
# Crawl pages with delay to avoid rate limiting
|
||||
async def crawl_page(page_ctx, url):
|
||||
page, _ = page_ctx
|
||||
try:
|
||||
await page.goto(url)
|
||||
if rate_limit_delay > 0:
|
||||
await asyncio.sleep(rate_limit_delay)
|
||||
title = await page.title()
|
||||
return title
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
crawl_start = time.time()
|
||||
crawl_tasks = [crawl_page(page_ctx, url) for page_ctx, url in all_pages]
|
||||
await asyncio.gather(*crawl_tasks)
|
||||
crawl_time = time.time() - crawl_start
|
||||
total_time = time.time() - start_time
|
||||
|
||||
# Measure final memory usage
|
||||
if start_memory > 0:
|
||||
end_memory = process.memory_info().rss / (1024 * 1024)
|
||||
memory_used = end_memory - start_memory
|
||||
else:
|
||||
memory_used = 0
|
||||
|
||||
# Close all browsers
|
||||
for manager in managers:
|
||||
await manager.close()
|
||||
|
||||
# Calculate metrics
|
||||
pages_per_second = total_urls / crawl_time
|
||||
|
||||
# Calculate efficiency score (higher is better)
|
||||
# This balances speed vs memory
|
||||
if memory_used > 0:
|
||||
efficiency = pages_per_second / (memory_used + 1)
|
||||
else:
|
||||
efficiency = pages_per_second
|
||||
|
||||
# Store result
|
||||
result = {
|
||||
"browser_count": browser_count,
|
||||
"distribution": tuple(page_distribution),
|
||||
"crawl_time": crawl_time,
|
||||
"total_time": total_time,
|
||||
"memory_used": memory_used,
|
||||
"pages_per_second": pages_per_second,
|
||||
"efficiency": efficiency
|
||||
}
|
||||
|
||||
results.append(result)
|
||||
|
||||
if verbose:
|
||||
print(f" ✓ Crawled {total_urls} pages in {crawl_time:.2f}s ({pages_per_second:.1f} pages/sec)")
|
||||
if memory_used > 0:
|
||||
print(f" ✓ Memory used: {memory_used:.1f}MB ({memory_used/total_urls:.1f}MB per page)")
|
||||
print(f" ✓ Efficiency score: {efficiency:.4f}")
|
||||
|
||||
except Exception as e:
|
||||
if verbose:
|
||||
print(f" ✗ Error: {str(e)}")
|
||||
|
||||
# Clean up
|
||||
for manager in managers:
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
# If no successful results, return None
|
||||
if not results:
|
||||
return None
|
||||
|
||||
# Find best configurations
|
||||
fastest = sorted(results, key=lambda x: x["crawl_time"])[0]
|
||||
|
||||
# Only consider memory if available
|
||||
memory_results = [r for r in results if r["memory_used"] > 0]
|
||||
if memory_results:
|
||||
lowest_memory = sorted(memory_results, key=lambda x: x["memory_used"])[0]
|
||||
else:
|
||||
lowest_memory = fastest
|
||||
|
||||
# Find most efficient (balanced speed vs memory)
|
||||
optimal = sorted(results, key=lambda x: x["efficiency"], reverse=True)[0]
|
||||
|
||||
# Print summary
|
||||
if verbose:
|
||||
print("\n=== OPTIMAL CONFIGURATIONS ===")
|
||||
print(f"⚡ Fastest: {fastest['browser_count']} browsers {fastest['distribution']}")
|
||||
print(f" {fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/sec")
|
||||
|
||||
print(f"💾 Memory-efficient: {lowest_memory['browser_count']} browsers {lowest_memory['distribution']}")
|
||||
if lowest_memory["memory_used"] > 0:
|
||||
print(f" {lowest_memory['memory_used']:.1f}MB, {lowest_memory['memory_used']/total_urls:.2f}MB per page")
|
||||
|
||||
print(f"🌟 Balanced optimal: {optimal['browser_count']} browsers {optimal['distribution']}")
|
||||
print(f" {optimal['crawl_time']:.2f}s, {optimal['pages_per_second']:.1f} pages/sec, score: {optimal['efficiency']:.4f}")
|
||||
|
||||
return {
|
||||
"fastest": fastest,
|
||||
"lowest_memory": lowest_memory,
|
||||
"optimal": optimal,
|
||||
"all_configs": results
|
||||
}
|
||||
@@ -145,17 +145,60 @@ class ManagedBrowser:
|
||||
|
||||
# Start browser process
|
||||
try:
|
||||
self.browser_process = subprocess.Popen(
|
||||
args, stdout=subprocess.PIPE, stderr=subprocess.PIPE
|
||||
)
|
||||
# Monitor browser process output for errors
|
||||
asyncio.create_task(self._monitor_browser_process())
|
||||
# Use DETACHED_PROCESS flag on Windows to fully detach the process
|
||||
# On Unix, we'll use preexec_fn=os.setpgrp to start the process in a new process group
|
||||
if sys.platform == "win32":
|
||||
self.browser_process = subprocess.Popen(
|
||||
args,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
creationflags=subprocess.DETACHED_PROCESS | subprocess.CREATE_NEW_PROCESS_GROUP
|
||||
)
|
||||
else:
|
||||
self.browser_process = subprocess.Popen(
|
||||
args,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.PIPE,
|
||||
preexec_fn=os.setpgrp # Start in a new process group
|
||||
)
|
||||
|
||||
# We'll monitor for a short time to make sure it starts properly, but won't keep monitoring
|
||||
await asyncio.sleep(0.5) # Give browser time to start
|
||||
await self._initial_startup_check()
|
||||
await asyncio.sleep(2) # Give browser time to start
|
||||
return f"http://{self.host}:{self.debugging_port}"
|
||||
except Exception as e:
|
||||
await self.cleanup()
|
||||
raise Exception(f"Failed to start browser: {e}")
|
||||
|
||||
async def _initial_startup_check(self):
|
||||
"""
|
||||
Perform a quick check to make sure the browser started successfully.
|
||||
This only runs once at startup rather than continuously monitoring.
|
||||
"""
|
||||
if not self.browser_process:
|
||||
return
|
||||
|
||||
# Check that process started without immediate termination
|
||||
await asyncio.sleep(0.5)
|
||||
if self.browser_process.poll() is not None:
|
||||
# Process already terminated
|
||||
stdout, stderr = b"", b""
|
||||
try:
|
||||
stdout, stderr = self.browser_process.communicate(timeout=0.5)
|
||||
except subprocess.TimeoutExpired:
|
||||
pass
|
||||
|
||||
self.logger.error(
|
||||
message="Browser process terminated during startup | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
|
||||
tag="ERROR",
|
||||
params={
|
||||
"code": self.browser_process.returncode,
|
||||
"stdout": stdout.decode() if stdout else "",
|
||||
"stderr": stderr.decode() if stderr else "",
|
||||
},
|
||||
)
|
||||
|
||||
async def _monitor_browser_process(self):
|
||||
"""
|
||||
Monitor the browser process for unexpected termination.
|
||||
@@ -167,6 +210,7 @@ class ManagedBrowser:
|
||||
4. If any other error occurs, log the error message.
|
||||
|
||||
Note: This method should be called in a separate task to avoid blocking the main event loop.
|
||||
This is DEPRECATED and should not be used for builtin browsers that need to outlive the Python process.
|
||||
"""
|
||||
if self.browser_process:
|
||||
try:
|
||||
@@ -261,22 +305,33 @@ class ManagedBrowser:
|
||||
|
||||
if self.browser_process:
|
||||
try:
|
||||
self.browser_process.terminate()
|
||||
# Wait for process to end gracefully
|
||||
for _ in range(10): # 10 attempts, 100ms each
|
||||
if self.browser_process.poll() is not None:
|
||||
break
|
||||
await asyncio.sleep(0.1)
|
||||
# For builtin browsers that should persist, we should check if it's a detached process
|
||||
# Only terminate if we have proper control over the process
|
||||
if not self.browser_process.poll():
|
||||
# Process is still running
|
||||
self.browser_process.terminate()
|
||||
# Wait for process to end gracefully
|
||||
for _ in range(10): # 10 attempts, 100ms each
|
||||
if self.browser_process.poll() is not None:
|
||||
break
|
||||
await asyncio.sleep(0.1)
|
||||
|
||||
# Force kill if still running
|
||||
if self.browser_process.poll() is None:
|
||||
self.browser_process.kill()
|
||||
await asyncio.sleep(0.1) # Brief wait for kill to take effect
|
||||
# Force kill if still running
|
||||
if self.browser_process.poll() is None:
|
||||
if sys.platform == "win32":
|
||||
# On Windows we might need taskkill for detached processes
|
||||
try:
|
||||
subprocess.run(["taskkill", "/F", "/PID", str(self.browser_process.pid)])
|
||||
except Exception:
|
||||
self.browser_process.kill()
|
||||
else:
|
||||
self.browser_process.kill()
|
||||
await asyncio.sleep(0.1) # Brief wait for kill to take effect
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
message="Error terminating browser: {error}",
|
||||
tag="ERROR",
|
||||
tag="ERROR",
|
||||
params={"error": str(e)},
|
||||
)
|
||||
|
||||
@@ -379,7 +434,15 @@ class BrowserManager:
|
||||
sessions (dict): Dictionary to store session information
|
||||
session_ttl (int): Session timeout in seconds
|
||||
"""
|
||||
|
||||
_playwright_instance = None
|
||||
|
||||
@classmethod
|
||||
async def get_playwright(cls):
|
||||
from playwright.async_api import async_playwright
|
||||
if cls._playwright_instance is None:
|
||||
cls._playwright_instance = await async_playwright().start()
|
||||
return cls._playwright_instance
|
||||
|
||||
def __init__(self, browser_config: BrowserConfig, logger=None):
|
||||
"""
|
||||
@@ -429,32 +492,21 @@ class BrowserManager:
|
||||
|
||||
Note: This method should be called in a separate task to avoid blocking the main event loop.
|
||||
"""
|
||||
self.playwright = await self.get_playwright()
|
||||
if self.playwright is None:
|
||||
from playwright.async_api import async_playwright
|
||||
|
||||
self.playwright = await async_playwright().start()
|
||||
|
||||
if self.config.use_managed_browser:
|
||||
cdp_url = await self.managed_browser.start()
|
||||
if self.config.cdp_url or self.config.use_managed_browser:
|
||||
self.config.use_managed_browser = True
|
||||
cdp_url = await self.managed_browser.start() if not self.config.cdp_url else self.config.cdp_url
|
||||
self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
|
||||
contexts = self.browser.contexts
|
||||
if contexts:
|
||||
self.default_context = contexts[0]
|
||||
else:
|
||||
self.default_context = await self.create_browser_context()
|
||||
# self.default_context = await self.browser.new_context(
|
||||
# viewport={
|
||||
# "width": self.config.viewport_width,
|
||||
# "height": self.config.viewport_height,
|
||||
# },
|
||||
# storage_state=self.config.storage_state,
|
||||
# user_agent=self.config.headers.get(
|
||||
# "User-Agent", self.config.user_agent
|
||||
# ),
|
||||
# accept_downloads=self.config.accept_downloads,
|
||||
# ignore_https_errors=self.config.ignore_https_errors,
|
||||
# java_script_enabled=self.config.java_script_enabled,
|
||||
# )
|
||||
await self.setup_context(self.default_context)
|
||||
else:
|
||||
browser_args = self._build_browser_args()
|
||||
@@ -469,6 +521,7 @@ class BrowserManager:
|
||||
|
||||
self.default_context = self.browser
|
||||
|
||||
|
||||
def _build_browser_args(self) -> dict:
|
||||
"""Build browser launch arguments from config."""
|
||||
args = [
|
||||
@@ -530,9 +583,9 @@ class BrowserManager:
|
||||
ProxySettings(server=self.config.proxy)
|
||||
if self.config.proxy
|
||||
else ProxySettings(
|
||||
server=self.config.proxy_config.get("server"),
|
||||
username=self.config.proxy_config.get("username"),
|
||||
password=self.config.proxy_config.get("password"),
|
||||
server=self.config.proxy_config.server,
|
||||
username=self.config.proxy_config.username,
|
||||
password=self.config.proxy_config.password,
|
||||
)
|
||||
)
|
||||
browser_args["proxy"] = proxy_settings
|
||||
@@ -790,7 +843,10 @@ class BrowserManager:
|
||||
# If using a managed browser, just grab the shared default_context
|
||||
if self.config.use_managed_browser:
|
||||
context = self.default_context
|
||||
page = await context.new_page()
|
||||
pages = context.pages
|
||||
page = next((p for p in pages if p.url == crawlerRunConfig.url), None)
|
||||
if not page:
|
||||
page = await context.new_page()
|
||||
else:
|
||||
# Otherwise, check if we have an existing context for this config
|
||||
config_signature = self._make_config_signature(crawlerRunConfig)
|
||||
@@ -840,6 +896,9 @@ class BrowserManager:
|
||||
|
||||
async def close(self):
|
||||
"""Close all browser resources and clean up."""
|
||||
if self.config.cdp_url:
|
||||
return
|
||||
|
||||
if self.config.sleep_on_close:
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
|
||||
@@ -12,7 +12,10 @@ import sys
|
||||
import datetime
|
||||
import uuid
|
||||
import shutil
|
||||
from typing import List, Dict, Optional, Any
|
||||
import json
|
||||
import subprocess
|
||||
import time
|
||||
from typing import List, Dict, Optional, Any, Tuple
|
||||
from colorama import Fore, Style, init
|
||||
|
||||
from .async_configs import BrowserConfig
|
||||
@@ -56,6 +59,11 @@ class BrowserProfiler:
|
||||
# Ensure profiles directory exists
|
||||
self.profiles_dir = os.path.join(get_home_folder(), "profiles")
|
||||
os.makedirs(self.profiles_dir, exist_ok=True)
|
||||
|
||||
# Builtin browser config file
|
||||
self.builtin_browser_dir = os.path.join(get_home_folder(), "builtin-browser")
|
||||
self.builtin_config_file = os.path.join(self.builtin_browser_dir, "browser_config.json")
|
||||
os.makedirs(self.builtin_browser_dir, exist_ok=True)
|
||||
|
||||
async def create_profile(self,
|
||||
profile_name: Optional[str] = None,
|
||||
@@ -342,7 +350,11 @@ class BrowserProfiler:
|
||||
|
||||
# Check if path exists and is a valid profile
|
||||
if not os.path.isdir(profile_path):
|
||||
return None
|
||||
# Check if profile_name itself is a full path
|
||||
if os.path.isabs(profile_name):
|
||||
profile_path = profile_name
|
||||
else:
|
||||
return None
|
||||
|
||||
# Look for profile indicators
|
||||
is_profile = (
|
||||
@@ -541,4 +553,422 @@ class BrowserProfiler:
|
||||
break
|
||||
|
||||
else:
|
||||
self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU")
|
||||
self.logger.error(f"Invalid choice. Please enter a number between 1 and {exit_option}.", tag="MENU")
|
||||
|
||||
async def launch_standalone_browser(self,
                                    browser_type: str = "chromium",
                                    user_data_dir: Optional[str] = None,
                                    debugging_port: int = 9222,
                                    headless: bool = False,
                                    save_as_builtin: bool = False) -> Optional[str]:
    """
    Launch a standalone browser with CDP debugging enabled and keep it running
    until the user presses 'q'. Returns and displays the CDP URL.

    Args:
        browser_type (str): Type of browser to launch ('chromium' or 'firefox')
        user_data_dir (str, optional): Path to user profile directory; when omitted
            a throwaway profile is created under self.profiles_dir
        debugging_port (int): Port to use for CDP debugging
        headless (bool): Whether to run in headless mode
        save_as_builtin (bool): NOTE(review): accepted but never read anywhere in
            this body — TODO confirm whether it was meant to persist the browser
            config like launch_builtin_browser does

    Returns:
        str: CDP URL for the browser, or None if launch failed

    Example:
        ```python
        profiler = BrowserProfiler()
        cdp_url = await profiler.launch_standalone_browser(
            user_data_dir="/path/to/profile",
            debugging_port=9222
        )
        # Use cdp_url to connect to the browser
        ```
    """
    # Use the provided directory if specified, otherwise create a temporary directory
    if user_data_dir:
        # Directory is provided directly, ensure it exists
        profile_path = user_data_dir
        os.makedirs(profile_path, exist_ok=True)
    else:
        # Create a temporary profile directory; timestamp + short uuid keeps
        # concurrent launches from colliding on the same folder name.
        timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")
        profile_name = f"temp_{timestamp}_{uuid.uuid4().hex[:6]}"
        profile_path = os.path.join(self.profiles_dir, profile_name)
        os.makedirs(profile_path, exist_ok=True)

    # Print initial information
    border = f"{Fore.CYAN}{'='*80}{Style.RESET_ALL}"
    self.logger.info(f"\n{border}", tag="CDP")
    self.logger.info(f"Launching standalone browser with CDP debugging", tag="CDP")
    self.logger.info(f"Browser type: {Fore.GREEN}{browser_type}{Style.RESET_ALL}", tag="CDP")
    self.logger.info(f"Profile path: {Fore.YELLOW}{profile_path}{Style.RESET_ALL}", tag="CDP")
    self.logger.info(f"Debugging port: {Fore.CYAN}{debugging_port}{Style.RESET_ALL}", tag="CDP")
    self.logger.info(f"Headless mode: {Fore.CYAN}{headless}{Style.RESET_ALL}", tag="CDP")

    # Create managed browser instance (owns the actual subprocess)
    managed_browser = ManagedBrowser(
        browser_type=browser_type,
        user_data_dir=profile_path,
        headless=headless,
        logger=self.logger,
        debugging_port=debugging_port
    )

    # Set up signal handlers to ensure cleanup on interrupt; originals are
    # saved so they can be restored in the finally block below.
    original_sigint = signal.getsignal(signal.SIGINT)
    original_sigterm = signal.getsignal(signal.SIGTERM)

    # Define cleanup handler for signals
    async def cleanup_handler(sig, frame):
        self.logger.warning("\nCleaning up browser process...", tag="CDP")
        await managed_browser.cleanup()
        # Restore original signal handlers
        signal.signal(signal.SIGINT, original_sigint)
        signal.signal(signal.SIGTERM, original_sigterm)
        if sig == signal.SIGINT:
            self.logger.error("Browser terminated by user.", tag="CDP")
            sys.exit(1)

    # Set signal handlers. NOTE(review): the async cleanup is scheduled with
    # create_task from a synchronous signal handler — this assumes a running
    # event loop in this thread; confirm it holds for all callers.
    def sigint_handler(sig, frame):
        asyncio.create_task(cleanup_handler(sig, frame))

    signal.signal(signal.SIGINT, sigint_handler)
    signal.signal(signal.SIGTERM, sigint_handler)

    # Event to signal when user wants to exit
    user_done_event = asyncio.Event()

    # Run keyboard input loop in a separate task
    async def listen_for_quit_command():
        # Unix-only terminal modules (imported lazily); presumably this code
        # path is not expected to run on Windows — TODO confirm.
        import termios
        import tty
        import select

        # First output the prompt
        self.logger.info(f"{Fore.CYAN}Press '{Fore.WHITE}q{Fore.CYAN}' to stop the browser and exit...{Style.RESET_ALL}", tag="CDP")

        # Save original terminal settings so they can be restored on exit
        fd = sys.stdin.fileno()
        old_settings = termios.tcgetattr(fd)

        try:
            # Switch to non-canonical mode (no line buffering) so single
            # keypresses are delivered without Enter.
            tty.setcbreak(fd)

            while True:
                # Check if input is available (non-blocking, 0.5s timeout)
                readable, _, _ = select.select([sys.stdin], [], [], 0.5)
                if readable:
                    key = sys.stdin.read(1)
                    if key.lower() == 'q':
                        self.logger.info(f"{Fore.GREEN}Closing browser...{Style.RESET_ALL}", tag="CDP")
                        user_done_event.set()
                        return

                # Check if the browser process has already exited
                # (poll() returns the exit code once the process is gone)
                if managed_browser.browser_process and managed_browser.browser_process.poll() is not None:
                    self.logger.info("Browser already closed. Ending input listener.", tag="CDP")
                    user_done_event.set()
                    return

                await asyncio.sleep(0.1)

        finally:
            # Restore terminal settings even if the task is cancelled
            termios.tcsetattr(fd, termios.TCSADRAIN, old_settings)

    # Function to retrieve and display CDP JSON config from the browser's
    # /json/version HTTP endpoint.
    async def get_cdp_json(port):
        import aiohttp
        cdp_url = f"http://localhost:{port}"
        json_url = f"{cdp_url}/json/version"

        try:
            async with aiohttp.ClientSession() as session:
                # Try multiple times in case the browser is still starting up
                for _ in range(10):
                    try:
                        async with session.get(json_url) as response:
                            if response.status == 200:
                                data = await response.json()
                                return cdp_url, data
                    except Exception:
                        # Connection refused while the browser boots — retry
                        pass

                    await asyncio.sleep(0.5)

            # Endpoint never answered: return the URL with no config payload
            return cdp_url, None
        except Exception as e:
            self.logger.error(f"Error fetching CDP JSON: {str(e)}", tag="CDP")
            return cdp_url, None

    cdp_url = None
    config_json = None

    try:
        # Start the browser
        await managed_browser.start()

        # Check if browser started successfully
        browser_process = managed_browser.browser_process
        if not browser_process:
            self.logger.error("Failed to start browser process.", tag="CDP")
            return None

        self.logger.info(f"Browser launched successfully. Retrieving CDP information...", tag="CDP")

        # Get CDP URL and JSON config
        cdp_url, config_json = await get_cdp_json(debugging_port)

        if cdp_url:
            self.logger.success(f"CDP URL: {Fore.GREEN}{cdp_url}{Style.RESET_ALL}", tag="CDP")

            if config_json:
                # Display relevant CDP information
                self.logger.info(f"Browser: {Fore.CYAN}{config_json.get('Browser', 'Unknown')}{Style.RESET_ALL}", tag="CDP")
                self.logger.info(f"Protocol Version: {config_json.get('Protocol-Version', 'Unknown')}", tag="CDP")
                if 'webSocketDebuggerUrl' in config_json:
                    self.logger.info(f"WebSocket URL: {Fore.GREEN}{config_json['webSocketDebuggerUrl']}{Style.RESET_ALL}", tag="CDP")
            else:
                self.logger.warning("Could not retrieve CDP configuration JSON", tag="CDP")
        else:
            self.logger.error(f"Failed to get CDP URL on port {debugging_port}", tag="CDP")
            await managed_browser.cleanup()
            return None

        # Start listening for keyboard input
        listener_task = asyncio.create_task(listen_for_quit_command())

        # Wait for the user to press 'q' or for the browser process to exit naturally
        while not user_done_event.is_set() and browser_process.poll() is None:
            await asyncio.sleep(0.5)

        # Cancel the listener task if it's still running
        if not listener_task.done():
            listener_task.cancel()
            try:
                await listener_task
            except asyncio.CancelledError:
                pass

        # If the browser is still running and the user pressed 'q', terminate it
        if browser_process.poll() is None and user_done_event.is_set():
            self.logger.info("Terminating browser process...", tag="CDP")
            await managed_browser.cleanup()

        self.logger.success(f"Browser closed.", tag="CDP")

    except Exception as e:
        self.logger.error(f"Error launching standalone browser: {str(e)}", tag="CDP")
        await managed_browser.cleanup()
        return None
    finally:
        # Restore original signal handlers
        signal.signal(signal.SIGINT, original_sigint)
        signal.signal(signal.SIGTERM, original_sigterm)

        # Make sure browser is fully cleaned up (cleanup is assumed to be
        # idempotent, since it may already have run above)
        await managed_browser.cleanup()

    # Return the CDP URL
    return cdp_url
|
||||
|
||||
async def launch_builtin_browser(self,
                                 browser_type: str = "chromium",
                                 debugging_port: int = 9222,
                                 headless: bool = True) -> Optional[str]:
    """
    Launch a browser in the background for use as the builtin browser.

    Unlike launch_standalone_browser, this does not block waiting for user
    input: it records the process metadata in self.builtin_config_file and
    detaches, so the browser keeps running after this process exits.

    Args:
        browser_type (str): Type of browser to launch ('chromium' or 'firefox')
        debugging_port (int): Port to use for CDP debugging
        headless (bool): Whether to run in headless mode

    Returns:
        str: CDP URL for the browser, or None if launch failed
    """
    # Check if there's an existing browser still running; if so, reuse it
    browser_info = self.get_builtin_browser_info()
    if browser_info and self._is_browser_running(browser_info.get('pid')):
        self.logger.info("Builtin browser is already running", tag="BUILTIN")
        return browser_info.get('cdp_url')

    # Create a user data directory for the builtin browser (persistent,
    # shared across launches)
    user_data_dir = os.path.join(self.builtin_browser_dir, "user_data")
    os.makedirs(user_data_dir, exist_ok=True)

    # Create managed browser instance
    managed_browser = ManagedBrowser(
        browser_type=browser_type,
        user_data_dir=user_data_dir,
        headless=headless,
        logger=self.logger,
        debugging_port=debugging_port
    )

    try:
        # Start the browser
        await managed_browser.start()

        # Check if browser started successfully
        browser_process = managed_browser.browser_process
        if not browser_process:
            self.logger.error("Failed to start browser process.", tag="BUILTIN")
            return None

        # Get CDP URL
        cdp_url = f"http://localhost:{debugging_port}"

        # Try to verify browser is responsive by fetching version info
        import aiohttp
        json_url = f"{cdp_url}/json/version"
        config_json = None

        try:
            async with aiohttp.ClientSession() as session:
                for _ in range(10):  # Try multiple times while the browser boots
                    try:
                        async with session.get(json_url) as response:
                            if response.status == 200:
                                config_json = await response.json()
                                break
                    except Exception:
                        # Endpoint not up yet — retry after a short delay
                        pass
                    await asyncio.sleep(0.5)
        except Exception as e:
            # Verification is best-effort: the browser may still be usable
            self.logger.warning(f"Could not verify browser: {str(e)}", tag="BUILTIN")

        # Save browser info so later calls (status/stop/reuse) can find it
        browser_info = {
            'pid': browser_process.pid,
            'cdp_url': cdp_url,
            'user_data_dir': user_data_dir,
            'browser_type': browser_type,
            'debugging_port': debugging_port,
            'start_time': time.time(),
            'config': config_json
        }

        with open(self.builtin_config_file, 'w') as f:
            json.dump(browser_info, f, indent=2)

        # Detach from the browser process - don't keep any references
        # This is important to allow the Python script to exit while the browser continues running
        # We'll just record the PID and other info, and the browser will run independently
        managed_browser.browser_process = None

        self.logger.success(f"Builtin browser launched at CDP URL: {cdp_url}", tag="BUILTIN")
        return cdp_url

    except Exception as e:
        self.logger.error(f"Error launching builtin browser: {str(e)}", tag="BUILTIN")
        # NOTE(review): managed_browser is always bound at this point, so the
        # truthiness guard is redundant but harmless.
        if managed_browser:
            await managed_browser.cleanup()
        return None
|
||||
|
||||
def get_builtin_browser_info(self) -> Optional[Dict[str, Any]]:
    """
    Load the builtin browser's saved metadata.

    Returns:
        dict: Browser information, or None when no config file exists, the
        file cannot be read, or the recorded process is no longer alive.
    """
    config_path = self.builtin_config_file
    if not os.path.exists(config_path):
        return None

    try:
        with open(config_path, 'r') as f:
            info = json.load(f)

        # A config pointing at a dead process is treated as "not configured".
        if self._is_browser_running(info.get('pid')):
            return info

        self.logger.warning("Builtin browser is not running", tag="BUILTIN")
        return None
    except Exception as e:
        self.logger.error(f"Error reading builtin browser config: {str(e)}", tag="BUILTIN")
        return None
|
||||
|
||||
def _is_browser_running(self, pid: Optional[int]) -> bool:
    """Return True when a live process with the given PID exists."""
    if not pid:
        return False

    try:
        if sys.platform != "win32":
            # POSIX: signal 0 performs the existence check without
            # actually delivering a signal to the process.
            os.kill(pid, 0)
            return True

        # Windows has no kill(0); ask tasklist whether it knows the PID.
        listing = subprocess.run(
            ["tasklist", "/FI", f"PID eq {pid}"],
            capture_output=True,
            text=True,
        )
        return str(pid) in listing.stdout
    except (ProcessLookupError, PermissionError, OSError):
        return False
|
||||
|
||||
async def kill_builtin_browser(self) -> bool:
    """
    Kill the builtin browser if it's running.

    On POSIX this sends SIGTERM first, polls for up to ~2.5s, then escalates
    to SIGKILL; on Windows it uses `taskkill /F`. On success the builtin
    config file is removed so later lookups report "not running".

    Returns:
        bool: True if the browser was killed, False otherwise
    """
    # get_builtin_browser_info already returns None for dead processes
    browser_info = self.get_builtin_browser_info()
    if not browser_info:
        self.logger.warning("No builtin browser found", tag="BUILTIN")
        return False

    pid = browser_info.get('pid')
    if not pid:
        return False

    try:
        if sys.platform == "win32":
            subprocess.run(["taskkill", "/F", "/PID", str(pid)], check=True)
        else:
            os.kill(pid, signal.SIGTERM)
            # Wait for termination: poll up to 5 times, 0.5s apart
            for _ in range(5):
                if not self._is_browser_running(pid):
                    break
                await asyncio.sleep(0.5)
            else:
                # for/else: runs only if the loop was NOT broken out of,
                # i.e. the process survived SIGTERM — force kill it.
                os.kill(pid, signal.SIGKILL)

        # Remove config file so the builtin browser is seen as stopped
        if os.path.exists(self.builtin_config_file):
            os.unlink(self.builtin_config_file)

        self.logger.success("Builtin browser terminated", tag="BUILTIN")
        return True
    except Exception as e:
        self.logger.error(f"Error killing builtin browser: {str(e)}", tag="BUILTIN")
        return False
|
||||
|
||||
async def get_builtin_browser_status(self) -> Dict[str, Any]:
    """
    Get status information about the builtin browser.

    Returns:
        dict: Always contains three keys — ``running`` (bool), ``cdp_url``
        (str or None), and ``info`` (the full metadata dict or None).
    """
    info = self.get_builtin_browser_info()
    if info:
        return {
            'running': True,
            'cdp_url': info.get('cdp_url'),
            'info': info
        }
    return {
        'running': False,
        'cdp_url': None,
        'info': None
    }
|
||||
|
||||
|
||||
454
crawl4ai/cli.py
454
crawl4ai/cli.py
@@ -1,9 +1,8 @@
|
||||
import click
|
||||
import os
|
||||
import time
|
||||
import datetime
|
||||
import sys
|
||||
import shutil
|
||||
import time
|
||||
|
||||
import humanize
|
||||
from typing import Dict, Any, Optional, List
|
||||
import json
|
||||
@@ -13,7 +12,6 @@ from rich.console import Console
|
||||
from rich.table import Table
|
||||
from rich.panel import Panel
|
||||
from rich.prompt import Prompt, Confirm
|
||||
from rich.style import Style
|
||||
|
||||
from crawl4ai import (
|
||||
CacheMode,
|
||||
@@ -26,12 +24,12 @@ from crawl4ai import (
|
||||
JsonXPathExtractionStrategy,
|
||||
BM25ContentFilter,
|
||||
PruningContentFilter,
|
||||
BrowserProfiler
|
||||
BrowserProfiler,
|
||||
LLMConfig
|
||||
)
|
||||
from litellm import completion
|
||||
from pathlib import Path
|
||||
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
|
||||
# Initialize rich console
|
||||
console = Console()
|
||||
@@ -201,7 +199,24 @@ def show_examples():
|
||||
# 2. Then use that profile to crawl the authenticated site:
|
||||
crwl https://site-requiring-login.com/dashboard -p my-profile-name
|
||||
|
||||
5️⃣ Sample Config Files:
|
||||
5️⃣ CDP Mode for Browser Automation:
|
||||
# Launch browser with CDP debugging on default port 9222
|
||||
crwl cdp
|
||||
|
||||
# Use a specific profile and custom port
|
||||
crwl cdp -p my-profile -P 9223
|
||||
|
||||
# Launch headless browser with CDP enabled
|
||||
crwl cdp --headless
|
||||
|
||||
# Launch in incognito mode (ignores profile)
|
||||
crwl cdp --incognito
|
||||
|
||||
# Use the CDP URL with other tools (Puppeteer, Playwright, etc.)
|
||||
# The URL will be displayed in the terminal when the browser starts
|
||||
|
||||
|
||||
6️⃣ Sample Config Files:
|
||||
|
||||
browser.yml:
|
||||
headless: true
|
||||
@@ -259,7 +274,7 @@ llm_schema.json:
|
||||
}
|
||||
}
|
||||
|
||||
6️⃣ Advanced Usage:
|
||||
7️⃣ Advanced Usage:
|
||||
# Combine configs with direct parameters
|
||||
crwl https://example.com -B browser.yml -b "headless=false,viewport_width=1920"
|
||||
|
||||
@@ -285,7 +300,7 @@ llm_schema.json:
|
||||
|
||||
For more documentation visit: https://github.com/unclecode/crawl4ai
|
||||
|
||||
7️⃣ Q&A with LLM:
|
||||
8️⃣ Q&A with LLM:
|
||||
# Ask a question about the content
|
||||
crwl https://example.com -q "What is the main topic discussed?"
|
||||
|
||||
@@ -313,7 +328,7 @@ For more documentation visit: https://github.com/unclecode/crawl4ai
|
||||
|
||||
See full list of providers: https://docs.litellm.ai/docs/providers
|
||||
|
||||
8️⃣ Profile Management:
|
||||
9️⃣ Profile Management:
|
||||
# Launch interactive profile manager
|
||||
crwl profiles
|
||||
|
||||
@@ -326,6 +341,32 @@ For more documentation visit: https://github.com/unclecode/crawl4ai
|
||||
crwl profiles # Select "Create new profile" option
|
||||
# 2. Then use that profile to crawl authenticated content:
|
||||
crwl https://site-requiring-login.com/dashboard -p my-profile-name
|
||||
|
||||
🔄 Builtin Browser Management:
|
||||
# Start a builtin browser (runs in the background)
|
||||
crwl browser start
|
||||
|
||||
# Check builtin browser status
|
||||
crwl browser status
|
||||
|
||||
# Open a visible window to see the browser
|
||||
crwl browser view --url https://example.com
|
||||
|
||||
# Stop the builtin browser
|
||||
crwl browser stop
|
||||
|
||||
# Restart with different options
|
||||
crwl browser restart --browser-type chromium --port 9223 --no-headless
|
||||
|
||||
# Use the builtin browser in your code
|
||||
# (Just set browser_mode="builtin" in your BrowserConfig)
|
||||
browser_config = BrowserConfig(
|
||||
browser_mode="builtin",
|
||||
headless=True
|
||||
)
|
||||
|
||||
# Usage via CLI:
|
||||
crwl https://example.com -b "browser_mode=builtin"
|
||||
"""
|
||||
click.echo(examples)
|
||||
|
||||
@@ -552,11 +593,390 @@ async def manage_profiles():
|
||||
# Add a separator between operations
|
||||
console.print("\n")
|
||||
|
||||
|
||||
|
||||
@click.group(context_settings={"help_option_names": ["-h", "--help"]})
def cli():
    """Crawl4AI CLI - Web content extraction and browser profile management tool"""
    # Root click group: subcommands (crawl, profiles, browser, cdp, ...) are
    # registered on it elsewhere in this module.
    pass
|
||||
|
||||
|
||||
@cli.group("browser")
def browser_cmd():
    """Manage browser instances for Crawl4AI

    Commands to manage browser instances for Crawl4AI, including:
    - status - Check status of the builtin browser
    - start - Start a new builtin browser
    - stop - Stop the running builtin browser
    - restart - Restart the builtin browser
    - view - Open a visible window of the builtin browser
    """
    # Click group only; behavior lives in the subcommands registered below.
    pass
|
||||
|
||||
@browser_cmd.command("status")
def browser_status_cmd():
    """Show status of the builtin browser"""
    profiler = BrowserProfiler()

    try:
        state = anyio.run(profiler.get_builtin_browser_status)

        if not state["running"]:
            console.print(Panel(
                "[yellow]Builtin browser is not running[/yellow]\n\n"
                "Use 'crwl browser start' to start a builtin browser",
                title="Builtin Browser Status",
                border_style="yellow"
            ))
            return

        details = state["info"]
        started_at = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(details['start_time']))
        summary = (
            f"[green]Builtin browser is running[/green]\n\n"
            f"CDP URL: [cyan]{details['cdp_url']}[/cyan]\n"
            f"Process ID: [yellow]{details['pid']}[/yellow]\n"
            f"Browser type: [blue]{details['browser_type']}[/blue]\n"
            f"User data directory: [magenta]{details['user_data_dir']}[/magenta]\n"
            f"Started: [cyan]{started_at}[/cyan]"
        )
        console.print(Panel(
            summary,
            title="Builtin Browser Status",
            border_style="green"
        ))

    except Exception as e:
        console.print(f"[red]Error checking browser status: {str(e)}[/red]")
        sys.exit(1)
|
||||
|
||||
@browser_cmd.command("start")
@click.option("--browser-type", "-b", type=click.Choice(["chromium", "firefox"]), default="chromium",
              help="Browser type (default: chromium)")
@click.option("--port", "-p", type=int, default=9222, help="Debugging port (default: 9222)")
@click.option("--headless/--no-headless", default=True, help="Run browser in headless mode")
def browser_start_cmd(browser_type: str, port: int, headless: bool):
    """Start a builtin browser instance

    This will start a persistent browser instance that can be used by Crawl4AI
    by setting browser_mode="builtin" in BrowserConfig.

    Exits with status 1 when the launch fails or raises.
    """
    profiler = BrowserProfiler()

    # First check if browser is already running.
    # NOTE(review): this check sits outside the try below, so an exception
    # here propagates as a raw traceback instead of the friendly message.
    status = anyio.run(profiler.get_builtin_browser_status)
    if status["running"]:
        console.print(Panel(
            "[yellow]Builtin browser is already running[/yellow]\n\n"
            f"CDP URL: [cyan]{status['cdp_url']}[/cyan]\n\n"
            "Use 'crwl browser restart' to restart the browser",
            title="Builtin Browser Start",
            border_style="yellow"
        ))
        return

    try:
        # Announce the launch parameters before doing the slow work
        console.print(Panel(
            f"[cyan]Starting builtin browser[/cyan]\n\n"
            f"Browser type: [green]{browser_type}[/green]\n"
            f"Debugging port: [yellow]{port}[/yellow]\n"
            f"Headless: [cyan]{'Yes' if headless else 'No'}[/cyan]",
            title="Builtin Browser Start",
            border_style="cyan"
        ))

        # Positional args must match launch_builtin_browser's signature:
        # (browser_type, debugging_port, headless)
        cdp_url = anyio.run(
            profiler.launch_builtin_browser,
            browser_type,
            port,
            headless
        )

        if cdp_url:
            console.print(Panel(
                f"[green]Builtin browser started successfully[/green]\n\n"
                f"CDP URL: [cyan]{cdp_url}[/cyan]\n\n"
                "This browser will be used automatically when setting browser_mode='builtin'",
                title="Builtin Browser Start",
                border_style="green"
            ))
        else:
            # launch_builtin_browser returns None on failure
            console.print(Panel(
                "[red]Failed to start builtin browser[/red]",
                title="Builtin Browser Start",
                border_style="red"
            ))
            sys.exit(1)

    except Exception as e:
        console.print(f"[red]Error starting builtin browser: {str(e)}[/red]")
        sys.exit(1)
|
||||
|
||||
@browser_cmd.command("stop")
def browser_stop_cmd():
    """Stop the running builtin browser"""
    profiler = BrowserProfiler()

    def _panel(text: str, color: str) -> None:
        # Every message from this command shares the same panel title.
        console.print(Panel(text, title="Builtin Browser Stop", border_style=color))

    try:
        # Nothing to do when no builtin browser is alive
        if not anyio.run(profiler.get_builtin_browser_status)["running"]:
            _panel("[yellow]No builtin browser is currently running[/yellow]", "yellow")
            return

        _panel("[cyan]Stopping builtin browser...[/cyan]", "cyan")

        if anyio.run(profiler.kill_builtin_browser):
            _panel("[green]Builtin browser stopped successfully[/green]", "green")
        else:
            _panel("[red]Failed to stop builtin browser[/red]", "red")
            sys.exit(1)

    except Exception as e:
        console.print(f"[red]Error stopping builtin browser: {str(e)}[/red]")
        sys.exit(1)
|
||||
|
||||
@browser_cmd.command("view")
@click.option("--url", "-u", help="URL to navigate to (defaults to about:blank)")
def browser_view_cmd(url: Optional[str]):
    """
    Open a visible window of the builtin browser

    This command connects to the running builtin browser and opens a visible window,
    allowing you to see what the browser is currently viewing or navigate to a URL.
    """
    profiler = BrowserProfiler()

    try:
        # First check if browser is running
        status = anyio.run(profiler.get_builtin_browser_status)
        if not status["running"]:
            console.print(Panel(
                "[yellow]No builtin browser is currently running[/yellow]\n\n"
                "Use 'crwl browser start' to start a builtin browser first",
                title="Builtin Browser View",
                border_style="yellow"
            ))
            return

        info = status["info"]
        cdp_url = info["cdp_url"]

        console.print(Panel(
            f"[cyan]Opening visible window connected to builtin browser[/cyan]\n\n"
            f"CDP URL: [green]{cdp_url}[/green]\n"
            f"URL to load: [yellow]{url or 'about:blank'}[/yellow]",
            title="Builtin Browser View",
            border_style="cyan"
        ))

        # Use the CDP URL to launch a new visible window
        import subprocess
        import os

        # Determine the browser command based on platform
        # NOTE(review): this hard-codes Google Chrome paths even when the
        # builtin browser was launched as chromium/firefox — TODO confirm.
        if sys.platform == "darwin":  # macOS
            browser_cmd = ["/Applications/Google Chrome.app/Contents/MacOS/Google Chrome"]
        elif sys.platform == "win32":  # Windows
            browser_cmd = ["C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe"]
        else:  # Linux
            browser_cmd = ["google-chrome"]
        # NOTE(review): the local name `browser_cmd` shadows the module-level
        # click group `browser_cmd` inside this function — harmless here but
        # confusing; consider renaming.

        # Add arguments
        # NOTE(review): passing --remote-debugging-port with the SAME port as
        # the already-running builtin browser starts a *new* Chrome instance
        # rather than attaching to the existing one — verify this actually
        # shows the builtin browser's session as the docstring claims.
        browser_args = [
            f"--remote-debugging-port={info['debugging_port']}",
            "--remote-debugging-address=localhost",
            "--no-first-run",
            "--no-default-browser-check"
        ]

        # Add URL if provided
        if url:
            browser_args.append(url)

        # Launch browser (fire-and-forget; we don't wait for it to exit)
        try:
            subprocess.Popen(browser_cmd + browser_args)
            console.print("[green]Browser window opened. Close it when finished viewing.[/green]")
        except Exception as e:
            console.print(f"[red]Error launching browser: {str(e)}[/red]")
            console.print(f"[yellow]Try connecting manually to {cdp_url} in Chrome or using the '--remote-debugging-port' flag.[/yellow]")

    except Exception as e:
        console.print(f"[red]Error viewing builtin browser: {str(e)}[/red]")
        sys.exit(1)
|
||||
|
||||
@browser_cmd.command("restart")
@click.option("--browser-type", "-b", type=click.Choice(["chromium", "firefox"]), default=None,
              help="Browser type (defaults to same as current)")
@click.option("--port", "-p", type=int, default=None, help="Debugging port (defaults to same as current)")
@click.option("--headless/--no-headless", default=None, help="Run browser in headless mode")
def browser_restart_cmd(browser_type: Optional[str], port: Optional[int], headless: Optional[bool]):
    """Restart the builtin browser

    Stops the current builtin browser if running and starts a new one.
    By default, uses the same configuration as the current browser.

    Exits with status 1 when stopping or starting fails.
    """
    profiler = BrowserProfiler()

    try:
        # First check if browser is running and get its config so unspecified
        # CLI options can fall back to the current values.
        status = anyio.run(profiler.get_builtin_browser_status)
        current_config = {}

        if status["running"]:
            info = status["info"]
            current_config = {
                "browser_type": info["browser_type"],
                "port": info["debugging_port"],
                # The saved info does not record headless mode, so this is a
                # default assumption, not the browser's actual state.
                "headless": True  # Default assumption
            }

            # Stop the browser before launching the replacement
            console.print(Panel(
                "[cyan]Stopping current builtin browser...[/cyan]",
                title="Builtin Browser Restart",
                border_style="cyan"
            ))

            success = anyio.run(profiler.kill_builtin_browser)
            if not success:
                console.print(Panel(
                    "[red]Failed to stop current browser[/red]",
                    title="Builtin Browser Restart",
                    border_style="red"
                ))
                sys.exit(1)

        # Use provided options or defaults from current config
        browser_type = browser_type or current_config.get("browser_type", "chromium")
        port = port or current_config.get("port", 9222)
        headless = headless if headless is not None else current_config.get("headless", True)

        # Start a new browser
        console.print(Panel(
            f"[cyan]Starting new builtin browser[/cyan]\n\n"
            f"Browser type: [green]{browser_type}[/green]\n"
            f"Debugging port: [yellow]{port}[/yellow]\n"
            f"Headless: [cyan]{'Yes' if headless else 'No'}[/cyan]",
            title="Builtin Browser Restart",
            border_style="cyan"
        ))

        # Positional args must match launch_builtin_browser's signature
        cdp_url = anyio.run(
            profiler.launch_builtin_browser,
            browser_type,
            port,
            headless
        )

        if cdp_url:
            console.print(Panel(
                f"[green]Builtin browser restarted successfully[/green]\n\n"
                f"CDP URL: [cyan]{cdp_url}[/cyan]",
                title="Builtin Browser Restart",
                border_style="green"
            ))
        else:
            console.print(Panel(
                "[red]Failed to restart builtin browser[/red]",
                title="Builtin Browser Restart",
                border_style="red"
            ))
            sys.exit(1)

    except Exception as e:
        console.print(f"[red]Error restarting builtin browser: {str(e)}[/red]")
        sys.exit(1)
|
||||
|
||||
@cli.command("cdp")
|
||||
@click.option("--user-data-dir", "-d", help="Directory to use for browser data (will be created if it doesn't exist)")
|
||||
@click.option("--port", "-P", type=int, default=9222, help="Debugging port (default: 9222)")
|
||||
@click.option("--browser-type", "-b", type=click.Choice(["chromium", "firefox"]), default="chromium",
|
||||
help="Browser type (default: chromium)")
|
||||
@click.option("--headless", is_flag=True, help="Run browser in headless mode")
|
||||
@click.option("--incognito", is_flag=True, help="Run in incognito/private mode (ignores user-data-dir)")
|
||||
def cdp_cmd(user_data_dir: Optional[str], port: int, browser_type: str, headless: bool, incognito: bool):
|
||||
"""Launch a standalone browser with CDP debugging enabled
|
||||
|
||||
This command launches a browser with Chrome DevTools Protocol (CDP) debugging enabled,
|
||||
prints the CDP URL, and keeps the browser running until you press 'q'.
|
||||
|
||||
The CDP URL can be used for various automation and debugging tasks.
|
||||
|
||||
Examples:
|
||||
# Launch Chromium with CDP on default port 9222
|
||||
crwl cdp
|
||||
|
||||
# Use a specific directory for browser data and custom port
|
||||
crwl cdp --user-data-dir ~/browser-data --port 9223
|
||||
|
||||
# Launch in headless mode
|
||||
crwl cdp --headless
|
||||
|
||||
# Launch in incognito mode (ignores user-data-dir)
|
||||
crwl cdp --incognito
|
||||
"""
|
||||
profiler = BrowserProfiler()
|
||||
|
||||
try:
|
||||
# Handle data directory
|
||||
data_dir = None
|
||||
if not incognito and user_data_dir:
|
||||
# Expand user path (~/something)
|
||||
expanded_path = os.path.expanduser(user_data_dir)
|
||||
|
||||
# Create directory if it doesn't exist
|
||||
if not os.path.exists(expanded_path):
|
||||
console.print(f"[yellow]Directory '{expanded_path}' doesn't exist. Creating it.[/yellow]")
|
||||
os.makedirs(expanded_path, exist_ok=True)
|
||||
|
||||
data_dir = expanded_path
|
||||
|
||||
# Print launch info
|
||||
console.print(Panel(
|
||||
f"[cyan]Launching browser with CDP debugging[/cyan]\n\n"
|
||||
f"Browser type: [green]{browser_type}[/green]\n"
|
||||
f"Debugging port: [yellow]{port}[/yellow]\n"
|
||||
f"User data directory: [cyan]{data_dir or 'Temporary directory'}[/cyan]\n"
|
||||
f"Headless: [cyan]{'Yes' if headless else 'No'}[/cyan]\n"
|
||||
f"Incognito: [cyan]{'Yes' if incognito else 'No'}[/cyan]\n\n"
|
||||
f"[yellow]Press 'q' to quit when done[/yellow]",
|
||||
title="CDP Browser",
|
||||
border_style="cyan"
|
||||
))
|
||||
|
||||
# Run the browser
|
||||
cdp_url = anyio.run(
|
||||
profiler.launch_standalone_browser,
|
||||
browser_type,
|
||||
data_dir,
|
||||
port,
|
||||
headless
|
||||
)
|
||||
|
||||
if not cdp_url:
|
||||
console.print("[red]Failed to launch browser or get CDP URL[/red]")
|
||||
sys.exit(1)
|
||||
|
||||
except Exception as e:
|
||||
console.print(f"[red]Error launching CDP browser: {str(e)}[/red]")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@cli.command("crawl")
|
||||
@click.argument("url", required=True)
|
||||
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
|
||||
@@ -647,7 +1067,7 @@ def crawl_cmd(url: str, browser_config: str, crawler_config: str, filter_config:
|
||||
raise click.ClickException("LLM provider and API token are required for LLM extraction")
|
||||
|
||||
crawler_cfg.extraction_strategy = LLMExtractionStrategy(
|
||||
llmConfig=LlmConfig(provider=extract_conf["provider"], api_token=extract_conf["api_token"]),
|
||||
llm_config=LLMConfig(provider=extract_conf["provider"], api_token=extract_conf["api_token"]),
|
||||
instruction=extract_conf["instruction"],
|
||||
schema=schema_data,
|
||||
**extract_conf.get("params", {})
|
||||
@@ -712,7 +1132,7 @@ def profiles_cmd():
|
||||
# Run interactive profile manager
|
||||
anyio.run(manage_profiles)
|
||||
|
||||
@cli.command()
|
||||
@cli.command(name="")
|
||||
@click.argument("url", required=False)
|
||||
@click.option("--example", is_flag=True, help="Show usage examples")
|
||||
@click.option("--browser-config", "-B", type=click.Path(exists=True), help="Browser config file (YAML/JSON)")
|
||||
@@ -740,6 +1160,8 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
|
||||
Other commands:
|
||||
crwl profiles - Manage browser profiles for identity-based crawling
|
||||
crwl crawl - Crawl a website with advanced options
|
||||
crwl cdp - Launch browser with CDP debugging enabled
|
||||
crwl browser - Manage builtin browser (start, stop, status, restart)
|
||||
crwl examples - Show more usage examples
|
||||
"""
|
||||
|
||||
@@ -772,5 +1194,11 @@ def default(url: str, example: bool, browser_config: str, crawler_config: str, f
|
||||
profile=profile
|
||||
)
|
||||
|
||||
def main():
    """CLI entry point.

    When invoked without a recognized subcommand, the arguments are treated
    as an implicit `crawl` invocation (so `crwl https://example.com` works).
    """
    import sys

    # No subcommand given, or the first token is not a known command name:
    # rewrite argv as `crwl crawl ...` before handing control to click.
    no_args = len(sys.argv) < 2
    if no_args or sys.argv[1] not in cli.commands:
        sys.argv.insert(1, "crawl")
    cli()


if __name__ == "__main__":
    main()
|
||||
837
crawl4ai/components/crawler_monitor.py
Normal file
837
crawl4ai/components/crawler_monitor.py
Normal file
@@ -0,0 +1,837 @@
|
||||
import time
|
||||
import uuid
|
||||
import threading
|
||||
import psutil
|
||||
from datetime import datetime, timedelta
|
||||
from typing import Dict, Optional, List
|
||||
import threading
|
||||
from rich.console import Console
|
||||
from rich.layout import Layout
|
||||
from rich.panel import Panel
|
||||
from rich.table import Table
|
||||
from rich.text import Text
|
||||
from rich.live import Live
|
||||
from rich import box
|
||||
from ..models import CrawlStatus
|
||||
|
||||
class TerminalUI:
    """Terminal user interface for CrawlerMonitor using the rich library.

    Renders a live, full-screen dashboard (header, pipeline stats, task
    table, footer) on a background thread and listens for a 'q' keypress
    to stop the monitor.

    NOTE(review): `_ui_loop` uses termios/tty/select on stdin, so this UI
    is POSIX-only — confirm Windows is out of scope.
    """

    def __init__(self, refresh_rate: float = 1.0, max_width: int = 120):
        """
        Initialize the terminal UI.

        Args:
            refresh_rate: How often to refresh the UI (in seconds)
            max_width: Maximum width of the UI in characters
        """
        self.console = Console(width=max_width)
        self.layout = Layout()
        self.refresh_rate = refresh_rate
        self.stop_event = threading.Event()
        self.ui_thread = None
        self.monitor = None  # Will be set by CrawlerMonitor via start()
        self.max_width = max_width

        # Setup layout - vertical layout (top to bottom)
        self.layout.split(
            Layout(name="header", size=3),
            Layout(name="pipeline_status", size=10),
            Layout(name="task_details", ratio=1),
            Layout(name="footer", size=3)  # Increased footer size to fit all content
        )

    def start(self, monitor):
        """Start the UI rendering loop on a daemon thread.

        Args:
            monitor: The CrawlerMonitor instance that supplies all statistics.
        """
        self.monitor = monitor
        self.stop_event.clear()
        self.ui_thread = threading.Thread(target=self._ui_loop)
        # Daemon thread so a crashed/forgotten UI never blocks interpreter exit.
        self.ui_thread.daemon = True
        self.ui_thread.start()

    def stop(self):
        """Signal the UI thread to stop and wait briefly for it to exit."""
        if self.ui_thread and self.ui_thread.is_alive():
            self.stop_event.set()
            # Only try to join if we're not in the UI thread.
            # This prevents "cannot join current thread" errors when the
            # UI thread itself triggers the stop (the 'q' keypress path).
            if threading.current_thread() != self.ui_thread:
                self.ui_thread.join(timeout=5.0)

    def _ui_loop(self):
        """Main UI rendering loop (runs on the UI thread).

        Puts the terminal into cbreak mode so single keypresses can be read
        without Enter, renders via rich's Live display, and restores the
        terminal settings on exit no matter how the loop ends.
        """
        import sys
        import select
        import termios
        import tty

        # Setup terminal for non-blocking, unbuffered single-key input.
        old_settings = termios.tcgetattr(sys.stdin)
        try:
            tty.setcbreak(sys.stdin.fileno())

            # Use Live display to render the UI (screen=True takes over the
            # whole terminal and restores it when the context exits).
            with Live(self.layout, refresh_per_second=1/self.refresh_rate, screen=True) as live:
                self.live = live  # Store the live display for updates

                # Main UI loop
                while not self.stop_event.is_set():
                    self._update_display()

                    # Check for key press (non-blocking: zero timeout).
                    if select.select([sys.stdin], [], [], 0)[0]:
                        key = sys.stdin.read(1)
                        # Check for 'q' to quit
                        if key == 'q':
                            # Signal stop but don't call monitor.stop() from UI thread
                            # as it would cause the thread to try to join itself
                            self.stop_event.set()
                            self.monitor.is_running = False
                            break

                    time.sleep(self.refresh_rate)

                    # Exit if the monitor was stopped externally.
                    if not self.monitor.is_running:
                        break
        finally:
            # Restore terminal settings
            termios.tcsetattr(sys.stdin, termios.TCSADRAIN, old_settings)

    def _update_display(self):
        """Refresh every layout region from the monitor's current statistics."""
        if not self.monitor:
            return

        # Update crawler status panel
        self.layout["header"].update(self._create_status_panel())

        # Update pipeline status panel and task details panel
        self.layout["pipeline_status"].update(self._create_pipeline_panel())
        self.layout["task_details"].update(self._create_task_details_panel())

        # Update footer
        self.layout["footer"].update(self._create_footer())

    def _create_status_panel(self) -> Panel:
        """Create the crawler status (header) panel.

        Returns:
            A Panel with runtime, process memory usage, and URL progress.
        """
        summary = self.monitor.get_summary()

        # Format memory status with icon
        memory_status = self.monitor.get_memory_status()
        memory_icon = "🟢"  # Default NORMAL
        if memory_status == "PRESSURE":
            memory_icon = "🟠"
        elif memory_status == "CRITICAL":
            memory_icon = "🔴"

        # Current process memory as a fraction of total system memory.
        # FIX: compare bytes to bytes — the previous code divided an
        # MB-scaled RSS by a byte-denominated total, underreporting the
        # percentage by a factor of 1024*1024.
        rss_bytes = psutil.Process().memory_info().rss
        memory_percent = (rss_bytes / psutil.virtual_memory().total) * 100

        # Format runtime (zero until the monitor has actually started).
        runtime = self.monitor._format_time(time.time() - self.monitor.start_time if self.monitor.start_time else 0)

        # Create the status text
        status_text = Text()
        status_text.append(f"Web Crawler Dashboard | Runtime: {runtime} | Memory: {memory_percent:.1f}% {memory_icon}\n")
        status_text.append(f"Status: {memory_status} | URLs: {summary['urls_completed']}/{summary['urls_total']} | ")
        status_text.append(f"Peak Mem: {summary['peak_memory_percent']:.1f}% at {self.monitor._format_time(summary['peak_memory_time'])}")

        return Panel(status_text, title="Crawler Status", border_style="blue")

    def _create_pipeline_panel(self) -> Panel:
        """Create the pipeline status panel.

        Returns:
            A Panel with per-status counts/percentages in the left columns
            and aggregate queue/timing stats in the right columns.
        """
        summary = self.monitor.get_summary()
        queue_stats = self.monitor.get_queue_stats()

        # Create a table for status counts
        table = Table(show_header=True, box=None)
        table.add_column("Status", style="cyan")
        table.add_column("Count", justify="right")
        table.add_column("Percentage", justify="right")
        table.add_column("Stat", style="cyan")
        table.add_column("Value", justify="right")

        # Calculate overall progress
        progress = f"{summary['urls_completed']}/{summary['urls_total']}"
        progress_percent = f"{summary['completion_percentage']:.1f}%"

        # Overall progress row
        table.add_row(
            "Overall Progress",
            progress,
            progress_percent,
            "Est. Completion",
            summary.get('estimated_completion_time', "N/A")
        )

        # Add rows for each status
        status_counts = summary['status_counts']
        total = summary['urls_total'] or 1  # Avoid division by zero

        # Status rows
        table.add_row(
            "Completed",
            str(status_counts.get(CrawlStatus.COMPLETED.name, 0)),
            f"{status_counts.get(CrawlStatus.COMPLETED.name, 0) / total * 100:.1f}%",
            "Avg. Time/URL",
            f"{summary.get('avg_task_duration', 0):.2f}s"
        )

        table.add_row(
            "Failed",
            str(status_counts.get(CrawlStatus.FAILED.name, 0)),
            f"{status_counts.get(CrawlStatus.FAILED.name, 0) / total * 100:.1f}%",
            "Concurrent Tasks",
            str(status_counts.get(CrawlStatus.IN_PROGRESS.name, 0))
        )

        table.add_row(
            "In Progress",
            str(status_counts.get(CrawlStatus.IN_PROGRESS.name, 0)),
            f"{status_counts.get(CrawlStatus.IN_PROGRESS.name, 0) / total * 100:.1f}%",
            "Queue Size",
            str(queue_stats['total_queued'])
        )

        table.add_row(
            "Queued",
            str(status_counts.get(CrawlStatus.QUEUED.name, 0)),
            f"{status_counts.get(CrawlStatus.QUEUED.name, 0) / total * 100:.1f}%",
            "Max Wait Time",
            f"{queue_stats['highest_wait_time']:.1f}s"
        )

        # Requeued is a special case as it's not a status
        requeued_count = summary.get('requeued_count', 0)
        table.add_row(
            "Requeued",
            str(requeued_count),
            f"{summary.get('requeue_rate', 0):.1f}%",
            "Avg Wait Time",
            f"{queue_stats['avg_wait_time']:.1f}s"
        )

        # Add empty row for spacing
        table.add_row(
            "",
            "",
            "",
            "Requeue Rate",
            f"{summary.get('requeue_rate', 0):.1f}%"
        )

        return Panel(table, title="Pipeline Status", border_style="green")

    def _create_task_details_panel(self) -> Panel:
        """Create the task details panel.

        Returns:
            A Panel listing up to 20 tasks, in-progress first, then queued,
            then finished tasks by recency, preceded by a summary row.
        """
        # Create a table for task details
        table = Table(show_header=True, expand=True)
        table.add_column("Task ID", style="cyan", no_wrap=True, width=10)
        table.add_column("URL", style="blue", ratio=3)
        table.add_column("Status", style="green", width=15)
        table.add_column("Memory", justify="right", width=8)
        table.add_column("Peak", justify="right", width=8)
        table.add_column("Duration", justify="right", width=10)

        # Get all task stats
        task_stats = self.monitor.get_all_task_stats()

        # Add summary row
        active_tasks = sum(1 for stats in task_stats.values()
                          if stats['status'] == CrawlStatus.IN_PROGRESS.name)

        total_memory = sum(stats['memory_usage'] for stats in task_stats.values())
        total_peak = sum(stats['peak_memory'] for stats in task_stats.values())

        # Summary row with separators
        table.add_row(
            "SUMMARY",
            f"Total: {len(task_stats)}",
            f"Active: {active_tasks}",
            f"{total_memory:.1f}",
            f"{total_peak:.1f}",
            "N/A"
        )

        # Add a separator
        table.add_row("—" * 10, "—" * 20, "—" * 10, "—" * 8, "—" * 8, "—" * 10)

        # Status icons
        status_icons = {
            CrawlStatus.QUEUED.name: "⏳",
            CrawlStatus.IN_PROGRESS.name: "🔄",
            CrawlStatus.COMPLETED.name: "✅",
            CrawlStatus.FAILED.name: "❌"
        }

        # Calculate how many rows we can display based on available space
        # We can display more rows now that we have a dedicated panel
        display_count = min(len(task_stats), 20)  # Display up to 20 tasks

        # FIX: sort the FULL task list first, then take the top slice.
        # The previous code sliced before sorting, so the "IN_PROGRESS
        # first" ordering was only applied to an arbitrary 20 tasks and
        # active tasks could be missing from the display entirely.
        ordered_tasks = sorted(
            task_stats.items(),
            # Sort: 1. IN_PROGRESS first, 2. QUEUED, 3. COMPLETED/FAILED by recency
            key=lambda x: (
                0 if x[1]['status'] == CrawlStatus.IN_PROGRESS.name else
                1 if x[1]['status'] == CrawlStatus.QUEUED.name else
                2,
                -1 * (x[1].get('end_time', 0) or 0)  # Most recent first
            )
        )[:display_count]

        # Add rows for each task
        for task_id, stats in ordered_tasks:
            # Truncate task_id and URL for display
            short_id = task_id[:8]
            url = stats['url']
            if len(url) > 50:  # Allow longer URLs in the dedicated panel
                url = url[:47] + "..."

            # Format status with icon
            status = f"{status_icons.get(stats['status'], '?')} {stats['status']}"

            # Add row
            table.add_row(
                short_id,
                url,
                status,
                f"{stats['memory_usage']:.1f}",
                f"{stats['peak_memory']:.1f}",
                stats['duration'] if 'duration' in stats else "0:00"
            )

        return Panel(table, title="Task Details", border_style="yellow")

    def _create_footer(self) -> Panel:
        """Create the footer panel (memory status, credit, quit hint)."""
        from rich.columns import Columns
        from rich.align import Align

        memory_status = self.monitor.get_memory_status()
        memory_icon = "🟢"  # Default NORMAL
        if memory_status == "PRESSURE":
            memory_icon = "🟠"
        elif memory_status == "CRITICAL":
            memory_icon = "🔴"

        # Left section - memory status
        left_text = Text()
        left_text.append("Memory Status: ", style="bold")
        status_style = "green" if memory_status == "NORMAL" else "yellow" if memory_status == "PRESSURE" else "red bold"
        left_text.append(f"{memory_icon} {memory_status}", style=status_style)

        # Center section - copyright
        # NOTE(review): "UnclecCode" looks like a typo for "UncleCode" — confirm.
        center_text = Text("© Crawl4AI 2025 | Made by UnclecCode", style="cyan italic")

        # Right section - quit instruction
        right_text = Text()
        right_text.append("Press ", style="bold")
        right_text.append("q", style="white on blue")
        right_text.append(" to quit", style="bold")

        # Create columns with the three sections
        footer_content = Columns(
            [
                Align.left(left_text),
                Align.center(center_text),
                Align.right(right_text)
            ],
            expand=True
        )

        # Create a more visible footer panel
        return Panel(
            footer_content,
            border_style="white",
            padding=(0, 1)  # Add padding for better visibility
        )
|
||||
|
||||
|
||||
class CrawlerMonitor:
    """
    Comprehensive monitoring and visualization system for tracking web crawler operations in real-time.
    Provides a terminal-based dashboard that displays task statuses, memory usage, queue statistics,
    and performance metrics.
    """

    def __init__(
        self,
        urls_total: int = 0,
        refresh_rate: float = 1.0,
        enable_ui: bool = True,
        max_width: int = 120
    ):
        """
        Initialize the CrawlerMonitor.

        Args:
            urls_total: Total number of URLs to be crawled
            refresh_rate: How often to refresh the UI (in seconds)
            enable_ui: Whether to display the terminal UI
            max_width: Maximum width of the UI in characters
        """
        # Core monitoring attributes
        self.stats = {}  # Task ID -> stats dict
        self.memory_status = "NORMAL"
        self.start_time = None
        self.end_time = None
        self.is_running = False
        self.queue_stats = {
            "total_queued": 0,
            "highest_wait_time": 0.0,
            "avg_wait_time": 0.0
        }
        self.urls_total = urls_total
        self.urls_completed = 0
        self.peak_memory_percent = 0.0
        self.peak_memory_time = 0.0

        # Status counts
        self.status_counts = {
            CrawlStatus.QUEUED.name: 0,
            CrawlStatus.IN_PROGRESS.name: 0,
            CrawlStatus.COMPLETED.name: 0,
            CrawlStatus.FAILED.name: 0
        }

        # Requeue tracking
        self.requeued_count = 0

        # Thread-safety: RLock because methods holding the lock may call
        # other locked methods (e.g. render paths via get_summary).
        self._lock = threading.RLock()

        # Terminal UI
        self.enable_ui = enable_ui
        self.terminal_ui = TerminalUI(
            refresh_rate=refresh_rate,
            max_width=max_width
        ) if enable_ui else None

    def start(self):
        """
        Start the monitoring session.

        - Initializes the start_time
        - Sets is_running to True
        - Starts the terminal UI if enabled
        """
        with self._lock:
            self.start_time = time.time()
            self.is_running = True

        # Start the terminal UI (outside the lock; it spawns its own thread)
        if self.enable_ui and self.terminal_ui:
            self.terminal_ui.start(self)

    def stop(self):
        """
        Stop the monitoring session.

        - Records end_time
        - Sets is_running to False
        - Stops the terminal UI
        """
        with self._lock:
            self.end_time = time.time()
            self.is_running = False

        # Stop the terminal UI
        if self.enable_ui and self.terminal_ui:
            self.terminal_ui.stop()

    def add_task(self, task_id: str, url: str):
        """
        Register a new task with the monitor.

        Args:
            task_id: Unique identifier for the task
            url: URL being crawled

        The task is initialized with:
        - status: QUEUED
        - url: The URL to crawl
        - enqueue_time: Current time
        - memory_usage: 0
        - peak_memory: 0
        - wait_time: 0
        - retry_count: 0
        """
        with self._lock:
            self.stats[task_id] = {
                "task_id": task_id,
                "url": url,
                "status": CrawlStatus.QUEUED.name,
                "enqueue_time": time.time(),
                "start_time": None,
                "end_time": None,
                "memory_usage": 0.0,
                "peak_memory": 0.0,
                "error_message": "",
                "wait_time": 0.0,
                "retry_count": 0,
                "duration": "0:00",
                "counted_requeue": False
            }

            # Update status counts
            self.status_counts[CrawlStatus.QUEUED.name] += 1

    def update_task(
        self,
        task_id: str,
        status: Optional["CrawlStatus"] = None,
        start_time: Optional[float] = None,
        end_time: Optional[float] = None,
        memory_usage: Optional[float] = None,
        peak_memory: Optional[float] = None,
        error_message: Optional[str] = None,
        retry_count: Optional[int] = None,
        wait_time: Optional[float] = None
    ):
        """
        Update statistics for a specific task.

        Args:
            task_id: Unique identifier for the task
            status: New status (QUEUED, IN_PROGRESS, COMPLETED, FAILED)
            start_time: When task execution started
            end_time: When task execution ended
            memory_usage: Current memory usage in MB
            peak_memory: Maximum memory usage in MB
            error_message: Error description if failed
            retry_count: Number of retry attempts
            wait_time: Time spent in queue

        Updates task statistics and updates status counts.
        If status changes, decrements old status count and
        increments new status count. Unknown task_ids are ignored.
        """
        with self._lock:
            # Check if task exists
            if task_id not in self.stats:
                return

            task_stats = self.stats[task_id]

            # Update status counts if status is changing
            old_status = task_stats["status"]
            if status and status.name != old_status:
                self.status_counts[old_status] -= 1
                self.status_counts[status.name] += 1

                # Track completion
                if status == CrawlStatus.COMPLETED:
                    self.urls_completed += 1

                # Track requeues: a task leaving a terminal state was put
                # back into rotation; count it at most once per task.
                if old_status in [CrawlStatus.COMPLETED.name, CrawlStatus.FAILED.name] and not task_stats.get("counted_requeue", False):
                    self.requeued_count += 1
                    task_stats["counted_requeue"] = True

            # Update task statistics
            if status:
                task_stats["status"] = status.name
            if start_time is not None:
                task_stats["start_time"] = start_time
            if end_time is not None:
                task_stats["end_time"] = end_time
            if memory_usage is not None:
                task_stats["memory_usage"] = memory_usage

                # Update peak memory if necessary.
                # FIX: memory_usage is reported in MB (see docstring and the
                # demo values), while virtual_memory().total is in bytes.
                # Convert MB -> bytes before computing the percentage; the
                # previous code underestimated by a factor of 1024*1024.
                current_percent = (memory_usage * 1024 * 1024 / psutil.virtual_memory().total) * 100
                if current_percent > self.peak_memory_percent:
                    self.peak_memory_percent = current_percent
                    self.peak_memory_time = time.time()

            if peak_memory is not None:
                task_stats["peak_memory"] = peak_memory
            if error_message is not None:
                task_stats["error_message"] = error_message
            if retry_count is not None:
                task_stats["retry_count"] = retry_count
            if wait_time is not None:
                task_stats["wait_time"] = wait_time

            # Calculate duration (live tasks measure up to "now")
            if task_stats["start_time"]:
                end = task_stats["end_time"] or time.time()
                duration = end - task_stats["start_time"]
                task_stats["duration"] = self._format_time(duration)

    def update_memory_status(self, status: str):
        """
        Update the current memory status.

        Args:
            status: Memory status (NORMAL, PRESSURE, CRITICAL, or custom)
        """
        with self._lock:
            self.memory_status = status

    def update_queue_statistics(
        self,
        total_queued: int,
        highest_wait_time: float,
        avg_wait_time: float
    ):
        """
        Update statistics related to the task queue.

        Args:
            total_queued: Number of tasks currently in queue
            highest_wait_time: Longest wait time of any queued task
            avg_wait_time: Average wait time across all queued tasks
        """
        with self._lock:
            self.queue_stats = {
                "total_queued": total_queued,
                "highest_wait_time": highest_wait_time,
                "avg_wait_time": avg_wait_time
            }

    def get_task_stats(self, task_id: str) -> Dict:
        """
        Get statistics for a specific task.

        Args:
            task_id: Unique identifier for the task

        Returns:
            Copy of the task's statistics dict ({} for unknown task_ids).
        """
        with self._lock:
            return self.stats.get(task_id, {}).copy()

    def get_all_task_stats(self) -> Dict[str, Dict]:
        """
        Get statistics for all tasks.

        Returns:
            Shallow copy of the task_id -> statistics mapping.
        """
        with self._lock:
            return self.stats.copy()

    def get_memory_status(self) -> str:
        """
        Get the current memory status.

        Returns:
            Current memory status string
        """
        with self._lock:
            return self.memory_status

    def get_queue_stats(self) -> Dict:
        """
        Get current queue statistics.

        Returns:
            Dictionary with queue statistics including:
            - total_queued: Number of tasks in queue
            - highest_wait_time: Longest wait time
            - avg_wait_time: Average wait time
        """
        with self._lock:
            return self.queue_stats.copy()

    def get_summary(self) -> Dict:
        """
        Get a summary of all crawler statistics.

        Returns:
            Dictionary containing:
            - runtime: Total runtime in seconds
            - urls_total: Total URLs to process
            - urls_completed: Number of completed URLs
            - completion_percentage: Percentage complete
            - status_counts: Count of tasks in each status
            - memory_status: Current memory status
            - peak_memory_percent: Highest memory usage
            - peak_memory_time: When peak memory occurred
            - avg_task_duration: Average task processing time
            - estimated_completion_time: Projected finish time
            - requeue_rate: Percentage of tasks requeued
            - requeued_count: Number of tasks requeued
        """
        with self._lock:
            # Calculate runtime (0 before start() has been called)
            current_time = time.time()
            runtime = current_time - (self.start_time or current_time)

            # Calculate completion percentage
            completion_percentage = 0
            if self.urls_total > 0:
                completion_percentage = (self.urls_completed / self.urls_total) * 100

            # Calculate average task duration for completed tasks
            completed_tasks = [
                task for task in self.stats.values()
                if task["status"] == CrawlStatus.COMPLETED.name and task.get("start_time") and task.get("end_time")
            ]

            avg_task_duration = 0
            if completed_tasks:
                total_duration = sum(task["end_time"] - task["start_time"] for task in completed_tasks)
                avg_task_duration = total_duration / len(completed_tasks)

            # Calculate requeue rate
            requeue_rate = 0
            if len(self.stats) > 0:
                requeue_rate = (self.requeued_count / len(self.stats)) * 100

            # Calculate estimated completion time (naive: remaining * avg,
            # ignores concurrency — presumably good enough for a dashboard)
            estimated_completion_time = "N/A"
            if avg_task_duration > 0 and self.urls_total > 0 and self.urls_completed > 0:
                remaining_tasks = self.urls_total - self.urls_completed
                estimated_seconds = remaining_tasks * avg_task_duration
                estimated_completion_time = self._format_time(estimated_seconds)

            return {
                "runtime": runtime,
                "urls_total": self.urls_total,
                "urls_completed": self.urls_completed,
                "completion_percentage": completion_percentage,
                "status_counts": self.status_counts.copy(),
                "memory_status": self.memory_status,
                "peak_memory_percent": self.peak_memory_percent,
                "peak_memory_time": self.peak_memory_time,
                "avg_task_duration": avg_task_duration,
                "estimated_completion_time": estimated_completion_time,
                "requeue_rate": requeue_rate,
                "requeued_count": self.requeued_count
            }

    def render(self):
        """
        Force a refresh of the terminal UI.

        Note: The actual rendering is handled by the TerminalUI class
        which uses the rich library's Live display; this is a no-op when
        the UI is disabled.
        """
        if self.enable_ui and self.terminal_ui:
            # Force an update of the UI
            if hasattr(self.terminal_ui, '_update_display'):
                self.terminal_ui._update_display()

    def _format_time(self, seconds: float) -> str:
        """
        Format time in hours:minutes:seconds.

        Args:
            seconds: Time in seconds

        Returns:
            Formatted time string (e.g., "1:23:45", or "3:45" under an hour)
        """
        # FIX: the previous implementation used timedelta(...).seconds,
        # which excludes whole days — a crawl running for 25h01m01s was
        # displayed as "1:01:01". Plain integer divmod keeps the full span.
        total = int(seconds)
        hours, remainder = divmod(total, 3600)
        minutes, secs = divmod(remainder, 60)

        if hours > 0:
            return f"{hours}:{minutes:02}:{secs:02}"
        else:
            return f"{minutes}:{secs:02}"

    def _calculate_estimated_completion(self) -> str:
        """
        Calculate estimated completion time based on current progress.

        Returns:
            Formatted time string
        """
        summary = self.get_summary()
        return summary.get("estimated_completion_time", "N/A")
|
||||
|
||||
|
||||
# Example code for testing
if __name__ == "__main__":
    # Build a monitor for a pretend 100-URL crawl and start the dashboard.
    monitor = CrawlerMonitor(urls_total=100)
    monitor.start()

    try:
        # Seed the dashboard with 20 simulated tasks in assorted states.
        for idx in range(20):
            task_id = str(uuid.uuid4())
            monitor.add_task(task_id, f"https://example.com/page{idx}")

            if idx < 4:
                # First 20% of tasks are already running.
                monitor.update_task(
                    task_id=task_id,
                    status=CrawlStatus.IN_PROGRESS,
                    start_time=time.time() - 30,  # Started 30 seconds ago
                    memory_usage=10.5,
                )
            elif idx < 6:
                # Next 10% ran for a while and completed successfully.
                monitor.update_task(
                    task_id=task_id,
                    status=CrawlStatus.IN_PROGRESS,
                    start_time=time.time() - 60,
                    memory_usage=8.2,
                )
                monitor.update_task(
                    task_id=task_id,
                    status=CrawlStatus.COMPLETED,
                    end_time=time.time() - 15,
                    memory_usage=0,
                    peak_memory=15.7,
                )
            elif idx < 7:
                # One task (5%) fails with an error.
                monitor.update_task(
                    task_id=task_id,
                    status=CrawlStatus.IN_PROGRESS,
                    start_time=time.time() - 45,
                    memory_usage=12.3,
                )
                monitor.update_task(
                    task_id=task_id,
                    status=CrawlStatus.FAILED,
                    end_time=time.time() - 20,
                    memory_usage=0,
                    peak_memory=18.2,
                    error_message="Connection timeout",
                )

        # Simulate memory pressure and a queue backlog.
        monitor.update_memory_status("PRESSURE")
        monitor.update_queue_statistics(
            total_queued=16,  # 20 - 4 (in progress)
            highest_wait_time=120.5,
            avg_wait_time=60.2,
        )

        # Keep the demo alive until the UI's 'q' handler flips is_running.
        print("Crawler Monitor is running. Press 'q' to exit.")
        while monitor.is_running:
            time.sleep(0.1)

    except KeyboardInterrupt:
        print("\nExiting crawler monitor...")
    finally:
        # Always shut the monitor (and its UI thread) down cleanly.
        monitor.stop()
        print("Crawler monitor exited successfully.")
|
||||
@@ -4,7 +4,8 @@ from dotenv import load_dotenv
|
||||
load_dotenv() # Load environment variables from .env file
|
||||
|
||||
# Default provider, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||
DEFAULT_PROVIDER = "openai/gpt-4o-mini"
|
||||
DEFAULT_PROVIDER = "openai/gpt-4o"
|
||||
DEFAULT_PROVIDER_API_KEY = "OPENAI_API_KEY"
|
||||
MODEL_REPO_BRANCH = "new-release-0.0.2"
|
||||
# Provider-model dictionary, ONLY used when the extraction strategy is LLMExtractionStrategy
|
||||
PROVIDER_MODELS = {
|
||||
|
||||
@@ -1,2 +0,0 @@
|
||||
from .proxy_config import ProxyConfig
|
||||
__all__ = ["ProxyConfig"]
|
||||
@@ -1,113 +0,0 @@
|
||||
import os
|
||||
from typing import Dict, List, Optional
|
||||
|
||||
|
||||
class ProxyConfig:
|
||||
def __init__(
|
||||
self,
|
||||
server: str,
|
||||
username: Optional[str] = None,
|
||||
password: Optional[str] = None,
|
||||
ip: Optional[str] = None,
|
||||
):
|
||||
"""Configuration class for a single proxy.
|
||||
|
||||
Args:
|
||||
server: Proxy server URL (e.g., "http://127.0.0.1:8080")
|
||||
username: Optional username for proxy authentication
|
||||
password: Optional password for proxy authentication
|
||||
ip: Optional IP address for verification purposes
|
||||
"""
|
||||
self.server = server
|
||||
self.username = username
|
||||
self.password = password
|
||||
|
||||
# Extract IP from server if not explicitly provided
|
||||
self.ip = ip or self._extract_ip_from_server()
|
||||
|
||||
def _extract_ip_from_server(self) -> Optional[str]:
|
||||
"""Extract IP address from server URL."""
|
||||
try:
|
||||
# Simple extraction assuming http://ip:port format
|
||||
if "://" in self.server:
|
||||
parts = self.server.split("://")[1].split(":")
|
||||
return parts[0]
|
||||
else:
|
||||
parts = self.server.split(":")
|
||||
return parts[0]
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def from_string(proxy_str: str) -> "ProxyConfig":
|
||||
"""Create a ProxyConfig from a string in the format 'ip:port:username:password'."""
|
||||
parts = proxy_str.split(":")
|
||||
if len(parts) == 4: # ip:port:username:password
|
||||
ip, port, username, password = parts
|
||||
return ProxyConfig(
|
||||
server=f"http://{ip}:{port}",
|
||||
username=username,
|
||||
password=password,
|
||||
ip=ip
|
||||
)
|
||||
elif len(parts) == 2: # ip:port only
|
||||
ip, port = parts
|
||||
return ProxyConfig(
|
||||
server=f"http://{ip}:{port}",
|
||||
ip=ip
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Invalid proxy string format: {proxy_str}")
|
||||
|
||||
@staticmethod
|
||||
def from_dict(proxy_dict: Dict) -> "ProxyConfig":
|
||||
"""Create a ProxyConfig from a dictionary."""
|
||||
return ProxyConfig(
|
||||
server=proxy_dict.get("server"),
|
||||
username=proxy_dict.get("username"),
|
||||
password=proxy_dict.get("password"),
|
||||
ip=proxy_dict.get("ip")
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]:
|
||||
"""Load proxies from environment variable.
|
||||
|
||||
Args:
|
||||
env_var: Name of environment variable containing comma-separated proxy strings
|
||||
|
||||
Returns:
|
||||
List of ProxyConfig objects
|
||||
"""
|
||||
proxies = []
|
||||
try:
|
||||
proxy_list = os.getenv(env_var, "").split(",")
|
||||
for proxy in proxy_list:
|
||||
if not proxy:
|
||||
continue
|
||||
proxies.append(ProxyConfig.from_string(proxy))
|
||||
except Exception as e:
|
||||
print(f"Error loading proxies from environment: {e}")
|
||||
return proxies
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
"""Convert to dictionary representation."""
|
||||
return {
|
||||
"server": self.server,
|
||||
"username": self.username,
|
||||
"password": self.password,
|
||||
"ip": self.ip
|
||||
}
|
||||
|
||||
def clone(self, **kwargs) -> "ProxyConfig":
|
||||
"""Create a copy of this configuration with updated values.
|
||||
|
||||
Args:
|
||||
**kwargs: Key-value pairs of configuration options to update
|
||||
|
||||
Returns:
|
||||
ProxyConfig: A new instance with the specified updates
|
||||
"""
|
||||
config_dict = self.to_dict()
|
||||
config_dict.update(kwargs)
|
||||
return ProxyConfig.from_dict(config_dict)
|
||||
@@ -16,13 +16,13 @@ from .utils import (
|
||||
extract_xml_data,
|
||||
merge_chunks,
|
||||
)
|
||||
from .types import LLMConfig
|
||||
from .config import DEFAULT_PROVIDER, OVERLAP_RATE, WORD_TOKEN_RATE
|
||||
from abc import ABC, abstractmethod
|
||||
import math
|
||||
from snowballstemmer import stemmer
|
||||
from .config import DEFAULT_PROVIDER, OVERLAP_RATE, WORD_TOKEN_RATE, PROVIDER_MODELS
|
||||
from .models import TokenUsage
|
||||
from .prompts import PROMPT_FILTER_CONTENT
|
||||
import os
|
||||
import json
|
||||
import hashlib
|
||||
from pathlib import Path
|
||||
@@ -770,37 +770,56 @@ class PruningContentFilter(RelevantContentFilter):
|
||||
|
||||
|
||||
class LLMContentFilter(RelevantContentFilter):
|
||||
"""Content filtering using LLMs to generate relevant markdown."""
|
||||
"""Content filtering using LLMs to generate relevant markdown.
|
||||
|
||||
How it works:
|
||||
1. Extracts page metadata with fallbacks.
|
||||
2. Extracts text chunks from the body element.
|
||||
3. Applies LLMs to generate markdown for each chunk.
|
||||
4. Filters out chunks below the threshold.
|
||||
5. Sorts chunks by score in descending order.
|
||||
6. Returns the top N chunks.
|
||||
|
||||
Attributes:
|
||||
llm_config (LLMConfig): LLM configuration object.
|
||||
instruction (str): Instruction for LLM markdown generation
|
||||
chunk_token_threshold (int): Chunk token threshold for splitting (default: 1e9).
|
||||
overlap_rate (float): Overlap rate for chunking (default: 0.5).
|
||||
word_token_rate (float): Word token rate for chunking (default: 0.2).
|
||||
verbose (bool): Enable verbose logging (default: False).
|
||||
logger (AsyncLogger): Custom logger for LLM operations (optional).
|
||||
"""
|
||||
_UNWANTED_PROPS = {
|
||||
'provider' : 'Instead, use llmConfig=LlmConfig(provider="...")',
|
||||
'api_token' : 'Instead, use llmConfig=LlMConfig(api_token="...")',
|
||||
'base_url' : 'Instead, use llmConfig=LlmConfig(base_url="...")',
|
||||
'api_base' : 'Instead, use llmConfig=LlmConfig(base_url="...")',
|
||||
'provider' : 'Instead, use llm_config=LLMConfig(provider="...")',
|
||||
'api_token' : 'Instead, use llm_config=LlMConfig(api_token="...")',
|
||||
'base_url' : 'Instead, use llm_config=LLMConfig(base_url="...")',
|
||||
'api_base' : 'Instead, use llm_config=LLMConfig(base_url="...")',
|
||||
}
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
provider: str = DEFAULT_PROVIDER,
|
||||
api_token: Optional[str] = None,
|
||||
llmConfig: "LlmConfig" = None,
|
||||
llm_config: "LLMConfig" = None,
|
||||
instruction: str = None,
|
||||
chunk_token_threshold: int = int(1e9),
|
||||
overlap_rate: float = OVERLAP_RATE,
|
||||
word_token_rate: float = WORD_TOKEN_RATE,
|
||||
base_url: Optional[str] = None,
|
||||
api_base: Optional[str] = None,
|
||||
extra_args: Dict = None,
|
||||
# char_token_rate: float = WORD_TOKEN_RATE * 5,
|
||||
# chunk_mode: str = "char",
|
||||
verbose: bool = False,
|
||||
logger: Optional[AsyncLogger] = None,
|
||||
ignore_cache: bool = True,
|
||||
# Deprecated properties
|
||||
provider: str = DEFAULT_PROVIDER,
|
||||
api_token: Optional[str] = None,
|
||||
base_url: Optional[str] = None,
|
||||
api_base: Optional[str] = None,
|
||||
extra_args: Dict = None,
|
||||
):
|
||||
super().__init__(None)
|
||||
self.provider = provider
|
||||
self.api_token = api_token
|
||||
self.base_url = base_url or api_base
|
||||
self.llmConfig = llmConfig
|
||||
self.llm_config = llm_config
|
||||
self.instruction = instruction
|
||||
self.chunk_token_threshold = chunk_token_threshold
|
||||
self.overlap_rate = overlap_rate
|
||||
@@ -872,7 +891,7 @@ class LLMContentFilter(RelevantContentFilter):
|
||||
self.logger.info(
|
||||
"Starting LLM markdown content filtering process",
|
||||
tag="LLM",
|
||||
params={"provider": self.llmConfig.provider},
|
||||
params={"provider": self.llm_config.provider},
|
||||
colors={"provider": Fore.CYAN},
|
||||
)
|
||||
|
||||
@@ -959,10 +978,10 @@ class LLMContentFilter(RelevantContentFilter):
|
||||
|
||||
future = executor.submit(
|
||||
_proceed_with_chunk,
|
||||
self.llmConfig.provider,
|
||||
self.llm_config.provider,
|
||||
prompt,
|
||||
self.llmConfig.api_token,
|
||||
self.llmConfig.base_url,
|
||||
self.llm_config.api_token,
|
||||
self.llm_config.base_url,
|
||||
self.extra_args,
|
||||
)
|
||||
futures.append((i, future))
|
||||
|
||||
@@ -155,6 +155,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
for aud in raw_result.get("media", {}).get("audios", [])
|
||||
if aud
|
||||
],
|
||||
tables=raw_result.get("media", {}).get("tables", [])
|
||||
)
|
||||
|
||||
# Convert links
|
||||
@@ -193,6 +194,153 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
"""
|
||||
return await asyncio.to_thread(self._scrap, url, html, **kwargs)
|
||||
|
||||
def is_data_table(self, table: Tag, **kwargs) -> bool:
|
||||
"""
|
||||
Determine if a table element is a data table (not a layout table).
|
||||
|
||||
Args:
|
||||
table (Tag): BeautifulSoup Tag representing a table element
|
||||
**kwargs: Additional keyword arguments including table_score_threshold
|
||||
|
||||
Returns:
|
||||
bool: True if the table is a data table, False otherwise
|
||||
"""
|
||||
score = 0
|
||||
|
||||
# Check for thead and tbody
|
||||
has_thead = len(table.select('thead')) > 0
|
||||
has_tbody = len(table.select('tbody')) > 0
|
||||
if has_thead:
|
||||
score += 2
|
||||
if has_tbody:
|
||||
score += 1
|
||||
|
||||
# Check for th elements
|
||||
th_count = len(table.select('th'))
|
||||
if th_count > 0:
|
||||
score += 2
|
||||
if has_thead or len(table.select('tr:first-child th')) > 0:
|
||||
score += 1
|
||||
|
||||
# Check for nested tables
|
||||
if len(table.select('table')) > 0:
|
||||
score -= 3
|
||||
|
||||
# Role attribute check
|
||||
role = table.get('role', '').lower()
|
||||
if role in {'presentation', 'none'}:
|
||||
score -= 3
|
||||
|
||||
# Column consistency
|
||||
rows = table.select('tr')
|
||||
if not rows:
|
||||
return False
|
||||
|
||||
col_counts = [len(row.select('td, th')) for row in rows]
|
||||
avg_cols = sum(col_counts) / len(col_counts)
|
||||
variance = sum((c - avg_cols)**2 for c in col_counts) / len(col_counts)
|
||||
if variance < 1:
|
||||
score += 2
|
||||
|
||||
# Caption and summary
|
||||
if table.select('caption'):
|
||||
score += 2
|
||||
if table.has_attr('summary') and table['summary']:
|
||||
score += 1
|
||||
|
||||
# Text density
|
||||
total_text = sum(len(cell.get_text().strip()) for row in rows for cell in row.select('td, th'))
|
||||
total_tags = sum(1 for _ in table.descendants if isinstance(_, Tag))
|
||||
text_ratio = total_text / (total_tags + 1e-5)
|
||||
if text_ratio > 20:
|
||||
score += 3
|
||||
elif text_ratio > 10:
|
||||
score += 2
|
||||
|
||||
# Data attributes
|
||||
data_attrs = sum(1 for attr in table.attrs if attr.startswith('data-'))
|
||||
score += data_attrs * 0.5
|
||||
|
||||
# Size check
|
||||
if avg_cols >= 2 and len(rows) >= 2:
|
||||
score += 2
|
||||
|
||||
threshold = kwargs.get('table_score_threshold', 7)
|
||||
return score >= threshold
|
||||
|
||||
def extract_table_data(self, table: Tag) -> dict:
|
||||
"""
|
||||
Extract structured data from a table element.
|
||||
|
||||
Args:
|
||||
table (Tag): BeautifulSoup Tag representing a table element
|
||||
|
||||
Returns:
|
||||
dict: Dictionary containing table data (headers, rows, caption, summary)
|
||||
"""
|
||||
caption_elem = table.select_one('caption')
|
||||
caption = caption_elem.get_text().strip() if caption_elem else ""
|
||||
summary = table.get('summary', '').strip()
|
||||
|
||||
# Extract headers with colspan handling
|
||||
headers = []
|
||||
thead_rows = table.select('thead tr')
|
||||
if thead_rows:
|
||||
header_cells = thead_rows[0].select('th')
|
||||
for cell in header_cells:
|
||||
text = cell.get_text().strip()
|
||||
colspan = int(cell.get('colspan', 1))
|
||||
headers.extend([text] * colspan)
|
||||
else:
|
||||
first_row = table.select('tr:first-child')
|
||||
if first_row:
|
||||
for cell in first_row[0].select('th, td'):
|
||||
text = cell.get_text().strip()
|
||||
colspan = int(cell.get('colspan', 1))
|
||||
headers.extend([text] * colspan)
|
||||
|
||||
# Extract rows with colspan handling
|
||||
rows = []
|
||||
all_rows = table.select('tr')
|
||||
thead = table.select_one('thead')
|
||||
tbody_rows = []
|
||||
|
||||
if thead:
|
||||
thead_rows = thead.select('tr')
|
||||
tbody_rows = [row for row in all_rows if row not in thead_rows]
|
||||
else:
|
||||
if all_rows and all_rows[0].select('th'):
|
||||
tbody_rows = all_rows[1:]
|
||||
else:
|
||||
tbody_rows = all_rows
|
||||
|
||||
for row in tbody_rows:
|
||||
# for row in table.select('tr:not(:has(ancestor::thead))'):
|
||||
row_data = []
|
||||
for cell in row.select('td'):
|
||||
text = cell.get_text().strip()
|
||||
colspan = int(cell.get('colspan', 1))
|
||||
row_data.extend([text] * colspan)
|
||||
if row_data:
|
||||
rows.append(row_data)
|
||||
|
||||
# Align rows with headers
|
||||
max_columns = len(headers) if headers else (max(len(row) for row in rows) if rows else 0)
|
||||
aligned_rows = []
|
||||
for row in rows:
|
||||
aligned = row[:max_columns] + [''] * (max_columns - len(row))
|
||||
aligned_rows.append(aligned)
|
||||
|
||||
if not headers:
|
||||
headers = [f"Column {i+1}" for i in range(max_columns)]
|
||||
|
||||
return {
|
||||
"headers": headers,
|
||||
"rows": aligned_rows,
|
||||
"caption": caption,
|
||||
"summary": summary,
|
||||
}
|
||||
|
||||
def flatten_nested_elements(self, node):
|
||||
"""
|
||||
Flatten nested elements in a HTML tree.
|
||||
@@ -431,7 +579,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
Returns:
|
||||
dict: A dictionary containing the processed element information.
|
||||
"""
|
||||
media = {"images": [], "videos": [], "audios": []}
|
||||
media = {"images": [], "videos": [], "audios": [], "tables": []}
|
||||
internal_links_dict = {}
|
||||
external_links_dict = {}
|
||||
self._process_element(
|
||||
@@ -688,6 +836,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
html: str,
|
||||
word_count_threshold: int = MIN_WORD_THRESHOLD,
|
||||
css_selector: str = None,
|
||||
target_elements: List[str] = None,
|
||||
**kwargs,
|
||||
) -> Dict[str, Any]:
|
||||
"""
|
||||
@@ -742,22 +891,37 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
for element in body.select(excluded_selector):
|
||||
element.extract()
|
||||
|
||||
if css_selector:
|
||||
selected_elements = body.select(css_selector)
|
||||
if not selected_elements:
|
||||
return {
|
||||
"markdown": "",
|
||||
"cleaned_html": "",
|
||||
"success": True,
|
||||
"media": {"images": [], "videos": [], "audios": []},
|
||||
"links": {"internal": [], "external": []},
|
||||
"metadata": {},
|
||||
"message": f"No elements found for CSS selector: {css_selector}",
|
||||
}
|
||||
# raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
|
||||
body = soup.new_tag("div")
|
||||
for el in selected_elements:
|
||||
body.append(el)
|
||||
# if False and css_selector:
|
||||
# selected_elements = body.select(css_selector)
|
||||
# if not selected_elements:
|
||||
# return {
|
||||
# "markdown": "",
|
||||
# "cleaned_html": "",
|
||||
# "success": True,
|
||||
# "media": {"images": [], "videos": [], "audios": []},
|
||||
# "links": {"internal": [], "external": []},
|
||||
# "metadata": {},
|
||||
# "message": f"No elements found for CSS selector: {css_selector}",
|
||||
# }
|
||||
# # raise InvalidCSSSelectorError(f"Invalid CSS selector, No elements found for CSS selector: {css_selector}")
|
||||
# body = soup.new_tag("div")
|
||||
# for el in selected_elements:
|
||||
# body.append(el)
|
||||
|
||||
content_element = None
|
||||
if target_elements:
|
||||
try:
|
||||
for_content_targeted_element = []
|
||||
for target_element in target_elements:
|
||||
for_content_targeted_element.extend(body.select(target_element))
|
||||
content_element = soup.new_tag("div")
|
||||
for el in for_content_targeted_element:
|
||||
content_element.append(el)
|
||||
except Exception as e:
|
||||
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
||||
return None
|
||||
else:
|
||||
content_element = body
|
||||
|
||||
kwargs["exclude_social_media_domains"] = set(
|
||||
kwargs.get("exclude_social_media_domains", []) + SOCIAL_MEDIA_DOMAINS
|
||||
@@ -797,6 +961,15 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
if result is not None
|
||||
for img in result
|
||||
]
|
||||
|
||||
# Process tables if not excluded
|
||||
excluded_tags = set(kwargs.get("excluded_tags", []) or [])
|
||||
if 'table' not in excluded_tags:
|
||||
tables = body.find_all('table')
|
||||
for table in tables:
|
||||
if self.is_data_table(table, **kwargs):
|
||||
table_data = self.extract_table_data(table)
|
||||
media["tables"].append(table_data)
|
||||
|
||||
body = self.flatten_nested_elements(body)
|
||||
base64_pattern = re.compile(r'data:image/[^;]+;base64,([^"]+)')
|
||||
@@ -808,7 +981,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
|
||||
str_body = ""
|
||||
try:
|
||||
str_body = body.encode_contents().decode("utf-8")
|
||||
str_body = content_element.encode_contents().decode("utf-8")
|
||||
except Exception:
|
||||
# Reset body to the original HTML
|
||||
success = False
|
||||
@@ -847,7 +1020,6 @@ class WebScrapingStrategy(ContentScrapingStrategy):
|
||||
cleaned_html = str_body.replace("\n\n", "\n").replace(" ", " ")
|
||||
|
||||
return {
|
||||
# **markdown_content,
|
||||
"cleaned_html": cleaned_html,
|
||||
"success": success,
|
||||
"media": media,
|
||||
@@ -1187,12 +1359,125 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
|
||||
return root
|
||||
|
||||
def is_data_table(self, table: etree.Element, **kwargs) -> bool:
|
||||
score = 0
|
||||
# Check for thead and tbody
|
||||
has_thead = len(table.xpath(".//thead")) > 0
|
||||
has_tbody = len(table.xpath(".//tbody")) > 0
|
||||
if has_thead:
|
||||
score += 2
|
||||
if has_tbody:
|
||||
score += 1
|
||||
|
||||
# Check for th elements
|
||||
th_count = len(table.xpath(".//th"))
|
||||
if th_count > 0:
|
||||
score += 2
|
||||
if has_thead or table.xpath(".//tr[1]/th"):
|
||||
score += 1
|
||||
|
||||
# Check for nested tables
|
||||
if len(table.xpath(".//table")) > 0:
|
||||
score -= 3
|
||||
|
||||
# Role attribute check
|
||||
role = table.get("role", "").lower()
|
||||
if role in {"presentation", "none"}:
|
||||
score -= 3
|
||||
|
||||
# Column consistency
|
||||
rows = table.xpath(".//tr")
|
||||
if not rows:
|
||||
return False
|
||||
col_counts = [len(row.xpath(".//td|.//th")) for row in rows]
|
||||
avg_cols = sum(col_counts) / len(col_counts)
|
||||
variance = sum((c - avg_cols)**2 for c in col_counts) / len(col_counts)
|
||||
if variance < 1:
|
||||
score += 2
|
||||
|
||||
# Caption and summary
|
||||
if table.xpath(".//caption"):
|
||||
score += 2
|
||||
if table.get("summary"):
|
||||
score += 1
|
||||
|
||||
# Text density
|
||||
total_text = sum(len(''.join(cell.itertext()).strip()) for row in rows for cell in row.xpath(".//td|.//th"))
|
||||
total_tags = sum(1 for _ in table.iterdescendants())
|
||||
text_ratio = total_text / (total_tags + 1e-5)
|
||||
if text_ratio > 20:
|
||||
score += 3
|
||||
elif text_ratio > 10:
|
||||
score += 2
|
||||
|
||||
# Data attributes
|
||||
data_attrs = sum(1 for attr in table.attrib if attr.startswith('data-'))
|
||||
score += data_attrs * 0.5
|
||||
|
||||
# Size check
|
||||
if avg_cols >= 2 and len(rows) >= 2:
|
||||
score += 2
|
||||
|
||||
threshold = kwargs.get("table_score_threshold", 7)
|
||||
return score >= threshold
|
||||
|
||||
def extract_table_data(self, table: etree.Element) -> dict:
|
||||
caption = table.xpath(".//caption/text()")
|
||||
caption = caption[0].strip() if caption else ""
|
||||
summary = table.get("summary", "").strip()
|
||||
|
||||
# Extract headers with colspan handling
|
||||
headers = []
|
||||
thead_rows = table.xpath(".//thead/tr")
|
||||
if thead_rows:
|
||||
header_cells = thead_rows[0].xpath(".//th")
|
||||
for cell in header_cells:
|
||||
text = cell.text_content().strip()
|
||||
colspan = int(cell.get("colspan", 1))
|
||||
headers.extend([text] * colspan)
|
||||
else:
|
||||
first_row = table.xpath(".//tr[1]")
|
||||
if first_row:
|
||||
for cell in first_row[0].xpath(".//th|.//td"):
|
||||
text = cell.text_content().strip()
|
||||
colspan = int(cell.get("colspan", 1))
|
||||
headers.extend([text] * colspan)
|
||||
|
||||
# Extract rows with colspan handling
|
||||
rows = []
|
||||
for row in table.xpath(".//tr[not(ancestor::thead)]"):
|
||||
row_data = []
|
||||
for cell in row.xpath(".//td"):
|
||||
text = cell.text_content().strip()
|
||||
colspan = int(cell.get("colspan", 1))
|
||||
row_data.extend([text] * colspan)
|
||||
if row_data:
|
||||
rows.append(row_data)
|
||||
|
||||
# Align rows with headers
|
||||
max_columns = len(headers) if headers else (max(len(row) for row in rows) if rows else 0)
|
||||
aligned_rows = []
|
||||
for row in rows:
|
||||
aligned = row[:max_columns] + [''] * (max_columns - len(row))
|
||||
aligned_rows.append(aligned)
|
||||
|
||||
if not headers:
|
||||
headers = [f"Column {i+1}" for i in range(max_columns)]
|
||||
|
||||
return {
|
||||
"headers": headers,
|
||||
"rows": aligned_rows,
|
||||
"caption": caption,
|
||||
"summary": summary,
|
||||
}
|
||||
|
||||
def _scrap(
|
||||
self,
|
||||
url: str,
|
||||
html: str,
|
||||
word_count_threshold: int = MIN_WORD_THRESHOLD,
|
||||
css_selector: str = None,
|
||||
target_elements: List[str] = None,
|
||||
**kwargs,
|
||||
) -> Dict[str, Any]:
|
||||
if not html:
|
||||
@@ -1243,24 +1528,38 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
meta = {}
|
||||
|
||||
# Handle CSS selector targeting
|
||||
if css_selector:
|
||||
# if css_selector:
|
||||
# try:
|
||||
# selected_elements = body.cssselect(css_selector)
|
||||
# if not selected_elements:
|
||||
# return {
|
||||
# "markdown": "",
|
||||
# "cleaned_html": "",
|
||||
# "success": True,
|
||||
# "media": {"images": [], "videos": [], "audios": []},
|
||||
# "links": {"internal": [], "external": []},
|
||||
# "metadata": meta,
|
||||
# "message": f"No elements found for CSS selector: {css_selector}",
|
||||
# }
|
||||
# body = lhtml.Element("div")
|
||||
# body.extend(selected_elements)
|
||||
# except Exception as e:
|
||||
# self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE")
|
||||
# return None
|
||||
|
||||
content_element = None
|
||||
if target_elements:
|
||||
try:
|
||||
selected_elements = body.cssselect(css_selector)
|
||||
if not selected_elements:
|
||||
return {
|
||||
"markdown": "",
|
||||
"cleaned_html": "",
|
||||
"success": True,
|
||||
"media": {"images": [], "videos": [], "audios": []},
|
||||
"links": {"internal": [], "external": []},
|
||||
"metadata": meta,
|
||||
"message": f"No elements found for CSS selector: {css_selector}",
|
||||
}
|
||||
body = lhtml.Element("div")
|
||||
body.extend(selected_elements)
|
||||
for_content_targeted_element = []
|
||||
for target_element in target_elements:
|
||||
for_content_targeted_element.extend(body.cssselect(target_element))
|
||||
content_element = lhtml.Element("div")
|
||||
content_element.extend(for_content_targeted_element)
|
||||
except Exception as e:
|
||||
self._log("error", f"Error with CSS selector: {str(e)}", "SCRAPE")
|
||||
self._log("error", f"Error with target element detection: {str(e)}", "SCRAPE")
|
||||
return None
|
||||
else:
|
||||
content_element = body
|
||||
|
||||
# Remove script and style tags
|
||||
for tag in ["script", "style", "link", "meta", "noscript"]:
|
||||
@@ -1284,7 +1583,7 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
form.getparent().remove(form)
|
||||
|
||||
# Process content
|
||||
media = {"images": [], "videos": [], "audios": []}
|
||||
media = {"images": [], "videos": [], "audios": [], "tables": []}
|
||||
internal_links_dict = {}
|
||||
external_links_dict = {}
|
||||
|
||||
@@ -1298,6 +1597,13 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
if 'table' not in excluded_tags:
|
||||
tables = body.xpath(".//table")
|
||||
for table in tables:
|
||||
if self.is_data_table(table, **kwargs):
|
||||
table_data = self.extract_table_data(table)
|
||||
media["tables"].append(table_data)
|
||||
|
||||
# Handle only_text option
|
||||
if kwargs.get("only_text", False):
|
||||
for tag in ONLY_TEXT_ELIGIBLE_TAGS:
|
||||
@@ -1324,7 +1630,8 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
|
||||
# Generate output HTML
|
||||
cleaned_html = lhtml.tostring(
|
||||
body,
|
||||
# body,
|
||||
content_element,
|
||||
encoding="unicode",
|
||||
pretty_print=True,
|
||||
method="html",
|
||||
@@ -1369,7 +1676,12 @@ class LXMLWebScrapingStrategy(WebScrapingStrategy):
|
||||
return {
|
||||
"cleaned_html": cleaned_html,
|
||||
"success": False,
|
||||
"media": {"images": [], "videos": [], "audios": []},
|
||||
"media": {
|
||||
"images": [],
|
||||
"videos": [],
|
||||
"audios": [],
|
||||
"tables": []
|
||||
},
|
||||
"links": {"internal": [], "external": []},
|
||||
"metadata": {},
|
||||
}
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from crawl4ai import BrowserConfig, AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.hub import BaseCrawler
|
||||
from crawl4ai.utils import optimize_html, get_home_folder
|
||||
from crawl4ai.utils import optimize_html, get_home_folder, preprocess_html_for_schema
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from pathlib import Path
|
||||
import json
|
||||
@@ -68,7 +68,8 @@ class GoogleSearchCrawler(BaseCrawler):
|
||||
home_dir = get_home_folder() if not schema_cache_path else schema_cache_path
|
||||
os.makedirs(f"{home_dir}/schema", exist_ok=True)
|
||||
|
||||
cleaned_html = optimize_html(html, threshold=100)
|
||||
# cleaned_html = optimize_html(html, threshold=100)
|
||||
cleaned_html = preprocess_html_for_schema(html)
|
||||
|
||||
organic_schema = None
|
||||
if os.path.exists(f"{home_dir}/schema/organic_schema.json"):
|
||||
|
||||
@@ -7,6 +7,7 @@ from contextvars import ContextVar
|
||||
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
|
||||
|
||||
|
||||
|
||||
class DeepCrawlDecorator:
|
||||
"""Decorator that adds deep crawling capability to arun method."""
|
||||
deep_crawl_active = ContextVar("deep_crawl_active", default=False)
|
||||
@@ -59,7 +60,8 @@ class DeepCrawlStrategy(ABC):
|
||||
start_url: str,
|
||||
crawler: AsyncWebCrawler,
|
||||
config: CrawlerRunConfig,
|
||||
) -> List[CrawlResult]:
|
||||
# ) -> List[CrawlResult]:
|
||||
) -> RunManyReturn:
|
||||
"""
|
||||
Batch (non-streaming) mode:
|
||||
Processes one BFS level at a time, then yields all the results.
|
||||
@@ -72,7 +74,8 @@ class DeepCrawlStrategy(ABC):
|
||||
start_url: str,
|
||||
crawler: AsyncWebCrawler,
|
||||
config: CrawlerRunConfig,
|
||||
) -> AsyncGenerator[CrawlResult, None]:
|
||||
# ) -> AsyncGenerator[CrawlResult, None]:
|
||||
) -> RunManyReturn:
|
||||
"""
|
||||
Streaming mode:
|
||||
Processes one BFS level at a time and yields results immediately as they arrive.
|
||||
|
||||
@@ -9,7 +9,8 @@ from ..models import TraversalStats
|
||||
from .filters import FilterChain
|
||||
from .scorers import URLScorer
|
||||
from . import DeepCrawlStrategy
|
||||
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult
|
||||
from ..types import AsyncWebCrawler, CrawlerRunConfig, CrawlResult, RunManyReturn
|
||||
from ..utils import normalize_url_for_deep_crawl, efficient_normalize_url_for_deep_crawl
|
||||
from math import inf as infinity
|
||||
|
||||
class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
@@ -99,14 +100,17 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
# First collect all valid links
|
||||
for link in links:
|
||||
url = link.get("href")
|
||||
if url in visited:
|
||||
# Strip URL fragments to avoid duplicate crawling
|
||||
# base_url = url.split('#')[0] if url else url
|
||||
base_url = normalize_url_for_deep_crawl(url, source_url)
|
||||
if base_url in visited:
|
||||
continue
|
||||
if not await self.can_process_url(url, next_depth):
|
||||
self.stats.urls_skipped += 1
|
||||
continue
|
||||
|
||||
# Score the URL if a scorer is provided
|
||||
score = self.url_scorer.score(url) if self.url_scorer else 0
|
||||
score = self.url_scorer.score(base_url) if self.url_scorer else 0
|
||||
|
||||
# Skip URLs with scores below the threshold
|
||||
if score < self.score_threshold:
|
||||
@@ -114,7 +118,7 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
self.stats.urls_skipped += 1
|
||||
continue
|
||||
|
||||
valid_links.append((url, score))
|
||||
valid_links.append((base_url, score))
|
||||
|
||||
# If we have more valid links than capacity, sort by score and take the top ones
|
||||
if len(valid_links) > remaining_capacity:
|
||||
@@ -139,7 +143,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
start_url: str,
|
||||
crawler: AsyncWebCrawler,
|
||||
config: CrawlerRunConfig,
|
||||
) -> List[CrawlResult]:
|
||||
# ) -> List[CrawlResult]:
|
||||
) -> RunManyReturn:
|
||||
"""
|
||||
Batch (non-streaming) mode:
|
||||
Processes one BFS level at a time, then yields all the results.
|
||||
@@ -187,7 +192,8 @@ class BFSDeepCrawlStrategy(DeepCrawlStrategy):
|
||||
start_url: str,
|
||||
crawler: AsyncWebCrawler,
|
||||
config: CrawlerRunConfig,
|
||||
) -> AsyncGenerator[CrawlResult, None]:
|
||||
# ) -> AsyncGenerator[CrawlResult, None]:
|
||||
) -> RunManyReturn:
|
||||
"""
|
||||
Streaming mode:
|
||||
Processes one BFS level at a time and yields results immediately as they arrive.
|
||||
|
||||
@@ -3,7 +3,7 @@ from typing import AsyncGenerator, Optional, Set, Dict, List, Tuple
|
||||
|
||||
from ..models import CrawlResult
|
||||
from .bfs_strategy import BFSDeepCrawlStrategy # noqa
|
||||
from ..types import AsyncWebCrawler, CrawlerRunConfig
|
||||
from ..types import AsyncWebCrawler, CrawlerRunConfig, RunManyReturn
|
||||
|
||||
class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
||||
"""
|
||||
@@ -17,7 +17,8 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
||||
start_url: str,
|
||||
crawler: AsyncWebCrawler,
|
||||
config: CrawlerRunConfig,
|
||||
) -> List[CrawlResult]:
|
||||
# ) -> List[CrawlResult]:
|
||||
) -> RunManyReturn:
|
||||
"""
|
||||
Batch (non-streaming) DFS mode.
|
||||
Uses a stack to traverse URLs in DFS order, aggregating CrawlResults into a list.
|
||||
@@ -65,7 +66,8 @@ class DFSDeepCrawlStrategy(BFSDeepCrawlStrategy):
|
||||
start_url: str,
|
||||
crawler: AsyncWebCrawler,
|
||||
config: CrawlerRunConfig,
|
||||
) -> AsyncGenerator[CrawlResult, None]:
|
||||
# ) -> AsyncGenerator[CrawlResult, None]:
|
||||
) -> RunManyReturn:
|
||||
"""
|
||||
Streaming DFS mode.
|
||||
Uses a stack to traverse URLs in DFS order and yields CrawlResults as they become available.
|
||||
|
||||
@@ -124,6 +124,7 @@ class URLPatternFilter(URLFilter):
|
||||
"_simple_prefixes",
|
||||
"_domain_patterns",
|
||||
"_path_patterns",
|
||||
"_reverse",
|
||||
)
|
||||
|
||||
PATTERN_TYPES = {
|
||||
@@ -138,8 +139,10 @@ class URLPatternFilter(URLFilter):
|
||||
self,
|
||||
patterns: Union[str, Pattern, List[Union[str, Pattern]]],
|
||||
use_glob: bool = True,
|
||||
reverse: bool = False,
|
||||
):
|
||||
super().__init__()
|
||||
self._reverse = reverse
|
||||
patterns = [patterns] if isinstance(patterns, (str, Pattern)) else patterns
|
||||
|
||||
self._simple_suffixes = set()
|
||||
@@ -205,36 +208,40 @@ class URLPatternFilter(URLFilter):
|
||||
|
||||
@lru_cache(maxsize=10000)
|
||||
def apply(self, url: str) -> bool:
|
||||
"""Hierarchical pattern matching"""
|
||||
# Quick suffix check (*.html)
|
||||
if self._simple_suffixes:
|
||||
path = url.split("?")[0]
|
||||
if path.split("/")[-1].split(".")[-1] in self._simple_suffixes:
|
||||
self._update_stats(True)
|
||||
return True
|
||||
result = True
|
||||
self._update_stats(result)
|
||||
return not result if self._reverse else result
|
||||
|
||||
# Domain check
|
||||
if self._domain_patterns:
|
||||
for pattern in self._domain_patterns:
|
||||
if pattern.match(url):
|
||||
self._update_stats(True)
|
||||
return True
|
||||
result = True
|
||||
self._update_stats(result)
|
||||
return not result if self._reverse else result
|
||||
|
||||
# Prefix check (/foo/*)
|
||||
if self._simple_prefixes:
|
||||
path = url.split("?")[0]
|
||||
if any(path.startswith(p) for p in self._simple_prefixes):
|
||||
self._update_stats(True)
|
||||
return True
|
||||
result = True
|
||||
self._update_stats(result)
|
||||
return not result if self._reverse else result
|
||||
|
||||
# Complex patterns
|
||||
if self._path_patterns:
|
||||
if any(p.search(url) for p in self._path_patterns):
|
||||
self._update_stats(True)
|
||||
return True
|
||||
result = True
|
||||
self._update_stats(result)
|
||||
return not result if self._reverse else result
|
||||
|
||||
self._update_stats(False)
|
||||
return False
|
||||
result = False
|
||||
self._update_stats(result)
|
||||
return not result if self._reverse else result
|
||||
|
||||
|
||||
class ContentTypeFilter(URLFilter):
|
||||
@@ -427,6 +434,11 @@ class DomainFilter(URLFilter):
|
||||
if isinstance(domains, str):
|
||||
return {domains.lower()}
|
||||
return {d.lower() for d in domains}
|
||||
|
||||
@staticmethod
|
||||
def _is_subdomain(domain: str, parent_domain: str) -> bool:
|
||||
"""Check if domain is a subdomain of parent_domain"""
|
||||
return domain == parent_domain or domain.endswith(f".{parent_domain}")
|
||||
|
||||
@staticmethod
|
||||
@lru_cache(maxsize=10000)
|
||||
@@ -444,20 +456,26 @@ class DomainFilter(URLFilter):
|
||||
|
||||
domain = self._extract_domain(url)
|
||||
|
||||
# Early return for blocked domains
|
||||
if domain in self._blocked_domains:
|
||||
self._update_stats(False)
|
||||
return False
|
||||
# Check for blocked domains, including subdomains
|
||||
for blocked in self._blocked_domains:
|
||||
if self._is_subdomain(domain, blocked):
|
||||
self._update_stats(False)
|
||||
return False
|
||||
|
||||
# If no allowed domains specified, accept all non-blocked
|
||||
if self._allowed_domains is None:
|
||||
self._update_stats(True)
|
||||
return True
|
||||
|
||||
# Final allowed domains check
|
||||
result = domain in self._allowed_domains
|
||||
self._update_stats(result)
|
||||
return result
|
||||
# Check if domain matches any allowed domain (including subdomains)
|
||||
for allowed in self._allowed_domains:
|
||||
if self._is_subdomain(domain, allowed):
|
||||
self._update_stats(True)
|
||||
return True
|
||||
|
||||
# No matches found
|
||||
self._update_stats(False)
|
||||
return False
|
||||
|
||||
|
||||
class ContentRelevanceFilter(URLFilter):
|
||||
|
||||
@@ -4,12 +4,10 @@ from typing import Any, List, Dict, Optional
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
import json
|
||||
import time
|
||||
import os
|
||||
|
||||
from .prompts import PROMPT_EXTRACT_BLOCKS, PROMPT_EXTRACT_BLOCKS_WITH_INSTRUCTION, PROMPT_EXTRACT_SCHEMA_WITH_INSTRUCTION, JSON_SCHEMA_BUILDER_XPATH
|
||||
from .config import (
|
||||
DEFAULT_PROVIDER, PROVIDER_MODELS,
|
||||
CHUNK_TOKEN_THRESHOLD,
|
||||
DEFAULT_PROVIDER, CHUNK_TOKEN_THRESHOLD,
|
||||
OVERLAP_RATE,
|
||||
WORD_TOKEN_RATE,
|
||||
)
|
||||
@@ -22,9 +20,7 @@ from .utils import (
|
||||
extract_xml_data,
|
||||
split_and_parse_json_objects,
|
||||
sanitize_input_encode,
|
||||
chunk_documents,
|
||||
merge_chunks,
|
||||
advanced_split,
|
||||
)
|
||||
from .models import * # noqa: F403
|
||||
|
||||
@@ -38,8 +34,9 @@ from .model_loader import (
|
||||
calculate_batch_size
|
||||
)
|
||||
|
||||
from .types import LLMConfig, create_llm_config
|
||||
|
||||
from functools import partial
|
||||
import math
|
||||
import numpy as np
|
||||
import re
|
||||
from bs4 import BeautifulSoup
|
||||
@@ -481,8 +478,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
A strategy that uses an LLM to extract meaningful content from the HTML.
|
||||
|
||||
Attributes:
|
||||
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
|
||||
api_token: The API token for the provider.
|
||||
llm_config: The LLM configuration object.
|
||||
instruction: The instruction to use for the LLM model.
|
||||
schema: Pydantic model schema for structured data.
|
||||
extraction_type: "block" or "schema".
|
||||
@@ -490,27 +486,20 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
overlap_rate: Overlap between chunks.
|
||||
word_token_rate: Word to token conversion rate.
|
||||
apply_chunking: Whether to apply chunking.
|
||||
base_url: The base URL for the API request.
|
||||
api_base: The base URL for the API request.
|
||||
extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
|
||||
verbose: Whether to print verbose output.
|
||||
usages: List of individual token usages.
|
||||
total_usage: Accumulated token usage.
|
||||
"""
|
||||
_UNWANTED_PROPS = {
|
||||
'provider' : 'Instead, use llmConfig=LlmConfig(provider="...")',
|
||||
'api_token' : 'Instead, use llmConfig=LlMConfig(api_token="...")',
|
||||
'base_url' : 'Instead, use llmConfig=LlmConfig(base_url="...")',
|
||||
'api_base' : 'Instead, use llmConfig=LlmConfig(base_url="...")',
|
||||
'provider' : 'Instead, use llm_config=LLMConfig(provider="...")',
|
||||
'api_token' : 'Instead, use llm_config=LlMConfig(api_token="...")',
|
||||
'base_url' : 'Instead, use llm_config=LLMConfig(base_url="...")',
|
||||
'api_base' : 'Instead, use llm_config=LLMConfig(base_url="...")',
|
||||
}
|
||||
def __init__(
|
||||
self,
|
||||
llmConfig: 'LLMConfig' = None,
|
||||
llm_config: 'LLMConfig' = None,
|
||||
instruction: str = None,
|
||||
provider: str = DEFAULT_PROVIDER,
|
||||
api_token: Optional[str] = None,
|
||||
base_url: str = None,
|
||||
api_base: str = None,
|
||||
schema: Dict = None,
|
||||
extraction_type="block",
|
||||
chunk_token_threshold=CHUNK_TOKEN_THRESHOLD,
|
||||
@@ -519,15 +508,18 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
apply_chunking=True,
|
||||
input_format: str = "markdown",
|
||||
verbose=False,
|
||||
# Deprecated arguments
|
||||
provider: str = DEFAULT_PROVIDER,
|
||||
api_token: Optional[str] = None,
|
||||
base_url: str = None,
|
||||
api_base: str = None,
|
||||
**kwargs,
|
||||
):
|
||||
"""
|
||||
Initialize the strategy with clustering parameters.
|
||||
|
||||
Args:
|
||||
llmConfig: The LLM configuration object.
|
||||
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
|
||||
api_token: The API token for the provider.
|
||||
llm_config: The LLM configuration object.
|
||||
instruction: The instruction to use for the LLM model.
|
||||
schema: Pydantic model schema for structured data.
|
||||
extraction_type: "block" or "schema".
|
||||
@@ -535,20 +527,19 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
overlap_rate: Overlap between chunks.
|
||||
word_token_rate: Word to token conversion rate.
|
||||
apply_chunking: Whether to apply chunking.
|
||||
base_url: The base URL for the API request.
|
||||
api_base: The base URL for the API request.
|
||||
extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
|
||||
verbose: Whether to print verbose output.
|
||||
usages: List of individual token usages.
|
||||
total_usage: Accumulated token usage.
|
||||
|
||||
# Deprecated arguments, will be removed very soon
|
||||
provider: The provider to use for extraction. It follows the format <provider_name>/<model_name>, e.g., "ollama/llama3.3".
|
||||
api_token: The API token for the provider.
|
||||
base_url: The base URL for the API request.
|
||||
api_base: The base URL for the API request.
|
||||
extra_args: Additional arguments for the API request, such as temprature, max_tokens, etc.
|
||||
"""
|
||||
super().__init__( input_format=input_format, **kwargs)
|
||||
self.llmConfig = llmConfig
|
||||
self.provider = provider
|
||||
self.api_token = api_token
|
||||
self.base_url = base_url
|
||||
self.api_base = api_base
|
||||
self.llm_config = llm_config
|
||||
self.instruction = instruction
|
||||
self.extract_type = extraction_type
|
||||
self.schema = schema
|
||||
@@ -565,6 +556,11 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
self.usages = [] # Store individual usages
|
||||
self.total_usage = TokenUsage() # Accumulated usage
|
||||
|
||||
self.provider = provider
|
||||
self.api_token = api_token
|
||||
self.base_url = base_url
|
||||
self.api_base = api_base
|
||||
|
||||
|
||||
def __setattr__(self, name, value):
|
||||
"""Handle attribute setting."""
|
||||
@@ -618,10 +614,10 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
)
|
||||
|
||||
response = perform_completion_with_backoff(
|
||||
self.llmConfig.provider,
|
||||
self.llm_config.provider,
|
||||
prompt_with_variables,
|
||||
self.llmConfig.api_token,
|
||||
base_url=self.llmConfig.base_url,
|
||||
self.llm_config.api_token,
|
||||
base_url=self.llm_config.base_url,
|
||||
extra_args=self.extra_args,
|
||||
) # , json_response=self.extract_type == "schema")
|
||||
# Track usage
|
||||
@@ -701,7 +697,7 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
overlap=int(self.chunk_token_threshold * self.overlap_rate),
|
||||
)
|
||||
extracted_content = []
|
||||
if self.llmConfig.provider.startswith("groq/"):
|
||||
if self.llm_config.provider.startswith("groq/"):
|
||||
# Sequential processing with a delay
|
||||
for ix, section in enumerate(merged_sections):
|
||||
extract_func = partial(self.extract, url)
|
||||
@@ -761,8 +757,6 @@ class LLMExtractionStrategy(ExtractionStrategy):
|
||||
#######################################################
|
||||
# New extraction strategies for JSON-based extraction #
|
||||
#######################################################
|
||||
|
||||
|
||||
class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
"""
|
||||
Abstract base class for extracting structured JSON from HTML content.
|
||||
@@ -1043,8 +1037,8 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
pass
|
||||
|
||||
_GENERATE_SCHEMA_UNWANTED_PROPS = {
|
||||
'provider': 'Instead, use llmConfig=LlmConfig(provider="...")',
|
||||
'api_token': 'Instead, use llmConfig=LlMConfig(api_token="...")',
|
||||
'provider': 'Instead, use llm_config=LLMConfig(provider="...")',
|
||||
'api_token': 'Instead, use llm_config=LlMConfig(api_token="...")',
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
@@ -1053,7 +1047,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
schema_type: str = "CSS", # or XPATH
|
||||
query: str = None,
|
||||
target_json_example: str = None,
|
||||
llmConfig: 'LLMConfig' = None,
|
||||
llm_config: 'LLMConfig' = create_llm_config(),
|
||||
provider: str = None,
|
||||
api_token: str = None,
|
||||
**kwargs
|
||||
@@ -1066,9 +1060,9 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
query (str, optional): Natural language description of what data to extract
|
||||
provider (str): Legacy Parameter. LLM provider to use
|
||||
api_token (str): Legacy Parameter. API token for LLM provider
|
||||
llmConfig (LlmConfig): LLM configuration object
|
||||
llm_config (LLMConfig): LLM configuration object
|
||||
prompt (str, optional): Custom prompt template to use
|
||||
**kwargs: Additional args passed to perform_completion_with_backoff
|
||||
**kwargs: Additional args passed to LLM processor
|
||||
|
||||
Returns:
|
||||
dict: Generated schema following the JsonElementExtractionStrategy format
|
||||
@@ -1085,7 +1079,7 @@ class JsonElementExtractionStrategy(ExtractionStrategy):
|
||||
# Build the prompt
|
||||
system_message = {
|
||||
"role": "system",
|
||||
"content": f"""You specialize in generating special JSON schemas for web scraping. This schema uses CSS or XPATH selectors to present a repetitive pattern in crawled HTML, such as a product in a product list or a search result item in a list of search results. You use this JSON schema to pass to a language model along with the HTML content to extract structured data from the HTML. The language model uses the JSON schema to extract data from the HTML and retrieve values for fields in the JSON schema, following the schema.
|
||||
"content": f"""You specialize in generating special JSON schemas for web scraping. This schema uses CSS or XPATH selectors to present a repetitive pattern in crawled HTML, such as a product in a product list or a search result item in a list of search results. We use this JSON schema to pass to a language model along with the HTML content to extract structured data from the HTML. The language model uses the JSON schema to extract data from the HTML and retrieve values for fields in the JSON schema, following the schema.
|
||||
|
||||
Generating this HTML manually is not feasible, so you need to generate the JSON schema using the HTML content. The HTML copied from the crawled website is provided below, which we believe contains the repetitive pattern.
|
||||
|
||||
@@ -1099,9 +1093,10 @@ Generating this HTML manually is not feasible, so you need to generate the JSON
|
||||
In this context, the following items may or may not be present:
|
||||
- Example of target JSON object: This is a sample of the final JSON object that we hope to extract from the HTML using the schema you are generating.
|
||||
- Extra Instructions: This is optional instructions to consider when generating the schema provided by the user.
|
||||
- Query or explanation of target/goal data item: This is a description of what data we are trying to extract from the HTML. This explanation means we're not sure about the rigid schema of the structures we want, so we leave it to you to use your expertise to create the best and most comprehensive structures aimed at maximizing data extraction from this page. You must ensure that you do not pick up nuances that may exist on a particular page. The focus should be on the data we are extracting, and it must be valid, safe, and robust based on the given HTML.
|
||||
|
||||
# What if there is no example of target JSON object?
|
||||
In this scenario, use your best judgment to generate the schema. Try to maximize the number of fields that you can extract from the HTML.
|
||||
# What if there is no example of target JSON object and also no extra instructions or even no explanation of target/goal data item?
|
||||
In this scenario, use your best judgment to generate the schema. You need to examine the content of the page and understand the data it provides. If the page contains repetitive data, such as lists of items, products, jobs, places, books, or movies, focus on one single item that repeats. If the page is a detailed page about one product or item, create a schema to extract the entire structured data. At this stage, you must think and decide for yourself. Try to maximize the number of fields that you can extract from the HTML.
|
||||
|
||||
# What are the instructions and details for this schema generation?
|
||||
{prompt_template}"""
|
||||
@@ -1118,11 +1113,18 @@ In this scenario, use your best judgment to generate the schema. Try to maximize
|
||||
}
|
||||
|
||||
if query:
|
||||
user_message["content"] += f"\n\nImportant Notes to Consider:\n{query}"
|
||||
user_message["content"] += f"\n\n## Query or explanation of target/goal data item:\n{query}"
|
||||
if target_json_example:
|
||||
user_message["content"] += f"\n\nExample of target JSON object:\n{target_json_example}"
|
||||
user_message["content"] += f"\n\n## Example of target JSON object:\n```json\n{target_json_example}\n```"
|
||||
|
||||
if query and not target_json_example:
|
||||
user_message["content"] += """IMPORTANT: To remind you, in this process, we are not providing a rigid example of the adjacent objects we seek. We rely on your understanding of the explanation provided in the above section. Make sure to grasp what we are looking for and, based on that, create the best schema.."""
|
||||
elif not query and target_json_example:
|
||||
user_message["content"] += """IMPORTANT: Please remember that in this process, we provided a proper example of a target JSON object. Make sure to adhere to the structure and create a schema that exactly fits this example. If you find that some elements on the page do not match completely, vote for the majority."""
|
||||
elif not query and not target_json_example:
|
||||
user_message["content"] += """IMPORTANT: Since we neither have a query nor an example, it is crucial to rely solely on the HTML content provided. Leverage your expertise to determine the schema based on the repetitive patterns observed in the content."""
|
||||
|
||||
user_message["content"] += """IMPORTANT: Ensure your schema is reliable, meaning do not use selectors that seem to generate dynamically and are not reliable. A reliable schema is what you want, as it consistently returns the same data even after many reloads of the page.
|
||||
user_message["content"] += """IMPORTANT: Ensure your schema remains reliable by avoiding selectors that appear to generate dynamically and are not dependable. You want a reliable schema, as it consistently returns the same data even after many page reloads.
|
||||
|
||||
Analyze the HTML and generate a JSON schema that follows the specified format. Only output valid JSON schema, nothing else.
|
||||
"""
|
||||
@@ -1130,11 +1132,12 @@ In this scenario, use your best judgment to generate the schema. Try to maximize
|
||||
try:
|
||||
# Call LLM with backoff handling
|
||||
response = perform_completion_with_backoff(
|
||||
provider=llmConfig.provider,
|
||||
provider=llm_config.provider,
|
||||
prompt_with_variables="\n\n".join([system_message["content"], user_message["content"]]),
|
||||
json_response = True,
|
||||
api_token=llmConfig.api_token,
|
||||
**kwargs
|
||||
api_token=llm_config.api_token,
|
||||
base_url=llm_config.base_url,
|
||||
extra_args=kwargs
|
||||
)
|
||||
|
||||
# Extract and return schema
|
||||
@@ -1143,7 +1146,6 @@ In this scenario, use your best judgment to generate the schema. Try to maximize
|
||||
except Exception as e:
|
||||
raise Exception(f"Failed to generate schema: {str(e)}")
|
||||
|
||||
|
||||
class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
|
||||
"""
|
||||
Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors.
|
||||
@@ -1171,7 +1173,8 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
|
||||
super().__init__(schema, **kwargs)
|
||||
|
||||
def _parse_html(self, html_content: str):
|
||||
return BeautifulSoup(html_content, "html.parser")
|
||||
# return BeautifulSoup(html_content, "html.parser")
|
||||
return BeautifulSoup(html_content, "lxml")
|
||||
|
||||
def _get_base_elements(self, parsed_html, selector: str):
|
||||
return parsed_html.select(selector)
|
||||
@@ -1190,6 +1193,373 @@ class JsonCssExtractionStrategy(JsonElementExtractionStrategy):
|
||||
def _get_element_attribute(self, element, attribute: str):
|
||||
return element.get(attribute)
|
||||
|
||||
class JsonLxmlExtractionStrategy(JsonElementExtractionStrategy):
|
||||
def __init__(self, schema: Dict[str, Any], **kwargs):
|
||||
kwargs["input_format"] = "html"
|
||||
super().__init__(schema, **kwargs)
|
||||
self._selector_cache = {}
|
||||
self._xpath_cache = {}
|
||||
self._result_cache = {}
|
||||
|
||||
# Control selector optimization strategy
|
||||
self.use_caching = kwargs.get("use_caching", True)
|
||||
self.optimize_common_patterns = kwargs.get("optimize_common_patterns", True)
|
||||
|
||||
# Load lxml dependencies once
|
||||
from lxml import etree, html
|
||||
from lxml.cssselect import CSSSelector
|
||||
self.etree = etree
|
||||
self.html_parser = html
|
||||
self.CSSSelector = CSSSelector
|
||||
|
||||
def _parse_html(self, html_content: str):
|
||||
"""Parse HTML content with error recovery"""
|
||||
try:
|
||||
parser = self.etree.HTMLParser(recover=True, remove_blank_text=True)
|
||||
return self.etree.fromstring(html_content, parser)
|
||||
except Exception as e:
|
||||
if self.verbose:
|
||||
print(f"Error parsing HTML, falling back to alternative method: {e}")
|
||||
try:
|
||||
return self.html_parser.fromstring(html_content)
|
||||
except Exception as e2:
|
||||
if self.verbose:
|
||||
print(f"Critical error parsing HTML: {e2}")
|
||||
# Create minimal document as fallback
|
||||
return self.etree.Element("html")
|
||||
|
||||
def _optimize_selector(self, selector_str):
|
||||
"""Optimize common selector patterns for better performance"""
|
||||
if not self.optimize_common_patterns:
|
||||
return selector_str
|
||||
|
||||
# Handle td:nth-child(N) pattern which is very common in table scraping
|
||||
import re
|
||||
if re.search(r'td:nth-child\(\d+\)', selector_str):
|
||||
return selector_str # Already handled specially in _apply_selector
|
||||
|
||||
# Split complex selectors into parts for optimization
|
||||
parts = selector_str.split()
|
||||
if len(parts) <= 1:
|
||||
return selector_str
|
||||
|
||||
# For very long selectors, consider using just the last specific part
|
||||
if len(parts) > 3 and any(p.startswith('.') or p.startswith('#') for p in parts):
|
||||
specific_parts = [p for p in parts if p.startswith('.') or p.startswith('#')]
|
||||
if specific_parts:
|
||||
return specific_parts[-1] # Use most specific class/id selector
|
||||
|
||||
return selector_str
|
||||
|
||||
def _create_selector_function(self, selector_str):
|
||||
"""Create a selector function that handles all edge cases"""
|
||||
original_selector = selector_str
|
||||
|
||||
# Try to optimize the selector if appropriate
|
||||
if self.optimize_common_patterns:
|
||||
selector_str = self._optimize_selector(selector_str)
|
||||
|
||||
try:
|
||||
# Attempt to compile the CSS selector
|
||||
compiled = self.CSSSelector(selector_str)
|
||||
xpath = compiled.path
|
||||
|
||||
# Store XPath for later use
|
||||
self._xpath_cache[selector_str] = xpath
|
||||
|
||||
# Create the wrapper function that implements the selection strategy
|
||||
def selector_func(element, context_sensitive=True):
|
||||
cache_key = None
|
||||
|
||||
# Use result caching if enabled
|
||||
if self.use_caching:
|
||||
# Create a cache key based on element and selector
|
||||
element_id = element.get('id', '') or str(hash(element))
|
||||
cache_key = f"{element_id}::{selector_str}"
|
||||
|
||||
if cache_key in self._result_cache:
|
||||
return self._result_cache[cache_key]
|
||||
|
||||
results = []
|
||||
try:
|
||||
# Strategy 1: Direct CSS selector application (fastest)
|
||||
results = compiled(element)
|
||||
|
||||
# If that fails and we need context sensitivity
|
||||
if not results and context_sensitive:
|
||||
# Strategy 2: Try XPath with context adjustment
|
||||
context_xpath = self._make_context_sensitive_xpath(xpath, element)
|
||||
if context_xpath:
|
||||
results = element.xpath(context_xpath)
|
||||
|
||||
# Strategy 3: Handle special case - nth-child
|
||||
if not results and 'nth-child' in original_selector:
|
||||
results = self._handle_nth_child_selector(element, original_selector)
|
||||
|
||||
# Strategy 4: Direct descendant search for class/ID selectors
|
||||
if not results:
|
||||
results = self._fallback_class_id_search(element, original_selector)
|
||||
|
||||
# Strategy 5: Last resort - tag name search for the final part
|
||||
if not results:
|
||||
parts = original_selector.split()
|
||||
if parts:
|
||||
last_part = parts[-1]
|
||||
# Extract tag name from the selector
|
||||
tag_match = re.match(r'^(\w+)', last_part)
|
||||
if tag_match:
|
||||
tag_name = tag_match.group(1)
|
||||
results = element.xpath(f".//{tag_name}")
|
||||
|
||||
# Cache results if caching is enabled
|
||||
if self.use_caching and cache_key:
|
||||
self._result_cache[cache_key] = results
|
||||
|
||||
except Exception as e:
|
||||
if self.verbose:
|
||||
print(f"Error applying selector '{selector_str}': {e}")
|
||||
|
||||
return results
|
||||
|
||||
return selector_func
|
||||
|
||||
except Exception as e:
|
||||
if self.verbose:
|
||||
print(f"Error compiling selector '{selector_str}': {e}")
|
||||
|
||||
# Fallback function for invalid selectors
|
||||
return lambda element, context_sensitive=True: []
|
||||
|
||||
def _make_context_sensitive_xpath(self, xpath, element):
|
||||
"""Convert absolute XPath to context-sensitive XPath"""
|
||||
try:
|
||||
# If starts with descendant-or-self, it's already context-sensitive
|
||||
if xpath.startswith('descendant-or-self::'):
|
||||
return xpath
|
||||
|
||||
# Remove leading slash if present
|
||||
if xpath.startswith('/'):
|
||||
context_xpath = f".{xpath}"
|
||||
else:
|
||||
context_xpath = f".//{xpath}"
|
||||
|
||||
# Validate the XPath by trying it
|
||||
try:
|
||||
element.xpath(context_xpath)
|
||||
return context_xpath
|
||||
except:
|
||||
# If that fails, try a simpler descendant search
|
||||
return f".//{xpath.split('/')[-1]}"
|
||||
except:
|
||||
return None
|
||||
|
||||
def _handle_nth_child_selector(self, element, selector_str):
|
||||
"""Special handling for nth-child selectors in tables"""
|
||||
import re
|
||||
results = []
|
||||
|
||||
try:
|
||||
# Extract the column number from td:nth-child(N)
|
||||
match = re.search(r'td:nth-child\((\d+)\)', selector_str)
|
||||
if match:
|
||||
col_num = match.group(1)
|
||||
|
||||
# Check if there's content after the nth-child part
|
||||
remaining_selector = selector_str.split(f"td:nth-child({col_num})", 1)[-1].strip()
|
||||
|
||||
if remaining_selector:
|
||||
# If there's a specific element we're looking for after the column
|
||||
# Extract any tag names from the remaining selector
|
||||
tag_match = re.search(r'(\w+)', remaining_selector)
|
||||
tag_name = tag_match.group(1) if tag_match else '*'
|
||||
results = element.xpath(f".//td[{col_num}]//{tag_name}")
|
||||
else:
|
||||
# Just get the column cell
|
||||
results = element.xpath(f".//td[{col_num}]")
|
||||
except Exception as e:
|
||||
if self.verbose:
|
||||
print(f"Error handling nth-child selector: {e}")
|
||||
|
||||
return results
|
||||
|
||||
def _fallback_class_id_search(self, element, selector_str):
|
||||
"""Fallback to search by class or ID"""
|
||||
results = []
|
||||
|
||||
try:
|
||||
# Extract class selectors (.classname)
|
||||
import re
|
||||
class_matches = re.findall(r'\.([a-zA-Z0-9_-]+)', selector_str)
|
||||
|
||||
# Extract ID selectors (#idname)
|
||||
id_matches = re.findall(r'#([a-zA-Z0-9_-]+)', selector_str)
|
||||
|
||||
# Try each class
|
||||
for class_name in class_matches:
|
||||
class_results = element.xpath(f".//*[contains(@class, '{class_name}')]")
|
||||
results.extend(class_results)
|
||||
|
||||
# Try each ID (usually more specific)
|
||||
for id_name in id_matches:
|
||||
id_results = element.xpath(f".//*[@id='{id_name}']")
|
||||
results.extend(id_results)
|
||||
except Exception as e:
|
||||
if self.verbose:
|
||||
print(f"Error in fallback class/id search: {e}")
|
||||
|
||||
return results
|
||||
|
||||
def _get_selector(self, selector_str):
|
||||
"""Get or create a selector function with caching"""
|
||||
if selector_str not in self._selector_cache:
|
||||
self._selector_cache[selector_str] = self._create_selector_function(selector_str)
|
||||
return self._selector_cache[selector_str]
|
||||
|
||||
def _get_base_elements(self, parsed_html, selector: str):
|
||||
"""Get all base elements using the selector"""
|
||||
selector_func = self._get_selector(selector)
|
||||
# For base elements, we don't need context sensitivity
|
||||
return selector_func(parsed_html, context_sensitive=False)
|
||||
|
||||
def _get_elements(self, element, selector: str):
|
||||
"""Get child elements using the selector with context sensitivity"""
|
||||
selector_func = self._get_selector(selector)
|
||||
return selector_func(element, context_sensitive=True)
|
||||
|
||||
def _get_element_text(self, element) -> str:
|
||||
"""Extract normalized text from element"""
|
||||
try:
|
||||
# Get all text nodes and normalize
|
||||
text = " ".join(t.strip() for t in element.xpath(".//text()") if t.strip())
|
||||
return text
|
||||
except Exception as e:
|
||||
if self.verbose:
|
||||
print(f"Error extracting text: {e}")
|
||||
# Fallback
|
||||
try:
|
||||
return element.text_content().strip()
|
||||
except:
|
||||
return ""
|
||||
|
||||
def _get_element_html(self, element) -> str:
|
||||
"""Get HTML string representation of element"""
|
||||
try:
|
||||
return self.etree.tostring(element, encoding='unicode', method='html')
|
||||
except Exception as e:
|
||||
if self.verbose:
|
||||
print(f"Error serializing HTML: {e}")
|
||||
return ""
|
||||
|
||||
def _get_element_attribute(self, element, attribute: str):
|
||||
"""Get attribute value safely"""
|
||||
try:
|
||||
return element.get(attribute)
|
||||
except Exception as e:
|
||||
if self.verbose:
|
||||
print(f"Error getting attribute '{attribute}': {e}")
|
||||
return None
|
||||
|
||||
def _clear_caches(self):
|
||||
"""Clear caches to free memory"""
|
||||
if self.use_caching:
|
||||
self._result_cache.clear()
|
||||
|
||||
class JsonLxmlExtractionStrategy_naive(JsonElementExtractionStrategy):
|
||||
def __init__(self, schema: Dict[str, Any], **kwargs):
|
||||
kwargs["input_format"] = "html" # Force HTML input
|
||||
super().__init__(schema, **kwargs)
|
||||
self._selector_cache = {}
|
||||
|
||||
def _parse_html(self, html_content: str):
|
||||
from lxml import etree
|
||||
parser = etree.HTMLParser(recover=True)
|
||||
return etree.fromstring(html_content, parser)
|
||||
|
||||
def _get_selector(self, selector_str):
|
||||
"""Get a selector function that works within the context of an element"""
|
||||
if selector_str not in self._selector_cache:
|
||||
from lxml.cssselect import CSSSelector
|
||||
try:
|
||||
# Store both the compiled selector and its xpath translation
|
||||
compiled = CSSSelector(selector_str)
|
||||
|
||||
# Create a function that will apply this selector appropriately
|
||||
def select_func(element):
|
||||
try:
|
||||
# First attempt: direct CSS selector application
|
||||
results = compiled(element)
|
||||
if results:
|
||||
return results
|
||||
|
||||
# Second attempt: contextual XPath selection
|
||||
# Convert the root-based XPath to a context-based XPath
|
||||
xpath = compiled.path
|
||||
|
||||
# If the XPath already starts with descendant-or-self, handle it specially
|
||||
if xpath.startswith('descendant-or-self::'):
|
||||
context_xpath = xpath
|
||||
else:
|
||||
# For normal XPath expressions, make them relative to current context
|
||||
context_xpath = f"./{xpath.lstrip('/')}"
|
||||
|
||||
results = element.xpath(context_xpath)
|
||||
if results:
|
||||
return results
|
||||
|
||||
# Final fallback: simple descendant search for common patterns
|
||||
if 'nth-child' in selector_str:
|
||||
# Handle td:nth-child(N) pattern
|
||||
import re
|
||||
match = re.search(r'td:nth-child\((\d+)\)', selector_str)
|
||||
if match:
|
||||
col_num = match.group(1)
|
||||
sub_selector = selector_str.split(')', 1)[-1].strip()
|
||||
if sub_selector:
|
||||
return element.xpath(f".//td[{col_num}]//{sub_selector}")
|
||||
else:
|
||||
return element.xpath(f".//td[{col_num}]")
|
||||
|
||||
# Last resort: try each part of the selector separately
|
||||
parts = selector_str.split()
|
||||
if len(parts) > 1 and parts[-1]:
|
||||
return element.xpath(f".//{parts[-1]}")
|
||||
|
||||
return []
|
||||
except Exception as e:
|
||||
if self.verbose:
|
||||
print(f"Error applying selector '{selector_str}': {e}")
|
||||
return []
|
||||
|
||||
self._selector_cache[selector_str] = select_func
|
||||
except Exception as e:
|
||||
if self.verbose:
|
||||
print(f"Error compiling selector '{selector_str}': {e}")
|
||||
|
||||
# Fallback function for invalid selectors
|
||||
def fallback_func(element):
|
||||
return []
|
||||
|
||||
self._selector_cache[selector_str] = fallback_func
|
||||
|
||||
return self._selector_cache[selector_str]
|
||||
|
||||
def _get_base_elements(self, parsed_html, selector: str):
|
||||
selector_func = self._get_selector(selector)
|
||||
return selector_func(parsed_html)
|
||||
|
||||
def _get_elements(self, element, selector: str):
|
||||
selector_func = self._get_selector(selector)
|
||||
return selector_func(element)
|
||||
|
||||
def _get_element_text(self, element) -> str:
|
||||
return "".join(element.xpath(".//text()")).strip()
|
||||
|
||||
def _get_element_html(self, element) -> str:
|
||||
from lxml import etree
|
||||
return etree.tostring(element, encoding='unicode')
|
||||
|
||||
def _get_element_attribute(self, element, attribute: str):
|
||||
return element.get(attribute)
|
||||
|
||||
class JsonXPathExtractionStrategy(JsonElementExtractionStrategy):
|
||||
"""
|
||||
|
||||
@@ -45,7 +45,34 @@ def post_install():
|
||||
setup_home_directory()
|
||||
install_playwright()
|
||||
run_migration()
|
||||
setup_builtin_browser()
|
||||
logger.success("Post-installation setup completed!", tag="COMPLETE")
|
||||
|
||||
def setup_builtin_browser():
|
||||
"""Set up a builtin browser for use with Crawl4AI"""
|
||||
try:
|
||||
logger.info("Setting up builtin browser...", tag="INIT")
|
||||
asyncio.run(_setup_builtin_browser())
|
||||
logger.success("Builtin browser setup completed!", tag="COMPLETE")
|
||||
except Exception as e:
|
||||
logger.warning(f"Failed to set up builtin browser: {e}")
|
||||
logger.warning("You can manually set up a builtin browser using 'crawl4ai-doctor builtin-browser-start'")
|
||||
|
||||
async def _setup_builtin_browser():
|
||||
try:
|
||||
# Import BrowserProfiler here to avoid circular imports
|
||||
from .browser_profiler import BrowserProfiler
|
||||
profiler = BrowserProfiler(logger=logger)
|
||||
|
||||
# Launch the builtin browser
|
||||
cdp_url = await profiler.launch_builtin_browser(headless=True)
|
||||
if cdp_url:
|
||||
logger.success(f"Builtin browser launched at {cdp_url}", tag="BROWSER")
|
||||
else:
|
||||
logger.warning("Failed to launch builtin browser", tag="BROWSER")
|
||||
except Exception as e:
|
||||
logger.warning(f"Error setting up builtin browser: {e}", tag="BROWSER")
|
||||
raise
|
||||
|
||||
|
||||
def install_playwright():
|
||||
|
||||
@@ -1,8 +1,8 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from tabnanny import verbose
|
||||
from typing import Optional, Dict, Any, Tuple
|
||||
from .models import MarkdownGenerationResult
|
||||
from .html2text import CustomHTML2Text
|
||||
# from .types import RelevantContentFilter
|
||||
from .content_filter_strategy import RelevantContentFilter
|
||||
import re
|
||||
from urllib.parse import urljoin
|
||||
|
||||
@@ -1,4 +1,3 @@
|
||||
from re import U
|
||||
from pydantic import BaseModel, HttpUrl, PrivateAttr
|
||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
||||
from enum import Enum
|
||||
@@ -28,6 +27,12 @@ class CrawlerTaskResult:
|
||||
start_time: Union[datetime, float]
|
||||
end_time: Union[datetime, float]
|
||||
error_message: str = ""
|
||||
retry_count: int = 0
|
||||
wait_time: float = 0.0
|
||||
|
||||
@property
|
||||
def success(self) -> bool:
|
||||
return self.result.success
|
||||
|
||||
|
||||
class CrawlStatus(Enum):
|
||||
@@ -37,23 +42,57 @@ class CrawlStatus(Enum):
|
||||
FAILED = "FAILED"
|
||||
|
||||
|
||||
# @dataclass
|
||||
# class CrawlStats:
|
||||
# task_id: str
|
||||
# url: str
|
||||
# status: CrawlStatus
|
||||
# start_time: Optional[datetime] = None
|
||||
# end_time: Optional[datetime] = None
|
||||
# memory_usage: float = 0.0
|
||||
# peak_memory: float = 0.0
|
||||
# error_message: str = ""
|
||||
|
||||
# @property
|
||||
# def duration(self) -> str:
|
||||
# if not self.start_time:
|
||||
# return "0:00"
|
||||
# end = self.end_time or datetime.now()
|
||||
# duration = end - self.start_time
|
||||
# return str(timedelta(seconds=int(duration.total_seconds())))
|
||||
|
||||
|
||||
@dataclass
|
||||
class CrawlStats:
|
||||
task_id: str
|
||||
url: str
|
||||
status: CrawlStatus
|
||||
start_time: Optional[datetime] = None
|
||||
end_time: Optional[datetime] = None
|
||||
start_time: Optional[Union[datetime, float]] = None
|
||||
end_time: Optional[Union[datetime, float]] = None
|
||||
memory_usage: float = 0.0
|
||||
peak_memory: float = 0.0
|
||||
error_message: str = ""
|
||||
wait_time: float = 0.0
|
||||
retry_count: int = 0
|
||||
counted_requeue: bool = False
|
||||
|
||||
@property
|
||||
def duration(self) -> str:
|
||||
if not self.start_time:
|
||||
return "0:00"
|
||||
|
||||
# Convert start_time to datetime if it's a float
|
||||
start = self.start_time
|
||||
if isinstance(start, float):
|
||||
start = datetime.fromtimestamp(start)
|
||||
|
||||
# Get end time or use current time
|
||||
end = self.end_time or datetime.now()
|
||||
duration = end - self.start_time
|
||||
# Convert end_time to datetime if it's a float
|
||||
if isinstance(end, float):
|
||||
end = datetime.fromtimestamp(end)
|
||||
|
||||
duration = end - start
|
||||
return str(timedelta(seconds=int(duration.total_seconds())))
|
||||
|
||||
|
||||
@@ -149,7 +188,11 @@ class CrawlResult(BaseModel):
|
||||
markdown_result = data.pop('markdown', None)
|
||||
super().__init__(**data)
|
||||
if markdown_result is not None:
|
||||
self._markdown = markdown_result
|
||||
self._markdown = (
|
||||
MarkdownGenerationResult(**markdown_result)
|
||||
if isinstance(markdown_result, dict)
|
||||
else markdown_result
|
||||
)
|
||||
|
||||
@property
|
||||
def markdown(self):
|
||||
@@ -292,6 +335,7 @@ class Media(BaseModel):
|
||||
audios: List[
|
||||
MediaItem
|
||||
] = [] # Using MediaItem model for now, can be extended with Audio model if needed
|
||||
tables: List[Dict] = [] # Table data extracted from HTML tables
|
||||
|
||||
|
||||
class Links(BaseModel):
|
||||
|
||||
@@ -1,8 +1,119 @@
|
||||
from typing import List, Dict, Optional
|
||||
from abc import ABC, abstractmethod
|
||||
from itertools import cycle
|
||||
import os
|
||||
|
||||
|
||||
class ProxyConfig:
|
||||
def __init__(
|
||||
self,
|
||||
server: str,
|
||||
username: Optional[str] = None,
|
||||
password: Optional[str] = None,
|
||||
ip: Optional[str] = None,
|
||||
):
|
||||
"""Configuration class for a single proxy.
|
||||
|
||||
Args:
|
||||
server: Proxy server URL (e.g., "http://127.0.0.1:8080")
|
||||
username: Optional username for proxy authentication
|
||||
password: Optional password for proxy authentication
|
||||
ip: Optional IP address for verification purposes
|
||||
"""
|
||||
self.server = server
|
||||
self.username = username
|
||||
self.password = password
|
||||
|
||||
# Extract IP from server if not explicitly provided
|
||||
self.ip = ip or self._extract_ip_from_server()
|
||||
|
||||
def _extract_ip_from_server(self) -> Optional[str]:
|
||||
"""Extract IP address from server URL."""
|
||||
try:
|
||||
# Simple extraction assuming http://ip:port format
|
||||
if "://" in self.server:
|
||||
parts = self.server.split("://")[1].split(":")
|
||||
return parts[0]
|
||||
else:
|
||||
parts = self.server.split(":")
|
||||
return parts[0]
|
||||
except Exception:
|
||||
return None
|
||||
|
||||
@staticmethod
|
||||
def from_string(proxy_str: str) -> "ProxyConfig":
|
||||
"""Create a ProxyConfig from a string in the format 'ip:port:username:password'."""
|
||||
parts = proxy_str.split(":")
|
||||
if len(parts) == 4: # ip:port:username:password
|
||||
ip, port, username, password = parts
|
||||
return ProxyConfig(
|
||||
server=f"http://{ip}:{port}",
|
||||
username=username,
|
||||
password=password,
|
||||
ip=ip
|
||||
)
|
||||
elif len(parts) == 2: # ip:port only
|
||||
ip, port = parts
|
||||
return ProxyConfig(
|
||||
server=f"http://{ip}:{port}",
|
||||
ip=ip
|
||||
)
|
||||
else:
|
||||
raise ValueError(f"Invalid proxy string format: {proxy_str}")
|
||||
|
||||
@staticmethod
|
||||
def from_dict(proxy_dict: Dict) -> "ProxyConfig":
|
||||
"""Create a ProxyConfig from a dictionary."""
|
||||
return ProxyConfig(
|
||||
server=proxy_dict.get("server"),
|
||||
username=proxy_dict.get("username"),
|
||||
password=proxy_dict.get("password"),
|
||||
ip=proxy_dict.get("ip")
|
||||
)
|
||||
|
||||
@staticmethod
|
||||
def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]:
|
||||
"""Load proxies from environment variable.
|
||||
|
||||
Args:
|
||||
env_var: Name of environment variable containing comma-separated proxy strings
|
||||
|
||||
Returns:
|
||||
List of ProxyConfig objects
|
||||
"""
|
||||
proxies = []
|
||||
try:
|
||||
proxy_list = os.getenv(env_var, "").split(",")
|
||||
for proxy in proxy_list:
|
||||
if not proxy:
|
||||
continue
|
||||
proxies.append(ProxyConfig.from_string(proxy))
|
||||
except Exception as e:
|
||||
print(f"Error loading proxies from environment: {e}")
|
||||
return proxies
|
||||
|
||||
def to_dict(self) -> Dict:
|
||||
"""Convert to dictionary representation."""
|
||||
return {
|
||||
"server": self.server,
|
||||
"username": self.username,
|
||||
"password": self.password,
|
||||
"ip": self.ip
|
||||
}
|
||||
|
||||
def clone(self, **kwargs) -> "ProxyConfig":
|
||||
"""Create a copy of this configuration with updated values.
|
||||
|
||||
Args:
|
||||
**kwargs: Key-value pairs of configuration options to update
|
||||
|
||||
Returns:
|
||||
ProxyConfig: A new instance with the specified updates
|
||||
"""
|
||||
config_dict = self.to_dict()
|
||||
config_dict.update(kwargs)
|
||||
return ProxyConfig.from_dict(config_dict)
|
||||
|
||||
from crawl4ai.configs import ProxyConfig
|
||||
|
||||
class ProxyRotationStrategy(ABC):
|
||||
"""Base abstract class for proxy rotation strategies"""
|
||||
|
||||
@@ -1,14 +1,187 @@
|
||||
from typing import TYPE_CHECKING, Union
|
||||
|
||||
AsyncWebCrawler = Union['AsyncWebCrawlerType'] # Note the string literal
|
||||
CrawlerRunConfig = Union['CrawlerRunConfigType']
|
||||
# Logger types
|
||||
AsyncLoggerBase = Union['AsyncLoggerBaseType']
|
||||
AsyncLogger = Union['AsyncLoggerType']
|
||||
|
||||
# Crawler core types
|
||||
AsyncWebCrawler = Union['AsyncWebCrawlerType']
|
||||
CacheMode = Union['CacheModeType']
|
||||
CrawlResult = Union['CrawlResultType']
|
||||
CrawlerHub = Union['CrawlerHubType']
|
||||
BrowserProfiler = Union['BrowserProfilerType']
|
||||
|
||||
# Configuration types
|
||||
BrowserConfig = Union['BrowserConfigType']
|
||||
CrawlerRunConfig = Union['CrawlerRunConfigType']
|
||||
HTTPCrawlerConfig = Union['HTTPCrawlerConfigType']
|
||||
LLMConfig = Union['LLMConfigType']
|
||||
|
||||
# Content scraping types
|
||||
ContentScrapingStrategy = Union['ContentScrapingStrategyType']
|
||||
WebScrapingStrategy = Union['WebScrapingStrategyType']
|
||||
LXMLWebScrapingStrategy = Union['LXMLWebScrapingStrategyType']
|
||||
|
||||
# Proxy types
|
||||
ProxyRotationStrategy = Union['ProxyRotationStrategyType']
|
||||
RoundRobinProxyStrategy = Union['RoundRobinProxyStrategyType']
|
||||
|
||||
# Extraction types
|
||||
ExtractionStrategy = Union['ExtractionStrategyType']
|
||||
LLMExtractionStrategy = Union['LLMExtractionStrategyType']
|
||||
CosineStrategy = Union['CosineStrategyType']
|
||||
JsonCssExtractionStrategy = Union['JsonCssExtractionStrategyType']
|
||||
JsonXPathExtractionStrategy = Union['JsonXPathExtractionStrategyType']
|
||||
|
||||
# Chunking types
|
||||
ChunkingStrategy = Union['ChunkingStrategyType']
|
||||
RegexChunking = Union['RegexChunkingType']
|
||||
|
||||
# Markdown generation types
|
||||
DefaultMarkdownGenerator = Union['DefaultMarkdownGeneratorType']
|
||||
MarkdownGenerationResult = Union['MarkdownGenerationResultType']
|
||||
|
||||
# Content filter types
|
||||
RelevantContentFilter = Union['RelevantContentFilterType']
|
||||
PruningContentFilter = Union['PruningContentFilterType']
|
||||
BM25ContentFilter = Union['BM25ContentFilterType']
|
||||
LLMContentFilter = Union['LLMContentFilterType']
|
||||
|
||||
# Dispatcher types
|
||||
BaseDispatcher = Union['BaseDispatcherType']
|
||||
MemoryAdaptiveDispatcher = Union['MemoryAdaptiveDispatcherType']
|
||||
SemaphoreDispatcher = Union['SemaphoreDispatcherType']
|
||||
RateLimiter = Union['RateLimiterType']
|
||||
CrawlerMonitor = Union['CrawlerMonitorType']
|
||||
DisplayMode = Union['DisplayModeType']
|
||||
RunManyReturn = Union['RunManyReturnType']
|
||||
|
||||
# Docker client
|
||||
Crawl4aiDockerClient = Union['Crawl4aiDockerClientType']
|
||||
|
||||
# Deep crawling types
|
||||
DeepCrawlStrategy = Union['DeepCrawlStrategyType']
|
||||
BFSDeepCrawlStrategy = Union['BFSDeepCrawlStrategyType']
|
||||
FilterChain = Union['FilterChainType']
|
||||
ContentTypeFilter = Union['ContentTypeFilterType']
|
||||
DomainFilter = Union['DomainFilterType']
|
||||
URLFilter = Union['URLFilterType']
|
||||
FilterStats = Union['FilterStatsType']
|
||||
SEOFilter = Union['SEOFilterType']
|
||||
KeywordRelevanceScorer = Union['KeywordRelevanceScorerType']
|
||||
URLScorer = Union['URLScorerType']
|
||||
CompositeScorer = Union['CompositeScorerType']
|
||||
DomainAuthorityScorer = Union['DomainAuthorityScorerType']
|
||||
FreshnessScorer = Union['FreshnessScorerType']
|
||||
PathDepthScorer = Union['PathDepthScorerType']
|
||||
BestFirstCrawlingStrategy = Union['BestFirstCrawlingStrategyType']
|
||||
DFSDeepCrawlStrategy = Union['DFSDeepCrawlStrategyType']
|
||||
DeepCrawlDecorator = Union['DeepCrawlDecoratorType']
|
||||
|
||||
# Only import types during type checking to avoid circular imports
|
||||
if TYPE_CHECKING:
|
||||
from . import (
|
||||
# Logger imports
|
||||
from .async_logger import (
|
||||
AsyncLoggerBase as AsyncLoggerBaseType,
|
||||
AsyncLogger as AsyncLoggerType,
|
||||
)
|
||||
|
||||
# Crawler core imports
|
||||
from .async_webcrawler import (
|
||||
AsyncWebCrawler as AsyncWebCrawlerType,
|
||||
CacheMode as CacheModeType,
|
||||
)
|
||||
from .models import CrawlResult as CrawlResultType
|
||||
from .hub import CrawlerHub as CrawlerHubType
|
||||
from .browser_profiler import BrowserProfiler as BrowserProfilerType
|
||||
|
||||
# Configuration imports
|
||||
from .async_configs import (
|
||||
BrowserConfig as BrowserConfigType,
|
||||
CrawlerRunConfig as CrawlerRunConfigType,
|
||||
CrawlResult as CrawlResultType,
|
||||
HTTPCrawlerConfig as HTTPCrawlerConfigType,
|
||||
LLMConfig as LLMConfigType,
|
||||
)
|
||||
|
||||
# Content scraping imports
|
||||
from .content_scraping_strategy import (
|
||||
ContentScrapingStrategy as ContentScrapingStrategyType,
|
||||
WebScrapingStrategy as WebScrapingStrategyType,
|
||||
LXMLWebScrapingStrategy as LXMLWebScrapingStrategyType,
|
||||
)
|
||||
|
||||
# Proxy imports
|
||||
from .proxy_strategy import (
|
||||
ProxyRotationStrategy as ProxyRotationStrategyType,
|
||||
RoundRobinProxyStrategy as RoundRobinProxyStrategyType,
|
||||
)
|
||||
|
||||
# Extraction imports
|
||||
from .extraction_strategy import (
|
||||
ExtractionStrategy as ExtractionStrategyType,
|
||||
LLMExtractionStrategy as LLMExtractionStrategyType,
|
||||
CosineStrategy as CosineStrategyType,
|
||||
JsonCssExtractionStrategy as JsonCssExtractionStrategyType,
|
||||
JsonXPathExtractionStrategy as JsonXPathExtractionStrategyType,
|
||||
)
|
||||
|
||||
# Chunking imports
|
||||
from .chunking_strategy import (
|
||||
ChunkingStrategy as ChunkingStrategyType,
|
||||
RegexChunking as RegexChunkingType,
|
||||
)
|
||||
|
||||
# Markdown generation imports
|
||||
from .markdown_generation_strategy import (
|
||||
DefaultMarkdownGenerator as DefaultMarkdownGeneratorType,
|
||||
)
|
||||
from .models import MarkdownGenerationResult as MarkdownGenerationResultType
|
||||
|
||||
# Content filter imports
|
||||
from .content_filter_strategy import (
|
||||
RelevantContentFilter as RelevantContentFilterType,
|
||||
PruningContentFilter as PruningContentFilterType,
|
||||
BM25ContentFilter as BM25ContentFilterType,
|
||||
LLMContentFilter as LLMContentFilterType,
|
||||
)
|
||||
|
||||
# Dispatcher imports
|
||||
from .async_dispatcher import (
|
||||
BaseDispatcher as BaseDispatcherType,
|
||||
MemoryAdaptiveDispatcher as MemoryAdaptiveDispatcherType,
|
||||
SemaphoreDispatcher as SemaphoreDispatcherType,
|
||||
RateLimiter as RateLimiterType,
|
||||
CrawlerMonitor as CrawlerMonitorType,
|
||||
DisplayMode as DisplayModeType,
|
||||
RunManyReturn as RunManyReturnType,
|
||||
)
|
||||
)
|
||||
|
||||
# Docker client
|
||||
from .docker_client import Crawl4aiDockerClient as Crawl4aiDockerClientType
|
||||
|
||||
# Deep crawling imports
|
||||
from .deep_crawling import (
|
||||
DeepCrawlStrategy as DeepCrawlStrategyType,
|
||||
BFSDeepCrawlStrategy as BFSDeepCrawlStrategyType,
|
||||
FilterChain as FilterChainType,
|
||||
ContentTypeFilter as ContentTypeFilterType,
|
||||
DomainFilter as DomainFilterType,
|
||||
URLFilter as URLFilterType,
|
||||
FilterStats as FilterStatsType,
|
||||
SEOFilter as SEOFilterType,
|
||||
KeywordRelevanceScorer as KeywordRelevanceScorerType,
|
||||
URLScorer as URLScorerType,
|
||||
CompositeScorer as CompositeScorerType,
|
||||
DomainAuthorityScorer as DomainAuthorityScorerType,
|
||||
FreshnessScorer as FreshnessScorerType,
|
||||
PathDepthScorer as PathDepthScorerType,
|
||||
BestFirstCrawlingStrategy as BestFirstCrawlingStrategyType,
|
||||
DFSDeepCrawlStrategy as DFSDeepCrawlStrategyType,
|
||||
DeepCrawlDecorator as DeepCrawlDecoratorType,
|
||||
)
|
||||
|
||||
|
||||
|
||||
def create_llm_config(*args, **kwargs) -> 'LLMConfigType':
|
||||
from .async_configs import LLMConfig
|
||||
return LLMConfig(*args, **kwargs)
|
||||
|
||||
@@ -1,5 +1,4 @@
|
||||
import time
|
||||
from urllib.parse import urlparse
|
||||
from concurrent.futures import ThreadPoolExecutor, as_completed
|
||||
from bs4 import BeautifulSoup, Comment, element, Tag, NavigableString
|
||||
import json
|
||||
@@ -27,12 +26,14 @@ import cProfile
|
||||
import pstats
|
||||
from functools import wraps
|
||||
import asyncio
|
||||
|
||||
from lxml import etree, html as lhtml
|
||||
import sqlite3
|
||||
import hashlib
|
||||
|
||||
from urllib.robotparser import RobotFileParser
|
||||
import aiohttp
|
||||
from urllib.parse import urlparse, urlunparse
|
||||
from functools import lru_cache
|
||||
|
||||
from packaging import version
|
||||
from . import __version__
|
||||
@@ -1962,6 +1963,82 @@ def normalize_url(href, base_url):
|
||||
return normalized
|
||||
|
||||
|
||||
def normalize_url_for_deep_crawl(href, base_url):
|
||||
"""Normalize URLs to ensure consistent format"""
|
||||
from urllib.parse import urljoin, urlparse, urlunparse, parse_qs, urlencode
|
||||
|
||||
# Handle None or empty values
|
||||
if not href:
|
||||
return None
|
||||
|
||||
# Use urljoin to handle relative URLs
|
||||
full_url = urljoin(base_url, href.strip())
|
||||
|
||||
# Parse the URL for normalization
|
||||
parsed = urlparse(full_url)
|
||||
|
||||
# Convert hostname to lowercase
|
||||
netloc = parsed.netloc.lower()
|
||||
|
||||
# Remove fragment entirely
|
||||
fragment = ''
|
||||
|
||||
# Normalize query parameters if needed
|
||||
query = parsed.query
|
||||
if query:
|
||||
# Parse query parameters
|
||||
params = parse_qs(query)
|
||||
|
||||
# Remove tracking parameters (example - customize as needed)
|
||||
tracking_params = ['utm_source', 'utm_medium', 'utm_campaign', 'ref', 'fbclid']
|
||||
for param in tracking_params:
|
||||
if param in params:
|
||||
del params[param]
|
||||
|
||||
# Rebuild query string, sorted for consistency
|
||||
query = urlencode(params, doseq=True) if params else ''
|
||||
|
||||
# Build normalized URL
|
||||
normalized = urlunparse((
|
||||
parsed.scheme,
|
||||
netloc,
|
||||
parsed.path.rstrip('/') or '/', # Normalize trailing slash
|
||||
parsed.params,
|
||||
query,
|
||||
fragment
|
||||
))
|
||||
|
||||
return normalized
|
||||
|
||||
@lru_cache(maxsize=10000)
|
||||
def efficient_normalize_url_for_deep_crawl(href, base_url):
|
||||
"""Efficient URL normalization with proper parsing"""
|
||||
from urllib.parse import urljoin
|
||||
|
||||
if not href:
|
||||
return None
|
||||
|
||||
# Resolve relative URLs
|
||||
full_url = urljoin(base_url, href.strip())
|
||||
|
||||
# Use proper URL parsing
|
||||
parsed = urlparse(full_url)
|
||||
|
||||
# Only perform the most critical normalizations
|
||||
# 1. Lowercase hostname
|
||||
# 2. Remove fragment
|
||||
normalized = urlunparse((
|
||||
parsed.scheme,
|
||||
parsed.netloc.lower(),
|
||||
parsed.path,
|
||||
parsed.params,
|
||||
parsed.query,
|
||||
'' # Remove fragment
|
||||
))
|
||||
|
||||
return normalized
|
||||
|
||||
|
||||
def normalize_url_tmp(href, base_url):
|
||||
"""Normalize URLs to ensure consistent format"""
|
||||
# Extract protocol and domain from base URL
|
||||
@@ -2540,3 +2617,116 @@ class HeadPeekr:
|
||||
def get_title(head_content: str):
|
||||
title_match = re.search(r'<title>(.*?)</title>', head_content, re.IGNORECASE | re.DOTALL)
|
||||
return title_match.group(1) if title_match else None
|
||||
|
||||
def preprocess_html_for_schema(html_content, text_threshold=100, attr_value_threshold=200, max_size=100000):
|
||||
"""
|
||||
Preprocess HTML to reduce size while preserving structure for schema generation.
|
||||
|
||||
Args:
|
||||
html_content (str): Raw HTML content
|
||||
text_threshold (int): Maximum length for text nodes before truncation
|
||||
attr_value_threshold (int): Maximum length for attribute values before truncation
|
||||
max_size (int): Target maximum size for output HTML
|
||||
|
||||
Returns:
|
||||
str: Preprocessed HTML content
|
||||
"""
|
||||
try:
|
||||
# Parse HTML with error recovery
|
||||
parser = etree.HTMLParser(remove_comments=True, remove_blank_text=True)
|
||||
tree = lhtml.fromstring(html_content, parser=parser)
|
||||
|
||||
# 1. Remove HEAD section (keep only BODY)
|
||||
head_elements = tree.xpath('//head')
|
||||
for head in head_elements:
|
||||
if head.getparent() is not None:
|
||||
head.getparent().remove(head)
|
||||
|
||||
# 2. Define tags to remove completely
|
||||
tags_to_remove = [
|
||||
'script', 'style', 'noscript', 'iframe', 'canvas', 'svg',
|
||||
'video', 'audio', 'source', 'track', 'map', 'area'
|
||||
]
|
||||
|
||||
# Remove unwanted elements
|
||||
for tag in tags_to_remove:
|
||||
elements = tree.xpath(f'//{tag}')
|
||||
for element in elements:
|
||||
if element.getparent() is not None:
|
||||
element.getparent().remove(element)
|
||||
|
||||
# 3. Process remaining elements to clean attributes and truncate text
|
||||
for element in tree.iter():
|
||||
# Skip if we're at the root level
|
||||
if element.getparent() is None:
|
||||
continue
|
||||
|
||||
# Clean non-essential attributes but preserve structural ones
|
||||
# attribs_to_keep = {'id', 'class', 'name', 'href', 'src', 'type', 'value', 'data-'}
|
||||
|
||||
# This is more aggressive than the previous version
|
||||
attribs_to_keep = {'id', 'class', 'name', 'type', 'value'}
|
||||
|
||||
# attributes_hates_truncate = ['id', 'class', "data-"]
|
||||
|
||||
# This means, I don't care, if an attribute is too long, truncate it, go and find a better css selector to build a schema
|
||||
attributes_hates_truncate = []
|
||||
|
||||
# Process each attribute
|
||||
for attrib in list(element.attrib.keys()):
|
||||
# Keep if it's essential or starts with data-
|
||||
if not (attrib in attribs_to_keep or attrib.startswith('data-')):
|
||||
element.attrib.pop(attrib)
|
||||
# Truncate long attribute values except for selectors
|
||||
elif attrib not in attributes_hates_truncate and len(element.attrib[attrib]) > attr_value_threshold:
|
||||
element.attrib[attrib] = element.attrib[attrib][:attr_value_threshold] + '...'
|
||||
|
||||
# Truncate text content if it's too long
|
||||
if element.text and len(element.text.strip()) > text_threshold:
|
||||
element.text = element.text.strip()[:text_threshold] + '...'
|
||||
|
||||
# Also truncate tail text if present
|
||||
if element.tail and len(element.tail.strip()) > text_threshold:
|
||||
element.tail = element.tail.strip()[:text_threshold] + '...'
|
||||
|
||||
# 4. Find repeated patterns and keep only a few examples
|
||||
# This is a simplistic approach - more sophisticated pattern detection could be implemented
|
||||
pattern_elements = {}
|
||||
for element in tree.xpath('//*[contains(@class, "")]'):
|
||||
parent = element.getparent()
|
||||
if parent is None:
|
||||
continue
|
||||
|
||||
# Create a signature based on tag and classes
|
||||
classes = element.get('class', '')
|
||||
if not classes:
|
||||
continue
|
||||
signature = f"{element.tag}.{classes}"
|
||||
|
||||
if signature in pattern_elements:
|
||||
pattern_elements[signature].append(element)
|
||||
else:
|
||||
pattern_elements[signature] = [element]
|
||||
|
||||
# Keep only 3 examples of each repeating pattern
|
||||
for signature, elements in pattern_elements.items():
|
||||
if len(elements) > 3:
|
||||
# Keep the first 2 and last elements
|
||||
for element in elements[2:-1]:
|
||||
if element.getparent() is not None:
|
||||
element.getparent().remove(element)
|
||||
|
||||
# 5. Convert back to string
|
||||
result = etree.tostring(tree, encoding='unicode', method='html')
|
||||
|
||||
# If still over the size limit, apply more aggressive truncation
|
||||
if len(result) > max_size:
|
||||
return result[:max_size] + "..."
|
||||
|
||||
return result
|
||||
|
||||
except Exception as e:
|
||||
# Fallback for parsing errors
|
||||
return html_content[:max_size] if len(html_content) > max_size else html_content
|
||||
|
||||
|
||||
|
||||
@@ -352,7 +352,10 @@ Example:
|
||||
from crawl4ai import CrawlerRunConfig, PruningContentFilter
|
||||
|
||||
config = CrawlerRunConfig(
|
||||
content_filter=PruningContentFilter(threshold=0.48)
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=PruningContentFilter(threshold=0.48, threshold_type="fixed")
|
||||
),
|
||||
cache_mode= CacheMode.BYPASS
|
||||
)
|
||||
print(config.dump()) # Use this JSON in your API calls
|
||||
```
|
||||
@@ -551,7 +554,7 @@ async def test_stream_crawl(session, token: str):
|
||||
"https://example.com/page3",
|
||||
],
|
||||
"browser_config": {"headless": True, "viewport": {"width": 1200}},
|
||||
"crawler_config": {"stream": True, "cache_mode": "aggressive"}
|
||||
"crawler_config": {"stream": True, "cache_mode": "bypass"}
|
||||
}
|
||||
|
||||
# headers = {"Authorization": f"Bearer {token}"} # If JWT is enabled, more on this later
|
||||
@@ -595,8 +598,8 @@ curl http://localhost:8000/health
|
||||
## Complete Examples
|
||||
|
||||
Check out the `examples` folder in our repository for full working examples! Here are two to get you started:
|
||||
[Using Client SDK](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_sdk_example.py)
|
||||
[Using REST API](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_rest_api_example.py)
|
||||
[Using Client SDK](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_sdk.py)
|
||||
[Using REST API](https://github.com/unclecode/crawl4ai/blob/main/docs/examples/docker_python_rest_api.py)
|
||||
|
||||
## Server Configuration
|
||||
|
||||
|
||||
@@ -2,6 +2,7 @@ import os
|
||||
import json
|
||||
import asyncio
|
||||
from typing import List, Tuple
|
||||
from functools import partial
|
||||
|
||||
import logging
|
||||
from typing import Optional, AsyncGenerator
|
||||
@@ -18,7 +19,8 @@ from crawl4ai import (
|
||||
CacheMode,
|
||||
BrowserConfig,
|
||||
MemoryAdaptiveDispatcher,
|
||||
RateLimiter
|
||||
RateLimiter,
|
||||
LLMConfig
|
||||
)
|
||||
from crawl4ai.utils import perform_completion_with_backoff
|
||||
from crawl4ai.content_filter_strategy import (
|
||||
@@ -103,8 +105,10 @@ async def process_llm_extraction(
|
||||
else:
|
||||
api_key = os.environ.get(config["llm"].get("api_key_env", None), "")
|
||||
llm_strategy = LLMExtractionStrategy(
|
||||
provider=config["llm"]["provider"],
|
||||
api_token=api_key,
|
||||
llm_config=LLMConfig(
|
||||
provider=config["llm"]["provider"],
|
||||
api_token=api_key
|
||||
),
|
||||
instruction=instruction,
|
||||
schema=json.loads(schema) if schema else None,
|
||||
)
|
||||
@@ -164,8 +168,10 @@ async def handle_markdown_request(
|
||||
FilterType.FIT: PruningContentFilter(),
|
||||
FilterType.BM25: BM25ContentFilter(user_query=query or ""),
|
||||
FilterType.LLM: LLMContentFilter(
|
||||
provider=config["llm"]["provider"],
|
||||
api_token=os.environ.get(config["llm"].get("api_key_env", None), ""),
|
||||
llm_config=LLMConfig(
|
||||
provider=config["llm"]["provider"],
|
||||
api_token=os.environ.get(config["llm"].get("api_key_env", None), ""),
|
||||
),
|
||||
instruction=query or "Extract main content"
|
||||
)
|
||||
}[filter_type]
|
||||
@@ -383,12 +389,13 @@ async def handle_crawl_request(
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
results = await crawler.arun_many(
|
||||
urls=urls,
|
||||
config=crawler_config,
|
||||
dispatcher=dispatcher
|
||||
)
|
||||
|
||||
results = []
|
||||
func = getattr(crawler, "arun" if len(urls) == 1 else "arun_many")
|
||||
partial_func = partial(func,
|
||||
urls[0] if len(urls) == 1 else urls,
|
||||
config=crawler_config,
|
||||
dispatcher=dispatcher)
|
||||
results = await partial_func()
|
||||
return {
|
||||
"success": True,
|
||||
"results": [result.model_dump() for result in results]
|
||||
|
||||
@@ -10,7 +10,7 @@ from pydantic.main import BaseModel
|
||||
import base64
|
||||
|
||||
instance = JWT()
|
||||
security = HTTPBearer()
|
||||
security = HTTPBearer(auto_error=False)
|
||||
SECRET_KEY = os.environ.get("SECRET_KEY", "mysecret")
|
||||
ACCESS_TOKEN_EXPIRE_MINUTES = 60
|
||||
|
||||
@@ -30,6 +30,9 @@ def create_access_token(data: dict, expires_delta: Optional[timedelta] = None) -
|
||||
|
||||
def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security)) -> Dict:
|
||||
"""Verify the JWT token from the Authorization header."""
|
||||
|
||||
if credentials is None:
|
||||
return None
|
||||
token = credentials.credentials
|
||||
verifying_key = get_jwk_from_secret(SECRET_KEY)
|
||||
try:
|
||||
@@ -38,9 +41,15 @@ def verify_token(credentials: HTTPAuthorizationCredentials = Depends(security))
|
||||
except Exception:
|
||||
raise HTTPException(status_code=401, detail="Invalid or expired token")
|
||||
|
||||
|
||||
def get_token_dependency(config: Dict):
|
||||
"""Return the token dependency if JWT is enabled, else None."""
|
||||
return verify_token if config.get("security", {}).get("jwt_enabled", False) else None
|
||||
"""Return the token dependency if JWT is enabled, else a function that returns None."""
|
||||
|
||||
if config.get("security", {}).get("jwt_enabled", False):
|
||||
return verify_token
|
||||
else:
|
||||
return lambda: None
|
||||
|
||||
|
||||
class TokenRequest(BaseModel):
|
||||
email: EmailStr
|
||||
@@ -3,7 +3,7 @@ app:
|
||||
title: "Crawl4AI API"
|
||||
version: "1.0.0"
|
||||
host: "0.0.0.0"
|
||||
port: 8000
|
||||
port: 8020
|
||||
reload: True
|
||||
timeout_keep_alive: 300
|
||||
|
||||
@@ -38,8 +38,8 @@ rate_limiting:
|
||||
|
||||
# Security Configuration
|
||||
security:
|
||||
enabled: true
|
||||
jwt_enabled: true
|
||||
enabled: false
|
||||
jwt_enabled: false
|
||||
https_redirect: false
|
||||
trusted_hosts: ["*"]
|
||||
headers:
|
||||
|
||||
123
docs/examples/README_BUILTIN_BROWSER.md
Normal file
123
docs/examples/README_BUILTIN_BROWSER.md
Normal file
@@ -0,0 +1,123 @@
|
||||
# Builtin Browser in Crawl4AI
|
||||
|
||||
This document explains the builtin browser feature in Crawl4AI and how to use it effectively.
|
||||
|
||||
## What is the Builtin Browser?
|
||||
|
||||
The builtin browser is a persistent Chrome instance that Crawl4AI manages for you. It runs in the background and can be used by multiple crawling operations, eliminating the need to start and stop browsers for each crawl.
|
||||
|
||||
Benefits include:
|
||||
- **Faster startup times** - The browser is already running, so your scripts start faster
|
||||
- **Shared resources** - All your crawling scripts can use the same browser instance
|
||||
- **Simplified management** - No need to worry about CDP URLs or browser processes
|
||||
- **Persistent cookies and sessions** - Browser state persists between script runs
|
||||
- **Less resource usage** - Only one browser instance for multiple scripts
|
||||
|
||||
## Using the Builtin Browser
|
||||
|
||||
### In Python Code
|
||||
|
||||
Using the builtin browser in your code is simple:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig
|
||||
|
||||
# Create browser config with builtin mode
|
||||
browser_config = BrowserConfig(
|
||||
browser_mode="builtin", # This is the key setting!
|
||||
headless=True # Can be headless or not
|
||||
)
|
||||
|
||||
# Create the crawler
|
||||
crawler = AsyncWebCrawler(config=browser_config)
|
||||
|
||||
# Use it - no need to explicitly start()
|
||||
result = await crawler.arun("https://example.com")
|
||||
```
|
||||
|
||||
Key points:
|
||||
1. Set `browser_mode="builtin"` in your BrowserConfig
|
||||
2. No need for explicit `start()` call - the crawler will automatically connect to the builtin browser
|
||||
3. No need to use a context manager or call `close()` - the browser stays running
|
||||
|
||||
### Via CLI
|
||||
|
||||
The CLI provides commands to manage the builtin browser:
|
||||
|
||||
```bash
|
||||
# Start the builtin browser
|
||||
crwl browser start
|
||||
|
||||
# Check its status
|
||||
crwl browser status
|
||||
|
||||
# Open a visible window to see what the browser is doing
|
||||
crwl browser view --url https://example.com
|
||||
|
||||
# Stop it when no longer needed
|
||||
crwl browser stop
|
||||
|
||||
# Restart with different settings
|
||||
crwl browser restart --no-headless
|
||||
```
|
||||
|
||||
When crawling via CLI, simply add the builtin browser mode:
|
||||
|
||||
```bash
|
||||
crwl https://example.com -b "browser_mode=builtin"
|
||||
```
|
||||
|
||||
## How It Works
|
||||
|
||||
1. When a crawler with `browser_mode="builtin"` is created:
|
||||
- It checks if a builtin browser is already running
|
||||
- If not, it automatically launches one
|
||||
- It connects to the browser via CDP (Chrome DevTools Protocol)
|
||||
|
||||
2. The browser process continues running after your script exits
|
||||
- This means it's ready for the next crawl
|
||||
- You can manage it via the CLI commands
|
||||
|
||||
3. During installation, Crawl4AI attempts to create a builtin browser automatically
|
||||
|
||||
## Example
|
||||
|
||||
See the [builtin_browser_example.py](builtin_browser_example.py) file for a complete example.
|
||||
|
||||
Run it with:
|
||||
|
||||
```bash
|
||||
python builtin_browser_example.py
|
||||
```
|
||||
|
||||
## When to Use
|
||||
|
||||
The builtin browser is ideal for:
|
||||
- Scripts that run frequently
|
||||
- Development and testing workflows
|
||||
- Applications that need to minimize startup time
|
||||
- Systems where you want to manage browser instances centrally
|
||||
|
||||
You might not want to use it when:
|
||||
- Running one-off scripts
|
||||
- When you need different browser configurations for different tasks
|
||||
- In environments where persistent processes are not allowed
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
If you encounter issues:
|
||||
|
||||
1. Check the browser status:
|
||||
```
|
||||
crwl browser status
|
||||
```
|
||||
|
||||
2. Try restarting it:
|
||||
```
|
||||
crwl browser restart
|
||||
```
|
||||
|
||||
3. If problems persist, stop it and let Crawl4AI start a fresh one:
|
||||
```
|
||||
crwl browser stop
|
||||
```
|
||||
79
docs/examples/arun_vs_arun_many.py
Normal file
79
docs/examples/arun_vs_arun_many.py
Normal file
@@ -0,0 +1,79 @@
|
||||
import asyncio
|
||||
import time
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler, CacheMode
|
||||
from crawl4ai.async_configs import CrawlerRunConfig
|
||||
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher, RateLimiter
|
||||
|
||||
VERBOSE = False
|
||||
|
||||
async def crawl_sequential(urls):
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
|
||||
results = []
|
||||
start_time = time.perf_counter()
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
for url in urls:
|
||||
result_container = await crawler.arun(url=url, config=config)
|
||||
results.append(result_container[0])
|
||||
total_time = time.perf_counter() - start_time
|
||||
return total_time, results
|
||||
|
||||
async def crawl_parallel_dispatcher(urls):
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
|
||||
# Dispatcher with rate limiter enabled (default behavior)
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
rate_limiter=RateLimiter(base_delay=(1.0, 3.0), max_delay=60.0, max_retries=3),
|
||||
max_session_permit=50,
|
||||
)
|
||||
start_time = time.perf_counter()
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)
|
||||
results = []
|
||||
if isinstance(result_container, list):
|
||||
results = result_container
|
||||
else:
|
||||
async for res in result_container:
|
||||
results.append(res)
|
||||
total_time = time.perf_counter() - start_time
|
||||
return total_time, results
|
||||
|
||||
async def crawl_parallel_no_rate_limit(urls):
|
||||
config = CrawlerRunConfig(cache_mode=CacheMode.BYPASS, verbose=VERBOSE)
|
||||
# Dispatcher with no rate limiter and a high session permit to avoid queuing
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
rate_limiter=None,
|
||||
max_session_permit=len(urls) # allow all URLs concurrently
|
||||
)
|
||||
start_time = time.perf_counter()
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result_container = await crawler.arun_many(urls=urls, config=config, dispatcher=dispatcher)
|
||||
results = []
|
||||
if isinstance(result_container, list):
|
||||
results = result_container
|
||||
else:
|
||||
async for res in result_container:
|
||||
results.append(res)
|
||||
total_time = time.perf_counter() - start_time
|
||||
return total_time, results
|
||||
|
||||
async def main():
|
||||
urls = ["https://example.com"] * 100
|
||||
print(f"Crawling {len(urls)} URLs sequentially...")
|
||||
seq_time, seq_results = await crawl_sequential(urls)
|
||||
print(f"Sequential crawling took: {seq_time:.2f} seconds\n")
|
||||
|
||||
print(f"Crawling {len(urls)} URLs in parallel using arun_many with dispatcher (with rate limit)...")
|
||||
disp_time, disp_results = await crawl_parallel_dispatcher(urls)
|
||||
print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds\n")
|
||||
|
||||
print(f"Crawling {len(urls)} URLs in parallel using dispatcher with no rate limiter...")
|
||||
no_rl_time, no_rl_results = await crawl_parallel_no_rate_limit(urls)
|
||||
print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds\n")
|
||||
|
||||
print("Crawl4ai - Crawling Comparison")
|
||||
print("--------------------------------------------------------")
|
||||
print(f"Sequential crawling took: {seq_time:.2f} seconds")
|
||||
print(f"Parallel (dispatcher with rate limiter) took: {disp_time:.2f} seconds")
|
||||
print(f"Parallel (dispatcher without rate limiter) took: {no_rl_time:.2f} seconds")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
86
docs/examples/builtin_browser_example.py
Normal file
86
docs/examples/builtin_browser_example.py
Normal file
@@ -0,0 +1,86 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Builtin Browser Example
|
||||
|
||||
This example demonstrates how to use Crawl4AI's builtin browser feature,
|
||||
which simplifies the browser management process. With builtin mode:
|
||||
|
||||
- No need to manually start or connect to a browser
|
||||
- No need to manage CDP URLs or browser processes
|
||||
- Automatically connects to an existing browser or launches one if needed
|
||||
- Browser persists between script runs, reducing startup time
|
||||
- No explicit cleanup or close() calls needed
|
||||
|
||||
The example also demonstrates "auto-starting" where you don't need to explicitly
|
||||
call start() method on the crawler.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
import time
|
||||
|
||||
async def crawl_with_builtin_browser():
|
||||
"""
|
||||
Simple example of crawling with the builtin browser.
|
||||
|
||||
Key features:
|
||||
1. browser_mode="builtin" in BrowserConfig
|
||||
2. No explicit start() call needed
|
||||
3. No explicit close() needed
|
||||
"""
|
||||
print("\n=== Crawl4AI Builtin Browser Example ===\n")
|
||||
|
||||
# Create a browser configuration with builtin mode
|
||||
browser_config = BrowserConfig(
|
||||
browser_mode="builtin", # This is the key setting!
|
||||
headless=True # Can run headless for background operation
|
||||
)
|
||||
|
||||
# Create crawler run configuration
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS, # Skip cache for this demo
|
||||
screenshot=True, # Take a screenshot
|
||||
verbose=True # Show verbose logging
|
||||
)
|
||||
|
||||
# Create the crawler instance
|
||||
# Note: We don't need to use "async with" context manager
|
||||
crawler = AsyncWebCrawler(config=browser_config)
|
||||
|
||||
# Start crawling several URLs - no explicit start() needed!
|
||||
# The crawler will automatically connect to the builtin browser
|
||||
print("\n➡️ Crawling first URL...")
|
||||
t0 = time.time()
|
||||
result1 = await crawler.arun(
|
||||
url="https://crawl4ai.com",
|
||||
config=crawler_config
|
||||
)
|
||||
t1 = time.time()
|
||||
print(f"✅ First URL crawled in {t1-t0:.2f} seconds")
|
||||
print(f" Got {len(result1.markdown.raw_markdown)} characters of content")
|
||||
print(f" Title: {result1.metadata.get('title', 'No title')}")
|
||||
|
||||
# Try another URL - the browser is already running, so this should be faster
|
||||
print("\n➡️ Crawling second URL...")
|
||||
t0 = time.time()
|
||||
result2 = await crawler.arun(
|
||||
url="https://example.com",
|
||||
config=crawler_config
|
||||
)
|
||||
t1 = time.time()
|
||||
print(f"✅ Second URL crawled in {t1-t0:.2f} seconds")
|
||||
print(f" Got {len(result2.markdown.raw_markdown)} characters of content")
|
||||
print(f" Title: {result2.metadata.get('title', 'No title')}")
|
||||
|
||||
# The builtin browser continues running in the background
|
||||
# No need to explicitly close it
|
||||
print("\n🔄 The builtin browser remains running for future use")
|
||||
print(" You can use 'crwl browser status' to check its status")
|
||||
print(" or 'crwl browser stop' to stop it when completely done")
|
||||
|
||||
async def main():
|
||||
"""Run the example"""
|
||||
await crawl_with_builtin_browser()
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
209
docs/examples/crawler_monitor_example.py
Normal file
209
docs/examples/crawler_monitor_example.py
Normal file
@@ -0,0 +1,209 @@
|
||||
"""
|
||||
CrawlerMonitor Example
|
||||
|
||||
This example demonstrates how to use the CrawlerMonitor component
|
||||
to visualize and track web crawler operations in real-time.
|
||||
"""
|
||||
|
||||
import time
|
||||
import uuid
|
||||
import random
|
||||
import threading
|
||||
from crawl4ai.components.crawler_monitor import CrawlerMonitor
|
||||
from crawl4ai.models import CrawlStatus
|
||||
|
||||
def simulate_webcrawler_operations(monitor, num_tasks=20):
|
||||
"""
|
||||
Simulates a web crawler's operations with multiple tasks and different states.
|
||||
|
||||
Args:
|
||||
monitor: The CrawlerMonitor instance
|
||||
num_tasks: Number of tasks to simulate
|
||||
"""
|
||||
print(f"Starting simulation with {num_tasks} tasks...")
|
||||
|
||||
# Create and register all tasks first
|
||||
task_ids = []
|
||||
for i in range(num_tasks):
|
||||
task_id = str(uuid.uuid4())
|
||||
url = f"https://example.com/page{i}"
|
||||
monitor.add_task(task_id, url)
|
||||
task_ids.append((task_id, url))
|
||||
|
||||
# Small delay between task creation
|
||||
time.sleep(0.2)
|
||||
|
||||
# Process tasks with a variety of different behaviors
|
||||
threads = []
|
||||
for i, (task_id, url) in enumerate(task_ids):
|
||||
# Create a thread for each task
|
||||
thread = threading.Thread(
|
||||
target=process_task,
|
||||
args=(monitor, task_id, url, i)
|
||||
)
|
||||
thread.daemon = True
|
||||
threads.append(thread)
|
||||
|
||||
# Start threads in batches to simulate concurrent processing
|
||||
batch_size = 4 # Process 4 tasks at a time
|
||||
for i in range(0, len(threads), batch_size):
|
||||
batch = threads[i:i+batch_size]
|
||||
for thread in batch:
|
||||
thread.start()
|
||||
time.sleep(0.5) # Stagger thread start times
|
||||
|
||||
# Wait a bit before starting next batch
|
||||
time.sleep(random.uniform(1.0, 3.0))
|
||||
|
||||
# Update queue statistics
|
||||
update_queue_stats(monitor)
|
||||
|
||||
# Simulate memory pressure changes
|
||||
active_threads = [t for t in threads if t.is_alive()]
|
||||
if len(active_threads) > 8:
|
||||
monitor.update_memory_status("CRITICAL")
|
||||
elif len(active_threads) > 4:
|
||||
monitor.update_memory_status("PRESSURE")
|
||||
else:
|
||||
monitor.update_memory_status("NORMAL")
|
||||
|
||||
# Wait for all threads to complete
|
||||
for thread in threads:
|
||||
thread.join()
|
||||
|
||||
# Final updates
|
||||
update_queue_stats(monitor)
|
||||
monitor.update_memory_status("NORMAL")
|
||||
|
||||
print("Simulation completed!")
|
||||
|
||||
def process_task(monitor, task_id, url, index):
|
||||
"""Simulate processing of a single task."""
|
||||
# Tasks start in queued state (already added)
|
||||
|
||||
# Simulate waiting in queue
|
||||
wait_time = random.uniform(0.5, 3.0)
|
||||
time.sleep(wait_time)
|
||||
|
||||
# Start processing - move to IN_PROGRESS
|
||||
monitor.update_task(
|
||||
task_id=task_id,
|
||||
status=CrawlStatus.IN_PROGRESS,
|
||||
start_time=time.time(),
|
||||
wait_time=wait_time
|
||||
)
|
||||
|
||||
# Simulate task processing with memory usage changes
|
||||
total_process_time = random.uniform(2.0, 10.0)
|
||||
step_time = total_process_time / 5 # Update in 5 steps
|
||||
|
||||
for step in range(5):
|
||||
# Simulate increasing then decreasing memory usage
|
||||
if step < 3: # First 3 steps - increasing
|
||||
memory_usage = random.uniform(5.0, 20.0) * (step + 1)
|
||||
else: # Last 2 steps - decreasing
|
||||
memory_usage = random.uniform(5.0, 20.0) * (5 - step)
|
||||
|
||||
# Update peak memory if this is higher
|
||||
peak = max(memory_usage, monitor.get_task_stats(task_id).get("peak_memory", 0))
|
||||
|
||||
monitor.update_task(
|
||||
task_id=task_id,
|
||||
memory_usage=memory_usage,
|
||||
peak_memory=peak
|
||||
)
|
||||
|
||||
time.sleep(step_time)
|
||||
|
||||
# Determine final state - 80% success, 20% failure
|
||||
if index % 5 == 0: # Every 5th task fails
|
||||
monitor.update_task(
|
||||
task_id=task_id,
|
||||
status=CrawlStatus.FAILED,
|
||||
end_time=time.time(),
|
||||
memory_usage=0.0,
|
||||
error_message="Connection timeout"
|
||||
)
|
||||
else:
|
||||
monitor.update_task(
|
||||
task_id=task_id,
|
||||
status=CrawlStatus.COMPLETED,
|
||||
end_time=time.time(),
|
||||
memory_usage=0.0
|
||||
)
|
||||
|
||||
def update_queue_stats(monitor):
|
||||
"""Update queue statistics based on current tasks."""
|
||||
task_stats = monitor.get_all_task_stats()
|
||||
|
||||
# Count queued tasks
|
||||
queued_tasks = [
|
||||
stats for stats in task_stats.values()
|
||||
if stats["status"] == CrawlStatus.QUEUED.name
|
||||
]
|
||||
|
||||
total_queued = len(queued_tasks)
|
||||
|
||||
if total_queued > 0:
|
||||
current_time = time.time()
|
||||
# Calculate wait times
|
||||
wait_times = [
|
||||
current_time - stats.get("enqueue_time", current_time)
|
||||
for stats in queued_tasks
|
||||
]
|
||||
highest_wait_time = max(wait_times) if wait_times else 0.0
|
||||
avg_wait_time = sum(wait_times) / len(wait_times) if wait_times else 0.0
|
||||
else:
|
||||
highest_wait_time = 0.0
|
||||
avg_wait_time = 0.0
|
||||
|
||||
# Update monitor
|
||||
monitor.update_queue_statistics(
|
||||
total_queued=total_queued,
|
||||
highest_wait_time=highest_wait_time,
|
||||
avg_wait_time=avg_wait_time
|
||||
)
|
||||
|
||||
def main():
|
||||
# Initialize the monitor
|
||||
monitor = CrawlerMonitor(
|
||||
urls_total=20, # Total URLs to process
|
||||
refresh_rate=0.5, # Update UI twice per second
|
||||
enable_ui=True, # Enable terminal UI
|
||||
max_width=120 # Set maximum width to 120 characters
|
||||
)
|
||||
|
||||
# Start the monitor
|
||||
monitor.start()
|
||||
|
||||
try:
|
||||
# Run simulation
|
||||
simulate_webcrawler_operations(monitor)
|
||||
|
||||
# Keep monitor running a bit to see final state
|
||||
print("Waiting to view final state...")
|
||||
time.sleep(5)
|
||||
|
||||
except KeyboardInterrupt:
|
||||
print("\nExample interrupted by user")
|
||||
finally:
|
||||
# Stop the monitor
|
||||
monitor.stop()
|
||||
print("Example completed!")
|
||||
|
||||
# Print some statistics
|
||||
summary = monitor.get_summary()
|
||||
print("\nCrawler Statistics Summary:")
|
||||
print(f"Total URLs: {summary['urls_total']}")
|
||||
print(f"Completed: {summary['urls_completed']}")
|
||||
print(f"Completion percentage: {summary['completion_percentage']:.1f}%")
|
||||
print(f"Peak memory usage: {summary['peak_memory_percent']:.1f}%")
|
||||
|
||||
# Print task status counts
|
||||
status_counts = summary['status_counts']
|
||||
print("\nTask Status Counts:")
|
||||
for status, count in status_counts.items():
|
||||
print(f" {status}: {count}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
||||
230
docs/examples/crypto_analysis_example.py
Normal file
230
docs/examples/crypto_analysis_example.py
Normal file
@@ -0,0 +1,230 @@
|
||||
"""
|
||||
Crawl4AI Crypto Trading Analysis Demo
|
||||
Author: Unclecode
|
||||
Date: 2024-03-15
|
||||
|
||||
This script demonstrates advanced crypto market analysis using:
|
||||
1. Web scraping of real-time CoinMarketCap data
|
||||
2. Smart table extraction with layout detection
|
||||
3. Hedge fund-grade financial metrics
|
||||
4. Interactive visualizations for trading signals
|
||||
|
||||
Key Features:
|
||||
- Volume Anomaly Detection: Finds unusual trading activity
|
||||
- Liquidity Power Score: Identifies easily tradable assets
|
||||
- Volatility-Weighted Momentum: Surface sustainable trends
|
||||
- Smart Money Signals: Algorithmic buy/hold recommendations
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import pandas as pd
|
||||
import plotly.express as px
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LXMLWebScrapingStrategy
|
||||
from crawl4ai import CrawlResult
|
||||
from typing import List
|
||||
from IPython.display import HTML
|
||||
|
||||
class CryptoAlphaGenerator:
|
||||
"""
|
||||
Advanced crypto analysis engine that transforms raw web data into:
|
||||
- Volume anomaly flags
|
||||
- Liquidity scores
|
||||
- Momentum-risk ratios
|
||||
- Machine learning-inspired trading signals
|
||||
|
||||
Methods:
|
||||
analyze_tables(): Process raw tables into trading insights
|
||||
create_visuals(): Generate institutional-grade visualizations
|
||||
generate_insights(): Create plain English trading recommendations
|
||||
"""
|
||||
|
||||
def clean_data(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
Convert crypto market data to machine-readable format
|
||||
Handles currency symbols, units (B=Billions), and percentage values
|
||||
"""
|
||||
# Clean numeric columns
|
||||
df['Price'] = df['Price'].str.replace('[^\d.]', '', regex=True).astype(float)
|
||||
df['Market Cap'] = df['Market Cap'].str.extract(r'\$([\d.]+)B')[0].astype(float) * 1e9
|
||||
df['Volume(24h)'] = df['Volume(24h)'].str.extract(r'\$([\d.]+)B')[0].astype(float) * 1e9
|
||||
|
||||
# Convert percentages to decimal values
|
||||
for col in ['1h %', '24h %', '7d %']:
|
||||
df[col] = df[col].str.replace('%', '').astype(float) / 100
|
||||
|
||||
return df
|
||||
|
||||
def calculate_metrics(self, df: pd.DataFrame) -> pd.DataFrame:
|
||||
"""
|
||||
Compute advanced trading metrics used by quantitative funds:
|
||||
|
||||
1. Volume/Market Cap Ratio - Measures liquidity efficiency
|
||||
(High ratio = Underestimated attention)
|
||||
|
||||
2. Volatility Score - Risk-adjusted momentum potential
|
||||
(STD of 1h/24h/7d returns)
|
||||
|
||||
3. Momentum Score - Weighted average of returns
|
||||
(1h:30% + 24h:50% + 7d:20%)
|
||||
|
||||
4. Volume Anomaly - 3σ deviation detection
|
||||
(Flags potential insider activity)
|
||||
"""
|
||||
# Liquidity Metrics
|
||||
df['Volume/Market Cap Ratio'] = df['Volume(24h)'] / df['Market Cap']
|
||||
|
||||
# Risk Metrics
|
||||
df['Volatility Score'] = df[['1h %','24h %','7d %']].std(axis=1)
|
||||
|
||||
# Momentum Metrics
|
||||
df['Momentum Score'] = (df['1h %']*0.3 + df['24h %']*0.5 + df['7d %']*0.2)
|
||||
|
||||
# Anomaly Detection
|
||||
median_vol = df['Volume(24h)'].median()
|
||||
df['Volume Anomaly'] = df['Volume(24h)'] > 3 * median_vol
|
||||
|
||||
# Value Flags
|
||||
df['Undervalued Flag'] = (df['Market Cap'] < 1e9) & (df['Momentum Score'] > 0.05)
|
||||
df['Liquid Giant'] = (df['Volume/Market Cap Ratio'] > 0.15) & (df['Market Cap'] > 1e9)
|
||||
|
||||
return df
|
||||
|
||||
def create_visuals(self, df: pd.DataFrame) -> dict:
|
||||
"""
|
||||
Generate three institutional-grade visualizations:
|
||||
|
||||
1. 3D Market Map - X:Size, Y:Liquidity, Z:Momentum
|
||||
2. Liquidity Tree - Color:Volume Efficiency
|
||||
3. Momentum Leaderboard - Top sustainable movers
|
||||
"""
|
||||
# 3D Market Overview
|
||||
fig1 = px.scatter_3d(
|
||||
df,
|
||||
x='Market Cap',
|
||||
y='Volume/Market Cap Ratio',
|
||||
z='Momentum Score',
|
||||
size='Volatility Score',
|
||||
color='Volume Anomaly',
|
||||
hover_name='Name',
|
||||
title='Smart Money Market Map: Spot Overlooked Opportunities',
|
||||
labels={'Market Cap': 'Size (Log $)', 'Volume/Market Cap Ratio': 'Liquidity Power'},
|
||||
log_x=True,
|
||||
template='plotly_dark'
|
||||
)
|
||||
|
||||
# Liquidity Efficiency Tree
|
||||
fig2 = px.treemap(
|
||||
df,
|
||||
path=['Name'],
|
||||
values='Market Cap',
|
||||
color='Volume/Market Cap Ratio',
|
||||
hover_data=['Momentum Score'],
|
||||
title='Liquidity Forest: Green = High Trading Efficiency',
|
||||
color_continuous_scale='RdYlGn'
|
||||
)
|
||||
|
||||
# Momentum Leaders
|
||||
fig3 = px.bar(
|
||||
df.sort_values('Momentum Score', ascending=False).head(10),
|
||||
x='Name',
|
||||
y='Momentum Score',
|
||||
color='Volatility Score',
|
||||
title='Sustainable Momentum Leaders (Low Volatility + High Growth)',
|
||||
text='7d %',
|
||||
template='plotly_dark'
|
||||
)
|
||||
|
||||
return {'market_map': fig1, 'liquidity_tree': fig2, 'momentum_leaders': fig3}
|
||||
|
||||
def generate_insights(self, df: pd.DataFrame) -> str:
|
||||
"""
|
||||
Create plain English trading insights explaining:
|
||||
- Volume spikes and their implications
|
||||
- Risk-reward ratios of top movers
|
||||
- Liquidity warnings for large positions
|
||||
"""
|
||||
top_coin = df.sort_values('Momentum Score', ascending=False).iloc[0]
|
||||
anomaly_coins = df[df['Volume Anomaly']].sort_values('Volume(24h)', ascending=False)
|
||||
|
||||
report = f"""
|
||||
🚀 Top Alpha Opportunity: {top_coin['Name']}
|
||||
- Momentum Score: {top_coin['Momentum Score']:.2%} (Top 1%)
|
||||
- Risk-Reward Ratio: {top_coin['Momentum Score']/top_coin['Volatility Score']:.1f}
|
||||
- Liquidity Warning: {'✅ Safe' if top_coin['Liquid Giant'] else '⚠️ Thin Markets'}
|
||||
|
||||
🔥 Volume Spikes Detected ({len(anomaly_coins)} coins):
|
||||
{anomaly_coins[['Name', 'Volume(24h)']].head(3).to_markdown(index=False)}
|
||||
|
||||
💡 Smart Money Tip: Coins with Volume/Cap > 15% and Momentum > 5%
|
||||
historically outperform by 22% weekly returns.
|
||||
"""
|
||||
return report
|
||||
|
||||
async def main():
|
||||
"""
|
||||
Main execution flow:
|
||||
1. Configure headless browser for scraping
|
||||
2. Extract live crypto market data
|
||||
3. Clean and analyze using hedge fund models
|
||||
4. Generate visualizations and insights
|
||||
5. Output professional trading report
|
||||
"""
|
||||
# Configure browser with anti-detection features
|
||||
browser_config = BrowserConfig(
|
||||
headless=True,
|
||||
stealth=True,
|
||||
block_resources=["image", "media"]
|
||||
)
|
||||
|
||||
# Initialize crawler with smart table detection
|
||||
crawler = AsyncWebCrawler(config=browser_config)
|
||||
await crawler.start()
|
||||
|
||||
try:
|
||||
# Set up scraping parameters
|
||||
crawl_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
scraping_strategy=LXMLWebScrapingStrategy(
|
||||
table_score_threshold=8, # Strict table detection
|
||||
keep_data_attributes=True
|
||||
)
|
||||
)
|
||||
|
||||
# Execute market data extraction
|
||||
results: List[CrawlResult] = await crawler.arun(
|
||||
url='https://coinmarketcap.com/?page=1',
|
||||
config=crawl_config
|
||||
)
|
||||
|
||||
# Process results
|
||||
for result in results:
|
||||
if result.success and result.media['tables']:
|
||||
# Extract primary market table
|
||||
raw_df = pd.DataFrame(
|
||||
result.media['tables'][0]['rows'],
|
||||
columns=result.media['tables'][0]['headers']
|
||||
)
|
||||
|
||||
# Initialize analysis engine
|
||||
analyzer = CryptoAlphaGenerator()
|
||||
clean_df = analyzer.clean_data(raw_df)
|
||||
analyzed_df = analyzer.calculate_metrics(clean_df)
|
||||
|
||||
# Generate outputs
|
||||
visuals = analyzer.create_visuals(analyzed_df)
|
||||
insights = analyzer.generate_insights(analyzed_df)
|
||||
|
||||
# Save visualizations
|
||||
visuals['market_map'].write_html("market_map.html")
|
||||
visuals['liquidity_tree'].write_html("liquidity_tree.html")
|
||||
|
||||
# Display results
|
||||
print("🔑 Key Trading Insights:")
|
||||
print(insights)
|
||||
print("\n📊 Open 'market_map.html' for interactive analysis")
|
||||
|
||||
finally:
|
||||
await crawler.close()
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
@@ -65,7 +65,6 @@ async def basic_deep_crawl():
|
||||
f"\n✅ Performance: {len(results)} pages in {time.perf_counter() - start_time:.2f} seconds"
|
||||
)
|
||||
|
||||
|
||||
# 2️⃣ Stream vs. Non-Stream Execution
|
||||
async def stream_vs_nonstream():
|
||||
"""
|
||||
@@ -127,7 +126,6 @@ async def stream_vs_nonstream():
|
||||
print(f" ✅ All results: {time.perf_counter() - start_time:.2f} seconds")
|
||||
print("\n🔍 Key Takeaway: Streaming allows processing results immediately")
|
||||
|
||||
|
||||
# 3️⃣ Introduce Filters & Scorers
|
||||
async def filters_and_scorers():
|
||||
"""
|
||||
@@ -236,82 +234,10 @@ async def filters_and_scorers():
|
||||
print(f" ✅ Crawler prioritized {len(results)} pages by relevance score")
|
||||
print(" 🔍 Note: BestFirstCrawlingStrategy visits highest-scoring pages first")
|
||||
|
||||
|
||||
# 4️⃣ Wrap-Up and Key Takeaways
|
||||
async def wrap_up():
|
||||
"""
|
||||
PART 4: Wrap-Up and Key Takeaways
|
||||
|
||||
Summarize the key concepts learned in this tutorial.
|
||||
"""
|
||||
print("\n===== COMPLETE CRAWLER EXAMPLE =====")
|
||||
print("Combining filters, scorers, and streaming for an optimized crawl")
|
||||
|
||||
# Create a sophisticated filter chain
|
||||
filter_chain = FilterChain(
|
||||
[
|
||||
DomainFilter(
|
||||
allowed_domains=["docs.crawl4ai.com"],
|
||||
blocked_domains=["old.docs.crawl4ai.com"],
|
||||
),
|
||||
URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
|
||||
ContentTypeFilter(allowed_types=["text/html"]),
|
||||
]
|
||||
)
|
||||
|
||||
# Create a composite scorer that combines multiple scoring strategies
|
||||
keyword_scorer = KeywordRelevanceScorer(
|
||||
keywords=["crawl", "example", "async", "configuration"], weight=0.7
|
||||
)
|
||||
# Set up the configuration
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BestFirstCrawlingStrategy(
|
||||
max_depth=1,
|
||||
include_external=False,
|
||||
filter_chain=filter_chain,
|
||||
url_scorer=keyword_scorer,
|
||||
),
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
stream=True,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Execute the crawl
|
||||
results = []
|
||||
start_time = time.perf_counter()
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async for result in await crawler.arun(
|
||||
url="https://docs.crawl4ai.com", config=config
|
||||
):
|
||||
results.append(result)
|
||||
score = result.metadata.get("score", 0)
|
||||
depth = result.metadata.get("depth", 0)
|
||||
print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
|
||||
|
||||
duration = time.perf_counter() - start_time
|
||||
|
||||
# Summarize the results
|
||||
print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
|
||||
print(
|
||||
f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
|
||||
)
|
||||
|
||||
# Group by depth
|
||||
depth_counts = {}
|
||||
for result in results:
|
||||
depth = result.metadata.get("depth", 0)
|
||||
depth_counts[depth] = depth_counts.get(depth, 0) + 1
|
||||
|
||||
print("\n📊 Pages crawled by depth:")
|
||||
for depth, count in sorted(depth_counts.items()):
|
||||
print(f" Depth {depth}: {count} pages")
|
||||
|
||||
|
||||
# 5️⃣ Advanced Filters
|
||||
# 4️⃣ Advanced Filters
|
||||
async def advanced_filters():
|
||||
"""
|
||||
PART 5: Demonstrates advanced filtering techniques for specialized crawling.
|
||||
PART 4: Demonstrates advanced filtering techniques for specialized crawling.
|
||||
|
||||
This function covers:
|
||||
- SEO filters
|
||||
@@ -371,11 +297,10 @@ async def advanced_filters():
|
||||
relevance_score = result.metadata.get("relevance_score", 0)
|
||||
print(f" → Score: {relevance_score:.2f} | {result.url}")
|
||||
|
||||
|
||||
# Main function to run the entire tutorial
|
||||
# 5️⃣ Max Pages and Score Thresholds
|
||||
async def max_pages_and_thresholds():
|
||||
"""
|
||||
PART 6: Demonstrates using max_pages and score_threshold parameters with different strategies.
|
||||
PART 5: Demonstrates using max_pages and score_threshold parameters with different strategies.
|
||||
|
||||
This function shows:
|
||||
- How to limit the number of pages crawled
|
||||
@@ -471,6 +396,77 @@ async def max_pages_and_thresholds():
|
||||
print(f" ✅ Average score: {avg_score:.2f}")
|
||||
print(" 🔍 Note: BestFirstCrawlingStrategy visited highest-scoring pages first")
|
||||
|
||||
# 6️⃣ Wrap-Up and Key Takeaways
|
||||
async def wrap_up():
|
||||
"""
|
||||
PART 6: Wrap-Up and Key Takeaways
|
||||
|
||||
Summarize the key concepts learned in this tutorial.
|
||||
"""
|
||||
print("\n===== COMPLETE CRAWLER EXAMPLE =====")
|
||||
print("Combining filters, scorers, and streaming for an optimized crawl")
|
||||
|
||||
# Create a sophisticated filter chain
|
||||
filter_chain = FilterChain(
|
||||
[
|
||||
DomainFilter(
|
||||
allowed_domains=["docs.crawl4ai.com"],
|
||||
blocked_domains=["old.docs.crawl4ai.com"],
|
||||
),
|
||||
URLPatternFilter(patterns=["*core*", "*advanced*", "*blog*"]),
|
||||
ContentTypeFilter(allowed_types=["text/html"]),
|
||||
]
|
||||
)
|
||||
|
||||
# Create a composite scorer that combines multiple scoring strategies
|
||||
keyword_scorer = KeywordRelevanceScorer(
|
||||
keywords=["crawl", "example", "async", "configuration"], weight=0.7
|
||||
)
|
||||
# Set up the configuration
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BestFirstCrawlingStrategy(
|
||||
max_depth=1,
|
||||
include_external=False,
|
||||
filter_chain=filter_chain,
|
||||
url_scorer=keyword_scorer,
|
||||
),
|
||||
scraping_strategy=LXMLWebScrapingStrategy(),
|
||||
stream=True,
|
||||
verbose=True,
|
||||
)
|
||||
|
||||
# Execute the crawl
|
||||
results = []
|
||||
start_time = time.perf_counter()
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
async for result in await crawler.arun(
|
||||
url="https://docs.crawl4ai.com", config=config
|
||||
):
|
||||
results.append(result)
|
||||
score = result.metadata.get("score", 0)
|
||||
depth = result.metadata.get("depth", 0)
|
||||
print(f"→ Depth: {depth} | Score: {score:.2f} | {result.url}")
|
||||
|
||||
duration = time.perf_counter() - start_time
|
||||
|
||||
# Summarize the results
|
||||
print(f"\n✅ Crawled {len(results)} high-value pages in {duration:.2f} seconds")
|
||||
print(
|
||||
f"✅ Average score: {sum(r.metadata.get('score', 0) for r in results) / len(results):.2f}"
|
||||
)
|
||||
|
||||
# Group by depth
|
||||
depth_counts = {}
|
||||
for result in results:
|
||||
depth = result.metadata.get("depth", 0)
|
||||
depth_counts[depth] = depth_counts.get(depth, 0) + 1
|
||||
|
||||
print("\n📊 Pages crawled by depth:")
|
||||
for depth, count in sorted(depth_counts.items()):
|
||||
print(f" Depth {depth}: {count} pages")
|
||||
|
||||
|
||||
async def run_tutorial():
|
||||
"""
|
||||
Executes all tutorial sections in sequence.
|
||||
@@ -482,12 +478,12 @@ async def run_tutorial():
|
||||
|
||||
# Define sections - uncomment to run specific parts during development
|
||||
tutorial_sections = [
|
||||
# basic_deep_crawl,
|
||||
# stream_vs_nonstream,
|
||||
# filters_and_scorers,
|
||||
max_pages_and_thresholds, # Added new section
|
||||
wrap_up,
|
||||
basic_deep_crawl,
|
||||
stream_vs_nonstream,
|
||||
filters_and_scorers,
|
||||
max_pages_and_thresholds,
|
||||
advanced_filters,
|
||||
wrap_up,
|
||||
]
|
||||
|
||||
for section in tutorial_sections:
|
||||
@@ -497,7 +493,6 @@ async def run_tutorial():
|
||||
print("You now have a comprehensive understanding of deep crawling with Crawl4AI.")
|
||||
print("For more information, check out https://docs.crawl4ai.com")
|
||||
|
||||
|
||||
# Execute the tutorial when run directly
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(run_tutorial())
|
||||
@@ -39,7 +39,7 @@ async def memory_adaptive_with_rate_limit(urls, browser_config, run_config):
|
||||
start = time.perf_counter()
|
||||
async with AsyncWebCrawler(config=browser_config) as crawler:
|
||||
dispatcher = MemoryAdaptiveDispatcher(
|
||||
memory_threshold_percent=70.0,
|
||||
memory_threshold_percent=95.0,
|
||||
max_session_permit=10,
|
||||
rate_limiter=RateLimiter(
|
||||
base_delay=(1.0, 2.0), max_delay=30.0, max_retries=2
|
||||
|
||||
@@ -73,7 +73,7 @@ async def test_stream_crawl(session, token: str):
|
||||
# "https://news.ycombinator.com/news"
|
||||
],
|
||||
"browser_config": {"headless": True, "viewport": {"width": 1200}},
|
||||
"crawler_config": {"stream": True, "cache_mode": "aggressive"}
|
||||
"crawler_config": {"stream": True, "cache_mode": "bypass"}
|
||||
}
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
print(f"\nTesting Streaming Crawl: {url}")
|
||||
|
||||
@@ -11,7 +11,7 @@ import asyncio
|
||||
import os
|
||||
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.extraction_strategy import (
|
||||
LLMExtractionStrategy,
|
||||
JsonCssExtractionStrategy,
|
||||
@@ -61,19 +61,19 @@ async def main():
|
||||
|
||||
# 1. LLM Extraction with different input formats
|
||||
markdown_strategy = LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
|
||||
instruction="Extract product information including name, price, and description",
|
||||
)
|
||||
|
||||
html_strategy = LLMExtractionStrategy(
|
||||
input_format="html",
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")),
|
||||
instruction="Extract product information from HTML including structured data",
|
||||
)
|
||||
|
||||
fit_markdown_strategy = LLMExtractionStrategy(
|
||||
input_format="fit_markdown",
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
instruction="Extract product information from cleaned markdown",
|
||||
)
|
||||
|
||||
|
||||
@@ -9,6 +9,26 @@ from crawl4ai import (
|
||||
CrawlResult
|
||||
)
|
||||
|
||||
async def example_cdp():
|
||||
browser_conf = BrowserConfig(
|
||||
headless=False,
|
||||
cdp_url="http://localhost:9223"
|
||||
)
|
||||
crawler_config = CrawlerRunConfig(
|
||||
session_id="test",
|
||||
js_code = """(() => { return {"result": "Hello World!"} })()""",
|
||||
js_only=True
|
||||
)
|
||||
async with AsyncWebCrawler(
|
||||
config=browser_conf,
|
||||
verbose=True,
|
||||
) as crawler:
|
||||
result : CrawlResult = await crawler.arun(
|
||||
url="https://www.helloworld.org",
|
||||
config=crawler_config,
|
||||
)
|
||||
print(result.js_execution_result)
|
||||
|
||||
|
||||
async def main():
|
||||
browser_config = BrowserConfig(headless=True, verbose=True)
|
||||
@@ -16,18 +36,15 @@ async def main():
|
||||
crawler_config = CrawlerRunConfig(
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
# content_filter=PruningContentFilter(
|
||||
# threshold=0.48, threshold_type="fixed", min_word_threshold=0
|
||||
# )
|
||||
content_filter=PruningContentFilter(
|
||||
threshold=0.48, threshold_type="fixed", min_word_threshold=0
|
||||
)
|
||||
),
|
||||
)
|
||||
result : CrawlResult = await crawler.arun(
|
||||
# url="https://www.helloworld.org", config=crawler_config
|
||||
url="https://www.kidocode.com", config=crawler_config
|
||||
url="https://www.helloworld.org", config=crawler_config
|
||||
)
|
||||
print(result.markdown.raw_markdown[:500])
|
||||
# print(result.model_dump())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai import AsyncWebCrawler, LLMExtractionStrategy
|
||||
import asyncio
|
||||
import os
|
||||
@@ -23,7 +23,7 @@ async def main():
|
||||
word_count_threshold=1,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
# provider= "openai/gpt-4o", api_token = os.getenv('OPENAI_API_KEY'),
|
||||
llmConfig=LlmConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
|
||||
llm_config=LLMConfig(provider="groq/llama-3.1-70b-versatile", api_token=os.getenv("GROQ_API_KEY")),
|
||||
schema=OpenAIModelFee.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="From the crawled content, extract all mentioned model names along with their "
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import os
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def test_llm_filter():
|
||||
@@ -23,7 +23,7 @@ async def test_llm_filter():
|
||||
|
||||
# Initialize LLM filter with focused instruction
|
||||
filter = LLMContentFilter(
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o", api_token=os.getenv('OPENAI_API_KEY')),
|
||||
instruction="""
|
||||
Focus on extracting the core educational content about Python classes.
|
||||
Include:
|
||||
@@ -43,7 +43,7 @@ async def test_llm_filter():
|
||||
)
|
||||
|
||||
filter = LLMContentFilter(
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
||||
chunk_token_threshold=2 ** 12 * 2, # 2048 * 2
|
||||
ignore_cache = True,
|
||||
instruction="""
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import os, sys
|
||||
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
sys.path.append(
|
||||
os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||
@@ -211,7 +211,7 @@ async def extract_structured_data_using_llm(
|
||||
word_count_threshold=1,
|
||||
page_timeout=80000,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
llmConfig=LlmConfig(provider=provider,api_token=api_token),
|
||||
llm_config=LLMConfig(provider=provider,api_token=api_token),
|
||||
schema=OpenAIModelFee.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import os, sys
|
||||
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
# append parent directory to system path
|
||||
sys.path.append(
|
||||
@@ -147,7 +147,7 @@ async def extract_structured_data_using_llm(
|
||||
url="https://openai.com/api/pricing/",
|
||||
word_count_threshold=1,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
llmConfig=LlmConfig(provider=provider,api_token=api_token),
|
||||
llm_config=LLMConfig(provider=provider,api_token=api_token),
|
||||
schema=OpenAIModelFee.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
||||
@@ -570,7 +570,7 @@ async def generate_knowledge_graph():
|
||||
relationships: List[Relationship]
|
||||
|
||||
extraction_strategy = LLMExtractionStrategy(
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token"
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY")), # In case of Ollama just pass "no-token"
|
||||
schema=KnowledgeGraph.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="""Extract entities and relationships from the given text.""",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
import os
|
||||
import time
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import *
|
||||
from crawl4ai.extraction_strategy import *
|
||||
@@ -179,7 +179,7 @@ def add_llm_extraction_strategy(crawler):
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o", api_token=os.getenv("OPENAI_API_KEY"))
|
||||
),
|
||||
)
|
||||
cprint(
|
||||
@@ -198,7 +198,7 @@ def add_llm_extraction_strategy(crawler):
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
instruction="I am interested in only financial news",
|
||||
),
|
||||
)
|
||||
@@ -210,7 +210,7 @@ def add_llm_extraction_strategy(crawler):
|
||||
result = crawler.run(
|
||||
url="https://www.nbcnews.com/business",
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
instruction="Extract only content related to technology",
|
||||
),
|
||||
)
|
||||
|
||||
@@ -13,11 +13,11 @@ from crawl4ai.deep_crawling import (
|
||||
)
|
||||
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
|
||||
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
|
||||
from crawl4ai.configs import ProxyConfig
|
||||
from crawl4ai.proxy_strategy import ProxyConfig
|
||||
from crawl4ai import RoundRobinProxyStrategy
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
from crawl4ai import DefaultMarkdownGenerator
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.processors.pdf import PDFCrawlerStrategy, PDFContentScrapingStrategy
|
||||
from pprint import pprint
|
||||
@@ -284,9 +284,9 @@ async def llm_content_filter():
|
||||
PART 5: LLM Content Filter
|
||||
|
||||
This function demonstrates:
|
||||
- Configuring LLM providers via LlmConfig
|
||||
- Configuring LLM providers via LLMConfig
|
||||
- Using LLM to generate focused markdown
|
||||
- LlmConfig for configuration
|
||||
- LLMConfig for configuration
|
||||
|
||||
Note: Requires a valid API key for the chosen LLM provider
|
||||
"""
|
||||
@@ -296,7 +296,7 @@ async def llm_content_filter():
|
||||
|
||||
# Create LLM configuration
|
||||
# Replace with your actual API key or set as environment variable
|
||||
llm_config = LlmConfig(
|
||||
llm_config = LLMConfig(
|
||||
provider="gemini/gemini-1.5-pro",
|
||||
api_token="env:GEMINI_API_KEY" # Will read from GEMINI_API_KEY environment variable
|
||||
)
|
||||
@@ -309,7 +309,7 @@ async def llm_content_filter():
|
||||
# Create markdown generator with LLM filter
|
||||
markdown_generator = DefaultMarkdownGenerator(
|
||||
content_filter=LLMContentFilter(
|
||||
llmConfig=llm_config,
|
||||
llm_config=llm_config,
|
||||
instruction="Extract key concepts and summaries"
|
||||
)
|
||||
)
|
||||
@@ -381,7 +381,7 @@ async def llm_schema_generation():
|
||||
PART 7: LLM Schema Generation
|
||||
|
||||
This function demonstrates:
|
||||
- Configuring LLM providers via LlmConfig
|
||||
- Configuring LLM providers via LLMConfig
|
||||
- Using LLM to generate extraction schemas
|
||||
- JsonCssExtractionStrategy
|
||||
|
||||
@@ -406,9 +406,9 @@ async def llm_schema_generation():
|
||||
<div class="rating">4.7/5</div>
|
||||
</div>
|
||||
"""
|
||||
print("\n📊 Setting up LlmConfig...")
|
||||
print("\n📊 Setting up LLMConfig...")
|
||||
# Create LLM configuration
|
||||
llm_config = LlmConfig(
|
||||
llm_config = LLMConfig(
|
||||
provider="gemini/gemini-1.5-pro",
|
||||
api_token="env:GEMINI_API_KEY"
|
||||
)
|
||||
@@ -416,7 +416,7 @@ async def llm_schema_generation():
|
||||
print(" This would use the LLM to analyze HTML and create an extraction schema")
|
||||
schema = JsonCssExtractionStrategy.generate_schema(
|
||||
html=sample_html,
|
||||
llmConfig = llm_config,
|
||||
llm_config = llm_config,
|
||||
query="Extract product name and price"
|
||||
)
|
||||
print("\n✅ Generated Schema:")
|
||||
|
||||
@@ -71,7 +71,8 @@ We group them by category.
|
||||
| **`word_count_threshold`** | `int` (default: ~200) | Skips text blocks below X words. Helps ignore trivial sections. |
|
||||
| **`extraction_strategy`** | `ExtractionStrategy` (default: None) | If set, extracts structured data (CSS-based, LLM-based, etc.). |
|
||||
| **`markdown_generator`** | `MarkdownGenerationStrategy` (None) | If you want specialized markdown output (citations, filtering, chunking, etc.). |
|
||||
| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. |
|
||||
| **`css_selector`** | `str` (None) | Retains only the part of the page matching this selector. Affects the entire extraction process. |
|
||||
| **`target_elements`** | `List[str]` (None) | List of CSS selectors for elements to focus on for markdown generation and data extraction, while still processing the entire page for links, media, etc. Provides more flexibility than `css_selector`. |
|
||||
| **`excluded_tags`** | `list` (None) | Removes entire tags (e.g. `["script", "style"]`). |
|
||||
| **`excluded_selector`** | `str` (None) | Like `css_selector` but to exclude. E.g. `"#ads, .tracker"`. |
|
||||
| **`only_text`** | `bool` (False) | If `True`, tries to extract text-only content. |
|
||||
@@ -245,8 +246,8 @@ run_config = CrawlerRunConfig(
|
||||
)
|
||||
```
|
||||
|
||||
# 3. **LlmConfig** - Setting up LLM providers
|
||||
LlmConfig is useful to pass LLM provider config to strategies and functions that rely on LLMs to do extraction, filtering, schema generation etc. Currently it can be used in the following -
|
||||
# 3. **LLMConfig** - Setting up LLM providers
|
||||
LLMConfig is useful to pass LLM provider config to strategies and functions that rely on LLMs to do extraction, filtering, schema generation etc. Currently it can be used in the following -
|
||||
|
||||
1. LLMExtractionStrategy
|
||||
2. LLMContentFilter
|
||||
@@ -262,7 +263,7 @@ LlmConfig is useful to pass LLM provider config to strategies and functions that
|
||||
|
||||
## 3.2 Example Usage
|
||||
```python
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
|
||||
```
|
||||
|
||||
## 4. Putting It All Together
|
||||
@@ -270,7 +271,7 @@ llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI
|
||||
- **Use** `BrowserConfig` for **global** browser settings: engine, headless, proxy, user agent.
|
||||
- **Use** `CrawlerRunConfig` for each crawl’s **context**: how to filter content, handle caching, wait for dynamic elements, or run JS.
|
||||
- **Pass** both configs to `AsyncWebCrawler` (the `BrowserConfig`) and then to `arun()` (the `CrawlerRunConfig`).
|
||||
- **Use** `LlmConfig` for LLM provider configurations that can be used across all extraction, filtering, schema generation tasks. Can be used in - `LLMExtractionStrategy`, `LLMContentFilter`, `JsonCssExtractionStrategy.generate_schema` & `JsonXPathExtractionStrategy.generate_schema`
|
||||
- **Use** `LLMConfig` for LLM provider configurations that can be used across all extraction, filtering, schema generation tasks. Can be used in - `LLMExtractionStrategy`, `LLMContentFilter`, `JsonCssExtractionStrategy.generate_schema` & `JsonXPathExtractionStrategy.generate_schema`
|
||||
|
||||
```python
|
||||
# Create a modified copy with the clone() method
|
||||
|
||||
@@ -131,7 +131,7 @@ OverlappingWindowChunking(
|
||||
```python
|
||||
from pydantic import BaseModel
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
# Define schema
|
||||
class Article(BaseModel):
|
||||
@@ -141,7 +141,7 @@ class Article(BaseModel):
|
||||
|
||||
# Create strategy
|
||||
strategy = LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider="ollama/llama2"),
|
||||
llm_config = LLMConfig(provider="ollama/llama2"),
|
||||
schema=Article.schema(),
|
||||
instruction="Extract article details"
|
||||
)
|
||||
@@ -198,7 +198,7 @@ result = await crawler.arun(
|
||||
|
||||
```python
|
||||
from crawl4ai.chunking_strategy import OverlappingWindowChunking
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
# Create chunking strategy
|
||||
chunker = OverlappingWindowChunking(
|
||||
@@ -208,7 +208,7 @@ chunker = OverlappingWindowChunking(
|
||||
|
||||
# Use with extraction strategy
|
||||
strategy = LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider="ollama/llama2"),
|
||||
llm_config = LLMConfig(provider="ollama/llama2"),
|
||||
chunking_strategy=chunker
|
||||
)
|
||||
|
||||
|
||||
@@ -16,7 +16,7 @@ My dear friends and crawlers, there you go, this is the release of Crawl4AI v0.5
|
||||
* **Multiple Crawler Strategies:** Choose between the full-featured Playwright browser-based crawler or a new, *much* faster HTTP-only crawler for simpler tasks.
|
||||
* **Docker Deployment:** Deploy Crawl4AI as a scalable, self-contained service with built-in API endpoints and optional JWT authentication.
|
||||
* **Command-Line Interface (CLI):** Interact with Crawl4AI directly from your terminal. Crawl, configure, and extract data with simple commands.
|
||||
* **LLM Configuration (`LlmConfig`):** A new, unified way to configure LLM providers (OpenAI, Anthropic, Ollama, etc.) for extraction, filtering, and schema generation. Simplifies API key management and switching between models.
|
||||
* **LLM Configuration (`LLMConfig`):** A new, unified way to configure LLM providers (OpenAI, Anthropic, Ollama, etc.) for extraction, filtering, and schema generation. Simplifies API key management and switching between models.
|
||||
|
||||
**Minor Updates & Improvements:**
|
||||
|
||||
@@ -47,7 +47,7 @@ This release includes several breaking changes to improve the library's structur
|
||||
* **Config**: FastFilterChain has been replaced with FilterChain
|
||||
* **Deep-Crawl**: DeepCrawlStrategy.arun now returns Union[CrawlResultT, List[CrawlResultT], AsyncGenerator[CrawlResultT, None]]
|
||||
* **Proxy**: Removed synchronous WebCrawler support and related rate limiting configurations
|
||||
* **LLM Parameters:** Use the new `LlmConfig` object instead of passing `provider`, `api_token`, `base_url`, and `api_base` directly to `LLMExtractionStrategy` and `LLMContentFilter`.
|
||||
* **LLM Parameters:** Use the new `LLMConfig` object instead of passing `provider`, `api_token`, `base_url`, and `api_base` directly to `LLMExtractionStrategy` and `LLMContentFilter`.
|
||||
|
||||
**In short:** Update imports, adjust `arun_many()` usage, check for optional fields, and review the Docker deployment guide.
|
||||
|
||||
|
||||
@@ -251,7 +251,7 @@ from crawl4ai import (
|
||||
RoundRobinProxyStrategy,
|
||||
)
|
||||
import asyncio
|
||||
from crawl4ai.configs import ProxyConfig
|
||||
from crawl4ai.proxy_strategy import ProxyConfig
|
||||
async def main():
|
||||
# Load proxies and create rotation strategy
|
||||
proxies = ProxyConfig.from_env()
|
||||
@@ -305,13 +305,13 @@ asyncio.run(main())
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, DefaultMarkdownGenerator
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
import asyncio
|
||||
|
||||
llm_config = LlmConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
||||
llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
||||
|
||||
markdown_generator = DefaultMarkdownGenerator(
|
||||
content_filter=LLMContentFilter(llmConfig=llm_config, instruction="Extract key concepts and summaries")
|
||||
content_filter=LLMContentFilter(llm_config=llm_config, instruction="Extract key concepts and summaries")
|
||||
)
|
||||
|
||||
config = CrawlerRunConfig(markdown_generator=markdown_generator)
|
||||
@@ -335,13 +335,13 @@ asyncio.run(main())
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
llm_config = LlmConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
||||
llm_config = LLMConfig(provider="gemini/gemini-1.5-pro", api_token="env:GEMINI_API_KEY")
|
||||
|
||||
schema = JsonCssExtractionStrategy.generate_schema(
|
||||
html="<div class='product'><h2>Product Name</h2><span class='price'>$99</span></div>",
|
||||
llmConfig = llm_config,
|
||||
llm_config = llm_config,
|
||||
query="Extract product name and price"
|
||||
)
|
||||
print(schema)
|
||||
@@ -394,20 +394,20 @@ print(schema)
|
||||
serialization, especially for sets of allowed/blocked domains. No code changes
|
||||
required.
|
||||
|
||||
- **Added: New `LlmConfig` parameter.** This new parameter can be passed for
|
||||
- **Added: New `LLMConfig` parameter.** This new parameter can be passed for
|
||||
extraction, filtering, and schema generation tasks. It simplifies passing
|
||||
provider strings, API tokens, and base URLs across all sections where LLM
|
||||
configuration is necessary. It also enables reuse and allows for quick
|
||||
experimentation between different LLM configurations.
|
||||
|
||||
```python
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
# Example of using LlmConfig with LLMExtractionStrategy
|
||||
llm_config = LlmConfig(provider="openai/gpt-4o", api_token="YOUR_API_KEY")
|
||||
strategy = LLMExtractionStrategy(llmConfig=llm_config, schema=...)
|
||||
# Example of using LLMConfig with LLMExtractionStrategy
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o", api_token="YOUR_API_KEY")
|
||||
strategy = LLMExtractionStrategy(llm_config=llm_config, schema=...)
|
||||
|
||||
# Example usage within a crawler
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
@@ -418,7 +418,7 @@ print(schema)
|
||||
```
|
||||
**Breaking Change:** Removed old parameters like `provider`, `api_token`,
|
||||
`base_url`, and `api_base` from `LLMExtractionStrategy` and
|
||||
`LLMContentFilter`. Users should migrate to using the `LlmConfig` object.
|
||||
`LLMContentFilter`. Users should migrate to using the `LLMConfig` object.
|
||||
|
||||
- **Changed: Improved browser context management and added shared data support.
|
||||
(Breaking Change:** `BrowserContext` API updated). Browser contexts are now
|
||||
|
||||
@@ -4,7 +4,7 @@ Crawl4AI’s flexibility stems from two key classes:
|
||||
|
||||
1. **`BrowserConfig`** – Dictates **how** the browser is launched and behaves (e.g., headless or visible, proxy, user agent).
|
||||
2. **`CrawlerRunConfig`** – Dictates **how** each **crawl** operates (e.g., caching, extraction, timeouts, JavaScript code to run, etc.).
|
||||
3. **`LlmConfig`** - Dictates **how** LLM providers are configured. (model, api token, base url, temperature etc.)
|
||||
3. **`LLMConfig`** - Dictates **how** LLM providers are configured. (model, api token, base url, temperature etc.)
|
||||
|
||||
In most examples, you create **one** `BrowserConfig` for the entire crawler session, then pass a **fresh** or re-used `CrawlerRunConfig` whenever you call `arun()`. This tutorial shows the most commonly used parameters. If you need advanced or rarely used fields, see the [Configuration Parameters](../api/parameters.md).
|
||||
|
||||
@@ -239,7 +239,7 @@ The `clone()` method:
|
||||
|
||||
|
||||
|
||||
## 3. LlmConfig Essentials
|
||||
## 3. LLMConfig Essentials
|
||||
|
||||
### Key fields to note
|
||||
|
||||
@@ -256,16 +256,16 @@ The `clone()` method:
|
||||
- If your provider has a custom endpoint
|
||||
|
||||
```python
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv("OPENAI_API_KEY"))
|
||||
```
|
||||
|
||||
## 4. Putting It All Together
|
||||
|
||||
In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LlmConfig` depending on each call’s needs:
|
||||
In a typical scenario, you define **one** `BrowserConfig` for your crawler session, then create **one or more** `CrawlerRunConfig` & `LLMConfig` depending on each call’s needs:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
|
||||
async def main():
|
||||
@@ -289,14 +289,14 @@ async def main():
|
||||
|
||||
# 3) Example LLM content filtering
|
||||
|
||||
gemini_config = LlmConfig(
|
||||
gemini_config = LLMConfig(
|
||||
provider="gemini/gemini-1.5-pro"
|
||||
api_token = "env:GEMINI_API_TOKEN"
|
||||
)
|
||||
|
||||
# Initialize LLM filter with specific instruction
|
||||
filter = LLMContentFilter(
|
||||
llmConfig=gemini_config, # or your preferred provider
|
||||
llm_config=gemini_config, # or your preferred provider
|
||||
instruction="""
|
||||
Focus on extracting the core educational content.
|
||||
Include:
|
||||
@@ -343,7 +343,7 @@ if __name__ == "__main__":
|
||||
|
||||
For a **detailed list** of available parameters (including advanced ones), see:
|
||||
|
||||
- [BrowserConfig, CrawlerRunConfig & LlmConfig Reference](../api/parameters.md)
|
||||
- [BrowserConfig, CrawlerRunConfig & LLMConfig Reference](../api/parameters.md)
|
||||
|
||||
You can explore topics like:
|
||||
|
||||
@@ -356,7 +356,7 @@ You can explore topics like:
|
||||
|
||||
## 6. Conclusion
|
||||
|
||||
**BrowserConfig**, **CrawlerRunConfig** and **LlmConfig** give you straightforward ways to define:
|
||||
**BrowserConfig**, **CrawlerRunConfig** and **LLMConfig** give you straightforward ways to define:
|
||||
|
||||
- **Which** browser to launch, how it should run, and any proxy or user agent needs.
|
||||
- **How** each crawl should behave—caching, timeouts, JavaScript code, extraction strategies, etc.
|
||||
|
||||
@@ -8,6 +8,10 @@ Below, we show how to configure these parameters and combine them for precise co
|
||||
|
||||
## 1. CSS-Based Selection
|
||||
|
||||
There are two ways to select content from a page: using `css_selector` or the more flexible `target_elements`.
|
||||
|
||||
### 1.1 Using `css_selector`
|
||||
|
||||
A straightforward way to **limit** your crawl results to a certain region of the page is **`css_selector`** in **`CrawlerRunConfig`**:
|
||||
|
||||
```python
|
||||
@@ -32,6 +36,33 @@ if __name__ == "__main__":
|
||||
|
||||
**Result**: Only elements matching that selector remain in `result.cleaned_html`.
|
||||
|
||||
### 1.2 Using `target_elements`
|
||||
|
||||
The `target_elements` parameter provides more flexibility by allowing you to target **multiple elements** for content extraction while preserving the entire page context for other features:
|
||||
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig
|
||||
|
||||
async def main():
|
||||
config = CrawlerRunConfig(
|
||||
# Target article body and sidebar, but not other content
|
||||
target_elements=["article.main-content", "aside.sidebar"]
|
||||
)
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com/blog-post",
|
||||
config=config
|
||||
)
|
||||
print("Markdown focused on target elements")
|
||||
print("Links from entire page still available:", len(result.links.get("internal", [])))
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
**Key difference**: With `target_elements`, the markdown generation and structural data extraction focus on those elements, but other page elements (like links, images, and tables) are still extracted from the entire page. This gives you fine-grained control over what appears in your markdown content while preserving full page context for link analysis and media collection.
|
||||
|
||||
---
|
||||
|
||||
## 2. Content Filtering & Exclusions
|
||||
@@ -211,7 +242,7 @@ if __name__ == "__main__":
|
||||
import asyncio
|
||||
import json
|
||||
from pydantic import BaseModel, Field
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LlmConfig
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
|
||||
class ArticleData(BaseModel):
|
||||
@@ -220,7 +251,7 @@ class ArticleData(BaseModel):
|
||||
|
||||
async def main():
|
||||
llm_strategy = LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4",api_token="sk-YOUR_API_KEY")
|
||||
llm_config = LLMConfig(provider="openai/gpt-4",api_token="sk-YOUR_API_KEY")
|
||||
schema=ArticleData.schema(),
|
||||
extraction_type="schema",
|
||||
instruction="Extract 'headline' and a short 'summary' from the content."
|
||||
@@ -404,15 +435,59 @@ Stick to BeautifulSoup strategy (default) when:
|
||||
|
||||
---
|
||||
|
||||
## 7. Conclusion
|
||||
## 7. Combining CSS Selection Methods
|
||||
|
||||
By mixing **css_selector** scoping, **content filtering** parameters, and advanced **extraction strategies**, you can precisely **choose** which data to keep. Key parameters in **`CrawlerRunConfig`** for content selection include:
|
||||
You can combine `css_selector` and `target_elements` in powerful ways to achieve fine-grained control over your output:
|
||||
|
||||
1. **`css_selector`** – Basic scoping to an element or region.
|
||||
2. **`word_count_threshold`** – Skip short blocks.
|
||||
3. **`excluded_tags`** – Remove entire HTML tags.
|
||||
4. **`exclude_external_links`**, **`exclude_social_media_links`**, **`exclude_domains`** – Filter out unwanted links or domains.
|
||||
5. **`exclude_external_images`** – Remove images from external sources.
|
||||
6. **`process_iframes`** – Merge iframe content if needed.
|
||||
```python
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||
|
||||
async def main():
|
||||
# Target specific content but preserve page context
|
||||
config = CrawlerRunConfig(
|
||||
# Focus markdown on main content and sidebar
|
||||
target_elements=["#main-content", ".sidebar"],
|
||||
|
||||
# Global filters applied to entire page
|
||||
excluded_tags=["nav", "footer", "header"],
|
||||
exclude_external_links=True,
|
||||
|
||||
# Use basic content thresholds
|
||||
word_count_threshold=15,
|
||||
|
||||
cache_mode=CacheMode.BYPASS
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
result = await crawler.arun(
|
||||
url="https://example.com/article",
|
||||
config=config
|
||||
)
|
||||
|
||||
print(f"Content focuses on specific elements, but all links still analyzed")
|
||||
print(f"Internal links: {len(result.links.get('internal', []))}")
|
||||
print(f"External links: {len(result.links.get('external', []))}")
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(main())
|
||||
```
|
||||
|
||||
This approach gives you the best of both worlds:
|
||||
- Markdown generation and content extraction focus on the elements you care about
|
||||
- Links, images and other page data still give you the full context of the page
|
||||
- Content filtering still applies globally
|
||||
|
||||
## 8. Conclusion
|
||||
|
||||
By mixing **target_elements** or **css_selector** scoping, **content filtering** parameters, and advanced **extraction strategies**, you can precisely **choose** which data to keep. Key parameters in **`CrawlerRunConfig`** for content selection include:
|
||||
|
||||
1. **`target_elements`** – Array of CSS selectors to focus markdown generation and data extraction, while preserving full page context for links and media.
|
||||
2. **`css_selector`** – Basic scoping to an element or region for all extraction processes.
|
||||
3. **`word_count_threshold`** – Skip short blocks.
|
||||
4. **`excluded_tags`** – Remove entire HTML tags.
|
||||
5. **`exclude_external_links`**, **`exclude_social_media_links`**, **`exclude_domains`** – Filter out unwanted links or domains.
|
||||
6. **`exclude_external_images`** – Remove images from external sources.
|
||||
7. **`process_iframes`** – Merge iframe content if needed.
|
||||
|
||||
Combine these with structured extraction (CSS, LLM-based, or others) to build powerful crawls that yield exactly the content you want, from raw or cleaned HTML up to sophisticated JSON structures. For more detail, see [Configuration Reference](../api/parameters.md). Enjoy curating your data to the max!
|
||||
@@ -133,19 +133,28 @@ This approach is handy when you still want external links but need to block cert
|
||||
|
||||
### 3.1 Accessing `result.media`
|
||||
|
||||
By default, Crawl4AI collects images, audio, and video URLs it finds on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audio`).
|
||||
By default, Crawl4AI collects images, audio, video URLs, and data tables it finds on the page. These are stored in `result.media`, a dictionary keyed by media type (e.g., `images`, `videos`, `audio`, `tables`).
|
||||
|
||||
**Basic Example**:
|
||||
|
||||
```python
|
||||
if result.success:
|
||||
# Get images
|
||||
images_info = result.media.get("images", [])
|
||||
print(f"Found {len(images_info)} images in total.")
|
||||
for i, img in enumerate(images_info[:5]): # Inspect just the first 5
|
||||
for i, img in enumerate(images_info[:3]): # Inspect just the first 3
|
||||
print(f"[Image {i}] URL: {img['src']}")
|
||||
print(f" Alt text: {img.get('alt', '')}")
|
||||
print(f" Score: {img.get('score')}")
|
||||
print(f" Description: {img.get('desc', '')}\n")
|
||||
|
||||
# Get tables
|
||||
tables = result.media.get("tables", [])
|
||||
print(f"Found {len(tables)} data tables in total.")
|
||||
for i, table in enumerate(tables):
|
||||
print(f"[Table {i}] Caption: {table.get('caption', 'No caption')}")
|
||||
print(f" Columns: {len(table.get('headers', []))}")
|
||||
print(f" Rows: {len(table.get('rows', []))}")
|
||||
```
|
||||
|
||||
**Structure Example**:
|
||||
@@ -171,6 +180,19 @@ result.media = {
|
||||
],
|
||||
"audio": [
|
||||
# Similar structure but with audio-specific fields
|
||||
],
|
||||
"tables": [
|
||||
{
|
||||
"headers": ["Name", "Age", "Location"],
|
||||
"rows": [
|
||||
["John Doe", "34", "New York"],
|
||||
["Jane Smith", "28", "San Francisco"],
|
||||
["Alex Johnson", "42", "Chicago"]
|
||||
],
|
||||
"caption": "Employee Directory",
|
||||
"summary": "Directory of company employees"
|
||||
},
|
||||
# More tables if present
|
||||
]
|
||||
}
|
||||
```
|
||||
@@ -199,7 +221,53 @@ crawler_cfg = CrawlerRunConfig(
|
||||
|
||||
This setting attempts to discard images from outside the primary domain, keeping only those from the site you’re crawling.
|
||||
|
||||
### 3.3 Additional Media Config
|
||||
### 3.3 Working with Tables
|
||||
|
||||
Crawl4AI can detect and extract structured data from HTML tables. Tables are analyzed based on various criteria to determine if they are actual data tables (as opposed to layout tables), including:
|
||||
|
||||
- Presence of thead and tbody sections
|
||||
- Use of th elements for headers
|
||||
- Column consistency
|
||||
- Text density
|
||||
- And other factors
|
||||
|
||||
Tables that score above the threshold (default: 7) are extracted and stored in `result.media.tables`.
|
||||
|
||||
**Accessing Table Data**:
|
||||
|
||||
```python
|
||||
if result.success:
|
||||
tables = result.media.get("tables", [])
|
||||
print(f"Found {len(tables)} data tables on the page")
|
||||
|
||||
if tables:
|
||||
# Access the first table
|
||||
first_table = tables[0]
|
||||
print(f"Table caption: {first_table.get('caption', 'No caption')}")
|
||||
print(f"Headers: {first_table.get('headers', [])}")
|
||||
|
||||
# Print the first 3 rows
|
||||
for i, row in enumerate(first_table.get('rows', [])[:3]):
|
||||
print(f"Row {i+1}: {row}")
|
||||
```
|
||||
|
||||
**Configuring Table Extraction**:
|
||||
|
||||
You can adjust the sensitivity of the table detection algorithm with:
|
||||
|
||||
```python
|
||||
crawler_cfg = CrawlerRunConfig(
|
||||
table_score_threshold=5 # Lower value = more tables detected (default: 7)
|
||||
)
|
||||
```
|
||||
|
||||
Each extracted table contains:
|
||||
- `headers`: Column header names
|
||||
- `rows`: List of rows, each containing cell values
|
||||
- `caption`: Table caption text (if available)
|
||||
- `summary`: Table summary attribute (if specified)
|
||||
|
||||
### 3.4 Additional Media Config
|
||||
|
||||
- **`screenshot`**: Set to `True` if you want a full-page screenshot stored as `base64` in `result.screenshot`.
|
||||
- **`pdf`**: Set to `True` if you want a PDF version of the page in `result.pdf`.
|
||||
@@ -273,4 +341,11 @@ if __name__ == "__main__":
|
||||
|
||||
---
|
||||
|
||||
**That’s it for Link & Media Analysis!** You’re now equipped to filter out unwanted sites and zero in on the images and videos that matter for your project.
|
||||
**That’s it for Link & Media Analysis!** You’re now equipped to filter out unwanted sites and zero in on the images and videos that matter for your project.
|
||||
### Table Extraction Tips
|
||||
|
||||
- Not all HTML tables are extracted - only those detected as "data tables" vs. layout tables.
|
||||
- Tables with inconsistent cell counts, nested tables, or those used purely for layout may be skipped.
|
||||
- If you're missing tables, try adjusting the `table_score_threshold` to a lower value (default is 7).
|
||||
|
||||
The table detection algorithm scores tables based on features like consistent columns, presence of headers, text density, and more. Tables scoring above the threshold are considered data tables worth extracting.
|
||||
|
||||
@@ -175,13 +175,13 @@ prune_filter = PruningContentFilter(
|
||||
For intelligent content filtering and high-quality markdown generation, you can use the **LLMContentFilter**. This filter leverages LLMs to generate relevant markdown while preserving the original content's meaning and structure:
|
||||
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LlmConfig
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, LLMConfig
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def main():
|
||||
# Initialize LLM filter with specific instruction
|
||||
filter = LLMContentFilter(
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-api-token"), #or use environment variable
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-api-token"), #or use environment variable
|
||||
instruction="""
|
||||
Focus on extracting the core educational content.
|
||||
Include:
|
||||
|
||||
@@ -128,7 +128,7 @@ Crawl4AI can also extract structured data (JSON) using CSS or XPath selectors. B
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
# Generate a schema (one-time cost)
|
||||
html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</span></div>"
|
||||
@@ -136,13 +136,13 @@ html = "<div class='product'><h2>Gaming Laptop</h2><span class='price'>$999.99</
|
||||
# Using OpenAI (requires API token)
|
||||
schema = JsonCssExtractionStrategy.generate_schema(
|
||||
html,
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-openai-token") # Required for OpenAI
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-openai-token") # Required for OpenAI
|
||||
)
|
||||
|
||||
# Or using Ollama (open source, no token needed)
|
||||
schema = JsonCssExtractionStrategy.generate_schema(
|
||||
html,
|
||||
llmConfig = LlmConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
|
||||
llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
|
||||
)
|
||||
|
||||
# Use the schema for fast, repeated extractions
|
||||
@@ -211,7 +211,7 @@ import os
|
||||
import json
|
||||
import asyncio
|
||||
from pydantic import BaseModel, Field
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LlmConfig
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, LLMConfig
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
|
||||
class OpenAIModelFee(BaseModel):
|
||||
@@ -241,7 +241,7 @@ async def extract_structured_data_using_llm(
|
||||
word_count_threshold=1,
|
||||
page_timeout=80000,
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider=provider,api_token=api_token),
|
||||
llm_config = LLMConfig(provider=provider,api_token=api_token),
|
||||
schema=OpenAIModelFee.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="""From the crawled content, extract all mentioned model names along with their fees for input and output tokens.
|
||||
|
||||
@@ -71,7 +71,7 @@ Below is an overview of important LLM extraction parameters. All are typically s
|
||||
|
||||
```python
|
||||
extraction_strategy = LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"),
|
||||
llm_config = LLMConfig(provider="openai/gpt-4", api_token="YOUR_OPENAI_KEY"),
|
||||
schema=MyModel.model_json_schema(),
|
||||
extraction_type="schema",
|
||||
instruction="Extract a list of items from the text with 'name' and 'price' fields.",
|
||||
@@ -96,7 +96,7 @@ import asyncio
|
||||
import json
|
||||
from pydantic import BaseModel, Field
|
||||
from typing import List
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LlmConfig
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode, LLMConfig
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
|
||||
class Product(BaseModel):
|
||||
@@ -106,7 +106,7 @@ class Product(BaseModel):
|
||||
async def main():
|
||||
# 1. Define the LLM extraction strategy
|
||||
llm_strategy = LLMExtractionStrategy(
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o-mini", api_token=os.getenv('OPENAI_API_KEY')),
|
||||
schema=Product.schema_json(), # Or use model_json_schema()
|
||||
extraction_type="schema",
|
||||
instruction="Extract all product objects with 'name' and 'price' from the content.",
|
||||
|
||||
@@ -415,7 +415,7 @@ The schema generator is available as a static method on both `JsonCssExtractionS
|
||||
|
||||
```python
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
# Sample HTML with product information
|
||||
html = """
|
||||
@@ -435,14 +435,14 @@ html = """
|
||||
css_schema = JsonCssExtractionStrategy.generate_schema(
|
||||
html,
|
||||
schema_type="css",
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token="your-openai-token")
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o",api_token="your-openai-token")
|
||||
)
|
||||
|
||||
# Option 2: Using Ollama (open source, no token needed)
|
||||
xpath_schema = JsonXPathExtractionStrategy.generate_schema(
|
||||
html,
|
||||
schema_type="xpath",
|
||||
llmConfig = LlmConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
|
||||
llm_config = LLMConfig(provider="ollama/llama3.3", api_token=None) # Not needed for Ollama
|
||||
)
|
||||
|
||||
# Use the generated schema for fast, repeated extractions
|
||||
|
||||
78
docs/snippets/deep_crawl/1.intro.py
Normal file
78
docs/snippets/deep_crawl/1.intro.py
Normal file
@@ -0,0 +1,78 @@
|
||||
import asyncio
|
||||
from typing import List
|
||||
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
BFSDeepCrawlStrategy,
|
||||
CrawlResult,
|
||||
FilterChain,
|
||||
DomainFilter,
|
||||
URLPatternFilter,
|
||||
)
|
||||
|
||||
# Import necessary classes from crawl4ai library:
|
||||
# - AsyncWebCrawler: The main class for web crawling.
|
||||
# - CrawlerRunConfig: Configuration class for crawler behavior.
|
||||
# - BFSDeepCrawlStrategy: Breadth-First Search deep crawling strategy.
|
||||
# - CrawlResult: Data model for individual crawl results.
|
||||
# - FilterChain: Used to chain multiple URL filters.
|
||||
# - URLPatternFilter: Filter URLs based on patterns.
|
||||
# You had from crawl4ai.deep_crawling.filters import FilterChain, URLPatternFilter, which is also correct,
|
||||
# but for simplicity and consistency, we will use the direct import from crawl4ai in this example, as it is re-exported in __init__.py
|
||||
|
||||
async def basic_deep_crawl():
|
||||
"""
|
||||
Performs a basic deep crawl starting from a seed URL, demonstrating:
|
||||
- Breadth-First Search (BFS) deep crawling strategy.
|
||||
- Filtering URLs based on URL patterns.
|
||||
- Accessing crawl results and metadata.
|
||||
"""
|
||||
|
||||
# 1. Define URL Filters:
|
||||
# Create a URLPatternFilter to include only URLs containing "text".
|
||||
# This filter will be used to restrict crawling to URLs that are likely to contain textual content.
|
||||
url_filter = URLPatternFilter(
|
||||
patterns=[
|
||||
"*text*", # Include URLs that contain "text" in their path or URL
|
||||
]
|
||||
)
|
||||
|
||||
# Create a DomainFilter to allow only URLs from the "groq.com" domain and block URLs from the "example.com" domain.
|
||||
# This filter will be used to restrict crawling to URLs within the "groq.com" domain.
|
||||
domain_filter = DomainFilter(
|
||||
allowed_domains=["groq.com"],
|
||||
blocked_domains=["example.com"],
|
||||
)
|
||||
|
||||
# 2. Configure CrawlerRunConfig for Deep Crawling:
|
||||
# Configure CrawlerRunConfig to use BFSDeepCrawlStrategy for deep crawling.
|
||||
config = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=2, # Set the maximum depth of crawling to 2 levels from the start URL
|
||||
max_pages=10, # Limit the total number of pages to crawl to 10, to prevent excessive crawling
|
||||
include_external=False, # Set to False to only crawl URLs within the same domain as the start URL
|
||||
filter_chain=FilterChain(filters=[url_filter, domain_filter]), # Apply the URLPatternFilter and DomainFilter to filter URLs during deep crawl
|
||||
),
|
||||
verbose=True, # Enable verbose logging to see detailed output during crawling
|
||||
)
|
||||
|
||||
# 3. Initialize and Run AsyncWebCrawler:
|
||||
# Use AsyncWebCrawler as a context manager for automatic start and close.
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results: List[CrawlResult] = await crawler.arun(
|
||||
# url="https://docs.crawl4ai.com", # Uncomment to use crawl4ai documentation as start URL
|
||||
url="https://console.groq.com/docs", # Set the start URL for deep crawling to Groq documentation
|
||||
config=config, # Pass the configured CrawlerRunConfig to arun method
|
||||
)
|
||||
|
||||
# 4. Process and Print Crawl Results:
|
||||
# Iterate through the list of CrawlResult objects returned by the deep crawl.
|
||||
for result in results:
|
||||
# Print the URL and its crawl depth from the metadata for each crawled URL.
|
||||
print(f"URL: {result.url}, Depth: {result.metadata.get('depth', 0)}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import asyncio
|
||||
asyncio.run(basic_deep_crawl())
|
||||
162
docs/snippets/deep_crawl/2.filters.py
Normal file
162
docs/snippets/deep_crawl/2.filters.py
Normal file
@@ -0,0 +1,162 @@
|
||||
import asyncio
|
||||
from typing import List
|
||||
|
||||
from crawl4ai import (
|
||||
AsyncWebCrawler,
|
||||
CrawlerRunConfig,
|
||||
BFSDeepCrawlStrategy,
|
||||
CrawlResult,
|
||||
URLFilter, # Base class for filters, not directly used in examples but good to import for context
|
||||
ContentTypeFilter,
|
||||
DomainFilter,
|
||||
FilterChain,
|
||||
URLPatternFilter,
|
||||
SEOFilter # Advanced filter, can be introduced later or as bonus
|
||||
)
|
||||
|
||||
async def deep_crawl_filter_tutorial_part_2():
|
||||
"""
|
||||
Tutorial demonstrating URL filters in Crawl4AI, focusing on isolated filter behavior
|
||||
before integrating them into a deep crawl.
|
||||
|
||||
This tutorial covers:
|
||||
- Testing individual filters with synthetic URLs.
|
||||
- Understanding filter logic and behavior in isolation.
|
||||
- Combining filters using FilterChain.
|
||||
- Integrating filters into a deep crawling example.
|
||||
"""
|
||||
|
||||
# === Introduction: URL Filters in Isolation ===
|
||||
print("\n" + "=" * 40)
|
||||
print("=== Introduction: URL Filters in Isolation ===")
|
||||
print("=" * 40 + "\n")
|
||||
print("In this section, we will explore each filter individually using synthetic URLs.")
|
||||
print("This allows us to understand exactly how each filter works before using them in a crawl.\n")
|
||||
|
||||
|
||||
# === 2. ContentTypeFilter - Testing in Isolation ===
|
||||
print("\n" + "=" * 40)
|
||||
print("=== 2. ContentTypeFilter - Testing in Isolation ===")
|
||||
print("=" * 40 + "\n")
|
||||
|
||||
# 2.1. Create ContentTypeFilter:
|
||||
# Create a ContentTypeFilter to allow only 'text/html' and 'application/json' content types
|
||||
# BASED ON URL EXTENSIONS.
|
||||
content_type_filter = ContentTypeFilter(allowed_types=["text/html", "application/json"])
|
||||
print("ContentTypeFilter created, allowing types (by extension): ['text/html', 'application/json']")
|
||||
print("Note: ContentTypeFilter in Crawl4ai works by checking URL file extensions, not HTTP headers.")
|
||||
|
||||
|
||||
# 2.2. Synthetic URLs for Testing:
|
||||
# ContentTypeFilter checks URL extensions. We provide URLs with different extensions to test.
|
||||
test_urls_content_type = [
|
||||
"https://example.com/page.html", # Should pass: .html extension (text/html)
|
||||
"https://example.com/data.json", # Should pass: .json extension (application/json)
|
||||
"https://example.com/image.png", # Should reject: .png extension (not allowed type)
|
||||
"https://example.com/document.pdf", # Should reject: .pdf extension (not allowed type)
|
||||
"https://example.com/page", # Should pass: no extension (defaults to allow) - check default behaviour!
|
||||
"https://example.com/page.xhtml", # Should pass: .xhtml extension (text/html)
|
||||
]
|
||||
|
||||
# 2.3. Apply Filter and Show Results:
|
||||
print("\n=== Testing ContentTypeFilter (URL Extension based) ===")
|
||||
for url in test_urls_content_type:
|
||||
passed = content_type_filter.apply(url)
|
||||
result = "PASSED" if passed else "REJECTED"
|
||||
extension = ContentTypeFilter._extract_extension(url) # Show extracted extension for clarity
|
||||
print(f"- URL: {url} - {result} (Extension: '{extension or 'No Extension'}')")
|
||||
print("=" * 40)
|
||||
|
||||
input("Press Enter to continue to DomainFilter example...")
|
||||
|
||||
# === 3. DomainFilter - Testing in Isolation ===
|
||||
print("\n" + "=" * 40)
|
||||
print("=== 3. DomainFilter - Testing in Isolation ===")
|
||||
print("=" * 40 + "\n")
|
||||
|
||||
# 3.1. Create DomainFilter:
|
||||
domain_filter = DomainFilter(allowed_domains=["crawl4ai.com", "example.com"])
|
||||
print("DomainFilter created, allowing domains: ['crawl4ai.com', 'example.com']")
|
||||
|
||||
# 3.2. Synthetic URLs for Testing:
|
||||
test_urls_domain = [
|
||||
"https://docs.crawl4ai.com/api",
|
||||
"https://example.com/products",
|
||||
"https://another-website.org/blog",
|
||||
"https://sub.example.com/about",
|
||||
"https://crawl4ai.com.attacker.net", # Corrected example: now should be rejected
|
||||
]
|
||||
|
||||
# 3.3. Apply Filter and Show Results:
|
||||
print("\n=== Testing DomainFilter ===")
|
||||
for url in test_urls_domain:
|
||||
passed = domain_filter.apply(url)
|
||||
result = "PASSED" if passed else "REJECTED"
|
||||
print(f"- URL: {url} - {result}")
|
||||
print("=" * 40)
|
||||
|
||||
input("Press Enter to continue to FilterChain example...")
|
||||
|
||||
# === 4. FilterChain - Combining Filters ===
|
||||
print("\n" + "=" * 40)
|
||||
print("=== 4. FilterChain - Combining Filters ===")
|
||||
print("=" * 40 + "\n")
|
||||
|
||||
combined_filter = FilterChain(
|
||||
filters=[
|
||||
URLPatternFilter(patterns=["*api*"]),
|
||||
ContentTypeFilter(allowed_types=["text/html"]), # Still URL extension based
|
||||
DomainFilter(allowed_domains=["docs.crawl4ai.com"]),
|
||||
]
|
||||
)
|
||||
print("FilterChain created, combining URLPatternFilter, ContentTypeFilter, and DomainFilter.")
|
||||
|
||||
|
||||
test_urls_combined = [
|
||||
"https://docs.crawl4ai.com/api/async-webcrawler",
|
||||
"https://example.com/api/products",
|
||||
"https://docs.crawl4ai.com/core/crawling",
|
||||
"https://another-website.org/api/data",
|
||||
]
|
||||
|
||||
# 4.3. Apply FilterChain and Show Results
|
||||
print("\n=== Testing FilterChain (URLPatternFilter + ContentTypeFilter + DomainFilter) ===")
|
||||
for url in test_urls_combined:
|
||||
passed = await combined_filter.apply(url)
|
||||
result = "PASSED" if passed else "REJECTED"
|
||||
print(f"- URL: {url} - {result}")
|
||||
print("=" * 40)
|
||||
|
||||
input("Press Enter to continue to Deep Crawl with FilterChain example...")
|
||||
|
||||
# === 5. Deep Crawl with FilterChain ===
|
||||
print("\n" + "=" * 40)
|
||||
print("=== 5. Deep Crawl with FilterChain ===")
|
||||
print("=" * 40 + "\n")
|
||||
print("Finally, let's integrate the FilterChain into a deep crawl example.")
|
||||
|
||||
config_final_crawl = CrawlerRunConfig(
|
||||
deep_crawl_strategy=BFSDeepCrawlStrategy(
|
||||
max_depth=2,
|
||||
max_pages=10,
|
||||
include_external=False,
|
||||
filter_chain=combined_filter
|
||||
),
|
||||
verbose=False,
|
||||
)
|
||||
|
||||
async with AsyncWebCrawler() as crawler:
|
||||
results_final_crawl: List[CrawlResult] = await crawler.arun(
|
||||
url="https://docs.crawl4ai.com", config=config_final_crawl
|
||||
)
|
||||
|
||||
print("=== Crawled URLs (Deep Crawl with FilterChain) ===")
|
||||
for result in results_final_crawl:
|
||||
print(f"- {result.url}, Depth: {result.metadata.get('depth', 0)}")
|
||||
print("=" * 40)
|
||||
|
||||
print("\nTutorial Completed! Review the output of each section to understand URL filters.")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
asyncio.run(deep_crawl_filter_tutorial_part_2())
|
||||
@@ -42,7 +42,7 @@ dependencies = [
|
||||
"pyperclip>=1.8.2",
|
||||
"faust-cchardet>=2.1.19",
|
||||
"aiohttp>=3.11.11",
|
||||
"humanize>=4.10.0"
|
||||
"humanize>=4.10.0",
|
||||
]
|
||||
classifiers = [
|
||||
"Development Status :: 4 - Beta",
|
||||
@@ -78,7 +78,7 @@ crawl4ai-download-models = "crawl4ai.model_loader:main"
|
||||
crawl4ai-migrate = "crawl4ai.migrations:main"
|
||||
crawl4ai-setup = "crawl4ai.install:post_install"
|
||||
crawl4ai-doctor = "crawl4ai.install:doctor"
|
||||
crwl = "crawl4ai.cli:cli"
|
||||
crwl = "crawl4ai.cli:main"
|
||||
|
||||
[tool.setuptools]
|
||||
packages = {find = {where = ["."], include = ["crawl4ai*"]}}
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import os
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.content_filter_strategy import LLMContentFilter
|
||||
|
||||
async def test_llm_filter():
|
||||
@@ -23,7 +23,7 @@ async def test_llm_filter():
|
||||
|
||||
# Initialize LLM filter with focused instruction
|
||||
filter = LLMContentFilter(
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
||||
instruction="""
|
||||
Focus on extracting the core educational content about Python classes.
|
||||
Include:
|
||||
@@ -43,7 +43,7 @@ async def test_llm_filter():
|
||||
)
|
||||
|
||||
filter = LLMContentFilter(
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
||||
llm_config = LLMConfig(provider="openai/gpt-4o",api_token=os.getenv('OPENAI_API_KEY')),
|
||||
chunk_token_threshold=2 ** 12 * 2, # 2048 * 2
|
||||
instruction="""
|
||||
Extract the main educational content while preserving its original wording and substance completely. Your task is to:
|
||||
|
||||
@@ -10,6 +10,7 @@ import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.markdown_generation_strategy import DefaultMarkdownGenerator
|
||||
from crawl4ai.extraction_strategy import JsonCssExtractionStrategy, JsonXPathExtractionStrategy
|
||||
from crawl4ai.utils import preprocess_html_for_schema, JsonXPathExtractionStrategy
|
||||
import json
|
||||
|
||||
# Test HTML - A complex job board with companies, departments, and positions
|
||||
|
||||
@@ -7,7 +7,7 @@ import json
|
||||
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
|
||||
sys.path.append(parent_dir)
|
||||
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.async_webcrawler import AsyncWebCrawler
|
||||
from crawl4ai.chunking_strategy import RegexChunking
|
||||
from crawl4ai.extraction_strategy import LLMExtractionStrategy
|
||||
@@ -49,7 +49,7 @@ async def test_llm_extraction_strategy():
|
||||
async with AsyncWebCrawler(verbose=True) as crawler:
|
||||
url = "https://www.nbcnews.com/business"
|
||||
extraction_strategy = LLMExtractionStrategy(
|
||||
llmConfig=LlmConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
llm_config=LLMConfig(provider="openai/gpt-4o-mini",api_token=os.getenv("OPENAI_API_KEY")),
|
||||
instruction="Extract only content related to technology",
|
||||
)
|
||||
result = await crawler.arun(
|
||||
|
||||
4
tests/browser/docker/__init__.py
Normal file
4
tests/browser/docker/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
"""Docker browser strategy tests.
|
||||
|
||||
This package contains tests for the Docker browser strategy implementation.
|
||||
"""
|
||||
653
tests/browser/docker/test_docker_browser.py
Normal file
653
tests/browser/docker/test_docker_browser.py
Normal file
@@ -0,0 +1,653 @@
|
||||
"""Test examples for Docker Browser Strategy.
|
||||
|
||||
These examples demonstrate the functionality of Docker Browser Strategy
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import shutil
|
||||
import uuid
|
||||
import json
|
||||
from typing import List, Dict, Any, Optional, Tuple
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
from crawl4ai.browser.docker_config import DockerConfig
|
||||
from crawl4ai.browser.docker_registry import DockerRegistry
|
||||
from crawl4ai.browser.docker_utils import DockerUtils
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
# Global Docker utils instance
|
||||
docker_utils = DockerUtils(logger)
|
||||
|
||||
async def test_docker_components() -> bool:
    """Test Docker utilities, registry, and image building.

    This function tests the core Docker components before running the browser tests.
    It validates DockerRegistry, DockerUtils, and builds test images to ensure
    everything is functioning correctly.

    Steps, in order: registry persistence round-trip, port/hash utilities,
    Docker availability, image builds (connect + launch modes), and a full
    container lifecycle (create, exec, verify Chrome, remove).

    Returns:
        bool: True when every component check passed, False on the first failure.
    """
    logger.info("Testing Docker components", tag="SETUP")

    # Create a test registry directory (removed again in the finally block)
    registry_dir = os.path.join(os.path.dirname(__file__), "test_registry")
    registry_file = os.path.join(registry_dir, "test_registry.json")
    os.makedirs(registry_dir, exist_ok=True)

    try:
        # 1. Test DockerRegistry
        logger.info("Testing DockerRegistry...", tag="SETUP")
        registry = DockerRegistry(registry_file)

        # Test saving and loading registry
        test_container_id = "test-container-123"
        registry.register_container(test_container_id, 9876, "test-hash-123")
        registry.save()

        # Create a new registry instance that loads from the file —
        # verifies on-disk persistence, not just the in-memory state.
        registry2 = DockerRegistry(registry_file)
        port = registry2.get_container_host_port(test_container_id)
        hash_value = registry2.get_container_config_hash(test_container_id)

        if port != 9876 or hash_value != "test-hash-123":
            logger.error("DockerRegistry persistence failed", tag="SETUP")
            return False

        # Clean up test container from registry
        registry2.unregister_container(test_container_id)
        logger.success("DockerRegistry works correctly", tag="SETUP")

        # 2. Test DockerUtils
        logger.info("Testing DockerUtils...", tag="SETUP")

        # Test port detection
        in_use = docker_utils.is_port_in_use(22)  # SSH port is usually in use
        logger.info(f"Port 22 in use: {in_use}", tag="SETUP")

        # Get next available port (also used later for the test container)
        available_port = docker_utils.get_next_available_port(9000)
        logger.info(f"Next available port: {available_port}", tag="SETUP")

        # Test config hash generation
        config_dict = {"mode": "connect", "headless": True}
        config_hash = docker_utils.generate_config_hash(config_dict)
        logger.info(f"Generated config hash: {config_hash[:8]}...", tag="SETUP")

        # 3. Test Docker is available
        logger.info("Checking Docker availability...", tag="SETUP")
        if not await check_docker_available():
            logger.error("Docker is not available - cannot continue tests", tag="SETUP")
            return False

        # 4. Test building connect image
        logger.info("Building connect mode Docker image...", tag="SETUP")
        connect_image = await docker_utils.ensure_docker_image_exists(None, "connect")
        if not connect_image:
            logger.error("Failed to build connect mode image", tag="SETUP")
            return False
        logger.success(f"Successfully built connect image: {connect_image}", tag="SETUP")

        # 5. Test building launch image
        logger.info("Building launch mode Docker image...", tag="SETUP")
        launch_image = await docker_utils.ensure_docker_image_exists(None, "launch")
        if not launch_image:
            logger.error("Failed to build launch mode image", tag="SETUP")
            return False
        logger.success(f"Successfully built launch image: {launch_image}", tag="SETUP")

        # 6. Test creating and removing container
        logger.info("Testing container creation and removal...", tag="SETUP")
        container_id = await docker_utils.create_container(
            image_name=launch_image,
            host_port=available_port,
            container_name="crawl4ai-test-container"
        )

        if not container_id:
            logger.error("Failed to create test container", tag="SETUP")
            return False

        logger.info(f"Created test container: {container_id[:12]}", tag="SETUP")

        # Verify container is running; remove it on any failure below so a
        # failed run does not leave a stray container behind.
        running = await docker_utils.is_container_running(container_id)
        if not running:
            logger.error("Test container is not running", tag="SETUP")
            await docker_utils.remove_container(container_id)
            return False

        # Test commands in container
        logger.info("Testing command execution in container...", tag="SETUP")
        returncode, stdout, stderr = await docker_utils.exec_in_container(
            container_id, ["ls", "-la", "/"]
        )

        if returncode != 0:
            logger.error(f"Command execution failed: {stderr}", tag="SETUP")
            await docker_utils.remove_container(container_id)
            return False

        # Verify Chrome is installed in the container
        returncode, stdout, stderr = await docker_utils.exec_in_container(
            container_id, ["which", "google-chrome"]
        )

        if returncode != 0:
            logger.error("Chrome not found in container", tag="SETUP")
            await docker_utils.remove_container(container_id)
            return False

        chrome_path = stdout.strip()
        logger.info(f"Chrome found at: {chrome_path}", tag="SETUP")

        # Test Chrome version
        returncode, stdout, stderr = await docker_utils.exec_in_container(
            container_id, ["google-chrome", "--version"]
        )

        if returncode != 0:
            logger.error(f"Failed to get Chrome version: {stderr}", tag="SETUP")
            await docker_utils.remove_container(container_id)
            return False

        logger.info(f"Chrome version: {stdout.strip()}", tag="SETUP")

        # Remove test container
        removed = await docker_utils.remove_container(container_id)
        if not removed:
            logger.error("Failed to remove test container", tag="SETUP")
            return False

        logger.success("Test container removed successfully", tag="SETUP")

        # All components tested successfully
        logger.success("All Docker components tested successfully", tag="SETUP")
        return True

    except Exception as e:
        logger.error(f"Docker component tests failed: {str(e)}", tag="SETUP")
        return False
    finally:
        # Clean up registry test directory
        if os.path.exists(registry_dir):
            shutil.rmtree(registry_dir)
async def test_docker_connect_mode():
    """Test Docker browser in connect mode.

    This tests the basic functionality of creating a browser in Docker
    connect mode and using it for navigation: start, get a page, navigate
    to example.com, read the title, and close.

    Returns:
        bool: True if the test passed, False otherwise.
    """
    logger.info("Testing Docker browser in connect mode", tag="TEST")

    # Create temp directory for user data
    temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data")
    os.makedirs(temp_dir, exist_ok=True)

    # Initialized before the try block so the failure path can safely check
    # it even when BrowserManager construction itself raises (previously the
    # cleanup referenced `manager` unconditionally → potential NameError).
    manager = None

    try:
        # Create Docker configuration: ephemeral container, removed on exit
        docker_config = DockerConfig(
            mode="connect",
            persistent=False,
            remove_on_exit=True,
            user_data_dir=temp_dir
        )

        # Create browser configuration
        browser_config = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config
        )

        # Create browser manager
        manager = BrowserManager(browser_config=browser_config, logger=logger)

        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create crawler config
        crawler_config = CrawlerRunConfig(url="https://example.com")

        # Get a page
        page, context = await manager.get_page(crawler_config)
        logger.info("Got page successfully", tag="TEST")

        # Navigate to a website
        await page.goto("https://example.com")
        logger.info("Navigated to example.com", tag="TEST")

        # Get page title
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup; narrowed from a bare `except:` so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        if manager is not None:
            try:
                await manager.close()
            except Exception:
                pass
        return False
    finally:
        # Clean up the temp directory
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
async def test_docker_launch_mode():
    """Test Docker browser in launch mode.

    This tests launching a Chrome browser within a Docker container
    on demand with custom settings (text mode enabled).

    Returns:
        bool: True if the test passed, False otherwise.
    """
    logger.info("Testing Docker browser in launch mode", tag="TEST")

    # Create temp directory for user data
    temp_dir = os.path.join(os.path.dirname(__file__), "tmp_user_data_launch")
    os.makedirs(temp_dir, exist_ok=True)

    # Initialized before the try block so the failure path can safely check
    # it even when BrowserManager construction itself raises (previously the
    # cleanup referenced `manager` unconditionally → potential NameError).
    manager = None

    try:
        # Create Docker configuration: ephemeral container, removed on exit
        docker_config = DockerConfig(
            mode="launch",
            persistent=False,
            remove_on_exit=True,
            user_data_dir=temp_dir
        )

        # Create browser configuration
        browser_config = BrowserConfig(
            browser_mode="docker",
            headless=True,
            text_mode=True,  # Enable text mode for faster operation
            docker_config=docker_config
        )

        # Create browser manager
        manager = BrowserManager(browser_config=browser_config, logger=logger)

        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create crawler config
        crawler_config = CrawlerRunConfig(url="https://example.com")

        # Get a page
        page, context = await manager.get_page(crawler_config)
        logger.info("Got page successfully", tag="TEST")

        # Navigate to a website
        await page.goto("https://example.com")
        logger.info("Navigated to example.com", tag="TEST")

        # Get page title
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup; narrowed from a bare `except:` so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        if manager is not None:
            try:
                await manager.close()
            except Exception:
                pass
        return False
    finally:
        # Clean up the temp directory
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
async def test_docker_persistent_storage():
    """Test Docker browser with persistent storage.

    This tests creating localStorage data in one session and verifying
    it persists to another session when using persistent storage.

    Returns:
        bool: True when the localStorage value survives across sessions.
    """
    logger.info("Testing Docker browser with persistent storage", tag="TEST")

    # Create a unique temp directory so parallel/repeated runs don't collide
    test_id = uuid.uuid4().hex[:8]
    temp_dir = os.path.join(os.path.dirname(__file__), f"tmp_user_data_persist_{test_id}")
    os.makedirs(temp_dir, exist_ok=True)

    manager1 = None
    manager2 = None

    try:
        # Create Docker configuration with persistence
        docker_config = DockerConfig(
            mode="connect",
            persistent=True,  # Keep container running between sessions
            user_data_dir=temp_dir,
            container_user_data_dir="/data"
        )

        # Create browser configuration
        browser_config = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config
        )

        # Create first browser manager
        manager1 = BrowserManager(browser_config=browser_config, logger=logger)

        # Start the browser
        await manager1.start()
        logger.info("First browser started successfully", tag="TEST")

        # Create crawler config
        crawler_config = CrawlerRunConfig()

        # Get a page
        page1, context1 = await manager1.get_page(crawler_config)

        # Navigate to example.com
        await page1.goto("https://example.com")

        # Set localStorage item
        test_value = f"test_value_{test_id}"
        await page1.evaluate(f"localStorage.setItem('test_key', '{test_value}')")
        logger.info(f"Set localStorage test_key = {test_value}", tag="TEST")

        # Close the first browser manager (container keeps running)
        await manager1.close()
        logger.info("First browser closed", tag="TEST")

        # Create second browser manager with same config
        manager2 = BrowserManager(browser_config=browser_config, logger=logger)

        # Start the browser
        await manager2.start()
        logger.info("Second browser started successfully", tag="TEST")

        # Get a page
        page2, context2 = await manager2.get_page(crawler_config)

        # Navigate to same site
        await page2.goto("https://example.com")

        # Get localStorage item
        value = await page2.evaluate("localStorage.getItem('test_key')")
        logger.info(f"Retrieved localStorage test_key = {value}", tag="TEST")

        # Check if persistence worked
        if value == test_value:
            logger.success("Storage persistence verified!", tag="TEST")
        else:
            logger.error(f"Storage persistence failed! Expected {test_value}, got {value}", tag="TEST")

        # NOTE(review): the container was created with persistent=True and is
        # not explicitly removed here — confirm cleanup is handled elsewhere.
        # Clean up
        await manager2.close()
        logger.info("Second browser closed successfully", tag="TEST")

        return value == test_value
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Ensure cleanup of whichever managers were created. Narrowed from a
        # bare `except:` so KeyboardInterrupt/SystemExit are not swallowed.
        try:
            if manager1:
                await manager1.close()
            if manager2:
                await manager2.close()
        except Exception:
            pass
        return False
    finally:
        # Clean up the temp directory
        if os.path.exists(temp_dir):
            shutil.rmtree(temp_dir)
async def test_docker_parallel_pages():
    """Test Docker browser with parallel page creation.

    This tests the ability to create and use multiple pages in parallel
    from a single Docker browser instance.

    Returns:
        bool: True if the test passed, False otherwise.
    """
    logger.info("Testing Docker browser with parallel pages", tag="TEST")

    # Initialized before the try block so the failure path can safely check
    # it even when BrowserManager construction itself raises (previously the
    # cleanup referenced `manager` unconditionally → potential NameError).
    manager = None

    try:
        # Create Docker configuration: ephemeral container, removed on exit
        docker_config = DockerConfig(
            mode="connect",
            persistent=False,
            remove_on_exit=True
        )

        # Create browser configuration
        browser_config = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config
        )

        # Create browser manager
        manager = BrowserManager(browser_config=browser_config, logger=logger)

        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create crawler config
        crawler_config = CrawlerRunConfig()

        # Get multiple pages
        page_count = 3
        pages = await manager.get_pages(crawler_config, count=page_count)
        logger.info(f"Got {len(pages)} pages successfully", tag="TEST")

        if len(pages) != page_count:
            logger.error(f"Expected {page_count} pages, got {len(pages)}", tag="TEST")
            await manager.close()
            return False

        # Navigate to different sites with each page, concurrently
        tasks = []
        for i, (page, _) in enumerate(pages):
            tasks.append(page.goto(f"https://example.com?page={i}"))

        # Wait for all navigations to complete
        await asyncio.gather(*tasks)
        logger.info("All pages navigated successfully", tag="TEST")

        # Get titles from all pages
        titles = []
        for i, (page, _) in enumerate(pages):
            title = await page.title()
            titles.append(title)
            logger.info(f"Page {i+1} title: {title}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup; narrowed from a bare `except:` so that
        # KeyboardInterrupt/SystemExit are not swallowed.
        if manager is not None:
            try:
                await manager.close()
            except Exception:
                pass
        return False
async def test_docker_registry_reuse():
    """Test Docker container reuse via registry.

    This tests that containers with matching configurations
    are reused rather than creating new ones.

    Returns:
        bool: True when the second browser reused the first browser's container.
    """
    logger.info("Testing Docker container reuse via registry", tag="TEST")

    # Create registry for this test (removed again in the finally block)
    registry_dir = os.path.join(os.path.dirname(__file__), "registry_reuse_test")
    registry_file = os.path.join(registry_dir, "registry.json")
    os.makedirs(registry_dir, exist_ok=True)

    manager1 = None
    manager2 = None
    container_id1 = None

    try:
        # Create identical Docker configurations with custom registry
        docker_config1 = DockerConfig(
            mode="connect",
            persistent=True,  # Keep container running after closing
            registry_file=registry_file
        )

        # Create first browser configuration
        browser_config1 = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config1
        )

        # Create first browser manager
        manager1 = BrowserManager(browser_config=browser_config1, logger=logger)

        # Start the first browser
        await manager1.start()
        logger.info("First browser started successfully", tag="TEST")

        # Get container ID from the strategy
        docker_strategy1 = manager1._strategy
        container_id1 = docker_strategy1.container_id
        logger.info(f"First browser container ID: {container_id1[:12]}", tag="TEST")

        # Close the first manager but keep container running
        await manager1.close()
        logger.info("First browser closed", tag="TEST")

        # Create second Docker configuration identical to first
        docker_config2 = DockerConfig(
            mode="connect",
            persistent=True,
            registry_file=registry_file
        )

        # Create second browser configuration
        browser_config2 = BrowserConfig(
            browser_mode="docker",
            headless=True,
            docker_config=docker_config2
        )

        # Create second browser manager
        manager2 = BrowserManager(browser_config=browser_config2, logger=logger)

        # Start the second browser - should reuse existing container
        await manager2.start()
        logger.info("Second browser started successfully", tag="TEST")

        # Get container ID from the second strategy
        docker_strategy2 = manager2._strategy
        container_id2 = docker_strategy2.container_id
        logger.info(f"Second browser container ID: {container_id2[:12]}", tag="TEST")

        # Verify container reuse
        if container_id1 == container_id2:
            logger.success("Container reuse successful - using same container!", tag="TEST")
        else:
            logger.error("Container reuse failed - new container created!", tag="TEST")

        # Clean up: flip the config so closing manager2 removes the container
        docker_strategy2.docker_config.persistent = False
        docker_strategy2.docker_config.remove_on_exit = True
        await manager2.close()
        logger.info("Second browser closed and container removed", tag="TEST")

        return container_id1 == container_id2
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Ensure cleanup of managers and the container. Narrowed from a bare
        # `except:` so KeyboardInterrupt/SystemExit are not swallowed.
        try:
            if manager1:
                await manager1.close()
            if manager2:
                await manager2.close()
            # Make sure container is removed
            if container_id1:
                await docker_utils.remove_container(container_id1, force=True)
        except Exception:
            pass
        return False
    finally:
        # Clean up registry directory
        if os.path.exists(registry_dir):
            shutil.rmtree(registry_dir)
async def run_tests():
    """Run the full Docker Browser Strategy test suite and log a summary."""
    logger.info("Starting Docker Browser Strategy tests", tag="TEST")

    # Bail out early when Docker itself is not installed/usable.
    if not await check_docker_available():
        logger.error("Docker is not available - skipping tests", tag="TEST")
        return

    # Component-level checks must succeed before browser tests make sense.
    if not await test_docker_components():
        logger.error("Docker component tests failed - skipping browser tests", tag="TEST")
        return

    # Run each browser test in sequence, collecting pass/fail outcomes.
    browser_tests = (
        test_docker_connect_mode,
        test_docker_launch_mode,
        test_docker_persistent_storage,
        test_docker_parallel_pages,
        test_docker_registry_reuse,
    )
    results = [await test_fn() for test_fn in browser_tests]

    # Summarize outcomes.
    total = len(results)
    passed = sum(1 for outcome in results if outcome)
    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")

    if passed == total:
        logger.success("All tests passed!", tag="SUMMARY")
    else:
        logger.error(f"{total - passed} tests failed", tag="SUMMARY")
async def check_docker_available() -> bool:
    """Check if Docker is available on the system.

    Runs ``docker --version`` and requires both a zero exit code and
    non-empty version output.

    Returns:
        bool: True if Docker is available, False otherwise
    """
    try:
        proc = await asyncio.create_subprocess_exec(
            "docker", "--version",
            stdout=asyncio.subprocess.PIPE,
            stderr=asyncio.subprocess.PIPE
        )
        stdout, _ = await proc.communicate()
        # Wrap stdout in bool() so the function honors its declared return
        # type; previously it leaked the raw bytes object to callers.
        return proc.returncode == 0 and bool(stdout)
    except Exception:
        # Narrowed from a bare `except:`; covers FileNotFoundError when the
        # docker binary is missing, without swallowing KeyboardInterrupt.
        return False
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
190
tests/browser/test_browser_manager.py
Normal file
190
tests/browser/test_browser_manager.py
Normal file
@@ -0,0 +1,190 @@
|
||||
"""Test examples for BrowserManager.
|
||||
|
||||
These examples demonstrate the functionality of BrowserManager
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
from typing import List
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_basic_browser_manager():
    """Test basic BrowserManager functionality with default configuration.

    Starts a browser with the default config, opens a page, navigates to
    example.com, reads its title, and closes the browser.

    Returns:
        bool: True if the test passed, False otherwise.
    """
    logger.info("Starting test_basic_browser_manager", tag="TEST")

    # Initialized before the try block so the failure path can close a
    # browser that was already started when a later step raised — the
    # original leaked the browser process on any mid-test exception.
    manager = None

    try:
        # Create a browser manager with default config
        manager = BrowserManager(logger=logger)

        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Get a page
        crawler_config = CrawlerRunConfig(url="https://example.com")
        page, context = await manager.get_page(crawler_config)
        logger.info("Page created successfully", tag="TEST")

        # Navigate to a website
        await page.goto("https://example.com")
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Clean up
        await manager.close()
        logger.success("test_basic_browser_manager completed successfully", tag="TEST")
        return True
    except Exception as e:
        logger.error(f"test_basic_browser_manager failed: {str(e)}", tag="TEST")
        # Best-effort cleanup so a failed test does not leak a browser.
        if manager is not None:
            try:
                await manager.close()
            except Exception:
                pass
        return False
async def test_custom_browser_config():
    """Test BrowserManager with custom browser configuration.

    Uses an explicit chromium config (1280x800 viewport, light mode), opens
    a page, navigates, and reads back the viewport size from the page.

    Returns:
        bool: True if the test passed, False otherwise.
    """
    logger.info("Starting test_custom_browser_config", tag="TEST")

    # Initialized before the try block so the failure path can close a
    # browser that was already started when a later step raised — the
    # original leaked the browser process on any mid-test exception.
    manager = None

    try:
        # Create a custom browser config
        browser_config = BrowserConfig(
            browser_type="chromium",
            headless=True,
            viewport_width=1280,
            viewport_height=800,
            light_mode=True
        )

        # Create browser manager with the config
        manager = BrowserManager(browser_config=browser_config, logger=logger)

        # Start the browser
        await manager.start()
        logger.info("Browser started successfully with custom config", tag="TEST")

        # Get a page
        crawler_config = CrawlerRunConfig(url="https://example.com")
        page, context = await manager.get_page(crawler_config)

        # Navigate to a website
        await page.goto("https://example.com")
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Verify viewport size as seen by the page itself
        viewport_size = await page.evaluate("() => ({ width: window.innerWidth, height: window.innerHeight })")
        logger.info(f"Viewport size: {viewport_size}", tag="TEST")

        # Clean up
        await manager.close()
        logger.success("test_custom_browser_config completed successfully", tag="TEST")
        return True
    except Exception as e:
        logger.error(f"test_custom_browser_config failed: {str(e)}", tag="TEST")
        # Best-effort cleanup so a failed test does not leak a browser.
        if manager is not None:
            try:
                await manager.close()
            except Exception:
                pass
        return False
async def test_multiple_pages():
    """Test BrowserManager with multiple pages.

    Opens three pages against different sites from one browser instance and
    verifies each page loads and reports a title.

    Returns:
        bool: True if the test passed, False otherwise.
    """
    logger.info("Starting test_multiple_pages", tag="TEST")

    # Initialized before the try block so the failure path can close a
    # browser that was already started when a later step raised — the
    # original leaked the browser process on any mid-test exception.
    manager = None

    try:
        # Create browser manager
        manager = BrowserManager(logger=logger)

        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create multiple pages
        pages = []
        urls = ["https://example.com", "https://example.org", "https://mozilla.org"]

        for i, url in enumerate(urls):
            crawler_config = CrawlerRunConfig(url=url)
            page, context = await manager.get_page(crawler_config)
            await page.goto(url)
            pages.append((page, url))
            logger.info(f"Created page {i+1} for {url}", tag="TEST")

        # Verify all pages are loaded correctly
        for i, (page, url) in enumerate(pages):
            title = await page.title()
            logger.info(f"Page {i+1} title: {title}", tag="TEST")

        # Clean up
        await manager.close()
        logger.success("test_multiple_pages completed successfully", tag="TEST")
        return True
    except Exception as e:
        logger.error(f"test_multiple_pages failed: {str(e)}", tag="TEST")
        # Best-effort cleanup so a failed test does not leak a browser.
        if manager is not None:
            try:
                await manager.close()
            except Exception:
                pass
        return False
async def test_session_management():
    """Test session management in BrowserManager.

    Creates a page under a session id, fetches the same session again, logs
    whether the same page/context was returned, then kills the session.

    Returns:
        bool: True if the test passed, False otherwise.
    """
    logger.info("Starting test_session_management", tag="TEST")

    # Initialized before the try block so the failure path can close a
    # browser that was already started when a later step raised — the
    # original leaked the browser process on any mid-test exception.
    manager = None

    try:
        # Create browser manager
        manager = BrowserManager(logger=logger)

        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create a session
        session_id = "test_session_1"
        crawler_config = CrawlerRunConfig(url="https://example.com", session_id=session_id)
        page1, context1 = await manager.get_page(crawler_config)
        await page1.goto("https://example.com")
        logger.info(f"Created session with ID: {session_id}", tag="TEST")

        # Get the same session again
        page2, context2 = await manager.get_page(crawler_config)

        # Verify it's the same page/context
        is_same_page = page1 == page2
        is_same_context = context1 == context2
        logger.info(f"Same page: {is_same_page}, Same context: {is_same_context}", tag="TEST")

        # Kill the session
        await manager.kill_session(session_id)
        logger.info(f"Killed session with ID: {session_id}", tag="TEST")

        # Clean up
        await manager.close()
        logger.success("test_session_management completed successfully", tag="TEST")
        return True
    except Exception as e:
        logger.error(f"test_session_management failed: {str(e)}", tag="TEST")
        # Best-effort cleanup so a failed test does not leak a browser.
        if manager is not None:
            try:
                await manager.close()
            except Exception:
                pass
        return False
async def run_tests():
    """Run every BrowserManager test in sequence and log a summary."""
    test_funcs = (
        test_basic_browser_manager,
        test_custom_browser_config,
        test_multiple_pages,
        test_session_management,
    )

    # Collect one boolean outcome per test, in order.
    results = []
    for test_fn in test_funcs:
        results.append(await test_fn())

    # Summarize: booleans sum to the number of passing tests.
    total = len(results)
    passed = sum(results)
    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")

    if passed == total:
        logger.success("All tests passed!", tag="SUMMARY")
    else:
        logger.error(f"{total - passed} tests failed", tag="SUMMARY")
if __name__ == "__main__":
|
||||
asyncio.run(run_tests())
|
||||
808
tests/browser/test_builtin_browser.py
Normal file
808
tests/browser/test_builtin_browser.py
Normal file
@@ -0,0 +1,808 @@
|
||||
"""
|
||||
Test script for builtin browser functionality in the browser module.
|
||||
|
||||
This script tests:
|
||||
1. Creating a builtin browser
|
||||
2. Getting browser information
|
||||
3. Killing the browser
|
||||
4. Restarting the browser
|
||||
5. Testing operations with different browser strategies
|
||||
6. Testing edge cases
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import List, Dict, Any
|
||||
from colorama import Fore, Style, init
|
||||
|
||||
# Add the project root to the path for imports
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
|
||||
|
||||
from rich.console import Console
|
||||
from rich.table import Table
|
||||
from rich.panel import Panel
|
||||
from rich.text import Text
|
||||
from rich.box import Box, SIMPLE
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.browser.strategies import BuiltinBrowserStrategy
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Initialize colorama for cross-platform colored terminal output
|
||||
init()
|
||||
|
||||
# Define colors for pretty output
|
||||
SUCCESS = Fore.GREEN
|
||||
WARNING = Fore.YELLOW
|
||||
ERROR = Fore.RED
|
||||
INFO = Fore.CYAN
|
||||
RESET = Fore.RESET
|
||||
|
||||
# Create logger
|
||||
logger = AsyncLogger(verbose=True)
|
||||
|
||||
|
||||
async def test_builtin_browser_creation():
    """Create a builtin browser via BrowserManager and hand it back for reuse.

    Returns:
        tuple: ``(manager, cdp_url)`` on success. On failure returns
        ``(None, None)`` — previously these paths returned a bare ``None``,
        which crashed any caller that unpacked the result with
        ``manager, cdp_url = await test_builtin_browser_creation()``.
    """
    print(f"\n{INFO}========== Testing Builtin Browser Creation =========={RESET}")

    # Step 1: Create a BrowserManager with builtin mode
    print(f"\n{INFO}1. Creating BrowserManager with builtin mode{RESET}")
    browser_config = BrowserConfig(browser_mode="builtin", headless=True, verbose=True)
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    # Step 2: builtin mode must resolve to BuiltinBrowserStrategy
    print(f"\n{INFO}2. Checking if we have a BuiltinBrowserStrategy{RESET}")
    if isinstance(manager._strategy, BuiltinBrowserStrategy):
        print(
            f"{SUCCESS}Correct strategy type: {manager._strategy.__class__.__name__}{RESET}"
        )
    else:
        print(
            f"{ERROR}Wrong strategy type: {manager._strategy.__class__.__name__}{RESET}"
        )
        # Uniform 2-tuple so unpacking callers never raise on failure.
        return None, None

    # Step 3: Start the manager to launch or connect to the builtin browser
    print(f"\n{INFO}3. Starting the browser manager{RESET}")
    try:
        await manager.start()
        print(f"{SUCCESS}Browser manager started successfully{RESET}")
    except Exception as e:
        print(f"{ERROR}Failed to start browser manager: {str(e)}{RESET}")
        return None, None

    # Step 4: Get browser info from the strategy
    print(f"\n{INFO}4. Getting browser information{RESET}")
    browser_info = manager._strategy.get_builtin_browser_info()
    if browser_info:
        print(f"{SUCCESS}Browser info retrieved:{RESET}")
        for key, value in browser_info.items():
            if key != "config":  # Skip the verbose config section
                print(f"  {key}: {value}")

        cdp_url = browser_info.get("cdp_url")
        print(f"{SUCCESS}CDP URL: {cdp_url}{RESET}")
    else:
        print(f"{ERROR}Failed to get browser information{RESET}")
        cdp_url = None

    # Hand the live manager (and CDP endpoint) back for the follow-up tests.
    return manager, cdp_url
|
||||
|
||||
async def test_page_operations(manager: BrowserManager):
    """Exercise single- and multi-page creation against the builtin browser.

    Returns True when both the single-page and batched-page round trips
    succeed; False as soon as either step fails.
    """
    print(
        f"\n{INFO}========== Testing Page Operations with Builtin Browser =========={RESET}"
    )

    # --- Single page: open, navigate, read title, close ---
    print(f"\n{INFO}1. Getting a single page{RESET}")
    try:
        run_cfg = CrawlerRunConfig()
        page, context = await manager.get_page(run_cfg)
        print(f"{SUCCESS}Got page successfully{RESET}")

        await page.goto("https://example.com")
        title = await page.title()
        print(f"{SUCCESS}Page title: {title}{RESET}")

        await page.close()
        print(f"{SUCCESS}Page closed successfully{RESET}")
    except Exception as e:
        print(f"{ERROR}Page operation failed: {str(e)}{RESET}")
        return False

    # --- Batch: request three pages at once via get_pages() ---
    print(f"\n{INFO}2. Getting multiple pages with get_pages(){RESET}")
    try:
        batch_cfg = CrawlerRunConfig()
        pages = await manager.get_pages(batch_cfg, count=3)
        print(f"{SUCCESS}Got {len(pages)} pages{RESET}")

        # Navigate each page to a distinct URL, then close it.
        for i, (page, context) in enumerate(pages):
            await page.goto(f"https://example.com?test={i}")
            title = await page.title()
            print(f"{SUCCESS}Page {i + 1} title: {title}{RESET}")
            await page.close()

        print(f"{SUCCESS}All pages tested and closed successfully{RESET}")
    except Exception as e:
        print(f"{ERROR}Multiple page operation failed: {str(e)}{RESET}")
        return False

    return True
|
||||
|
||||
async def test_browser_status_management(manager: BrowserManager):
    """Check status reporting, then kill and relaunch the builtin browser.

    Returns True when every management step succeeds, False otherwise.
    """
    print(f"\n{INFO}========== Testing Browser Status and Management =========={RESET}")

    strategy = manager._strategy

    # 1) Status while the browser is up.
    print(f"\n{INFO}1. Getting browser status{RESET}")
    try:
        status = await strategy.get_builtin_browser_status()
        print(f"{SUCCESS}Browser status:{RESET}")
        print(f"  Running: {status['running']}")
        print(f"  CDP URL: {status['cdp_url']}")
    except Exception as e:
        print(f"{ERROR}Failed to get browser status: {str(e)}{RESET}")
        return False

    # 2) Kill the browser process.
    print(f"\n{INFO}2. Testing killing the browser{RESET}")
    try:
        killed = await strategy.kill_builtin_browser()
        if killed:
            print(f"{SUCCESS}Browser killed successfully{RESET}")
        else:
            print(f"{ERROR}Failed to kill browser{RESET}")
    except Exception as e:
        print(f"{ERROR}Browser kill operation failed: {str(e)}{RESET}")
        return False

    # 3) Status must now report not-running.
    print(f"\n{INFO}3. Checking status after kill{RESET}")
    try:
        status = await strategy.get_builtin_browser_status()
        if status["running"]:
            print(f"{ERROR}Browser is incorrectly reported as still running{RESET}")
        else:
            print(f"{SUCCESS}Browser is correctly reported as not running{RESET}")
    except Exception as e:
        print(f"{ERROR}Failed to get browser status: {str(e)}{RESET}")
        return False

    # 4) Relaunch a fresh builtin browser.
    print(f"\n{INFO}4. Launching a new browser{RESET}")
    try:
        cdp_url = await strategy.launch_builtin_browser(
            browser_type="chromium", headless=True
        )
        if cdp_url:
            print(f"{SUCCESS}New browser launched at: {cdp_url}{RESET}")
        else:
            print(f"{ERROR}Failed to launch new browser{RESET}")
            return False
    except Exception as e:
        print(f"{ERROR}Browser launch failed: {str(e)}{RESET}")
        return False

    return True
|
||||
|
||||
async def test_multiple_managers():
    """Verify two BrowserManagers attach to the same builtin browser.

    Returns:
        bool: True when both managers start, drive pages, and close cleanly.
    """
    print(f"\n{INFO}========== Testing Multiple Browser Managers =========={RESET}")

    # Step 1: Create first manager.
    print(f"\n{INFO}1. Creating first browser manager{RESET}")
    # BUG FIX: a stray trailing comma previously wrapped this config in a
    # 1-tuple, so BrowserManager received a tuple instead of a BrowserConfig.
    browser_config1 = BrowserConfig(browser_mode="builtin", headless=True)
    manager1 = BrowserManager(browser_config=browser_config1, logger=logger)

    # Step 2: Create second manager with an equivalent config.
    print(f"\n{INFO}2. Creating second browser manager{RESET}")
    browser_config2 = BrowserConfig(browser_mode="builtin", headless=True)
    manager2 = BrowserManager(browser_config=browser_config2, logger=logger)

    # Step 3: Start both; they should connect to the same builtin browser.
    print(f"\n{INFO}3. Starting both managers{RESET}")
    try:
        await manager1.start()
        print(f"{SUCCESS}First manager started{RESET}")

        await manager2.start()
        print(f"{SUCCESS}Second manager started{RESET}")

        # Compare CDP endpoints to confirm both attached to one browser.
        cdp_url1 = manager1._strategy.config.cdp_url
        cdp_url2 = manager2._strategy.config.cdp_url

        if cdp_url1 == cdp_url2:
            print(
                f"{SUCCESS}Both managers connected to the same browser: {cdp_url1}{RESET}"
            )
        else:
            print(
                f"{WARNING}Managers connected to different browsers: {cdp_url1} and {cdp_url2}{RESET}"
            )
    except Exception as e:
        print(f"{ERROR}Failed to start managers: {str(e)}{RESET}")
        return False

    # Step 4: Each manager drives its own page independently.
    print(f"\n{INFO}4. Testing operations with both managers{RESET}")
    try:
        page1, ctx1 = await manager1.get_page(CrawlerRunConfig())
        await page1.goto("https://example.com")
        title1 = await page1.title()
        print(f"{SUCCESS}Manager 1 page title: {title1}{RESET}")

        page2, ctx2 = await manager2.get_page(CrawlerRunConfig())
        await page2.goto("https://example.org")
        title2 = await page2.title()
        print(f"{SUCCESS}Manager 2 page title: {title2}{RESET}")

        # Clean up the pages before closing the managers.
        await page1.close()
        await page2.close()
    except Exception as e:
        print(f"{ERROR}Failed to use both managers: {str(e)}{RESET}")
        return False

    # Step 5: Close both managers (the builtin browser itself stays alive).
    print(f"\n{INFO}5. Closing both managers{RESET}")
    try:
        await manager1.close()
        print(f"{SUCCESS}First manager closed{RESET}")

        await manager2.close()
        print(f"{SUCCESS}Second manager closed{RESET}")
    except Exception as e:
        print(f"{ERROR}Failed to close managers: {str(e)}{RESET}")
        return False

    return True
|
||||
|
||||
async def test_edge_cases():
    """Test edge cases like multiple starts, killing browser during operations, etc.

    Returns True when both edge-case scenarios behave acceptably, False on an
    unexpected failure.
    """
    print(f"\n{INFO}========== Testing Edge Cases =========={RESET}")

    # Step 1: Test multiple starts with the same manager
    print(f"\n{INFO}1. Testing multiple starts with the same manager{RESET}")
    browser_config = BrowserConfig(browser_mode="builtin", headless=True)
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        await manager.start()
        print(f"{SUCCESS}First start successful{RESET}")

        # Try to start again — a second start() on an already-started manager
        # is expected to be harmless (no exception).
        await manager.start()
        print(f"{SUCCESS}Second start completed without errors{RESET}")

        # Test if it's still functional after the double start
        page, context = await manager.get_page(CrawlerRunConfig())
        await page.goto("https://example.com")
        title = await page.title()
        print(
            f"{SUCCESS}Page operations work after multiple starts. Title: {title}{RESET}"
        )
        await page.close()
    except Exception as e:
        print(f"{ERROR}Multiple starts test failed: {str(e)}{RESET}")
        return False
    finally:
        # Always release the manager, even when the scenario failed.
        await manager.close()

    # Step 2: Test killing the browser while manager is active
    print(f"\n{INFO}2. Testing killing the browser while manager is active{RESET}")
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        await manager.start()
        print(f"{SUCCESS}Manager started{RESET}")

        # Kill the browser directly via the strategy, behind the manager's back
        print(f"{INFO}Killing the browser...{RESET}")
        await manager._strategy.kill_builtin_browser()
        print(f"{SUCCESS}Browser killed{RESET}")

        # Try to get a page (should fail or launch a new browser).
        # Either outcome is acceptable: an auto-restart (reported as a
        # warning) or a clean failure (reported as the expected result).
        try:
            page, context = await manager.get_page(CrawlerRunConfig())
            print(
                f"{WARNING}Page request succeeded despite killed browser (might have auto-restarted){RESET}"
            )
            title = await page.title()
            print(f"{SUCCESS}Got page title: {title}{RESET}")
            await page.close()
        except Exception as e:
            print(
                f"{SUCCESS}Page request failed as expected after browser was killed: {str(e)}{RESET}"
            )
    except Exception as e:
        print(f"{ERROR}Kill during operation test failed: {str(e)}{RESET}")
        return False
    finally:
        await manager.close()

    return True
|
||||
|
||||
async def cleanup_browsers():
    """Kill any builtin browsers left behind by earlier tests.

    Best-effort: failures are reported but never raised, so this is safe to
    call from a ``finally`` block.
    """
    print(f"\n{INFO}========== Cleaning Up Builtin Browsers =========={RESET}")

    browser_config = BrowserConfig(browser_mode="builtin", headless=True)
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        # No need to start the manager; the strategy can kill the browser directly.
        strategy = manager._strategy
        if isinstance(strategy, BuiltinBrowserStrategy):
            result = await strategy.kill_builtin_browser()
            if result:
                print(f"{SUCCESS}Successfully killed all builtin browsers{RESET}")
            else:
                print(f"{WARNING}No builtin browsers found to kill{RESET}")
        else:
            print(f"{ERROR}Wrong strategy type: {strategy.__class__.__name__}{RESET}")
    except Exception as e:
        print(f"{ERROR}Cleanup failed: {str(e)}{RESET}")
    finally:
        # Close the manager just to be safe. BUG FIX: the bare `except:` here
        # previously also swallowed KeyboardInterrupt/SystemExit; catch only
        # Exception so Ctrl-C still propagates.
        try:
            await manager.close()
        except Exception:
            pass
|
||||
|
||||
async def test_performance_scaling():
    """Test performance with multiple browsers and pages.

    This test creates multiple browsers on different ports,
    spawns multiple pages per browser, and measures performance metrics.

    NOTE(review): this test is interactive — it calls ``input()`` twice —
    so it cannot run unattended in CI.
    """
    print(f"\n{INFO}========== Testing Performance Scaling =========={RESET}")

    # Configuration parameters (hard-coded here; see
    # test_performance_scaling_lab for the parameterized variant)
    num_browsers = 10
    pages_per_browser = 10
    total_pages = num_browsers * pages_per_browser
    base_port = 9222  # first CDP debugging port; browser i uses base_port + i

    # Set up a measuring mechanism for memory
    import psutil
    import gc

    # Force garbage collection before starting so the baseline is stable
    gc.collect()
    process = psutil.Process()
    initial_memory = process.memory_info().rss / 1024 / 1024  # in MB
    peak_memory = initial_memory

    # Report initial configuration
    print(
        f"{INFO}Test configuration: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls{RESET}"
    )

    # List to track managers
    managers: List[BrowserManager] = []
    all_pages = []

    # Get crawl4ai home directory; each browser gets its own profile dir here
    crawl4ai_home = os.path.expanduser("~/.crawl4ai")
    temp_dir = os.path.join(crawl4ai_home, "temp")
    os.makedirs(temp_dir, exist_ok=True)

    # Create all managers but don't start them yet
    manager_configs = []
    for i in range(num_browsers):
        port = base_port + i
        browser_config = BrowserConfig(
            browser_mode="builtin",
            headless=True,
            debugging_port=port,
            user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"),
        )
        manager = BrowserManager(browser_config=browser_config, logger=logger)
        # NOTE(review): flag set before start — presumably suppresses the
        # strategy's keep-alive/registration behavior for throwaway browsers;
        # confirm against BuiltinBrowserStrategy.
        manager._strategy.shutting_down = True
        manager_configs.append((manager, i, port))

    # Define async function to start a single manager; returns None on failure
    # so gather() results can be filtered.
    async def start_manager(manager, index, port):
        try:
            await manager.start()
            return manager
        except Exception as e:
            print(
                f"{ERROR}Failed to start browser {index + 1} on port {port}: {str(e)}{RESET}"
            )
            return None

    # Start all managers in parallel
    start_tasks = [
        start_manager(manager, i, port) for manager, i, port in manager_configs
    ]
    started_managers = await asyncio.gather(*start_tasks)

    # Filter out None values (failed starts) and add to managers list
    managers = [m for m in started_managers if m is not None]

    if len(managers) == 0:
        print(f"{ERROR}All browser managers failed to start. Aborting test.{RESET}")
        return False

    if len(managers) < num_browsers:
        print(
            f"{WARNING}Only {len(managers)} out of {num_browsers} browser managers started successfully{RESET}"
        )

    # Create pages for each browser (sequentially, one batch per browser)
    for i, manager in enumerate(managers):
        try:
            pages = await manager.get_pages(CrawlerRunConfig(), count=pages_per_browser)
            all_pages.extend(pages)
        except Exception as e:
            print(f"{ERROR}Failed to create pages for browser {i + 1}: {str(e)}{RESET}")

    # Check memory after page creation
    gc.collect()
    current_memory = process.memory_info().rss / 1024 / 1024
    peak_memory = max(peak_memory, current_memory)

    # Ask for confirmation before loading (interactive checkpoint)
    confirmation = input(
        f"{WARNING}Do you want to proceed with loading pages? (y/n): {RESET}"
    )
    # Timing starts after the prompt, so the human wait is excluded from
    # total_test_time.
    start_time = time.time()

    if confirmation.lower() == "y":
        load_start_time = time.time()

        # Function to load a single page; errors are returned as strings so
        # one failed page does not abort the whole gather.
        async def load_page(page_ctx, index):
            page, _ = page_ctx
            try:
                await page.goto(f"https://example.com/page{index}", timeout=30000)
                title = await page.title()
                return title
            except Exception as e:
                return f"Error: {str(e)}"

        # Load all pages concurrently
        load_tasks = [load_page(page_ctx, i) for i, page_ctx in enumerate(all_pages)]
        load_results = await asyncio.gather(*load_tasks, return_exceptions=True)

        # Count successes and failures
        successes = sum(
            1 for r in load_results if isinstance(r, str) and not r.startswith("Error")
        )
        # NOTE(review): `failures` is computed but never reported.
        failures = len(load_results) - successes

        # NOTE(review): `load_time` is unused — total_test_time is reported instead.
        load_time = time.time() - load_start_time
        total_test_time = time.time() - start_time

        # Check memory after loading (peak memory)
        gc.collect()
        current_memory = process.memory_info().rss / 1024 / 1024
        peak_memory = max(peak_memory, current_memory)

        # Calculate key metrics (guarded against zero successes / zero time)
        memory_per_page = peak_memory / successes if successes > 0 else 0
        time_per_crawl = total_test_time / successes if successes > 0 else 0
        crawls_per_second = successes / total_test_time if total_test_time > 0 else 0
        crawls_per_minute = crawls_per_second * 60
        crawls_per_hour = crawls_per_minute * 60

        # Print simplified performance summary
        from rich.console import Console
        from rich.table import Table

        console = Console()

        # Create a simple summary table
        table = Table(title="CRAWL4AI PERFORMANCE SUMMARY")

        table.add_column("Metric", style="cyan")
        table.add_column("Value", style="green")

        table.add_row("Total Crawls Completed", f"{successes}")
        table.add_row("Total Time", f"{total_test_time:.2f} seconds")
        table.add_row("Time Per Crawl", f"{time_per_crawl:.2f} seconds")
        table.add_row("Crawling Speed", f"{crawls_per_second:.2f} crawls/second")
        table.add_row("Projected Rate (1 minute)", f"{crawls_per_minute:.0f} crawls")
        table.add_row("Projected Rate (1 hour)", f"{crawls_per_hour:.0f} crawls")
        table.add_row("Peak Memory Usage", f"{peak_memory:.2f} MB")
        table.add_row("Memory Per Crawl", f"{memory_per_page:.2f} MB")

        # Display the table
        console.print(table)

    # Ask confirmation before cleanup (second interactive checkpoint)
    confirmation = input(
        f"{WARNING}Do you want to proceed with cleanup? (y/n): {RESET}"
    )
    if confirmation.lower() != "y":
        print(f"{WARNING}Cleanup aborted by user{RESET}")
        return False

    # Close all pages (best-effort)
    for page, _ in all_pages:
        try:
            await page.close()
        except:
            pass

    # Close all managers (best-effort)
    for manager in managers:
        try:
            await manager.close()
        except:
            pass

    # Remove the temp directory with the throwaway browser profiles
    import shutil

    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)

    return True
|
||||
|
||||
async def test_performance_scaling_lab(num_browsers: int = 10, pages_per_browser: int = 10):
    """Test performance with multiple browsers and pages.

    Parameterized variant of :func:`test_performance_scaling`: creates
    ``num_browsers`` browsers on consecutive CDP ports, opens
    ``pages_per_browser`` pages on each, loads them all concurrently, and
    reports timing/memory metrics.

    NOTE(review): interactive — calls ``input()`` twice, so it cannot run
    unattended in CI.

    Args:
        num_browsers: Number of browser instances to launch.
        pages_per_browser: Pages opened per browser instance.

    Returns:
        bool: True on completed run, False on abort or total start failure.
    """
    print(f"\n{INFO}========== Testing Performance Scaling =========={RESET}")

    # Derived configuration (the no-op self-assignments of the parameters
    # were removed).
    total_pages = num_browsers * pages_per_browser
    base_port = 9222  # first CDP debugging port; browser i uses base_port + i

    # Set up a measuring mechanism for memory
    import psutil
    import gc

    # Force garbage collection before starting so the baseline is stable
    gc.collect()
    process = psutil.Process()
    initial_memory = process.memory_info().rss / 1024 / 1024  # in MB
    peak_memory = initial_memory

    # Report initial configuration
    print(
        f"{INFO}Test configuration: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls{RESET}"
    )

    # List to track managers and every (page, context) pair created
    managers: List[BrowserManager] = []
    all_pages = []

    # Get crawl4ai home directory; each browser gets its own profile dir here
    crawl4ai_home = os.path.expanduser("~/.crawl4ai")
    temp_dir = os.path.join(crawl4ai_home, "temp")
    os.makedirs(temp_dir, exist_ok=True)

    # Create all managers but don't start them yet
    manager_configs = []
    for i in range(num_browsers):
        port = base_port + i
        browser_config = BrowserConfig(
            browser_mode="builtin",
            headless=True,
            debugging_port=port,
            user_data_dir=os.path.join(temp_dir, f"browser_profile_{i}"),
        )
        manager = BrowserManager(browser_config=browser_config, logger=logger)
        # NOTE(review): flag set before start — presumably suppresses the
        # strategy's keep-alive/registration behavior for throwaway browsers;
        # confirm against BuiltinBrowserStrategy.
        manager._strategy.shutting_down = True
        manager_configs.append((manager, i, port))

    # Async helper to start a single manager; returns None on failure so the
    # gather() results can be filtered.
    async def start_manager(manager, index, port):
        try:
            await manager.start()
            return manager
        except Exception as e:
            print(
                f"{ERROR}Failed to start browser {index + 1} on port {port}: {str(e)}{RESET}"
            )
            return None

    # Start all managers in parallel
    start_tasks = [
        start_manager(manager, i, port) for manager, i, port in manager_configs
    ]
    started_managers = await asyncio.gather(*start_tasks)

    # Filter out None values (failed starts)
    managers = [m for m in started_managers if m is not None]

    if len(managers) == 0:
        print(f"{ERROR}All browser managers failed to start. Aborting test.{RESET}")
        return False

    if len(managers) < num_browsers:
        print(
            f"{WARNING}Only {len(managers)} out of {num_browsers} browser managers started successfully{RESET}"
        )

    # Create pages for each browser (one batch per browser)
    for i, manager in enumerate(managers):
        try:
            pages = await manager.get_pages(CrawlerRunConfig(), count=pages_per_browser)
            all_pages.extend(pages)
        except Exception as e:
            print(f"{ERROR}Failed to create pages for browser {i + 1}: {str(e)}{RESET}")

    # Check memory after page creation
    gc.collect()
    current_memory = process.memory_info().rss / 1024 / 1024
    peak_memory = max(peak_memory, current_memory)

    # Ask for confirmation before loading (interactive checkpoint)
    confirmation = input(
        f"{WARNING}Do you want to proceed with loading pages? (y/n): {RESET}"
    )
    # Timing starts after the prompt, so the human wait is excluded from
    # total_test_time.
    start_time = time.time()

    if confirmation.lower() == "y":
        # Helper to load a single page; errors are returned as strings so one
        # failed page does not abort the whole gather.
        async def load_page(page_ctx, index):
            page, _ = page_ctx
            try:
                await page.goto(f"https://example.com/page{index}", timeout=30000)
                title = await page.title()
                return title
            except Exception as e:
                return f"Error: {str(e)}"

        # Load all pages concurrently
        load_tasks = [load_page(page_ctx, i) for i, page_ctx in enumerate(all_pages)]
        load_results = await asyncio.gather(*load_tasks, return_exceptions=True)

        # Count successful loads (the unused `failures`/`load_time` locals
        # from the original were removed).
        successes = sum(
            1 for r in load_results if isinstance(r, str) and not r.startswith("Error")
        )

        total_test_time = time.time() - start_time

        # Check memory after loading (peak memory)
        gc.collect()
        current_memory = process.memory_info().rss / 1024 / 1024
        peak_memory = max(peak_memory, current_memory)

        # Calculate key metrics (guarded against zero successes / zero time)
        memory_per_page = peak_memory / successes if successes > 0 else 0
        time_per_crawl = total_test_time / successes if successes > 0 else 0
        crawls_per_second = successes / total_test_time if total_test_time > 0 else 0
        crawls_per_minute = crawls_per_second * 60
        crawls_per_hour = crawls_per_minute * 60

        # Print simplified performance summary
        from rich.console import Console
        from rich.table import Table

        console = Console()

        # Create a simple summary table
        table = Table(title="CRAWL4AI PERFORMANCE SUMMARY")

        table.add_column("Metric", style="cyan")
        table.add_column("Value", style="green")

        table.add_row("Total Crawls Completed", f"{successes}")
        table.add_row("Total Time", f"{total_test_time:.2f} seconds")
        table.add_row("Time Per Crawl", f"{time_per_crawl:.2f} seconds")
        table.add_row("Crawling Speed", f"{crawls_per_second:.2f} crawls/second")
        table.add_row("Projected Rate (1 minute)", f"{crawls_per_minute:.0f} crawls")
        table.add_row("Projected Rate (1 hour)", f"{crawls_per_hour:.0f} crawls")
        table.add_row("Peak Memory Usage", f"{peak_memory:.2f} MB")
        table.add_row("Memory Per Crawl", f"{memory_per_page:.2f} MB")

        # Display the table
        console.print(table)

    # Ask confirmation before cleanup (second interactive checkpoint)
    confirmation = input(
        f"{WARNING}Do you want to proceed with cleanup? (y/n): {RESET}"
    )
    if confirmation.lower() != "y":
        print(f"{WARNING}Cleanup aborted by user{RESET}")
        return False

    # Close all pages (best-effort; narrowed from bare `except:`)
    for page, _ in all_pages:
        try:
            await page.close()
        except Exception:
            pass

    # Close all managers (best-effort)
    for manager in managers:
        try:
            await manager.close()
        except Exception:
            pass

    # Remove the temp directory with the throwaway browser profiles
    import shutil

    if os.path.exists(temp_dir):
        shutil.rmtree(temp_dir)

    return True
|
||||
|
||||
|
||||
async def main():
    """Run the builtin-browser test suite.

    Most scenarios below are currently disabled (commented out); only the
    performance-scaling test runs. Cleanup of leftover builtin browsers is
    always performed in the ``finally`` block.
    """
    try:
        print(f"{INFO}Starting builtin browser tests with browser module{RESET}")

        # # Run browser creation test
        # manager, cdp_url = await test_builtin_browser_creation()
        # if not manager:
        #     print(f"{ERROR}Browser creation failed, cannot continue tests{RESET}")
        #     return

        # # Run page operations test
        # await test_page_operations(manager)

        # # Run browser status and management test
        # await test_browser_status_management(manager)

        # # Close manager before multiple manager test
        # await manager.close()

        # Run multiple managers test
        # await test_multiple_managers()

        # Run performance scaling test (the only scenario currently enabled)
        await test_performance_scaling()
        # Run cleanup test
        # await cleanup_browsers()

        # Run edge cases test
        # await test_edge_cases()

        print(f"\n{SUCCESS}All tests completed!{RESET}")

    except Exception as e:
        print(f"\n{ERROR}Test failed with error: {str(e)}{RESET}")
        import traceback

        traceback.print_exc()
    finally:
        # Clean up: kill any remaining builtin browsers
        await cleanup_browsers()
        print(f"{SUCCESS}Test cleanup complete{RESET}")
|
||||
|
||||
# Script entry point: run the full async test suite.
if __name__ == "__main__":
    asyncio.run(main())
||||
160
tests/browser/test_builtin_strategy.py
Normal file
160
tests/browser/test_builtin_strategy.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""Test examples for BuiltinBrowserStrategy.
|
||||
|
||||
These examples demonstrate the functionality of BuiltinBrowserStrategy
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
# When executed directly, make the project root importable.
if __name__ == "__main__":
    _project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), '../..'))
    sys.path.insert(0, _project_root)
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_builtin_browser():
    """Test using a builtin browser that persists between sessions.

    Returns:
        bool: True when both sessions connect and browse successfully.
    """
    logger.info("Testing builtin browser", tag="TEST")

    browser_config = BrowserConfig(
        browser_mode="builtin",
        headless=True
    )

    manager = BrowserManager(browser_config=browser_config, logger=logger)
    manager2 = None  # created later; tracked so it is closed on failure too

    try:
        # Start should connect to existing builtin browser or create one
        await manager.start()
        logger.info("Connected to builtin browser", tag="TEST")

        # Test page creation
        crawler_config = CrawlerRunConfig()
        page, context = await manager.get_page(crawler_config)

        # Test navigation
        await page.goto("https://example.com")
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Close manager (should not close the builtin browser)
        await manager.close()
        logger.info("First session closed", tag="TEST")

        # Create a second manager to verify browser persistence
        logger.info("Creating second session to verify persistence", tag="TEST")
        manager2 = BrowserManager(browser_config=browser_config, logger=logger)

        await manager2.start()
        logger.info("Connected to existing builtin browser", tag="TEST")

        page2, context2 = await manager2.get_page(crawler_config)
        await page2.goto("https://example.org")
        title2 = await page2.title()
        logger.info(f"Second session page title: {title2}", tag="TEST")

        await manager2.close()
        logger.info("Second session closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # BUG FIX: previously only `manager` was closed on failure, leaking
        # the second session if the error occurred after it was created.
        for session in (manager, manager2):
            if session is None:
                continue
            try:
                await session.close()
            except Exception:
                pass
        return False
||||
|
||||
async def test_builtin_browser_status():
    """Check that the builtin browser reports status correctly and persists across strategies.

    Flow: read status before start, start the browser, read status again,
    exercise a page, close the strategy (the shared builtin browser should
    stay alive), verify a fresh strategy still sees it running, then kill it.

    Returns:
        bool: True when the browser persisted after close AND was killed cleanly.
    """
    logger.info("Testing builtin browser status", tag="TEST")

    from crawl4ai.browser.strategies import BuiltinBrowserStrategy

    browser_config = BrowserConfig(
        browser_mode="builtin",
        headless=True
    )

    # Create the strategy directly so we can call its status/kill helpers.
    strategy = BuiltinBrowserStrategy(browser_config, logger)

    try:
        # Status before starting (expected: not running).
        status_before = await strategy.get_builtin_browser_status()
        logger.info(f"Initial status: {status_before}", tag="TEST")

        # Start the browser.
        await strategy.start()
        logger.info("Browser started successfully", tag="TEST")

        # Status after starting.
        status_after = await strategy.get_builtin_browser_status()
        logger.info(f"Status after start: {status_after}", tag="TEST")

        # Smoke-test page creation/navigation to confirm the browser works.
        crawler_config = CrawlerRunConfig()
        page, context = await strategy.get_page(crawler_config)
        await page.goto("https://example.com")
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Closing the strategy must NOT kill the shared builtin browser.
        await strategy.close()
        logger.info("Strategy closed successfully", tag="TEST")

        # A brand-new strategy object should still see the browser running.
        strategy2 = BuiltinBrowserStrategy(browser_config, logger)
        status_final = await strategy2.get_builtin_browser_status()
        logger.info(f"Final status: {status_final}", tag="TEST")

        is_running = status_final.get('running', False)
        logger.info(f"Builtin browser persistence confirmed: {is_running}", tag="TEST")

        # Clean up: actually kill the builtin browser.
        logger.info("Killing builtin browser", tag="TEST")
        success = await strategy2.kill_builtin_browser()
        logger.info(f"Killed builtin browser successfully: {success}", tag="TEST")

        return is_running and success
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup. Keep the two steps in independent try blocks:
        # the original lumped them in a single bare `except:` block, so a
        # failing close() skipped the kill attempt (and Ctrl-C was swallowed).
        try:
            await strategy.close()
        except Exception:
            pass
        try:
            strategy2 = BuiltinBrowserStrategy(browser_config, logger)
            await strategy2.kill_builtin_browser()
        except Exception:
            pass
        return False
|
||||
|
||||
async def run_tests():
    """Execute every builtin-browser test in order and log a pass/fail summary."""
    outcomes = [
        await test_builtin_browser(),
        await test_builtin_browser_status(),
    ]

    # Summarize: each test returns True/False, so sum() counts passes.
    total, passed = len(outcomes), sum(outcomes)
    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")

    if passed == total:
        logger.success("All tests passed!", tag="SUMMARY")
    else:
        logger.error(f"{total - passed} tests failed", tag="SUMMARY")


if __name__ == "__main__":
    asyncio.run(run_tests())
|
||||
227
tests/browser/test_cdp_strategy.py
Normal file
227
tests/browser/test_cdp_strategy.py
Normal file
@@ -0,0 +1,227 @@
|
||||
"""Test examples for CDPBrowserStrategy.
|
||||
|
||||
These examples demonstrate the functionality of CDPBrowserStrategy
|
||||
and serve as functional tests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_cdp_launch_connect():
    """Launch a managed browser, connect over CDP, and exercise several pages.

    Returns:
        bool: True when all pages were created and titled successfully.
    """
    logger.info("Testing launch and connect via CDP", tag="TEST")

    browser_config = BrowserConfig(
        use_managed_browser=True,
        headless=True
    )

    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        await manager.start()
        logger.info("Browser launched and connected via CDP", tag="TEST")

        # Open several pages to confirm the CDP connection handles multiple tabs.
        pages = []
        for i in range(3):
            crawler_config = CrawlerRunConfig()
            page, context = await manager.get_page(crawler_config)
            await page.goto(f"https://example.com?test={i}")
            pages.append(page)
            logger.info(f"Created page {i+1}", tag="TEST")

        # Verify every page is still responsive.
        for i, page in enumerate(pages):
            title = await page.title()
            logger.info(f"Page {i+1} title: {title}", tag="TEST")

        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
        # not swallowed during best-effort cleanup.
        try:
            await manager.close()
        except Exception:
            pass
        return False
|
||||
|
||||
async def test_cdp_with_user_data_dir():
    """Verify that a CDP browser persists state through a user data directory.

    Sets a cookie in one browser session, restarts a new browser with the
    same user_data_dir, and checks the cookie survived.

    Returns:
        bool: True when the cookie was set and persisted across sessions.
    """
    # Hoisted to one place; the original imported shutil twice (success and
    # failure paths) and duplicated the rmtree cleanup in both branches.
    import shutil
    import tempfile

    logger.info("Testing CDP browser with user data directory", tag="TEST")

    # Temporary profile directory, removed in the finally block below.
    user_data_dir = tempfile.mkdtemp(prefix="crawl4ai-test-")
    logger.info(f"Created temporary user data directory: {user_data_dir}", tag="TEST")

    browser_config = BrowserConfig(
        use_managed_browser=True,
        headless=True,
        user_data_dir=user_data_dir
    )

    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        await manager.start()
        logger.info("Browser launched with user data directory", tag="TEST")

        crawler_config = CrawlerRunConfig()
        page, context = await manager.get_page(crawler_config)

        # Store a cookie that should be written into the profile directory.
        await context.add_cookies([{
            "name": "test_cookie",
            "value": "test_value",
            "url": "https://example.com"
        }])

        await page.goto("https://example.com")

        # Verify the cookie was set in this session.
        cookies = await context.cookies(["https://example.com"])
        has_test_cookie = any(cookie["name"] == "test_cookie" for cookie in cookies)
        logger.info(f"Cookie set successfully: {has_test_cookie}", tag="TEST")

        await manager.close()
        logger.info("First browser session closed", tag="TEST")

        # Second session against the same profile: the cookie must persist.
        logger.info("Starting second browser session with same user data directory", tag="TEST")
        manager2 = BrowserManager(browser_config=browser_config, logger=logger)
        await manager2.start()

        page2, context2 = await manager2.get_page(crawler_config)
        await page2.goto("https://example.com")

        cookies2 = await context2.cookies(["https://example.com"])
        has_test_cookie2 = any(cookie["name"] == "test_cookie" for cookie in cookies2)
        logger.info(f"Cookie persisted across sessions: {has_test_cookie2}", tag="TEST")

        await manager2.close()
        # Original used an f-string with no placeholder here; plain string now.
        logger.info("Removed temporary user data directory", tag="TEST")

        return has_test_cookie and has_test_cookie2
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Narrowed from a bare `except:` during best-effort cleanup.
        try:
            await manager.close()
        except Exception:
            pass
        return False
    finally:
        # Remove the temporary profile on success and failure alike.
        shutil.rmtree(user_data_dir, ignore_errors=True)
|
||||
|
||||
async def test_cdp_session_management():
    """Exercise named-session reuse and teardown on a CDP browser.

    Creates two sessions, verifies the first session's page object is reused
    and its localStorage survives, kills it, and checks that the second
    session keeps working.

    Returns:
        bool: True when page reuse and per-session storage behaved as expected.
    """
    logger.info("Testing session management with CDP browser", tag="TEST")

    browser_config = BrowserConfig(
        use_managed_browser=True,
        headless=True
    )

    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        await manager.start()
        logger.info("Browser launched successfully", tag="TEST")

        session1_id = "test_session_1"
        session2_id = "test_session_2"

        # Session 1: navigate and stash a marker in localStorage.
        crawler_config1 = CrawlerRunConfig(session_id=session1_id)
        page1, context1 = await manager.get_page(crawler_config1)
        await page1.goto("https://example.com")
        await page1.evaluate("localStorage.setItem('session1_data', 'test_value')")
        logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST")

        # Session 2: independent page and marker.
        crawler_config2 = CrawlerRunConfig(session_id=session2_id)
        page2, context2 = await manager.get_page(crawler_config2)
        await page2.goto("https://example.org")
        await page2.evaluate("localStorage.setItem('session2_data', 'test_value2')")
        logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST")

        # Requesting session 1 again must return the same live page object,
        # with its stored data intact.
        page1_again, _ = await manager.get_page(crawler_config1)
        is_same_page = page1 == page1_again
        data1 = await page1_again.evaluate("localStorage.getItem('session1_data')")
        logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST")

        # Killing session 1 must not disturb session 2.
        await manager.kill_session(session1_id)
        # Original used an f-string with no placeholder here; plain string now.
        logger.info("Killed session 1", tag="TEST")

        data2 = await page2.evaluate("localStorage.getItem('session2_data')")
        logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST")

        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return is_same_page and data1 == "test_value" and data2 == "test_value2"
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Narrowed from a bare `except:` so Ctrl-C is not swallowed during cleanup.
        try:
            await manager.close()
        except Exception:
            pass
        return False
|
||||
|
||||
async def run_tests():
    """Run the enabled CDP strategy tests and report a pass/fail summary."""
    # Disabled tests kept for manual runs:
    # results.append(await test_cdp_launch_connect())
    # results.append(await test_cdp_with_user_data_dir())
    results = [await test_cdp_session_management()]

    # Each test returns True/False, so sum() counts the passes.
    passed, total = sum(results), len(results)
    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")

    if passed == total:
        logger.success("All tests passed!", tag="SUMMARY")
    else:
        logger.error(f"{total - passed} tests failed", tag="SUMMARY")


if __name__ == "__main__":
    asyncio.run(run_tests())
|
||||
77
tests/browser/test_combined.py
Normal file
77
tests/browser/test_combined.py
Normal file
@@ -0,0 +1,77 @@
|
||||
"""Combined test runner for all browser module tests.
|
||||
|
||||
This script runs all the browser module tests in sequence and
|
||||
provides a comprehensive summary.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def run_test_module(module_name, header):
    """Import tests.browser.<module_name>, run its `run_tests`, and return the elapsed seconds."""
    banner = '-' * 30
    logger.info(f"\n{banner}", tag="TEST")
    logger.info(f"RUNNING: {header}", tag="TEST")
    logger.info(f"{banner}", tag="TEST")

    # Dynamic import keeps the module list in main() data-driven.
    module = __import__(f"tests.browser.{module_name}", fromlist=["run_tests"])

    # Time the full test run for the performance summary.
    started = time.time()
    await module.run_tests()
    elapsed = time.time() - started

    logger.info(f"Time taken: {elapsed:.2f} seconds", tag="TIMING")
    return elapsed
|
||||
|
||||
async def main():
    """Run every browser test module in sequence and print a timing summary."""
    logger.info("STARTING COMPREHENSIVE BROWSER MODULE TESTS", tag="MAIN")

    # (module file name, human-readable header) pairs, executed in this order.
    test_modules = [
        ("test_browser_manager", "Browser Manager Tests"),
        ("test_playwright_strategy", "Playwright Strategy Tests"),
        ("test_cdp_strategy", "CDP Strategy Tests"),
        ("test_builtin_strategy", "Builtin Browser Strategy Tests"),
        ("test_profiles", "Profile Management Tests")
    ]

    # Record per-module wall-clock time; a missing key means the module failed to run.
    durations = {}
    for module_name, header in test_modules:
        try:
            durations[module_name] = await run_test_module(module_name, header)
        except Exception as e:
            logger.error(f"Error running {module_name}: {str(e)}", tag="ERROR")

    separator = '-' * 50
    logger.info("\n\nTEST SUMMARY:", tag="SUMMARY")
    logger.info(f"{separator}", tag="SUMMARY")
    for module_name, header in test_modules:
        if module_name in durations:
            logger.info(f"{header}: {durations[module_name]:.2f} seconds", tag="SUMMARY")
        else:
            logger.error(f"{header}: FAILED TO RUN", tag="SUMMARY")
    logger.info(f"{separator}", tag="SUMMARY")
    logger.info(f"Total time: {sum(durations.values()):.2f} seconds", tag="SUMMARY")


if __name__ == "__main__":
    asyncio.run(main())
|
||||
17
tests/browser/test_launch_standalone.py
Normal file
17
tests/browser/test_launch_standalone.py
Normal file
@@ -0,0 +1,17 @@
|
||||
from crawl4ai.browser_profiler import BrowserProfiler
|
||||
import asyncio
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Manual smoke test: launch a standalone CDP browser and print its endpoint.
    # Test launching a standalone browser
    async def test_standalone_browser():
        # Launches a detached browser process with a fixed debugging port and
        # a dedicated profile directory, then reports the CDP URL to connect to.
        profiler = BrowserProfiler()
        # NOTE(review): the "~" in user_data_dir is passed through as-is —
        # confirm launch_standalone_browser expands it (os.path.expanduser).
        cdp_url = await profiler.launch_standalone_browser(
            browser_type="chromium",
            user_data_dir="~/.crawl4ai/browser_profile/test-browser-data",
            debugging_port=9222,
            headless=False  # headed on purpose: this is an interactive smoke test
        )
        print(f"CDP URL: {cdp_url}")

    asyncio.run(test_standalone_browser())
|
||||
902
tests/browser/test_parallel_crawling.py
Normal file
902
tests/browser/test_parallel_crawling.py
Normal file
@@ -0,0 +1,902 @@
|
||||
"""
|
||||
Test examples for parallel crawling with the browser module.
|
||||
|
||||
These examples demonstrate the functionality of parallel page creation
|
||||
and serve as functional tests for multi-page crawling performance.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
from typing import List
|
||||
|
||||
# Add the project root to Python path if running directly
|
||||
if __name__ == "__main__":
|
||||
sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))
|
||||
|
||||
from crawl4ai.browser import BrowserManager
|
||||
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from crawl4ai.async_logger import AsyncLogger
|
||||
|
||||
# Create a logger for clear terminal output
|
||||
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_get_pages_basic():
    """Check that get_pages returns the requested number of usable pages.

    Returns:
        bool: True when exactly 3 pages are returned and each loads a page
        with a non-empty title.
    """
    logger.info("Testing basic get_pages functionality", tag="TEST")

    browser_config = BrowserConfig(headless=True)
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        await manager.start()

        # Request 3 pages in one call.
        crawler_config = CrawlerRunConfig()
        pages = await manager.get_pages(crawler_config, count=3)

        assert len(pages) == 3, f"Expected 3 pages, got {len(pages)}"

        # Each returned (page, context) pair must be able to load a site.
        for i, (page, context) in enumerate(pages):
            await page.goto("https://example.com")
            title = await page.title()
            logger.info(f"Page {i+1} title: {title}", tag="TEST")
            assert title, f"Page {i+1} has no title"

        await manager.close()
        logger.success("Basic get_pages test completed successfully", tag="TEST")
        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Narrowed from a bare `except:` during best-effort cleanup.
        try:
            await manager.close()
        except Exception:
            pass
        return False
|
||||
|
||||
async def test_parallel_approaches_comparison():
    """Compare two parallel crawling approaches:
    1. Create a page for each URL on-demand (get_page + gather)
    2. Get all pages upfront with get_pages, then use them (get_pages + gather)

    Returns:
        bool: True when both approaches succeed and retrieve the same titles.
    """
    logger.info("Comparing different parallel crawling approaches", tag="TEST")

    urls = [
        "https://example.com/page1",
        "https://crawl4ai.com",
        "https://kidocode.com",
        "https://bbc.com",
        # "https://example.com/page1",
        # "https://example.com/page2",
        # "https://example.com/page3",
        # "https://example.com/page4",
    ]

    browser_config = BrowserConfig(headless=False)
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        await manager.start()

        # --- Approach 1: create a page per URL on demand, run in parallel ---
        logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST")
        start_time = time.time()

        async def fetch_title_approach1(url):
            """Create a new page for each URL, go to the URL, and get title."""
            crawler_config = CrawlerRunConfig(url=url)
            page, context = await manager.get_page(crawler_config)
            try:
                await page.goto(url)
                return await page.title()
            finally:
                await page.close()

        tasks = [fetch_title_approach1(url) for url in urls]
        approach1_results = await asyncio.gather(*tasks)

        approach1_time = time.time() - start_time
        logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST")

        # --- Approach 2: pre-create all pages, then run in parallel ---
        logger.info("Testing approach 2: get_pages upfront + gather", tag="TEST")
        start_time = time.time()

        crawler_config = CrawlerRunConfig()
        pages = await manager.get_pages(crawler_config, count=len(urls))

        async def fetch_title_approach2(page_ctx, url):
            """Use a pre-created page to go to URL and get title."""
            page, _ = page_ctx
            try:
                await page.goto(url)
                return await page.title()
            finally:
                await page.close()

        tasks = [fetch_title_approach2(page_ctx, url) for page_ctx, url in zip(pages, urls)]
        approach2_results = await asyncio.gather(*tasks)

        approach2_time = time.time() - start_time
        logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST")

        # Guarded comparison: the original computed 1/speedup in its else
        # branch, which raised ZeroDivisionError whenever approach2_time was 0
        # (speedup forced to 0 by the ternary above).
        speedup = approach1_time / approach2_time if approach2_time > 0 else 0
        if speedup > 1:
            logger.success(f"Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST")
        elif speedup > 0:
            logger.info(f"Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST")
        else:
            logger.info("Timing comparison skipped: approach 2 time was zero", tag="TEST")

        # Both approaches must retrieve the same content; sort because the
        # parallel tasks may complete in a different order.
        assert len(approach1_results) == len(approach2_results), "Result count mismatch"
        assert sorted(approach1_results) == sorted(approach2_results), "Results content mismatch"

        await manager.close()
        return True

    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Narrowed from a bare `except:` during best-effort cleanup.
        try:
            await manager.close()
        except Exception:
            pass
        return False
|
||||
|
||||
async def test_multi_browser_scaling(num_browsers=3, pages_per_browser=5):
    """Test performance with multiple browsers and pages per browser.

    Compares two approaches:
    1. On-demand page creation (get_page + gather)
    2. Pre-created pages (get_pages + gather)

    Args:
        num_browsers: Number of browser instances to launch in parallel.
        pages_per_browser: URLs handled by each browser.

    Returns:
        bool: True when both approaches complete without error.
    """
    logger.info(f"Testing multi-browser scaling with {num_browsers} browsers × {pages_per_browser} pages", tag="TEST")

    # Generate one synthetic URL per crawl slot.
    total_pages = num_browsers * pages_per_browser
    urls = [f"https://example.com/page_{i}" for i in range(total_pages)]

    # Dropped the unused `base_port = 9222` local the original defined but never read.
    managers = []

    try:
        # Start all browsers in parallel.
        start_tasks = []
        for i in range(num_browsers):
            browser_config = BrowserConfig(
                headless=True  # Using default browser mode like in test_parallel_approaches_comparison
            )
            manager = BrowserManager(browser_config=browser_config, logger=logger)
            start_tasks.append(manager.start())
            managers.append(manager)

        await asyncio.gather(*start_tasks)

        # Assign each manager a contiguous slice of the URL list.
        urls_per_manager = {}
        for i, manager in enumerate(managers):
            start_idx = i * pages_per_browser
            end_idx = min(start_idx + pages_per_browser, len(urls))
            urls_per_manager[manager] = urls[start_idx:end_idx]

        # --- Approach 1: create a page per URL on demand, run in parallel ---
        logger.info("Testing approach 1: get_page for each URL + gather", tag="TEST")
        start_time = time.time()

        async def fetch_title_approach1(manager, url):
            """Create a new page for the URL, go to the URL, and get title."""
            crawler_config = CrawlerRunConfig(url=url)
            page, context = await manager.get_page(crawler_config)
            try:
                await page.goto(url)
                return await page.title()
            finally:
                await page.close()

        tasks = []
        for manager, manager_urls in urls_per_manager.items():
            for url in manager_urls:
                tasks.append(fetch_title_approach1(manager, url))

        approach1_results = await asyncio.gather(*tasks)

        approach1_time = time.time() - start_time
        logger.info(f"Approach 1 time (get_page + gather): {approach1_time:.2f}s", tag="TEST")

        # --- Approach 2: pre-create all pages, then run in parallel ---
        logger.info("Testing approach 2: get_pages upfront + gather", tag="TEST")
        start_time = time.time()

        all_pages = []
        for manager, manager_urls in urls_per_manager.items():
            crawler_config = CrawlerRunConfig()
            pages = await manager.get_pages(crawler_config, count=len(manager_urls))
            all_pages.extend(zip(pages, manager_urls))

        async def fetch_title_approach2(page_ctx, url):
            """Use a pre-created page to go to URL and get title."""
            page, _ = page_ctx
            try:
                await page.goto(url)
                return await page.title()
            finally:
                await page.close()

        tasks = [fetch_title_approach2(page_ctx, url) for page_ctx, url in all_pages]
        approach2_results = await asyncio.gather(*tasks)

        approach2_time = time.time() - start_time
        logger.info(f"Approach 2 time (get_pages + gather): {approach2_time:.2f}s", tag="TEST")

        # Guarded metrics: the original divided by approach2_time and by
        # speedup unconditionally, raising ZeroDivisionError when the crawl
        # finished in (measured) zero time.
        speedup = approach1_time / approach2_time if approach2_time > 0 else 0
        pages_per_second = total_pages / approach2_time if approach2_time > 0 else 0

        logger.info(f"📊 Summary: {num_browsers} browsers × {pages_per_browser} pages = {total_pages} total crawls", tag="TEST")
        logger.info(f"⚡ Performance: {pages_per_second:.1f} pages/second ({pages_per_second*60:.0f} pages/minute)", tag="TEST")
        logger.info(f"🚀 Total crawl time: {approach2_time:.2f} seconds", tag="TEST")

        if speedup > 1:
            logger.success(f"✅ Approach 2 (get_pages upfront) was {speedup:.2f}x faster", tag="TEST")
        elif speedup > 0:
            logger.info(f"✅ Approach 1 (get_page + gather) was {1/speedup:.2f}x faster", tag="TEST")
        else:
            logger.info("✅ Timing comparison skipped: approach 2 time was zero", tag="TEST")

        # Close all managers.
        for manager in managers:
            await manager.close()

        return True

    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup; narrowed from a bare `except:` per manager.
        for manager in managers:
            try:
                await manager.close()
            except Exception:
                pass
        return False
|
||||
|
||||
async def grid_search_optimal_configuration(total_urls=50):
|
||||
"""Perform a grid search to find the optimal balance between number of browsers and pages per browser.
|
||||
|
||||
This function tests different combinations of browser count and pages per browser,
|
||||
while keeping the total number of URLs constant. It measures performance metrics
|
||||
for each configuration to find the "sweet spot" that provides the best speed
|
||||
with reasonable memory usage.
|
||||
|
||||
Args:
|
||||
total_urls: Total number of URLs to crawl (default: 50)
|
||||
"""
|
||||
logger.info(f"=== GRID SEARCH FOR OPTIMAL CRAWLING CONFIGURATION ({total_urls} URLs) ===", tag="TEST")
|
||||
|
||||
# Generate test URLs once
|
||||
urls = [f"https://example.com/page_{i}" for i in range(total_urls)]
|
||||
|
||||
# Define grid search configurations
|
||||
# We'll use more flexible approach: test all browser counts from 1 to min(20, total_urls)
|
||||
# and distribute pages evenly (some browsers may have 1 more page than others)
|
||||
configurations = []
|
||||
|
||||
# Maximum number of browsers to test
|
||||
max_browsers_to_test = min(20, total_urls)
|
||||
|
||||
# Try configurations with 1 to max_browsers_to_test browsers
|
||||
for num_browsers in range(1, max_browsers_to_test + 1):
|
||||
base_pages_per_browser = total_urls // num_browsers
|
||||
remainder = total_urls % num_browsers
|
||||
|
||||
# Generate exact page distribution array
|
||||
if remainder > 0:
|
||||
# First 'remainder' browsers get one more page
|
||||
page_distribution = [base_pages_per_browser + 1] * remainder + [base_pages_per_browser] * (num_browsers - remainder)
|
||||
pages_distribution = f"{base_pages_per_browser+1} pages × {remainder} browsers, {base_pages_per_browser} pages × {num_browsers - remainder} browsers"
|
||||
else:
|
||||
# All browsers get the same number of pages
|
||||
page_distribution = [base_pages_per_browser] * num_browsers
|
||||
pages_distribution = f"{base_pages_per_browser} pages × {num_browsers} browsers"
|
||||
|
||||
# Format the distribution as a tuple string like (4, 4, 3, 3)
|
||||
distribution_str = str(tuple(page_distribution))
|
||||
|
||||
configurations.append((num_browsers, base_pages_per_browser, pages_distribution, page_distribution, distribution_str))
|
||||
|
||||
# Track results
|
||||
results = []
|
||||
|
||||
# Test each configuration
|
||||
for num_browsers, pages_per_browser, pages_distribution, page_distribution, distribution_str in configurations:
|
||||
logger.info("-" * 80, tag="TEST")
|
||||
logger.info(f"Testing configuration: {num_browsers} browsers with distribution: {distribution_str}", tag="TEST")
|
||||
logger.info(f"Details: {pages_distribution}", tag="TEST")
|
||||
# Sleep a bit for randomness
|
||||
await asyncio.sleep(0.5)
|
||||
|
||||
try:
|
||||
# Import psutil for memory tracking
|
||||
try:
|
||||
import psutil
|
||||
process = psutil.Process()
|
||||
initial_memory = process.memory_info().rss / (1024 * 1024) # MB
|
||||
except ImportError:
|
||||
logger.warning("psutil not available, memory metrics will not be tracked", tag="TEST")
|
||||
initial_memory = 0
|
||||
|
||||
# Create and start browser managers
|
||||
managers = []
|
||||
start_time = time.time()
|
||||
|
||||
# Start all browsers in parallel
|
||||
start_tasks = []
|
||||
for i in range(num_browsers):
|
||||
browser_config = BrowserConfig(
|
||||
headless=True
|
||||
)
|
||||
manager = BrowserManager(browser_config=browser_config, logger=logger)
|
||||
start_tasks.append(manager.start())
|
||||
managers.append(manager)
|
||||
|
||||
await asyncio.gather(*start_tasks)
|
||||
browser_startup_time = time.time() - start_time
|
||||
|
||||
# Measure memory after browser startup
|
||||
if initial_memory > 0:
|
||||
browser_memory = process.memory_info().rss / (1024 * 1024) - initial_memory
|
||||
else:
|
||||
browser_memory = 0
|
||||
|
||||
# Distribute URLs among managers using the exact page distribution
|
||||
urls_per_manager = {}
|
||||
total_assigned = 0
|
||||
|
||||
for i, manager in enumerate(managers):
|
||||
if i < len(page_distribution):
|
||||
# Get the exact number of pages for this browser from our distribution
|
||||
manager_pages = page_distribution[i]
|
||||
|
||||
# Get the URL slice for this manager
|
||||
start_idx = total_assigned
|
||||
end_idx = start_idx + manager_pages
|
||||
urls_per_manager[manager] = urls[start_idx:end_idx]
|
||||
total_assigned += manager_pages
|
||||
else:
|
||||
# If we have more managers than our distribution (should never happen)
|
||||
urls_per_manager[manager] = []
|
||||
|
||||
# Use the more efficient approach (pre-created pages)
|
||||
logger.info("Running page crawling test...", tag="TEST")
|
||||
crawl_start_time = time.time()
|
||||
|
||||
# Get all pages upfront for each manager
|
||||
all_pages = []
|
||||
for manager, manager_urls in urls_per_manager.items():
|
||||
if not manager_urls: # Skip managers with no URLs
|
||||
continue
|
||||
crawler_config = CrawlerRunConfig()
|
||||
pages = await manager.get_pages(crawler_config, count=len(manager_urls))
|
||||
all_pages.extend(zip(pages, manager_urls))
|
||||
|
||||
# Measure memory after page creation
|
||||
if initial_memory > 0:
|
||||
pages_memory = process.memory_info().rss / (1024 * 1024) - browser_memory - initial_memory
|
||||
else:
|
||||
pages_memory = 0
|
||||
|
||||
# Function to crawl a URL with a pre-created page
|
||||
async def fetch_title(page_ctx, url):
|
||||
page, _ = page_ctx
|
||||
try:
|
||||
await page.goto(url)
|
||||
title = await page.title()
|
||||
return title
|
||||
finally:
|
||||
await page.close()
|
||||
|
||||
# Use the pre-created pages to fetch titles in parallel
|
||||
tasks = [fetch_title(page_ctx, url) for page_ctx, url in all_pages]
|
||||
crawl_results = await asyncio.gather(*tasks)
|
||||
|
||||
crawl_time = time.time() - crawl_start_time
|
||||
total_time = time.time() - start_time
|
||||
|
||||
# Final memory measurement
|
||||
if initial_memory > 0:
|
||||
peak_memory = max(browser_memory + pages_memory, process.memory_info().rss / (1024 * 1024) - initial_memory)
|
||||
else:
|
||||
peak_memory = 0
|
||||
|
||||
# Close all managers
|
||||
for manager in managers:
|
||||
await manager.close()
|
||||
|
||||
# Calculate metrics
|
||||
pages_per_second = total_urls / crawl_time
|
||||
|
||||
# Store result metrics
|
||||
result = {
|
||||
"num_browsers": num_browsers,
|
||||
"pages_per_browser": pages_per_browser,
|
||||
"page_distribution": page_distribution,
|
||||
"distribution_str": distribution_str,
|
||||
"total_urls": total_urls,
|
||||
"browser_startup_time": browser_startup_time,
|
||||
"crawl_time": crawl_time,
|
||||
"total_time": total_time,
|
||||
"browser_memory": browser_memory,
|
||||
"pages_memory": pages_memory,
|
||||
"peak_memory": peak_memory,
|
||||
"pages_per_second": pages_per_second,
|
||||
# Calculate efficiency score (higher is better)
|
||||
# This balances speed vs memory usage
|
||||
"efficiency_score": pages_per_second / (peak_memory + 1) if peak_memory > 0 else pages_per_second,
|
||||
}
|
||||
|
||||
results.append(result)
|
||||
|
||||
# Log the results
|
||||
logger.info(f"Browser startup: {browser_startup_time:.2f}s", tag="TEST")
|
||||
logger.info(f"Crawl time: {crawl_time:.2f}s", tag="TEST")
|
||||
logger.info(f"Total time: {total_time:.2f}s", tag="TEST")
|
||||
logger.info(f"Performance: {pages_per_second:.1f} pages/second", tag="TEST")
|
||||
|
||||
if peak_memory > 0:
|
||||
logger.info(f"Browser memory: {browser_memory:.1f}MB", tag="TEST")
|
||||
logger.info(f"Pages memory: {pages_memory:.1f}MB", tag="TEST")
|
||||
logger.info(f"Peak memory: {peak_memory:.1f}MB", tag="TEST")
|
||||
logger.info(f"Efficiency score: {result['efficiency_score']:.6f}", tag="TEST")
|
||||
|
||||
except Exception as e:
|
||||
logger.error(f"Error testing configuration: {str(e)}", tag="TEST")
|
||||
import traceback
|
||||
traceback.print_exc()
|
||||
|
||||
# Clean up
|
||||
for manager in managers:
|
||||
try:
|
||||
await manager.close()
|
||||
except:
|
||||
pass
|
||||
|
||||
# Print summary of all configurations
|
||||
logger.info("=" * 100, tag="TEST")
|
||||
logger.info("GRID SEARCH RESULTS SUMMARY", tag="TEST")
|
||||
logger.info("=" * 100, tag="TEST")
|
||||
|
||||
# Rank configurations by efficiency score
|
||||
ranked_results = sorted(results, key=lambda x: x["efficiency_score"], reverse=True)
|
||||
|
||||
# Also determine rankings by different metrics
|
||||
fastest = sorted(results, key=lambda x: x["crawl_time"])[0]
|
||||
lowest_memory = sorted(results, key=lambda x: x["peak_memory"] if x["peak_memory"] > 0 else float('inf'))[0]
|
||||
most_efficient = ranked_results[0]
|
||||
|
||||
# Print top performers by category
|
||||
logger.info("🏆 TOP PERFORMERS BY CATEGORY:", tag="TEST")
|
||||
logger.info(f"⚡ Fastest: {fastest['num_browsers']} browsers × ~{fastest['pages_per_browser']} pages " +
|
||||
f"({fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/s)", tag="TEST")
|
||||
|
||||
if lowest_memory["peak_memory"] > 0:
|
||||
logger.info(f"💾 Lowest memory: {lowest_memory['num_browsers']} browsers × ~{lowest_memory['pages_per_browser']} pages " +
|
||||
f"({lowest_memory['peak_memory']:.1f}MB)", tag="TEST")
|
||||
|
||||
logger.info(f"🌟 Most efficient: {most_efficient['num_browsers']} browsers × ~{most_efficient['pages_per_browser']} pages " +
|
||||
f"(score: {most_efficient['efficiency_score']:.6f})", tag="TEST")
|
||||
|
||||
# Print result table header
|
||||
logger.info("\n📊 COMPLETE RANKING TABLE (SORTED BY EFFICIENCY SCORE):", tag="TEST")
|
||||
logger.info("-" * 120, tag="TEST")
|
||||
|
||||
# Define table header
|
||||
header = f"{'Rank':<5} | {'Browsers':<8} | {'Distribution':<55} | {'Total Time(s)':<12} | {'Speed(p/s)':<12} | {'Memory(MB)':<12} | {'Efficiency':<10} | {'Notes'}"
|
||||
logger.info(header, tag="TEST")
|
||||
logger.info("-" * 120, tag="TEST")
|
||||
|
||||
# Print each configuration in ranked order
|
||||
for rank, result in enumerate(ranked_results, 1):
|
||||
# Add special notes for top performers
|
||||
notes = []
|
||||
if result == fastest:
|
||||
notes.append("⚡ Fastest")
|
||||
if result == lowest_memory:
|
||||
notes.append("💾 Lowest Memory")
|
||||
if result == most_efficient:
|
||||
notes.append("🌟 Most Efficient")
|
||||
|
||||
notes_str = " | ".join(notes) if notes else ""
|
||||
|
||||
# Format memory if available
|
||||
memory_str = f"{result['peak_memory']:.1f}" if result['peak_memory'] > 0 else "N/A"
|
||||
|
||||
# Get the distribution string
|
||||
dist_str = result.get('distribution_str', str(tuple([result['pages_per_browser']] * result['num_browsers'])))
|
||||
|
||||
# Build the row
|
||||
row = f"{rank:<5} | {result['num_browsers']:<8} | {dist_str:<55} | {result['total_time']:.2f}s{' ':<7} | "
|
||||
row += f"{result['pages_per_second']:.2f}{' ':<6} | {memory_str}{' ':<6} | {result['efficiency_score']:.4f}{' ':<4} | {notes_str}"
|
||||
|
||||
logger.info(row, tag="TEST")
|
||||
|
||||
logger.info("-" * 120, tag="TEST")
|
||||
|
||||
# Generate visualization if matplotlib is available
|
||||
try:
|
||||
import matplotlib.pyplot as plt
|
||||
import numpy as np
|
||||
|
||||
# Extract data for plotting from ranked results
|
||||
browser_counts = [r["num_browsers"] for r in ranked_results]
|
||||
efficiency_scores = [r["efficiency_score"] for r in ranked_results]
|
||||
crawl_times = [r["crawl_time"] for r in ranked_results]
|
||||
total_times = [r["total_time"] for r in ranked_results]
|
||||
|
||||
# Filter results with memory data
|
||||
memory_results = [r for r in ranked_results if r["peak_memory"] > 0]
|
||||
memory_browser_counts = [r["num_browsers"] for r in memory_results]
|
||||
peak_memories = [r["peak_memory"] for r in memory_results]
|
||||
|
||||
# Create figure with clean design
|
||||
plt.figure(figsize=(14, 12), facecolor='white')
|
||||
plt.style.use('ggplot')
|
||||
|
||||
# Create grid for subplots
|
||||
gs = plt.GridSpec(3, 1, height_ratios=[1, 1, 1], hspace=0.3)
|
||||
|
||||
# Plot 1: Efficiency Score (higher is better)
|
||||
ax1 = plt.subplot(gs[0])
|
||||
bar_colors = ['#3498db'] * len(browser_counts)
|
||||
|
||||
# Highlight the most efficient
|
||||
most_efficient_idx = browser_counts.index(most_efficient["num_browsers"])
|
||||
bar_colors[most_efficient_idx] = '#e74c3c' # Red for most efficient
|
||||
|
||||
bars = ax1.bar(range(len(browser_counts)), efficiency_scores, color=bar_colors)
|
||||
ax1.set_xticks(range(len(browser_counts)))
|
||||
ax1.set_xticklabels([f"{bc}" for bc in browser_counts], rotation=45)
|
||||
ax1.set_xlabel('Number of Browsers')
|
||||
ax1.set_ylabel('Efficiency Score (higher is better)')
|
||||
ax1.set_title('Browser Configuration Efficiency (higher is better)')
|
||||
|
||||
# Add value labels on top of bars
|
||||
for bar, score in zip(bars, efficiency_scores):
|
||||
height = bar.get_height()
|
||||
ax1.text(bar.get_x() + bar.get_width()/2., height + 0.02*max(efficiency_scores),
|
||||
f'{score:.3f}', ha='center', va='bottom', rotation=90, fontsize=8)
|
||||
|
||||
# Highlight best configuration
|
||||
ax1.text(0.02, 0.90, f"🌟 Most Efficient: {most_efficient['num_browsers']} browsers with ~{most_efficient['pages_per_browser']} pages",
|
||||
transform=ax1.transAxes, fontsize=12, verticalalignment='top',
|
||||
bbox=dict(boxstyle='round,pad=0.5', facecolor='yellow', alpha=0.3))
|
||||
|
||||
# Plot 2: Time Performance
|
||||
ax2 = plt.subplot(gs[1])
|
||||
|
||||
# Plot both total time and crawl time
|
||||
ax2.plot(browser_counts, crawl_times, 'bo-', label='Crawl Time (s)', linewidth=2)
|
||||
ax2.plot(browser_counts, total_times, 'go--', label='Total Time (s)', linewidth=2, alpha=0.6)
|
||||
|
||||
# Mark the fastest configuration
|
||||
fastest_idx = browser_counts.index(fastest["num_browsers"])
|
||||
ax2.plot(browser_counts[fastest_idx], crawl_times[fastest_idx], 'ro', ms=10,
|
||||
label=f'Fastest: {fastest["num_browsers"]} browsers')
|
||||
|
||||
ax2.set_xlabel('Number of Browsers')
|
||||
ax2.set_ylabel('Time (seconds)')
|
||||
ax2.set_title(f'Time Performance for {total_urls} URLs by Browser Count')
|
||||
ax2.grid(True, linestyle='--', alpha=0.7)
|
||||
ax2.legend(loc='upper right')
|
||||
|
||||
# Plot pages per second on second y-axis
|
||||
pages_per_second = [total_urls/t for t in crawl_times]
|
||||
ax2_twin = ax2.twinx()
|
||||
ax2_twin.plot(browser_counts, pages_per_second, 'r^--', label='Pages/second', alpha=0.5)
|
||||
ax2_twin.set_ylabel('Pages per second')
|
||||
|
||||
# Add note about the fastest configuration
|
||||
ax2.text(0.02, 0.90, f"⚡ Fastest: {fastest['num_browsers']} browsers with ~{fastest['pages_per_browser']} pages" +
|
||||
f"\n {fastest['crawl_time']:.2f}s ({fastest['pages_per_second']:.1f} pages/s)",
|
||||
transform=ax2.transAxes, fontsize=12, verticalalignment='top',
|
||||
bbox=dict(boxstyle='round,pad=0.5', facecolor='lightblue', alpha=0.3))
|
||||
|
||||
# Plot 3: Memory Usage (if available)
|
||||
if memory_results:
|
||||
ax3 = plt.subplot(gs[2])
|
||||
|
||||
# Prepare data for grouped bar chart
|
||||
memory_per_browser = [m/n for m, n in zip(peak_memories, memory_browser_counts)]
|
||||
memory_per_page = [m/(n*p) for m, n, p in zip(
|
||||
[r["peak_memory"] for r in memory_results],
|
||||
[r["num_browsers"] for r in memory_results],
|
||||
[r["pages_per_browser"] for r in memory_results])]
|
||||
|
||||
x = np.arange(len(memory_browser_counts))
|
||||
width = 0.35
|
||||
|
||||
# Create grouped bars
|
||||
ax3.bar(x - width/2, peak_memories, width, label='Total Memory (MB)', color='#9b59b6')
|
||||
ax3.bar(x + width/2, memory_per_browser, width, label='Memory per Browser (MB)', color='#3498db')
|
||||
|
||||
# Configure axis
|
||||
ax3.set_xticks(x)
|
||||
ax3.set_xticklabels([f"{bc}" for bc in memory_browser_counts], rotation=45)
|
||||
ax3.set_xlabel('Number of Browsers')
|
||||
ax3.set_ylabel('Memory (MB)')
|
||||
ax3.set_title('Memory Usage by Browser Configuration')
|
||||
ax3.legend(loc='upper left')
|
||||
ax3.grid(True, linestyle='--', alpha=0.7)
|
||||
|
||||
# Add second y-axis for memory per page
|
||||
ax3_twin = ax3.twinx()
|
||||
ax3_twin.plot(x, memory_per_page, 'ro-', label='Memory per Page (MB)')
|
||||
ax3_twin.set_ylabel('Memory per Page (MB)')
|
||||
|
||||
# Get lowest memory configuration
|
||||
lowest_memory_idx = memory_browser_counts.index(lowest_memory["num_browsers"])
|
||||
|
||||
# Add note about lowest memory configuration
|
||||
ax3.text(0.02, 0.90, f"💾 Lowest Memory: {lowest_memory['num_browsers']} browsers with ~{lowest_memory['pages_per_browser']} pages" +
|
||||
f"\n {lowest_memory['peak_memory']:.1f}MB ({lowest_memory['peak_memory']/total_urls:.2f}MB per page)",
|
||||
transform=ax3.transAxes, fontsize=12, verticalalignment='top',
|
||||
bbox=dict(boxstyle='round,pad=0.5', facecolor='lightgreen', alpha=0.3))
|
||||
|
||||
# Add overall title
|
||||
plt.suptitle(f'Browser Scaling Grid Search Results for {total_urls} URLs', fontsize=16, y=0.98)
|
||||
|
||||
# Add timestamp and info at the bottom
|
||||
plt.figtext(0.5, 0.01, f"Generated by Crawl4AI at {time.strftime('%Y-%m-%d %H:%M:%S')}",
|
||||
ha="center", fontsize=10, style='italic')
|
||||
|
||||
# Get current directory and save the figure there
|
||||
import os
|
||||
__current_file = os.path.abspath(__file__)
|
||||
current_dir = os.path.dirname(__current_file)
|
||||
output_file = os.path.join(current_dir, 'browser_scaling_grid_search.png')
|
||||
|
||||
# Adjust layout and save figure with high DPI
|
||||
plt.tight_layout(rect=[0, 0.03, 1, 0.97])
|
||||
plt.savefig(output_file, dpi=200, bbox_inches='tight')
|
||||
logger.success(f"Visualization saved to {output_file}", tag="TEST")
|
||||
|
||||
except ImportError:
|
||||
logger.warning("matplotlib not available, skipping visualization", tag="TEST")
|
||||
|
||||
return most_efficient["num_browsers"], most_efficient["pages_per_browser"]
|
||||
|
||||
async def find_optimal_browser_config(total_urls=50, verbose=True, rate_limit_delay=0.2):
    """Find optimal browser configuration for crawling a specific number of URLs.

    Tries every browser count from 1 up to min(10, total_urls), distributing
    the URLs as evenly as possible across browsers, and measures crawl speed
    (and memory, when psutil is available) for each configuration.

    Args:
        total_urls: Number of URLs to crawl.
        verbose: Whether to print progress.
        rate_limit_delay: Delay between page loads to avoid rate limiting.

    Returns:
        dict: Contains "fastest", "lowest_memory", "optimal" configurations and
        "all_configs" with every successful result, or None if no configuration
        could be tested successfully.
    """
    if verbose:
        print(f"\n=== Finding optimal configuration for crawling {total_urls} URLs ===\n")

    # Generate test URLs with a timestamp query param to avoid caching
    timestamp = int(time.time())
    urls = [f"https://example.com/page_{i}?t={timestamp}" for i in range(total_urls)]

    # Limit browser configurations to test (1 browser to max 10)
    max_browsers = min(10, total_urls)
    configs_to_test = []

    # Generate configurations (browser count, pages distribution)
    for num_browsers in range(1, max_browsers + 1):
        base_pages = total_urls // num_browsers
        remainder = total_urls % num_browsers

        # Create distribution array like [3, 3, 2, 2] (some browsers get one more page)
        if remainder > 0:
            distribution = [base_pages + 1] * remainder + [base_pages] * (num_browsers - remainder)
        else:
            distribution = [base_pages] * num_browsers

        configs_to_test.append((num_browsers, distribution))

    results = []

    # Test each configuration
    for browser_count, page_distribution in configs_to_test:
        if verbose:
            print(f"Testing {browser_count} browsers with distribution {tuple(page_distribution)}")

        # BUGFIX: initialize before the try so the error-cleanup path never
        # references an unbound name (first iteration) or closes the managers
        # left over from the previous iteration.
        managers = []

        try:
            # Track memory if possible (psutil is optional)
            try:
                import psutil
                process = psutil.Process()
                start_memory = process.memory_info().rss / (1024 * 1024)  # MB
            except ImportError:
                if verbose:
                    print("Memory tracking not available (psutil not installed)")
                start_memory = 0

            # Start browsers in parallel
            start_tasks = []
            start_time = time.time()

            for i in range(browser_count):
                config = BrowserConfig(headless=True)
                manager = BrowserManager(browser_config=config, logger=logger)
                start_tasks.append(manager.start())
                managers.append(manager)

            await asyncio.gather(*start_tasks)

            # Distribute URLs among browsers according to page_distribution
            urls_per_manager = {}
            url_index = 0

            for i, manager in enumerate(managers):
                pages_for_this_browser = page_distribution[i]
                end_index = url_index + pages_for_this_browser
                urls_per_manager[manager] = urls[url_index:end_index]
                url_index = end_index

            # Create pages for each browser up front
            all_pages = []
            for manager, manager_urls in urls_per_manager.items():
                if not manager_urls:
                    continue
                pages = await manager.get_pages(CrawlerRunConfig(), count=len(manager_urls))
                all_pages.extend(zip(pages, manager_urls))

            # Crawl pages with an optional delay to avoid rate limiting
            async def crawl_page(page_ctx, url):
                page, _ = page_ctx
                try:
                    await page.goto(url)
                    if rate_limit_delay > 0:
                        await asyncio.sleep(rate_limit_delay)
                    title = await page.title()
                    return title
                finally:
                    # Always release the page, even if navigation failed
                    await page.close()

            crawl_start = time.time()
            crawl_tasks = [crawl_page(page_ctx, url) for page_ctx, url in all_pages]
            await asyncio.gather(*crawl_tasks)
            crawl_time = time.time() - crawl_start
            total_time = time.time() - start_time

            # Measure final memory usage (only meaningful when psutil loaded)
            if start_memory > 0:
                end_memory = process.memory_info().rss / (1024 * 1024)
                memory_used = end_memory - start_memory
            else:
                memory_used = 0

            # Close all browsers
            for manager in managers:
                await manager.close()

            # Calculate metrics
            pages_per_second = total_urls / crawl_time

            # Efficiency score (higher is better): balances speed vs memory
            if memory_used > 0:
                efficiency = pages_per_second / (memory_used + 1)
            else:
                efficiency = pages_per_second

            # Store result
            result = {
                "browser_count": browser_count,
                "distribution": tuple(page_distribution),
                "crawl_time": crawl_time,
                "total_time": total_time,
                "memory_used": memory_used,
                "pages_per_second": pages_per_second,
                "efficiency": efficiency
            }

            results.append(result)

            if verbose:
                print(f" ✓ Crawled {total_urls} pages in {crawl_time:.2f}s ({pages_per_second:.1f} pages/sec)")
                if memory_used > 0:
                    print(f" ✓ Memory used: {memory_used:.1f}MB ({memory_used/total_urls:.1f}MB per page)")
                print(f" ✓ Efficiency score: {efficiency:.4f}")

        except Exception as e:
            if verbose:
                print(f" ✗ Error: {str(e)}")

            # Clean up whatever was started before the failure.
            # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit
            # are not swallowed here.
            for manager in managers:
                try:
                    await manager.close()
                except Exception:
                    pass

    # If no successful results, return None
    if not results:
        return None

    # Find best configurations
    fastest = sorted(results, key=lambda x: x["crawl_time"])[0]

    # Only consider memory if available
    memory_results = [r for r in results if r["memory_used"] > 0]
    if memory_results:
        lowest_memory = sorted(memory_results, key=lambda x: x["memory_used"])[0]
    else:
        lowest_memory = fastest

    # Find most efficient (balanced speed vs memory)
    optimal = sorted(results, key=lambda x: x["efficiency"], reverse=True)[0]

    # Print summary
    if verbose:
        print("\n=== OPTIMAL CONFIGURATIONS ===")
        print(f"⚡ Fastest: {fastest['browser_count']} browsers {fastest['distribution']}")
        print(f" {fastest['crawl_time']:.2f}s, {fastest['pages_per_second']:.1f} pages/sec")

        print(f"💾 Memory-efficient: {lowest_memory['browser_count']} browsers {lowest_memory['distribution']}")
        if lowest_memory["memory_used"] > 0:
            print(f" {lowest_memory['memory_used']:.1f}MB, {lowest_memory['memory_used']/total_urls:.2f}MB per page")

        print(f"🌟 Balanced optimal: {optimal['browser_count']} browsers {optimal['distribution']}")
        print(f" {optimal['crawl_time']:.2f}s, {optimal['pages_per_second']:.1f} pages/sec, score: {optimal['efficiency']:.4f}")

    return {
        "fastest": fastest,
        "lowest_memory": lowest_memory,
        "optimal": optimal,
        "all_configs": results
    }
|
||||
|
||||
async def run_tests():
    """Run all tests sequentially."""
    outcomes = []

    # Probe for the best browser/page split via the shared utility.
    configs = await find_optimal_browser_config(
        total_urls=20,  # small batch keeps the run fast
        verbose=True,
        rate_limit_delay=0.2  # 200ms between page loads to avoid rate limiting
    )

    found = bool(configs)
    if found:
        # Surface the recommended configuration to the operator.
        optimal = configs["optimal"]
        print(f"\n🎯 Recommended configuration for production use:")
        print(f" {optimal['browser_count']} browsers with distribution {optimal['distribution']}")
        print(f" Estimated performance: {optimal['pages_per_second']:.1f} pages/second")
    else:
        print("\n❌ Failed to find optimal configuration")
    outcomes.append(found)

    # Summarize pass/fail counts.
    passed, total = sum(outcomes), len(outcomes)
    print(f"\nTests complete: {passed}/{total} passed")

    if passed == total:
        print("All tests passed!")
    else:
        print(f"{total - passed} tests failed")
||||
|
||||
# Entry point: run the async benchmark suite when executed as a script.
if __name__ == "__main__":
    asyncio.run(run_tests())
|
||||
267
tests/browser/test_playwright_strategy.py
Normal file
267
tests/browser/test_playwright_strategy.py
Normal file
@@ -0,0 +1,267 @@
|
||||
"""Test examples for PlaywrightBrowserStrategy.

These examples demonstrate the functionality of PlaywrightBrowserStrategy
and serve as functional tests.
"""

import asyncio
import os
import sys

# Add the project root to Python path if running directly, so the in-repo
# crawl4ai package is importable without installation.
if __name__ == "__main__":
    sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

from crawl4ai.browser import BrowserManager
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger

# Create a logger for clear terminal output (no log file — console only)
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_playwright_basic():
    """Test basic Playwright browser functionality.

    Starts a headless browser, gets a page, navigates to example.com,
    reads the page title, and closes the browser.

    Returns:
        bool: True if every step succeeded, False otherwise.
    """
    logger.info("Testing standard Playwright browser", tag="TEST")

    # Create browser config for standard Playwright
    browser_config = BrowserConfig(
        headless=True,
        viewport_width=1280,
        viewport_height=800
    )

    # Create browser manager with the config
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create crawler config
        crawler_config = CrawlerRunConfig(url="https://example.com")

        # Get a page
        page, context = await manager.get_page(crawler_config)
        logger.info("Got page successfully", tag="TEST")

        # Navigate to a website
        await page.goto("https://example.com")
        logger.info("Navigated to example.com", tag="TEST")

        # Get page title
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Ensure cleanup. Narrowed from a bare `except:` so that
        # KeyboardInterrupt/SystemExit are not swallowed during teardown.
        try:
            await manager.close()
        except Exception:
            pass
        return False
|
||||
|
||||
async def test_playwright_text_mode():
    """Test Playwright browser in text-only mode.

    Starts a headless browser with text_mode enabled, then checks that
    loading an image-heavy page triggers no image requests.

    Returns:
        bool: True if the browser ran and shut down cleanly, False otherwise.
    """
    logger.info("Testing Playwright text mode", tag="TEST")

    # Create browser config with text mode enabled
    browser_config = BrowserConfig(
        headless=True,
        text_mode=True  # Enable text-only mode
    )

    # Create browser manager with the config
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        # Start the browser
        await manager.start()
        logger.info("Browser started successfully in text mode", tag="TEST")

        # Get a page
        crawler_config = CrawlerRunConfig(url="https://example.com")
        page, context = await manager.get_page(crawler_config)

        # Navigate to a website
        await page.goto("https://example.com")
        logger.info("Navigated to example.com", tag="TEST")

        # Get page title
        title = await page.title()
        logger.info(f"Page title: {title}", tag="TEST")

        # Check if images are blocked in text mode by watching for any
        # image request while loading an image-heavy page.
        # BUGFIX: the timeout handler must wrap the whole `async with` —
        # expect_request's timeout can also surface when the context
        # manager exits, which previously escaped the inner try and made
        # the entire test fail instead of recording "no images seen".
        has_images = True
        try:
            async with page.expect_request("**/*.{png,jpg,jpeg,gif,webp,svg}", timeout=1000) as request_info:
                # Try to load a page with images
                await page.goto("https://picsum.photos/", wait_until="domcontentloaded")
            await request_info.value
        except Exception:
            # Timeout without image requests means text mode is working
            has_images = False

        logger.info(f"Text mode image blocking working: {not has_images}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return True
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Ensure cleanup; narrowed from a bare `except:`.
        try:
            await manager.close()
        except Exception:
            pass
        return False
||||
|
||||
async def test_playwright_context_reuse():
    """Test context caching and reuse with identical configurations.

    Two identical CrawlerRunConfigs should map to the same browser context;
    a different config should get a different context.

    Returns:
        bool: True if both reuse and isolation behaved as expected.
    """
    logger.info("Testing context reuse with identical configurations", tag="TEST")

    # Create browser config
    browser_config = BrowserConfig(headless=True)

    # Create browser manager
    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        # Start the browser
        await manager.start()
        logger.info("Browser started successfully", tag="TEST")

        # Create identical crawler configs
        crawler_config1 = CrawlerRunConfig(
            css_selector="body",
        )

        crawler_config2 = CrawlerRunConfig(
            css_selector="body",
        )

        # Get pages with these configs
        page1, context1 = await manager.get_page(crawler_config1)
        page2, context2 = await manager.get_page(crawler_config2)

        # Identical configs should be served by the same cached context
        is_same_context = context1 == context2
        logger.info(f"Contexts reused: {is_same_context}", tag="TEST")

        # Now try with a different config
        crawler_config3 = CrawlerRunConfig()

        page3, context3 = await manager.get_page(crawler_config3)

        # This should be a different context
        is_different_context = context1 != context3
        logger.info(f"Different contexts for different configs: {is_different_context}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        # Both checks must hold for the test to pass
        return is_same_context and is_different_context
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Ensure cleanup. Narrowed from a bare `except:` so that
        # KeyboardInterrupt/SystemExit are not swallowed during teardown.
        try:
            await manager.close()
        except Exception:
            pass
        return False
|
||||
|
||||
async def test_playwright_session_management():
    """Test session management with Playwright browser.

    Creates two sessions with localStorage state, verifies the first session
    is reused (same page/context, data intact), kills it, and checks the
    second session still works.

    Returns:
        bool: True if session reuse and isolation behaved as expected.
    """
    logger.info("Testing session management with Playwright browser", tag="TEST")

    browser_config = BrowserConfig(
        headless=True
    )

    manager = BrowserManager(browser_config=browser_config, logger=logger)

    try:
        await manager.start()
        logger.info("Browser launched successfully", tag="TEST")

        # Create two sessions
        session1_id = "playwright_session_1"
        session2_id = "playwright_session_2"

        # Set up first session
        crawler_config1 = CrawlerRunConfig(session_id=session1_id, url="https://example.com")
        page1, context1 = await manager.get_page(crawler_config1)
        await page1.goto("https://example.com")
        await page1.evaluate("localStorage.setItem('playwright_session1_data', 'test_value1')")
        logger.info(f"Set up session 1 with ID: {session1_id}", tag="TEST")

        # Set up second session
        crawler_config2 = CrawlerRunConfig(session_id=session2_id, url="https://example.org")
        page2, context2 = await manager.get_page(crawler_config2)
        await page2.goto("https://example.org")
        await page2.evaluate("localStorage.setItem('playwright_session2_data', 'test_value2')")
        logger.info(f"Set up session 2 with ID: {session2_id}", tag="TEST")

        # Get first session again
        page1_again, context1_again = await manager.get_page(crawler_config1)

        # Verify it's the same page and data persists
        is_same_page = page1 == page1_again
        is_same_context = context1 == context1_again
        data1 = await page1_again.evaluate("localStorage.getItem('playwright_session1_data')")
        logger.info(f"Session 1 reuse successful: {is_same_page}, data: {data1}", tag="TEST")

        # Kill first session
        await manager.kill_session(session1_id)
        logger.info("Killed session 1", tag="TEST")

        # Verify second session still works after killing the first
        data2 = await page2.evaluate("localStorage.getItem('playwright_session2_data')")
        logger.info(f"Session 2 still functional after killing session 1, data: {data2}", tag="TEST")

        # Clean up
        await manager.close()
        logger.info("Browser closed successfully", tag="TEST")

        return is_same_page and is_same_context and data1 == "test_value1" and data2 == "test_value2"
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Ensure cleanup. Narrowed from a bare `except:` so that
        # KeyboardInterrupt/SystemExit are not swallowed during teardown.
        try:
            await manager.close()
        except Exception:
            pass
        return False
|
||||
|
||||
async def run_tests():
    """Run all tests sequentially."""
    # Fixed roster of test coroutines, executed one after another.
    test_cases = (
        test_playwright_basic,
        test_playwright_text_mode,
        test_playwright_context_reuse,
        test_playwright_session_management,
    )

    outcomes = []
    for case in test_cases:
        outcomes.append(await case())

    # Summarize pass/fail counts.
    passed, total = sum(outcomes), len(outcomes)
    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")

    if passed == total:
        logger.success("All tests passed!", tag="SUMMARY")
    else:
        logger.error(f"{total - passed} tests failed", tag="SUMMARY")
|
||||
|
||||
# Entry point: run the async test suite when executed as a script.
if __name__ == "__main__":
    asyncio.run(run_tests())
|
||||
176
tests/browser/test_profiles.py
Normal file
176
tests/browser/test_profiles.py
Normal file
@@ -0,0 +1,176 @@
|
||||
"""Test examples for BrowserProfileManager.

These examples demonstrate the functionality of BrowserProfileManager
and serve as functional tests.
"""

import asyncio
import os
import sys
import uuid
import shutil

# Add the project root to Python path if running directly, so the in-repo
# crawl4ai package is importable without installation.
if __name__ == "__main__":
    sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '../..')))

from crawl4ai.browser import BrowserManager, BrowserProfileManager
from crawl4ai.async_configs import BrowserConfig, CrawlerRunConfig
from crawl4ai.async_logger import AsyncLogger

# Create a logger for clear terminal output (no log file — console only)
logger = AsyncLogger(verbose=True, log_file=None)
|
||||
|
||||
async def test_profile_creation():
    """Create, list, resolve, and delete a browser profile end to end.

    Returns:
        bool: True when every sub-check passed (profile appears in the
        listing, its path resolves correctly, deletion succeeds, and it
        disappears from the listing); False on any failure.
    """
    logger.info("Testing profile creation and management", tag="TEST")

    profile_manager = BrowserProfileManager(logger=logger)
    # Initialized before the try block so the except-path cleanup cannot hit
    # a NameError if an early call (e.g. list_profiles) raises before the
    # path is assigned — mirrors test_profile_with_browser below.
    profile_path = None

    try:
        # List existing profiles
        profiles = profile_manager.list_profiles()
        logger.info(f"Found {len(profiles)} existing profiles", tag="TEST")

        # Generate a unique profile name for testing
        test_profile_name = f"test-profile-{uuid.uuid4().hex[:8]}"

        # Create a test profile directory
        profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name)
        os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True)

        # Create a dummy Preferences file to simulate a Chrome profile
        with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f:
            f.write("{\"test\": true}")

        logger.info(f"Created test profile at: {profile_path}", tag="TEST")

        # Verify the profile is now in the list
        profiles = profile_manager.list_profiles()
        profile_found = any(p["name"] == test_profile_name for p in profiles)
        logger.info(f"Profile found in list: {profile_found}", tag="TEST")

        # Try to get the profile path
        retrieved_path = profile_manager.get_profile_path(test_profile_name)
        path_match = retrieved_path == profile_path
        logger.info(f"Retrieved correct profile path: {path_match}", tag="TEST")

        # Delete the profile
        success = profile_manager.delete_profile(test_profile_name)
        logger.info(f"Profile deletion successful: {success}", tag="TEST")

        # Verify it's gone
        profiles_after = profile_manager.list_profiles()
        profile_removed = not any(p["name"] == test_profile_name for p in profiles_after)
        logger.info(f"Profile removed from list: {profile_removed}", tag="TEST")

        # Clean up just in case delete_profile left the directory behind
        if os.path.exists(profile_path):
            shutil.rmtree(profile_path, ignore_errors=True)

        return profile_found and path_match and success and profile_removed
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup; profile_path may still be None if we failed
        # before the directory was created.
        try:
            if profile_path and os.path.exists(profile_path):
                shutil.rmtree(profile_path, ignore_errors=True)
        except Exception:
            pass
        return False
||||
|
||||
async def test_profile_with_browser():
    """Verify that localStorage data written in one browser session is still
    visible in a second session that reuses the same on-disk profile.

    Returns:
        bool: True when the value persisted and the test profile was deleted.
    """
    logger.info("Testing using a profile with a browser", tag="TEST")

    profile_manager = BrowserProfileManager(logger=logger)
    test_profile_name = f"test-browser-profile-{uuid.uuid4().hex[:8]}"
    profile_path = None

    try:
        # Build a minimal fake Chrome profile on disk.
        profile_path = os.path.join(profile_manager.profiles_dir, test_profile_name)
        os.makedirs(os.path.join(profile_path, "Default"), exist_ok=True)
        with open(os.path.join(profile_path, "Default", "Preferences"), "w") as f:
            f.write("{\"test\": true}")
        logger.info(f"Created test profile at: {profile_path}", tag="TEST")

        # Both sessions share this config, so both point at the same profile dir.
        browser_config = BrowserConfig(
            user_data_dir=profile_path,
            headless=True
        )

        # Session 1: write a marker value into localStorage.
        first_session = BrowserManager(browser_config=browser_config, logger=logger)
        await first_session.start()
        logger.info("Browser started with profile", tag="TEST")

        crawler_config = CrawlerRunConfig()
        page, context = await first_session.get_page(crawler_config)
        await page.goto("https://example.com")
        await page.evaluate("localStorage.setItem('test_data', 'profile_value')")

        await first_session.close()
        logger.info("First browser session closed", tag="TEST")

        # Session 2: reopen the same profile and read the marker back.
        second_session = BrowserManager(browser_config=browser_config, logger=logger)
        await second_session.start()
        logger.info("Second browser session started with same profile", tag="TEST")

        page2, context2 = await second_session.get_page(crawler_config)
        await page2.goto("https://example.com")
        data = await page2.evaluate("localStorage.getItem('test_data')")

        data_persisted = data == "profile_value"
        logger.info(f"Data persisted across sessions: {data_persisted}", tag="TEST")

        await second_session.close()
        logger.info("Second browser session closed", tag="TEST")

        # Remove the profile we created.
        success = profile_manager.delete_profile(test_profile_name)
        logger.info(f"Test profile deleted: {success}", tag="TEST")

        return data_persisted and success
    except Exception as e:
        logger.error(f"Test failed: {str(e)}", tag="TEST")
        # Best-effort cleanup of the profile directory.
        try:
            if profile_path and os.path.exists(profile_path):
                shutil.rmtree(profile_path, ignore_errors=True)
        except:
            pass
        return False
||||
async def run_tests():
    """Run both profile tests in order and log an aggregate summary."""
    test_cases = (
        test_profile_creation,
        test_profile_with_browser,
    )
    results = [await case() for case in test_cases]

    # Summarize how many of the boolean results were truthy.
    total = len(results)
    passed = sum(results)
    logger.info(f"Tests complete: {passed}/{total} passed", tag="SUMMARY")

    if passed == total:
        logger.success("All tests passed!", tag="SUMMARY")
    else:
        logger.error(f"{total - passed} tests failed", tag="SUMMARY")


if __name__ == "__main__":
    asyncio.run(run_tests())
|
||||
@@ -7,7 +7,7 @@ from crawl4ai import (
|
||||
BrowserConfig, CrawlerRunConfig, DefaultMarkdownGenerator,
|
||||
PruningContentFilter, JsonCssExtractionStrategy, LLMContentFilter, CacheMode
|
||||
)
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.docker_client import Crawl4aiDockerClient
|
||||
|
||||
class Crawl4AiTester:
|
||||
@@ -143,7 +143,7 @@ async def test_with_client():
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=LLMContentFilter(
|
||||
llmConfig=LlmConfig(provider="openai/gpt-40"),
|
||||
llm_config=LLMConfig(provider="openai/gpt-40"),
|
||||
instruction="Extract key technical concepts"
|
||||
)
|
||||
),
|
||||
|
||||
@@ -2,7 +2,7 @@ import inspect
|
||||
from typing import Any, Dict
|
||||
from enum import Enum
|
||||
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
|
||||
def to_serializable_dict(obj: Any) -> Dict:
|
||||
"""
|
||||
@@ -224,7 +224,7 @@ if __name__ == "__main__":
|
||||
config3 = CrawlerRunConfig(
|
||||
markdown_generator=DefaultMarkdownGenerator(
|
||||
content_filter=LLMContentFilter(
|
||||
llmConfig = LlmConfig(provider="openai/gpt-4"),
|
||||
llm_config = LLMConfig(provider="openai/gpt-4"),
|
||||
instruction="Extract key technical concepts",
|
||||
chunk_token_threshold=2000,
|
||||
overlap_rate=0.1
|
||||
|
||||
@@ -73,7 +73,7 @@ async def test_stream_crawl(session, token: str):
|
||||
# "https://news.ycombinator.com/news"
|
||||
],
|
||||
"browser_config": {"headless": True, "viewport": {"width": 1200}},
|
||||
"crawler_config": {"stream": True, "cache_mode": "aggressive"}
|
||||
"crawler_config": {"stream": True, "cache_mode": "bypass"}
|
||||
}
|
||||
headers = {"Authorization": f"Bearer {token}"}
|
||||
print(f"\nTesting Streaming Crawl: {url}")
|
||||
|
||||
168
tests/memory/test_crawler_monitor.py
Normal file
168
tests/memory/test_crawler_monitor.py
Normal file
@@ -0,0 +1,168 @@
|
||||
"""
|
||||
Test script for the CrawlerMonitor component.
|
||||
This script simulates a crawler with multiple tasks to demonstrate the real-time monitoring capabilities.
|
||||
"""
|
||||
|
||||
import time
|
||||
import uuid
|
||||
import random
|
||||
import threading
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Add the parent directory to the path to import crawl4ai
|
||||
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "../..")))
|
||||
|
||||
from crawl4ai.components.crawler_monitor import CrawlerMonitor
|
||||
from crawl4ai.models import CrawlStatus
|
||||
|
||||
def simulate_crawler_task(monitor, task_id, url, simulate_failure=False):
    """Drive one simulated crawl through QUEUED -> IN_PROGRESS -> terminal state.

    Args:
        monitor: CrawlerMonitor-like object receiving update_task() calls.
        task_id: Identifier of the task being simulated.
        url: URL associated with the task (identification only).
        simulate_failure: When True, the task fails with ~80% probability.
    """
    # Pretend the task sat in the queue for a while before being picked up.
    queued_for = random.uniform(0.5, 3.0)
    time.sleep(queued_for)

    # Transition into the running state, reporting how long we waited.
    monitor.update_task(
        task_id=task_id,
        status=CrawlStatus.IN_PROGRESS,
        start_time=time.time(),
        wait_time=queued_for,
    )

    # Emit periodic memory readings (two per second) while the "crawl" runs.
    run_duration = random.uniform(1.0, 5.0)
    for _ in range(int(run_duration * 2)):
        sampled = random.uniform(5.0, 25.0)
        prior_peak = monitor.get_task_stats(task_id).get("peak_memory", 0)
        monitor.update_task(
            task_id=task_id,
            memory_usage=sampled,
            peak_memory=max(sampled, prior_peak),
        )
        time.sleep(0.5)

    # Finish: failure-flagged tasks fail 80% of the time; everything else
    # completes. Short-circuit keeps random.random() unsampled otherwise.
    if simulate_failure and random.random() < 0.8:
        monitor.update_task(
            task_id=task_id,
            status=CrawlStatus.FAILED,
            end_time=time.time(),
            error_message="Simulated failure: Connection timeout",
            memory_usage=0.0,
        )
    else:
        monitor.update_task(
            task_id=task_id,
            status=CrawlStatus.COMPLETED,
            end_time=time.time(),
            memory_usage=0.0,
        )
||||
def update_queue_stats(monitor, num_queued_tasks):
    """Poll the monitor once per second and push queue/memory statistics.

    Loops until monitor.is_running becomes False. `num_queued_tasks` is
    accepted for interface compatibility but not used in the computation.
    """
    while monitor.is_running:
        queued = [
            info for info in monitor.get_all_task_stats().values()
            if info["status"] == CrawlStatus.QUEUED.name
        ]
        total_queued = len(queued)

        # Wait-time statistics for whatever is still sitting in the queue.
        if total_queued > 0:
            now = time.time()
            waits = [
                now - info.get("enqueue_time", now)
                for info in queued
            ]
            longest_wait = max(waits) if waits else 0.0
            mean_wait = sum(waits) / len(waits) if waits else 0.0
        else:
            longest_wait = 0.0
            mean_wait = 0.0

        monitor.update_queue_statistics(
            total_queued=total_queued,
            highest_wait_time=longest_wait,
            avg_wait_time=mean_wait,
        )

        # Map the number of in-flight tasks to a coarse memory status.
        in_flight = len([
            info for info in monitor.get_all_task_stats().values()
            if info["status"] == CrawlStatus.IN_PROGRESS.name
        ])
        if in_flight > 8:
            monitor.update_memory_status("CRITICAL")
        elif in_flight > 4:
            monitor.update_memory_status("PRESSURE")
        else:
            monitor.update_memory_status("NORMAL")

        time.sleep(1.0)
||||
def test_crawler_monitor():
    """Exercise CrawlerMonitor with 50 simulated tasks launched in batches."""
    total_urls = 50

    # Initialize and start the live monitor display.
    monitor = CrawlerMonitor(urls_total=total_urls, refresh_rate=0.5)
    monitor.start()

    # Background thread keeps queue statistics fresh while tasks run.
    stats_thread = threading.Thread(target=update_queue_stats, args=(monitor, total_urls))
    stats_thread.daemon = True
    stats_thread.start()

    try:
        # Register every task with the monitor and prepare a worker thread for it.
        workers = []
        for idx in range(total_urls):
            tid = str(uuid.uuid4())
            page_url = f"https://example.com/page{idx}"
            monitor.add_task(tid, page_url)

            # Every 10th task is flagged to (probably) fail.
            should_fail = (idx % 10 == 0)
            worker = threading.Thread(
                target=simulate_crawler_task,
                args=(monitor, tid, page_url, should_fail),
            )
            worker.daemon = True
            workers.append(worker)

        # Launch in batches of 5 to mimic tasks being added over time.
        batch_size = 5
        for start in range(0, len(workers), batch_size):
            for worker in workers[start:start + batch_size]:
                worker.start()
                time.sleep(0.5)  # small stagger between thread starts

            time.sleep(2.0)  # pause before the next batch

        # Wait for every worker to finish.
        for worker in workers:
            worker.join()

        # Let the dashboard display the final state for a moment.
        time.sleep(5.0)

    except KeyboardInterrupt:
        print("\nTest interrupted by user")
    finally:
        monitor.stop()
        print("\nCrawler monitor test completed")


if __name__ == "__main__":
    test_crawler_monitor()
|
||||
410
tests/memory/test_dispatcher_stress.py
Normal file
410
tests/memory/test_dispatcher_stress.py
Normal file
@@ -0,0 +1,410 @@
|
||||
import asyncio
|
||||
import time
|
||||
import psutil
|
||||
import logging
|
||||
import random
|
||||
from typing import List, Dict
|
||||
import uuid
|
||||
import sys
|
||||
import os
|
||||
|
||||
# Import your crawler components
|
||||
from crawl4ai.models import DisplayMode, CrawlStatus, CrawlResult
|
||||
from crawl4ai.async_configs import CrawlerRunConfig, BrowserConfig, CacheMode
|
||||
from crawl4ai import AsyncWebCrawler
|
||||
from crawl4ai import MemoryAdaptiveDispatcher, CrawlerMonitor
|
||||
|
||||
# Global configuration
|
||||
STREAM = False # Toggle between streaming and non-streaming modes
|
||||
|
||||
# Configure logging to file only (to avoid breaking the rich display)
|
||||
os.makedirs("logs", exist_ok=True)
|
||||
file_handler = logging.FileHandler("logs/memory_stress_test.log")
|
||||
file_handler.setFormatter(logging.Formatter('%(asctime)s [%(levelname)s] %(message)s'))
|
||||
|
||||
# Root logger - only to file, not console
|
||||
root_logger = logging.getLogger()
|
||||
root_logger.setLevel(logging.INFO)
|
||||
root_logger.addHandler(file_handler)
|
||||
|
||||
# Our test logger also writes to file only
|
||||
logger = logging.getLogger("memory_stress_test")
|
||||
logger.setLevel(logging.INFO)
|
||||
logger.addHandler(file_handler)
|
||||
logger.propagate = False # Don't propagate to root logger
|
||||
|
||||
# Create a memory restrictor to simulate limited memory environment
|
||||
class MemorySimulator:
    """Simulates memory pressure by holding on to large bytearray allocations.

    NOTE(review): depends on module-level `psutil` and `logger`; actual
    effect depends on real system memory at runtime.
    """

    def __init__(self, target_percent: float = 85.0, aggressive: bool = False):
        """Simulates a limited-memory environment.

        Args:
            target_percent: System memory usage (%) to aim for.
            aggressive: Use larger allocation chunks when True.
        """
        self.target_percent = target_percent
        self.memory_blocks: List[bytearray] = []
        self.aggressive = aggressive

    def apply_pressure(self, additional_percent: float = 0.0):
        """Allocate one bounded chunk toward target_percent (+ optional extra)."""
        usage_now = psutil.virtual_memory().percent
        goal = self.target_percent + additional_percent

        if usage_now >= goal:
            return  # Already at target

        logger.info(f"Current memory: {usage_now}%, target: {goal}%")

        # Work out the byte gap between desired and current usage.
        total_bytes = psutil.virtual_memory().total
        shortfall = int((goal / 100.0) * total_bytes - (usage_now / 100.0) * total_bytes)
        if shortfall <= 0:
            return

        # One bounded chunk per call avoids wildly overshooting the target;
        # aggressive mode uses 200MB chunks, normal mode 50MB.
        cap = (200 if self.aggressive else 50) * 1024 * 1024
        chunk = min(shortfall, cap)

        try:
            logger.info(f"Allocating {chunk / (1024 * 1024):.1f}MB to reach target memory usage")
            self.memory_blocks.append(bytearray(chunk))
            time.sleep(0.5)  # give the system time to register the allocation
        except MemoryError:
            logger.warning("Unable to allocate more memory")

    def release_pressure(self, percent: float = None):
        """Release held allocations — all of them, or `percent`% of the blocks."""
        if not self.memory_blocks:
            return

        if percent is None:
            logger.info(f"Releasing all {len(self.memory_blocks)} memory blocks")
            self.memory_blocks.clear()
        else:
            drop = int(len(self.memory_blocks) * (percent / 100.0))
            if drop > 0:
                logger.info(f"Releasing {drop} of {len(self.memory_blocks)} memory blocks ({percent}%)")
                self.memory_blocks = self.memory_blocks[drop:]

    def spike_pressure(self, duration: float = 5.0):
        """Temporarily raise pressure by an extra 5%, releasing after `duration`.

        Useful for forcing requeues. Must run inside a live event loop
        (schedules the release with asyncio.create_task).
        """
        logger.info(f"Creating memory pressure spike for {duration} seconds")
        baseline_blocks = len(self.memory_blocks)

        self.apply_pressure(additional_percent=5.0)

        # Schedule the release of the extra blocks after the spike window.
        asyncio.create_task(self._delayed_release(duration, baseline_blocks))

    async def _delayed_release(self, delay: float, target_blocks: int):
        """After `delay` seconds, trim the block list back to `target_blocks`."""
        await asyncio.sleep(delay)

        if len(self.memory_blocks) > target_blocks:
            logger.info(f"Releasing memory spike ({len(self.memory_blocks) - target_blocks} blocks)")
            self.memory_blocks = self.memory_blocks[:target_blocks]
|
||||
# Test statistics collector
|
||||
class TestResults:
    """Accumulates stress-test metrics and emits a summary at the end."""

    def __init__(self):
        self.start_time = time.time()
        self.completed_urls: List[str] = []
        self.failed_urls: List[str] = []
        self.requeued_count = 0
        self.memory_warnings = 0
        self.max_memory_usage = 0.0
        self.max_queue_size = 0
        self.max_wait_time = 0.0
        # Processing attempts observed per URL (used for retry statistics).
        self.url_to_attempt: Dict[str, int] = {}

    def log_summary(self):
        """Log the final metrics and mirror them into logs/test_summary.txt."""
        elapsed = time.time() - self.start_time
        mode = 'ON' if STREAM else 'OFF'

        logger.info("===== TEST SUMMARY =====")
        logger.info(f"Stream mode: {mode}")
        logger.info(f"Total duration: {elapsed:.1f} seconds")
        logger.info(f"Completed URLs: {len(self.completed_urls)}")
        logger.info(f"Failed URLs: {len(self.failed_urls)}")
        logger.info(f"Requeue events: {self.requeued_count}")
        logger.info(f"Memory warnings: {self.memory_warnings}")
        logger.info(f"Max memory usage: {self.max_memory_usage:.1f}%")
        logger.info(f"Max queue size: {self.max_queue_size}")
        logger.info(f"Max wait time: {self.max_wait_time:.1f} seconds")

        # Highlight URLs that needed more than one attempt.
        retried = {u: n for u, n in self.url_to_attempt.items() if n > 1}
        if retried:
            logger.info(f"URLs with retries: {len(retried)}")
            # Only the five most-retried URLs are shown.
            worst = sorted(retried.items(), key=lambda kv: kv[1], reverse=True)[:5]
            for u, n in worst:
                logger.info(f"  URL {u[-30:]} had {n} attempts")

        # Human-readable copy of the headline numbers.
        with open("logs/test_summary.txt", "w") as fh:
            fh.write(f"Stream mode: {mode}\n")
            fh.write(f"Total duration: {elapsed:.1f} seconds\n")
            fh.write(f"Completed URLs: {len(self.completed_urls)}\n")
            fh.write(f"Failed URLs: {len(self.failed_urls)}\n")
            fh.write(f"Requeue events: {self.requeued_count}\n")
            fh.write(f"Memory warnings: {self.memory_warnings}\n")
            fh.write(f"Max memory usage: {self.max_memory_usage:.1f}%\n")
            fh.write(f"Max queue size: {self.max_queue_size}\n")
            fh.write(f"Max wait time: {self.max_wait_time:.1f} seconds\n")
||||
# Custom monitor with stats tracking
|
||||
# Custom monitor that extends CrawlerMonitor with test-specific tracking
|
||||
class StressTestMonitor(CrawlerMonitor):
    """CrawlerMonitor subclass that mirrors dashboard events into TestResults."""

    def __init__(self, test_results: TestResults, **kwargs):
        # Parent gets every display-related kwarg untouched.
        super().__init__(**kwargs)
        self.test_results = test_results

    def update_memory_status(self, status: str):
        """Record status transitions and peak memory, then delegate."""
        if status != self.memory_status:
            logger.info(f"Memory status changed: {self.memory_status} -> {status}")
            if "CRITICAL" in status or "PRESSURE" in status:
                self.test_results.memory_warnings += 1

        # Track the highest system memory usage seen during the run.
        self.test_results.max_memory_usage = max(
            self.test_results.max_memory_usage,
            psutil.virtual_memory().percent,
        )

        # Let the parent refresh the dashboard.
        super().update_memory_status(status)

    def update_queue_statistics(self, total_queued: int, highest_wait_time: float, avg_wait_time: float):
        """Track queue high-water marks, then delegate to the dashboard."""
        tracked = self.test_results
        tracked.max_queue_size = max(tracked.max_queue_size, total_queued)
        tracked.max_wait_time = max(tracked.max_wait_time, highest_wait_time)

        super().update_queue_statistics(total_queued, highest_wait_time, avg_wait_time)

    def update_task(self, task_id: str, **kwargs):
        """Count requeues and record terminal states, then delegate."""
        entry = self.stats.get(task_id)
        if entry is not None:
            previous_status = entry.status

            # Requeue events arrive as error messages containing "requeued";
            # count each task's requeue at most once.
            # NOTE(review): assumes kwargs['error_message'] is never None when
            # present — confirm against the dispatcher.
            if 'error_message' in kwargs and 'requeued' in kwargs['error_message']:
                if not getattr(entry, 'counted_requeue', False):
                    self.test_results.requeued_count += 1
                    entry.counted_requeue = True

            # Record COMPLETED/FAILED transitions exactly once per task.
            if 'status' in kwargs:
                new_status = kwargs['status']
                if previous_status != new_status:
                    if new_status == CrawlStatus.COMPLETED:
                        if task_id not in self.test_results.completed_urls:
                            self.test_results.completed_urls.append(task_id)
                    elif new_status == CrawlStatus.FAILED:
                        if task_id not in self.test_results.failed_urls:
                            self.test_results.failed_urls.append(task_id)

        # Delegate and force a dashboard refresh.
        super().update_task(task_id, **kwargs)
        self.live.update(self._create_table())
|
||||
# Generate test URLs - use example.com with unique paths to avoid browser caching
|
||||
def generate_test_urls(count: int) -> List[str]:
    """Build `count` unique example.com URLs.

    A random UUID path segment plus random query parameters make every URL
    distinct, which defeats browser caching during the stress test.
    """
    return [
        f"https://example.com/path/{uuid.uuid4()}?test={idx}&random={random.randint(1, 100000)}"
        for idx in range(count)
    ]
|
||||
# Process result callback
|
||||
async def process_result(result, test_results: TestResults):
    """Record the outcome of a single crawl result in `test_results`.

    Counts the processing attempt for the URL, then classifies the result as
    requeued (memory pressure), successful, or failed.
    """
    # Track attempt counts (a URL seen more than once means it was retried).
    test_results.url_to_attempt[result.url] = test_results.url_to_attempt.get(result.url, 0) + 1

    # Guard against a missing error message: successful results may carry
    # None, and `"requeued" in None` would raise TypeError.
    error_message = result.error_message or ""

    if "requeued" in error_message:
        test_results.requeued_count += 1
        logger.debug(f"Requeued due to memory pressure: {result.url}")
    elif result.success:
        test_results.completed_urls.append(result.url)
        logger.debug(f"Successfully processed: {result.url}")
    else:
        test_results.failed_urls.append(result.url)
        logger.warning(f"Failed to process: {result.url} - {result.error_message}")
||||
# Process multiple results (used in non-streaming mode)
|
||||
async def process_results(results, test_results: TestResults):
    """Feed every result of a non-streaming batch through process_result()."""
    for item in results:
        await process_result(item, test_results)
|
||||
# Main test function for extreme memory pressure simulation
|
||||
async def run_memory_stress_test(
    url_count: int = 100,
    target_memory_percent: float = 92.0,
    chunk_size: int = 20,
    aggressive: bool = False,
    spikes: bool = True
):
    """Drive the dispatcher through a crawl under artificial memory pressure.

    Args:
        url_count: Total number of synthetic URLs to crawl.
        target_memory_percent: System memory usage (%) the simulator aims for.
        chunk_size: URLs per batch (batches mimic progressive URL discovery).
        aggressive: Bigger allocations and double the concurrent sessions.
        spikes: Schedule random short-lived memory spikes during the run.

    Returns:
        bool: True when every URL ended up completed or failed (none lost).
    """
    test_results = TestResults()
    mem_sim = MemorySimulator(target_percent=target_memory_percent, aggressive=aggressive)

    logger.info(f"Starting stress test with {url_count} URLs in {'STREAM' if STREAM else 'NON-STREAM'} mode")
    logger.info(f"Target memory usage: {target_memory_percent}%")

    # Elevate memory usage before crawling starts.
    logger.info("Creating initial memory pressure...")
    mem_sim.apply_pressure()

    # Split the URL list into batches to simulate URLs discovered over time.
    all_urls = generate_test_urls(url_count)
    url_chunks = [all_urls[i:i + chunk_size] for i in range(0, len(all_urls), chunk_size)]

    # Low-noise crawler setup; stream behavior follows the global toggle.
    browser_config = BrowserConfig(headless=True, verbose=False)
    run_config = CrawlerRunConfig(
        cache_mode=CacheMode.BYPASS,
        verbose=False,
        stream=STREAM
    )

    # Monitor mirrors dashboard events into test_results.
    monitor = StressTestMonitor(
        test_results=test_results,
        display_mode=DisplayMode.DETAILED,
        max_visible_rows=20,
        total_urls=url_count
    )

    # Deliberately extreme dispatcher thresholds to force throttling and
    # requeues: throttle from 63% memory, requeue from 70%, recover at 55%.
    dispatcher = MemoryAdaptiveDispatcher(
        memory_threshold_percent=63.0,
        critical_threshold_percent=70.0,
        recovery_threshold_percent=55.0,
        check_interval=0.1,
        max_session_permit=20 if aggressive else 10,
        fairness_timeout=10.0,
        monitor=monitor
    )

    # Pick 3-5 random batch indices at which to trigger memory spikes.
    if spikes:
        spike_intervals = [
            random.randint(1, len(url_chunks) - 1)
            for _ in range(random.randint(3, 5))
        ]
        logger.info(f"Scheduled memory spikes at chunks: {spike_intervals}")

    try:
        async with AsyncWebCrawler(config=browser_config) as crawler:
            for chunk_index, url_chunk in enumerate(url_chunks):
                logger.info(f"Processing chunk {chunk_index+1}/{len(url_chunks)} ({len(url_chunk)} URLs)")

                # Ratchet up pressure on every other batch.
                if chunk_index % 2 == 0:
                    logger.info("Increasing memory pressure...")
                    mem_sim.apply_pressure()

                # Trigger the scheduled spike for this batch, sometimes doubled.
                if spikes and chunk_index in spike_intervals:
                    logger.info(f"⚠️ CREATING MASSIVE MEMORY SPIKE at chunk {chunk_index+1} ⚠️")
                    mem_sim.spike_pressure(duration=10.0)

                    if random.random() < 0.5:
                        await asyncio.sleep(2.0)
                        logger.info("💀 DOUBLE SPIKE - EXTREME MEMORY PRESSURE 💀")
                        mem_sim.spike_pressure(duration=8.0)

                if STREAM:
                    # Streaming: handle each result as soon as it arrives.
                    async for result in dispatcher.run_urls_stream(
                        urls=url_chunk,
                        crawler=crawler,
                        config=run_config
                    ):
                        await process_result(result, test_results)
                else:
                    # Batch: collect everything, then process.
                    results = await dispatcher.run_urls(
                        urls=url_chunk,
                        crawler=crawler,
                        config=run_config
                    )
                    await process_results(results, test_results)

                # Simulate more URLs being discovered while others process.
                await asyncio.sleep(1)

                # Occasionally release a small slice of memory so the system
                # gets a brief chance to recover.
                if chunk_index % 5 == 4:
                    release_percent = random.choice([10, 15, 20])
                    logger.info(f"Releasing {release_percent}% of memory blocks - brief respite")
                    mem_sim.release_pressure(percent=release_percent)

    except Exception as e:
        logger.error(f"Test error: {str(e)}")
        raise
    finally:
        # Always free the simulated pressure and emit the summary.
        mem_sim.release_pressure()
        test_results.log_summary()

    # Success criterion: every URL must have reached a terminal state.
    if len(test_results.completed_urls) + len(test_results.failed_urls) < url_count:
        logger.error(f"TEST FAILED: Not all URLs were processed. {url_count - len(test_results.completed_urls) - len(test_results.failed_urls)} URLs missing.")
        return False

    logger.info("TEST PASSED: All URLs were processed without crashing.")
    return True
|
||||
# Command-line entry point
|
||||
if __name__ == "__main__":
    # CLI: [url_count] [target_memory_percent] [stream flag] [aggressive flag]
    argv = sys.argv
    url_count = int(argv[1]) if len(argv) > 1 else 100
    target_memory = float(argv[2]) if len(argv) > 2 else 85.0

    # Optional third argument toggles streaming mode (module-level flag).
    if len(argv) > 3:
        STREAM = argv[3].lower() in ('true', 'yes', '1', 'stream')

    # Optional fourth argument toggles aggressive memory allocation.
    aggressive = len(argv) > 4 and argv[4].lower() in ('true', 'yes', '1', 'aggressive')

    print(f"Starting test with {url_count} URLs, {target_memory}% memory target")
    print(f"Stream mode: {STREAM}, Aggressive: {aggressive}")
    print("Logs will be written to the logs directory")
    print("Live display starting now...")

    # Run the test and translate the boolean outcome into an exit code.
    passed = asyncio.run(run_memory_stress_test(
        url_count=url_count,
        target_memory_percent=target_memory,
        aggressive=aggressive
    ))

    sys.exit(0 if passed else 1)
|
||||
@@ -1,5 +1,5 @@
|
||||
import unittest, os
|
||||
from crawl4ai.async_configs import LlmConfig
|
||||
from crawl4ai import LLMConfig
|
||||
from crawl4ai.web_crawler import WebCrawler
|
||||
from crawl4ai.chunking_strategy import (
|
||||
RegexChunking,
|
||||
@@ -43,7 +43,7 @@ class TestWebCrawler(unittest.TestCase):
|
||||
word_count_threshold=5,
|
||||
chunking_strategy=FixedLengthWordChunking(chunk_size=100),
|
||||
extraction_strategy=LLMExtractionStrategy(
|
||||
llmConfig=LlmConfig(provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY"))
|
||||
llm_config=LLMConfig(provider="openai/gpt-3.5-turbo", api_token=os.getenv("OPENAI_API_KEY"))
|
||||
),
|
||||
bypass_cache=True,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user