From 6dc01eae3ac77092d6fe3e9f6730cb6afb1ae8d2 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 23 Jan 2025 18:53:22 +0800 Subject: [PATCH 1/7] refactor(core): improve type hints and remove unused file - Add RelevantContentFilter to __init__.py exports - Update version to 0.4.3b3 - Enhance type hints in async_configs.py - Remove empty utils.scraping.py file - Update mkdocs configuration with version info and GitHub integration BREAKING CHANGE: None --- crawl4ai/__init__.py | 3 ++- crawl4ai/__version__.py | 2 +- crawl4ai/async_configs.py | 11 +++++++---- crawl4ai/utils.scraping.py | 0 mkdocs.yml | 11 ++++++++++- 5 files changed, 20 insertions(+), 7 deletions(-) delete mode 100644 crawl4ai/utils.scraping.py diff --git a/crawl4ai/__init__.py b/crawl4ai/__init__.py index 482afdd7..7f284323 100644 --- a/crawl4ai/__init__.py +++ b/crawl4ai/__init__.py @@ -16,7 +16,7 @@ from .extraction_strategy import ( ) from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import DefaultMarkdownGenerator -from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter +from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter, RelevantContentFilter from .models import CrawlResult, MarkdownGenerationResult from .async_dispatcher import ( MemoryAdaptiveDispatcher, @@ -44,6 +44,7 @@ __all__ = [ "ChunkingStrategy", "RegexChunking", "DefaultMarkdownGenerator", + "RelevantContentFilter", "PruningContentFilter", "BM25ContentFilter", "LLMContentFilter", diff --git a/crawl4ai/__version__.py b/crawl4ai/__version__.py index a0acc761..3274435a 100644 --- a/crawl4ai/__version__.py +++ b/crawl4ai/__version__.py @@ -1,2 +1,2 @@ # crawl4ai/_version.py -__version__ = "0.4.3b2" +__version__ = "0.4.3b3" diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index b0813abe..c1404026 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -6,12 +6,15 @@ from .config import ( IMAGE_SCORE_THRESHOLD, SOCIAL_MEDIA_DOMAINS, ) + from .user_agent_generator import UserAgentGenerator from .extraction_strategy import ExtractionStrategy from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import MarkdownGenerationStrategy +from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, LLMContentFilter, PruningContentFilter from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy from typing import Optional, Union, List +from .cache_context import CacheMode class BrowserConfig: @@ -81,13 +84,13 @@ class BrowserConfig: user_data_dir: str = None, chrome_channel: str = "chromium", channel: str = "chromium", - proxy: Optional[str] = None, + proxy: str = None, proxy_config: dict = None, viewport_width: int = 1080, viewport_height: int = 600, accept_downloads: bool = False, downloads_path: str = None, - storage_state=None, + storage_state : Union[str, dict, None]=None, ignore_https_errors: bool = True, java_script_enabled: bool = True, sleep_on_close: bool = False, @@ -382,7 +385,7 @@ class CrawlerRunConfig: extraction_strategy: ExtractionStrategy = None, chunking_strategy: ChunkingStrategy = RegexChunking(), markdown_generator: MarkdownGenerationStrategy = None, - content_filter=None, + content_filter : RelevantContentFilter = None, only_text: bool = False, css_selector: str = None, excluded_tags: list = None, @@ -396,7 +399,7 @@ class CrawlerRunConfig: # SSL Parameters fetch_ssl_certificate: bool = False, # Caching Parameters - cache_mode=None, + cache_mode: CacheMode =None, session_id: str = None, bypass_cache: bool = False, disable_cache: bool = False, diff --git a/crawl4ai/utils.scraping.py b/crawl4ai/utils.scraping.py deleted file mode 100644 index e69de29b..00000000 diff --git a/mkdocs.yml b/mkdocs.yml index 255492e3..16f44b05 100644 --- a/mkdocs.yml +++ b/mkdocs.yml @@ -1,4 +1,4 @@ -site_name: Crawl4AI Documentation +site_name: Crawl4AI Documentation (v0.4.3b2) site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper site_url: https://docs.crawl4ai.com repo_url: https://github.com/unclecode/crawl4ai @@ -52,6 +52,11 @@ nav: theme: name: 'terminal' palette: 'dark' + icon: + repo: fontawesome/brands/github + +plugins: + - search markdown_extensions: - pymdownx.highlight: @@ -64,6 +69,9 @@ markdown_extensions: - attr_list - tables +extra: + version: !ENV [CRAWL4AI_VERSION, 'development'] + extra_css: - assets/styles.css - assets/highlight.css @@ -72,3 +80,4 @@ extra_css: extra_javascript: - assets/highlight.min.js - assets/highlight_init.js + - https://buttons.github.io/buttons.js \ No newline at end of file From 6a01008a2b518748e662d28cdd6ad4a0f8ab70c0 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 23 Jan 2025 22:33:36 +0800 Subject: [PATCH 2/7] docs(multi-url): improve documentation clarity and update examples - Restructure multi-URL crawling documentation with better formatting and examples - Update code examples to use new API syntax (arun_many) - Add detailed parameter explanations for RateLimiter and Dispatchers - Enhance CSS styling for better documentation readability - Fix outdated method calls in feature demo script BREAKING CHANGE: Updated dispatcher.run_urls() to crawler.arun_many() in examples --- docs/examples/v0_4_3b2_features_demo.py | 20 ++- docs/md_v2/advanced/multi-url-crawling.md | 195 +++++++++++++++++++--- docs/md_v2/assets/styles.css | 14 ++ 3 files changed, 201 insertions(+), 28 deletions(-) diff --git a/docs/examples/v0_4_3b2_features_demo.py b/docs/examples/v0_4_3b2_features_demo.py index 6e091423..a3a7355b 100644 --- a/docs/examples/v0_4_3b2_features_demo.py +++ b/docs/examples/v0_4_3b2_features_demo.py @@ -85,10 +85,10 @@ async def demo_memory_dispatcher(): ) print("\n🚀 Starting batch crawl...") - results = await dispatcher.run_urls( + results = await crawler.arun_many( urls=urls, - crawler=crawler, config=crawler_config, + dispatcher=dispatcher ) print(f"\n✅ Completed {len(results)} URLs successfully") @@ -115,15 +115,17 @@ async def demo_streaming_support(): dispatcher = MemoryAdaptiveDispatcher(max_session_permit=3, check_interval=0.5) print("Starting streaming crawl...") - async for result in dispatcher.run_urls_stream( - urls=urls, crawler=crawler, config=crawler_config + async for result in await crawler.arun_many( + urls=urls, + config=crawler_config, + dispatcher=dispatcher ): # Process each result as it arrives print( - f"Received result for {result.url} - Success: {result.result.success}" + f"Received result for {result.url} - Success: {result.success}" ) - if result.result.success: - print(f"Content length: {len(result.result.markdown)}") + if result.success: + print(f"Content length: {len(result.markdown)}") async def demo_content_scraping(): @@ -147,6 +149,8 @@ async def demo_content_scraping(): print("Successfully scraped content using LXML strategy") + + async def demo_llm_markdown(): """ 4. LLM-Powered Markdown Generation Demo @@ -336,7 +340,7 @@ async def main(): # Efficiency & Speed Demos print("\n🚀 EFFICIENCY & SPEED DEMOS") - await demo_memory_dispatcher() + # await demo_memory_dispatcher() await demo_streaming_support() await demo_content_scraping() diff --git a/docs/md_v2/advanced/multi-url-crawling.md b/docs/md_v2/advanced/multi-url-crawling.md index 12c4f916..f6d944d6 100644 --- a/docs/md_v2/advanced/multi-url-crawling.md +++ b/docs/md_v2/advanced/multi-url-crawling.md @@ -5,16 +5,20 @@ ## 1. Introduction When crawling many URLs: + - **Basic**: Use `arun()` in a loop (simple but less efficient) - **Better**: Use `arun_many()`, which efficiently handles multiple URLs with proper concurrency control - **Best**: Customize dispatcher behavior for your specific needs (memory management, rate limits, etc.) **Why Dispatchers?** + - **Adaptive**: Memory-based dispatchers can pause or slow down based on system resources - **Rate-limiting**: Built-in rate limiting with exponential backoff for 429/503 responses - **Real-time Monitoring**: Live dashboard of ongoing tasks, memory usage, and performance - **Flexibility**: Choose between memory-adaptive or semaphore-based concurrency +--- + ## 2. Core Components ### 2.1 Rate Limiter @@ -22,34 +26,116 @@ When crawling many URLs: ```python class RateLimiter: def __init__( - base_delay: Tuple[float, float] = (1.0, 3.0), # Random delay range between requests - max_delay: float = 60.0, # Maximum backoff delay - max_retries: int = 3, # Retries before giving up - rate_limit_codes: List[int] = [429, 503] # Status codes triggering backoff + # Random delay range between requests + base_delay: Tuple[float, float] = (1.0, 3.0), + + # Maximum backoff delay + max_delay: float = 60.0, + + # Retries before giving up + max_retries: int = 3, + + # Status codes triggering backoff + rate_limit_codes: List[int] = [429, 503] ) ``` -The RateLimiter provides: -- Random delays between requests -- Exponential backoff on rate limit responses -- Domain-specific rate limiting -- Automatic retry handling +Here’s the revised and simplified explanation of the **RateLimiter**, focusing on constructor parameters and adhering to your markdown style and mkDocs guidelines. + +#### RateLimiter Constructor Parameters + +The **RateLimiter** is a utility that helps manage the pace of requests to avoid overloading servers or getting blocked due to rate limits. It operates internally to delay requests and handle retries but can be configured using its constructor parameters. + +**Parameters of the `RateLimiter` constructor:** + +1. **`base_delay`** (`Tuple[float, float]`, default: `(1.0, 3.0)`) +  The range for a random delay (in seconds) between consecutive requests to the same domain. + +- A random delay is chosen between `base_delay[0]` and `base_delay[1]` for each request. +- This prevents sending requests at a predictable frequency, reducing the chances of triggering rate limits. + +**Example:** +If `base_delay = (2.0, 5.0)`, delays could be randomly chosen as `2.3s`, `4.1s`, etc. + +--- + +2. **`max_delay`** (`float`, default: `60.0`) +  The maximum allowable delay when rate-limiting errors occur. + +- When servers return rate-limit responses (e.g., 429 or 503), the delay increases exponentially with jitter. +- The `max_delay` ensures the delay doesn’t grow unreasonably high, capping it at this value. + +**Example:** +For a `max_delay = 30.0`, even if backoff calculations suggest a delay of `45s`, it will cap at `30s`. + +--- + +3. **`max_retries`** (`int`, default: `3`) +  The maximum number of retries for a request if rate-limiting errors occur. + +- After encountering a rate-limit response, the `RateLimiter` retries the request up to this number of times. +- If all retries fail, the request is marked as failed, and the process continues. + +**Example:** +If `max_retries = 3`, the system retries a failed request three times before giving up. + +--- + +4. **`rate_limit_codes`** (`List[int]`, default: `[429, 503]`) +  A list of HTTP status codes that trigger the rate-limiting logic. + +- These status codes indicate the server is overwhelmed or actively limiting requests. +- You can customize this list to include other codes based on specific server behavior. + +**Example:** +If `rate_limit_codes = [429, 503, 504]`, the crawler will back off on these three error codes. + +--- + +**How to Use the `RateLimiter`:** + +Here’s an example of initializing and using a `RateLimiter` in your project: + +```python +from crawl4ai import RateLimiter + +# Create a RateLimiter with custom settings +rate_limiter = RateLimiter( + base_delay=(2.0, 4.0), # Random delay between 2-4 seconds + max_delay=30.0, # Cap delay at 30 seconds + max_retries=5, # Retry up to 5 times on rate-limiting errors + rate_limit_codes=[429, 503] # Handle these HTTP status codes +) + +# RateLimiter will handle delays and retries internally +# No additional setup is required for its operation +``` + +The `RateLimiter` integrates seamlessly with dispatchers like `MemoryAdaptiveDispatcher` and `SemaphoreDispatcher`, ensuring requests are paced correctly without user intervention. Its internal mechanisms manage delays and retries to avoid overwhelming servers while maximizing efficiency. + ### 2.2 Crawler Monitor The CrawlerMonitor provides real-time visibility into crawling operations: ```python +from crawl4ai import CrawlerMonitor, DisplayMode monitor = CrawlerMonitor( - max_visible_rows=15, # Maximum rows in live display - display_mode=DisplayMode.DETAILED # DETAILED or AGGREGATED view + # Maximum rows in live display + max_visible_rows=15, + + # DETAILED or AGGREGATED view + display_mode=DisplayMode.DETAILED ) ``` **Display Modes**: + 1. **DETAILED**: Shows individual task status, memory usage, and timing 2. **AGGREGATED**: Displays summary statistics and overall progress +--- + ## 3. Available Dispatchers ### 3.1 MemoryAdaptiveDispatcher (Default) @@ -57,6 +143,8 @@ monitor = CrawlerMonitor( Automatically manages concurrency based on system memory usage: ```python +from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher + dispatcher = MemoryAdaptiveDispatcher( memory_threshold_percent=90.0, # Pause if memory exceeds this check_interval=1.0, # How often to check memory @@ -73,13 +161,37 @@ dispatcher = MemoryAdaptiveDispatcher( ) ``` +**Constructor Parameters:** + +1. **`memory_threshold_percent`** (`float`, default: `90.0`) +  Specifies the memory usage threshold (as a percentage). If system memory usage exceeds this value, the dispatcher pauses crawling to prevent system overload. + +2. **`check_interval`** (`float`, default: `1.0`) +  The interval (in seconds) at which the dispatcher checks system memory usage. + +3. **`max_session_permit`** (`int`, default: `10`) +  The maximum number of concurrent crawling tasks allowed. This ensures resource limits are respected while maintaining concurrency. + +4. **`memory_wait_timeout`** (`float`, default: `300.0`) +  Optional timeout (in seconds). If memory usage exceeds `memory_threshold_percent` for longer than this duration, a `MemoryError` is raised. + +5. **`rate_limiter`** (`RateLimiter`, default: `None`) +  Optional rate-limiting logic to avoid server-side blocking (e.g., for handling 429 or 503 errors). See **RateLimiter** for details. + +6. **`monitor`** (`CrawlerMonitor`, default: `None`) +  Optional monitoring for real-time task tracking and performance insights. See **CrawlerMonitor** for details. + +--- + ### 3.2 SemaphoreDispatcher Provides simple concurrency control with a fixed limit: ```python +from crawl4ai.async_dispatcher import SemaphoreDispatcher + dispatcher = SemaphoreDispatcher( - max_session_permit=5, # Fixed concurrent tasks + max_session_permit=20, # Maximum concurrent tasks rate_limiter=RateLimiter( # Optional rate limiting base_delay=(0.5, 1.0), max_delay=10.0 @@ -91,6 +203,19 @@ dispatcher = SemaphoreDispatcher( ) ``` +**Constructor Parameters:** + +1. **`max_session_permit`** (`int`, default: `20`) +  The maximum number of concurrent crawling tasks allowed, irrespective of semaphore slots. + +2. **`rate_limiter`** (`RateLimiter`, default: `None`) +  Optional rate-limiting logic to avoid overwhelming servers. See **RateLimiter** for details. + +3. **`monitor`** (`CrawlerMonitor`, default: `None`) +  Optional monitoring for tracking task progress and resource usage. See **CrawlerMonitor** for details. + +--- + ## 4. Usage Examples ### 4.1 Batch Processing (Default) @@ -128,6 +253,14 @@ async def crawl_batch(): print(f"Failed to crawl {result.url}: {result.error_message}") ``` +**Review:** +- **Purpose:** Executes a batch crawl with all URLs processed together after crawling is complete. +- **Dispatcher:** Uses `MemoryAdaptiveDispatcher` to manage concurrency and system memory. +- **Stream:** Disabled (`stream=False`), so all results are collected at once for post-processing. +- **Best Use Case:** When you need to analyze results in bulk rather than individually during the crawl. + +--- + ### 4.2 Streaming Mode ```python @@ -161,6 +294,14 @@ async def crawl_streaming(): print(f"Failed to crawl {result.url}: {result.error_message}") ``` +**Review:** +- **Purpose:** Enables streaming to process results as soon as they’re available. +- **Dispatcher:** Uses `MemoryAdaptiveDispatcher` for concurrency and memory management. +- **Stream:** Enabled (`stream=True`), allowing real-time processing during crawling. +- **Best Use Case:** When you need to act on results immediately, such as for real-time analytics or progressive data storage. + +--- + ### 4.3 Semaphore-based Crawling ```python @@ -189,6 +330,14 @@ async def crawl_with_semaphore(urls): return results ``` +**Review:** +- **Purpose:** Uses `SemaphoreDispatcher` to limit concurrency with a fixed number of slots. +- **Dispatcher:** Configured with a semaphore to control parallel crawling tasks. +- **Rate Limiter:** Prevents servers from being overwhelmed by pacing requests. +- **Best Use Case:** When you want precise control over the number of concurrent requests, independent of system memory. + +--- + ### 4.4 Robots.txt Consideration ```python @@ -221,11 +370,13 @@ if __name__ == "__main__": asyncio.run(main()) ``` -**Key Points**: -- When `check_robots_txt=True`, each URL's robots.txt is checked before crawling -- Robots.txt files are cached for efficiency -- Failed robots.txt checks return 403 status code -- Dispatcher handles robots.txt checks automatically for each URL +**Review:** +- **Purpose:** Ensures compliance with `robots.txt` rules for ethical and legal web crawling. +- **Configuration:** Set `check_robots_txt=True` to validate each URL against `robots.txt` before crawling. +- **Dispatcher:** Handles requests with concurrency limits (`semaphore_count=3`). +- **Best Use Case:** When crawling websites that strictly enforce robots.txt policies or for responsible crawling practices. + +--- ## 5. Dispatch Results @@ -255,20 +406,24 @@ for result in results: ## 6. Summary -1. **Two Dispatcher Types**: +1. **Two Dispatcher Types**: + - MemoryAdaptiveDispatcher (default): Dynamic concurrency based on memory - SemaphoreDispatcher: Fixed concurrency limit -2. **Optional Components**: +2. **Optional Components**: + - RateLimiter: Smart request pacing and backoff - CrawlerMonitor: Real-time progress visualization -3. **Key Benefits**: +3. **Key Benefits**: + - Automatic memory management - Built-in rate limiting - Live progress monitoring - Flexible concurrency control Choose the dispatcher that best fits your needs: + - **MemoryAdaptiveDispatcher**: For large crawls or limited resources - **SemaphoreDispatcher**: For simple, fixed-concurrency scenarios diff --git a/docs/md_v2/assets/styles.css b/docs/md_v2/assets/styles.css index ed7fc12e..8ee8cbb1 100644 --- a/docs/md_v2/assets/styles.css +++ b/docs/md_v2/assets/styles.css @@ -95,6 +95,10 @@ strong { } +div.highlight { + margin-bottom: 2em; +} + .terminal-card > header { color: var(--font-color); text-align: center; @@ -231,6 +235,16 @@ pre { font-size: 2em; } +.terminal h2 { + font-size: 1.5em; + margin-bottom: 0.8em; +} + +.terminal h3 { + font-size: 1.3em; + margin-bottom: 0.8em; +} + .terminal h1, .terminal h2, .terminal h3, .terminal h4, .terminal h5, .terminal h6 { text-shadow: 0 0 0px var(--font-color), 0 0 0px var(--font-color), 0 0 0px var(--font-color); } From 65d33bcc0f50c8b7e2ee4c875d70fe3a3c866a94 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Thu, 23 Jan 2025 22:36:58 +0800 Subject: [PATCH 3/7] style(docs): improve code formatting in features demo Clean up whitespace and improve readability in v0_4_3b2_features_demo.py: - Remove excessive blank lines between functions - Improve config formatting for better readability - Uncomment memory dispatcher demo in main function No breaking changes. --- docs/examples/v0_4_3b2_features_demo.py | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/docs/examples/v0_4_3b2_features_demo.py b/docs/examples/v0_4_3b2_features_demo.py index a3a7355b..7771c3f8 100644 --- a/docs/examples/v0_4_3b2_features_demo.py +++ b/docs/examples/v0_4_3b2_features_demo.py @@ -95,7 +95,6 @@ async def demo_memory_dispatcher(): except Exception as e: print(f"\n❌ Error in memory dispatcher demo: {str(e)}") - async def demo_streaming_support(): """ 2. Streaming Support Demo @@ -127,7 +126,6 @@ async def demo_streaming_support(): if result.success: print(f"Content length: {len(result.markdown)}") - async def demo_content_scraping(): """ 3. Content Scraping Strategy Demo @@ -140,7 +138,10 @@ async def demo_content_scraping(): url = "https://example.com/article" # Configure with the new LXML strategy - config = CrawlerRunConfig(scraping_strategy=LXMLWebScrapingStrategy(), verbose=True) + config = CrawlerRunConfig( + scraping_strategy=LXMLWebScrapingStrategy(), + verbose=True + ) print("Scraping content with LXML strategy...") async with crawler: @@ -148,9 +149,6 @@ async def demo_content_scraping(): if result.success: print("Successfully scraped content using LXML strategy") - - - async def demo_llm_markdown(): """ 4. LLM-Powered Markdown Generation Demo @@ -201,7 +199,6 @@ async def demo_llm_markdown(): print(result.markdown_v2.fit_markdown[:500]) print("Successfully generated LLM-filtered markdown") - async def demo_robots_compliance(): """ 5. Robots.txt Compliance Demo @@ -225,8 +222,6 @@ async def demo_robots_compliance(): elif result.success: print(f"Successfully crawled: {result.url}") - - async def demo_json_schema_generation(): """ 7. LLM-Powered Schema Generation Demo @@ -280,7 +275,6 @@ async def demo_json_schema_generation(): print(json.dumps(result.extracted_content, indent=2) if result.extracted_content else None) print("Successfully used generated schema for crawling") - async def demo_proxy_rotation(): """ 8. Proxy Rotation Demo @@ -340,7 +334,7 @@ async def main(): # Efficiency & Speed Demos print("\n🚀 EFFICIENCY & SPEED DEMOS") - # await demo_memory_dispatcher() + await demo_memory_dispatcher() await demo_streaming_support() await demo_content_scraping() From 69a77222efe976b4fb9c3b2074817e858a6d7248 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Fri, 24 Jan 2025 15:53:47 +0800 Subject: [PATCH 4/7] feat(browser): add CDP URL configuration support Add support for direct CDP URL configuration in BrowserConfig and ManagedBrowser classes. This allows connecting to remote browser instances using custom CDP endpoints instead of always launching a local browser. - Added cdp_url parameter to BrowserConfig - Added cdp_url support in ManagedBrowser.start() method - Updated documentation for new parameters --- crawl4ai/async_configs.py | 6 ++++++ crawl4ai/async_crawler_strategy.py | 13 +++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index c1404026..d0a9b9e1 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -32,6 +32,7 @@ class BrowserConfig: Default: True. use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing advanced manipulation. Default: False. + cdp_url (str): URL for the Chrome DevTools Protocol (CDP) endpoint. Default: "ws://localhost:9222/devtools/browser/". debugging_port (int): Port for the browser debugging protocol. Default: 9222. use_persistent_context (bool): Use a persistent browser context (like a persistent profile). Automatically sets use_managed_browser=True. Default: False. @@ -80,6 +81,7 @@ class BrowserConfig: browser_type: str = "chromium", headless: bool = True, use_managed_browser: bool = False, + cdp_url: str = None, use_persistent_context: bool = False, user_data_dir: str = None, chrome_channel: str = "chromium", @@ -107,10 +109,12 @@ class BrowserConfig: light_mode: bool = False, extra_args: list = None, debugging_port: int = 9222, + host: str = "localhost", ): self.browser_type = browser_type self.headless = headless self.use_managed_browser = use_managed_browser + self.cdp_url = cdp_url self.use_persistent_context = use_persistent_context self.user_data_dir = user_data_dir self.chrome_channel = chrome_channel or self.browser_type or "chromium" @@ -162,6 +166,7 @@ class BrowserConfig: browser_type=kwargs.get("browser_type", "chromium"), headless=kwargs.get("headless", True), use_managed_browser=kwargs.get("use_managed_browser", False), + cdp_url=kwargs.get("cdp_url"), use_persistent_context=kwargs.get("use_persistent_context", False), user_data_dir=kwargs.get("user_data_dir"), chrome_channel=kwargs.get("chrome_channel", "chromium"), @@ -194,6 +199,7 @@ class BrowserConfig: "browser_type": self.browser_type, "headless": self.headless, "use_managed_browser": self.use_managed_browser, + "cdp_url": self.cdp_url, "use_persistent_context": self.use_persistent_context, "user_data_dir": self.user_data_dir, "chrome_channel": self.chrome_channel, diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index 738dfb51..b11796e0 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -102,6 +102,7 @@ class ManagedBrowser: logger=None, host: str = "localhost", debugging_port: int = 9222, + cdp_url: Optional[str] = None, ): """ Initialize the ManagedBrowser instance. @@ -116,6 +117,7 @@ class ManagedBrowser: logger (logging.Logger): Logger instance for logging messages. Default: None. host (str): Host for debugging the browser. Default: "localhost". debugging_port (int): Port for debugging the browser. Default: 9222. + cdp_url (str or None): CDP URL to connect to the browser. Default: None. """ self.browser_type = browser_type self.user_data_dir = user_data_dir @@ -129,9 +131,16 @@ class ManagedBrowser: async def start(self) -> str: """ - Starts the browser process and returns the CDP endpoint URL. - If user_data_dir is not provided, creates a temporary directory. + Starts the browser process or returns CDP endpoint URL. + If cdp_url is provided, returns it directly. + If user_data_dir is not provided for local browser, creates a temporary directory. + + Returns: + str: CDP endpoint URL """ + # If CDP URL provided, just return it + if self.cdp_url: + return self.cdp_url # Create temp dir if needed if not self.user_data_dir: From 4d7f91b3789d645b1a3231552ac46a2c136ee607 Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 25 Jan 2025 21:16:39 +0800 Subject: [PATCH 5/7] refactor(user-agent): improve user agent generation system Redesign user agent generation to be more modular and reliable: - Add abstract base class UAGen for user agent generation - Implement ValidUAGenerator using fake-useragent library - Add OnlineUAGenerator for fetching real-world user agents - Update browser configurations to use new UA generation system - Improve client hints generation This change makes the user agent system more maintainable and provides better real-world user agent coverage. --- crawl4ai/async_configs.py | 43 +++++-- crawl4ai/async_crawler_strategy.py | 14 ++- crawl4ai/user_agent_generator.py | 182 ++++++++++++++++++++++++----- 3 files changed, 196 insertions(+), 43 deletions(-) diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py index d0a9b9e1..44c83262 100644 --- a/crawl4ai/async_configs.py +++ b/crawl4ai/async_configs.py @@ -7,7 +7,7 @@ from .config import ( SOCIAL_MEDIA_DOMAINS, ) -from .user_agent_generator import UserAgentGenerator +from .user_agent_generator import UserAgentGenerator, UAGen, ValidUAGenerator, OnlineUAGenerator from .extraction_strategy import ExtractionStrategy from .chunking_strategy import ChunkingStrategy, RegexChunking from .markdown_generation_strategy import MarkdownGenerationStrategy @@ -100,11 +100,13 @@ class BrowserConfig: cookies: list = None, headers: dict = None, user_agent: str = ( - "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 " - "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" + # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 " + # "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 " + # "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" + "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36" ), - user_agent_mode: str = None, - user_agent_generator_config: dict = None, + user_agent_mode: str = "", + user_agent_generator_config: dict = {}, text_mode: bool = False, light_mode: bool = False, extra_args: list = None, @@ -143,17 +145,15 @@ class BrowserConfig: self.verbose = verbose self.debugging_port = debugging_port - user_agenr_generator = UserAgentGenerator() - if self.user_agent_mode != "random" and self.user_agent_generator_config: - self.user_agent = user_agenr_generator.generate( + fa_user_agenr_generator = ValidUAGenerator() + if self.user_agent_mode == "random": + self.user_agent = fa_user_agenr_generator.generate( **(self.user_agent_generator_config or {}) ) - elif self.user_agent_mode == "random": - self.user_agent = user_agenr_generator.generate() else: pass - - self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) + + self.browser_hint = UAGen.generate_client_hints(self.user_agent) self.headers.setdefault("sec-ch-ua", self.browser_hint) # If persistent context is requested, ensure managed browser is enabled @@ -382,6 +382,11 @@ class CrawlerRunConfig: stream (bool): If True, stream the page content as it is being loaded. url: str = None # This is not a compulsory parameter check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False + user_agent (str): Custom User-Agent string to use. Default: None + user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided + user_agent as-is. Default: None. + user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set. + Default: None. """ def __init__( @@ -453,6 +458,9 @@ class CrawlerRunConfig: stream: bool = False, url: str = None, check_robots_txt: bool = False, + user_agent: str = None, + user_agent_mode: str = None, + user_agent_generator_config: dict = {}, ): self.url = url @@ -535,6 +543,11 @@ class CrawlerRunConfig: # Robots.txt Handling Parameters self.check_robots_txt = check_robots_txt + # User Agent Parameters + self.user_agent = user_agent + self.user_agent_mode = user_agent_mode + self.user_agent_generator_config = user_agent_generator_config + # Validate type of extraction strategy and chunking strategy if they are provided if self.extraction_strategy is not None and not isinstance( self.extraction_strategy, ExtractionStrategy @@ -632,6 +645,9 @@ class CrawlerRunConfig: stream=kwargs.get("stream", False), url=kwargs.get("url"), check_robots_txt=kwargs.get("check_robots_txt", False), + user_agent=kwargs.get("user_agent"), + user_agent_mode=kwargs.get("user_agent_mode"), + user_agent_generator_config=kwargs.get("user_agent_generator_config", {}), ) # Create a funciton returns dict of the object @@ -695,6 +711,9 @@ class CrawlerRunConfig: "stream": self.stream, "url": self.url, "check_robots_txt": self.check_robots_txt, + "user_agent": self.user_agent, + "user_agent_mode": self.user_agent_mode, + "user_agent_generator_config": self.user_agent_generator_config, } def clone(self, **kwargs): diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py index b11796e0..62ee4c65 100644 --- a/crawl4ai/async_crawler_strategy.py +++ b/crawl4ai/async_crawler_strategy.py @@ -23,6 +23,7 @@ from .async_logger import AsyncLogger from playwright_stealth import StealthConfig from .ssl_certificate import SSLCertificate from .utils import get_home_folder, get_chromium_path +from .user_agent_generator import ValidUAGenerator, OnlineUAGenerator stealth_config = StealthConfig( webdriver=True, @@ -128,6 +129,7 @@ class ManagedBrowser: self.host = host self.logger = logger self.shutting_down = False + self.cdp_url = cdp_url async def start(self) -> str: """ @@ -563,7 +565,7 @@ class BrowserManager: Context: Browser context object with the specified configurations """ # Base settings - user_agent = self.config.headers.get("User-Agent", self.config.user_agent) + user_agent = self.config.headers.get("User-Agent", self.config.user_agent) viewport_settings = { "width": self.config.viewport_width, "height": self.config.viewport_height, @@ -1269,10 +1271,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy): self._downloaded_files = [] # Handle user agent with magic mode - user_agent = self.browser_config.user_agent - if config.magic and self.browser_config.user_agent_mode != "random": - self.browser_config.user_agent = UserAgentGenerator().generate( - **(self.browser_config.user_agent_generator_config or {}) + user_agent_to_override = config.user_agent + if user_agent_to_override: + self.browser_config.user_agent = user_agent_to_override + elif config.magic or config.user_agent_mode == "random": + self.browser_config.user_agent = ValidUAGenerator().generate( + **(config.user_agent_generator_config or {}) ) # Get page for session diff --git a/crawl4ai/user_agent_generator.py b/crawl4ai/user_agent_generator.py index 4f0f42cb..91e7a31d 100644 --- a/crawl4ai/user_agent_generator.py +++ b/crawl4ai/user_agent_generator.py @@ -2,8 +2,146 @@ import random from typing import Optional, Literal, List, Dict, Tuple import re +from abc import ABC, abstractmethod +import random +from fake_useragent import UserAgent +import requests +from lxml import html +import json +from typing import Optional, List, Union, Dict -class UserAgentGenerator: +class UAGen(ABC): + @abstractmethod + def generate(self, + browsers: Optional[List[str]] = None, + os: Optional[Union[str, List[str]]] = None, + min_version: float = 0.0, + platforms: Optional[Union[str, List[str]]] = None, + pct_threshold: Optional[float] = None, + fallback: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36") -> Union[str, Dict]: + pass + + @staticmethod + def generate_client_hints( user_agent: str) -> str: + """Generate Sec-CH-UA header value based on user agent string""" + def _parse_user_agent(user_agent: str) -> Dict[str, str]: + """Parse a user agent string to extract browser and version information""" + browsers = { + "chrome": r"Chrome/(\d+)", + "edge": r"Edg/(\d+)", + "safari": r"Version/(\d+)", + "firefox": r"Firefox/(\d+)", + } + + result = {} + for browser, pattern in browsers.items(): + match = re.search(pattern, user_agent) + if match: + result[browser] = match.group(1) + + return result + browsers = _parse_user_agent(user_agent) + + # Client hints components + hints = [] + + # Handle different browser combinations + if "chrome" in browsers: + hints.append(f'"Chromium";v="{browsers["chrome"]}"') + hints.append('"Not_A Brand";v="8"') + + if "edge" in browsers: + hints.append(f'"Microsoft Edge";v="{browsers["edge"]}"') + else: + hints.append(f'"Google Chrome";v="{browsers["chrome"]}"') + + elif "firefox" in browsers: + # Firefox doesn't typically send Sec-CH-UA + return '""' + + elif "safari" in browsers: + # Safari's format for client hints + hints.append(f'"Safari";v="{browsers["safari"]}"') + hints.append('"Not_A Brand";v="8"') + + return ", ".join(hints) + +class ValidUAGenerator(UAGen): + def __init__(self): + self.ua = UserAgent() + + def generate(self, + browsers: Optional[List[str]] = None, + os: Optional[Union[str, List[str]]] = None, + min_version: float = 0.0, + platforms: Optional[Union[str, List[str]]] = None, + pct_threshold: Optional[float] = None, + fallback: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36") -> str: + + self.ua = UserAgent( + browsers=browsers or ['Chrome', 'Firefox', 'Edge'], + os=os or ['Windows', 'Mac OS X'], + min_version=min_version, + platforms=platforms or ['desktop'], + fallback=fallback + ) + return self.ua.random + +class OnlineUAGenerator(UAGen): + def __init__(self): + self.agents = [] + self._fetch_agents() + + def _fetch_agents(self): + try: + response = requests.get( + 'https://www.useragents.me/', + timeout=5, + headers={'Accept': 'text/html,application/xhtml+xml'} + ) + response.raise_for_status() + + tree = html.fromstring(response.content) + json_text = tree.cssselect('#most-common-desktop-useragents-json-csv > div:nth-child(1) > textarea')[0].text + self.agents = json.loads(json_text) + except Exception as e: + print(f"Error fetching agents: {e}") + + def generate(self, + browsers: Optional[List[str]] = None, + os: Optional[Union[str, List[str]]] = None, + min_version: float = 0.0, + platforms: Optional[Union[str, List[str]]] = None, + pct_threshold: Optional[float] = None, + fallback: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36") -> Dict: + + if not self.agents: + self._fetch_agents() + + filtered_agents = self.agents + + if pct_threshold: + filtered_agents = [a for a in filtered_agents if a['pct'] >= pct_threshold] + + if browsers: + filtered_agents = [a for a in filtered_agents + if any(b.lower() in a['ua'].lower() for b in browsers)] + + if os: + os_list = [os] if isinstance(os, str) else os + filtered_agents = [a for a in filtered_agents + if any(o.lower() in a['ua'].lower() for o in os_list)] + + if platforms: + platform_list = [platforms] if isinstance(platforms, str) else platforms + filtered_agents = [a for a in filtered_agents + if any(p.lower() in a['ua'].lower() for p in platform_list)] + + return filtered_agents[0] if filtered_agents else {'ua': fallback, 'pct': 0} + + + +class UserAgentGenerator(): """ Generate random user agents with specified constraints. @@ -187,9 +325,15 @@ class UserAgentGenerator: browser_stack = self.get_browser_stack(num_browsers) # Add appropriate legacy token based on browser stack - if "Firefox" in str(browser_stack): + if "Firefox" in str(browser_stack) or browser_type == "firefox": components.append(random.choice(self.rendering_engines["gecko"])) - elif "Chrome" in str(browser_stack) or "Safari" in str(browser_stack): + elif "Chrome" in str(browser_stack) or "Safari" in str(browser_stack) or browser_type == "chrome": + components.append(self.rendering_engines["chrome_webkit"]) + components.append("(KHTML, like Gecko)") + elif "Edge" in str(browser_stack) or browser_type == "edge": + components.append(self.rendering_engines["safari_webkit"]) + components.append("(KHTML, like Gecko)") + elif "Safari" in str(browser_stack) or browser_type == "safari": components.append(self.rendering_engines["chrome_webkit"]) components.append("(KHTML, like Gecko)") @@ -273,27 +417,13 @@ class UserAgentGenerator: # Example usage: if __name__ == "__main__": - generator = UserAgentGenerator() - print(generator.generate()) + + # Usage example: + generator = ValidUAGenerator() + ua = generator.generate() + print(ua) + + generator = OnlineUAGenerator() + ua = generator.generate() + print(ua) - print("\nSingle browser (Chrome):") - print(generator.generate(num_browsers=1, browser_type="chrome")) - - print("\nTwo browsers (Gecko/Firefox):") - print(generator.generate(num_browsers=2)) - - print("\nThree browsers (Chrome/Safari/Edge):") - print(generator.generate(num_browsers=3)) - - print("\nFirefox on Linux:") - print( - generator.generate( - device_type="desktop", - os_type="linux", - browser_type="firefox", - num_browsers=2, - ) - ) - - print("\nChrome/Safari/Edge on Windows:") - print(generator.generate(device_type="desktop", os_type="windows", num_browsers=3)) From 97796f39d27f3a0f9ee62512f72dafb2e630a29e Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 25 Jan 2025 21:52:35 +0800 Subject: [PATCH 6/7] docs(examples): update proxy rotation demo and disable other demos Modify proxy rotation example to include empty user agent setting and comment out other demo functions for focused testing. This change simplifies the demo file to focus specifically on proxy rotation functionality. No breaking changes. --- docs/examples/v0_4_3b2_features_demo.py | 17 ++++++++--------- 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/docs/examples/v0_4_3b2_features_demo.py b/docs/examples/v0_4_3b2_features_demo.py index 7771c3f8..3b604c62 100644 --- a/docs/examples/v0_4_3b2_features_demo.py +++ b/docs/examples/v0_4_3b2_features_demo.py @@ -297,8 +297,7 @@ async def demo_proxy_rotation(): } except Exception as e: print(f"Error loading proxy: {e}") - return None - + return None # Create 10 test requests to httpbin urls = ["https://httpbin.org/ip"] * 2 @@ -314,7 +313,7 @@ async def demo_proxy_rotation(): continue # Create new config with proxy - current_config = run_config.clone(proxy_config=proxy) + current_config = run_config.clone(proxy_config=proxy, user_agent="") result = await crawler.arun(url=url, config=current_config) if result.success: @@ -334,18 +333,18 @@ async def main(): # Efficiency & Speed Demos print("\n🚀 EFFICIENCY & SPEED DEMOS") - await demo_memory_dispatcher() - await demo_streaming_support() - await demo_content_scraping() + # await demo_memory_dispatcher() + # await demo_streaming_support() + # await demo_content_scraping() # # LLM Integration Demos print("\n🤖 LLM INTEGRATION DEMOS") - await demo_json_schema_generation() - await demo_llm_markdown() + # await demo_json_schema_generation() + # await demo_llm_markdown() # # Core Improvements print("\n🔧 CORE IMPROVEMENT DEMOS") - await demo_robots_compliance() + # await demo_robots_compliance() await demo_proxy_rotation() if __name__ == "__main__": From 09ac7ed008a6ef5b89c78200fa632f7494e55bfc Mon Sep 17 00:00:00 2001 From: UncleCode Date: Sat, 25 Jan 2025 21:56:08 +0800 Subject: [PATCH 7/7] feat(demo): uncomment feature demos and add fake-useragent dependency Uncomments demonstration code for memory dispatcher, streaming support, content scraping, JSON schema generation, LLM markdown, and robots compliance in the v0.4.3b2 features demo file. Also adds fake-useragent package as a project dependency. This change makes all feature demonstrations active by default and ensures proper user agent handling capabilities. --- docs/examples/v0_4_3b2_features_demo.py | 12 ++++++------ pyproject.toml | 1 + 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/docs/examples/v0_4_3b2_features_demo.py b/docs/examples/v0_4_3b2_features_demo.py index 3b604c62..1032f346 100644 --- a/docs/examples/v0_4_3b2_features_demo.py +++ b/docs/examples/v0_4_3b2_features_demo.py @@ -333,18 +333,18 @@ async def main(): # Efficiency & Speed Demos print("\n🚀 EFFICIENCY & SPEED DEMOS") - # await demo_memory_dispatcher() - # await demo_streaming_support() - # await demo_content_scraping() + await demo_memory_dispatcher() + await demo_streaming_support() + await demo_content_scraping() # # LLM Integration Demos print("\n🤖 LLM INTEGRATION DEMOS") - # await demo_json_schema_generation() - # await demo_llm_markdown() + await demo_json_schema_generation() + await demo_llm_markdown() # # Core Improvements print("\n🔧 CORE IMPROVEMENT DEMOS") - # await demo_robots_compliance() + await demo_robots_compliance() await demo_proxy_rotation() if __name__ == "__main__": diff --git a/pyproject.toml b/pyproject.toml index 328438e9..38e1f89f 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -37,6 +37,7 @@ dependencies = [ "rich>=13.9.4", "cssselect>=1.2.0", "httpx==0.27.2", + "fake-useragent>=2.0.3" ] classifiers = [ "Development Status :: 4 - Beta",