Merge branch 'vr0.4.3b3'

This commit is contained in:
UncleCode
2025-01-25 21:57:29 +08:00
11 changed files with 425 additions and 75 deletions

View File

@@ -16,7 +16,7 @@ from .extraction_strategy import (
) )
from .chunking_strategy import ChunkingStrategy, RegexChunking from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import DefaultMarkdownGenerator from .markdown_generation_strategy import DefaultMarkdownGenerator
from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter from .content_filter_strategy import PruningContentFilter, BM25ContentFilter, LLMContentFilter, RelevantContentFilter
from .models import CrawlResult, MarkdownGenerationResult from .models import CrawlResult, MarkdownGenerationResult
from .async_dispatcher import ( from .async_dispatcher import (
MemoryAdaptiveDispatcher, MemoryAdaptiveDispatcher,
@@ -44,6 +44,7 @@ __all__ = [
"ChunkingStrategy", "ChunkingStrategy",
"RegexChunking", "RegexChunking",
"DefaultMarkdownGenerator", "DefaultMarkdownGenerator",
"RelevantContentFilter",
"PruningContentFilter", "PruningContentFilter",
"BM25ContentFilter", "BM25ContentFilter",
"LLMContentFilter", "LLMContentFilter",

View File

@@ -1,2 +1,2 @@
# crawl4ai/_version.py # crawl4ai/_version.py
__version__ = "0.4.3b2" __version__ = "0.4.3b3"

View File

@@ -6,12 +6,15 @@ from .config import (
IMAGE_SCORE_THRESHOLD, IMAGE_SCORE_THRESHOLD,
SOCIAL_MEDIA_DOMAINS, SOCIAL_MEDIA_DOMAINS,
) )
from .user_agent_generator import UserAgentGenerator
from .user_agent_generator import UserAgentGenerator, UAGen, ValidUAGenerator, OnlineUAGenerator
from .extraction_strategy import ExtractionStrategy from .extraction_strategy import ExtractionStrategy
from .chunking_strategy import ChunkingStrategy, RegexChunking from .chunking_strategy import ChunkingStrategy, RegexChunking
from .markdown_generation_strategy import MarkdownGenerationStrategy from .markdown_generation_strategy import MarkdownGenerationStrategy
from .content_filter_strategy import RelevantContentFilter, BM25ContentFilter, LLMContentFilter, PruningContentFilter
from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy from .content_scraping_strategy import ContentScrapingStrategy, WebScrapingStrategy
from typing import Optional, Union, List from typing import Optional, Union, List
from .cache_context import CacheMode
class BrowserConfig: class BrowserConfig:
@@ -29,6 +32,7 @@ class BrowserConfig:
Default: True. Default: True.
use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
advanced manipulation. Default: False. advanced manipulation. Default: False.
cdp_url (str): URL for the Chrome DevTools Protocol (CDP) endpoint. Default: "ws://localhost:9222/devtools/browser/".
debugging_port (int): Port for the browser debugging protocol. Default: 9222. debugging_port (int): Port for the browser debugging protocol. Default: 9222.
use_persistent_context (bool): Use a persistent browser context (like a persistent profile). use_persistent_context (bool): Use a persistent browser context (like a persistent profile).
Automatically sets use_managed_browser=True. Default: False. Automatically sets use_managed_browser=True. Default: False.
@@ -77,17 +81,18 @@ class BrowserConfig:
browser_type: str = "chromium", browser_type: str = "chromium",
headless: bool = True, headless: bool = True,
use_managed_browser: bool = False, use_managed_browser: bool = False,
cdp_url: str = None,
use_persistent_context: bool = False, use_persistent_context: bool = False,
user_data_dir: str = None, user_data_dir: str = None,
chrome_channel: str = "chromium", chrome_channel: str = "chromium",
channel: str = "chromium", channel: str = "chromium",
proxy: Optional[str] = None, proxy: str = None,
proxy_config: dict = None, proxy_config: dict = None,
viewport_width: int = 1080, viewport_width: int = 1080,
viewport_height: int = 600, viewport_height: int = 600,
accept_downloads: bool = False, accept_downloads: bool = False,
downloads_path: str = None, downloads_path: str = None,
storage_state=None, storage_state : Union[str, dict, None]=None,
ignore_https_errors: bool = True, ignore_https_errors: bool = True,
java_script_enabled: bool = True, java_script_enabled: bool = True,
sleep_on_close: bool = False, sleep_on_close: bool = False,
@@ -95,19 +100,23 @@ class BrowserConfig:
cookies: list = None, cookies: list = None,
headers: dict = None, headers: dict = None,
user_agent: str = ( user_agent: str = (
"Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 " # "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 "
"(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47" # "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 "
# "(KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47"
"Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36"
), ),
user_agent_mode: str = None, user_agent_mode: str = "",
user_agent_generator_config: dict = None, user_agent_generator_config: dict = {},
text_mode: bool = False, text_mode: bool = False,
light_mode: bool = False, light_mode: bool = False,
extra_args: list = None, extra_args: list = None,
debugging_port: int = 9222, debugging_port: int = 9222,
host: str = "localhost",
): ):
self.browser_type = browser_type self.browser_type = browser_type
self.headless = headless self.headless = headless
self.use_managed_browser = use_managed_browser self.use_managed_browser = use_managed_browser
self.cdp_url = cdp_url
self.use_persistent_context = use_persistent_context self.use_persistent_context = use_persistent_context
self.user_data_dir = user_data_dir self.user_data_dir = user_data_dir
self.chrome_channel = chrome_channel or self.browser_type or "chromium" self.chrome_channel = chrome_channel or self.browser_type or "chromium"
@@ -136,17 +145,15 @@ class BrowserConfig:
self.verbose = verbose self.verbose = verbose
self.debugging_port = debugging_port self.debugging_port = debugging_port
user_agenr_generator = UserAgentGenerator() fa_user_agenr_generator = ValidUAGenerator()
if self.user_agent_mode != "random" and self.user_agent_generator_config: if self.user_agent_mode == "random":
self.user_agent = user_agenr_generator.generate( self.user_agent = fa_user_agenr_generator.generate(
**(self.user_agent_generator_config or {}) **(self.user_agent_generator_config or {})
) )
elif self.user_agent_mode == "random":
self.user_agent = user_agenr_generator.generate()
else: else:
pass pass
self.browser_hint = user_agenr_generator.generate_client_hints(self.user_agent) self.browser_hint = UAGen.generate_client_hints(self.user_agent)
self.headers.setdefault("sec-ch-ua", self.browser_hint) self.headers.setdefault("sec-ch-ua", self.browser_hint)
# If persistent context is requested, ensure managed browser is enabled # If persistent context is requested, ensure managed browser is enabled
@@ -159,6 +166,7 @@ class BrowserConfig:
browser_type=kwargs.get("browser_type", "chromium"), browser_type=kwargs.get("browser_type", "chromium"),
headless=kwargs.get("headless", True), headless=kwargs.get("headless", True),
use_managed_browser=kwargs.get("use_managed_browser", False), use_managed_browser=kwargs.get("use_managed_browser", False),
cdp_url=kwargs.get("cdp_url"),
use_persistent_context=kwargs.get("use_persistent_context", False), use_persistent_context=kwargs.get("use_persistent_context", False),
user_data_dir=kwargs.get("user_data_dir"), user_data_dir=kwargs.get("user_data_dir"),
chrome_channel=kwargs.get("chrome_channel", "chromium"), chrome_channel=kwargs.get("chrome_channel", "chromium"),
@@ -191,6 +199,7 @@ class BrowserConfig:
"browser_type": self.browser_type, "browser_type": self.browser_type,
"headless": self.headless, "headless": self.headless,
"use_managed_browser": self.use_managed_browser, "use_managed_browser": self.use_managed_browser,
"cdp_url": self.cdp_url,
"use_persistent_context": self.use_persistent_context, "use_persistent_context": self.use_persistent_context,
"user_data_dir": self.user_data_dir, "user_data_dir": self.user_data_dir,
"chrome_channel": self.chrome_channel, "chrome_channel": self.chrome_channel,
@@ -373,6 +382,11 @@ class CrawlerRunConfig:
stream (bool): If True, stream the page content as it is being loaded. stream (bool): If True, stream the page content as it is being loaded.
url: str = None # This is not a compulsory parameter url: str = None # This is not a compulsory parameter
check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False
user_agent (str): Custom User-Agent string to use. Default: None
user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided
user_agent as-is. Default: None.
user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
Default: None.
""" """
def __init__( def __init__(
@@ -382,7 +396,7 @@ class CrawlerRunConfig:
extraction_strategy: ExtractionStrategy = None, extraction_strategy: ExtractionStrategy = None,
chunking_strategy: ChunkingStrategy = RegexChunking(), chunking_strategy: ChunkingStrategy = RegexChunking(),
markdown_generator: MarkdownGenerationStrategy = None, markdown_generator: MarkdownGenerationStrategy = None,
content_filter=None, content_filter : RelevantContentFilter = None,
only_text: bool = False, only_text: bool = False,
css_selector: str = None, css_selector: str = None,
excluded_tags: list = None, excluded_tags: list = None,
@@ -396,7 +410,7 @@ class CrawlerRunConfig:
# SSL Parameters # SSL Parameters
fetch_ssl_certificate: bool = False, fetch_ssl_certificate: bool = False,
# Caching Parameters # Caching Parameters
cache_mode=None, cache_mode: CacheMode =None,
session_id: str = None, session_id: str = None,
bypass_cache: bool = False, bypass_cache: bool = False,
disable_cache: bool = False, disable_cache: bool = False,
@@ -444,6 +458,9 @@ class CrawlerRunConfig:
stream: bool = False, stream: bool = False,
url: str = None, url: str = None,
check_robots_txt: bool = False, check_robots_txt: bool = False,
user_agent: str = None,
user_agent_mode: str = None,
user_agent_generator_config: dict = {},
): ):
self.url = url self.url = url
@@ -526,6 +543,11 @@ class CrawlerRunConfig:
# Robots.txt Handling Parameters # Robots.txt Handling Parameters
self.check_robots_txt = check_robots_txt self.check_robots_txt = check_robots_txt
# User Agent Parameters
self.user_agent = user_agent
self.user_agent_mode = user_agent_mode
self.user_agent_generator_config = user_agent_generator_config
# Validate type of extraction strategy and chunking strategy if they are provided # Validate type of extraction strategy and chunking strategy if they are provided
if self.extraction_strategy is not None and not isinstance( if self.extraction_strategy is not None and not isinstance(
self.extraction_strategy, ExtractionStrategy self.extraction_strategy, ExtractionStrategy
@@ -623,6 +645,9 @@ class CrawlerRunConfig:
stream=kwargs.get("stream", False), stream=kwargs.get("stream", False),
url=kwargs.get("url"), url=kwargs.get("url"),
check_robots_txt=kwargs.get("check_robots_txt", False), check_robots_txt=kwargs.get("check_robots_txt", False),
user_agent=kwargs.get("user_agent"),
user_agent_mode=kwargs.get("user_agent_mode"),
user_agent_generator_config=kwargs.get("user_agent_generator_config", {}),
) )
# Create a funciton returns dict of the object # Create a funciton returns dict of the object
@@ -686,6 +711,9 @@ class CrawlerRunConfig:
"stream": self.stream, "stream": self.stream,
"url": self.url, "url": self.url,
"check_robots_txt": self.check_robots_txt, "check_robots_txt": self.check_robots_txt,
"user_agent": self.user_agent,
"user_agent_mode": self.user_agent_mode,
"user_agent_generator_config": self.user_agent_generator_config,
} }
def clone(self, **kwargs): def clone(self, **kwargs):

View File

@@ -23,6 +23,7 @@ from .async_logger import AsyncLogger
from playwright_stealth import StealthConfig from playwright_stealth import StealthConfig
from .ssl_certificate import SSLCertificate from .ssl_certificate import SSLCertificate
from .utils import get_home_folder, get_chromium_path from .utils import get_home_folder, get_chromium_path
from .user_agent_generator import ValidUAGenerator, OnlineUAGenerator
stealth_config = StealthConfig( stealth_config = StealthConfig(
webdriver=True, webdriver=True,
@@ -102,6 +103,7 @@ class ManagedBrowser:
logger=None, logger=None,
host: str = "localhost", host: str = "localhost",
debugging_port: int = 9222, debugging_port: int = 9222,
cdp_url: Optional[str] = None,
): ):
""" """
Initialize the ManagedBrowser instance. Initialize the ManagedBrowser instance.
@@ -116,6 +118,7 @@ class ManagedBrowser:
logger (logging.Logger): Logger instance for logging messages. Default: None. logger (logging.Logger): Logger instance for logging messages. Default: None.
host (str): Host for debugging the browser. Default: "localhost". host (str): Host for debugging the browser. Default: "localhost".
debugging_port (int): Port for debugging the browser. Default: 9222. debugging_port (int): Port for debugging the browser. Default: 9222.
cdp_url (str or None): CDP URL to connect to the browser. Default: None.
""" """
self.browser_type = browser_type self.browser_type = browser_type
self.user_data_dir = user_data_dir self.user_data_dir = user_data_dir
@@ -126,12 +129,20 @@ class ManagedBrowser:
self.host = host self.host = host
self.logger = logger self.logger = logger
self.shutting_down = False self.shutting_down = False
self.cdp_url = cdp_url
async def start(self) -> str: async def start(self) -> str:
""" """
Starts the browser process and returns the CDP endpoint URL. Starts the browser process or returns CDP endpoint URL.
If user_data_dir is not provided, creates a temporary directory. If cdp_url is provided, returns it directly.
If user_data_dir is not provided for local browser, creates a temporary directory.
Returns:
str: CDP endpoint URL
""" """
# If CDP URL provided, just return it
if self.cdp_url:
return self.cdp_url
# Create temp dir if needed # Create temp dir if needed
if not self.user_data_dir: if not self.user_data_dir:
@@ -1260,10 +1271,12 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
self._downloaded_files = [] self._downloaded_files = []
# Handle user agent with magic mode # Handle user agent with magic mode
user_agent = self.browser_config.user_agent user_agent_to_override = config.user_agent
if config.magic and self.browser_config.user_agent_mode != "random": if user_agent_to_override:
self.browser_config.user_agent = UserAgentGenerator().generate( self.browser_config.user_agent = user_agent_to_override
**(self.browser_config.user_agent_generator_config or {}) elif config.magic or config.user_agent_mode == "random":
self.browser_config.user_agent = ValidUAGenerator().generate(
**(config.user_agent_generator_config or {})
) )
# Get page for session # Get page for session

View File

@@ -2,8 +2,146 @@ import random
from typing import Optional, Literal, List, Dict, Tuple from typing import Optional, Literal, List, Dict, Tuple
import re import re
from abc import ABC, abstractmethod
import random
from fake_useragent import UserAgent
import requests
from lxml import html
import json
from typing import Optional, List, Union, Dict
class UserAgentGenerator: class UAGen(ABC):
@abstractmethod
def generate(self,
browsers: Optional[List[str]] = None,
os: Optional[Union[str, List[str]]] = None,
min_version: float = 0.0,
platforms: Optional[Union[str, List[str]]] = None,
pct_threshold: Optional[float] = None,
fallback: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36") -> Union[str, Dict]:
pass
@staticmethod
def generate_client_hints( user_agent: str) -> str:
"""Generate Sec-CH-UA header value based on user agent string"""
def _parse_user_agent(user_agent: str) -> Dict[str, str]:
"""Parse a user agent string to extract browser and version information"""
browsers = {
"chrome": r"Chrome/(\d+)",
"edge": r"Edg/(\d+)",
"safari": r"Version/(\d+)",
"firefox": r"Firefox/(\d+)",
}
result = {}
for browser, pattern in browsers.items():
match = re.search(pattern, user_agent)
if match:
result[browser] = match.group(1)
return result
browsers = _parse_user_agent(user_agent)
# Client hints components
hints = []
# Handle different browser combinations
if "chrome" in browsers:
hints.append(f'"Chromium";v="{browsers["chrome"]}"')
hints.append('"Not_A Brand";v="8"')
if "edge" in browsers:
hints.append(f'"Microsoft Edge";v="{browsers["edge"]}"')
else:
hints.append(f'"Google Chrome";v="{browsers["chrome"]}"')
elif "firefox" in browsers:
# Firefox doesn't typically send Sec-CH-UA
return '""'
elif "safari" in browsers:
# Safari's format for client hints
hints.append(f'"Safari";v="{browsers["safari"]}"')
hints.append('"Not_A Brand";v="8"')
return ", ".join(hints)
class ValidUAGenerator(UAGen):
def __init__(self):
self.ua = UserAgent()
def generate(self,
browsers: Optional[List[str]] = None,
os: Optional[Union[str, List[str]]] = None,
min_version: float = 0.0,
platforms: Optional[Union[str, List[str]]] = None,
pct_threshold: Optional[float] = None,
fallback: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36") -> str:
self.ua = UserAgent(
browsers=browsers or ['Chrome', 'Firefox', 'Edge'],
os=os or ['Windows', 'Mac OS X'],
min_version=min_version,
platforms=platforms or ['desktop'],
fallback=fallback
)
return self.ua.random
class OnlineUAGenerator(UAGen):
def __init__(self):
self.agents = []
self._fetch_agents()
def _fetch_agents(self):
try:
response = requests.get(
'https://www.useragents.me/',
timeout=5,
headers={'Accept': 'text/html,application/xhtml+xml'}
)
response.raise_for_status()
tree = html.fromstring(response.content)
json_text = tree.cssselect('#most-common-desktop-useragents-json-csv > div:nth-child(1) > textarea')[0].text
self.agents = json.loads(json_text)
except Exception as e:
print(f"Error fetching agents: {e}")
def generate(self,
browsers: Optional[List[str]] = None,
os: Optional[Union[str, List[str]]] = None,
min_version: float = 0.0,
platforms: Optional[Union[str, List[str]]] = None,
pct_threshold: Optional[float] = None,
fallback: str = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 Chrome/116.0.0.0 Safari/537.36") -> Dict:
if not self.agents:
self._fetch_agents()
filtered_agents = self.agents
if pct_threshold:
filtered_agents = [a for a in filtered_agents if a['pct'] >= pct_threshold]
if browsers:
filtered_agents = [a for a in filtered_agents
if any(b.lower() in a['ua'].lower() for b in browsers)]
if os:
os_list = [os] if isinstance(os, str) else os
filtered_agents = [a for a in filtered_agents
if any(o.lower() in a['ua'].lower() for o in os_list)]
if platforms:
platform_list = [platforms] if isinstance(platforms, str) else platforms
filtered_agents = [a for a in filtered_agents
if any(p.lower() in a['ua'].lower() for p in platform_list)]
return filtered_agents[0] if filtered_agents else {'ua': fallback, 'pct': 0}
class UserAgentGenerator():
""" """
Generate random user agents with specified constraints. Generate random user agents with specified constraints.
@@ -187,9 +325,15 @@ class UserAgentGenerator:
browser_stack = self.get_browser_stack(num_browsers) browser_stack = self.get_browser_stack(num_browsers)
# Add appropriate legacy token based on browser stack # Add appropriate legacy token based on browser stack
if "Firefox" in str(browser_stack): if "Firefox" in str(browser_stack) or browser_type == "firefox":
components.append(random.choice(self.rendering_engines["gecko"])) components.append(random.choice(self.rendering_engines["gecko"]))
elif "Chrome" in str(browser_stack) or "Safari" in str(browser_stack): elif "Chrome" in str(browser_stack) or "Safari" in str(browser_stack) or browser_type == "chrome":
components.append(self.rendering_engines["chrome_webkit"])
components.append("(KHTML, like Gecko)")
elif "Edge" in str(browser_stack) or browser_type == "edge":
components.append(self.rendering_engines["safari_webkit"])
components.append("(KHTML, like Gecko)")
elif "Safari" in str(browser_stack) or browser_type == "safari":
components.append(self.rendering_engines["chrome_webkit"]) components.append(self.rendering_engines["chrome_webkit"])
components.append("(KHTML, like Gecko)") components.append("(KHTML, like Gecko)")
@@ -273,27 +417,13 @@ class UserAgentGenerator:
# Example usage: # Example usage:
if __name__ == "__main__": if __name__ == "__main__":
generator = UserAgentGenerator()
print(generator.generate())
print("\nSingle browser (Chrome):") # Usage example:
print(generator.generate(num_browsers=1, browser_type="chrome")) generator = ValidUAGenerator()
ua = generator.generate()
print(ua)
print("\nTwo browsers (Gecko/Firefox):") generator = OnlineUAGenerator()
print(generator.generate(num_browsers=2)) ua = generator.generate()
print(ua)
print("\nThree browsers (Chrome/Safari/Edge):")
print(generator.generate(num_browsers=3))
print("\nFirefox on Linux:")
print(
generator.generate(
device_type="desktop",
os_type="linux",
browser_type="firefox",
num_browsers=2,
)
)
print("\nChrome/Safari/Edge on Windows:")
print(generator.generate(device_type="desktop", os_type="windows", num_browsers=3))

View File

@@ -299,7 +299,6 @@ async def demo_proxy_rotation():
print(f"Error loading proxy: {e}") print(f"Error loading proxy: {e}")
return None return None
# Create 10 test requests to httpbin # Create 10 test requests to httpbin
urls = ["https://httpbin.org/ip"] * 2 urls = ["https://httpbin.org/ip"] * 2
@@ -314,7 +313,7 @@ async def demo_proxy_rotation():
continue continue
# Create new config with proxy # Create new config with proxy
current_config = run_config.clone(proxy_config=proxy) current_config = run_config.clone(proxy_config=proxy, user_agent="")
result = await crawler.arun(url=url, config=current_config) result = await crawler.arun(url=url, config=current_config)
if result.success: if result.success:

View File

@@ -5,16 +5,20 @@
## 1. Introduction ## 1. Introduction
When crawling many URLs: When crawling many URLs:
- **Basic**: Use `arun()` in a loop (simple but less efficient) - **Basic**: Use `arun()` in a loop (simple but less efficient)
- **Better**: Use `arun_many()`, which efficiently handles multiple URLs with proper concurrency control - **Better**: Use `arun_many()`, which efficiently handles multiple URLs with proper concurrency control
- **Best**: Customize dispatcher behavior for your specific needs (memory management, rate limits, etc.) - **Best**: Customize dispatcher behavior for your specific needs (memory management, rate limits, etc.)
**Why Dispatchers?** **Why Dispatchers?**
- **Adaptive**: Memory-based dispatchers can pause or slow down based on system resources - **Adaptive**: Memory-based dispatchers can pause or slow down based on system resources
- **Rate-limiting**: Built-in rate limiting with exponential backoff for 429/503 responses - **Rate-limiting**: Built-in rate limiting with exponential backoff for 429/503 responses
- **Real-time Monitoring**: Live dashboard of ongoing tasks, memory usage, and performance - **Real-time Monitoring**: Live dashboard of ongoing tasks, memory usage, and performance
- **Flexibility**: Choose between memory-adaptive or semaphore-based concurrency - **Flexibility**: Choose between memory-adaptive or semaphore-based concurrency
---
## 2. Core Components ## 2. Core Components
### 2.1 Rate Limiter ### 2.1 Rate Limiter
@@ -22,34 +26,116 @@ When crawling many URLs:
```python ```python
class RateLimiter: class RateLimiter:
def __init__( def __init__(
base_delay: Tuple[float, float] = (1.0, 3.0), # Random delay range between requests # Random delay range between requests
max_delay: float = 60.0, # Maximum backoff delay base_delay: Tuple[float, float] = (1.0, 3.0),
max_retries: int = 3, # Retries before giving up
rate_limit_codes: List[int] = [429, 503] # Status codes triggering backoff # Maximum backoff delay
max_delay: float = 60.0,
# Retries before giving up
max_retries: int = 3,
# Status codes triggering backoff
rate_limit_codes: List[int] = [429, 503]
) )
``` ```
The RateLimiter provides: Heres the revised and simplified explanation of the **RateLimiter**, focusing on constructor parameters and adhering to your markdown style and mkDocs guidelines.
- Random delays between requests
- Exponential backoff on rate limit responses #### RateLimiter Constructor Parameters
- Domain-specific rate limiting
- Automatic retry handling The **RateLimiter** is a utility that helps manage the pace of requests to avoid overloading servers or getting blocked due to rate limits. It operates internally to delay requests and handle retries but can be configured using its constructor parameters.
**Parameters of the `RateLimiter` constructor:**
1.**`base_delay`** (`Tuple[float, float]`, default: `(1.0, 3.0)`)
The range for a random delay (in seconds) between consecutive requests to the same domain.
- A random delay is chosen between `base_delay[0]` and `base_delay[1]` for each request.
- This prevents sending requests at a predictable frequency, reducing the chances of triggering rate limits.
**Example:**
If `base_delay = (2.0, 5.0)`, delays could be randomly chosen as `2.3s`, `4.1s`, etc.
---
2.**`max_delay`** (`float`, default: `60.0`)
The maximum allowable delay when rate-limiting errors occur.
- When servers return rate-limit responses (e.g., 429 or 503), the delay increases exponentially with jitter.
- The `max_delay` ensures the delay doesnt grow unreasonably high, capping it at this value.
**Example:**
For a `max_delay = 30.0`, even if backoff calculations suggest a delay of `45s`, it will cap at `30s`.
---
3.**`max_retries`** (`int`, default: `3`)
The maximum number of retries for a request if rate-limiting errors occur.
- After encountering a rate-limit response, the `RateLimiter` retries the request up to this number of times.
- If all retries fail, the request is marked as failed, and the process continues.
**Example:**
If `max_retries = 3`, the system retries a failed request three times before giving up.
---
4.**`rate_limit_codes`** (`List[int]`, default: `[429, 503]`)
A list of HTTP status codes that trigger the rate-limiting logic.
- These status codes indicate the server is overwhelmed or actively limiting requests.
- You can customize this list to include other codes based on specific server behavior.
**Example:**
If `rate_limit_codes = [429, 503, 504]`, the crawler will back off on these three error codes.
---
**How to Use the `RateLimiter`:**
Heres an example of initializing and using a `RateLimiter` in your project:
```python
from crawl4ai import RateLimiter
# Create a RateLimiter with custom settings
rate_limiter = RateLimiter(
base_delay=(2.0, 4.0), # Random delay between 2-4 seconds
max_delay=30.0, # Cap delay at 30 seconds
max_retries=5, # Retry up to 5 times on rate-limiting errors
rate_limit_codes=[429, 503] # Handle these HTTP status codes
)
# RateLimiter will handle delays and retries internally
# No additional setup is required for its operation
```
The `RateLimiter` integrates seamlessly with dispatchers like `MemoryAdaptiveDispatcher` and `SemaphoreDispatcher`, ensuring requests are paced correctly without user intervention. Its internal mechanisms manage delays and retries to avoid overwhelming servers while maximizing efficiency.
### 2.2 Crawler Monitor ### 2.2 Crawler Monitor
The CrawlerMonitor provides real-time visibility into crawling operations: The CrawlerMonitor provides real-time visibility into crawling operations:
```python ```python
from crawl4ai import CrawlerMonitor, DisplayMode
monitor = CrawlerMonitor( monitor = CrawlerMonitor(
max_visible_rows=15, # Maximum rows in live display # Maximum rows in live display
display_mode=DisplayMode.DETAILED # DETAILED or AGGREGATED view max_visible_rows=15,
# DETAILED or AGGREGATED view
display_mode=DisplayMode.DETAILED
) )
``` ```
**Display Modes**: **Display Modes**:
1. **DETAILED**: Shows individual task status, memory usage, and timing 1. **DETAILED**: Shows individual task status, memory usage, and timing
2. **AGGREGATED**: Displays summary statistics and overall progress 2. **AGGREGATED**: Displays summary statistics and overall progress
---
## 3. Available Dispatchers ## 3. Available Dispatchers
### 3.1 MemoryAdaptiveDispatcher (Default) ### 3.1 MemoryAdaptiveDispatcher (Default)
@@ -57,6 +143,8 @@ monitor = CrawlerMonitor(
Automatically manages concurrency based on system memory usage: Automatically manages concurrency based on system memory usage:
```python ```python
from crawl4ai.async_dispatcher import MemoryAdaptiveDispatcher
dispatcher = MemoryAdaptiveDispatcher( dispatcher = MemoryAdaptiveDispatcher(
memory_threshold_percent=90.0, # Pause if memory exceeds this memory_threshold_percent=90.0, # Pause if memory exceeds this
check_interval=1.0, # How often to check memory check_interval=1.0, # How often to check memory
@@ -73,13 +161,37 @@ dispatcher = MemoryAdaptiveDispatcher(
) )
``` ```
**Constructor Parameters:**
1.**`memory_threshold_percent`** (`float`, default: `90.0`)
Specifies the memory usage threshold (as a percentage). If system memory usage exceeds this value, the dispatcher pauses crawling to prevent system overload.
2.**`check_interval`** (`float`, default: `1.0`)
The interval (in seconds) at which the dispatcher checks system memory usage.
3.**`max_session_permit`** (`int`, default: `10`)
The maximum number of concurrent crawling tasks allowed. This ensures resource limits are respected while maintaining concurrency.
4.**`memory_wait_timeout`** (`float`, default: `300.0`)
Optional timeout (in seconds). If memory usage exceeds `memory_threshold_percent` for longer than this duration, a `MemoryError` is raised.
5.**`rate_limiter`** (`RateLimiter`, default: `None`)
Optional rate-limiting logic to avoid server-side blocking (e.g., for handling 429 or 503 errors). See **RateLimiter** for details.
6.**`monitor`** (`CrawlerMonitor`, default: `None`)
Optional monitoring for real-time task tracking and performance insights. See **CrawlerMonitor** for details.
---
### 3.2 SemaphoreDispatcher ### 3.2 SemaphoreDispatcher
Provides simple concurrency control with a fixed limit: Provides simple concurrency control with a fixed limit:
```python ```python
from crawl4ai.async_dispatcher import SemaphoreDispatcher
dispatcher = SemaphoreDispatcher( dispatcher = SemaphoreDispatcher(
max_session_permit=5, # Fixed concurrent tasks max_session_permit=20, # Maximum concurrent tasks
rate_limiter=RateLimiter( # Optional rate limiting rate_limiter=RateLimiter( # Optional rate limiting
base_delay=(0.5, 1.0), base_delay=(0.5, 1.0),
max_delay=10.0 max_delay=10.0
@@ -91,6 +203,19 @@ dispatcher = SemaphoreDispatcher(
) )
``` ```
**Constructor Parameters:**
1.**`max_session_permit`** (`int`, default: `20`)
The maximum number of concurrent crawling tasks allowed, irrespective of semaphore slots.
2.**`rate_limiter`** (`RateLimiter`, default: `None`)
Optional rate-limiting logic to avoid overwhelming servers. See **RateLimiter** for details.
3.**`monitor`** (`CrawlerMonitor`, default: `None`)
Optional monitoring for tracking task progress and resource usage. See **CrawlerMonitor** for details.
---
## 4. Usage Examples ## 4. Usage Examples
### 4.1 Batch Processing (Default) ### 4.1 Batch Processing (Default)
@@ -128,6 +253,14 @@ async def crawl_batch():
print(f"Failed to crawl {result.url}: {result.error_message}") print(f"Failed to crawl {result.url}: {result.error_message}")
``` ```
**Review:**
- **Purpose:** Executes a batch crawl with all URLs processed together after crawling is complete.
- **Dispatcher:** Uses `MemoryAdaptiveDispatcher` to manage concurrency and system memory.
- **Stream:** Disabled (`stream=False`), so all results are collected at once for post-processing.
- **Best Use Case:** When you need to analyze results in bulk rather than individually during the crawl.
---
### 4.2 Streaming Mode
```python
        print(f"Failed to crawl {result.url}: {result.error_message}")
```
**Review:**
- **Purpose:** Enables streaming to process results as soon as they're available.
- **Dispatcher:** Uses `MemoryAdaptiveDispatcher` for concurrency and memory management.
- **Stream:** Enabled (`stream=True`), allowing real-time processing during crawling.
- **Best Use Case:** When you need to act on results immediately, such as for real-time analytics or progressive data storage.
---
### 4.3 Semaphore-based Crawling
```python
    return results
```
**Review:**
- **Purpose:** Uses `SemaphoreDispatcher` to limit concurrency with a fixed number of slots.
- **Dispatcher:** Configured with a semaphore to control parallel crawling tasks.
- **Rate Limiter:** Prevents servers from being overwhelmed by pacing requests.
- **Best Use Case:** When you want precise control over the number of concurrent requests, independent of system memory.
---
### 4.4 Robots.txt Consideration
```python
if __name__ == "__main__":
    asyncio.run(main())
```
**Review:**
- **Purpose:** Ensures compliance with `robots.txt` rules for ethical and legal web crawling.
- **Configuration:** Set `check_robots_txt=True` to validate each URL against `robots.txt` before crawling.
- **Dispatcher:** Handles requests with concurrency limits (`semaphore_count=3`).
- **Best Use Case:** When crawling websites that strictly enforce robots.txt policies or for responsible crawling practices.
---
## 5. Dispatch Results
## 6. Summary
1. **Two Dispatcher Types**:
   - MemoryAdaptiveDispatcher (default): Dynamic concurrency based on memory
   - SemaphoreDispatcher: Fixed concurrency limit
2. **Optional Components**:
   - RateLimiter: Smart request pacing and backoff
   - CrawlerMonitor: Real-time progress visualization
3. **Key Benefits**:
   - Automatic memory management
   - Built-in rate limiting
   - Live progress monitoring
   - Flexible concurrency control
Choose the dispatcher that best fits your needs:
- **MemoryAdaptiveDispatcher**: For large crawls or limited resources
- **SemaphoreDispatcher**: For simple, fixed-concurrency scenarios

View File

@@ -95,6 +95,10 @@ strong {
} }
div.highlight {
margin-bottom: 2em;
}
.terminal-card > header {
    color: var(--font-color);
    text-align: center;
@@ -231,6 +235,16 @@ pre {
    font-size: 2em;
}
.terminal h2 {
font-size: 1.5em;
margin-bottom: 0.8em;
}
.terminal h3 {
font-size: 1.3em;
margin-bottom: 0.8em;
}
.terminal h1, .terminal h2, .terminal h3, .terminal h4, .terminal h5, .terminal h6 {
    text-shadow: 0 0 0px var(--font-color), 0 0 0px var(--font-color), 0 0 0px var(--font-color);
}

View File

@@ -1,4 +1,4 @@
site_name: Crawl4AI Documentation (v0.4.3b2)
site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper site_description: 🚀🤖 Crawl4AI, Open-source LLM-Friendly Web Crawler & Scraper
site_url: https://docs.crawl4ai.com site_url: https://docs.crawl4ai.com
repo_url: https://github.com/unclecode/crawl4ai repo_url: https://github.com/unclecode/crawl4ai
@@ -52,6 +52,11 @@ nav:
theme:
  name: 'terminal'
  palette: 'dark'
icon:
repo: fontawesome/brands/github
plugins:
- search
markdown_extensions:
  - pymdownx.highlight:
@@ -64,6 +69,9 @@ markdown_extensions:
  - attr_list
  - tables
extra:
version: !ENV [CRAWL4AI_VERSION, 'development']
extra_css:
  - assets/styles.css
  - assets/highlight.css
@@ -72,3 +80,4 @@ extra_css:
extra_javascript:
  - assets/highlight.min.js
  - assets/highlight_init.js
- https://buttons.github.io/buttons.js

View File

@@ -37,6 +37,7 @@ dependencies = [
    "rich>=13.9.4",
    "cssselect>=1.2.0",
    "httpx==0.27.2",
"fake-useragent>=2.0.3"
]
classifiers = [
    "Development Status :: 4 - Beta",