feat(crawler): add HTTP crawler strategy for lightweight web scraping

Implements a new AsyncHTTPCrawlerStrategy class that provides a fast, memory-efficient alternative to browser-based crawling. Features include:
- Support for HTTP/HTTPS requests with configurable methods, headers, and timeouts
- File and raw content handling capabilities
- Streaming response processing for large files
- Customizable request/response hooks
- Comprehensive error handling

Also refactors browser management code into separate module for better organization.
This commit is contained in:
UncleCode
2025-02-15 19:26:30 +08:00
parent 063df572b0
commit 8bb799068e
7 changed files with 1353 additions and 851 deletions

View File

@@ -2,7 +2,7 @@
import warnings
from .async_webcrawler import AsyncWebCrawler, CacheMode
from .async_configs import BrowserConfig, CrawlerRunConfig
from .async_configs import BrowserConfig, CrawlerRunConfig, HTTPCrawlerConfig
from .content_scraping_strategy import (
ContentScrapingStrategy,
WebScrapingStrategy,
@@ -70,6 +70,7 @@ __all__ = [
"LXMLWebScrapingStrategy",
"BrowserConfig",
"CrawlerRunConfig",
"HTTPCrawlerConfig",
"ExtractionStrategy",
"LLMExtractionStrategy",
"CosineStrategy",

View File

@@ -1,5 +1,5 @@
import re
from attr import has
from email import header
from re import I
from .config import (
MIN_WORD_THRESHOLD,
IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD,
@@ -19,7 +19,6 @@ from typing import Union, List
from .cache_context import CacheMode
from .proxy_strategy import ProxyRotationStrategy
import inspect
from typing import Any, Dict, Optional
from enum import Enum
@@ -47,8 +46,8 @@ def to_serializable_dict(obj: Any) -> Dict:
if hasattr(obj, 'isoformat'):
return obj.isoformat()
# Handle lists, tuples, and sets
if isinstance(obj, (list, tuple, set)):
# Handle lists, tuples, and sets, and basically any iterable
if isinstance(obj, (list, tuple, set)) or hasattr(obj, '__iter__'):
return [to_serializable_dict(item) for item in obj]
# Handle frozensets, which are not iterable
@@ -67,7 +66,6 @@ def to_serializable_dict(obj: Any) -> Dict:
# Get constructor signature
sig = inspect.signature(obj.__class__.__init__)
params = sig.parameters
_type = obj.__class__.__name__
# Get current values
current_values = {}
@@ -81,24 +79,8 @@ def to_serializable_dict(obj: Any) -> Dict:
if not (is_empty_value(value) and is_empty_value(param.default)):
if value != param.default:
current_values[name] = to_serializable_dict(value)
elif hasattr(obj.__class__, '__slots__') and f"_{name}" in obj.__slots__:
slot = f"_{name}"
slot_value = getattr(obj, slot, None)
if not is_empty_value(slot_value):
current_values[name] = to_serializable_dict(slot_value)
# # Then handle slots if present
# if hasattr(obj.__class__, '__slots__'):
# for slot in obj.__class__.__slots__:
# # Remove leading underscore if present
# param_name = slot[1:] if slot.startswith('_') else slot
# # Get the slot value if it exists
# if hasattr(obj, slot):
# value = getattr(obj, slot)
# if not is_empty_value(value):
# current_values[param_name] = to_serializable_dict(value)
_type = obj.__class__.__name__
return {
"type": obj.__class__.__name__,
@@ -126,10 +108,7 @@ def from_serializable_dict(data: Any) -> Any:
# Import from crawl4ai for class instances
import crawl4ai
if not hasattr(crawl4ai, data["type"]):
return None
else:
cls = getattr(crawl4ai, data["type"])
cls = getattr(crawl4ai, data["type"])
# Handle Enum
if issubclass(cls, Enum):
@@ -390,16 +369,72 @@ class BrowserConfig():
def load( data: dict) -> "BrowserConfig":
# Deserialize the object from a dictionary
config = from_serializable_dict(data)
# check if the deserialized object is an instance of BrowserConfig
if isinstance(config, BrowserConfig):
return config
elif isinstance(config, dict):
return BrowserConfig.from_kwargs(config)
else:
raise ValueError("Invalid data type for BrowserConfig")
return BrowserConfig.from_kwargs(config)
class HTTPCrawlerConfig():
"""HTTP-specific crawler configuration"""
method: str = "GET"
headers: Optional[Dict[str, str]] = None
data: Optional[Dict[str, Any]] = None
json: Optional[Dict[str, Any]] = None
follow_redirects: bool = True
verify_ssl: bool = True
def __init__(self, method: str = "GET", headers: Optional[Dict[str, str]] = None, data: Optional[Dict[str, Any]] = None, json: Optional[Dict[str, Any]] = None, follow_redirects: bool = True, verify_ssl: bool = True):
self.method = method
self.headers = headers
self.data = data
self.json = json
self.follow_redirects = follow_redirects
self.verify_ssl = verify_ssl
@staticmethod
def from_kwargs(kwargs: dict) -> "HTTPCrawlerConfig":
return HTTPCrawlerConfig(
method=kwargs.get("method", "GET"),
headers=kwargs.get("headers"),
data=kwargs.get("data"),
json=kwargs.get("json"),
follow_redirects=kwargs.get("follow_redirects", True),
verify_ssl=kwargs.get("verify_ssl", True),
)
def to_dict(self):
return {
"method": self.method,
"headers": self.headers,
"data": self.data,
"json": self.json,
"follow_redirects": self.follow_redirects,
"verify_ssl": self.verify_ssl,
}
def clone(self, **kwargs):
"""Create a copy of this configuration with updated values.
Args:
**kwargs: Key-value pairs of configuration options to update
Returns:
HTTPCrawlerConfig: A new instance with the specified updates
"""
config_dict = self.to_dict()
config_dict.update(kwargs)
return HTTPCrawlerConfig.from_kwargs(config_dict)
def dump(self) -> dict:
return to_serializable_dict(self)
@staticmethod
def load(data: dict) -> "HTTPCrawlerConfig":
config = from_serializable_dict(data)
if isinstance(config, HTTPCrawlerConfig):
return config
return HTTPCrawlerConfig.from_kwargs(config)
class CrawlerRunConfig():
"""
Configuration class for controlling how the crawler runs each crawl operation.
@@ -450,7 +485,7 @@ class CrawlerRunConfig():
# Caching Parameters
cache_mode (CacheMode or None): Defines how caching is handled.
If None, defaults to CacheMode.ENABLED internally.
Default: None.
Default: CacheMode.BYPASS.
session_id (str or None): Optional session ID to persist the browser context and the created
page instance. If the ID already exists, the crawler does not
create a new page and uses the current page to preserve the state.
@@ -543,19 +578,27 @@ class CrawlerRunConfig():
log_console (bool): If True, log console messages from the page.
Default: False.
# Streaming Parameters
# HTTP Crwler Strategy Parameters
method (str): HTTP method to use for the request, when using AsyncHTTPCrwalerStrategy.
Default: "GET".
data (dict): Data to send in the request body, when using AsyncHTTPCrwalerStrategy.
Default: None.
json (dict): JSON data to send in the request body, when using AsyncHTTPCrwalerStrategy.
# Connection Parameters
stream (bool): If True, enables streaming of crawled URLs as they are processed when used with arun_many.
Default: False.
# Optional Parameters
stream (bool): If True, stream the page content as it is being loaded.
url: str = None # This is not a compulsory parameter
check_robots_txt (bool): Whether to check robots.txt rules before crawling. Default: False
user_agent (str): Custom User-Agent string to use. Default: None
user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided
user_agent as-is. Default: None.
Default: False.
user_agent (str): Custom User-Agent string to use.
Default: None.
user_agent_mode (str or None): Mode for generating the user agent (e.g., "random"). If None, use the provided user_agent as-is.
Default: None.
user_agent_generator_config (dict or None): Configuration for user agent generation if user_agent_mode is set.
Default: None.
url: str = None # This is not a compulsory parameter
"""
def __init__(
@@ -580,7 +623,7 @@ class CrawlerRunConfig():
# SSL Parameters
fetch_ssl_certificate: bool = False,
# Caching Parameters
cache_mode: CacheMode =None,
cache_mode: CacheMode = CacheMode.BYPASS,
session_id: str = None,
bypass_cache: bool = False,
disable_cache: bool = False,
@@ -625,7 +668,8 @@ class CrawlerRunConfig():
# Debugging and Logging Parameters
verbose: bool = True,
log_console: bool = False,
# Streaming Parameters
# Connection Parameters
method: str = "GET",
stream: bool = False,
url: str = None,
check_robots_txt: bool = False,
@@ -713,8 +757,9 @@ class CrawlerRunConfig():
self.verbose = verbose
self.log_console = log_console
# Streaming Parameters
# Connection Parameters
self.stream = stream
self.method = method
# Robots.txt Handling Parameters
self.check_robots_txt = check_robots_txt
@@ -769,7 +814,7 @@ class CrawlerRunConfig():
# SSL Parameters
fetch_ssl_certificate=kwargs.get("fetch_ssl_certificate", False),
# Caching Parameters
cache_mode=kwargs.get("cache_mode"),
cache_mode=kwargs.get("cache_mode", CacheMode.BYPASS),
session_id=kwargs.get("session_id"),
bypass_cache=kwargs.get("bypass_cache", False),
disable_cache=kwargs.get("disable_cache", False),
@@ -823,15 +868,17 @@ class CrawlerRunConfig():
# Debugging and Logging Parameters
verbose=kwargs.get("verbose", True),
log_console=kwargs.get("log_console", False),
# Streaming Parameters
# Connection Parameters
method=kwargs.get("method", "GET"),
stream=kwargs.get("stream", False),
url=kwargs.get("url"),
check_robots_txt=kwargs.get("check_robots_txt", False),
user_agent=kwargs.get("user_agent"),
user_agent_mode=kwargs.get("user_agent_mode"),
user_agent_generator_config=kwargs.get("user_agent_generator_config", {}),
# Deep Crawl Parameters
deep_crawl_strategy=kwargs.get("deep_crawl_strategy"),
url=kwargs.get("url"),
)
# Create a funciton returns dict of the object
@@ -843,13 +890,9 @@ class CrawlerRunConfig():
def load(data: dict) -> "CrawlerRunConfig":
# Deserialize the object from a dictionary
config = from_serializable_dict(data)
# If config type is alread instant of CrawleRunConfig, return it
if isinstance(config, CrawlerRunConfig):
return config
elif isinstance(config, dict):
return CrawlerRunConfig.from_kwargs(config)
else:
raise ValueError("Invalid data type")
return CrawlerRunConfig.from_kwargs(config)
def to_dict(self):
return {
@@ -910,13 +953,14 @@ class CrawlerRunConfig():
"exclude_internal_links": self.exclude_internal_links,
"verbose": self.verbose,
"log_console": self.log_console,
"method": self.method,
"stream": self.stream,
"url": self.url,
"check_robots_txt": self.check_robots_txt,
"user_agent": self.user_agent,
"user_agent_mode": self.user_agent_mode,
"user_agent_generator_config": self.user_agent_generator_config,
"deep_crawl_strategy": self.deep_crawl_strategy,
"url": self.url,
}
def clone(self, **kwargs):

File diff suppressed because it is too large Load Diff

796
crawl4ai/browser_manager.py Normal file
View File

@@ -0,0 +1,796 @@
import asyncio
import time
from typing import List, Optional
import os
import sys
import shutil
import tempfile
import subprocess
from playwright.async_api import BrowserContext
import hashlib
from .js_snippet import load_js_script
from .config import DOWNLOAD_PAGE_TIMEOUT
from .async_configs import BrowserConfig, CrawlerRunConfig
from playwright_stealth import StealthConfig
from .utils import get_chromium_path
stealth_config = StealthConfig(
webdriver=True,
chrome_app=True,
chrome_csi=True,
chrome_load_times=True,
chrome_runtime=True,
navigator_languages=True,
navigator_plugins=True,
navigator_permissions=True,
webgl_vendor=True,
outerdimensions=True,
navigator_hardware_concurrency=True,
media_codecs=True,
)
BROWSER_DISABLE_OPTIONS = [
"--disable-background-networking",
"--disable-background-timer-throttling",
"--disable-backgrounding-occluded-windows",
"--disable-breakpad",
"--disable-client-side-phishing-detection",
"--disable-component-extensions-with-background-pages",
"--disable-default-apps",
"--disable-extensions",
"--disable-features=TranslateUI",
"--disable-hang-monitor",
"--disable-ipc-flooding-protection",
"--disable-popup-blocking",
"--disable-prompt-on-repost",
"--disable-sync",
"--force-color-profile=srgb",
"--metrics-recording-only",
"--no-first-run",
"--password-store=basic",
"--use-mock-keychain",
]
class ManagedBrowser:
"""
Manages the browser process and context. This class allows to connect to the browser using CDP protocol.
Attributes:
browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
Default: "chromium".
user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
temporary directory may be used. Default: None.
headless (bool): Whether to run the browser in headless mode (no visible GUI).
Default: True.
browser_process (subprocess.Popen): The process object for the browser.
temp_dir (str): Temporary directory for user data if not provided.
debugging_port (int): Port for debugging the browser.
host (str): Host for debugging the browser.
Methods:
start(): Starts the browser process and returns the CDP endpoint URL.
_get_browser_path(): Returns the browser executable path based on OS and browser type.
_get_browser_args(): Returns browser-specific command line arguments.
_get_user_data_dir(): Returns the user data directory path.
_cleanup(): Terminates the browser process and removes the temporary directory.
"""
browser_type: str
user_data_dir: str
headless: bool
browser_process: subprocess.Popen
temp_dir: str
debugging_port: int
host: str
def __init__(
self,
browser_type: str = "chromium",
user_data_dir: Optional[str] = None,
headless: bool = False,
logger=None,
host: str = "localhost",
debugging_port: int = 9222,
cdp_url: Optional[str] = None,
):
"""
Initialize the ManagedBrowser instance.
Args:
browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
Default: "chromium".
user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
temporary directory may be used. Default: None.
headless (bool): Whether to run the browser in headless mode (no visible GUI).
Default: True.
logger (logging.Logger): Logger instance for logging messages. Default: None.
host (str): Host for debugging the browser. Default: "localhost".
debugging_port (int): Port for debugging the browser. Default: 9222.
cdp_url (str or None): CDP URL to connect to the browser. Default: None.
"""
self.browser_type = browser_type
self.user_data_dir = user_data_dir
self.headless = headless
self.browser_process = None
self.temp_dir = None
self.debugging_port = debugging_port
self.host = host
self.logger = logger
self.shutting_down = False
self.cdp_url = cdp_url
async def start(self) -> str:
"""
Starts the browser process or returns CDP endpoint URL.
If cdp_url is provided, returns it directly.
If user_data_dir is not provided for local browser, creates a temporary directory.
Returns:
str: CDP endpoint URL
"""
# If CDP URL provided, just return it
if self.cdp_url:
return self.cdp_url
# Create temp dir if needed
if not self.user_data_dir:
self.temp_dir = tempfile.mkdtemp(prefix="browser-profile-")
self.user_data_dir = self.temp_dir
# Get browser path and args based on OS and browser type
# browser_path = self._get_browser_path()
args = await self._get_browser_args()
# Start browser process
try:
self.browser_process = subprocess.Popen(
args, stdout=subprocess.PIPE, stderr=subprocess.PIPE
)
# Monitor browser process output for errors
asyncio.create_task(self._monitor_browser_process())
await asyncio.sleep(2) # Give browser time to start
return f"http://{self.host}:{self.debugging_port}"
except Exception as e:
await self.cleanup()
raise Exception(f"Failed to start browser: {e}")
async def _monitor_browser_process(self):
"""
Monitor the browser process for unexpected termination.
How it works:
1. Read stdout and stderr from the browser process.
2. If the process has terminated, log the error message and terminate the browser.
3. If the shutting_down flag is set, log the normal termination message.
4. If any other error occurs, log the error message.
Note: This method should be called in a separate task to avoid blocking the main event loop.
"""
if self.browser_process:
try:
stdout, stderr = await asyncio.gather(
asyncio.to_thread(self.browser_process.stdout.read),
asyncio.to_thread(self.browser_process.stderr.read),
)
# Check shutting_down flag BEFORE logging anything
if self.browser_process.poll() is not None:
if not self.shutting_down:
self.logger.error(
message="Browser process terminated unexpectedly | Code: {code} | STDOUT: {stdout} | STDERR: {stderr}",
tag="ERROR",
params={
"code": self.browser_process.returncode,
"stdout": stdout.decode(),
"stderr": stderr.decode(),
},
)
await self.cleanup()
else:
self.logger.info(
message="Browser process terminated normally | Code: {code}",
tag="INFO",
params={"code": self.browser_process.returncode},
)
except Exception as e:
if not self.shutting_down:
self.logger.error(
message="Error monitoring browser process: {error}",
tag="ERROR",
params={"error": str(e)},
)
def _get_browser_path_WIP(self) -> str:
"""Returns the browser executable path based on OS and browser type"""
if sys.platform == "darwin": # macOS
paths = {
"chromium": "/Applications/Google Chrome.app/Contents/MacOS/Google Chrome",
"firefox": "/Applications/Firefox.app/Contents/MacOS/firefox",
"webkit": "/Applications/Safari.app/Contents/MacOS/Safari",
}
elif sys.platform == "win32": # Windows
paths = {
"chromium": "C:\\Program Files\\Google\\Chrome\\Application\\chrome.exe",
"firefox": "C:\\Program Files\\Mozilla Firefox\\firefox.exe",
"webkit": None, # WebKit not supported on Windows
}
else: # Linux
paths = {
"chromium": "google-chrome",
"firefox": "firefox",
"webkit": None, # WebKit not supported on Linux
}
return paths.get(self.browser_type)
async def _get_browser_path(self) -> str:
browser_path = await get_chromium_path(self.browser_type)
return browser_path
async def _get_browser_args(self) -> List[str]:
"""Returns browser-specific command line arguments"""
base_args = [await self._get_browser_path()]
if self.browser_type == "chromium":
args = [
f"--remote-debugging-port={self.debugging_port}",
f"--user-data-dir={self.user_data_dir}",
]
if self.headless:
args.append("--headless=new")
elif self.browser_type == "firefox":
args = [
"--remote-debugging-port",
str(self.debugging_port),
"--profile",
self.user_data_dir,
]
if self.headless:
args.append("--headless")
else:
raise NotImplementedError(f"Browser type {self.browser_type} not supported")
return base_args + args
async def cleanup(self):
"""Cleanup browser process and temporary directory"""
# Set shutting_down flag BEFORE any termination actions
self.shutting_down = True
if self.browser_process:
try:
self.browser_process.terminate()
# Wait for process to end gracefully
for _ in range(10): # 10 attempts, 100ms each
if self.browser_process.poll() is not None:
break
await asyncio.sleep(0.1)
# Force kill if still running
if self.browser_process.poll() is None:
self.browser_process.kill()
await asyncio.sleep(0.1) # Brief wait for kill to take effect
except Exception as e:
self.logger.error(
message="Error terminating browser: {error}",
tag="ERROR",
params={"error": str(e)},
)
if self.temp_dir and os.path.exists(self.temp_dir):
try:
shutil.rmtree(self.temp_dir)
except Exception as e:
self.logger.error(
message="Error removing temporary directory: {error}",
tag="ERROR",
params={"error": str(e)},
)
class BrowserManager:
"""
Manages the browser instance and context.
Attributes:
config (BrowserConfig): Configuration object containing all browser settings
logger: Logger instance for recording events and errors
browser (Browser): The browser instance
default_context (BrowserContext): The default browser context
managed_browser (ManagedBrowser): The managed browser instance
playwright (Playwright): The Playwright instance
sessions (dict): Dictionary to store session information
session_ttl (int): Session timeout in seconds
"""
def __init__(self, browser_config: BrowserConfig, logger=None):
"""
Initialize the BrowserManager with a browser configuration.
Args:
browser_config (BrowserConfig): Configuration object containing all browser settings
logger: Logger instance for recording events and errors
"""
self.config: BrowserConfig = browser_config
self.logger = logger
# Browser state
self.browser = None
self.default_context = None
self.managed_browser = None
self.playwright = None
# Session management
self.sessions = {}
self.session_ttl = 1800 # 30 minutes
# Keep track of contexts by a "config signature," so each unique config reuses a single context
self.contexts_by_config = {}
self._contexts_lock = asyncio.Lock()
# Initialize ManagedBrowser if needed
if self.config.use_managed_browser:
self.managed_browser = ManagedBrowser(
browser_type=self.config.browser_type,
user_data_dir=self.config.user_data_dir,
headless=self.config.headless,
logger=self.logger,
debugging_port=self.config.debugging_port,
)
async def start(self):
"""
Start the browser instance and set up the default context.
How it works:
1. Check if Playwright is already initialized.
2. If not, initialize Playwright.
3. If managed browser is used, start it and connect to the CDP endpoint.
4. If managed browser is not used, launch the browser and set up the default context.
Note: This method should be called in a separate task to avoid blocking the main event loop.
"""
if self.playwright is None:
from playwright.async_api import async_playwright
self.playwright = await async_playwright().start()
if self.config.use_managed_browser:
cdp_url = await self.managed_browser.start()
self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
contexts = self.browser.contexts
if contexts:
self.default_context = contexts[0]
else:
self.default_context = await self.create_browser_context()
# self.default_context = await self.browser.new_context(
# viewport={
# "width": self.config.viewport_width,
# "height": self.config.viewport_height,
# },
# storage_state=self.config.storage_state,
# user_agent=self.config.headers.get(
# "User-Agent", self.config.user_agent
# ),
# accept_downloads=self.config.accept_downloads,
# ignore_https_errors=self.config.ignore_https_errors,
# java_script_enabled=self.config.java_script_enabled,
# )
await self.setup_context(self.default_context)
else:
browser_args = self._build_browser_args()
# Launch appropriate browser type
if self.config.browser_type == "firefox":
self.browser = await self.playwright.firefox.launch(**browser_args)
elif self.config.browser_type == "webkit":
self.browser = await self.playwright.webkit.launch(**browser_args)
else:
self.browser = await self.playwright.chromium.launch(**browser_args)
self.default_context = self.browser
def _build_browser_args(self) -> dict:
"""Build browser launch arguments from config."""
args = [
"--disable-gpu",
"--disable-gpu-compositing",
"--disable-software-rasterizer",
"--no-sandbox",
"--disable-dev-shm-usage",
"--no-first-run",
"--no-default-browser-check",
"--disable-infobars",
"--window-position=0,0",
"--ignore-certificate-errors",
"--ignore-certificate-errors-spki-list",
"--disable-blink-features=AutomationControlled",
"--window-position=400,0",
"--disable-renderer-backgrounding",
"--disable-ipc-flooding-protection",
"--force-color-profile=srgb",
"--mute-audio",
"--disable-background-timer-throttling",
# "--single-process",
f"--window-size={self.config.viewport_width},{self.config.viewport_height}",
]
if self.config.light_mode:
args.extend(BROWSER_DISABLE_OPTIONS)
if self.config.text_mode:
args.extend(
[
"--blink-settings=imagesEnabled=false",
"--disable-remote-fonts",
"--disable-images",
"--disable-javascript",
"--disable-software-rasterizer",
"--disable-dev-shm-usage",
]
)
if self.config.extra_args:
args.extend(self.config.extra_args)
browser_args = {"headless": self.config.headless, "args": args}
if self.config.chrome_channel:
browser_args["channel"] = self.config.chrome_channel
if self.config.accept_downloads:
browser_args["downloads_path"] = self.config.downloads_path or os.path.join(
os.getcwd(), "downloads"
)
os.makedirs(browser_args["downloads_path"], exist_ok=True)
if self.config.proxy or self.config.proxy_config:
from playwright.async_api import ProxySettings
proxy_settings = (
ProxySettings(server=self.config.proxy)
if self.config.proxy
else ProxySettings(
server=self.config.proxy_config.get("server"),
username=self.config.proxy_config.get("username"),
password=self.config.proxy_config.get("password"),
)
)
browser_args["proxy"] = proxy_settings
return browser_args
async def setup_context(
self,
context: BrowserContext,
crawlerRunConfig: CrawlerRunConfig = None,
is_default=False,
):
"""
Set up a browser context with the configured options.
How it works:
1. Set extra HTTP headers if provided.
2. Add cookies if provided.
3. Load storage state if provided.
4. Accept downloads if enabled.
5. Set default timeouts for navigation and download.
6. Set user agent if provided.
7. Set browser hints if provided.
8. Set proxy if provided.
9. Set downloads path if provided.
10. Set storage state if provided.
11. Set cache if provided.
12. Set extra HTTP headers if provided.
13. Add cookies if provided.
14. Set default timeouts for navigation and download if enabled.
15. Set user agent if provided.
16. Set browser hints if provided.
Args:
context (BrowserContext): The browser context to set up
crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings
is_default (bool): Flag indicating if this is the default context
Returns:
None
"""
if self.config.headers:
await context.set_extra_http_headers(self.config.headers)
if self.config.cookies:
await context.add_cookies(self.config.cookies)
if self.config.storage_state:
await context.storage_state(path=None)
if self.config.accept_downloads:
context.set_default_timeout(DOWNLOAD_PAGE_TIMEOUT)
context.set_default_navigation_timeout(DOWNLOAD_PAGE_TIMEOUT)
if self.config.downloads_path:
context._impl_obj._options["accept_downloads"] = True
context._impl_obj._options[
"downloads_path"
] = self.config.downloads_path
# Handle user agent and browser hints
if self.config.user_agent:
combined_headers = {
"User-Agent": self.config.user_agent,
"sec-ch-ua": self.config.browser_hint,
}
combined_headers.update(self.config.headers)
await context.set_extra_http_headers(combined_headers)
# Add default cookie
await context.add_cookies(
[
{
"name": "cookiesEnabled",
"value": "true",
"url": crawlerRunConfig.url
if crawlerRunConfig
else "https://crawl4ai.com/",
}
]
)
# Handle navigator overrides
if crawlerRunConfig:
if (
crawlerRunConfig.override_navigator
or crawlerRunConfig.simulate_user
or crawlerRunConfig.magic
):
await context.add_init_script(load_js_script("navigator_overrider"))
async def create_browser_context(self, crawlerRunConfig: CrawlerRunConfig = None):
"""
Creates and returns a new browser context with configured settings.
Applies text-only mode settings if text_mode is enabled in config.
Returns:
Context: Browser context object with the specified configurations
"""
# Base settings
user_agent = self.config.headers.get("User-Agent", self.config.user_agent)
viewport_settings = {
"width": self.config.viewport_width,
"height": self.config.viewport_height,
}
proxy_settings = {"server": self.config.proxy} if self.config.proxy else None
blocked_extensions = [
# Images
"jpg",
"jpeg",
"png",
"gif",
"webp",
"svg",
"ico",
"bmp",
"tiff",
"psd",
# Fonts
"woff",
"woff2",
"ttf",
"otf",
"eot",
# Styles
# 'css', 'less', 'scss', 'sass',
# Media
"mp4",
"webm",
"ogg",
"avi",
"mov",
"wmv",
"flv",
"m4v",
"mp3",
"wav",
"aac",
"m4a",
"opus",
"flac",
# Documents
"pdf",
"doc",
"docx",
"xls",
"xlsx",
"ppt",
"pptx",
# Archives
"zip",
"rar",
"7z",
"tar",
"gz",
# Scripts and data
"xml",
"swf",
"wasm",
]
# Common context settings
context_settings = {
"user_agent": user_agent,
"viewport": viewport_settings,
"proxy": proxy_settings,
"accept_downloads": self.config.accept_downloads,
"storage_state": self.config.storage_state,
"ignore_https_errors": self.config.ignore_https_errors,
"device_scale_factor": 1.0,
"java_script_enabled": self.config.java_script_enabled,
}
if crawlerRunConfig:
# Check if there is value for crawlerRunConfig.proxy_config set add that to context
if crawlerRunConfig.proxy_config:
proxy_settings = {
"server": crawlerRunConfig.proxy_config.get("server"),
}
if crawlerRunConfig.proxy_config.get("username"):
proxy_settings.update({
"username": crawlerRunConfig.proxy_config.get("username"),
"password": crawlerRunConfig.proxy_config.get("password"),
})
context_settings["proxy"] = proxy_settings
if self.config.text_mode:
text_mode_settings = {
"has_touch": False,
"is_mobile": False,
}
# Update context settings with text mode settings
context_settings.update(text_mode_settings)
# Create and return the context with all settings
context = await self.browser.new_context(**context_settings)
# Apply text mode settings if enabled
if self.config.text_mode:
# Create and apply route patterns for each extension
for ext in blocked_extensions:
await context.route(f"**/*.{ext}", lambda route: route.abort())
return context
def _make_config_signature(self, crawlerRunConfig: CrawlerRunConfig) -> str:
"""
Converts the crawlerRunConfig into a dict, excludes ephemeral fields,
then returns a hash of the sorted JSON. This yields a stable signature
that identifies configurations requiring a unique browser context.
"""
import json
config_dict = crawlerRunConfig.__dict__.copy()
# Exclude items that do not affect browser-level setup.
# Expand or adjust as needed, e.g. chunking_strategy is purely for data extraction, not for browser config.
ephemeral_keys = [
"session_id",
"js_code",
"scraping_strategy",
"extraction_strategy",
"chunking_strategy",
"cache_mode",
"content_filter",
"semaphore_count",
"url"
]
for key in ephemeral_keys:
if key in config_dict:
del config_dict[key]
# Convert to canonical JSON string
signature_json = json.dumps(config_dict, sort_keys=True, default=str)
# Hash the JSON so we get a compact, unique string
signature_hash = hashlib.sha256(signature_json.encode("utf-8")).hexdigest()
return signature_hash
async def get_page(self, crawlerRunConfig: CrawlerRunConfig):
"""
Get a page for the given session ID, creating a new one if needed.
Args:
crawlerRunConfig (CrawlerRunConfig): Configuration object containing all browser settings
Returns:
(page, context): The Page and its BrowserContext
"""
self._cleanup_expired_sessions()
# If a session_id is provided and we already have it, reuse that page + context
if crawlerRunConfig.session_id and crawlerRunConfig.session_id in self.sessions:
context, page, _ = self.sessions[crawlerRunConfig.session_id]
# Update last-used timestamp
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
return page, context
# If using a managed browser, just grab the shared default_context
if self.config.use_managed_browser:
context = self.default_context
page = await context.new_page()
else:
# Otherwise, check if we have an existing context for this config
config_signature = self._make_config_signature(crawlerRunConfig)
async with self._contexts_lock:
if config_signature in self.contexts_by_config:
context = self.contexts_by_config[config_signature]
else:
# Create and setup a new context
context = await self.create_browser_context(crawlerRunConfig)
await self.setup_context(context, crawlerRunConfig)
self.contexts_by_config[config_signature] = context
# Create a new page from the chosen context
page = await context.new_page()
# If a session_id is specified, store this session so we can reuse later
if crawlerRunConfig.session_id:
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
return page, context
async def kill_session(self, session_id: str):
"""
Kill a browser session and clean up resources.
Args:
session_id (str): The session ID to kill.
"""
if session_id in self.sessions:
context, page, _ = self.sessions[session_id]
await page.close()
if not self.config.use_managed_browser:
await context.close()
del self.sessions[session_id]
def _cleanup_expired_sessions(self):
"""Clean up expired sessions based on TTL."""
current_time = time.time()
expired_sessions = [
sid
for sid, (_, _, last_used) in self.sessions.items()
if current_time - last_used > self.session_ttl
]
for sid in expired_sessions:
asyncio.create_task(self.kill_session(sid))
async def close(self):
"""Close all browser resources and clean up."""
if self.config.sleep_on_close:
await asyncio.sleep(0.5)
session_ids = list(self.sessions.keys())
for session_id in session_ids:
await self.kill_session(session_id)
# Now close all contexts we created. This reclaims memory from ephemeral contexts.
for ctx in self.contexts_by_config.values():
try:
await ctx.close()
except Exception as e:
self.logger.error(
message="Error closing context: {error}",
tag="ERROR",
params={"error": str(e)}
)
self.contexts_by_config.clear()
if self.browser:
await self.browser.close()
self.browser = None
if self.managed_browser:
await asyncio.sleep(0.5)
await self.managed_browser.cleanup()
self.managed_browser = None
if self.playwright:
await self.playwright.stop()
self.playwright = None

View File

@@ -39,7 +39,9 @@ dependencies = [
"httpx==0.27.2",
"fake-useragent>=2.0.3",
"click>=8.1.7",
"pyperclip>=1.8.2"
"pyperclip>=1.8.2",
"cchardet>=2.1.7",
"aiohttp>=3.11.11"
]
classifiers = [
"Development Status :: 4 - Beta",

View File

@@ -0,0 +1,56 @@
import asyncio
from crawl4ai import (
AsyncWebCrawler,
CrawlerRunConfig,
HTTPCrawlerConfig,
CacheMode,
DefaultMarkdownGenerator,
PruningContentFilter
)
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
from crawl4ai.async_logger import AsyncLogger
async def main():
# Initialize HTTP crawler strategy
http_strategy = AsyncHTTPCrawlerStrategy(
browser_config=HTTPCrawlerConfig(
method="GET",
verify_ssl=True,
follow_redirects=True
),
logger=AsyncLogger(verbose=True)
)
# Initialize web crawler with HTTP strategy
async with AsyncWebCrawler(crawler_strategy=http_strategy) as crawler:
crawler_config = CrawlerRunConfig(
cache_mode=CacheMode.BYPASS,
markdown_generator=DefaultMarkdownGenerator(
content_filter=PruningContentFilter(
threshold=0.48,
threshold_type="fixed",
min_word_threshold=0
)
)
)
# Test different URLs
urls = [
"https://example.com",
"https://httpbin.org/get",
"raw://<html><body>Test content</body></html>"
]
for url in urls:
print(f"\n=== Testing {url} ===")
try:
result = await crawler.arun(url=url, config=crawler_config)
print(f"Status: {result.status_code}")
print(f"Raw HTML length: {len(result.html)}")
if hasattr(result, 'markdown_v2'):
print(f"Markdown length: {len(result.markdown_v2.raw_markdown)}")
except Exception as e:
print(f"Error: {e}")
if __name__ == "__main__":
asyncio.run(main())

View File

@@ -0,0 +1,116 @@
from tkinter import N
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
from crawl4ai.async_logger import AsyncLogger
from crawl4ai import CrawlerRunConfig, HTTPCrawlerConfig
from crawl4ai.async_crawler_strategy import ConnectionTimeoutError
import asyncio
import os
async def main():
"""Test the AsyncHTTPCrawlerStrategy with various scenarios"""
logger = AsyncLogger(verbose=True)
# Initialize the strategy with default HTTPCrawlerConfig
crawler = AsyncHTTPCrawlerStrategy(
browser_config=HTTPCrawlerConfig(),
logger=logger
)
# Test 1: Basic HTTP GET
print("\n=== Test 1: Basic HTTP GET ===")
result = await crawler.crawl("https://example.com")
print(f"Status: {result.status_code}")
print(f"Content length: {len(result.html)}")
print(f"Headers: {dict(result.response_headers)}")
# Test 2: POST request with JSON
print("\n=== Test 2: POST with JSON ===")
crawler.browser_config = crawler.browser_config.clone(
method="POST",
json={"test": "data"},
headers={"Content-Type": "application/json"}
)
try:
result = await crawler.crawl(
"https://httpbin.org/post",
)
print(f"Status: {result.status_code}")
print(f"Response: {result.html[:200]}...")
except Exception as e:
print(f"Error: {e}")
# Test 3: File handling
crawler.browser_config = HTTPCrawlerConfig()
print("\n=== Test 3: Local file handling ===")
# Create a tmp file with test content
from tempfile import NamedTemporaryFile
with NamedTemporaryFile(delete=False) as f:
f.write(b"<html><body>Test content</body></html>")
f.close()
result = await crawler.crawl(f"file://{f.name}")
print(f"File content: {result.html}")
# Test 4: Raw content
print("\n=== Test 4: Raw content handling ===")
raw_html = "raw://<html><body>Raw test content</body></html>"
result = await crawler.crawl(raw_html)
print(f"Raw content: {result.html}")
# Test 5: Custom hooks
print("\n=== Test 5: Custom hooks ===")
async def before_request(url, kwargs):
print(f"Before request to {url}")
kwargs['headers']['X-Custom'] = 'test'
async def after_request(response):
print(f"After request, status: {response.status_code}")
crawler.set_hook('before_request', before_request)
crawler.set_hook('after_request', after_request)
result = await crawler.crawl("https://example.com")
# Test 6: Error handling
print("\n=== Test 6: Error handling ===")
try:
await crawler.crawl("https://nonexistent.domain.test")
except Exception as e:
print(f"Expected error: {e}")
# Test 7: Redirects
print("\n=== Test 7: Redirect handling ===")
crawler.browser_config = HTTPCrawlerConfig(follow_redirects=True)
result = await crawler.crawl("http://httpbin.org/redirect/1")
print(f"Final URL: {result.redirected_url}")
# Test 8: Custom timeout
print("\n=== Test 8: Custom timeout ===")
try:
await crawler.crawl(
"https://httpbin.org/delay/5",
config=CrawlerRunConfig(page_timeout=2)
)
except ConnectionTimeoutError as e:
print(f"Expected timeout: {e}")
# Test 9: SSL verification
print("\n=== Test 9: SSL verification ===")
crawler.browser_config = HTTPCrawlerConfig(verify_ssl=False)
try:
await crawler.crawl("https://expired.badssl.com/")
print("Connected to invalid SSL site with verification disabled")
except Exception as e:
print(f"SSL error: {e}")
# Test 10: Large file streaming
print("\n=== Test 10: Large file streaming ===")
from tempfile import NamedTemporaryFile
with NamedTemporaryFile(delete=False) as f:
f.write(b"<html><body>" + b"X" * 1024 * 1024 * 10 + b"</body></html>")
f.close()
result = await crawler.crawl("file://" + f.name)
print(f"Large file content length: {len(result.html)}")
os.remove(f.name)
crawler.close()
if __name__ == "__main__":
asyncio.run(main())