Compare commits
2 Commits
main
...
next-brows
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b11a91e1dd | ||
|
|
7aaaaae461 |
1
.gitignore
vendored
1
.gitignore
vendored
@@ -225,3 +225,4 @@ tree.md
|
||||
.scripts
|
||||
.local
|
||||
.do
|
||||
plans/
|
||||
@@ -11,7 +11,7 @@ from .user_agent_generator import UserAgentGenerator
|
||||
from .extraction_strategy import ExtractionStrategy
|
||||
from .chunking_strategy import ChunkingStrategy
|
||||
from .markdown_generation_strategy import MarkdownGenerationStrategy
|
||||
from typing import Union, List
|
||||
from typing import Union, List, Optional
|
||||
|
||||
|
||||
class BrowserConfig:
|
||||
@@ -27,11 +27,12 @@ class BrowserConfig:
|
||||
Default: "chromium".
|
||||
headless (bool): Whether to run the browser in headless mode (no visible GUI).
|
||||
Default: True.
|
||||
use_managed_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
|
||||
use_remote_browser (bool): Launch the browser using a managed approach (e.g., via CDP), allowing
|
||||
advanced manipulation. Default: False.
|
||||
use_docker (bool): Whether to use Docker for browser execution. Default: False.
|
||||
debugging_port (int): Port for the browser debugging protocol. Default: 9222.
|
||||
use_persistent_context (bool): Use a persistent browser context (like a persistent profile).
|
||||
Automatically sets use_managed_browser=True. Default: False.
|
||||
Automatically sets use_remote_browser=True. Default: False.
|
||||
user_data_dir (str or None): Path to a user data directory for persistent sessions. If None, a
|
||||
temporary directory may be used. Default: None.
|
||||
chrome_channel (str): The Chrome channel to launch (e.g., "chrome", "msedge"). Only applies if browser_type
|
||||
@@ -76,9 +77,10 @@ class BrowserConfig:
|
||||
self,
|
||||
browser_type: str = "chromium",
|
||||
headless: bool = True,
|
||||
use_managed_browser: bool = False,
|
||||
use_remote_browser: bool = False,
|
||||
use_docker: bool = False,
|
||||
use_persistent_context: bool = False,
|
||||
user_data_dir: str = None,
|
||||
user_data_dir: Optional[str] = None,
|
||||
chrome_channel: str = "chromium",
|
||||
channel: str = "chromium",
|
||||
proxy: str = None,
|
||||
@@ -107,7 +109,8 @@ class BrowserConfig:
|
||||
):
|
||||
self.browser_type = browser_type
|
||||
self.headless = headless
|
||||
self.use_managed_browser = use_managed_browser
|
||||
self.use_remote_browser = use_remote_browser
|
||||
self.use_docker = use_docker
|
||||
self.use_persistent_context = use_persistent_context
|
||||
self.user_data_dir = user_data_dir
|
||||
self.chrome_channel = chrome_channel or self.browser_type or "chromium"
|
||||
@@ -148,14 +151,15 @@ class BrowserConfig:
|
||||
|
||||
# If persistent context is requested, ensure managed browser is enabled
|
||||
if self.use_persistent_context:
|
||||
self.use_managed_browser = True
|
||||
self.use_remote_browser = True
|
||||
|
||||
@staticmethod
|
||||
def from_kwargs(kwargs: dict) -> "BrowserConfig":
|
||||
return BrowserConfig(
|
||||
browser_type=kwargs.get("browser_type", "chromium"),
|
||||
headless=kwargs.get("headless", True),
|
||||
use_managed_browser=kwargs.get("use_managed_browser", False),
|
||||
use_remote_browser=kwargs.get("use_remote_browser", False),
|
||||
use_docker=kwargs.get("use_docker", False),
|
||||
use_persistent_context=kwargs.get("use_persistent_context", False),
|
||||
user_data_dir=kwargs.get("user_data_dir"),
|
||||
chrome_channel=kwargs.get("chrome_channel", "chromium"),
|
||||
|
||||
@@ -24,6 +24,7 @@ from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .async_logger import AsyncLogger
|
||||
from playwright_stealth import StealthConfig, stealth_async
|
||||
from .ssl_certificate import SSLCertificate
|
||||
from .browser_farm.docker_browser import DockerBrowser
|
||||
|
||||
stealth_config = StealthConfig(
|
||||
webdriver=True,
|
||||
@@ -63,7 +64,7 @@ BROWSER_DISABLE_OPTIONS = [
|
||||
]
|
||||
|
||||
|
||||
class ManagedBrowser:
|
||||
class RemoteConnector:
|
||||
"""
|
||||
Manages the browser process and context. This class allows to connect to the browser using CDP protocol.
|
||||
|
||||
@@ -104,7 +105,7 @@ class ManagedBrowser:
|
||||
debugging_port: int = 9222,
|
||||
):
|
||||
"""
|
||||
Initialize the ManagedBrowser instance.
|
||||
Initialize the RemoteConnector instance.
|
||||
|
||||
Args:
|
||||
browser_type (str): The type of browser to launch. Supported values: "chromium", "firefox", "webkit".
|
||||
@@ -295,7 +296,7 @@ class BrowserManager:
|
||||
logger: Logger instance for recording events and errors
|
||||
browser (Browser): The browser instance
|
||||
default_context (BrowserContext): The default browser context
|
||||
managed_browser (ManagedBrowser): The managed browser instance
|
||||
remote_browser (RemoteConnector or DockerBrowser): The remote browser instance
|
||||
playwright (Playwright): The Playwright instance
|
||||
sessions (dict): Dictionary to store session information
|
||||
session_ttl (int): Session timeout in seconds
|
||||
@@ -314,16 +315,18 @@ class BrowserManager:
|
||||
# Browser state
|
||||
self.browser = None
|
||||
self.default_context = None
|
||||
self.managed_browser = None
|
||||
self.remote_browser = None # Used for both managed and docker browsers
|
||||
self.playwright = None
|
||||
|
||||
# Session management
|
||||
self.sessions = {}
|
||||
self.session_ttl = 1800 # 30 minutes
|
||||
|
||||
# Initialize ManagedBrowser if needed
|
||||
if self.config.use_managed_browser:
|
||||
self.managed_browser = ManagedBrowser(
|
||||
# Initialize remote browser connection
|
||||
if self.config.use_docker:
|
||||
self.remote_browser = DockerBrowser()
|
||||
elif self.config.use_remote_browser:
|
||||
self.remote_browser = RemoteConnector(
|
||||
browser_type=self.config.browser_type,
|
||||
user_data_dir=self.config.user_data_dir,
|
||||
headless=self.config.headless,
|
||||
@@ -348,29 +351,24 @@ class BrowserManager:
|
||||
|
||||
self.playwright = await async_playwright().start()
|
||||
|
||||
if self.config.use_managed_browser:
|
||||
cdp_url = await self.managed_browser.start()
|
||||
if self.config.use_docker or self.config.use_remote_browser:
|
||||
# Get endpoint from remote browser (Docker or managed)
|
||||
if self.config.use_docker:
|
||||
host, port = await self.remote_browser.get_browser_endpoint()
|
||||
cdp_url = f"http://{host}:{port}"
|
||||
else:
|
||||
cdp_url = await self.remote_browser.start()
|
||||
|
||||
# Connect to remote browser
|
||||
self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
|
||||
contexts = self.browser.contexts
|
||||
if contexts:
|
||||
self.default_context = contexts[0]
|
||||
else:
|
||||
self.default_context = await self.create_browser_context()
|
||||
# self.default_context = await self.browser.new_context(
|
||||
# viewport={
|
||||
# "width": self.config.viewport_width,
|
||||
# "height": self.config.viewport_height,
|
||||
# },
|
||||
# storage_state=self.config.storage_state,
|
||||
# user_agent=self.config.headers.get(
|
||||
# "User-Agent", self.config.user_agent
|
||||
# ),
|
||||
# accept_downloads=self.config.accept_downloads,
|
||||
# ignore_https_errors=self.config.ignore_https_errors,
|
||||
# java_script_enabled=self.config.java_script_enabled,
|
||||
# )
|
||||
await self.setup_context(self.default_context)
|
||||
await self.setup_context(self.default_context, None)
|
||||
else:
|
||||
# Regular browser launch code
|
||||
browser_args = self._build_browser_args()
|
||||
|
||||
# Launch appropriate browser type
|
||||
@@ -453,12 +451,7 @@ class BrowserManager:
|
||||
|
||||
return browser_args
|
||||
|
||||
async def setup_context(
|
||||
self,
|
||||
context: BrowserContext,
|
||||
crawlerRunConfig: CrawlerRunConfig,
|
||||
is_default=False,
|
||||
):
|
||||
async def setup_context(self, context: BrowserContext, crawlerRunConfig: CrawlerRunConfig, is_default=False, ):
|
||||
"""
|
||||
Set up a browser context with the configured options.
|
||||
|
||||
@@ -514,11 +507,11 @@ class BrowserManager:
|
||||
combined_headers.update(self.config.headers)
|
||||
await context.set_extra_http_headers(combined_headers)
|
||||
|
||||
if crawlerRunConfig:
|
||||
# Add default cookie
|
||||
await context.add_cookies(
|
||||
[{"name": "cookiesEnabled", "value": "true", "url": crawlerRunConfig.url}]
|
||||
)
|
||||
|
||||
# Handle navigator overrides
|
||||
if (
|
||||
crawlerRunConfig.override_navigator
|
||||
@@ -610,7 +603,7 @@ class BrowserManager:
|
||||
self.sessions[crawlerRunConfig.session_id] = (context, page, time.time())
|
||||
return page, context
|
||||
|
||||
if self.config.use_managed_browser:
|
||||
if self.config.use_remote_browser:
|
||||
context = self.default_context
|
||||
page = await context.new_page()
|
||||
else:
|
||||
@@ -633,7 +626,7 @@ class BrowserManager:
|
||||
if session_id in self.sessions:
|
||||
context, page, _ = self.sessions[session_id]
|
||||
await page.close()
|
||||
if not self.config.use_managed_browser:
|
||||
if not self.config.use_remote_browser:
|
||||
await context.close()
|
||||
del self.sessions[session_id]
|
||||
|
||||
@@ -661,10 +654,11 @@ class BrowserManager:
|
||||
await self.browser.close()
|
||||
self.browser = None
|
||||
|
||||
if self.managed_browser:
|
||||
if self.remote_browser:
|
||||
await asyncio.sleep(0.5)
|
||||
await self.managed_browser.cleanup()
|
||||
self.managed_browser = None
|
||||
if hasattr(self.remote_browser, 'cleanup'):
|
||||
await self.remote_browser.cleanup()
|
||||
self.remote_browser = None
|
||||
|
||||
if self.playwright:
|
||||
await self.playwright.stop()
|
||||
|
||||
48
crawl4ai/browser_farm/Dockerfile
Normal file
48
crawl4ai/browser_farm/Dockerfile
Normal file
@@ -0,0 +1,48 @@
|
||||
# Browser-farm container: a slim Python image that runs one headless
# Chromium instance and relays its CDP endpoint to the outside world.
FROM python:3.10-slim

# System dependencies for Playwright/Chromium (X/graphics/audio libs the
# browser links against, plus socat for the port relay below).
RUN apt-get update && apt-get install -y \
    wget \
    gnupg \
    libglib2.0-0 \
    libnss3 \
    libnspr4 \
    libatk1.0-0 \
    libatk-bridge2.0-0 \
    libcups2 \
    libdrm2 \
    libdbus-1-3 \
    libxcb1 \
    libxkbcommon0 \
    libx11-6 \
    libx11-xcb1 \
    libxcb-dri3-0 \
    libxcomposite1 \
    libxdamage1 \
    libxext6 \
    libxfixes3 \
    libxrandr2 \
    libgbm1 \
    libpango-1.0-0 \
    libcairo2 \
    libasound2 \
    socat \
    && rm -rf /var/lib/apt/lists/*

# Install crawl4ai and setup (crawl4ai-setup downloads the Playwright browsers)
# NOTE(review): version is unpinned; consider `pip install crawl4ai==X.Y.Z`
# so image rebuilds are reproducible.
RUN pip install crawl4ai
RUN crawl4ai-setup

# Add startup script: launch headless Chromium with the debugging protocol on
# 9222, then relay 9223 -> 127.0.0.1:9222 with socat — presumably because the
# CDP listener binds to localhost only, so the relay makes it reachable from
# outside the container. TODO confirm.
# NOTE(review): the chromium-1148 path is pinned to one specific Playwright
# browser build; confirm it matches what crawl4ai-setup installs, or resolve
# the path dynamically at startup.
RUN echo '#!/bin/bash\n\
/root/.cache/ms-playwright/chromium-1148/chrome-linux/chrome \
--remote-debugging-port=9222 \
--no-sandbox \
--headless=new \
--disable-gpu &\n\
sleep 2\n\
socat TCP-LISTEN:9223,fork,reuseaddr TCP:127.0.0.1:9222\n' > /start.sh && \
chmod +x /start.sh

# Only the relayed port is published; 9222 stays container-internal.
EXPOSE 9223
CMD ["/start.sh"]
|
||||
4
crawl4ai/browser_farm/__init__.py
Normal file
4
crawl4ai/browser_farm/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
from .service import BrowserFarmService, BrowserEndpoint
|
||||
from .docker_browser import DockerBrowser
|
||||
|
||||
__all__ = ['BrowserFarmService', 'BrowserEndpoint', 'DockerBrowser']
|
||||
12
crawl4ai/browser_farm/docker_browser.py
Normal file
12
crawl4ai/browser_farm/docker_browser.py
Normal file
@@ -0,0 +1,12 @@
|
||||
from .service import BrowserFarmService
|
||||
|
||||
class DockerBrowser:
    """Thin client that resolves a browser endpoint via BrowserFarmService.

    Used by the browser manager when ``use_docker`` is enabled: it asks the
    running farm service for an available browser and exposes its address.
    """

    def __init__(self):
        # The farm service knows which browsers are up and healthy.
        self.service = BrowserFarmService()

    async def get_browser_endpoint(self) -> tuple[str, int]:
        """Return ``(host, port)`` of an available remote browser."""
        available = await self.service.get_available_browser()
        return available.host, available.port
|
||||
33
crawl4ai/browser_farm/service.py
Normal file
33
crawl4ai/browser_farm/service.py
Normal file
@@ -0,0 +1,33 @@
|
||||
from dataclasses import dataclass
|
||||
import aiohttp
|
||||
|
||||
@dataclass
class BrowserEndpoint:
    """Network address of a single browser in the farm."""

    # Hostname or IP where the browser's debugging endpoint is reachable.
    host: str
    # TCP port of the debugging endpoint.
    port: int
|
||||
|
||||
class BrowserFarmService:
    """MVP farm service: exposes a single, hardcoded browser endpoint.

    Later iterations are expected to manage a pool of Docker-hosted
    browsers; for now there is exactly one, reached through the host port
    mapped from the container.
    """

    def __init__(self):
        # For MVP, hardcode the values
        self._browser = BrowserEndpoint(
            host="localhost",
            port=9333  # Mapped from Docker's 9223
        )

    async def get_available_browser(self) -> BrowserEndpoint:
        """Return the single browser endpoint.

        Raises:
            ConnectionError: if the endpoint fails the health check.
        """
        if not await self.health_check():
            raise ConnectionError("No healthy browser available")
        return self._browser

    async def health_check(self) -> bool:
        """Best-effort check that the CDP endpoint answers with HTTP 200."""
        try:
            # Cap the probe so a wedged endpoint cannot hang callers forever.
            timeout = aiohttp.ClientTimeout(total=5)
            async with aiohttp.ClientSession(timeout=timeout) as session:
                url = f"http://{self._browser.host}:{self._browser.port}/json/version"
                async with session.get(url) as response:
                    return response.status == 200
        except Exception:
            # Narrowed from a bare `except:`, which would also swallow
            # KeyboardInterrupt/SystemExit and asyncio.CancelledError —
            # silently breaking task cancellation in async callers.
            return False
|
||||
31
docs/examples/hello_world_docker.py
Normal file
31
docs/examples/hello_world_docker.py
Normal file
@@ -0,0 +1,31 @@
|
||||
import asyncio
|
||||
from crawl4ai import *
|
||||
|
||||
async def main():
    """Crawl a sample page through a Docker-hosted remote browser."""
    # Route the crawl through the Docker browser farm instead of
    # launching a local browser.
    browser_config = BrowserConfig(
        headless=True,
        verbose=True,
        use_docker=True  # Enable Docker browser
    )

    async with AsyncWebCrawler(config=browser_config) as crawler:
        run_config = CrawlerRunConfig(
            cache_mode=CacheMode.BYPASS,
            markdown_generator=DefaultMarkdownGenerator(
                content_filter=PruningContentFilter(
                    threshold=0.48,
                    threshold_type="fixed",
                    min_word_threshold=0
                )
            )
        )

        result = await crawler.arun(
            url="https://www.helloworld.org",
            config=run_config
        )
        # Show only a preview of the generated markdown.
        print(result.markdown_v2.raw_markdown[:500])


if __name__ == "__main__":
    asyncio.run(main())
|
||||
@@ -160,7 +160,7 @@ async def browser_management_example():
|
||||
print(f"Browser profile will be saved to: {user_data_dir}")
|
||||
|
||||
async with AsyncWebCrawler(
|
||||
use_managed_browser=True,
|
||||
use_remote_browser=True,
|
||||
user_data_dir=user_data_dir,
|
||||
headless=False,
|
||||
verbose=True
|
||||
@@ -221,7 +221,7 @@ async def api_example():
|
||||
},
|
||||
"crawler_params": {
|
||||
"headless": True,
|
||||
# "use_managed_browser": True
|
||||
# "use_remote_browser": True
|
||||
},
|
||||
"cache_mode": "bypass",
|
||||
# "screenshot": True,
|
||||
|
||||
@@ -24,7 +24,7 @@ Crawl4AI empowers you to navigate and interact with the web using your authentic
|
||||
headless=False, # Set to False for initial setup to view browser actions
|
||||
verbose=True,
|
||||
user_agent_mode="random",
|
||||
use_managed_browser=True, # Enables persistent browser sessions
|
||||
use_remote_browser=True, # Enables persistent browser sessions
|
||||
browser_type="chromium",
|
||||
user_data_dir="/path/to/user_profile_data" # Path to save session data
|
||||
)
|
||||
@@ -61,7 +61,7 @@ async def main():
|
||||
browser_config = BrowserConfig(
|
||||
headless=True, # Automate subsequent runs
|
||||
verbose=True,
|
||||
use_managed_browser=True,
|
||||
use_remote_browser=True,
|
||||
user_data_dir="/path/to/user_profile_data"
|
||||
)
|
||||
|
||||
|
||||
@@ -63,7 +63,7 @@ async with AsyncWebCrawler(config=config) as crawler:
|
||||
```
|
||||
|
||||
### Managed Browser
|
||||
The `ManagedBrowser` class offers a high-level abstraction for managing browser instances, emphasizing resource management, debugging capabilities, and anti-bot measures.
|
||||
The `RemoteConnector` class offers a high-level abstraction for managing browser instances, emphasizing resource management, debugging capabilities, and anti-bot measures.
|
||||
|
||||
#### How It Works
|
||||
- **Browser Process Management**:
|
||||
@@ -82,7 +82,7 @@ The `ManagedBrowser` class offers a high-level abstraction for managing browser
|
||||
- **Scalability**:
|
||||
- Handles multiple browser instances concurrently, scaling dynamically based on workload.
|
||||
|
||||
#### Example: Using `ManagedBrowser`
|
||||
#### Example: Using `RemoteConnector`
|
||||
```python
|
||||
from crawl4ai import AsyncWebCrawler, BrowserConfig
|
||||
|
||||
|
||||
@@ -10,7 +10,7 @@
|
||||
| async_crawler_strategy.py | headers | `kwargs.get("headers", {})` | AsyncPlaywrightCrawlerStrategy | Custom HTTP headers for requests |
|
||||
| async_crawler_strategy.py | verbose | `kwargs.get("verbose", False)` | AsyncPlaywrightCrawlerStrategy | Enable detailed logging output |
|
||||
| async_crawler_strategy.py | sleep_on_close | `kwargs.get("sleep_on_close", False)` | AsyncPlaywrightCrawlerStrategy | Add delay before closing browser |
|
||||
| async_crawler_strategy.py | use_managed_browser | `kwargs.get("use_managed_browser", False)` | AsyncPlaywrightCrawlerStrategy | Use managed browser instance |
|
||||
| async_crawler_strategy.py | use_remote_browser | `kwargs.get("use_remote_browser", False)` | AsyncPlaywrightCrawlerStrategy | Use managed browser instance |
|
||||
| async_crawler_strategy.py | user_data_dir | `kwargs.get("user_data_dir", None)` | AsyncPlaywrightCrawlerStrategy | Custom directory for browser profile data |
|
||||
| async_crawler_strategy.py | session_id | `kwargs.get("session_id")` | AsyncPlaywrightCrawlerStrategy | Unique identifier for browser session |
|
||||
| async_crawler_strategy.py | override_navigator | `kwargs.get("override_navigator", False)` | AsyncPlaywrightCrawlerStrategy | Override browser navigator properties |
|
||||
|
||||
@@ -489,7 +489,7 @@ request = {
|
||||
|
||||
# Session Management
|
||||
"user_data_dir": "./browser-data", # Browser profile location
|
||||
"use_managed_browser": True, # Use persistent browser
|
||||
"use_remote_browser": True, # Use persistent browser
|
||||
}
|
||||
}
|
||||
```
|
||||
|
||||
586
plans/browser-farm-context.md
Normal file
586
plans/browser-farm-context.md
Normal file
@@ -0,0 +1,586 @@
|
||||
# Unclecode:
|
||||
|
||||
I plan to create a Docker container that runs multiple instances of the Chromium browser. We will connect these browsers to my library remotely using Playwright for crawling. I will share part of the code that manages the Playwright, which supports connecting to a remote browser. We pass a user data directory to persist data. My ultimate goal is to have a Docker container with many browsers, allowing me to spawn multiple instances in the cloud for users to crawl at scale.
|
||||
|
||||
When we launch this Docker, it initially does not run any browsers. Then we will create a new module in the library in charge of managing this Docker, and that will create the browsers within that Docker, enabling us to set up multiple browsers with different configurations, settings, and user data directories, or even without a user data directory. We will communicate with these browsers accordingly.
|
||||
|
||||
To achieve this, we need to create a class responsible for monitoring and managing the Docker. This class will create the Docker, turn it on, or build it for the first time. Later, we will convert this Docker into an image containing all the required libraries, preferably based on Ubuntu for ease of use. The Python wrapper we create will include functions to manage these multiple Docker instances.
|
||||
|
||||
To simplify the initial approach, we can start by imagining a single Docker container. We will create this Docker, keep it running, and then develop a Python class capable of creating a browser. We will use Crawl4ai to connect to that browser and crawl a simple website. If this is successful, we can gradually expand from there.
|
||||
|
||||
Before implementing any code, I would like you to rewrite what I just explained to ensure we have a common understanding. Please digest this information and present it in a professional manner, reflecting my needs and suggesting how we should proceed without jumping into coding just yet.
|
||||
|
||||
It's really important to me that you act as a professional software engineer in this area, understand my needs, and suggest the best way to handle them.
|
||||
|
||||
## AI
|
||||
|
||||
Core Objective:
|
||||
- Create a scalable browser farm for distributed web crawling using Docker containers
|
||||
- Enable remote connection to these browsers via Playwright
|
||||
- Support multiple concurrent browser instances with different configurations
|
||||
|
||||
Architecture Components:
|
||||
|
||||
1. Docker Container Layer:
|
||||
- Base container running Ubuntu
|
||||
- No pre-running browsers at startup
|
||||
- Contains all necessary dependencies for Chromium/Playwright
|
||||
- Ability to spawn multiple browser instances on demand
|
||||
|
||||
2. Browser Management Layer:
|
||||
- Python class to handle Docker lifecycle (create/start/stop/rebuild)
|
||||
- Functions to spawn/manage browser instances within the container
|
||||
- Support for different browser configs (with/without user data dirs)
|
||||
- Connection management via Playwright's remote capabilities
|
||||
|
||||
3. Integration with Crawl4ai:
|
||||
- Connect library to remote browsers
|
||||
- Handle crawling operations through remote browser instances
|
||||
- Manage browser sessions and persistence
|
||||
|
||||
Suggested Development Phases:
|
||||
|
||||
1. Single Container MVP:
|
||||
- Create basic Docker setup
|
||||
- Implement Python management class
|
||||
- Test single browser spawning/connection
|
||||
- Validate with simple crawl operation
|
||||
|
||||
2. Multi-Browser Extension:
|
||||
- Add multi-browser support within container
|
||||
- Implement browser pool management
|
||||
- Add configuration profiles
|
||||
|
||||
3. Scale Out:
|
||||
- Multiple container support
|
||||
- Load balancing
|
||||
- Cloud deployment considerations
|
||||
|
||||
Key Technical Considerations:
|
||||
- Container resource management
|
||||
- Browser process isolation
|
||||
- Connection pooling
|
||||
- Error recovery/resilience
|
||||
- Resource cleanup
|
||||
|
||||
Would you like me to elaborate on any of these aspects before we move to implementation?
|
||||
|
||||
# Unclecode
|
||||
|
||||
Ok, before we start ask me if any follow up questions?
|
||||
|
||||
## AI
|
||||
|
||||
Sure, I have a few crucial questions to clarify the architecture:
|
||||
|
||||
1. For browser persistence, do you plan to map user data directories from the host to the container, or keep them inside the container? This affects container lifecycle management.
|
||||
|
||||
2. When spawning browsers, would you prefer:
|
||||
- Pre-defined number at container startup
|
||||
- Dynamic creation on-demand
|
||||
- Or both options?
|
||||
|
||||
3. For browser connection management:
|
||||
- Do you need connection pooling?
|
||||
- Should failed browsers auto-restart?
|
||||
- What's your strategy for connection timeouts/retries?
|
||||
|
||||
4. For monitoring/health checks:
|
||||
- What metrics do you need?
|
||||
- How do you want to handle browser crashes?
|
||||
- Do you need logs from individual browser instances?
|
||||
|
||||
5. For security:
|
||||
- Will browsers need different isolation levels?
|
||||
- Do you need network segregation between instances?
|
||||
- Any specific security requirements for the user data?
|
||||
|
||||
These answers will help shape the implementation details. Which should we tackle first?
|
||||
|
||||
# Unclecode
|
||||
|
||||
|
||||
For the first question, we should support both options. This means that when you create a new browser, we can define whether it will map to the user data directories or be created inside Docker. We can create a Docker container and, by default, map a folder from the home directory of Crawl4ai to use as the default location. When we create or pass any user data directory, we will use that folder. Therefore, when users want to create a data directory, they must save it there and then pass it.
|
||||
|
||||
For the second question, we need dynamic creation. When we create, we can request how many we need. One very important aspect is that this browser management layer must understand resources in a Docker environment, including available memory, CPU, and base resources, and then make decisions. If we need to create a new Docker, it should handle that as well. It's also beneficial to have a predefined number as another option for setup and creation. The key point is that our browser management leaders in this system must understand available resources and act dynamically.
|
||||
|
||||
We must remember that when we have multiple browsers, each browser allows us to create multiple contexts, such as pages or browser tabs, which can operate in parallel, with each one being a separate process. Essentially, we are creating a collection of multi-process applications, so it is crucial to consider the resources. This system must never crash. We should always aim to use close to 70-80% of available resources and not exceed that, while being smart about distributing all incoming requests.
|
||||
|
||||
We can think of this browser management layer as a proxy server, like Nginx. We are building a wrapper that handles many crawling requests and distributes them among multiple Dockers running separately, each with multiple browser instances. However, the developer experience should not be affected by this complexity. Developers should only interact with one browser. In my library, it is possible to pass a session ID, which translates to one page in a browser. When there is a session ID, it means we always use the same page repeatedly to maintain the status of the previous crawling system. This orchestration should ensure simplicity for the developer.
|
||||
|
||||
We have this concept of connection pooling because we create all these browsers and make them available for immediate use. Otherwise, every time we want to create a browser, we have to wait for the object to be created. So, we definitely need this connection pooling. However, as I mentioned, developers shouldn't need to be aware of this; they should just use the library as they currently do. If a browser fails, we must have an auto-restart feature. I assume we have a layer that controls health, performance, and efficiency. This layer decides to increase or decrease the number of available running processes or manage memory. It should have the authority to restart, add, reduce, or make any similar decisions. This relates to timeouts and retries. In my current library, when users crawl, they can pass many parameters. I will share those parameters with you and provide a sample of the code to ensure you understand what the library does before we implement anything.
|
||||
|
||||
For number four, monitoring and health checks, a few things come to mind, but I would like you to add your ideas as well. One important aspect is that if a browser crashes, we need to bring it back into the game if available resources allow. We must log and easily identify the reason, especially when debugging what happened. Every single page in this structure must have a unique locator address. This address includes the Docker, the browser, and finally the page. The combination of this unique address allows us to connect the developer's request to that particular page within this structure. When developers share a session ID, they want to ensure they access the same page during crawling sessions repeatedly. This is crucial. As I mentioned, developers don’t need to know anything about these structures; for them, it’s just a simple interface. We must monitor and regularly check that all running pages and browsers are available. Additionally, when we create a browser, we create multiple pages. This means that every browser has a certain number of pages already open and ready for use. The goal is to maintain a pool of connections to all these pages to enable instant crawling. It's important to note that we don’t just create browsers; we open pages in those browsers and then make them available in our connection tool.
|
||||
|
||||
For no 5, for now we skip it.
|
||||
|
||||
Let's revise your understanding so far. Remember we start small, get some real results, and then iteratively add more. I want to have a minimal version that allows my users to start using Crawl4ai this way, so they have no concerns about the browser and can use the library just as they would on their own machine; this helps them go to production more easily. Then, iteratively receiving reported bugs and fixing them, we add all the features.
|
||||
|
||||
# AI
|
||||
|
||||
Let me summarize the core architecture and initial MVP based on your detailed explanation:
|
||||
|
||||
Core Architecture:
|
||||
- Browser Management Layer acts as a proxy/orchestrator
|
||||
- Each Docker can host multiple browsers
|
||||
- Each browser can have multiple pages/contexts
|
||||
- Resources are monitored and managed at Docker/browser/page level
|
||||
- Connection pooling to pre-initialize pages for instant use
|
||||
- Unique addressing: Docker->Browser->Page
|
||||
- Support for both mapped and container-internal user data directories
|
||||
|
||||
MVP Focus:
|
||||
1. Single Docker Container:
|
||||
- Dynamic browser creation
|
||||
- Pre-initialized pages per browser
|
||||
- Basic resource monitoring
|
||||
- Connection pooling
|
||||
- Session persistence via page reuse
|
||||
|
||||
2. Connection System:
|
||||
- Unique locator system (Docker/Browser/Page)
|
||||
- Session ID mapping to specific pages
|
||||
- Transparent to end users of Crawl4ai
|
||||
|
||||
3. Resource Management:
|
||||
- Basic monitoring of memory/CPU
|
||||
- Keep resource usage under 70-80%
|
||||
- Auto-restart for failed browsers
|
||||
- Simple health checks
|
||||
|
||||
Critical Questions before starting:
|
||||
1. What's the minimum number of pre-initialized pages per browser?
|
||||
2. What metrics define a "healthy" browser/page?
|
||||
3. Should the first version support mapped user directories, or start with in-container only?
|
||||
4. What's the minimum acceptable resource headroom before scaling?
|
||||
|
||||
Would you like to clarify any of these points before we start with the initial implementation?
|
||||
|
||||
# Unclecode
|
||||
|
||||
|
||||
To answer your question, remember that when we finish this, our CLI will have a command line to activate this cluster for end users (crawl4ai-deploy). When they do that, they can pass parameters or attach a YAML file containing all the necessary parameters. For many of them, we can support a fixed number. For example, when you set pre-initialized pages, we can also have dynamic options. Dynamic means we look at the resources, consider a buffer, and then decide how many pages to create based on the available resources. That's how I like to think about it.
|
||||
|
||||
For number two, you can suggest a good starting metric to determine if we have a healthy page. The most important thing is that the page object is available and usable. I think something like that would work.
|
||||
|
||||
Regarding how to pass directories, For the first version, I'm concerned that when users use the system, they may already have a user directory they want to utilize. For example, they might open Chrome browsers remotely, log in to some websites, and want to use that data. So, I think it’s better to map the folder, by the way after installing library ~/.crawl4ai is the home directory; otherwise, we have to transfer this data, and that’s what I believe we should do.
|
||||
|
||||
I don't understand the last question about acceptable resource headroom.
|
||||
|
||||
# AI
|
||||
|
||||
For the resource headroom question, I was asking about when we should trigger scaling decisions - like when should we stop creating new pages/browsers based on resource usage. But I see from your earlier points that we should aim to stay under 70-80% resource utilization.
|
||||
|
||||
Let me clarify a few more specifics before implementation:
|
||||
|
||||
1. CLI Configuration:
|
||||
```yaml
|
||||
# Example crawl4ai-deploy.yaml
|
||||
browsers:
|
||||
init_mode: "dynamic" # or "fixed"
|
||||
fixed_count: 3 # if fixed mode
|
||||
pages_per_browser:
|
||||
mode: "dynamic" # or "fixed"
|
||||
fixed_count: 5 # if fixed mode
|
||||
user_data_dir: "~/.crawl4ai/browser_profiles"
|
||||
|
||||
resources:
|
||||
max_cpu_percent: 80
|
||||
max_memory_percent: 80
|
||||
headroom_buffer: 20 # percent to keep free
|
||||
|
||||
health_check:
|
||||
interval: 30 # seconds
|
||||
metrics:
|
||||
- page_response_time
|
||||
- browser_memory
|
||||
- connection_status
|
||||
```
|
||||
|
||||
2. For health checks, I propose:
|
||||
- Page object responds to basic Playwright commands
|
||||
- Browser process is running
|
||||
- Memory usage per browser instance
|
||||
- Connection can be established
|
||||
- Basic navigation works
|
||||
|
||||
Do these configurations look right? Should we adjust anything before starting the implementation?
|
||||
|
||||
Also, for the MVP, would you prefer we start with:
|
||||
1. Docker setup + basic browser spawning
|
||||
2. Or the Python management layer first?
|
||||
|
||||
# Unclecode
|
||||
|
||||
Sounds good, about how to start, My brain works better when I receive immediate feedback rather than working for a long time and then testing it. A good way for me to start is to create this management layer that sets up the Docker, then creates the browser and page, or reads the YAML file to do that, and then uses Crawl4ai to perform simple crawling. I can skip the Docker creation part, but I know it's not very difficult. Alternatively, I could create a Docker in the command line and then use it, but I prefer to see this part. I hope to complete this creation process, make it available, connect it to my current library, and then run it. So, yeah.
|
||||
|
||||
# AI
|
||||
|
||||
Got it - let's start with immediate feedback loop. Here's our first steps:
|
||||
|
||||
1. Create basic Browser Manager class that:
|
||||
- Creates/controls Docker container
|
||||
- Creates browsers with pages
|
||||
- Provides connection info for Crawl4ai
|
||||
|
||||
2. Simple YAML config reader
|
||||
|
||||
3. Quick test script using Crawl4ai to verify it works
|
||||
|
||||
Before coding, need to confirm:
|
||||
1. Do you want to start with a hardcoded Docker setup first (just Python side), or include Docker file creation too?
|
||||
2. For testing - what's a simple site we should use?
|
||||
3. Should I see your current Playwright connection code first to ensure compatibility?
|
||||
|
||||
This way we get something running end-to-end quickly, then iterate on each piece.
|
||||
|
||||
# Unclecode
|
||||
|
||||
Ok check:
|
||||
@plans/docstring.md
|
||||
@crawl4ai/async_crawler_strategy.py
|
||||
@docs/example/hello_world.py
|
||||
|
||||
We need a service class in charge of creation/management/communication of this Docker network (cluster node or whatever you call it). We launch it using the CLI; it basically will be a running service monitoring the cluster (like a Redis server or MongoDB server). It keeps checking, managing crashes and more... and communicates with the library.
|
||||
|
||||
Then BrowserManager, when it sees use_docker is set ON, uses a new class (let's call it DockerBrowser) that calls the running service to get a host/port for the selected or available browser, and creates a RemoteConnector pointing to that address. From this point forward, the rest is like we are using RemoteConnector (I am thinking of changing the name to RemoteBrowser):
|
||||
|
||||
AsyncWebCrawler -> AsyncPlaywrightCrawlerStrategy -> BrowserManager -> DockerBrowser (communicates with the running service to get host/port) -> RemoteConnector
|
||||
|
||||
Is that clear? And please be my guest and suggest better names for these new classes
|
||||
|
||||
# AI
|
||||
|
||||
Ah, now I understand better! Let me propose a clearer architecture:
|
||||
|
||||
```python
|
||||
# 1. Service Layer (Running as daemon)
|
||||
class BrowserFarmService: # or CrawlClusterService?
|
||||
"""Manages the Docker browser farm, running as a service"""
|
||||
def __init__(self, config_path: str = "~/.crawl4ai/farm.yaml"):
|
||||
self.containers = {} # track running containers
|
||||
self.browser_pool = {} # available browsers
|
||||
self.resource_monitor = ResourceMonitor()
|
||||
|
||||
async def start(self):
|
||||
"""Start the service, init Docker setup"""
|
||||
pass
|
||||
|
||||
async def get_available_browser(self) -> BrowserEndpoint:
|
||||
"""Returns available browser endpoint or creates new one"""
|
||||
pass
|
||||
|
||||
async def health_check(self):
|
||||
"""Monitor containers/browsers health"""
|
||||
pass
|
||||
|
||||
# 2. Client Layer (Used by BrowserManager)
|
||||
class DockerBrowser: # or RemoteBrowser?
|
||||
"""Client to communicate with BrowserFarmService"""
|
||||
def __init__(self, service_url: str = "http://localhost:11235"):
|
||||
self.service = BrowserFarmServiceClient(service_url)
|
||||
|
||||
async def get_browser(self) -> tuple[str, int]:
|
||||
"""Get host/port for an available browser"""
|
||||
endpoint = await self.service.get_browser()
|
||||
return endpoint.host, endpoint.port
|
||||
|
||||
# 3. Connect via RemoteConnector (renamed to RemoteBrowser)
|
||||
class RemoteBrowser: # formerly RemoteConnector
|
||||
"""Connects to remote browser instances"""
|
||||
pass
|
||||
```
|
||||
|
||||
Better names could be:
|
||||
- BrowserFarmService -> CrawlClusterService
|
||||
- DockerBrowser -> RemoteBrowserClient
|
||||
- RemoteConnector -> RemoteBrowser
|
||||
|
||||
Does this align better with your vision? What parts need refinement?
|
||||
|
||||
# Unclecode
|
||||
|
||||
From this point, I want to continue the conversation in another chat session. I need you to write a detailed outline of what we are supposed to do, including the major plan you suggested, all the components, and the part we are currently working on and how we want to achieve it. These details are important. I will then move to the end of the chat session, including the steps you just mentioned, and continue from there.
|
||||
We need to start with understanding and defining the problem. Imagine the new chat session; the large language model in that session doesn't know anything. We have to explain everything we are going to do, including the important files from the codebase, what they are, why we are doing this, and what we are trying to achieve.
|
||||
After that, we will discuss the design and the structures we are trying to apply. Then, we will focus on the MVP, the first stage we are currently working on. Make sure this outline is detailed and covers everything.
|
||||
|
||||
# AI
|
||||
|
||||
Here's a comprehensive outline for the new session:
|
||||
|
||||
# Overview: Crawl4ai Docker Browser Management System
|
||||
|
||||
## 1. Understanding Current System
|
||||
- Crawl4ai is an async web crawler using Playwright
|
||||
- Key components:
|
||||
- `AsyncWebCrawler`: Main interface for users
|
||||
- `AsyncPlaywrightCrawlerStrategy`: Manages browser automation
|
||||
- `BrowserManager`: Handles browser lifecycle
|
||||
- `ManagedBrowser`: Connects to browser instances
|
||||
- Current architecture supports local browser management
|
||||
- Uses user data directories for session persistence
|
||||
|
||||
## 2. Problem Statement
|
||||
- Users need production-ready browser management
|
||||
- Requirements:
|
||||
- Scale browser instances across containers
|
||||
- Resource monitoring and management
|
||||
- Session persistence and recovery
|
||||
- Simple developer experience
|
||||
- Connection pooling
|
||||
- Health monitoring
|
||||
- Must maintain current API compatibility
|
||||
|
||||
## 3. Proposed Architecture
|
||||
### 3.1 Components
|
||||
1. `CrawlClusterService` (Daemon)
|
||||
- Manages Docker containers
|
||||
- Monitors resources
|
||||
- Handles browser lifecycle
|
||||
- Provides browser endpoints
|
||||
|
||||
2. `RemoteBrowserClient`
|
||||
- Communicates with CrawlClusterService
|
||||
- Gets available browser endpoints
|
||||
- Handles connection management
|
||||
|
||||
3. `RemoteConnector` (formerly ManagedBrowser)
|
||||
- Connects to remote browser instances
|
||||
- Manages browser contexts and pages
|
||||
|
||||
### 3.2 Flow
|
||||
```
|
||||
AsyncWebCrawler -> AsyncPlaywrightCrawlerStrategy -> BrowserManager
|
||||
-> RemoteBrowserClient -> CrawlClusterService -> RemoteConnector
|
||||
```
|
||||
|
||||
## 4. MVP Implementation (Current Focus)
|
||||
### 4.1 Phase 1 - Basic Docker Setup
|
||||
- Docker container with Chrome
|
||||
- Exposed debugging port
|
||||
- Basic health check endpoint
|
||||
- Single browser instance
|
||||
|
||||
### 4.2 Phase 2 - Service Layer (Current Step)
|
||||
- Simple `BrowserFarmService`
|
||||
- Returns hardcoded endpoint
|
||||
- Basic health monitoring
|
||||
- Docker browser client integration
|
||||
|
||||
### 4.3 Phase 3 - Integration
|
||||
- Update BrowserConfig for Docker mode
|
||||
- Modify BrowserManager for remote connections
|
||||
- Test with hello world example
|
||||
|
||||
## 5. Future Enhancements
|
||||
### 5.1 Service Features
|
||||
- Multiple browser instances
|
||||
- Resource monitoring
|
||||
- Auto-scaling
|
||||
- Connection pooling
|
||||
- Pre-initialized pages
|
||||
|
||||
### 5.2 Management Features
|
||||
- Browser health checks
|
||||
- Crash recovery
|
||||
- Session persistence
|
||||
- Resource optimization
|
||||
|
||||
### 5.3 Developer Tools
|
||||
- CLI for cluster management
|
||||
- Monitoring dashboard
|
||||
- Configuration management
|
||||
- Debugging tools
|
||||
|
||||
## 6. Current Implementation Status
|
||||
- Docker container working
|
||||
- Basic endpoint structure defined
|
||||
- Starting service layer implementation
|
||||
- Need to:
|
||||
1. Complete BrowserFarmService
|
||||
2. Implement DockerBrowser
|
||||
3. Update BrowserManager
|
||||
4. Test integration
|
||||
|
||||
## 7. Key Files
|
||||
```
|
||||
crawl4ai/
|
||||
├── browser_farm/
|
||||
│ ├── service.py # BrowserFarmService
|
||||
│ ├── docker_browser.py # RemoteBrowserClient
|
||||
│ └── models.py # Shared models
|
||||
├── async_configs.py # BrowserConfig updates
|
||||
└── async_crawler_strategy.py # BrowserManager changes
|
||||
```
|
||||
|
||||
Is this outline detailed enough for starting the new session?
|
||||
|
||||
|
||||
# Unclecode
|
||||
|
||||
Sounds good, yes, correct. Remember my philosophy: build as little as possible, but not incomplete, then test it. So let's do this:
|
||||
|
||||
1/ Manually we create a docker, make it ready to have a browser. Already Done:
|
||||
```
|
||||
FROM python:3.10-slim
|
||||
|
||||
# System dependencies for Playwright/Chromium
|
||||
RUN apt-get update && apt-get install -y \
|
||||
wget \
|
||||
gnupg \
|
||||
libglib2.0-0 \
|
||||
libnss3 \
|
||||
libnspr4 \
|
||||
libatk1.0-0 \
|
||||
libatk-bridge2.0-0 \
|
||||
libcups2 \
|
||||
libdrm2 \
|
||||
libdbus-1-3 \
|
||||
libxcb1 \
|
||||
libxkbcommon0 \
|
||||
libx11-6 \
|
||||
libx11-xcb1 \
|
||||
libxcb-dri3-0 \
|
||||
libxcomposite1 \
|
||||
libxdamage1 \
|
||||
libxext6 \
|
||||
libxfixes3 \
|
||||
libxrandr2 \
|
||||
libgbm1 \
|
||||
libpango-1.0-0 \
|
||||
libcairo2 \
|
||||
libasound2 \
|
||||
socat \
|
||||
&& rm -rf /var/lib/apt/lists/*
|
||||
|
||||
# Install crawl4ai and setup
|
||||
RUN pip install crawl4ai
|
||||
RUN crawl4ai-setup
|
||||
|
||||
# Add startup script
|
||||
RUN echo '#!/bin/bash\n\
|
||||
/root/.cache/ms-playwright/chromium-1148/chrome-linux/chrome \
|
||||
--remote-debugging-port=9222 \
|
||||
--no-sandbox \
|
||||
--headless=new \
|
||||
--disable-gpu &\n\
|
||||
sleep 2\n\
|
||||
socat TCP-LISTEN:9223,fork,reuseaddr TCP:127.0.0.1:9222\n' > /start.sh && \
|
||||
chmod +x /start.sh
|
||||
|
||||
EXPOSE 9223
|
||||
CMD ["/start.sh"]
|
||||
```
|
||||
|
||||
I tested like this:
|
||||
|
||||
docker build -t crawl4ai-browser .
|
||||
docker run -d -p 9333:9223 crawl4ai-browser
|
||||
curl http://localhost:9333/json/version
|
||||
|
||||
2/ We create BrowserFarmService, but for now it just returns the host:port we created
|
||||
3/ We focus on BrowserManager, and create DockerBrowser, to connect dots
|
||||
4/ We run the hello world example, but on Docker
|
||||
|
||||
So we ignore:
|
||||
1/ pre-filled pages
|
||||
2/ running service
|
||||
|
||||
Ok, continue from 2 onward
|
||||
|
||||
# AI
|
||||
|
||||
Got it - let's implement this step by step.
|
||||
|
||||
2. Simple BrowserFarmService that returns our hardcoded browser:
|
||||
|
||||
```python
|
||||
# crawl4ai/browser_farm/service.py
|
||||
from dataclasses import dataclass
|
||||
|
||||
@dataclass
|
||||
class BrowserEndpoint:
|
||||
host: str
|
||||
port: int
|
||||
|
||||
class BrowserFarmService:
|
||||
"""MVP: Returns hardcoded browser endpoint"""
|
||||
def __init__(self):
|
||||
# For MVP, hardcode the values
|
||||
self._browser = BrowserEndpoint(
|
||||
host="localhost",
|
||||
port=9333 # Mapped from Docker's 9223
|
||||
)
|
||||
|
||||
async def get_available_browser(self) -> BrowserEndpoint:
|
||||
"""Returns our single browser endpoint"""
|
||||
return self._browser
|
||||
|
||||
async def health_check(self) -> bool:
|
||||
"""Basic health check - just verify endpoint responds"""
|
||||
import aiohttp
|
||||
try:
|
||||
async with aiohttp.ClientSession() as session:
|
||||
url = f"http://{self._browser.host}:{self._browser.port}/json/version"
|
||||
async with session.get(url) as response:
|
||||
return response.status == 200
|
||||
except:
|
||||
return False
|
||||
```
|
||||
|
||||
3. DockerBrowser to connect BrowserManager with the service:
|
||||
|
||||
```python
|
||||
# crawl4ai/browser_farm/docker_browser.py
|
||||
from .service import BrowserFarmService
|
||||
|
||||
class DockerBrowser:
|
||||
"""Client to get browser endpoints from BrowserFarmService"""
|
||||
|
||||
def __init__(self):
|
||||
self.service = BrowserFarmService()
|
||||
|
||||
async def get_browser_endpoint(self) -> tuple[str, int]:
|
||||
"""Get host/port for the browser"""
|
||||
endpoint = await self.service.get_available_browser()
|
||||
if not await self.service.health_check():
|
||||
raise ConnectionError("Browser endpoint not healthy")
|
||||
return endpoint.host, endpoint.port
|
||||
```
|
||||
|
||||
Update BrowserConfig and BrowserManager:
|
||||
|
||||
```python
|
||||
# crawl4ai/async_configs.py
|
||||
class BrowserConfig:
|
||||
def __init__(self,
|
||||
# ... existing params ...
|
||||
use_docker: bool = False,
|
||||
**kwargs):
|
||||
self.use_docker = use_docker
|
||||
# ... rest of init
|
||||
|
||||
# In BrowserManager.__init__
|
||||
def __init__(self, browser_config: BrowserConfig, logger=None):
|
||||
# ... existing init code ...
|
||||
|
||||
# Initialize DockerBrowser if needed
|
||||
self.docker_browser = None
|
||||
if self.config.use_docker:
|
||||
from .browser_farm.docker_browser import DockerBrowser
|
||||
self.docker_browser = DockerBrowser()
|
||||
|
||||
# In BrowserManager.start
|
||||
async def start(self):
|
||||
if self.playwright is None:
|
||||
from playwright.async_api import async_playwright
|
||||
self.playwright = await async_playwright().start()
|
||||
|
||||
if self.config.use_docker:
|
||||
host, port = await self.docker_browser.get_browser_endpoint()
|
||||
cdp_url = f"http://{host}:{port}"
|
||||
self.browser = await self.playwright.chromium.connect_over_cdp(cdp_url)
|
||||
# ... rest of the connection code same as use_remote_browser
|
||||
```
|
||||
|
||||
Want me to show the hello world example with these changes?
|
||||
287
plans/docstring.md
Normal file
287
plans/docstring.md
Normal file
@@ -0,0 +1,287 @@
|
||||
|
||||
## `crawl4ai/models.py`
|
||||
|
||||
| Type | Name | Signature | Docstring |
|
||||
| ------ | -------------------------- | ---------------------------------- | --------------------------- |
|
||||
| MODULE | models.py | `` | |
|
||||
| CLASS | TokenUsage | `class TokenUsage:` | |
|
||||
| CLASS | UrlModel | `class UrlModel:` | |
|
||||
| CLASS | MarkdownGenerationResult | `class MarkdownGenerationResult:` | |
|
||||
| CLASS | CrawlResult | `class CrawlResult:` | |
|
||||
| CLASS | AsyncCrawlResponse | `class AsyncCrawlResponse:` | |
|
||||
|
||||
## `crawl4ai/async_configs.py`
|
||||
|
||||
| Type | Name | Signature | Docstring |
|
||||
| ------ | -------------------------- | ---------------------------------- | --------------------------- |
|
||||
| MODULE | async_configs.py | `` | |
|
||||
| CLASS | BrowserConfig | `class BrowserConfig:` | Configuration class for setting up a browser instance and its context in AsyncPlaywrightCrawlerStrat... (truncated) |
|
||||
| METHOD | BrowserConfig.__init__ | `def __init__(self, browser_type='chromium', headless=True, use_remote_browser=False, use_persistent_context=False, user_data_dir=None, chrome_channel='chrome', proxy=None, proxy_config=None, viewport_width=1080, viewport_height=600, accept_downloads=False, downloads_path=None, storage_state=None, ignore_https_errors=True, java_script_enabled=True, sleep_on_close=False, verbose=True, cookies=None, headers=None, user_agent='Mozilla/5.0 (Macintosh; Intel Mac OS X 10.15; rv:109.0) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.187 Safari/604.1 Edg/117.0.2045.47', user_agent_mode=None, user_agent_generator_config=None, text_mode=False, light_mode=False, extra_args=None, debugging_port=9222):` | |
|
||||
| METHOD | BrowserConfig.from_kwargs | `def from_kwargs(kwargs):` | |
|
||||
| CLASS | CrawlerRunConfig | `class CrawlerRunConfig:` | Configuration class for controlling how the crawler runs each crawl operation. This includes paramet... (truncated) |
|
||||
| METHOD | CrawlerRunConfig.__init__ | `def __init__(self, word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy=None, chunking_strategy=None, markdown_generator=None, content_filter=None, only_text=False, css_selector=None, excluded_tags=None, excluded_selector=None, keep_data_attributes=False, remove_forms=False, prettiify=False, parser_type='lxml', fetch_ssl_certificate=False, cache_mode=None, session_id=None, bypass_cache=False, disable_cache=False, no_cache_read=False, no_cache_write=False, wait_until='domcontentloaded', page_timeout=PAGE_TIMEOUT, wait_for=None, wait_for_images=True, delay_before_return_html=0.1, mean_delay=0.1, max_range=0.3, semaphore_count=5, js_code=None, js_only=False, ignore_body_visibility=True, scan_full_page=False, scroll_delay=0.2, process_iframes=False, remove_overlay_elements=False, simulate_user=False, override_navigator=False, magic=False, adjust_viewport_to_content=False, screenshot=False, screenshot_wait_for=None, screenshot_height_threshold=SCREENSHOT_HEIGHT_TRESHOLD, pdf=False, image_description_min_word_threshold=IMAGE_DESCRIPTION_MIN_WORD_THRESHOLD, image_score_threshold=IMAGE_SCORE_THRESHOLD, exclude_external_images=False, exclude_social_media_domains=None, exclude_external_links=False, exclude_social_media_links=False, exclude_domains=None, verbose=True, log_console=False, url=None):` | |
|
||||
| METHOD | CrawlerRunConfig.from_kwargs | `def from_kwargs(kwargs):` | |
|
||||
| METHOD | CrawlerRunConfig.to_dict | `def to_dict(self):` | |
|
||||
|
||||
## `crawl4ai/async_webcrawler.py`
|
||||
|
||||
| Type | Name | Signature | Docstring |
|
||||
| ------ | -------------------------- | ---------------------------------- | --------------------------- |
|
||||
| MODULE | async_webcrawler.py | `` | |
|
||||
| CLASS | AsyncWebCrawler | `class AsyncWebCrawler:` | Asynchronous web crawler with flexible caching capabilities. There are two ways to use the crawler:... (truncated) |
|
||||
| METHOD | AsyncWebCrawler.__init__ | `def __init__(self, crawler_strategy=None, config=None, always_bypass_cache=False, always_by_pass_cache=None, base_directory=str(os.getenv('CRAWL4_AI_BASE_DIRECTORY', Path.home())), thread_safe=False, **kwargs):` | Initialize the AsyncWebCrawler. Args: crawler_strategy: Strategy for crawling web pages. If Non... (truncated) |
|
||||
| METHOD | AsyncWebCrawler.start | `async def start(self):` | Start the crawler explicitly without using context manager. This is equivalent to using 'async with'... (truncated) |
|
||||
| METHOD | AsyncWebCrawler.close | `async def close(self):` | Close the crawler explicitly without using context manager. This should be called when you're done w... (truncated) |
|
||||
| METHOD | AsyncWebCrawler.__aenter__ | `async def __aenter__(self):` | |
|
||||
| METHOD | AsyncWebCrawler.__aexit__ | `async def __aexit__(self, exc_type, exc_val, exc_tb):` | |
|
||||
| METHOD | AsyncWebCrawler.awarmup | `async def awarmup(self):` | Initialize the crawler with warm-up sequence. This method: 1. Logs initialization info 2. Sets up b... (truncated) |
|
||||
| METHOD | AsyncWebCrawler.nullcontext | `async def nullcontext(self):` | 异步空上下文管理器 |
|
||||
| METHOD | AsyncWebCrawler.arun | `async def arun(self, url, config=None, word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy=None, chunking_strategy=RegexChunking(), content_filter=None, cache_mode=None, bypass_cache=False, disable_cache=False, no_cache_read=False, no_cache_write=False, css_selector=None, screenshot=False, pdf=False, user_agent=None, verbose=True, **kwargs):` | Runs the crawler for a single source: URL (web, local file, or raw HTML). Migration Guide: Old way ... (truncated) |
|
||||
| METHOD | AsyncWebCrawler.aprocess_html | `async def aprocess_html(self, url, html, extracted_content, config, screenshot, pdf_data, verbose, **kwargs):` | Process HTML content using the provided configuration. Args: url: The URL being processed h... (truncated) |
|
||||
| METHOD | AsyncWebCrawler.arun_many | `async def arun_many(self, urls, config=None, word_count_threshold=MIN_WORD_THRESHOLD, extraction_strategy=None, chunking_strategy=RegexChunking(), content_filter=None, cache_mode=None, bypass_cache=False, css_selector=None, screenshot=False, pdf=False, user_agent=None, verbose=True, **kwargs):` | Runs the crawler for multiple URLs concurrently. Migration Guide: Old way (deprecated): results... (truncated) |
|
||||
| METHOD | AsyncWebCrawler.aclear_cache | `async def aclear_cache(self):` | Clear the cache database. |
|
||||
| METHOD | AsyncWebCrawler.aflush_cache | `async def aflush_cache(self):` | Flush the cache database. |
|
||||
| METHOD | AsyncWebCrawler.aget_cache_size | `async def aget_cache_size(self):` | Get the total number of cached items. |
|
||||
|
||||
## `crawl4ai/async_crawler_strategy.py`
|
||||
|
||||
| Type | Name | Signature | Docstring |
|
||||
| ------ | -------------------------- | ---------------------------------- | --------------------------- |
|
||||
| MODULE | async_crawler_strategy.py | `` | |
|
||||
| CLASS | RemoteConnector | `class RemoteConnector:` | Manages the browser process and context. This class allows to connect to the browser using CDP proto... (truncated) |
|
||||
| METHOD | RemoteConnector.__init__ | `def __init__(self, browser_type='chromium', user_data_dir=None, headless=False, logger=None, host='localhost', debugging_port=9222):` | Initialize the RemoteConnector instance. Args: browser_type (str): The type of browser to launch... (truncated) |
|
||||
| METHOD | RemoteConnector.start | `async def start(self):` | Starts the browser process and returns the CDP endpoint URL. If user_data_dir is not provided, creat... (truncated) |
|
||||
| METHOD | RemoteConnector._monitor_browser_process | `async def _monitor_browser_process(self):` | Monitor the browser process for unexpected termination. How it works: 1. Read stdout and stderr fro... (truncated) |
|
||||
| METHOD | RemoteConnector._get_browser_path | `def _get_browser_path(self):` | Returns the browser executable path based on OS and browser type |
|
||||
| METHOD | RemoteConnector._get_browser_args | `def _get_browser_args(self):` | Returns browser-specific command line arguments |
|
||||
| METHOD | RemoteConnector.cleanup | `async def cleanup(self):` | Cleanup browser process and temporary directory |
|
||||
| CLASS | BrowserManager | `class BrowserManager:` | Manages the browser instance and context. Attributes: config (BrowserConfig): Configuration ob... (truncated) |
|
||||
| METHOD | BrowserManager.__init__ | `def __init__(self, browser_config, logger=None):` | Initialize the BrowserManager with a browser configuration. Args: browser_config (BrowserConfig... (truncated) |
|
||||
| METHOD | BrowserManager.start | `async def start(self):` | Start the browser instance and set up the default context. How it works: 1. Check if Playwright is ... (truncated) |
|
||||
| METHOD | BrowserManager._build_browser_args | `def _build_browser_args(self):` | Build browser launch arguments from config. |
|
||||
| METHOD | BrowserManager.setup_context | `async def setup_context(self, context, crawlerRunConfig, is_default=False):` | Set up a browser context with the configured options. How it works: 1. Set extra HTTP headers if pr... (truncated) |
|
||||
| METHOD | BrowserManager.create_browser_context | `async def create_browser_context(self):` | Creates and returns a new browser context with configured settings. Applies text-only mode settings ... (truncated) |
|
||||
| METHOD | BrowserManager.get_page | `async def get_page(self, crawlerRunConfig):` | Get a page for the given session ID, creating a new one if needed. Args: crawlerRunConfig (Craw... (truncated) |
|
||||
| METHOD | BrowserManager.kill_session | `async def kill_session(self, session_id):` | Kill a browser session and clean up resources. Args: session_id (str): The session ID to kill... (truncated) |
|
||||
| METHOD | BrowserManager._cleanup_expired_sessions | `def _cleanup_expired_sessions(self):` | Clean up expired sessions based on TTL. |
|
||||
| METHOD | BrowserManager.close | `async def close(self):` | Close all browser resources and clean up. |
|
||||
| CLASS | AsyncCrawlerStrategy | `class AsyncCrawlerStrategy:` | Abstract base class for crawler strategies. Subclasses must implement the crawl method. |
|
||||
| METHOD | AsyncCrawlerStrategy.crawl | `async def crawl(self, url, **kwargs):` | |
|
||||
| CLASS | AsyncPlaywrightCrawlerStrategy | `class AsyncPlaywrightCrawlerStrategy:` | Crawler strategy using Playwright. Attributes: browser_config (BrowserConfig): Configuration ob... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.__init__ | `def __init__(self, browser_config=None, logger=None, **kwargs):` | Initialize the AsyncPlaywrightCrawlerStrategy with a browser configuration. Args: browser_confi... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.__aenter__ | `async def __aenter__(self):` | |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.__aexit__ | `async def __aexit__(self, exc_type, exc_val, exc_tb):` | |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.start | `async def start(self):` | Start the browser and initialize the browser manager. |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.close | `async def close(self):` | Close the browser and clean up resources. |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.kill_session | `async def kill_session(self, session_id):` | Kill a browser session and clean up resources. Args: session_id (str): The ID of the session to... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.set_hook | `def set_hook(self, hook_type, hook):` | Set a hook function for a specific hook type. Following are list of hook types: - on_browser_created... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.execute_hook | `async def execute_hook(self, hook_type, *args, **kwargs):` | Execute a hook function for a specific hook type. Args: hook_type (str): The type of the hook. ... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.update_user_agent | `def update_user_agent(self, user_agent):` | Update the user agent for the browser. Args: user_agent (str): The new user agent string. ... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.set_custom_headers | `def set_custom_headers(self, headers):` | Set custom headers for the browser. Args: headers (Dict[str, str]): A dictionary of headers to... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.smart_wait | `async def smart_wait(self, page, wait_for, timeout=30000):` | Wait for a condition in a smart way. This functions works as below: 1. If wait_for starts with 'js:... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.csp_compliant_wait | `async def csp_compliant_wait(self, page, user_wait_function, timeout=30000):` | Wait for a condition in a CSP-compliant way. Args: page: Playwright page object user_wait_f... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.process_iframes | `async def process_iframes(self, page):` | Process iframes on a page. This function will extract the content of each iframe and replace it with... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.create_session | `async def create_session(self, **kwargs):` | Creates a new browser session and returns its ID. A browse session is a unique openned page can be r... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.crawl | `async def crawl(self, url, config, **kwargs):` | Crawls a given URL or processes raw HTML/local file content based on the URL prefix. Args: url ... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy._crawl_web | `async def _crawl_web(self, url, config):` | Internal method to crawl web URLs with the specified configuration. Args: url (str): The web UR... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy._handle_full_page_scan | `async def _handle_full_page_scan(self, page, scroll_delay):` | Helper method to handle full page scanning. How it works: 1. Get the viewport height. 2. Scroll to... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy._handle_download | `async def _handle_download(self, download):` | Handle file downloads. How it works: 1. Get the suggested filename. 2. Get the download path. 3. Lo... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.remove_overlay_elements | `async def remove_overlay_elements(self, page):` | Removes popup overlays, modals, cookie notices, and other intrusive elements from the page. Args: ... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.export_pdf | `async def export_pdf(self, page):` | Exports the current page as a PDF. Args: page (Page): The Playwright page object Returns: ... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.take_screenshot | `async def take_screenshot(self, page, **kwargs):` | Take a screenshot of the current page. Args: page (Page): The Playwright page object kwargs... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.take_screenshot_from_pdf | `async def take_screenshot_from_pdf(self, pdf_data):` | Convert the first page of the PDF to a screenshot. Requires pdf2image and poppler. Args: ... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.take_screenshot_scroller | `async def take_screenshot_scroller(self, page, **kwargs):` | Attempt to set a large viewport and take a full-page screenshot. If still too large, segment the pag... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.take_screenshot_naive | `async def take_screenshot_naive(self, page):` | Takes a screenshot of the current page. Args: page (Page): The Playwright page instance Return... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.export_storage_state | `async def export_storage_state(self, path=None):` | Exports the current storage state (cookies, localStorage, sessionStorage) to a JSON file at the spec... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.robust_execute_user_script | `async def robust_execute_user_script(self, page, js_code):` | Executes user-provided JavaScript code with proper error handling and context, supporting both synch... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.execute_user_script | `async def execute_user_script(self, page, js_code):` | Executes user-provided JavaScript code with proper error handling and context. Args: page: Play... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.check_visibility | `async def check_visibility(self, page):` | Checks if an element is visible on the page. Args: page: Playwright page object Returns: ... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.safe_scroll | `async def safe_scroll(self, page, x, y):` | Safely scroll the page with rendering time. Args: page: Playwright page object x: Horizonta... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.csp_scroll_to | `async def csp_scroll_to(self, page, x, y):` | Performs a CSP-compliant scroll operation and returns the result status. Args: page: Playwright... (truncated) |
|
||||
| METHOD | AsyncPlaywrightCrawlerStrategy.get_page_dimensions | `async def get_page_dimensions(self, page):` | Get the dimensions of the page. Args: page: Playwright page object Returns: Dict conta... (truncated) |
|
||||
|
||||
## `crawl4ai/content_scraping_strategy.py`
|
||||
|
||||
| Type | Name | Signature | Docstring |
|
||||
| ------ | -------------------------- | ---------------------------------- | --------------------------- |
|
||||
| MODULE | content_scraping_strategy.py | `` | |
|
||||
| FUNCTION | parse_dimension | `def parse_dimension(dimension):` | |
|
||||
| FUNCTION | fetch_image_file_size | `def fetch_image_file_size(img, base_url):` | |
|
||||
| CLASS | ContentScrapingStrategy | `class ContentScrapingStrategy:` | |
|
||||
| METHOD | ContentScrapingStrategy.scrap | `def scrap(self, url, html, **kwargs):` | |
|
||||
| METHOD | ContentScrapingStrategy.ascrap | `async def ascrap(self, url, html, **kwargs):` | |
|
||||
| CLASS | WebScrapingStrategy | `class WebScrapingStrategy:` | Class for web content scraping. Perhaps the most important class. How it works: 1. Extract content... (truncated) |
|
||||
| METHOD | WebScrapingStrategy.__init__ | `def __init__(self, logger=None):` | |
|
||||
| METHOD | WebScrapingStrategy._log | `def _log(self, level, message, tag='SCRAPE', **kwargs):` | Helper method to safely use logger. |
|
||||
| METHOD | WebScrapingStrategy.scrap | `def scrap(self, url, html, **kwargs):` | Main entry point for content scraping. Args: url (str): The URL of the page to scrape. ht... (truncated) |
|
||||
| METHOD | WebScrapingStrategy.ascrap | `async def ascrap(self, url, html, **kwargs):` | Main entry point for asynchronous content scraping. Args: url (str): The URL of the page to scr... (truncated) |
|
||||
| METHOD | WebScrapingStrategy._generate_markdown_content | `def _generate_markdown_content(self, cleaned_html, html, url, success, **kwargs):` | Generate markdown content from cleaned HTML. Args: cleaned_html (str): The cleaned HTML content... (truncated) |
|
||||
| METHOD | WebScrapingStrategy.flatten_nested_elements | `def flatten_nested_elements(self, node):` | Flatten nested elements in a HTML tree. Args: node (Tag): The root node of the HTML tree. Retu... (truncated) |
|
||||
| METHOD | WebScrapingStrategy.find_closest_parent_with_useful_text | `def find_closest_parent_with_useful_text(self, tag, **kwargs):` | Find the closest parent with useful text. Args: tag (Tag): The starting tag to search from. ... (truncated) |
|
||||
| METHOD | WebScrapingStrategy.remove_unwanted_attributes | `def remove_unwanted_attributes(self, element, important_attrs, keep_data_attributes=False):` | Remove unwanted attributes from an HTML element. Args: element (Tag): The HTML element to r... (truncated) |
|
||||
| METHOD | WebScrapingStrategy.process_image | `def process_image(self, img, url, index, total_images, **kwargs):` | Process an image element. How it works: 1. Check if the image has valid display and inside undesire... (truncated) |
|
||||
| METHOD | WebScrapingStrategy.process_element | `def process_element(self, url, element, **kwargs):` | Process an HTML element. How it works: 1. Check if the element is an image, video, or audio. 2. Ext... (truncated) |
|
||||
| METHOD | WebScrapingStrategy._process_element | `def _process_element(self, url, element, media, internal_links_dict, external_links_dict, **kwargs):` | Process an HTML element. |
|
||||
| METHOD | WebScrapingStrategy._scrap | `def _scrap(self, url, html, word_count_threshold=MIN_WORD_THRESHOLD, css_selector=None, **kwargs):` | Extract content from HTML using BeautifulSoup. Args: url (str): The URL of the page to scrape. ... (truncated) |
|
||||
|
||||
## `crawl4ai/markdown_generation_strategy.py`
|
||||
|
||||
| Type | Name | Signature | Docstring |
|
||||
| ------ | -------------------------- | ---------------------------------- | --------------------------- |
|
||||
| MODULE | markdown_generation_strategy.py | `` | |
|
||||
| FUNCTION | fast_urljoin | `def fast_urljoin(base, url):` | Fast URL joining for common cases. |
|
||||
| CLASS | MarkdownGenerationStrategy | `class MarkdownGenerationStrategy:` | Abstract base class for markdown generation strategies. |
|
||||
| METHOD | MarkdownGenerationStrategy.__init__ | `def __init__(self, content_filter=None, options=None):` | |
|
||||
| METHOD | MarkdownGenerationStrategy.generate_markdown | `def generate_markdown(self, cleaned_html, base_url='', html2text_options=None, content_filter=None, citations=True, **kwargs):` | Generate markdown from cleaned HTML. |
|
||||
| CLASS | DefaultMarkdownGenerator | `class DefaultMarkdownGenerator:` | Default implementation of markdown generation strategy. How it works: 1. Generate raw markdown from... (truncated) |
|
||||
| METHOD | DefaultMarkdownGenerator.__init__ | `def __init__(self, content_filter=None, options=None):` | |
|
||||
| METHOD | DefaultMarkdownGenerator.convert_links_to_citations | `def convert_links_to_citations(self, markdown, base_url=''):` | Convert links in markdown to citations. How it works: 1. Find all links in the markdown. 2. Convert... (truncated) |
|
||||
| METHOD | DefaultMarkdownGenerator.generate_markdown | `def generate_markdown(self, cleaned_html, base_url='', html2text_options=None, options=None, content_filter=None, citations=True, **kwargs):` | Generate markdown with citations from cleaned HTML. How it works: 1. Generate raw markdown from cle... (truncated) |
|
||||
|
||||
## `crawl4ai/content_filter_strategy.py`
|
||||
|
||||
| Type | Name | Signature | Docstring |
|
||||
| ------ | -------------------------- | ---------------------------------- | --------------------------- |
|
||||
| MODULE | content_filter_strategy.py | `` | |
|
||||
| CLASS | RelevantContentFilter | `class RelevantContentFilter:` | Abstract base class for content filtering strategies |
|
||||
| METHOD | RelevantContentFilter.__init__ | `def __init__(self, user_query=None):` | |
|
||||
| METHOD | RelevantContentFilter.filter_content | `def filter_content(self, html):` | Abstract method to be implemented by specific filtering strategies |
|
||||
| METHOD | RelevantContentFilter.extract_page_query | `def extract_page_query(self, soup, body):` | Common method to extract page metadata with fallbacks |
|
||||
| METHOD | RelevantContentFilter.extract_text_chunks | `def extract_text_chunks(self, body, min_word_threshold=None):` | Extracts text chunks from a BeautifulSoup body element while preserving order. Returns list of tuple... (truncated) |
|
||||
| METHOD | RelevantContentFilter._deprecated_extract_text_chunks | `def _deprecated_extract_text_chunks(self, soup):` | Common method for extracting text chunks |
|
||||
| METHOD | RelevantContentFilter.is_excluded | `def is_excluded(self, tag):` | Common method for exclusion logic |
|
||||
| METHOD | RelevantContentFilter.clean_element | `def clean_element(self, tag):` | Common method for cleaning HTML elements with minimal overhead |
|
||||
| CLASS | BM25ContentFilter | `class BM25ContentFilter:` | Content filtering using BM25 algorithm with priority tag handling. How it works: 1. Extracts page m... (truncated) |
|
||||
| METHOD | BM25ContentFilter.__init__ | `def __init__(self, user_query=None, bm25_threshold=1.0, language='english'):` | Initializes the BM25ContentFilter class, if not provided, falls back to page metadata. Note: If no ... (truncated) |
|
||||
| METHOD | BM25ContentFilter.filter_content | `def filter_content(self, html, min_word_threshold=None):` | Implements content filtering using BM25 algorithm with priority tag handling. Note: This method... (truncated) |
|
||||
| CLASS | PruningContentFilter | `class PruningContentFilter:` | Content filtering using pruning algorithm with dynamic threshold. How it works: 1. Extracts page me... (truncated) |
|
||||
| METHOD | PruningContentFilter.__init__ | `def __init__(self, user_query=None, min_word_threshold=None, threshold_type='fixed', threshold=0.48):` | Initializes the PruningContentFilter class, if not provided, falls back to page metadata. Note: If ... (truncated) |
|
||||
| METHOD | PruningContentFilter.filter_content | `def filter_content(self, html, min_word_threshold=None):` | Implements content filtering using pruning algorithm with dynamic threshold. Note: This method impl... (truncated) |
|
||||
| METHOD | PruningContentFilter._remove_comments | `def _remove_comments(self, soup):` | Removes HTML comments |
|
||||
| METHOD | PruningContentFilter._remove_unwanted_tags | `def _remove_unwanted_tags(self, soup):` | Removes unwanted tags |
|
||||
| METHOD | PruningContentFilter._prune_tree | `def _prune_tree(self, node):` | Prunes the tree starting from the given node. Args: node (Tag): The node from which the pruning... (truncated) |
|
||||
| METHOD | PruningContentFilter._compute_composite_score | `def _compute_composite_score(self, metrics, text_len, tag_len, link_text_len):` | Computes the composite score |
|
||||
| METHOD | PruningContentFilter._compute_class_id_weight | `def _compute_class_id_weight(self, node):` | Computes the class ID weight |
|
||||
|
||||
## `crawl4ai/extraction_strategy.py`
|
||||
|
||||
| Type | Name | Signature | Docstring |
|
||||
| ------ | -------------------------- | ---------------------------------- | --------------------------- |
|
||||
| MODULE | extraction_strategy.py | `` | |
|
||||
| CLASS | ExtractionStrategy | `class ExtractionStrategy:` | Abstract base class for all extraction strategies. |
|
||||
| METHOD | ExtractionStrategy.__init__ | `def __init__(self, input_format='markdown', **kwargs):` | Initialize the extraction strategy. Args: input_format: Content format to use for extraction. ... (truncated) |
|
||||
| METHOD | ExtractionStrategy.extract | `def extract(self, url, html, *q, **kwargs):` | Extract meaningful blocks or chunks from the given HTML. :param url: The URL of the webpage. :param... (truncated) |
|
||||
| METHOD | ExtractionStrategy.run | `def run(self, url, sections, *q, **kwargs):` | Process sections of text in parallel by default. :param url: The URL of the webpage. :param section... (truncated) |
|
||||
| CLASS | NoExtractionStrategy | `class NoExtractionStrategy:` | A strategy that does not extract any meaningful content from the HTML. It simply returns the entire ... (truncated) |
|
||||
| METHOD | NoExtractionStrategy.extract | `def extract(self, url, html, *q, **kwargs):` | Extract meaningful blocks or chunks from the given HTML. |
|
||||
| METHOD | NoExtractionStrategy.run | `def run(self, url, sections, *q, **kwargs):` | |
|
||||
| CLASS | LLMExtractionStrategy | `class LLMExtractionStrategy:` | A strategy that uses an LLM to extract meaningful content from the HTML. Attributes: provider: ... (truncated) |
|
||||
| METHOD | LLMExtractionStrategy.__init__ | `def __init__(self, provider=DEFAULT_PROVIDER, api_token=None, instruction=None, schema=None, extraction_type='block', **kwargs):` | Initialize the strategy with clustering parameters. Args: provider: The provider to use for ext... (truncated) |
|
||||
| METHOD | LLMExtractionStrategy.extract | `def extract(self, url, ix, html):` | Extract meaningful blocks or chunks from the given HTML using an LLM. How it works: 1. Construct a ... (truncated) |
|
||||
| METHOD | LLMExtractionStrategy._merge | `def _merge(self, documents, chunk_token_threshold, overlap):` | Merge documents into sections based on chunk_token_threshold and overlap. |
|
||||
| METHOD | LLMExtractionStrategy.run | `def run(self, url, sections):` | Process sections sequentially with a delay for rate limiting issues, specifically for LLMExtractionS... (truncated) |
|
||||
| METHOD | LLMExtractionStrategy.show_usage | `def show_usage(self):` | Print a detailed token usage report showing total and per-request usage. |
|
||||
| CLASS | CosineStrategy | `class CosineStrategy:` | Extract meaningful blocks or chunks from the given HTML using cosine similarity. How it works: 1. P... (truncated) |
|
||||
| METHOD | CosineStrategy.__init__ | `def __init__(self, semantic_filter=None, word_count_threshold=10, max_dist=0.2, linkage_method='ward', top_k=3, model_name='sentence-transformers/all-MiniLM-L6-v2', sim_threshold=0.3, **kwargs):` | Initialize the strategy with clustering parameters. Args: semantic_filter (str): A keyword filt... (truncated) |
|
||||
| METHOD | CosineStrategy.filter_documents_embeddings | `def filter_documents_embeddings(self, documents, semantic_filter, at_least_k=20):` | Filter and sort documents based on the cosine similarity of their embeddings with the semantic_filte... (truncated) |
|
||||
| METHOD | CosineStrategy.get_embeddings | `def get_embeddings(self, sentences, batch_size=None, bypass_buffer=False):` | Get BERT embeddings for a list of sentences. Args: sentences (List[str]): A list of text chunks... (truncated) |
|
||||
| METHOD | CosineStrategy.hierarchical_clustering | `def hierarchical_clustering(self, sentences, embeddings=None):` | Perform hierarchical clustering on sentences and return cluster labels. Args: sentences (List[s... (truncated) |
|
||||
| METHOD | CosineStrategy.filter_clusters_by_word_count | `def filter_clusters_by_word_count(self, clusters):` | Filter clusters to remove those with a word count below the threshold. Args: clusters (Dict[int... (truncated) |
|
||||
| METHOD | CosineStrategy.extract | `def extract(self, url, html, *q, **kwargs):` | Extract clusters from HTML content using hierarchical clustering. Args: url (str): The URL of t... (truncated) |
|
||||
| METHOD | CosineStrategy.run | `def run(self, url, sections, *q, **kwargs):` | Process sections using hierarchical clustering. Args: url (str): The URL of the webpage. se... (truncated) |
|
||||
| CLASS | JsonElementExtractionStrategy | `class JsonElementExtractionStrategy:` | Abstract base class for extracting structured JSON from HTML content. How it works: 1. ... (truncated) |
|
||||
| METHOD | JsonElementExtractionStrategy.__init__ | `def __init__(self, schema, **kwargs):` | Initialize the JSON element extraction strategy with a schema. Args: schema (Dict[str, Any]): T... (truncated) |
|
||||
| METHOD | JsonElementExtractionStrategy.extract | `def extract(self, url, html_content, *q, **kwargs):` | Extract structured data from HTML content. How it works: 1. Parses the HTML content using the `_par... (truncated) |
|
||||
| METHOD | JsonElementExtractionStrategy._parse_html | `def _parse_html(self, html_content):` | Parse HTML content into appropriate format |
|
||||
| METHOD | JsonElementExtractionStrategy._get_base_elements | `def _get_base_elements(self, parsed_html, selector):` | Get all base elements using the selector |
|
||||
| METHOD | JsonElementExtractionStrategy._get_elements | `def _get_elements(self, element, selector):` | Get child elements using the selector |
|
||||
| METHOD | JsonElementExtractionStrategy._extract_field | `def _extract_field(self, element, field):` | |
|
||||
| METHOD | JsonElementExtractionStrategy._extract_single_field | `def _extract_single_field(self, element, field):` | Extract a single field based on its type. How it works: 1. Selects the target element using the fie... (truncated) |
|
||||
| METHOD | JsonElementExtractionStrategy._extract_list_item | `def _extract_list_item(self, element, fields):` | |
|
||||
| METHOD | JsonElementExtractionStrategy._extract_item | `def _extract_item(self, element, fields):` | Extracts fields from a given element. How it works: 1. Iterates through the fields defined in the s... (truncated) |
|
||||
| METHOD | JsonElementExtractionStrategy._apply_transform | `def _apply_transform(self, value, transform):` | Apply a transformation to a value. How it works: 1. Checks the transformation type (e.g., `lowercas... (truncated) |
|
||||
| METHOD | JsonElementExtractionStrategy._compute_field | `def _compute_field(self, item, field):` | |
|
||||
| METHOD | JsonElementExtractionStrategy.run | `def run(self, url, sections, *q, **kwargs):` | Run the extraction strategy on a combined HTML content. How it works: 1. Combines multiple HTML sec... (truncated) |
|
||||
| METHOD | JsonElementExtractionStrategy._get_element_text | `def _get_element_text(self, element):` | Get text content from element |
|
||||
| METHOD | JsonElementExtractionStrategy._get_element_html | `def _get_element_html(self, element):` | Get HTML content from element |
|
||||
| METHOD | JsonElementExtractionStrategy._get_element_attribute | `def _get_element_attribute(self, element, attribute):` | Get attribute value from element |
|
||||
| CLASS | JsonCssExtractionStrategy | `class JsonCssExtractionStrategy:` | Concrete implementation of `JsonElementExtractionStrategy` using CSS selectors. How it works: 1. Pa... (truncated) |
|
||||
| METHOD | JsonCssExtractionStrategy.__init__ | `def __init__(self, schema, **kwargs):` | |
|
||||
| METHOD | JsonCssExtractionStrategy._parse_html | `def _parse_html(self, html_content):` | |
|
||||
| METHOD | JsonCssExtractionStrategy._get_base_elements | `def _get_base_elements(self, parsed_html, selector):` | |
|
||||
| METHOD | JsonCssExtractionStrategy._get_elements | `def _get_elements(self, element, selector):` | |
|
||||
| METHOD | JsonCssExtractionStrategy._get_element_text | `def _get_element_text(self, element):` | |
|
||||
| METHOD | JsonCssExtractionStrategy._get_element_html | `def _get_element_html(self, element):` | |
|
||||
| METHOD | JsonCssExtractionStrategy._get_element_attribute | `def _get_element_attribute(self, element, attribute):` | |
|
||||
| CLASS | JsonXPathExtractionStrategy | `class JsonXPathExtractionStrategy:` | Concrete implementation of `JsonElementExtractionStrategy` using XPath selectors. How it works: 1. ... (truncated) |
|
||||
| METHOD | JsonXPathExtractionStrategy.__init__ | `def __init__(self, schema, **kwargs):` | |
|
||||
| METHOD | JsonXPathExtractionStrategy._parse_html | `def _parse_html(self, html_content):` | |
|
||||
| METHOD | JsonXPathExtractionStrategy._get_base_elements | `def _get_base_elements(self, parsed_html, selector):` | |
|
||||
| METHOD | JsonXPathExtractionStrategy._css_to_xpath | `def _css_to_xpath(self, css_selector):` | Convert CSS selector to XPath if needed |
|
||||
| METHOD | JsonXPathExtractionStrategy._basic_css_to_xpath | `def _basic_css_to_xpath(self, css_selector):` | Basic CSS to XPath conversion for common cases |
|
||||
| METHOD | JsonXPathExtractionStrategy._get_elements | `def _get_elements(self, element, selector):` | |
|
||||
| METHOD | JsonXPathExtractionStrategy._get_element_text | `def _get_element_text(self, element):` | |
|
||||
| METHOD | JsonXPathExtractionStrategy._get_element_html | `def _get_element_html(self, element):` | |
|
||||
| METHOD | JsonXPathExtractionStrategy._get_element_attribute | `def _get_element_attribute(self, element, attribute):` | |
|
||||
|
||||
## `crawl4ai/chunking_strategy.py`
|
||||
|
||||
| Type | Name | Signature | Docstring |
|
||||
| ------ | -------------------------- | ---------------------------------- | --------------------------- |
|
||||
| MODULE | chunking_strategy.py | `` | |
|
||||
| CLASS | ChunkingStrategy | `class ChunkingStrategy:` | Abstract base class for chunking strategies. |
|
||||
| METHOD | ChunkingStrategy.chunk | `def chunk(self, text):` | Abstract method to chunk the given text. Args: text (str): The text to chunk. Returns: lis... (truncated) |
|
||||
| CLASS | IdentityChunking | `class IdentityChunking:` | Chunking strategy that returns the input text as a single chunk. |
|
||||
| METHOD | IdentityChunking.chunk | `def chunk(self, text):` | |
|
||||
| CLASS | RegexChunking | `class RegexChunking:` | Chunking strategy that splits text based on regular expression patterns. |
|
||||
| METHOD | RegexChunking.__init__ | `def __init__(self, patterns=None, **kwargs):` | Initialize the RegexChunking object. Args: patterns (list): A list of regular expression patter... (truncated) |
|
||||
| METHOD | RegexChunking.chunk | `def chunk(self, text):` | |
|
||||
| CLASS | NlpSentenceChunking | `class NlpSentenceChunking:` | Chunking strategy that splits text into sentences using NLTK's sentence tokenizer. |
|
||||
| METHOD | NlpSentenceChunking.__init__ | `def __init__(self, **kwargs):` | Initialize the NlpSentenceChunking object. |
|
||||
| METHOD | NlpSentenceChunking.chunk | `def chunk(self, text):` | |
|
||||
| CLASS | TopicSegmentationChunking | `class TopicSegmentationChunking:` | Chunking strategy that segments text into topics using NLTK's TextTilingTokenizer. How it works: 1.... (truncated) |
|
||||
| METHOD | TopicSegmentationChunking.__init__ | `def __init__(self, num_keywords=3, **kwargs):` | Initialize the TopicSegmentationChunking object. Args: num_keywords (int): The number of keywor... (truncated) |
|
||||
| METHOD | TopicSegmentationChunking.chunk | `def chunk(self, text):` | |
|
||||
| METHOD | TopicSegmentationChunking.extract_keywords | `def extract_keywords(self, text):` | |
|
||||
| METHOD | TopicSegmentationChunking.chunk_with_topics | `def chunk_with_topics(self, text):` | |
|
||||
| CLASS | FixedLengthWordChunking | `class FixedLengthWordChunking:` | Chunking strategy that splits text into fixed-length word chunks. How it works: 1. Split the text i... (truncated) |
|
||||
| METHOD | FixedLengthWordChunking.__init__ | `def __init__(self, chunk_size=100, **kwargs):` | Initialize the fixed-length word chunking strategy with the given chunk size. Args: chunk_size ... (truncated) |
|
||||
| METHOD | FixedLengthWordChunking.chunk | `def chunk(self, text):` | |
|
||||
| CLASS | SlidingWindowChunking | `class SlidingWindowChunking:` | Chunking strategy that splits text into overlapping word chunks. How it works: 1. Split the text in... (truncated) |
|
||||
| METHOD | SlidingWindowChunking.__init__ | `def __init__(self, window_size=100, step=50, **kwargs):` | Initialize the sliding window chunking strategy with the given window size and step size. Args: ... (truncated) |
|
||||
| METHOD | SlidingWindowChunking.chunk | `def chunk(self, text):` | |
|
||||
| CLASS | OverlappingWindowChunking | `class OverlappingWindowChunking:` | Chunking strategy that splits text into overlapping word chunks. How it works: 1. Split the text in... (truncated) |
|
||||
| METHOD | OverlappingWindowChunking.__init__ | `def __init__(self, window_size=1000, overlap=100, **kwargs):` | Initialize the overlapping window chunking strategy with the given window size and overlap size. Ar... (truncated) |
|
||||
| METHOD | OverlappingWindowChunking.chunk | `def chunk(self, text):` | |
|
||||
|
||||
## `crawl4ai/user_agent_generator.py`
|
||||
|
||||
| Type | Name | Signature | Docstring |
|
||||
| ------ | -------------------------- | ---------------------------------- | --------------------------- |
|
||||
| MODULE | user_agent_generator.py | `` | |
|
||||
| CLASS | UserAgentGenerator | `class UserAgentGenerator:` | Generate random user agents with specified constraints. Attributes: desktop_platforms (dict): A... (truncated) |
|
||||
| METHOD | UserAgentGenerator.__init__ | `def __init__(self):` | |
|
||||
| METHOD | UserAgentGenerator.get_browser_stack | `def get_browser_stack(self, num_browsers=1):` | Get a valid combination of browser versions. How it works: 1. Check if the number of browsers is su... (truncated) |
|
||||
| METHOD | UserAgentGenerator.generate | `def generate(self, device_type=None, os_type=None, device_brand=None, browser_type=None, num_browsers=3):` | Generate a random user agent with specified constraints. Args: device_type: 'desktop' or 'mobil... (truncated) |
|
||||
| METHOD | UserAgentGenerator.generate_with_client_hints | `def generate_with_client_hints(self, **kwargs):` | Generate both user agent and matching client hints |
|
||||
| METHOD | UserAgentGenerator.get_random_platform | `def get_random_platform(self, device_type, os_type, device_brand):` | Helper method to get random platform based on constraints |
|
||||
| METHOD | UserAgentGenerator.parse_user_agent | `def parse_user_agent(self, user_agent):` | Parse a user agent string to extract browser and version information |
|
||||
| METHOD | UserAgentGenerator.generate_client_hints | `def generate_client_hints(self, user_agent):` | Generate Sec-CH-UA header value based on user agent string |
|
||||
|
||||
## `crawl4ai/ssl_certificate.py`
|
||||
|
||||
| Type | Name | Signature | Docstring |
|
||||
| ------ | -------------------------- | ---------------------------------- | --------------------------- |
|
||||
| MODULE | ssl_certificate.py | `` | SSL Certificate class for handling certificate operations. |
|
||||
| CLASS | SSLCertificate | `class SSLCertificate:` | A class representing an SSL certificate with methods to export in various formats. Attributes: ... (truncated) |
|
||||
| METHOD | SSLCertificate.__init__ | `def __init__(self, cert_info):` | |
|
||||
| METHOD | SSLCertificate.from_url | `def from_url(url, timeout=10):` | Create SSLCertificate instance from a URL. Args: url (str): URL of the website. timeout (in... (truncated) |
|
||||
| METHOD | SSLCertificate._decode_cert_data | `def _decode_cert_data(data):` | Helper method to decode bytes in certificate data. |
|
||||
| METHOD | SSLCertificate.to_json | `def to_json(self, filepath=None):` | Export certificate as JSON. Args: filepath (Optional[str]): Path to save the JSON file (default... (truncated) |
|
||||
| METHOD | SSLCertificate.to_pem | `def to_pem(self, filepath=None):` | Export certificate as PEM. Args: filepath (Optional[str]): Path to save the PEM file (default: ... (truncated) |
|
||||
| METHOD | SSLCertificate.to_der | `def to_der(self, filepath=None):` | Export certificate as DER. Args: filepath (Optional[str]): Path to save the DER file (default: ... (truncated) |
|
||||
| METHOD | SSLCertificate.issuer | `def issuer(self):` | Get certificate issuer information. |
|
||||
| METHOD | SSLCertificate.subject | `def subject(self):` | Get certificate subject information. |
|
||||
| METHOD | SSLCertificate.valid_from | `def valid_from(self):` | Get certificate validity start date. |
|
||||
| METHOD | SSLCertificate.valid_until | `def valid_until(self):` | Get certificate validity end date. |
|
||||
| METHOD | SSLCertificate.fingerprint | `def fingerprint(self):` | Get certificate fingerprint. |
|
||||
@@ -18,7 +18,7 @@ async def test_default_headless():
|
||||
verbose=True,
|
||||
user_agent_mode="random",
|
||||
user_agent_generator_config={"device_type": "mobile", "os_type": "android"},
|
||||
use_managed_browser=False,
|
||||
use_remote_browser=False,
|
||||
use_persistent_context=False,
|
||||
ignore_https_errors=True,
|
||||
# Testing normal ephemeral context
|
||||
@@ -31,14 +31,14 @@ async def test_default_headless():
|
||||
print("[test_default_headless] success:", result.success)
|
||||
print("HTML length:", len(result.html if result.html else ""))
|
||||
|
||||
async def test_managed_browser_persistent():
|
||||
# Treating use_persistent_context=True as managed_browser scenario.
|
||||
async def test_remote_browser_persistent():
|
||||
# Treating use_persistent_context=True as remote_browser scenario.
|
||||
async with AsyncWebCrawler(
|
||||
headless=False,
|
||||
verbose=True,
|
||||
user_agent_mode="random",
|
||||
user_agent_generator_config={"device_type": "desktop", "os_type": "mac"},
|
||||
use_managed_browser=True,
|
||||
use_remote_browser=True,
|
||||
use_persistent_context=True, # now should behave same as managed browser
|
||||
user_data_dir="./outpu/test_profile",
|
||||
# This should store and reuse profile data across runs
|
||||
@@ -48,7 +48,7 @@ async def test_managed_browser_persistent():
|
||||
cache_mode=CacheMode.BYPASS,
|
||||
markdown_generator=DefaultMarkdownGenerator(options={"ignore_links": True})
|
||||
)
|
||||
print("[test_managed_browser_persistent] success:", result.success)
|
||||
print("[test_remote_browser_persistent] success:", result.success)
|
||||
print("HTML length:", len(result.html if result.html else ""))
|
||||
|
||||
async def test_session_reuse():
|
||||
@@ -59,7 +59,7 @@ async def test_session_reuse():
|
||||
verbose=True,
|
||||
user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
|
||||
# Fixed user-agent for consistency
|
||||
use_managed_browser=False,
|
||||
use_remote_browser=False,
|
||||
use_persistent_context=False,
|
||||
) as crawler:
|
||||
|
||||
@@ -88,7 +88,7 @@ async def test_magic_mode():
|
||||
verbose=True,
|
||||
user_agent_mode="random",
|
||||
user_agent_generator_config={"device_type": "desktop", "os_type": "windows"},
|
||||
use_managed_browser=False,
|
||||
use_remote_browser=False,
|
||||
use_persistent_context=False,
|
||||
magic=True,
|
||||
override_navigator=True,
|
||||
@@ -109,7 +109,7 @@ async def test_proxy_settings():
|
||||
verbose=False,
|
||||
user_agent="Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36",
|
||||
proxy="http://127.0.0.1:8080", # Assuming local proxy server for test
|
||||
use_managed_browser=False,
|
||||
use_remote_browser=False,
|
||||
use_persistent_context=False,
|
||||
) as crawler:
|
||||
result = await crawler.arun(
|
||||
@@ -130,7 +130,7 @@ async def test_ignore_https_errors():
|
||||
verbose=True,
|
||||
user_agent="Mozilla/5.0",
|
||||
ignore_https_errors=True,
|
||||
use_managed_browser=False,
|
||||
use_remote_browser=False,
|
||||
use_persistent_context=False,
|
||||
) as crawler:
|
||||
result = await crawler.arun(
|
||||
@@ -143,7 +143,7 @@ async def test_ignore_https_errors():
|
||||
async def main():
|
||||
print("Running tests...")
|
||||
# await test_default_headless()
|
||||
# await test_managed_browser_persistent()
|
||||
# await test_remote_browser_persistent()
|
||||
# await test_session_reuse()
|
||||
# await test_magic_mode()
|
||||
# await test_proxy_settings()
|
||||
|
||||
@@ -19,7 +19,7 @@ async def test_browser_config_object():
|
||||
headless=False,
|
||||
viewport_width=1920,
|
||||
viewport_height=1080,
|
||||
use_managed_browser=True,
|
||||
use_remote_browser=True,
|
||||
user_agent_mode="random",
|
||||
user_agent_generator_config={"device_type": "desktop", "os_type": "windows"}
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user