diff --git a/crawl4ai/async_configs.py b/crawl4ai/async_configs.py
index c3736297..e10a0105 100644
--- a/crawl4ai/async_configs.py
+++ b/crawl4ai/async_configs.py
@@ -26,6 +26,8 @@ import inspect
from typing import Any, Dict, Optional
from enum import Enum
+from typing import Union
+from .proxy_strategy import ProxyConfig
+
def to_serializable_dict(obj: Any, ignore_default_value : bool = False) -> Dict:
"""
@@ -180,7 +182,7 @@ class BrowserConfig:
is "chromium". Default: "chromium".
proxy (Optional[str]): Proxy server URL (e.g., "http://username:password@proxy:port"). If None, no proxy is used.
Default: None.
- proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
+ proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
If None, no additional proxy config. Default: None.
viewport_width (int): Default viewport width for pages. Default: 1080.
viewport_height (int): Default viewport height for pages. Default: 600.
@@ -225,7 +227,7 @@ class BrowserConfig:
chrome_channel: str = "chromium",
channel: str = "chromium",
proxy: str = None,
- proxy_config: dict = None,
+ proxy_config: Union[ProxyConfig, dict, None] = None,
viewport_width: int = 1080,
viewport_height: int = 600,
viewport: dict = None,
@@ -315,7 +317,7 @@ class BrowserConfig:
chrome_channel=kwargs.get("chrome_channel", "chromium"),
channel=kwargs.get("channel", "chromium"),
proxy=kwargs.get("proxy"),
- proxy_config=kwargs.get("proxy_config"),
+ proxy_config=kwargs.get("proxy_config", None),
viewport_width=kwargs.get("viewport_width", 1080),
viewport_height=kwargs.get("viewport_height", 600),
accept_downloads=kwargs.get("accept_downloads", False),
@@ -515,7 +517,7 @@ class CrawlerRunConfig():
Default: "lxml".
scraping_strategy (ContentScrapingStrategy): Scraping strategy to use.
Default: WebScrapingStrategy.
- proxy_config (dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
+ proxy_config (ProxyConfig or dict or None): Detailed proxy configuration, e.g. {"server": "...", "username": "..."}.
If None, no additional proxy config. Default: None.
# SSL Parameters
@@ -656,7 +658,7 @@ class CrawlerRunConfig():
prettiify: bool = False,
parser_type: str = "lxml",
scraping_strategy: ContentScrapingStrategy = None,
- proxy_config: dict = None,
+ proxy_config: Union[ProxyConfig, dict, None] = None,
proxy_rotation_strategy: Optional[ProxyRotationStrategy] = None,
# SSL Parameters
fetch_ssl_certificate: bool = False,
diff --git a/crawl4ai/async_crawler_strategy.py b/crawl4ai/async_crawler_strategy.py
index 19b6a689..960c2d6f 100644
--- a/crawl4ai/async_crawler_strategy.py
+++ b/crawl4ai/async_crawler_strategy.py
@@ -767,6 +767,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
# Handle wait_for condition
# Todo: Decide how to handle this
if not config.wait_for and config.css_selector and False:
+ # if not config.wait_for and config.css_selector:
config.wait_for = f"css:{config.css_selector}"
if config.wait_for:
@@ -806,8 +807,28 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
if config.remove_overlay_elements:
await self.remove_overlay_elements(page)
- # Get final HTML content
- html = await page.content()
+ if config.css_selector:
+ try:
+ # Handle comma-separated selectors by splitting them
+ selectors = [s.strip() for s in config.css_selector.split(',')]
+ html_parts = []
+
+ for selector in selectors:
+ try:
+ content = await page.evaluate(f"document.querySelector('{selector}')?.outerHTML || ''")
+ html_parts.append(content)
+ except Error as e:
+ print(f"Warning: Could not get content for selector '{selector}': {str(e)}")
+
+ # Wrap in a div to create a valid HTML structure
+                html = "<div>\n" + "\n".join(html_parts) + "\n</div>"
+ except Error as e:
+ raise RuntimeError(f"Failed to extract HTML content: {str(e)}")
+ else:
+ html = await page.content()
+
+ # # Get final HTML content
+ # html = await page.content()
await self.execute_hook(
"before_return_html", page=page, html=html, context=context, config=config
)
diff --git a/crawl4ai/browser_manager.py b/crawl4ai/browser_manager.py
index 4e686580..38f87d9a 100644
--- a/crawl4ai/browser_manager.py
+++ b/crawl4ai/browser_manager.py
@@ -531,9 +531,9 @@ class BrowserManager:
ProxySettings(server=self.config.proxy)
if self.config.proxy
else ProxySettings(
- server=self.config.proxy_config.get("server"),
- username=self.config.proxy_config.get("username"),
- password=self.config.proxy_config.get("password"),
+ server=self.config.proxy_config.server,
+ username=self.config.proxy_config.username,
+ password=self.config.proxy_config.password,
)
)
browser_args["proxy"] = proxy_settings
diff --git a/crawl4ai/configs/__init__.py b/crawl4ai/configs/__init__.py
deleted file mode 100644
index b92adb35..00000000
--- a/crawl4ai/configs/__init__.py
+++ /dev/null
@@ -1,2 +0,0 @@
-from .proxy_config import ProxyConfig
-__all__ = ["ProxyConfig"]
\ No newline at end of file
diff --git a/crawl4ai/configs/proxy_config.py b/crawl4ai/configs/proxy_config.py
deleted file mode 100644
index c447c6bc..00000000
--- a/crawl4ai/configs/proxy_config.py
+++ /dev/null
@@ -1,113 +0,0 @@
-import os
-from typing import Dict, List, Optional
-
-
-class ProxyConfig:
- def __init__(
- self,
- server: str,
- username: Optional[str] = None,
- password: Optional[str] = None,
- ip: Optional[str] = None,
- ):
- """Configuration class for a single proxy.
-
- Args:
- server: Proxy server URL (e.g., "http://127.0.0.1:8080")
- username: Optional username for proxy authentication
- password: Optional password for proxy authentication
- ip: Optional IP address for verification purposes
- """
- self.server = server
- self.username = username
- self.password = password
-
- # Extract IP from server if not explicitly provided
- self.ip = ip or self._extract_ip_from_server()
-
- def _extract_ip_from_server(self) -> Optional[str]:
- """Extract IP address from server URL."""
- try:
- # Simple extraction assuming http://ip:port format
- if "://" in self.server:
- parts = self.server.split("://")[1].split(":")
- return parts[0]
- else:
- parts = self.server.split(":")
- return parts[0]
- except Exception:
- return None
-
- @staticmethod
- def from_string(proxy_str: str) -> "ProxyConfig":
- """Create a ProxyConfig from a string in the format 'ip:port:username:password'."""
- parts = proxy_str.split(":")
- if len(parts) == 4: # ip:port:username:password
- ip, port, username, password = parts
- return ProxyConfig(
- server=f"http://{ip}:{port}",
- username=username,
- password=password,
- ip=ip
- )
- elif len(parts) == 2: # ip:port only
- ip, port = parts
- return ProxyConfig(
- server=f"http://{ip}:{port}",
- ip=ip
- )
- else:
- raise ValueError(f"Invalid proxy string format: {proxy_str}")
-
- @staticmethod
- def from_dict(proxy_dict: Dict) -> "ProxyConfig":
- """Create a ProxyConfig from a dictionary."""
- return ProxyConfig(
- server=proxy_dict.get("server"),
- username=proxy_dict.get("username"),
- password=proxy_dict.get("password"),
- ip=proxy_dict.get("ip")
- )
-
- @staticmethod
- def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]:
- """Load proxies from environment variable.
-
- Args:
- env_var: Name of environment variable containing comma-separated proxy strings
-
- Returns:
- List of ProxyConfig objects
- """
- proxies = []
- try:
- proxy_list = os.getenv(env_var, "").split(",")
- for proxy in proxy_list:
- if not proxy:
- continue
- proxies.append(ProxyConfig.from_string(proxy))
- except Exception as e:
- print(f"Error loading proxies from environment: {e}")
- return proxies
-
- def to_dict(self) -> Dict:
- """Convert to dictionary representation."""
- return {
- "server": self.server,
- "username": self.username,
- "password": self.password,
- "ip": self.ip
- }
-
- def clone(self, **kwargs) -> "ProxyConfig":
- """Create a copy of this configuration with updated values.
-
- Args:
- **kwargs: Key-value pairs of configuration options to update
-
- Returns:
- ProxyConfig: A new instance with the specified updates
- """
- config_dict = self.to_dict()
- config_dict.update(kwargs)
- return ProxyConfig.from_dict(config_dict)
diff --git a/crawl4ai/content_scraping_strategy.py b/crawl4ai/content_scraping_strategy.py
index 46761013..ba70dc11 100644
--- a/crawl4ai/content_scraping_strategy.py
+++ b/crawl4ai/content_scraping_strategy.py
@@ -742,7 +742,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
for element in body.select(excluded_selector):
element.extract()
- if css_selector:
+ if False and css_selector:
selected_elements = body.select(css_selector)
if not selected_elements:
return {
@@ -848,6 +848,7 @@ class WebScrapingStrategy(ContentScrapingStrategy):
return {
# **markdown_content,
+ "scraped_html": html,
"cleaned_html": cleaned_html,
"success": success,
"media": media,
diff --git a/crawl4ai/proxy_strategy.py b/crawl4ai/proxy_strategy.py
index 0776e68a..6821c566 100644
--- a/crawl4ai/proxy_strategy.py
+++ b/crawl4ai/proxy_strategy.py
@@ -1,8 +1,119 @@
from typing import List, Dict, Optional
from abc import ABC, abstractmethod
from itertools import cycle
+import os
+
+
+class ProxyConfig:
+ def __init__(
+ self,
+ server: str,
+ username: Optional[str] = None,
+ password: Optional[str] = None,
+ ip: Optional[str] = None,
+ ):
+ """Configuration class for a single proxy.
+
+ Args:
+ server: Proxy server URL (e.g., "http://127.0.0.1:8080")
+ username: Optional username for proxy authentication
+ password: Optional password for proxy authentication
+ ip: Optional IP address for verification purposes
+ """
+ self.server = server
+ self.username = username
+ self.password = password
+
+ # Extract IP from server if not explicitly provided
+ self.ip = ip or self._extract_ip_from_server()
+
+ def _extract_ip_from_server(self) -> Optional[str]:
+ """Extract IP address from server URL."""
+ try:
+ # Simple extraction assuming http://ip:port format
+ if "://" in self.server:
+ parts = self.server.split("://")[1].split(":")
+ return parts[0]
+ else:
+ parts = self.server.split(":")
+ return parts[0]
+ except Exception:
+ return None
+
+ @staticmethod
+ def from_string(proxy_str: str) -> "ProxyConfig":
+ """Create a ProxyConfig from a string in the format 'ip:port:username:password'."""
+ parts = proxy_str.split(":")
+ if len(parts) == 4: # ip:port:username:password
+ ip, port, username, password = parts
+ return ProxyConfig(
+ server=f"http://{ip}:{port}",
+ username=username,
+ password=password,
+ ip=ip
+ )
+ elif len(parts) == 2: # ip:port only
+ ip, port = parts
+ return ProxyConfig(
+ server=f"http://{ip}:{port}",
+ ip=ip
+ )
+ else:
+ raise ValueError(f"Invalid proxy string format: {proxy_str}")
+
+ @staticmethod
+ def from_dict(proxy_dict: Dict) -> "ProxyConfig":
+ """Create a ProxyConfig from a dictionary."""
+ return ProxyConfig(
+ server=proxy_dict.get("server"),
+ username=proxy_dict.get("username"),
+ password=proxy_dict.get("password"),
+ ip=proxy_dict.get("ip")
+ )
+
+ @staticmethod
+ def from_env(env_var: str = "PROXIES") -> List["ProxyConfig"]:
+ """Load proxies from environment variable.
+
+ Args:
+ env_var: Name of environment variable containing comma-separated proxy strings
+
+ Returns:
+ List of ProxyConfig objects
+ """
+ proxies = []
+ try:
+ proxy_list = os.getenv(env_var, "").split(",")
+ for proxy in proxy_list:
+ if not proxy:
+ continue
+ proxies.append(ProxyConfig.from_string(proxy))
+ except Exception as e:
+ print(f"Error loading proxies from environment: {e}")
+ return proxies
+
+ def to_dict(self) -> Dict:
+ """Convert to dictionary representation."""
+ return {
+ "server": self.server,
+ "username": self.username,
+ "password": self.password,
+ "ip": self.ip
+ }
+
+ def clone(self, **kwargs) -> "ProxyConfig":
+ """Create a copy of this configuration with updated values.
+
+ Args:
+ **kwargs: Key-value pairs of configuration options to update
+
+ Returns:
+ ProxyConfig: A new instance with the specified updates
+ """
+ config_dict = self.to_dict()
+ config_dict.update(kwargs)
+ return ProxyConfig.from_dict(config_dict)
-from crawl4ai.configs import ProxyConfig
class ProxyRotationStrategy(ABC):
"""Base abstract class for proxy rotation strategies"""
diff --git a/docs/examples/tutorial_v0.5.py b/docs/examples/tutorial_v0.5.py
index d8e01e68..3cbbdb7b 100644
--- a/docs/examples/tutorial_v0.5.py
+++ b/docs/examples/tutorial_v0.5.py
@@ -13,7 +13,7 @@ from crawl4ai.deep_crawling import (
)
from crawl4ai.deep_crawling.scorers import KeywordRelevanceScorer
from crawl4ai.async_crawler_strategy import AsyncHTTPCrawlerStrategy
-from crawl4ai.configs import ProxyConfig
+from crawl4ai.proxy_strategy import ProxyConfig
from crawl4ai import RoundRobinProxyStrategy
from crawl4ai.content_filter_strategy import LLMContentFilter
from crawl4ai import DefaultMarkdownGenerator
diff --git a/docs/md_v2/blog/releases/0.5.0.md b/docs/md_v2/blog/releases/0.5.0.md
index 7f38bf2a..24b0feda 100644
--- a/docs/md_v2/blog/releases/0.5.0.md
+++ b/docs/md_v2/blog/releases/0.5.0.md
@@ -251,7 +251,7 @@ from crawl4ai import (
RoundRobinProxyStrategy,
)
import asyncio
-from crawl4ai.configs import ProxyConfig
+from crawl4ai.proxy_strategy import ProxyConfig
async def main():
# Load proxies and create rotation strategy
proxies = ProxyConfig.from_env()
diff --git a/docs/snippets/deep_crawl/intro.py b/docs/snippets/deep_crawl/1.intro.py
similarity index 100%
rename from docs/snippets/deep_crawl/intro.py
rename to docs/snippets/deep_crawl/1.intro.py
diff --git a/docs/snippets/deep_crawl/2.filters.py b/docs/snippets/deep_crawl/2.filters.py
new file mode 100644
index 00000000..c50eae0a
--- /dev/null
+++ b/docs/snippets/deep_crawl/2.filters.py
@@ -0,0 +1,162 @@
+import asyncio
+from typing import List
+
+from crawl4ai import (
+ AsyncWebCrawler,
+ CrawlerRunConfig,
+ BFSDeepCrawlStrategy,
+ CrawlResult,
+ URLFilter, # Base class for filters, not directly used in examples but good to import for context
+ ContentTypeFilter,
+ DomainFilter,
+ FilterChain,
+ URLPatternFilter,
+ SEOFilter # Advanced filter, can be introduced later or as bonus
+)
+
+async def deep_crawl_filter_tutorial_part_2():
+ """
+ Tutorial demonstrating URL filters in Crawl4AI, focusing on isolated filter behavior
+ before integrating them into a deep crawl.
+
+ This tutorial covers:
+ - Testing individual filters with synthetic URLs.
+ - Understanding filter logic and behavior in isolation.
+ - Combining filters using FilterChain.
+ - Integrating filters into a deep crawling example.
+ """
+
+ # === Introduction: URL Filters in Isolation ===
+ print("\n" + "=" * 40)
+ print("=== Introduction: URL Filters in Isolation ===")
+ print("=" * 40 + "\n")
+ print("In this section, we will explore each filter individually using synthetic URLs.")
+ print("This allows us to understand exactly how each filter works before using them in a crawl.\n")
+
+
+ # === 2. ContentTypeFilter - Testing in Isolation ===
+ print("\n" + "=" * 40)
+ print("=== 2. ContentTypeFilter - Testing in Isolation ===")
+ print("=" * 40 + "\n")
+
+ # 2.1. Create ContentTypeFilter:
+ # Create a ContentTypeFilter to allow only 'text/html' and 'application/json' content types
+ # BASED ON URL EXTENSIONS.
+ content_type_filter = ContentTypeFilter(allowed_types=["text/html", "application/json"])
+ print("ContentTypeFilter created, allowing types (by extension): ['text/html', 'application/json']")
+ print("Note: ContentTypeFilter in Crawl4ai works by checking URL file extensions, not HTTP headers.")
+
+
+ # 2.2. Synthetic URLs for Testing:
+ # ContentTypeFilter checks URL extensions. We provide URLs with different extensions to test.
+ test_urls_content_type = [
+ "https://example.com/page.html", # Should pass: .html extension (text/html)
+ "https://example.com/data.json", # Should pass: .json extension (application/json)
+ "https://example.com/image.png", # Should reject: .png extension (not allowed type)
+ "https://example.com/document.pdf", # Should reject: .pdf extension (not allowed type)
+ "https://example.com/page", # Should pass: no extension (defaults to allow) - check default behaviour!
+ "https://example.com/page.xhtml", # Should pass: .xhtml extension (text/html)
+ ]
+
+ # 2.3. Apply Filter and Show Results:
+ print("\n=== Testing ContentTypeFilter (URL Extension based) ===")
+ for url in test_urls_content_type:
+ passed = content_type_filter.apply(url)
+ result = "PASSED" if passed else "REJECTED"
+ extension = ContentTypeFilter._extract_extension(url) # Show extracted extension for clarity
+ print(f"- URL: {url} - {result} (Extension: '{extension or 'No Extension'}')")
+ print("=" * 40)
+
+ input("Press Enter to continue to DomainFilter example...")
+
+ # === 3. DomainFilter - Testing in Isolation ===
+ print("\n" + "=" * 40)
+ print("=== 3. DomainFilter - Testing in Isolation ===")
+ print("=" * 40 + "\n")
+
+ # 3.1. Create DomainFilter:
+ domain_filter = DomainFilter(allowed_domains=["crawl4ai.com", "example.com"])
+ print("DomainFilter created, allowing domains: ['crawl4ai.com', 'example.com']")
+
+ # 3.2. Synthetic URLs for Testing:
+ test_urls_domain = [
+ "https://docs.crawl4ai.com/api",
+ "https://example.com/products",
+ "https://another-website.org/blog",
+ "https://sub.example.com/about",
+ "https://crawl4ai.com.attacker.net", # Corrected example: now should be rejected
+ ]
+
+ # 3.3. Apply Filter and Show Results:
+ print("\n=== Testing DomainFilter ===")
+ for url in test_urls_domain:
+ passed = domain_filter.apply(url)
+ result = "PASSED" if passed else "REJECTED"
+ print(f"- URL: {url} - {result}")
+ print("=" * 40)
+
+ input("Press Enter to continue to FilterChain example...")
+
+ # === 4. FilterChain - Combining Filters ===
+ print("\n" + "=" * 40)
+ print("=== 4. FilterChain - Combining Filters ===")
+ print("=" * 40 + "\n")
+
+ combined_filter = FilterChain(
+ filters=[
+ URLPatternFilter(patterns=["*api*"]),
+ ContentTypeFilter(allowed_types=["text/html"]), # Still URL extension based
+ DomainFilter(allowed_domains=["docs.crawl4ai.com"]),
+ ]
+ )
+ print("FilterChain created, combining URLPatternFilter, ContentTypeFilter, and DomainFilter.")
+
+
+ test_urls_combined = [
+ "https://docs.crawl4ai.com/api/async-webcrawler",
+ "https://example.com/api/products",
+ "https://docs.crawl4ai.com/core/crawling",
+ "https://another-website.org/api/data",
+ ]
+
+ # 4.3. Apply FilterChain and Show Results
+ print("\n=== Testing FilterChain (URLPatternFilter + ContentTypeFilter + DomainFilter) ===")
+ for url in test_urls_combined:
+ passed = await combined_filter.apply(url)
+ result = "PASSED" if passed else "REJECTED"
+ print(f"- URL: {url} - {result}")
+ print("=" * 40)
+
+ input("Press Enter to continue to Deep Crawl with FilterChain example...")
+
+ # === 5. Deep Crawl with FilterChain ===
+ print("\n" + "=" * 40)
+ print("=== 5. Deep Crawl with FilterChain ===")
+ print("=" * 40 + "\n")
+ print("Finally, let's integrate the FilterChain into a deep crawl example.")
+
+ config_final_crawl = CrawlerRunConfig(
+ deep_crawl_strategy=BFSDeepCrawlStrategy(
+ max_depth=2,
+ max_pages=10,
+ include_external=False,
+ filter_chain=combined_filter
+ ),
+ verbose=False,
+ )
+
+ async with AsyncWebCrawler() as crawler:
+ results_final_crawl: List[CrawlResult] = await crawler.arun(
+ url="https://docs.crawl4ai.com", config=config_final_crawl
+ )
+
+ print("=== Crawled URLs (Deep Crawl with FilterChain) ===")
+ for result in results_final_crawl:
+ print(f"- {result.url}, Depth: {result.metadata.get('depth', 0)}")
+ print("=" * 40)
+
+ print("\nTutorial Completed! Review the output of each section to understand URL filters.")
+
+
+if __name__ == "__main__":
+ asyncio.run(deep_crawl_filter_tutorial_part_2())
\ No newline at end of file