[v0.3.72] Enhance content extraction and proxy support
- Add ContentCleaningStrategy for improved content extraction - Implement advanced proxy configuration with authentication - Enhance image source detection and handling - Add fit_markdown and fit_html for refined content output - Improve external link and image handling flexibility
This commit is contained in:
@@ -71,6 +71,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
"(KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36"
|
||||
)
|
||||
self.proxy = kwargs.get("proxy")
|
||||
self.proxy_config = kwargs.get("proxy_config")
|
||||
self.headless = kwargs.get("headless", True)
|
||||
self.browser_type = kwargs.get("browser_type", "chromium")
|
||||
self.headers = kwargs.get("headers", {})
|
||||
@@ -121,6 +122,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
if self.proxy:
|
||||
proxy_settings = ProxySettings(server=self.proxy)
|
||||
browser_args["proxy"] = proxy_settings
|
||||
elif self.proxy_config:
|
||||
proxy_settings = ProxySettings(server=self.proxy_config.get("server"), username=self.proxy_config.get("username"), password=self.proxy_config.get("password"))
|
||||
browser_args["proxy"] = proxy_settings
|
||||
|
||||
# Select the appropriate browser based on the browser_type
|
||||
if self.browser_type == "firefox":
|
||||
|
||||
Reference in New Issue
Block a user