Commit Message:

Enhance crawler capabilities and documentation

  - Added SSL certificate extraction in AsyncWebCrawler.
  - Introduced new content filters and chunking strategies for more robust data extraction.
  - Updated documentation management to streamline user experience.
This commit is contained in:
UncleCode
2024-12-26 15:17:07 +08:00
parent d5ed451299
commit 9a4ed6bbd7
72 changed files with 14793 additions and 363 deletions

View File

@@ -23,7 +23,7 @@ from .config import SCREENSHOT_HEIGHT_TRESHOLD, DOWNLOAD_PAGE_TIMEOUT
from .async_configs import BrowserConfig, CrawlerRunConfig
from .async_logger import AsyncLogger
from playwright_stealth import StealthConfig, stealth_async
from .utilities.ssl_utils import get_ssl_certificate
from .ssl_certificate import SSLCertificate
stealth_config = StealthConfig(
webdriver=True,
@@ -913,9 +913,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
try:
# Get SSL certificate information if requested and URL is HTTPS
ssl_certificate = None
if config.fetch_ssl_certificate and url.startswith('https://'):
ssl_certificate = get_ssl_certificate(url)
ssl_cert = None
if config.fetch_ssl_certificate:
ssl_cert = SSLCertificate.from_url(url)
# Set up download handling
if self.browser_config.accept_downloads:
@@ -1144,7 +1144,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
screenshot=screenshot_data,
pdf_data=pdf_data,
get_delayed_content=get_delayed_content,
ssl_certificate=ssl_certificate,
ssl_certificate=ssl_cert,
downloaded_files=(
self._downloaded_files if self._downloaded_files else None
),