Commit Message:

Enhance crawler capabilities and documentation

  - Added SSL certificate extraction in AsyncWebCrawler.
  - Introduced new content filters and chunking strategies for more robust data extraction.
  - Updated documentation management to streamline user experience.
This commit is contained in:
UncleCode
2024-12-26 15:17:07 +08:00
parent d5ed451299
commit 9a4ed6bbd7
72 changed files with 14793 additions and 363 deletions

View File

@@ -23,7 +23,7 @@ from .config import SCREENSHOT_HEIGHT_TRESHOLD, DOWNLOAD_PAGE_TIMEOUT
from .async_configs import BrowserConfig, CrawlerRunConfig
from .async_logger import AsyncLogger
from playwright_stealth import StealthConfig, stealth_async
from .utilities.ssl_utils import get_ssl_certificate
from .ssl_certificate import SSLCertificate
stealth_config = StealthConfig(
webdriver=True,
@@ -913,9 +913,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
try:
# Get SSL certificate information if requested and URL is HTTPS
ssl_certificate = None
if config.fetch_ssl_certificate and url.startswith('https://'):
ssl_certificate = get_ssl_certificate(url)
ssl_cert = None
if config.fetch_ssl_certificate:
ssl_cert = SSLCertificate.from_url(url)
# Set up download handling
if self.browser_config.accept_downloads:
@@ -1144,7 +1144,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
screenshot=screenshot_data,
pdf_data=pdf_data,
get_delayed_content=get_delayed_content,
ssl_certificate=ssl_certificate,
ssl_certificate=ssl_cert,
downloaded_files=(
self._downloaded_files if self._downloaded_files else None
),

View File

@@ -1,6 +1,8 @@
from pydantic import BaseModel, HttpUrl
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
from dataclasses import dataclass
from .ssl_certificate import SSLCertificate
@dataclass
class TokenUsage:
completion_tokens: int = 0
@@ -41,7 +43,9 @@ class CrawlResult(BaseModel):
session_id: Optional[str] = None
response_headers: Optional[dict] = None
status_code: Optional[int] = None
ssl_certificate: Optional[Dict[str, Any]] = None
ssl_certificate: Optional[SSLCertificate] = None
class Config:
arbitrary_types_allowed = True
class AsyncCrawlResponse(BaseModel):
html: str
@@ -51,7 +55,7 @@ class AsyncCrawlResponse(BaseModel):
pdf_data: Optional[bytes] = None
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
downloaded_files: Optional[List[str]] = None
ssl_certificate: Optional[Dict[str, Any]] = None
ssl_certificate: Optional[SSLCertificate] = None
class Config:
arbitrary_types_allowed = True

136
crawl4ai/ssl_certificate.py Normal file
View File

@@ -0,0 +1,136 @@
"""SSL Certificate class for handling certificate operations."""
import ssl
import socket
import base64
import json
from typing import Dict, Any, Optional
from urllib.parse import urlparse
import OpenSSL.crypto
from pathlib import Path
class SSLCertificate:
    """
    A class representing an SSL certificate with methods to export in various formats.

    The certificate details are kept in a plain dictionary. Every ``bytes``
    key or value handed to the constructor is decoded to ``str`` so that the
    dictionary can be serialised with ``json``.
    """

    def __init__(self, cert_info: Dict[str, Any]):
        """Store *cert_info* after decoding any bytes it contains to text."""
        self._cert_info = self._decode_cert_data(cert_info)

    @classmethod
    def from_url(cls, url: str, timeout: int = 10) -> Optional['SSLCertificate']:
        """Create SSLCertificate instance from a URL.

        Args:
            url: URL whose host should be contacted.
            timeout: Socket timeout in seconds for the TCP connection.

        Returns:
            An ``SSLCertificate`` on success, or ``None`` when the URL has no
            host or when connecting, the TLS handshake or certificate parsing
            fails.
        """
        try:
            parsed = urlparse(url)
            # ``hostname`` strips userinfo, the port and IPv6 brackets, which
            # a manual split of ``netloc`` on ':' gets wrong.
            hostname = parsed.hostname
            if not hostname:
                return None
            # Honour an explicit port on https URLs; everything else keeps the
            # historical behaviour of probing the default TLS port.
            port = parsed.port if parsed.scheme == "https" and parsed.port else 443

            context = ssl.create_default_context()
            with socket.create_connection((hostname, port), timeout=timeout) as sock:
                with context.wrap_socket(sock, server_hostname=hostname) as ssock:
                    cert_binary = ssock.getpeercert(binary_form=True)

            x509 = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_ASN1, cert_binary)

            # pyOpenSSL returns the digest as colon-delimited hex text
            # (b"AB:CD:..."); strip the separators instead of hex-encoding
            # that text a second time.
            fingerprint = x509.digest("sha256").decode("ascii").replace(":", "").lower()

            cert_info = {
                "subject": dict(x509.get_subject().get_components()),
                "issuer": dict(x509.get_issuer().get_components()),
                "version": x509.get_version(),
                "serial_number": hex(x509.get_serial_number()),
                "not_before": x509.get_notBefore(),
                "not_after": x509.get_notAfter(),
                "fingerprint": fingerprint,
                "signature_algorithm": x509.get_signature_algorithm(),
                "raw_cert": base64.b64encode(cert_binary),
            }

            # Add extensions
            cert_info["extensions"] = [
                {
                    "name": x509.get_extension(i).get_short_name(),
                    "value": str(x509.get_extension(i)),
                }
                for i in range(x509.get_extension_count())
            ]

            return cls(cert_info)
        except Exception:
            # Deliberate best-effort: callers treat None as "no certificate".
            return None

    @staticmethod
    def _decode_cert_data(data: Any) -> Any:
        """Helper method to decode bytes in certificate data.

        Recurses through dicts (keys and values) and lists. Bytes that are
        not valid UTF-8 are decoded with replacement characters so that one
        odd field cannot invalidate the whole certificate.
        """
        if isinstance(data, bytes):
            return data.decode('utf-8', errors='replace')
        if isinstance(data, dict):
            return {
                (k.decode('utf-8', errors='replace') if isinstance(k, bytes) else k):
                    SSLCertificate._decode_cert_data(v)
                for k, v in data.items()
            }
        if isinstance(data, list):
            return [SSLCertificate._decode_cert_data(item) for item in data]
        return data

    def to_json(self, filepath: Optional[str] = None) -> Optional[str]:
        """Export certificate as JSON.

        Returns the JSON text, or ``None`` after writing it to *filepath*
        when a path is given. Errors while writing the file propagate.
        """
        json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False)
        if filepath:
            Path(filepath).write_text(json_str, encoding='utf-8')
            return None
        return json_str

    def to_pem(self, filepath: Optional[str] = None) -> Optional[str]:
        """Export certificate as PEM.

        Returns the PEM text, or ``None`` after writing it to *filepath*.
        ``None`` is also returned when the conversion or the write fails.
        """
        try:
            x509 = OpenSSL.crypto.load_certificate(
                OpenSSL.crypto.FILETYPE_ASN1,
                base64.b64decode(self._cert_info['raw_cert'])
            )
            pem_data = OpenSSL.crypto.dump_certificate(
                OpenSSL.crypto.FILETYPE_PEM,
                x509
            ).decode('utf-8')

            if filepath:
                Path(filepath).write_text(pem_data, encoding='utf-8')
                return None
            return pem_data
        except Exception:
            return None

    def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]:
        """Export certificate as DER.

        Returns the DER bytes, or ``None`` after writing them to *filepath*.
        ``None`` is also returned when decoding or the write fails.
        """
        try:
            der_data = base64.b64decode(self._cert_info['raw_cert'])
            if filepath:
                Path(filepath).write_bytes(der_data)
                return None
            return der_data
        except Exception:
            return None

    @property
    def issuer(self) -> Dict[str, str]:
        """Get certificate issuer information."""
        return self._cert_info.get('issuer', {})

    @property
    def subject(self) -> Dict[str, str]:
        """Get certificate subject information."""
        return self._cert_info.get('subject', {})

    @property
    def valid_from(self) -> str:
        """Get certificate validity start date."""
        return self._cert_info.get('not_before', '')

    @property
    def valid_until(self) -> str:
        """Get certificate validity end date."""
        return self._cert_info.get('not_after', '')

    @property
    def fingerprint(self) -> str:
        """Get certificate fingerprint."""
        return self._cert_info.get('fingerprint', '')

View File

@@ -1,156 +0,0 @@
"""Utility functions for exporting SSL certificates in various formats."""
import json
import base64
from typing import Dict, Any, Optional
from pathlib import Path
import OpenSSL.crypto
from datetime import datetime
class CertificateExporter:
    """
    Handles exporting SSL certificates in various formats:
    1. JSON - Human-readable format with all certificate details
    2. PEM - Standard text format for certificates
    3. DER - Binary format
    """

    @staticmethod
    def _decode_cert_data(data: Any) -> Any:
        """Helper method to make certificate data JSON-serialisable.

        Recurses through dicts (keys and values) and lists, decoding bytes to
        text and turning datetime objects into ISO 8601 strings at any depth.
        Bytes that are not valid UTF-8 are decoded with replacement characters
        instead of raising.
        """
        if isinstance(data, bytes):
            return data.decode('utf-8', errors='replace')
        if isinstance(data, datetime):
            return data.isoformat()
        if isinstance(data, dict):
            return {
                (k.decode('utf-8', errors='replace') if isinstance(k, bytes) else k):
                    CertificateExporter._decode_cert_data(v)
                for k, v in data.items()
            }
        if isinstance(data, list):
            return [CertificateExporter._decode_cert_data(item) for item in data]
        return data

    @staticmethod
    def to_json(cert_info: Dict[str, Any], filepath: Optional[str] = None) -> Optional[str]:
        """
        Export certificate information to JSON format.

        Args:
            cert_info: Dictionary containing certificate information
            filepath: Optional path to save the JSON file

        Returns:
            str: JSON string if filepath is None, otherwise None. None is also
            returned when cert_info is empty.
        """
        if not cert_info:
            return None

        # Bytes and datetime values (nested ones included) are converted here.
        cert_data = CertificateExporter._decode_cert_data(cert_info)
        json_str = json.dumps(cert_data, indent=2, ensure_ascii=False)

        if filepath:
            Path(filepath).write_text(json_str, encoding='utf-8')
            return None
        return json_str

    @staticmethod
    def to_pem(cert_info: Dict[str, Any], filepath: Optional[str] = None) -> Optional[str]:
        """
        Export certificate to PEM format.
        This is the most common format, used for Apache/Nginx configs.

        Args:
            cert_info: Dictionary containing certificate information
            filepath: Optional path to save the PEM file

        Returns:
            str: PEM string if filepath is None, otherwise None. None is also
            returned when 'raw_cert' is missing or the conversion fails.
        """
        if not cert_info or 'raw_cert' not in cert_info:
            return None

        try:
            x509 = OpenSSL.crypto.load_certificate(
                OpenSSL.crypto.FILETYPE_ASN1,
                base64.b64decode(cert_info['raw_cert'])
            )
            pem_data = OpenSSL.crypto.dump_certificate(
                OpenSSL.crypto.FILETYPE_PEM,
                x509
            ).decode('utf-8')

            if filepath:
                Path(filepath).write_text(pem_data, encoding='utf-8')
                return None
            return pem_data
        except Exception:
            # Report failure as None (like to_der) rather than returning an
            # error message that callers could mistake for PEM content.
            return None

    @staticmethod
    def to_der(cert_info: Dict[str, Any], filepath: Optional[str] = None) -> Optional[bytes]:
        """
        Export certificate to DER format (binary).
        This format is commonly used in Java environments.

        Args:
            cert_info: Dictionary containing certificate information
            filepath: Optional path to save the DER file

        Returns:
            bytes: DER bytes if filepath is None, otherwise None. None is also
            returned when 'raw_cert' is missing or decoding/writing fails.
        """
        if not cert_info or 'raw_cert' not in cert_info:
            return None

        try:
            der_data = base64.b64decode(cert_info['raw_cert'])
            if filepath:
                Path(filepath).write_bytes(der_data)
                return None
            return der_data
        except Exception:
            return None

    @staticmethod
    def export_all(cert_info: Dict[str, Any], base_path: str, filename: str) -> Dict[str, str]:
        """
        Export certificate in all supported formats.

        Args:
            cert_info: Dictionary containing certificate information
            base_path: Base directory to save the files
            filename: Base filename without extension

        Returns:
            Dict[str, str]: Dictionary mapping format to filepath

        Note:
            A path is reported for every format even if that individual
            export returned without writing a file (for example when
            'raw_cert' is missing), so callers should check that the file
            exists before using it.
        """
        target_dir = Path(base_path)
        target_dir.mkdir(parents=True, exist_ok=True)

        exporters = {
            'json': CertificateExporter.to_json,
            'pem': CertificateExporter.to_pem,
            'der': CertificateExporter.to_der,
        }

        paths = {}
        for fmt, export in exporters.items():
            out_path = str(target_dir / f"{filename}.{fmt}")
            export(cert_info, out_path)
            paths[fmt] = out_path
        return paths

View File

@@ -1,83 +0,0 @@
"""Utility functions for SSL certificate handling."""
import ssl
import socket
from typing import Dict, Any, Optional
from urllib.parse import urlparse
import OpenSSL.crypto
import datetime
import base64
def get_ssl_certificate(url: str, timeout: int = 10) -> Optional[Dict[str, Any]]:
    """
    Retrieve SSL certificate information from a given URL.

    Args:
        url (str): The URL to get SSL certificate from
        timeout (int): Socket timeout in seconds

    Returns:
        Optional[Dict[str, Any]]: Dictionary containing certificate information.
        On failure a dictionary of the form
        ``{"error": <message>, "status": "failed"}`` is returned instead.

        The certificate dictionary includes:
        - subject: Certificate subject information
        - issuer: Certificate issuer information
        - version: Certificate version as reported by pyOpenSSL
        - serial_number: Certificate serial number (hex string)
        - not_before: Certificate validity start date
        - not_after: Certificate validity end date
        - fingerprint: SHA-256 fingerprint as lowercase hex
        - signature_algorithm: Signature algorithm name
        - raw_cert: Base64 encoded raw certificate data
        - extensions: List of {"name", "value"} dictionaries
    """
    try:
        parsed = urlparse(url)
        # ``hostname`` strips userinfo, the port and IPv6 brackets, which a
        # manual split of ``netloc`` on ':' gets wrong.
        hostname = parsed.hostname
        if not hostname:
            raise ValueError(f"URL has no hostname: {url!r}")
        # Honour an explicit port on https URLs; otherwise probe port 443.
        port = parsed.port if parsed.scheme == "https" and parsed.port else 443

        context = ssl.create_default_context()
        with socket.create_connection((hostname, port), timeout=timeout) as sock:
            with context.wrap_socket(sock, server_hostname=hostname) as ssock:
                cert_binary = ssock.getpeercert(binary_form=True)

        x509 = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_ASN1, cert_binary)

        def _decode_values(components):
            # Only values are decoded; keys are left exactly as pyOpenSSL
            # returns them so existing consumers keep seeing the same keys.
            return {
                key: value.decode() if isinstance(value, bytes) else value
                for key, value in dict(components).items()
            }

        # pyOpenSSL returns the digest as colon-delimited hex text
        # (b"AB:CD:..."); strip the separators instead of hex-encoding that
        # text a second time.
        fingerprint = x509.digest("sha256").decode("ascii").replace(":", "").lower()

        cert_info = {
            "subject": _decode_values(x509.get_subject().get_components()),
            "issuer": _decode_values(x509.get_issuer().get_components()),
            "version": x509.get_version(),
            "serial_number": hex(x509.get_serial_number()),
            "not_before": x509.get_notBefore().decode(),
            "not_after": x509.get_notAfter().decode(),
            "fingerprint": fingerprint,
            "signature_algorithm": x509.get_signature_algorithm().decode(),
            "raw_cert": base64.b64encode(cert_binary).decode('utf-8'),
        }

        # Add extensions
        cert_info["extensions"] = [
            {
                "name": x509.get_extension(i).get_short_name().decode(),
                "value": str(x509.get_extension(i)),
            }
            for i in range(x509.get_extension_count())
        ]

        return cert_info

    except (socket.gaierror, socket.timeout, ssl.SSLError, ValueError) as e:
        # Expected failure modes: DNS, timeout, TLS handshake, bad URL/data.
        return {
            "error": str(e),
            "status": "failed"
        }
    except Exception as e:
        return {
            "error": f"Unexpected error: {str(e)}",
            "status": "failed"
        }