Commit Message:
Enhance crawler capabilities and documentation

- Added SSL certificate extraction in AsyncWebCrawler.
- Introduced new content filters and chunking strategies for more robust data extraction.
- Updated documentation management to streamline the user experience.
This commit is contained in:
@@ -23,7 +23,7 @@ from .config import SCREENSHOT_HEIGHT_TRESHOLD, DOWNLOAD_PAGE_TIMEOUT
|
||||
from .async_configs import BrowserConfig, CrawlerRunConfig
|
||||
from .async_logger import AsyncLogger
|
||||
from playwright_stealth import StealthConfig, stealth_async
|
||||
from .utilities.ssl_utils import get_ssl_certificate
|
||||
from .ssl_certificate import SSLCertificate
|
||||
|
||||
stealth_config = StealthConfig(
|
||||
webdriver=True,
|
||||
@@ -913,9 +913,9 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
|
||||
try:
|
||||
# Get SSL certificate information if requested and URL is HTTPS
|
||||
ssl_certificate = None
|
||||
if config.fetch_ssl_certificate and url.startswith('https://'):
|
||||
ssl_certificate = get_ssl_certificate(url)
|
||||
ssl_cert = None
|
||||
if config.fetch_ssl_certificate:
|
||||
ssl_cert = SSLCertificate.from_url(url)
|
||||
|
||||
# Set up download handling
|
||||
if self.browser_config.accept_downloads:
|
||||
@@ -1144,7 +1144,7 @@ class AsyncPlaywrightCrawlerStrategy(AsyncCrawlerStrategy):
|
||||
screenshot=screenshot_data,
|
||||
pdf_data=pdf_data,
|
||||
get_delayed_content=get_delayed_content,
|
||||
ssl_certificate=ssl_certificate,
|
||||
ssl_certificate=ssl_cert,
|
||||
downloaded_files=(
|
||||
self._downloaded_files if self._downloaded_files else None
|
||||
),
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
from pydantic import BaseModel, HttpUrl
|
||||
from typing import List, Dict, Optional, Callable, Awaitable, Union, Any
|
||||
from dataclasses import dataclass
|
||||
from .ssl_certificate import SSLCertificate
|
||||
|
||||
@dataclass
|
||||
class TokenUsage:
|
||||
completion_tokens: int = 0
|
||||
@@ -41,7 +43,9 @@ class CrawlResult(BaseModel):
|
||||
session_id: Optional[str] = None
|
||||
response_headers: Optional[dict] = None
|
||||
status_code: Optional[int] = None
|
||||
ssl_certificate: Optional[Dict[str, Any]] = None
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
class AsyncCrawlResponse(BaseModel):
|
||||
html: str
|
||||
@@ -51,7 +55,7 @@ class AsyncCrawlResponse(BaseModel):
|
||||
pdf_data: Optional[bytes] = None
|
||||
get_delayed_content: Optional[Callable[[Optional[float]], Awaitable[str]]] = None
|
||||
downloaded_files: Optional[List[str]] = None
|
||||
ssl_certificate: Optional[Dict[str, Any]] = None
|
||||
ssl_certificate: Optional[SSLCertificate] = None
|
||||
|
||||
class Config:
|
||||
arbitrary_types_allowed = True
|
||||
|
||||
136
crawl4ai/ssl_certificate.py
Normal file
136
crawl4ai/ssl_certificate.py
Normal file
@@ -0,0 +1,136 @@
|
||||
"""SSL Certificate class for handling certificate operations."""
|
||||
|
||||
import ssl
|
||||
import socket
|
||||
import base64
|
||||
import json
|
||||
from typing import Dict, Any, Optional
|
||||
from urllib.parse import urlparse
|
||||
import OpenSSL.crypto
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
class SSLCertificate:
    """
    A class representing an SSL certificate with methods to export in various formats.

    The certificate is kept as a plain dictionary in which every ``bytes`` key
    or value has been decoded to ``str``, so the structure can be handed to
    ``json.dumps`` directly. The dictionary built by :meth:`from_url` holds
    ``subject``, ``issuer``, ``version``, ``serial_number``, ``not_before``,
    ``not_after``, ``fingerprint``, ``signature_algorithm``, ``raw_cert``
    (base64 of the DER bytes) and ``extensions``.
    """

    def __init__(self, cert_info: Dict[str, Any]):
        """Store *cert_info* after recursively decoding any bytes it contains."""
        self._cert_info = self._decode_cert_data(cert_info)

    @staticmethod
    def from_url(url: str, timeout: int = 10) -> Optional['SSLCertificate']:
        """Create SSLCertificate instance from a URL.

        Args:
            url: URL whose host should be contacted.
            timeout: Socket connect timeout in seconds.

        Returns:
            An ``SSLCertificate`` for the peer certificate, or ``None`` when
            the URL has no host or anything fails while connecting,
            handshaking or parsing (this method is best-effort and never
            raises).
        """
        try:
            parsed = urlparse(url)
            # ``hostname`` strips userinfo, the port and IPv6 brackets; a
            # plain split of ``netloc`` on ':' gets all three of those wrong.
            hostname = parsed.hostname
            if not hostname:
                return None

            # Honour an explicit port on HTTPS URLs (e.g. https://host:8443/).
            # Any other scheme keeps the previous behaviour of probing 443.
            port = 443
            if parsed.scheme == "https" and parsed.port:
                port = parsed.port

            context = ssl.create_default_context()
            with socket.create_connection((hostname, port), timeout=timeout) as sock:
                with context.wrap_socket(sock, server_hostname=hostname) as ssock:
                    cert_binary = ssock.getpeercert(binary_form=True)

            x509 = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_ASN1, cert_binary)

            cert_info = {
                "subject": dict(x509.get_subject().get_components()),
                "issuer": dict(x509.get_issuer().get_components()),
                "version": x509.get_version(),
                "serial_number": hex(x509.get_serial_number()),
                "not_before": x509.get_notBefore(),
                "not_after": x509.get_notAfter(),
                "fingerprint": x509.digest("sha256").hex(),
                "signature_algorithm": x509.get_signature_algorithm(),
                "raw_cert": base64.b64encode(cert_binary),
            }

            # Add extensions
            extensions = []
            for i in range(x509.get_extension_count()):
                ext = x509.get_extension(i)
                extensions.append({
                    "name": ext.get_short_name(),
                    "value": str(ext),
                })
            cert_info["extensions"] = extensions

            # The constructor decodes the bytes values collected above.
            return SSLCertificate(cert_info)

        except Exception:
            # Deliberate best-effort: certificate retrieval must never abort
            # the caller, so every failure is reported as "no certificate".
            return None

    @staticmethod
    def _decode_cert_data(data: Any) -> Any:
        """Helper method to decode bytes in certificate data.

        Recurses through dicts (keys and values) and lists. Bytes are decoded
        as UTF-8 with undecodable bytes replaced by U+FFFD, so that one
        oddly-encoded name field cannot make the whole certificate unusable.
        Any other value is returned unchanged.
        """
        if isinstance(data, bytes):
            return data.decode('utf-8', errors='replace')
        elif isinstance(data, dict):
            return {
                SSLCertificate._decode_cert_data(k) if isinstance(k, bytes) else k:
                    SSLCertificate._decode_cert_data(v)
                for k, v in data.items()
            }
        elif isinstance(data, list):
            return [SSLCertificate._decode_cert_data(item) for item in data]
        return data

    def to_json(self, filepath: Optional[str] = None) -> Optional[str]:
        """Export certificate as JSON.

        Returns the JSON text, or ``None`` after writing it to *filepath*
        when a path is given. File-system errors propagate to the caller.
        """
        json_str = json.dumps(self._cert_info, indent=2, ensure_ascii=False)
        if filepath:
            Path(filepath).write_text(json_str, encoding='utf-8')
            return None
        return json_str

    def to_pem(self, filepath: Optional[str] = None) -> Optional[str]:
        """Export certificate as PEM.

        Returns the PEM text, or ``None`` after writing it to *filepath*.
        ``None`` is also returned when the stored ``raw_cert`` is missing or
        cannot be converted, or when the file cannot be written.
        """
        try:
            x509 = OpenSSL.crypto.load_certificate(
                OpenSSL.crypto.FILETYPE_ASN1,
                base64.b64decode(self._cert_info['raw_cert'])
            )
            pem_data = OpenSSL.crypto.dump_certificate(
                OpenSSL.crypto.FILETYPE_PEM,
                x509
            ).decode('utf-8')

            if filepath:
                Path(filepath).write_text(pem_data, encoding='utf-8')
                return None
            return pem_data
        except Exception:
            return None

    def to_der(self, filepath: Optional[str] = None) -> Optional[bytes]:
        """Export certificate as DER.

        Returns the DER bytes, or ``None`` after writing them to *filepath*.
        ``None`` is also returned when ``raw_cert`` is missing or is not
        valid base64, or when the file cannot be written.
        """
        try:
            der_data = base64.b64decode(self._cert_info['raw_cert'])
            if filepath:
                Path(filepath).write_bytes(der_data)
                return None
            return der_data
        except Exception:
            return None

    @property
    def issuer(self) -> Dict[str, str]:
        """Get certificate issuer information."""
        return self._cert_info.get('issuer', {})

    @property
    def subject(self) -> Dict[str, str]:
        """Get certificate subject information."""
        return self._cert_info.get('subject', {})

    @property
    def valid_from(self) -> str:
        """Get certificate validity start date."""
        return self._cert_info.get('not_before', '')

    @property
    def valid_until(self) -> str:
        """Get certificate validity end date."""
        return self._cert_info.get('not_after', '')

    @property
    def fingerprint(self) -> str:
        """Get certificate fingerprint."""
        return self._cert_info.get('fingerprint', '')
|
||||
@@ -1,156 +0,0 @@
|
||||
"""Utility functions for exporting SSL certificates in various formats."""
|
||||
|
||||
import json
|
||||
import base64
|
||||
from typing import Dict, Any, Optional
|
||||
from pathlib import Path
|
||||
import OpenSSL.crypto
|
||||
from datetime import datetime
|
||||
|
||||
class CertificateExporter:
    """
    Handles exporting SSL certificates in various formats:
    1. JSON - Human-readable format with all certificate details
    2. PEM - Standard text format for certificates
    3. DER - Binary format

    Every method is static and takes the certificate as a dictionary; the
    PEM and DER exporters read its ``raw_cert`` entry, which must hold the
    base64-encoded DER bytes of the certificate.
    """

    @staticmethod
    def _decode_cert_data(data: Any) -> Any:
        """Helper method to decode bytes in certificate data.

        Recurses through dicts (keys and values) and lists. Bytes are decoded
        as UTF-8 with undecodable bytes replaced by U+FFFD, so a single
        oddly-encoded field cannot make the export fail. Any other value is
        returned unchanged.
        """
        if isinstance(data, bytes):
            return data.decode('utf-8', errors='replace')
        elif isinstance(data, dict):
            return {
                CertificateExporter._decode_cert_data(k) if isinstance(k, bytes) else k:
                    CertificateExporter._decode_cert_data(v)
                for k, v in data.items()
            }
        elif isinstance(data, list):
            return [CertificateExporter._decode_cert_data(item) for item in data]
        return data

    @staticmethod
    def to_json(cert_info: Dict[str, Any], filepath: Optional[str] = None) -> Optional[str]:
        """
        Export certificate information to JSON format.

        Args:
            cert_info: Dictionary containing certificate information
            filepath: Optional path to save the JSON file

        Returns:
            str: JSON string if filepath is None, otherwise None. None is
            also returned when cert_info is empty or None.
        """
        if not cert_info:
            return None

        # Decode any bytes in the certificate data (this builds a new dict,
        # so the caller's cert_info is not modified below).
        cert_data = CertificateExporter._decode_cert_data(cert_info)

        # Convert top-level datetime objects to ISO format strings
        for key, value in cert_data.items():
            if isinstance(value, datetime):
                cert_data[key] = value.isoformat()

        json_str = json.dumps(cert_data, indent=2, ensure_ascii=False)

        if filepath:
            Path(filepath).write_text(json_str, encoding='utf-8')
            return None
        return json_str

    @staticmethod
    def to_pem(cert_info: Dict[str, Any], filepath: Optional[str] = None) -> Optional[str]:
        """
        Export certificate to PEM format.
        This is the most common format, used for Apache/Nginx configs.

        Args:
            cert_info: Dictionary containing certificate information
            filepath: Optional path to save the PEM file

        Returns:
            str: PEM string if filepath is None, otherwise None. None is
            also returned when 'raw_cert' is missing or the conversion fails.
        """
        if not cert_info or 'raw_cert' not in cert_info:
            return None

        try:
            x509 = OpenSSL.crypto.load_certificate(
                OpenSSL.crypto.FILETYPE_ASN1,
                base64.b64decode(cert_info['raw_cert'])
            )
            pem_data = OpenSSL.crypto.dump_certificate(
                OpenSSL.crypto.FILETYPE_PEM,
                x509
            ).decode('utf-8')

            if filepath:
                Path(filepath).write_text(pem_data, encoding='utf-8')
                return None
            return pem_data

        except Exception:
            # Report failure as None, like to_der does. Returning an error
            # message here would be indistinguishable from PEM text for a
            # caller that only checks for a non-empty string.
            return None

    @staticmethod
    def to_der(cert_info: Dict[str, Any], filepath: Optional[str] = None) -> Optional[bytes]:
        """
        Export certificate to DER format (binary).
        This format is commonly used in Java environments.

        Args:
            cert_info: Dictionary containing certificate information
            filepath: Optional path to save the DER file

        Returns:
            bytes: DER bytes if filepath is None, otherwise None. None is
            also returned when 'raw_cert' is missing or cannot be decoded.
        """
        if not cert_info or 'raw_cert' not in cert_info:
            return None

        try:
            der_data = base64.b64decode(cert_info['raw_cert'])

            if filepath:
                Path(filepath).write_bytes(der_data)
                return None
            return der_data

        except Exception:
            return None

    @staticmethod
    def export_all(cert_info: Dict[str, Any], base_path: str, filename: str) -> Dict[str, str]:
        """
        Export certificate in all supported formats.

        Args:
            cert_info: Dictionary containing certificate information
            base_path: Base directory to save the files (created if missing)
            filename: Base filename without extension

        Returns:
            Dict[str, str]: Dictionary mapping format to filepath. A path is
            listed for every format, whether or not that export succeeded.
        """
        base_path = Path(base_path)
        base_path.mkdir(parents=True, exist_ok=True)

        paths = {}

        # Export JSON
        json_path = base_path / f"{filename}.json"
        CertificateExporter.to_json(cert_info, str(json_path))
        paths['json'] = str(json_path)

        # Export PEM
        pem_path = base_path / f"{filename}.pem"
        CertificateExporter.to_pem(cert_info, str(pem_path))
        paths['pem'] = str(pem_path)

        # Export DER
        der_path = base_path / f"{filename}.der"
        CertificateExporter.to_der(cert_info, str(der_path))
        paths['der'] = str(der_path)

        return paths
|
||||
@@ -1,83 +0,0 @@
|
||||
"""Utility functions for SSL certificate handling."""
|
||||
|
||||
import ssl
|
||||
import socket
|
||||
from typing import Dict, Any, Optional
|
||||
from urllib.parse import urlparse
|
||||
import OpenSSL.crypto
|
||||
import datetime
|
||||
import base64
|
||||
|
||||
|
||||
def _as_text(value: Any) -> Any:
    """Decode bytes as UTF-8 (undecodable bytes become U+FFFD); pass anything else through."""
    if isinstance(value, bytes):
        return value.decode('utf-8', errors='replace')
    return value


def get_ssl_certificate(url: str, timeout: int = 10) -> Optional[Dict[str, Any]]:
    """
    Retrieve SSL certificate information from a given URL.

    Args:
        url (str): The URL to get SSL certificate from
        timeout (int): Socket timeout in seconds

    Returns:
        Optional[Dict[str, Any]]: Dictionary containing certificate information.
        On failure a dictionary of the form
        ``{"error": <message>, "status": "failed"}`` is returned instead;
        the function does not raise.

        The returned dictionary includes:
        - subject: Certificate subject information
        - issuer: Certificate issuer information
        - version: Certificate version as reported by ``X509.get_version()``
        - serial_number: Certificate serial number (hex string)
        - not_before: Certificate validity start date
        - not_after: Certificate validity end date
        - fingerprint: Certificate SHA-256 fingerprint
        - signature_algorithm: Name of the signature algorithm
        - raw_cert: Base64 encoded raw certificate data
        - extensions: List of ``{"name", "value"}`` dictionaries
    """
    try:
        parsed = urlparse(url)
        # ``hostname`` strips userinfo, the port and IPv6 brackets; a plain
        # split of ``netloc`` on ':' gets all three of those wrong.
        hostname = parsed.hostname
        if not hostname:
            raise ValueError(f"URL has no hostname: {url!r}")

        # Honour an explicit port on HTTPS URLs (e.g. https://host:8443/).
        # Any other scheme keeps the previous behaviour of probing 443.
        port = 443
        if parsed.scheme == "https" and parsed.port:
            port = parsed.port

        context = ssl.create_default_context()
        with socket.create_connection((hostname, port), timeout=timeout) as sock:
            with context.wrap_socket(sock, server_hostname=hostname) as ssock:
                cert_binary = ssock.getpeercert(binary_form=True)

        x509 = OpenSSL.crypto.load_certificate(OpenSSL.crypto.FILETYPE_ASN1, cert_binary)

        cert_info = {
            # Decode keys as well as values: pyOpenSSL returns both as bytes,
            # and bytes keys make the result unusable with json.dumps.
            "subject": {
                _as_text(key): _as_text(value)
                for key, value in x509.get_subject().get_components()
            },
            "issuer": {
                _as_text(key): _as_text(value)
                for key, value in x509.get_issuer().get_components()
            },
            "version": x509.get_version(),
            "serial_number": hex(x509.get_serial_number()),
            "not_before": _as_text(x509.get_notBefore()),
            "not_after": _as_text(x509.get_notAfter()),
            "fingerprint": x509.digest("sha256").hex(),
            "signature_algorithm": _as_text(x509.get_signature_algorithm()),
            "raw_cert": base64.b64encode(cert_binary).decode('utf-8')
        }

        # Add extensions
        extensions = []
        for i in range(x509.get_extension_count()):
            ext = x509.get_extension(i)
            extensions.append({
                "name": _as_text(ext.get_short_name()),
                "value": str(ext)
            })
        cert_info["extensions"] = extensions

        return cert_info

    except (socket.gaierror, socket.timeout, ssl.SSLError, ValueError) as e:
        # Anticipated failures: DNS, timeout, TLS handshake, malformed URL/port.
        return {
            "error": str(e),
            "status": "failed"
        }
    except Exception as e:
        return {
            "error": f"Unexpected error: {str(e)}",
            "status": "failed"
        }
|
||||
Reference in New Issue
Block a user