Enhance crawler capabilities and documentation
- Added llm.txt generator.
- Added SSL certificate extraction in AsyncWebCrawler.
- Introduced new content filters and chunking strategies for more robust data extraction.
- Updated documentation.
This commit is contained in:
49
examples/save_certificate.py
Normal file
49
examples/save_certificate.py
Normal file
@@ -0,0 +1,49 @@
|
||||
"""Example script showing how to save SSL certificates."""
|
||||
|
||||
import asyncio
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.utilities.cert_exporter import CertificateExporter
|
||||
|
||||
# Locate the parent folder and create a "tmp" folder inside it if it does not exist
|
||||
import os
|
||||
# Build the output path once and reuse it, both for creating the folder here
# and for the file writes performed later in main().
parent_dir = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
__tmp_dir__ = os.path.join(parent_dir, "tmp")
os.makedirs(__tmp_dir__, exist_ok=True)
|
||||
|
||||
async def main():
    """Crawl https://example.com and save its SSL certificate as JSON and PEM.

    Both files are written into the module-level ``__tmp_dir__`` folder, after
    which the issuer, expiry date and fingerprint are printed. Nothing is
    saved or printed when the crawl fails or no certificate was returned.
    """
    # Configure crawler to fetch SSL certificate
    crawl_config = CrawlerRunConfig(
        fetch_ssl_certificate=True,
        cache_mode=CacheMode.BYPASS,
    )

    async with AsyncWebCrawler() as crawler:
        result = await crawler.arun(
            url='https://example.com',
            config=crawl_config,
        )

        if result.success and result.ssl_certificate:
            # 1. Save as JSON (most readable format)
            CertificateExporter.to_json(
                result.ssl_certificate,
                filepath=os.path.join(__tmp_dir__, "certificate.json"),
            )
            print("Certificate saved in JSON format: certificate.json")

            # 2. Save as PEM (standard format for web servers)
            CertificateExporter.to_pem(
                result.ssl_certificate,
                filepath=os.path.join(__tmp_dir__, "certificate.pem"),
            )
            print("Certificate saved in PEM format: certificate.pem")

            # Print basic certificate info
            cert = result.ssl_certificate
            print("\nCertificate Information:")
            # The issuer mapping is looked up with a bytes key, so the fallback
            # must be bytes as well: a str default has no .decode() and would
            # raise AttributeError whenever the CN entry is missing.
            print(f"Issuer: {cert['issuer'].get(b'CN', b'').decode()}")
            print(f"Valid until: {cert['not_after']}")
            print(f"Fingerprint: {cert['fingerprint']}")
|
||||
|
||||
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
67
examples/ssl_certificate_example.py
Normal file
67
examples/ssl_certificate_example.py
Normal file
@@ -0,0 +1,67 @@
|
||||
"""Example script demonstrating SSL certificate retrieval and export."""
|
||||
|
||||
import asyncio
|
||||
import os
|
||||
from pathlib import Path
|
||||
from crawl4ai import AsyncWebCrawler, CrawlerRunConfig, CacheMode
|
||||
from crawl4ai.utilities.cert_exporter import CertificateExporter
|
||||
|
||||
async def main():
    """Crawl https://example.com and export its SSL certificate in several formats.

    The certificate is written into a ``certificates`` folder (relative to the
    current working directory) as JSON, PEM and DER files, then exported once
    more through ``CertificateExporter.export_all``. The resulting paths and a
    few certificate fields are printed. Nothing is exported or printed when
    the crawl fails or no certificate was returned.
    """
    # Configure crawler to fetch SSL certificate
    crawl_config = CrawlerRunConfig(
        fetch_ssl_certificate=True,
        cache_mode=CacheMode.BYPASS,
    )

    # Create output directory for certificates
    output_dir = Path("certificates")
    output_dir.mkdir(exist_ok=True)

    async with AsyncWebCrawler() as crawler:
        # Crawl a website
        result = await crawler.arun(
            url='https://example.com',
            config=crawl_config,
        )

        if result.success and result.ssl_certificate:
            # The return values of the single-format exporters are not used
            # here, so they are not bound to names; only the files matter.

            # 1. Export as JSON (human-readable format)
            CertificateExporter.to_json(
                result.ssl_certificate,
                filepath=str(output_dir / "cert.json"),
            )

            # 2. Export as PEM (standard text format, used by Apache/Nginx)
            CertificateExporter.to_pem(
                result.ssl_certificate,
                filepath=str(output_dir / "cert.pem"),
            )

            # 3. Export as DER (binary format, used by Java)
            CertificateExporter.to_der(
                result.ssl_certificate,
                filepath=str(output_dir / "cert.der"),
            )

            # 4. Export all formats at once
            export_paths = CertificateExporter.export_all(
                result.ssl_certificate,
                str(output_dir),
                "certificate",
            )

            print("Certificate exported in multiple formats:")
            for fmt, path in export_paths.items():
                print(f"- {fmt.upper()}: {path}")

            # Print some certificate information
            cert = result.ssl_certificate
            print("\nCertificate Information:")
            print(f"Subject: {cert['subject']}")
            print(f"Issuer: {cert['issuer']}")
            print(f"Valid from: {cert['not_before']}")
            print(f"Valid until: {cert['not_after']}")
            print(f"Fingerprint: {cert['fingerprint']}")
|
||||
|
||||
# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    asyncio.run(main())
|
||||
Reference in New Issue
Block a user